Skip to content

Commit d3aa7db

Browse files
lemire authored and anonrig committed
Optimize domain processing.
1 parent 13f2726 commit d3aa7db

File tree

3 files changed

+84
-10
lines changed

3 files changed

+84
-10
lines changed

include/ada/unicode.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,12 +82,21 @@ ada_really_inline constexpr bool is_forbidden_host_code_point(
8282
const char c) noexcept;
8383

8484
/**
85-
* Checks if the input is a forbidden domain code point.
85+
* Checks if the input contains a forbidden domain code point.
8686
* @see https://url.spec.whatwg.org/#forbidden-domain-code-point
8787
*/
8888
ada_really_inline constexpr bool contains_forbidden_domain_code_point(
8989
const char* input, size_t length) noexcept;
9090

91+
/**
92+
* Checks if the input contains a forbidden domain code point in which case
93+
* the first bit is set to 1. If the input contains an upper case ASCII letter,
94+
* then the second bit is set to 1.
95+
* @see https://url.spec.whatwg.org/#forbidden-domain-code-point
96+
*/
97+
ada_really_inline constexpr bool contains_forbidden_domain_code_point_or_upper(
98+
const char* input, size_t length) noexcept;
99+
91100
/**
92101
* Checks if the input is a forbidden domain code point.
93102
* @see https://url.spec.whatwg.org/#forbidden-domain-code-point

src/unicode.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,44 @@ ada_really_inline constexpr bool contains_forbidden_domain_code_point(
146146
return accumulator;
147147
}
148148

149+
/**
 * Per-byte classification table, indexed by the unsigned value of a byte.
 * Each entry is a bit set:
 *   - 1: the byte is a forbidden domain code point, or is not ASCII (>= 0x80);
 *   - 2: the byte is an upper case ASCII letter ('A'-'Z');
 *   - 0: anything else.
 * @see https://url.spec.whatwg.org/#forbidden-domain-code-point
 */
constexpr static uint8_t is_forbidden_domain_code_point_table_or_upper[] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
    /* 0x40 */ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0x50 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
    /* 0x80 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x90 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xa0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xb0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xd0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xe0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xf0 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

static_assert(sizeof(is_forbidden_domain_code_point_table_or_upper) == 256);
static_assert(is_forbidden_domain_code_point_table_or_upper[uint8_t('A')] == 2);
static_assert(is_forbidden_domain_code_point_table_or_upper[uint8_t('Z')] == 2);
static_assert(is_forbidden_domain_code_point_table_or_upper[uint8_t('a')] == 0);
static_assert(is_forbidden_domain_code_point_table_or_upper[uint8_t('%')] == 1);

/**
 * ORs together the table classification of every byte in [input,
 * input + length).
 *
 * @param input pointer to the first byte to examine
 * @param length number of bytes to examine (0 is allowed and yields 0)
 * @return a bit set: bit 0 (value 1) is set if a forbidden domain code point
 *         or a non-ASCII byte was seen, bit 1 (value 2) is set if an upper
 *         case ASCII letter was seen. The value is returned as uint8_t: a bool
 *         return type would collapse 1, 2 and 3 into `true`, which makes a
 *         caller's `== 2` comparison impossible to satisfy.
 */
ada_really_inline constexpr uint8_t
contains_forbidden_domain_code_point_or_upper(const char* input,
                                              size_t length) noexcept {
  size_t i = 0;
  uint8_t accumulator{};
  // Main loop, manually unrolled four bytes at a time. There is no early
  // exit: the classifications are simply accumulated.
  for (; i + 4 <= length; i += 4) {
    accumulator |=
        is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i])];
    accumulator |=
        is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 1])];
    accumulator |=
        is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 2])];
    accumulator |=
        is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 3])];
  }
  // Tail: the remaining 0 to 3 bytes.
  for (; i < length; i++) {
    accumulator |=
        is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i])];
  }
  return accumulator;
}
186+
149187
static_assert(unicode::is_forbidden_domain_code_point('%'));
150188
static_assert(unicode::is_forbidden_domain_code_point('\x7f'));
151189
static_assert(unicode::is_forbidden_domain_code_point('\0'));

src/url_aggregator.cpp

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -458,23 +458,50 @@ ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
458458
// to ASCII with domain and false. The most common case is an ASCII input, in
459459
// which case we do not need to call the expensive 'to_ascii' if a few
460460
// conditions are met: no '%' and no 'xn-' subsequence.
461-
std::string _buffer = std::string(input);
462-
// This next function checks that the result is ascii, but we are going to
463-
// to check anyhow with is_forbidden.
464-
// bool is_ascii =
465-
unicode::to_lower_ascii(_buffer.data(), _buffer.size());
466-
bool is_forbidden = unicode::contains_forbidden_domain_code_point(
467-
_buffer.data(), _buffer.size());
468-
if (is_forbidden == 0 && _buffer.find("xn-") == std::string_view::npos) {
461+
462+
// Often, the input does not contain any forbidden code points, and no upper
463+
// case ASCII letter, then we can just copy it to the buffer. We want to
464+
// optimize for such a common case.
465+
uint8_t is_forbidden_or_upper =
466+
unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
467+
input.size());
468+
// Minor optimization opportunity:
469+
// contains_forbidden_domain_code_point_or_upper could be extended to check for
470+
// the presence of characters that cannot appear in the ipv4 address and we
471+
// could also check whether x and n and - are present, and so we could skip
472+
// some of the checks below. However, the gains are likely to be small, and
473+
// the code would be more complex.
474+
if (is_forbidden_or_upper == 0 &&
    input.find("xn-") == std::string_view::npos) {
  // Fast path: no forbidden code point, no upper case ASCII letter and no
  // "xn-" subsequence, so the input can be used as the hostname as is,
  // without making a copy.
  update_base_hostname(input);
  if (checkers::is_ipv4(get_hostname())) {
    ada_log("parse_host fast path ipv4");
    return parse_ipv4(get_hostname());
  }
  ada_log("parse_host fast path ", get_hostname());
  return true;
} else if (is_forbidden_or_upper == 2) {
  // We have encountered at least one upper case ASCII letter (and no
  // forbidden code point), let us convert a copy to lower case. If there is
  // no 'xn-' in the *lower-cased* result, we can use a secondary fast path.
  std::string _buffer = std::string(input);
  unicode::to_lower_ascii(_buffer.data(), _buffer.size());
  // Both the search and the stored hostname must use the lower-cased copy:
  // searching the original input would miss "XN-", and storing it would keep
  // the upper case letters in the hostname.
  if (_buffer.find("xn-") == std::string_view::npos) {
    // secondary fast path when input is not all lower case
    update_base_hostname(_buffer);
    if (checkers::is_ipv4(get_hostname())) {
      ada_log("parse_host fast path ipv4");
      return parse_ipv4(get_hostname());
    }
    ada_log("parse_host fast path ", get_hostname());
    return true;
  }
}
501+
// We have encountered at least one forbidden code point or the input contains
502+
// 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
503+
// conversion.
504+
478505
ada_log("parse_host calling to_ascii");
479506
std::optional<std::string> host = std::string(get_hostname());
480507
is_valid = ada::unicode::to_ascii(host, input, input.find('%'));

0 commit comments

Comments
 (0)