1
- /* auto-generated on 2023-03-30 17:00:48 -0400. Do not edit! */
1
+ /* auto-generated on 2023-04-17 12:20:41 -0400. Do not edit! */
2
2
/* begin file src/ada.cpp */
3
3
#include "ada.h"
4
4
/* begin file src/checkers.cpp */
@@ -2753,7 +2753,7 @@ bool ascii_has_upper_case(char* input, size_t length) {
2753
2753
auto broadcast = [](uint8_t v) -> uint64_t { return 0x101010101010101 * v; };
2754
2754
uint64_t broadcast_80 = broadcast(0x80);
2755
2755
uint64_t broadcast_Ap = broadcast(128 - 'A');
2756
- uint64_t broadcast_Zp = broadcast(128 - 'Z');
2756
+ uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1 );
2757
2757
size_t i = 0;
2758
2758
2759
2759
uint64_t runner{0};
@@ -2775,7 +2775,7 @@ void ascii_map(char* input, size_t length) {
2775
2775
auto broadcast = [](uint8_t v) -> uint64_t { return 0x101010101010101 * v; };
2776
2776
uint64_t broadcast_80 = broadcast(0x80);
2777
2777
uint64_t broadcast_Ap = broadcast(128 - 'A');
2778
- uint64_t broadcast_Zp = broadcast(128 - 'Z');
2778
+ uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1 );
2779
2779
size_t i = 0;
2780
2780
2781
2781
for (; i + 7 < length; i += 8) {
@@ -9845,7 +9845,7 @@ constexpr bool to_lower_ascii(char* input, size_t length) noexcept {
9845
9845
auto broadcast = [](uint8_t v) -> uint64_t { return 0x101010101010101 * v; };
9846
9846
uint64_t broadcast_80 = broadcast(0x80);
9847
9847
uint64_t broadcast_Ap = broadcast(128 - 'A');
9848
- uint64_t broadcast_Zp = broadcast(128 - 'Z');
9848
+ uint64_t broadcast_Zp = broadcast(128 - 'Z' - 1 );
9849
9849
uint64_t non_ascii = 0;
9850
9850
size_t i = 0;
9851
9851
@@ -9961,7 +9961,7 @@ ada_really_inline constexpr bool is_forbidden_domain_code_point(
9961
9961
}
9962
9962
9963
9963
ada_really_inline constexpr bool contains_forbidden_domain_code_point(
9964
- char* input, size_t length) noexcept {
9964
+ const char* input, size_t length) noexcept {
9965
9965
size_t i = 0;
9966
9966
uint8_t accumulator{};
9967
9967
for (; i + 4 <= length; i += 4) {
@@ -9976,6 +9976,44 @@ ada_really_inline constexpr bool contains_forbidden_domain_code_point(
9976
9976
return accumulator;
9977
9977
}
9978
9978
9979
// Per-byte classification table, indexed by the unsigned byte value:
//   0 -> byte needs no special handling,
//   1 -> byte is flagged as forbidden (0x00-0x20, # % / : < > ? @ [ \ ] ^ |,
//        0x7F and every byte >= 0x80),
//   2 -> ASCII upper-case letter 'A'..'Z'.
// Laid out 16 entries per row so that each row is one hexadecimal "column".
static constexpr uint8_t is_forbidden_domain_code_point_table_or_upper[] = {
    // 0x00 - 0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x10 - 0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x20 - 0x2F: space ! " # $ % & ' ( ) * + , - . /
    1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
    // 0x30 - 0x3F: 0-9 : ; < = > ?
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
    // 0x40 - 0x4F: @ A-O
    1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0x50 - 0x5F: P-Z [ \ ] ^ _
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0,
    // 0x60 - 0x6F: ` a-o
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x70 - 0x7F: p-z { | } ~ DEL
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
    // 0x80 - 0xFF: every non-ASCII byte is flagged
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};

// One entry per possible byte value, and both ends of the upper-case range
// carry the "upper case" marker.
static_assert(sizeof(is_forbidden_domain_code_point_table_or_upper) == 256);
static_assert(is_forbidden_domain_code_point_table_or_upper[uint8_t('A')] == 2);
static_assert(is_forbidden_domain_code_point_table_or_upper[uint8_t('Z')] == 2);
9995
+
9996
+ ada_really_inline constexpr bool contains_forbidden_domain_code_point_or_upper(
9997
+ const char* input, size_t length) noexcept {
9998
+ size_t i = 0;
9999
+ uint8_t accumulator{};
10000
+ for (; i + 4 <= length; i += 4) {
10001
+ accumulator |=
10002
+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i])];
10003
+ accumulator |=
10004
+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 1])];
10005
+ accumulator |=
10006
+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 2])];
10007
+ accumulator |=
10008
+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i + 3])];
10009
+ }
10010
+ for (; i < length; i++) {
10011
+ accumulator |=
10012
+ is_forbidden_domain_code_point_table_or_upper[uint8_t(input[i])];
10013
+ }
10014
+ return accumulator;
10015
+ }
10016
+
9979
10017
static_assert(unicode::is_forbidden_domain_code_point('%'));
9980
10018
static_assert(unicode::is_forbidden_domain_code_point('\x7f'));
9981
10019
static_assert(unicode::is_forbidden_domain_code_point('\0'));
@@ -13473,23 +13511,50 @@ ada_really_inline bool url_aggregator::parse_host(std::string_view input) {
13473
13511
// to ASCII with domain and false. The most common case is an ASCII input, in
13474
13512
// which case we do not need to call the expensive 'to_ascii' if a few
13475
13513
// conditions are met: no '%' and no 'xn-' subsequence.
13476
- std::string _buffer = std::string(input);
13477
- // This next function checks that the result is ascii, but we are going to
13478
- // to check anyhow with is_forbidden.
13479
- // bool is_ascii =
13480
- unicode::to_lower_ascii(_buffer.data(), _buffer.size());
13481
- bool is_forbidden = unicode::contains_forbidden_domain_code_point(
13482
- _buffer.data(), _buffer.size());
13483
- if (is_forbidden == 0 && _buffer.find("xn-") == std::string_view::npos) {
13514
+
13515
+ // Often, the input does not contain any forbidden code points, and no upper
13516
+ // case ASCII letter, then we can just copy it to the buffer. We want to
13517
+ // optimize for such a common case.
13518
+ uint8_t is_forbidden_or_upper =
13519
+ unicode::contains_forbidden_domain_code_point_or_upper(input.data(),
13520
+ input.size());
13521
+ // Minor optimization opportunity:
13522
+ // contains_forbidden_domain_code_point_or_upper could be extended to check for
13523
+ // the presence of characters that cannot appear in the ipv4 address and we
13524
+ // could also check whether x and n and - are present, and so we could skip
13525
+ // some of the checks below. However, the gains are likely to be small, and
13526
+ // the code would be more complex.
13527
+ if (is_forbidden_or_upper == 0 &&
13528
+ input.find("xn-") == std::string_view::npos) {
13484
13529
// fast path
13485
- update_base_hostname(_buffer );
13530
+ update_base_hostname(input );
13486
13531
if (checkers::is_ipv4(get_hostname())) {
13487
13532
ada_log("parse_host fast path ipv4");
13488
13533
return parse_ipv4(get_hostname());
13489
13534
}
13490
13535
ada_log("parse_host fast path ", get_hostname());
13491
13536
return true;
13537
+ } else if (is_forbidden_or_upper == 2) {
13538
+ // We have encountered at least one upper case ASCII letter, let us
13539
+ // try to convert it to lower case. If there is no 'xn-' in the result,
13540
+ // we can then use a secondary fast path.
13541
+ std::string _buffer = std::string(input);
13542
+ unicode::to_lower_ascii(_buffer.data(), _buffer.size());
13543
+ if (input.find("xn-") == std::string_view::npos) {
13544
+ // secondary fast path when input is not all lower case
13545
+ update_base_hostname(input);
13546
+ if (checkers::is_ipv4(get_hostname())) {
13547
+ ada_log("parse_host fast path ipv4");
13548
+ return parse_ipv4(get_hostname());
13549
+ }
13550
+ ada_log("parse_host fast path ", get_hostname());
13551
+ return true;
13552
+ }
13492
13553
}
13554
+ // We have encountered at least one forbidden code point or the input contains
13555
+ // 'xn-' (case insensitive), so we need to call 'to_ascii' to perform the full
13556
+ // conversion.
13557
+
13493
13558
ada_log("parse_host calling to_ascii");
13494
13559
std::optional<std::string> host = std::string(get_hostname());
13495
13560
is_valid = ada::unicode::to_ascii(host, input, input.find('%'));
0 commit comments