1
- /* auto-generated on 2024-12-17 14:54:59 -0500. Do not edit! */
1
+ /* auto-generated on 2024-12-26 12:42:33 -0500. Do not edit! */
2
2
/* begin file src/simdutf.cpp */
3
3
#include "simdutf.h"
4
4
// We include base64_tables once.
@@ -697,6 +697,15 @@ static_assert(to_base64_url_value[uint8_t('_')] == 63,
697
697
#include <climits>
698
698
#include <type_traits>
699
699
700
// Compile-time checks on the character types: uint8_t must have the size of
// char, uint16_t the size of char16_t, uint32_t the size of char32_t, and a
// byte must be 8 bits wide (CHAR_BIT == 8). A platform that violates any of
// these fails to compile with the corresponding message.
+ static_assert(sizeof(uint8_t) == sizeof(char),
701
+ "simdutf requires that uint8_t be a char");
702
+ static_assert(sizeof(uint16_t) == sizeof(char16_t),
703
+ "simdutf requires that char16_t be 16 bits");
704
+ static_assert(sizeof(uint32_t) == sizeof(char32_t),
705
+ "simdutf requires that char32_t be 32 bits");
706
+ // next line is redundant, but it is kept to catch defective systems.
707
+ static_assert(CHAR_BIT == 8, "simdutf requires 8-bit bytes");
708
+
700
709
// Useful for debugging purposes
701
710
namespace simdutf {
702
711
namespace {
@@ -9746,24 +9755,23 @@ inline simdutf_warn_unused uint16_t swap_bytes(const uint16_t word) {
9746
9755
}
9747
9756
9748
9757
template <endianness big_endian>
9749
- inline simdutf_warn_unused bool validate(const char16_t *buf ,
9758
+ inline simdutf_warn_unused bool validate(const char16_t *data ,
9750
9759
size_t len) noexcept {
9751
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
9752
9760
uint64_t pos = 0;
9753
9761
while (pos < len) {
9754
- uint16_t word =
9762
+ char16_t word =
9755
9763
!match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
9756
9764
if ((word & 0xF800) == 0xD800) {
9757
9765
if (pos + 1 >= len) {
9758
9766
return false;
9759
9767
}
9760
- uint16_t diff = uint16_t (word - 0xD800);
9768
+ char16_t diff = char16_t (word - 0xD800);
9761
9769
if (diff > 0x3FF) {
9762
9770
return false;
9763
9771
}
9764
- uint16_t next_word =
9772
+ char16_t next_word =
9765
9773
!match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
9766
- uint16_t diff2 = uint16_t (next_word - 0xDC00);
9774
+ char16_t diff2 = char16_t (next_word - 0xDC00);
9767
9775
if (diff2 > 0x3FF) {
9768
9776
return false;
9769
9777
}
@@ -9776,24 +9784,23 @@ inline simdutf_warn_unused bool validate(const char16_t *buf,
9776
9784
}
9777
9785
9778
9786
template <endianness big_endian>
9779
- inline simdutf_warn_unused result validate_with_errors(const char16_t *buf ,
9787
+ inline simdutf_warn_unused result validate_with_errors(const char16_t *data ,
9780
9788
size_t len) noexcept {
9781
- const uint16_t *data = reinterpret_cast<const uint16_t *>(buf);
9782
9789
size_t pos = 0;
9783
9790
while (pos < len) {
9784
- uint16_t word =
9791
+ char16_t word =
9785
9792
!match_system(big_endian) ? swap_bytes(data[pos]) : data[pos];
9786
9793
if ((word & 0xF800) == 0xD800) {
9787
9794
if (pos + 1 >= len) {
9788
9795
return result(error_code::SURROGATE, pos);
9789
9796
}
9790
- uint16_t diff = uint16_t (word - 0xD800);
9797
+ char16_t diff = char16_t (word - 0xD800);
9791
9798
if (diff > 0x3FF) {
9792
9799
return result(error_code::SURROGATE, pos);
9793
9800
}
9794
- uint16_t next_word =
9801
+ char16_t next_word =
9795
9802
!match_system(big_endian) ? swap_bytes(data[pos + 1]) : data[pos + 1];
9796
- uint16_t diff2 = uint16_t(next_word - 0xDC00);
9803
+ char16_t diff2 = uint16_t(next_word - 0xDC00);
9797
9804
if (diff2 > 0x3FF) {
9798
9805
return result(error_code::SURROGATE, pos);
9799
9806
}
@@ -9806,24 +9813,22 @@ inline simdutf_warn_unused result validate_with_errors(const char16_t *buf,
9806
9813
}
9807
9814
9808
9815
// Returns how many of the `len` UTF-16 code units are NOT in the range
// 0xDC00..0xDFFF (the low/trailing surrogate range), tested as
// (word & 0xFC00) != 0xDC00. Each unit is passed through swap_bytes() first
// when match_system(big_endian) is false. No validation of the input is
// performed in this function.
template <endianness big_endian>
9809
- inline size_t count_code_points(const char16_t *buf , size_t len) {
9816
+ inline size_t count_code_points(const char16_t *p , size_t len) {
9810
9817
// We are not BOM aware.
9811
- const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
9812
9818
size_t counter{0};
9813
9819
for (size_t i = 0; i < len; i++) {
9814
- uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9820
+ char16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9815
9821
counter += ((word & 0xFC00) != 0xDC00);
9816
9822
}
9817
9823
return counter;
9818
9824
}
9819
9825
9820
9826
template <endianness big_endian>
9821
- inline size_t utf8_length_from_utf16(const char16_t *buf , size_t len) {
9827
+ inline size_t utf8_length_from_utf16(const char16_t *p , size_t len) {
9822
9828
// We are not BOM aware.
9823
- const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
9824
9829
size_t counter{0};
9825
9830
for (size_t i = 0; i < len; i++) {
9826
- uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9831
+ char16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9827
9832
counter++; // ASCII
9828
9833
counter += static_cast<size_t>(
9829
9834
word >
@@ -9835,25 +9840,22 @@ inline size_t utf8_length_from_utf16(const char16_t *buf, size_t len) {
9835
9840
}
9836
9841
9837
9842
// Returns how many of the `len` UTF-16 code units are NOT in the range
// 0xDC00..0xDFFF (the low/trailing surrogate range), tested as
// (word & 0xFC00) != 0xDC00. Each unit is passed through swap_bytes() first
// when match_system(big_endian) is false. Presumably this count is the number
// of UTF-32 units the same text would need (name-based reading; confirm with
// callers). No validation of the input is performed in this function.
template <endianness big_endian>
9838
- inline size_t utf32_length_from_utf16(const char16_t *buf , size_t len) {
9843
+ inline size_t utf32_length_from_utf16(const char16_t *p , size_t len) {
9839
9844
// We are not BOM aware.
9840
- const uint16_t *p = reinterpret_cast<const uint16_t *>(buf);
9841
9845
size_t counter{0};
9842
9846
for (size_t i = 0; i < len; i++) {
9843
- uint16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9847
+ char16_t word = !match_system(big_endian) ? swap_bytes(p[i]) : p[i];
9844
9848
counter += ((word & 0xFC00) != 0xDC00);
9845
9849
}
9846
9850
return counter;
9847
9851
}
9848
9852
9849
9853
// Returns `len` unchanged. Judging by the name, this is the Latin-1 length
// matching a UTF-16 input of `len` code units (presumably one output byte per
// code unit; confirm with callers).
inline size_t latin1_length_from_utf16(size_t len) {
  const size_t latin1_length = len;
  return latin1_length;
}
9850
9854
9851
// Writes `size` 16-bit units to the output buffer; each one is the
// corresponding input unit with its two bytes exchanged
// (input[i] >> 8 | input[i] << 8, truncated back to 16 bits by the cast).
// The output pointer is advanced once per unit written.
- simdutf_really_inline void change_endianness_utf16(const char16_t *in,
9852
- size_t size, char16_t *out) {
9853
- const uint16_t *input = reinterpret_cast<const uint16_t *>(in);
9854
- uint16_t *output = reinterpret_cast<uint16_t *>(out);
9855
+ simdutf_really_inline void
9856
+ change_endianness_utf16(const char16_t *input, size_t size, char16_t *output) {
9855
9857
for (size_t i = 0; i < size; i++) {
9856
- *output++ = uint16_t (input[i] >> 8 | input[i] << 8);
9858
+ *output++ = char16_t (input[i] >> 8 | input[i] << 8);
9857
9859
}
9858
9860
}
9859
9861
@@ -21042,6 +21044,9 @@ struct validating_transcoder {
21042
21044
uint64_t utf8_continuation_mask =
21043
21045
input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
21044
21046
// this case, we also have ASCII to account for.
21047
+ if (utf8_continuation_mask & 1) {
21048
+ return 0; // error
21049
+ }
21045
21050
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
21046
21051
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
21047
21052
// We process in blocks of up to 12 bytes except possibly
@@ -26717,6 +26722,14 @@ compress_decode_base64(char *dst, const chartype *src, size_t srclen,
26717
26722
}
26718
26723
26719
26724
if (!ignore_garbage && equalsigns > 0) {
26725
+ if (last_chunk_options == last_chunk_handling_options::strict) {
26726
+ return {BASE64_INPUT_REMAINDER, size_t(src - srcinit),
26727
+ size_t(dst - dstinit)};
26728
+ }
26729
+ if (last_chunk_options ==
26730
+ last_chunk_handling_options::stop_before_partial) {
26731
+ return {SUCCESS, size_t(src - srcinit), size_t(dst - dstinit)};
26732
+ }
26720
26733
if ((size_t(dst - dstinit) % 3 == 0) ||
26721
26734
((size_t(dst - dstinit) % 3) + 1 + equalsigns != 4)) {
26722
26735
return {INVALID_BASE64_CHARACTER, equallocation, size_t(dst - dstinit)};
@@ -33161,6 +33174,9 @@ struct validating_transcoder {
33161
33174
uint64_t utf8_continuation_mask =
33162
33175
input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
33163
33176
// this case, we also have ASCII to account for.
33177
+ if (utf8_continuation_mask & 1) {
33178
+ return 0; // error
33179
+ }
33164
33180
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
33165
33181
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
33166
33182
// We process in blocks of up to 12 bytes except possibly
@@ -43013,6 +43029,9 @@ struct validating_transcoder {
43013
43029
uint64_t utf8_continuation_mask =
43014
43030
input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
43015
43031
// this case, we also have ASCII to account for.
43032
+ if (utf8_continuation_mask & 1) {
43033
+ return 0; // error
43034
+ }
43016
43035
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
43017
43036
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
43018
43037
// We process in blocks of up to 12 bytes except possibly
@@ -48110,6 +48129,9 @@ struct validating_transcoder {
48110
48129
uint64_t utf8_continuation_mask =
48111
48130
input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
48112
48131
// this case, we also have ASCII to account for.
48132
+ if (utf8_continuation_mask & 1) {
48133
+ return 0; // error
48134
+ }
48113
48135
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
48114
48136
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
48115
48137
// We process in blocks of up to 12 bytes except possibly
@@ -54454,6 +54476,9 @@ struct validating_transcoder {
54454
54476
uint64_t utf8_continuation_mask =
54455
54477
input.lt(-65 + 1); // -64 is 1100 0000 in twos complement. Note: in
54456
54478
// this case, we also have ASCII to account for.
54479
+ if (utf8_continuation_mask & 1) {
54480
+ return 0; // error
54481
+ }
54457
54482
uint64_t utf8_leading_mask = ~utf8_continuation_mask;
54458
54483
uint64_t utf8_end_of_code_point_mask = utf8_leading_mask >> 1;
54459
54484
// We process in blocks of up to 12 bytes except possibly
0 commit comments