Skip to content

Commit b241a1d

Browse files
deps: update simdutf to 4.0.9
PR-URL: #51655 Reviewed-By: Luigi Pinca <luigipinca@gmail.com> Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com> Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com> Reviewed-By: Ulises Gascón <ulisesgascongonzalez@gmail.com>
1 parent 8f9f5db commit b241a1d

File tree

2 files changed

+40
-42
lines changed

2 files changed

+40
-42
lines changed

deps/simdutf/simdutf.cpp

+36-38
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on 2023-12-01 13:59:01 -0500. Do not edit! */
1+
/* auto-generated on 2024-01-29 10:40:15 -0500. Do not edit! */
22
/* begin file src/simdutf.cpp */
33
#include "simdutf.h"
44
/* begin file src/implementation.cpp */
@@ -1151,7 +1151,7 @@ namespace icelake {
11511151
// We should not get warnings while including <x86intrin.h> yet we do
11521152
// under some versions of GCC.
11531153
// If the x86intrin.h header has uninitialized values that are problematic,
1154-
// it is a GCC issue, we want to ignore these warnigns.
1154+
// it is a GCC issue, we want to ignore these warnings.
11551155
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
11561156
#endif
11571157

@@ -1568,7 +1568,7 @@ class implementation final : public simdutf::implementation {
15681568
// We should not get warnings while including <x86intrin.h> yet we do
15691569
// under some versions of GCC.
15701570
// If the x86intrin.h header has uninitialized values that are problematic,
1571-
// it is a GCC issue, we want to ignore these warnigns.
1571+
// it is a GCC issue, we want to ignore these warnings.
15721572
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
15731573
#endif
15741574

@@ -2498,7 +2498,7 @@ class implementation final : public simdutf::implementation {
24982498
// We should not get warnings while including <x86intrin.h> yet we do
24992499
// under some versions of GCC.
25002500
// If the x86intrin.h header has uninitialized values that are problematic,
2501-
// it is a GCC issue, we want to ignore these warnigns.
2501+
// it is a GCC issue, we want to ignore these warnings.
25022502
SIMDUTF_DISABLE_GCC_WARNING(-Wuninitialized)
25032503
#endif
25042504

@@ -11655,7 +11655,7 @@ inline result convert_with_errors(const char* buf, size_t len, char16_t* utf16_o
1165511655
*
1165611656
* The caller is responsible to ensure that len > 0.
1165711657
*
11658-
* If the error is believed to have occured prior to 'buf', the count value contain in the result
11658+
* If the error is believed to have occurred prior to 'buf', the count value contained in the result
1165911659
* will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
1166011660
*/
1166111661
template <endianness endian>
@@ -11934,7 +11934,7 @@ inline result convert_with_errors(const char* buf, size_t len, char32_t* utf32_o
1193411934
*
1193511935
* The caller is responsible to ensure that len > 0.
1193611936
*
11937-
* If the error is believed to have occured prior to 'buf', the count value contain in the result
11937+
* If the error is believed to have occurred prior to 'buf', the count value contained in the result
1193811938
* will be SIZE_T - 1, SIZE_T - 2, or SIZE_T - 3.
1193911939
*/
1194011940
inline result rewind_and_convert_with_errors(size_t prior_bytes, const char* buf, size_t len, char32_t* utf32_output) {
@@ -16084,11 +16084,11 @@ using namespace simd;
1608416084
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
1608516085
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
1608616086
// much more than 8 bytes. However, you cannot generally assume that you have valid
16087-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
16087+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
1608816088
// to give us a good margin.
1608916089
size_t leading_byte = 0;
1609016090
size_t margin = size;
16091-
for(; margin > 0 && leading_byte < 4; margin--) {
16091+
for(; margin > 0 && leading_byte < 8; margin--) {
1609216092
leading_byte += (int8_t(in[margin-1]) > -65);
1609316093
}
1609416094
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.
@@ -16158,11 +16158,11 @@ using namespace simd;
1615816158
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
1615916159
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
1616016160
// much more than 8 bytes. However, you cannot generally assume that you have valid
16161-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
16161+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
1616216162
// to give us a good margin.
1616316163
size_t leading_byte = 0;
1616416164
size_t margin = size;
16165-
for(; margin > 0 && leading_byte < 4; margin--) {
16165+
for(; margin > 0 && leading_byte < 8; margin--) {
1616616166
leading_byte += (int8_t(in[margin-1]) > -65);
1616716167
}
1616816168
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.
@@ -17930,7 +17930,7 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t
1793017930
__m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
1793117931
indexofsecondlastbytes); // indices of the third last bytes
1793217932
__m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
17933-
clearedbytes); // only those that are the third last byte of a sequece
17933+
clearedbytes); // only those that are the third last byte of a sequence
1793417934
__m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
1793517935
thirdlastbyte); // the third last bytes (of three byte sequences, hi
1793617936
// surrogate)
@@ -17992,7 +17992,7 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t
1799217992
__m512i indexofthirdlastbytes = _mm512_add_epi16(mask_ffffffff,
1799317993
indexofsecondlastbytes); // indices of the third last bytes
1799417994
__m512i thirdlastbyte = _mm512_maskz_mov_epi8(m34,
17995-
clearedbytes); // only those that are the third last byte of a sequece
17995+
clearedbytes); // only those that are the third last byte of a sequence
1799617996
__m512i thirdlastbytes = _mm512_maskz_permutexvar_epi8(0x5555555555555555, indexofthirdlastbytes,
1799717997
thirdlastbyte); // the third last bytes (of three byte sequences, hi
1799817998
// surrogate)
@@ -18048,7 +18048,7 @@ simdutf_really_inline bool process_block_utf8_to_utf16(const char *&in, char16_t
1804818048
}
1804918049
// Fast path 2: all ASCII or 2 byte
1805018050
__mmask64 continuation_or_ascii = (tail == SIMDUTF_FULL) ? _knot_mask64(m234) : _kand_mask64(_knot_mask64(m234), b);
18051-
// on top of -0xc0 we substract -2 which we get back later of the
18051+
// on top of -0xc0 we subtract -2 which we get back later of the
1805218052
// continuation byte tags
1805318053
__m512i leading2byte = _mm512_maskz_sub_epi8(m234, input, mask_c2c2c2c2);
1805418054
__mmask64 leading = tail == (tail == SIMDUTF_FULL) ? _kor_mask64(m1, m234) : _kand_mask64(_kor_mask64(m1, m234), b); // first bytes of each sequence
@@ -18296,7 +18296,7 @@ __m512i rotate_by_N_epi8(const __m512i input) {
1829618296
stored at separate 32-bit lanes.
1829718297

1829818298
For each lane we have also a character class (`char_class`), given in form
18299-
0x8080800N, where N is 4 higest bits from the leading byte; 0x80 resets
18299+
0x8080800N, where N is the 4 highest bits from the leading byte; 0x80 resets
1830018300
corresponding bytes during pshufb.
1830118301
*/
1830218302
simdutf_really_inline __m512i expanded_utf8_to_utf32(__m512i char_class, __m512i utf8) {
@@ -19214,7 +19214,7 @@ simdutf_really_inline size_t process_block_from_utf8_to_latin1(const char *buf,
1921419214
// _mm512_storeu_si512((__m512i *)latin_output, output); I tried using
1921519215
// _mm512_storeu_si512 and have the next process_block start from the
1921619216
// "written_out" point but the compiler shuffles memory in such a way that it
19217-
// is signifcantly slower...
19217+
// is significantly slower...
1921819218
// ****************************
1921919219
_mm512_mask_storeu_epi8((__m512i *)latin_output, store_mask, output);
1922019220

@@ -22028,10 +22028,9 @@ simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd
2202822028
}
2202922029

2203022030
simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
22031-
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
22032-
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
22033-
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
22034-
return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
22031+
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80
22032+
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80
22033+
return simd8<bool>(is_third_byte | is_fourth_byte);
2203522034
}
2203622035

2203722036
/* begin file src/haswell/avx2_detect_encodings.cpp */
@@ -25495,11 +25494,11 @@ using namespace simd;
2549525494
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
2549625495
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
2549725496
// much more than 8 bytes. However, you cannot generally assume that you have valid
25498-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
25497+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
2549925498
// to give us a good margin.
2550025499
size_t leading_byte = 0;
2550125500
size_t margin = size;
25502-
for(; margin > 0 && leading_byte < 4; margin--) {
25501+
for(; margin > 0 && leading_byte < 8; margin--) {
2550325502
leading_byte += (int8_t(in[margin-1]) > -65);
2550425503
}
2550525504
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.
@@ -25569,11 +25568,11 @@ using namespace simd;
2556925568
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
2557025569
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
2557125570
// much more than 8 bytes. However, you cannot generally assume that you have valid
25572-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
25571+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
2557325572
// to give us a good margin.
2557425573
size_t leading_byte = 0;
2557525574
size_t margin = size;
25576-
for(; margin > 0 && leading_byte < 4; margin--) {
25575+
for(; margin > 0 && leading_byte < 8; margin--) {
2557725576
leading_byte += (int8_t(in[margin-1]) > -65);
2557825577
}
2557925578
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.
@@ -26887,10 +26886,10 @@ simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd
2688726886
}
2688826887

2688926888
simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
26890-
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
26891-
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
26889+
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80
26890+
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80
2689226891
// Caller requires a bool (all 1's). NOTE(review): the signed comparison this comment used to justify was replaced by a direct simd8<bool> conversion; after the subtraction, matching lead bytes are the values >= 0x80.
26893-
return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
26892+
return simd8<bool>(is_third_byte | is_fourth_byte);
2689426893
}
2689526894

2689626895
} // unnamed namespace
@@ -27867,11 +27866,11 @@ using namespace simd;
2786727866
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
2786827867
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
2786927868
// much more than 8 bytes. However, you cannot generally assume that you have valid
27870-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27869+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
2787127870
// to give us a good margin.
2787227871
size_t leading_byte = 0;
2787327872
size_t margin = size;
27874-
for(; margin > 0 && leading_byte < 4; margin--) {
27873+
for(; margin > 0 && leading_byte < 8; margin--) {
2787527874
leading_byte += (int8_t(in[margin-1]) > -65);
2787627875
}
2787727876
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.
@@ -27941,11 +27940,11 @@ using namespace simd;
2794127940
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
2794227941
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
2794327942
// much more than 8 bytes. However, you cannot generally assume that you have valid
27944-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
27943+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
2794527944
// to give us a good margin.
2794627945
size_t leading_byte = 0;
2794727946
size_t margin = size;
27948-
for(; margin > 0 && leading_byte < 4; margin--) {
27947+
for(; margin > 0 && leading_byte < 8; margin--) {
2794927948
leading_byte += (int8_t(in[margin-1]) > -65);
2795027949
}
2795127950
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.
@@ -28407,10 +28406,9 @@ simdutf_unused simdutf_really_inline simd8<bool> must_be_continuation(const simd
2840728406
}
2840828407

2840928408
simdutf_really_inline simd8<bool> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
28410-
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0b11100000u-1); // Only 111_____ will be > 0
28411-
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0b11110000u-1); // Only 1111____ will be > 0
28412-
// Caller requires a bool (all 1's). All values resulting from the subtraction will be <= 64, so signed comparison is fine.
28413-
return simd8<int8_t>(is_third_byte | is_fourth_byte) > int8_t(0);
28409+
simd8<uint8_t> is_third_byte = prev2.saturating_sub(0xe0u-0x80); // Only 111_____ will be >= 0x80
28410+
simd8<uint8_t> is_fourth_byte = prev3.saturating_sub(0xf0u-0x80); // Only 1111____ will be >= 0x80
28411+
return simd8<bool>(is_third_byte | is_fourth_byte);
2841428412
}
2841528413

2841628414
/* begin file src/westmere/internal/loader.cpp */
@@ -31874,11 +31872,11 @@ using namespace simd;
3187431872
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
3187531873
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
3187631874
// much more than 8 bytes. However, you cannot generally assume that you have valid
31877-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
31875+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
3187831876
// to give us a good margin.
3187931877
size_t leading_byte = 0;
3188031878
size_t margin = size;
31881-
for(; margin > 0 && leading_byte < 4; margin--) {
31879+
for(; margin > 0 && leading_byte < 8; margin--) {
3188231880
leading_byte += (int8_t(in[margin-1]) > -65);
3188331881
}
3188431882
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.
@@ -31948,11 +31946,11 @@ using namespace simd;
3194831946
// 8 bytes when calling convert_masked_utf8_to_utf32. If you skip the last 16 bytes,
3194931947
// and if the data is valid, then it is entirely safe because 16 UTF-8 bytes generate
3195031948
// much more than 8 bytes. However, you cannot generally assume that you have valid
31951-
// UTF-8 input, so we are going to go back from the end counting 4 leading bytes,
31949+
// UTF-8 input, so we are going to go back from the end counting 8 leading bytes,
3195231950
// to give us a good margin.
3195331951
size_t leading_byte = 0;
3195431952
size_t margin = size;
31955-
for(; margin > 0 && leading_byte < 4; margin--) {
31953+
for(; margin > 0 && leading_byte < 8; margin--) {
3195631954
leading_byte += (int8_t(in[margin-1]) > -65);
3195731955
}
3195831956
// If the input is long enough, then we have that margin-1 is the eighth last leading byte.

deps/simdutf/simdutf.h

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* auto-generated on 2023-12-01 13:59:01 -0500. Do not edit! */
1+
/* auto-generated on 2024-01-29 10:40:15 -0500. Do not edit! */
22
/* begin file include/simdutf.h */
33
#ifndef SIMDUTF_H
44
#define SIMDUTF_H
@@ -567,7 +567,7 @@ SIMDUTF_DISABLE_UNDESIRED_WARNINGS
567567
#define SIMDUTF_SIMDUTF_VERSION_H
568568

569569
/** The version of simdutf being used (major.minor.revision) */
570-
#define SIMDUTF_VERSION "4.0.8"
570+
#define SIMDUTF_VERSION "4.0.9"
571571

572572
namespace simdutf {
573573
enum {
@@ -582,7 +582,7 @@ enum {
582582
/**
583583
* The revision (major.minor.REVISION) of simdutf being used.
584584
*/
585-
SIMDUTF_VERSION_REVISION = 8
585+
SIMDUTF_VERSION_REVISION = 9
586586
};
587587
} // namespace simdutf
588588

@@ -874,7 +874,7 @@ simdutf_really_inline simdutf_warn_unused simdutf::encoding_type autodetect_enco
874874
* E.g., if the input might be UTF-16LE or UTF-8, this function returns
875875
* the value (simdutf::encoding_type::UTF8 | simdutf::encoding_type::UTF16_LE).
876876
*
877-
* Overriden by each implementation.
877+
* Overridden by each implementation.
878878
*
879879
* @param input the string to analyze.
880880
* @param length the length of the string in bytes.

0 commit comments

Comments
 (0)