Skip to content

Commit 8191e1f

Browse files
authored
src: improve utf8 string generation performance
PR-URL: #54873
Reviewed-By: Daniel Lemire <daniel@lemire.me>
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
Reviewed-By: Stephen Belanger <admin@stephenbelanger.com>
1 parent 305137f commit 8191e1f

File tree

2 files changed

+47
-21
lines changed

2 files changed

+47
-21
lines changed

src/string_bytes.cc

+22 -18
Original file line number | Diff line number | Diff line change
@@ -388,47 +388,47 @@ Maybe<size_t> StringBytes::StorageSize(Isolate* isolate,
388388
Local<Value> val,
389389
enum encoding encoding) {
390390
HandleScope scope(isolate);
391-
size_t data_size = 0;
392-
bool is_buffer = Buffer::HasInstance(val);
393391

394-
if (is_buffer && (encoding == BUFFER || encoding == LATIN1)) {
392+
if (Buffer::HasInstance(val) && (encoding == BUFFER || encoding == LATIN1)) {
395393
return Just(Buffer::Length(val));
396394
}
397395

398396
Local<String> str;
399397
if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str))
400398
return Nothing<size_t>();
399+
String::ValueView view(isolate, str);
400+
size_t data_size = 0;
401401

402402
switch (encoding) {
403403
case ASCII:
404404
case LATIN1:
405-
data_size = str->Length();
405+
data_size = view.length();
406406
break;
407407

408408
case BUFFER:
409409
case UTF8:
410410
// A single UCS2 codepoint never takes up more than 3 utf8 bytes.
411411
// It is an exercise for the caller to decide when a string is
412412
// long enough to justify calling Size() instead of StorageSize()
413-
data_size = 3 * str->Length();
413+
data_size = 3 * view.length();
414414
break;
415415

416416
case UCS2:
417-
data_size = str->Length() * sizeof(uint16_t);
417+
data_size = view.length() * sizeof(uint16_t);
418418
break;
419419

420420
case BASE64URL:
421-
data_size = simdutf::base64_length_from_binary(str->Length(),
421+
data_size = simdutf::base64_length_from_binary(view.length(),
422422
simdutf::base64_url);
423423
break;
424424

425425
case BASE64:
426-
data_size = simdutf::base64_length_from_binary(str->Length());
426+
data_size = simdutf::base64_length_from_binary(view.length());
427427
break;
428428

429429
case HEX:
430-
CHECK(str->Length() % 2 == 0 && "invalid hex string length");
431-
data_size = str->Length() / 2;
430+
CHECK(view.length() % 2 == 0 && "invalid hex string length");
431+
data_size = view.length() / 2;
432432
break;
433433

434434
default:
@@ -449,32 +449,36 @@ Maybe<size_t> StringBytes::Size(Isolate* isolate,
449449
Local<String> str;
450450
if (!val->ToString(isolate->GetCurrentContext()).ToLocal(&str))
451451
return Nothing<size_t>();
452+
String::ValueView view(isolate, str);
452453

453454
switch (encoding) {
454455
case ASCII:
455456
case LATIN1:
456-
return Just<size_t>(str->Length());
457+
return Just<size_t>(view.length());
457458

458459
case BUFFER:
459460
case UTF8:
460-
return Just<size_t>(str->Utf8Length(isolate));
461+
if (view.is_one_byte()) {
462+
return Just<size_t>(simdutf::utf8_length_from_latin1(
463+
reinterpret_cast<const char*>(view.data8()), view.length()));
464+
}
465+
return Just<size_t>(simdutf::utf8_length_from_utf16(
466+
reinterpret_cast<const char16_t*>(view.data16()), view.length()));
461467

462468
case UCS2:
463-
return Just(str->Length() * sizeof(uint16_t));
469+
return Just(view.length() * sizeof(uint16_t));
464470

465471
case BASE64URL: {
466-
String::Value value(isolate, str);
467-
return Just(simdutf::base64_length_from_binary(value.length(),
472+
return Just(simdutf::base64_length_from_binary(view.length(),
468473
simdutf::base64_url));
469474
}
470475

471476
case BASE64: {
472-
String::Value value(isolate, str);
473-
return Just(simdutf::base64_length_from_binary(value.length()));
477+
return Just(simdutf::base64_length_from_binary(view.length()));
474478
}
475479

476480
case HEX:
477-
return Just<size_t>(str->Length() / 2);
481+
return Just<size_t>(view.length() / 2);
478482
}
479483

480484
UNREACHABLE();

src/util.cc

+25 -3
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,8 @@
4848
#include <sys/types.h>
4949
#endif
5050

51+
#include <simdutf.h>
52+
5153
#include <atomic>
5254
#include <cstdio>
5355
#include <cstring>
@@ -100,11 +102,31 @@ static void MakeUtf8String(Isolate* isolate,
100102
MaybeStackBuffer<T>* target) {
101103
Local<String> string;
102104
if (!value->ToString(isolate->GetCurrentContext()).ToLocal(&string)) return;
105+
String::ValueView value_view(isolate, string);
106+
107+
auto value_length = value_view.length();
108+
109+
if (value_view.is_one_byte()) {
110+
auto const_char = reinterpret_cast<const char*>(value_view.data8());
111+
auto expected_length =
112+
target->capacity() < (static_cast<size_t>(value_length) * 2 + 1)
113+
? simdutf::utf8_length_from_latin1(const_char, value_length)
114+
: value_length * 2;
115+
116+
// Add +1 for null termination.
117+
target->AllocateSufficientStorage(expected_length + 1);
118+
const auto actual_length = simdutf::convert_latin1_to_utf8(
119+
const_char, value_length, target->out());
120+
target->SetLengthAndZeroTerminate(actual_length);
121+
return;
122+
}
103123

104-
size_t storage;
105-
if (!StringBytes::StorageSize(isolate, string, UTF8).To(&storage)) return;
106-
storage += 1;
124+
// Add +1 for null termination.
125+
size_t storage = (3 * value_length) + 1;
107126
target->AllocateSufficientStorage(storage);
127+
128+
// TODO(@anonrig): Use simdutf to speed up non-one-byte strings once it's
129+
// implemented
108130
const int flags =
109131
String::NO_NULL_TERMINATION | String::REPLACE_INVALID_UTF8;
110132
const int length =

0 commit comments

Comments
 (0)