Skip to content

Commit 9eccd7d

Browse files
mertcanaltin authored and targos committed
util: add fast path for Latin1 decoding
PR-URL: #55275 Reviewed-By: Rafael Gonzaga <rafael.nunu@hotmail.com> Reviewed-By: Yagiz Nizipli <yagiz@nizipli.com> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Daniel Lemire <daniel@lemire.me>
1 parent 52dfe5a commit 9eccd7d

File tree

5 files changed

+212
-2
lines changed

5 files changed

+212
-2
lines changed

benchmark/util/text-decoder.js

+1-1
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
const common = require('../common.js');
44

55
const bench = common.createBenchmark(main, {
6-
encoding: ['utf-8', 'latin1', 'iso-8859-3'],
6+
encoding: ['utf-8', 'windows-1252', 'iso-8859-3'],
77
ignoreBOM: [0, 1],
88
fatal: [0, 1],
99
len: [256, 1024 * 16, 1024 * 128],

lib/internal/encoding.js

+9-1
Original file line number | Diff line number | Diff line change
@@ -29,6 +29,7 @@ const kDecoder = Symbol('decoder');
2929
const kEncoder = Symbol('encoder');
3030
const kFatal = Symbol('kFatal');
3131
const kUTF8FastPath = Symbol('kUTF8FastPath');
32+
const kLatin1FastPath = Symbol('kLatin1FastPath');
3233
const kIgnoreBOM = Symbol('kIgnoreBOM');
3334

3435
const {
@@ -55,6 +56,7 @@ const {
5556
encodeIntoResults,
5657
encodeUtf8String,
5758
decodeUTF8,
59+
decodeLatin1,
5860
} = binding;
5961

6062
const { Buffer } = require('buffer');
@@ -419,9 +421,10 @@ function makeTextDecoderICU() {
419421
this[kFatal] = Boolean(options?.fatal);
420422
// Only support fast paths for UTF-8 and windows-1252.
421423
this[kUTF8FastPath] = enc === 'utf-8';
424+
this[kLatin1FastPath] = enc === 'windows-1252';
422425
this[kHandle] = undefined;
423426

424-
if (!this[kUTF8FastPath]) {
427+
if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
425428
this.#prepareConverter();
426429
}
427430
}
@@ -438,11 +441,16 @@ function makeTextDecoderICU() {
438441
validateDecoder(this);
439442

440443
this[kUTF8FastPath] &&= !(options?.stream);
444+
this[kLatin1FastPath] &&= !(options?.stream);
441445

442446
if (this[kUTF8FastPath]) {
443447
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
444448
}
445449

450+
if (this[kLatin1FastPath]) {
451+
return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]);
452+
}
453+
446454
this.#prepareConverter();
447455

448456
validateObject(options, 'options', kValidateObjectAllowObjectsAndNull);

src/encoding_binding.cc

+46
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#include "encoding_binding.h"
22
#include "ada.h"
33
#include "env-inl.h"
4+
#include "node_buffer.h"
45
#include "node_errors.h"
56
#include "node_external_reference.h"
67
#include "simdutf.h"
@@ -226,6 +227,7 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
226227
SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
227228
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
228229
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
230+
SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
229231
}
230232

231233
void BindingData::CreatePerContextProperties(Local<Object> target,
@@ -243,6 +245,50 @@ void BindingData::RegisterTimerExternalReferences(
243245
registry->Register(DecodeUTF8);
244246
registry->Register(ToASCII);
245247
registry->Register(ToUnicode);
248+
registry->Register(DecodeLatin1);
249+
}
250+
251+
void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
252+
Environment* env = Environment::GetCurrent(args);
253+
254+
CHECK_GE(args.Length(), 1);
255+
if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
256+
args[0]->IsArrayBufferView())) {
257+
return node::THROW_ERR_INVALID_ARG_TYPE(
258+
env->isolate(),
259+
"The \"input\" argument must be an instance of ArrayBuffer, "
260+
"SharedArrayBuffer, or ArrayBufferView.");
261+
}
262+
263+
bool ignore_bom = args[1]->IsTrue();
264+
bool has_fatal = args[2]->IsTrue();
265+
266+
ArrayBufferViewContents<uint8_t> buffer(args[0]);
267+
const uint8_t* data = buffer.data();
268+
size_t length = buffer.length();
269+
270+
if (ignore_bom && length > 0 && data[0] == 0xFF) {
271+
data++;
272+
length--;
273+
}
274+
275+
if (length == 0) {
276+
return args.GetReturnValue().SetEmptyString();
277+
}
278+
279+
std::string result(length * 2, '\0');
280+
281+
size_t written = simdutf::convert_latin1_to_utf8(
282+
reinterpret_cast<const char*>(data), length, result.data());
283+
284+
if (has_fatal && written == 0) {
285+
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
286+
env->isolate(), "The encoded data was not valid for encoding latin1");
287+
}
288+
289+
Local<Object> buffer_result =
290+
node::Buffer::Copy(env, result.c_str(), written).ToLocalChecked();
291+
args.GetReturnValue().Set(buffer_result);
246292
}
247293

248294
} // namespace encoding_binding

src/encoding_binding.h

+1
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@ class BindingData : public SnapshotableObject {
3131
static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
3232
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
3333
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
34+
static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);
3435

3536
static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
3637
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);

test/cctest/test_encoding_binding.cc

+155
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,155 @@
1+
#include "encoding_binding.h"
2+
#include "env-inl.h"
3+
#include "gtest/gtest.h"
4+
#include "node_test_fixture.h"
5+
#include "v8.h"
6+
7+
namespace node {
8+
namespace encoding_binding {
9+
10+
// Test helper: invokes BindingData::DecodeLatin1 with args[0] as the input and
// the given ignoreBOM / fatal flags as the second and third arguments.
//
// Returns false when the call left a caught exception (*result is untouched).
// Otherwise stores the binding's return value in *result and returns true.
bool RunDecodeLatin1(Environment* env,
                     Local<Value> args[],
                     bool ignore_bom,
                     bool has_fatal,
                     Local<Value>* result) {
  Isolate* isolate = env->isolate();
  TryCatch try_catch(isolate);

  Local<Boolean> ignoreBOMValue = Boolean::New(isolate, ignore_bom);
  Local<Boolean> fatalValue = Boolean::New(isolate, has_fatal);

  Local<Value> updatedArgs[] = {args[0], ignoreBOMValue, fatalValue};

  // Keep the callback info in a named object so that the value the binding
  // sets can be read back after the call.
  // NOTE(review): building FunctionCallbackInfo from an argument array is
  // kept from the original helper; confirm it is supported by the V8 headers
  // in use.
  FunctionCallbackInfo<Value> info(updatedArgs);
  BindingData::DecodeLatin1(info);

  if (try_catch.HasCaught()) {
    return false;
  }

  // On success hand back the decoded value. try_catch.Exception() is empty
  // here, so it must not be used as the result.
  *result = info.GetReturnValue().Get();
  return true;
}
32+
33+
// Fixture for the encoding binding tests below; it adds nothing on top of
// NodeTestFixture.
class EncodingBindingTest : public NodeTestFixture {};
34+
35+
// Three high-range Latin-1 bytes decode to the matching accented characters
// when neither ignoreBOM nor fatal is set.
TEST_F(EncodingBindingTest, DecodeLatin1_ValidInput) {
  Environment* environment = CreateEnvironment();
  Isolate* iso = environment->isolate();
  HandleScope scope(iso);

  // 0xC1 = Á, 0xE9 = é, 0xF3 = ó.
  const uint8_t latin1_data[] = {0xC1, 0xE9, 0xF3};
  const size_t byte_count = sizeof(latin1_data);

  Local<ArrayBuffer> backing = ArrayBuffer::New(iso, byte_count);
  memcpy(backing->GetBackingStore()->Data(), latin1_data, byte_count);
  Local<Uint8Array> view = Uint8Array::New(backing, 0, byte_count);

  Local<Value> call_args[] = {view};
  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(environment, call_args, false, false, &decoded));

  String::Utf8Value decoded_utf8(iso, decoded);
  EXPECT_STREQ(*decoded_utf8, "Áéó");
}
53+
54+
// A zero-length view decodes to the empty string.
TEST_F(EncodingBindingTest, DecodeLatin1_EmptyInput) {
  Environment* environment = CreateEnvironment();
  Isolate* iso = environment->isolate();
  HandleScope scope(iso);

  Local<ArrayBuffer> empty_backing = ArrayBuffer::New(iso, 0);
  Local<Uint8Array> empty_view = Uint8Array::New(empty_backing, 0, 0);

  Local<Value> call_args[] = {empty_view};
  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(environment, call_args, false, false, &decoded));

  String::Utf8Value decoded_utf8(iso, decoded);
  EXPECT_STREQ(*decoded_utf8, "");
}
69+
70+
// Passing a string instead of a buffer type must make the call fail.
TEST_F(EncodingBindingTest, DecodeLatin1_InvalidInput) {
  Environment* environment = CreateEnvironment();
  Isolate* iso = environment->isolate();
  HandleScope scope(iso);

  Local<Value> not_a_buffer = String::NewFromUtf8Literal(iso, "Invalid input");
  Local<Value> call_args[] = {not_a_buffer};

  Local<Value> decoded;
  EXPECT_FALSE(
      RunDecodeLatin1(environment, call_args, false, false, &decoded));
}
80+
81+
// With ignoreBOM set, a 0xFE 0xFF prefix is kept in the decoded output.
//
// DecodeLatin1 only drops a single leading 0xFF byte when ignore_bom is set.
// This input starts with 0xFE, so nothing is stripped and both prefix bytes
// are decoded like any other Latin-1 data (0xFE = þ, 0xFF = ÿ).
TEST_F(EncodingBindingTest, DecodeLatin1_IgnoreBOM) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t latin1_data[] = {0xFE, 0xFF, 0xC1, 0xE9, 0xF3};
  Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, sizeof(latin1_data));
  memcpy(ab->GetBackingStore()->Data(), latin1_data, sizeof(latin1_data));

  Local<Uint8Array> array = Uint8Array::New(ab, 0, sizeof(latin1_data));
  Local<Value> args[] = {array};

  Local<Value> result;
  EXPECT_TRUE(RunDecodeLatin1(env, args, true, false, &result));

  String::Utf8Value utf8_result(isolate, result);
  EXPECT_STREQ(*utf8_result, "þÿÁéó");
}
99+
100+
// Fatal mode with 0xFF bytes: the call succeeds.
//
// Every byte value is a valid Latin-1 character (0xFF is ÿ), and DecodeLatin1
// raises the fatal error only when the conversion writes zero bytes. Decoding
// therefore succeeds even with fatal set, and all three bytes are kept because
// ignore_bom is false.
TEST_F(EncodingBindingTest, DecodeLatin1_FatalInvalidInput) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t invalid_data[] = {0xFF, 0xFF, 0xFF};
  Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, sizeof(invalid_data));
  memcpy(ab->GetBackingStore()->Data(), invalid_data, sizeof(invalid_data));

  Local<Uint8Array> array = Uint8Array::New(ab, 0, sizeof(invalid_data));
  Local<Value> args[] = {array};

  Local<Value> result;
  EXPECT_TRUE(RunDecodeLatin1(env, args, false, true, &result));

  String::Utf8Value utf8_result(isolate, result);
  EXPECT_STREQ(*utf8_result, "ÿÿÿ");
}
115+
116+
// ignoreBOM and fatal together: the 0xFE 0xFF prefix is still decoded.
//
// DecodeLatin1 only drops a single leading 0xFF byte when ignore_bom is set,
// and this input starts with 0xFE, so nothing is stripped. The fatal flag has
// no effect because the conversion produces output.
TEST_F(EncodingBindingTest, DecodeLatin1_IgnoreBOMAndFatal) {
  Environment* env = CreateEnvironment();
  Isolate* isolate = env->isolate();
  HandleScope handle_scope(isolate);

  const uint8_t latin1_data[] = {0xFE, 0xFF, 0xC1, 0xE9, 0xF3};
  Local<ArrayBuffer> ab = ArrayBuffer::New(isolate, sizeof(latin1_data));
  memcpy(ab->GetBackingStore()->Data(), latin1_data, sizeof(latin1_data));

  Local<Uint8Array> array = Uint8Array::New(ab, 0, sizeof(latin1_data));
  Local<Value> args[] = {array};

  Local<Value> result;
  EXPECT_TRUE(RunDecodeLatin1(env, args, true, true, &result));

  String::Utf8Value utf8_result(isolate, result);
  EXPECT_STREQ(*utf8_result, "þÿÁéó");
}
134+
135+
// With ignoreBOM set, a single leading 0xFF byte is dropped before decoding,
// leaving only the three accented characters.
TEST_F(EncodingBindingTest, DecodeLatin1_BOMPresent) {
  Environment* environment = CreateEnvironment();
  Isolate* iso = environment->isolate();
  HandleScope scope(iso);

  // Leading 0xFF followed by 0xC1 = Á, 0xE9 = é, 0xF3 = ó.
  const uint8_t latin1_data[] = {0xFF, 0xC1, 0xE9, 0xF3};
  const size_t byte_count = sizeof(latin1_data);

  Local<ArrayBuffer> backing = ArrayBuffer::New(iso, byte_count);
  memcpy(backing->GetBackingStore()->Data(), latin1_data, byte_count);
  Local<Uint8Array> view = Uint8Array::New(backing, 0, byte_count);

  Local<Value> call_args[] = {view};
  Local<Value> decoded;
  EXPECT_TRUE(RunDecodeLatin1(environment, call_args, true, false, &decoded));

  String::Utf8Value decoded_utf8(iso, decoded);
  EXPECT_STREQ(*decoded_utf8, "Áéó");
}
153+
154+
} // namespace encoding_binding
155+
} // namespace node

0 commit comments

Comments
 (0)