Skip to content

Commit 872856c

Browse files
anonrig authored and targos committed
src: improve buffer.transcode performance
PR-URL: #54153 Reviewed-By: Daniel Lemire <daniel@lemire.me> Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Minwoo Jung <nodecorelab@gmail.com> Reviewed-By: James M Snell <jasnell@gmail.com>
1 parent cf283d9 commit 872856c

File tree

2 files changed

+79
-63
lines changed

2 files changed

+79
-63
lines changed

benchmark/buffers/buffer-transcode.js

+35
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,35 @@
1+
'use strict';
2+
const common = require('../common.js');
3+
const assert = require('node:assert');
4+
const buffer = require('node:buffer');
5+
6+
const hasIntl = !!process.config.variables.v8_enable_i18n_support;
7+
const encodings = ['latin1', 'ascii', 'ucs2', 'utf8'];
8+
9+
if (!hasIntl) {
10+
console.log('Skipping: `transcode` is only available on platforms that support i18n`');
11+
process.exit(0);
12+
}
13+
14+
const bench = common.createBenchmark(main, {
15+
fromEncoding: encodings,
16+
toEncoding: encodings,
17+
length: [1, 10, 1000],
18+
n: [1e5],
19+
}, {
20+
combinationFilter(p) {
21+
return !(p.fromEncoding === 'ucs2' && p.toEncoding === 'utf8');
22+
},
23+
});
24+
25+
function main({ n, fromEncoding, toEncoding, length }) {
26+
const input = Buffer.from('a'.repeat(length));
27+
let out = 0;
28+
bench.start();
29+
for (let i = 0; i < n; i++) {
30+
const dest = buffer.transcode(input, fromEncoding, toEncoding);
31+
out += dest.buffer.byteLength;
32+
}
33+
bench.end(n);
34+
assert.ok(out >= 0);
35+
}

src/node_i18n.cc

+44 -63
Original file line number | Diff line number | Diff line change
@@ -42,6 +42,7 @@
4242

4343
#include "node_i18n.h"
4444
#include "node_external_reference.h"
45+
#include "simdutf.h"
4546

4647
#if defined(NODE_HAVE_I18N_SUPPORT)
4748

@@ -146,7 +147,6 @@ MaybeLocal<Object> Transcode(Environment* env,
146147
const char* source,
147148
const size_t source_length,
148149
UErrorCode* status) {
149-
*status = U_ZERO_ERROR;
150150
MaybeLocal<Object> ret;
151151
MaybeStackBuffer<char> result;
152152
Converter to(toEncoding);
@@ -169,22 +169,21 @@ MaybeLocal<Object> Transcode(Environment* env,
169169
return ret;
170170
}
171171

172-
MaybeLocal<Object> TranscodeToUcs2(Environment* env,
173-
const char* fromEncoding,
174-
const char* toEncoding,
175-
const char* source,
176-
const size_t source_length,
177-
UErrorCode* status) {
178-
*status = U_ZERO_ERROR;
179-
MaybeLocal<Object> ret;
172+
MaybeLocal<Object> TranscodeLatin1ToUcs2(Environment* env,
173+
const char* fromEncoding,
174+
const char* toEncoding,
175+
const char* source,
176+
const size_t source_length,
177+
UErrorCode* status) {
180178
MaybeStackBuffer<UChar> destbuf(source_length);
181-
Converter from(fromEncoding);
182-
const size_t length_in_chars = source_length * sizeof(UChar);
183-
ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
184-
source, source_length, status);
185-
if (U_SUCCESS(*status))
186-
ret = ToBufferEndian(env, &destbuf);
187-
return ret;
179+
auto actual_length =
180+
simdutf::convert_latin1_to_utf16le(source, source_length, destbuf.out());
181+
if (actual_length == 0) {
182+
*status = U_INVALID_CHAR_FOUND;
183+
return {};
184+
}
185+
186+
return Buffer::New(env, &destbuf);
188187
}
189188

190189
MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
@@ -193,13 +192,11 @@ MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
193192
const char* source,
194193
const size_t source_length,
195194
UErrorCode* status) {
196-
*status = U_ZERO_ERROR;
197195
MaybeStackBuffer<UChar> sourcebuf;
198196
MaybeLocal<Object> ret;
199197
Converter to(toEncoding);
200198

201-
size_t sublen = ucnv_getMinCharSize(to.conv());
202-
std::string sub(sublen, '?');
199+
std::string sub(to.min_char_size(), '?');
203200
to.set_subst_chars(sub.c_str());
204201

205202
const size_t length_in_chars = source_length / sizeof(UChar);
@@ -220,26 +217,18 @@ MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
220217
const char* source,
221218
const size_t source_length,
222219
UErrorCode* status) {
223-
*status = U_ZERO_ERROR;
224-
MaybeStackBuffer<UChar> destbuf;
225-
int32_t result_length;
226-
u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
227-
source, source_length, status);
228-
MaybeLocal<Object> ret;
229-
if (U_SUCCESS(*status)) {
230-
destbuf.SetLength(result_length);
231-
ret = ToBufferEndian(env, &destbuf);
232-
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
233-
*status = U_ZERO_ERROR;
234-
destbuf.AllocateSufficientStorage(result_length);
235-
u_strFromUTF8(*destbuf, result_length, &result_length,
236-
source, source_length, status);
237-
if (U_SUCCESS(*status)) {
238-
destbuf.SetLength(result_length);
239-
ret = ToBufferEndian(env, &destbuf);
240-
}
220+
size_t expected_utf16_length =
221+
simdutf::utf16_length_from_utf8(source, source_length);
222+
MaybeStackBuffer<UChar> destbuf(expected_utf16_length);
223+
auto actual_length =
224+
simdutf::convert_utf8_to_utf16le(source, source_length, destbuf.out());
225+
226+
if (actual_length == 0) {
227+
*status = U_INVALID_CHAR_FOUND;
228+
return {};
241229
}
242-
return ret;
230+
231+
return Buffer::New(env, &destbuf);
243232
}
244233

245234
MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
@@ -248,32 +237,25 @@ MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
248237
const char* source,
249238
const size_t source_length,
250239
UErrorCode* status) {
251-
*status = U_ZERO_ERROR;
252-
MaybeLocal<Object> ret;
253240
const size_t length_in_chars = source_length / sizeof(UChar);
254-
int32_t result_length;
255-
MaybeStackBuffer<UChar> sourcebuf;
256-
MaybeStackBuffer<char> destbuf;
257-
CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
258-
u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
259-
*sourcebuf, length_in_chars, status);
260-
if (U_SUCCESS(*status)) {
261-
destbuf.SetLength(result_length);
262-
ret = ToBufferEndian(env, &destbuf);
263-
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
264-
*status = U_ZERO_ERROR;
265-
destbuf.AllocateSufficientStorage(result_length);
266-
u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
267-
length_in_chars, status);
268-
if (U_SUCCESS(*status)) {
269-
destbuf.SetLength(result_length);
270-
ret = ToBufferEndian(env, &destbuf);
271-
}
241+
size_t expected_utf8_length = simdutf::utf8_length_from_utf16le(
242+
reinterpret_cast<const char16_t*>(source), length_in_chars);
243+
244+
MaybeStackBuffer<char> destbuf(expected_utf8_length);
245+
auto actual_length = simdutf::convert_utf16le_to_utf8(
246+
reinterpret_cast<const char16_t*>(source),
247+
length_in_chars,
248+
destbuf.out());
249+
250+
if (actual_length == 0) {
251+
*status = U_INVALID_CHAR_FOUND;
252+
return {};
272253
}
273-
return ret;
254+
255+
return Buffer::New(env, &destbuf);
274256
}
275257

276-
const char* EncodingName(const enum encoding encoding) {
258+
constexpr const char* EncodingName(const enum encoding encoding) {
277259
switch (encoding) {
278260
case ASCII: return "us-ascii";
279261
case LATIN1: return "iso8859-1";
@@ -283,7 +265,7 @@ const char* EncodingName(const enum encoding encoding) {
283265
}
284266
}
285267

286-
bool SupportedEncoding(const enum encoding encoding) {
268+
constexpr bool SupportedEncoding(const enum encoding encoding) {
287269
switch (encoding) {
288270
case ASCII:
289271
case LATIN1:
@@ -308,8 +290,7 @@ void Transcode(const FunctionCallbackInfo<Value>&args) {
308290
switch (fromEncoding) {
309291
case ASCII:
310292
case LATIN1:
311-
if (toEncoding == UCS2)
312-
tfn = &TranscodeToUcs2;
293+
if (toEncoding == UCS2) tfn = &TranscodeLatin1ToUcs2;
313294
break;
314295
case UTF8:
315296
if (toEncoding == UCS2)

0 commit comments

Comments
 (0)