Skip to content

Commit 3a74c40

Browse files
anonrig authored and RafaelGSS committed Aug 25, 2024
src: improve buffer.transcode performance
PR-URL: #54153
Reviewed-By: Daniel Lemire <daniel@lemire.me>
Reviewed-By: Benjamin Gruenbaum <benjamingr@gmail.com>
Reviewed-By: Matteo Collina <matteo.collina@gmail.com>
Reviewed-By: Minwoo Jung <nodecorelab@gmail.com>
Reviewed-By: James M Snell <jasnell@gmail.com>
1 parent cbe30a0 commit 3a74c40

File tree

2 files changed

+79
-63
lines changed

2 files changed

+79
-63
lines changed
 

‎benchmark/buffers/buffer-transcode.js

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
'use strict';
2+
const common = require('../common.js');
3+
const assert = require('node:assert');
4+
const buffer = require('node:buffer');
5+
6+
const hasIntl = !!process.config.variables.v8_enable_i18n_support;
7+
const encodings = ['latin1', 'ascii', 'ucs2', 'utf8'];
8+
9+
if (!hasIntl) {
10+
console.log('Skipping: `transcode` is only available on platforms that support i18n`');
11+
process.exit(0);
12+
}
13+
14+
const bench = common.createBenchmark(main, {
15+
fromEncoding: encodings,
16+
toEncoding: encodings,
17+
length: [1, 10, 1000],
18+
n: [1e5],
19+
}, {
20+
combinationFilter(p) {
21+
return !(p.fromEncoding === 'ucs2' && p.toEncoding === 'utf8');
22+
},
23+
});
24+
25+
function main({ n, fromEncoding, toEncoding, length }) {
26+
const input = Buffer.from('a'.repeat(length));
27+
let out = 0;
28+
bench.start();
29+
for (let i = 0; i < n; i++) {
30+
const dest = buffer.transcode(input, fromEncoding, toEncoding);
31+
out += dest.buffer.byteLength;
32+
}
33+
bench.end(n);
34+
assert.ok(out >= 0);
35+
}

‎src/node_i18n.cc

+44-63
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242

4343
#include "node_i18n.h"
4444
#include "node_external_reference.h"
45+
#include "simdutf.h"
4546

4647
#if defined(NODE_HAVE_I18N_SUPPORT)
4748

@@ -147,7 +148,6 @@ MaybeLocal<Object> Transcode(Environment* env,
147148
const char* source,
148149
const size_t source_length,
149150
UErrorCode* status) {
150-
*status = U_ZERO_ERROR;
151151
MaybeLocal<Object> ret;
152152
MaybeStackBuffer<char> result;
153153
Converter to(toEncoding);
@@ -170,22 +170,21 @@ MaybeLocal<Object> Transcode(Environment* env,
170170
return ret;
171171
}
172172

173-
MaybeLocal<Object> TranscodeToUcs2(Environment* env,
174-
const char* fromEncoding,
175-
const char* toEncoding,
176-
const char* source,
177-
const size_t source_length,
178-
UErrorCode* status) {
179-
*status = U_ZERO_ERROR;
180-
MaybeLocal<Object> ret;
173+
MaybeLocal<Object> TranscodeLatin1ToUcs2(Environment* env,
174+
const char* fromEncoding,
175+
const char* toEncoding,
176+
const char* source,
177+
const size_t source_length,
178+
UErrorCode* status) {
181179
MaybeStackBuffer<UChar> destbuf(source_length);
182-
Converter from(fromEncoding);
183-
const size_t length_in_chars = source_length * sizeof(UChar);
184-
ucnv_toUChars(from.conv(), *destbuf, length_in_chars,
185-
source, source_length, status);
186-
if (U_SUCCESS(*status))
187-
ret = ToBufferEndian(env, &destbuf);
188-
return ret;
180+
auto actual_length =
181+
simdutf::convert_latin1_to_utf16le(source, source_length, destbuf.out());
182+
if (actual_length == 0) {
183+
*status = U_INVALID_CHAR_FOUND;
184+
return {};
185+
}
186+
187+
return Buffer::New(env, &destbuf);
189188
}
190189

191190
MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
@@ -194,13 +193,11 @@ MaybeLocal<Object> TranscodeFromUcs2(Environment* env,
194193
const char* source,
195194
const size_t source_length,
196195
UErrorCode* status) {
197-
*status = U_ZERO_ERROR;
198196
MaybeStackBuffer<UChar> sourcebuf;
199197
MaybeLocal<Object> ret;
200198
Converter to(toEncoding);
201199

202-
size_t sublen = ucnv_getMinCharSize(to.conv());
203-
std::string sub(sublen, '?');
200+
std::string sub(to.min_char_size(), '?');
204201
to.set_subst_chars(sub.c_str());
205202

206203
const size_t length_in_chars = source_length / sizeof(UChar);
@@ -221,26 +218,18 @@ MaybeLocal<Object> TranscodeUcs2FromUtf8(Environment* env,
221218
const char* source,
222219
const size_t source_length,
223220
UErrorCode* status) {
224-
*status = U_ZERO_ERROR;
225-
MaybeStackBuffer<UChar> destbuf;
226-
int32_t result_length;
227-
u_strFromUTF8(*destbuf, destbuf.capacity(), &result_length,
228-
source, source_length, status);
229-
MaybeLocal<Object> ret;
230-
if (U_SUCCESS(*status)) {
231-
destbuf.SetLength(result_length);
232-
ret = ToBufferEndian(env, &destbuf);
233-
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
234-
*status = U_ZERO_ERROR;
235-
destbuf.AllocateSufficientStorage(result_length);
236-
u_strFromUTF8(*destbuf, result_length, &result_length,
237-
source, source_length, status);
238-
if (U_SUCCESS(*status)) {
239-
destbuf.SetLength(result_length);
240-
ret = ToBufferEndian(env, &destbuf);
241-
}
221+
size_t expected_utf16_length =
222+
simdutf::utf16_length_from_utf8(source, source_length);
223+
MaybeStackBuffer<UChar> destbuf(expected_utf16_length);
224+
auto actual_length =
225+
simdutf::convert_utf8_to_utf16le(source, source_length, destbuf.out());
226+
227+
if (actual_length == 0) {
228+
*status = U_INVALID_CHAR_FOUND;
229+
return {};
242230
}
243-
return ret;
231+
232+
return Buffer::New(env, &destbuf);
244233
}
245234

246235
MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
@@ -249,32 +238,25 @@ MaybeLocal<Object> TranscodeUtf8FromUcs2(Environment* env,
249238
const char* source,
250239
const size_t source_length,
251240
UErrorCode* status) {
252-
*status = U_ZERO_ERROR;
253-
MaybeLocal<Object> ret;
254241
const size_t length_in_chars = source_length / sizeof(UChar);
255-
int32_t result_length;
256-
MaybeStackBuffer<UChar> sourcebuf;
257-
MaybeStackBuffer<char> destbuf;
258-
CopySourceBuffer(&sourcebuf, source, source_length, length_in_chars);
259-
u_strToUTF8(*destbuf, destbuf.capacity(), &result_length,
260-
*sourcebuf, length_in_chars, status);
261-
if (U_SUCCESS(*status)) {
262-
destbuf.SetLength(result_length);
263-
ret = ToBufferEndian(env, &destbuf);
264-
} else if (*status == U_BUFFER_OVERFLOW_ERROR) {
265-
*status = U_ZERO_ERROR;
266-
destbuf.AllocateSufficientStorage(result_length);
267-
u_strToUTF8(*destbuf, result_length, &result_length, *sourcebuf,
268-
length_in_chars, status);
269-
if (U_SUCCESS(*status)) {
270-
destbuf.SetLength(result_length);
271-
ret = ToBufferEndian(env, &destbuf);
272-
}
242+
size_t expected_utf8_length = simdutf::utf8_length_from_utf16le(
243+
reinterpret_cast<const char16_t*>(source), length_in_chars);
244+
245+
MaybeStackBuffer<char> destbuf(expected_utf8_length);
246+
auto actual_length = simdutf::convert_utf16le_to_utf8(
247+
reinterpret_cast<const char16_t*>(source),
248+
length_in_chars,
249+
destbuf.out());
250+
251+
if (actual_length == 0) {
252+
*status = U_INVALID_CHAR_FOUND;
253+
return {};
273254
}
274-
return ret;
255+
256+
return Buffer::New(env, &destbuf);
275257
}
276258

277-
const char* EncodingName(const enum encoding encoding) {
259+
constexpr const char* EncodingName(const enum encoding encoding) {
278260
switch (encoding) {
279261
case ASCII: return "us-ascii";
280262
case LATIN1: return "iso8859-1";
@@ -284,7 +266,7 @@ const char* EncodingName(const enum encoding encoding) {
284266
}
285267
}
286268

287-
bool SupportedEncoding(const enum encoding encoding) {
269+
constexpr bool SupportedEncoding(const enum encoding encoding) {
288270
switch (encoding) {
289271
case ASCII:
290272
case LATIN1:
@@ -309,8 +291,7 @@ void Transcode(const FunctionCallbackInfo<Value>&args) {
309291
switch (fromEncoding) {
310292
case ASCII:
311293
case LATIN1:
312-
if (toEncoding == UCS2)
313-
tfn = &TranscodeToUcs2;
294+
if (toEncoding == UCS2) tfn = &TranscodeLatin1ToUcs2;
314295
break;
315296
case UTF8:
316297
if (toEncoding == UCS2)

0 commit comments

Comments
 (0)
Please sign in to comment.