Skip to content

Commit 4db9cbb

Browse files
KhafraDev authored and tpoisseau committed
esm: use Undici/fetch data: URL parser
Fixes: nodejs#53775 PR-URL: nodejs#54748 Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Antoine du Hamel <duhamelantoine1995@gmail.com> Reviewed-By: James M Snell <jasnell@gmail.com>
1 parent 51db6db commit 4db9cbb

File tree

4 files changed

+397
-17
lines changed

4 files changed

+397
-17
lines changed

lib/internal/data_url.js

+352
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,352 @@
1+
'use strict';
2+
3+
const {
4+
RegExpPrototypeExec,
5+
RegExpPrototypeSymbolReplace,
6+
StringFromCharCodeApply,
7+
StringPrototypeCharCodeAt,
8+
StringPrototypeIndexOf,
9+
StringPrototypeSlice,
10+
TypedArrayPrototypeSubarray,
11+
Uint8Array,
12+
} = primordials;
13+
14+
const assert = require('internal/assert');
15+
const { Buffer } = require('buffer');
16+
const { MIMEType } = require('internal/mime');
17+
18+
// Shared TextEncoder instance; stays undefined until lazyEncoder() first runs.
let encoder;

/**
 * Returns the module-wide TextEncoder, creating it on the first call so that
 * 'internal/encoding' is required lazily instead of at module load.
 * @returns {TextEncoder}
 */
function lazyEncoder() {
  if (encoder === undefined) {
    const { TextEncoder } = require('internal/encoding');
    encoder = new TextEncoder();
  }

  return encoder;
}
27+
28+
// Matches each ASCII whitespace code point (TAB, LF, FF, CR, SPACE). The `g`
// flag lets a single replace call strip every occurrence from a string.
const ASCII_WHITESPACE_REPLACE_REGEX = /[\u0009\u000A\u000C\u000D\u0020]/g // eslint-disable-line
29+
30+
// https://fetch.spec.whatwg.org/#data-url-processor
/**
 * Splits a `data:` URL into its MIME type and decoded body, following the
 * numbered steps of the fetch spec's "data: URL processor".
 * @param {URL} dataURL URL whose `protocol` is `'data:'` (asserted).
 * @returns {'failure' | { mimeType: MIMEType, body: Uint8Array }} The string
 *   `'failure'` when the URL has no `,` separator or its base64 payload is
 *   rejected; otherwise the parsed MIME type and the body bytes.
 */
function dataURLProcessor(dataURL) {
  // 1. Assert: dataURL's scheme is "data".
  assert(dataURL.protocol === 'data:');

  // 2. Let input be the result of running the URL serializer on dataURL
  //    with exclude fragment set to true.
  // 3. Remove the leading "data:" string from input.
  const input = StringPrototypeSlice(URLSerializer(dataURL, true), 5);

  // 4. Let position point at the start of input.
  const position = { position: 0 };

  // 5. Let mimeType be the result of collecting a sequence of code points
  //    that are not equal to U+002C (,), given position.
  const rawMimeType = collectASequenceOfCodePointsFast(',', input, position);

  // 6. Strip leading and trailing ASCII whitespace from mimeType.
  //    The unstripped value is kept because step 9 slices input by its
  //    length; using the stripped length would cut at the wrong offset.
  let mimeType = removeASCIIWhitespace(rawMimeType, true, true);

  // 7. If position is past the end of input, then return failure.
  //    (No "," was found, so there is no body.)
  if (position.position >= input.length) {
    return 'failure';
  }

  // 8. Advance position by 1.
  position.position++;

  // 9. Let encodedBody be the remainder of input.
  const encodedBody = StringPrototypeSlice(input, rawMimeType.length + 1);

  // 10. Let body be the percent-decoding of encodedBody.
  let body = stringPercentDecode(encodedBody);

  // 11. If mimeType ends with U+003B (;), followed by zero or more
  //     U+0020 SPACE, followed by an ASCII case-insensitive match for
  //     "base64", then:
  if (RegExpPrototypeExec(/;(\u0020){0,}base64$/i, mimeType) !== null) {
    // 1. Let stringBody be the isomorphic decode of body.
    // 2. Set body to the forgiving-base64 decode of stringBody.
    body = forgivingBase64(isomorphicDecode(body));

    // 3. If body is failure, then return failure.
    if (body === 'failure') {
      return 'failure';
    }

    // 4. Remove the last 6 code points ("base64") from mimeType.
    mimeType = StringPrototypeSlice(mimeType, 0, -6);

    // 5. Remove trailing U+0020 SPACE code points from mimeType, if any.
    mimeType = RegExpPrototypeSymbolReplace(/(\u0020)+$/, mimeType, '');

    // 6. Remove the last U+003B (;) code point from mimeType.
    mimeType = StringPrototypeSlice(mimeType, 0, -1);
  }

  // 12. If mimeType starts with U+003B (;), then prepend "text/plain"
  //     to mimeType.
  if (mimeType[0] === ';') {
    mimeType = 'text/plain' + mimeType;
  }

  // 13. Let mimeTypeRecord be the result of parsing mimeType.
  // 14. If mimeTypeRecord is failure, then set mimeTypeRecord to
  //     text/plain;charset=US-ASCII.
  let mimeTypeRecord;

  try {
    mimeTypeRecord = new MIMEType(mimeType);
  } catch {
    mimeTypeRecord = new MIMEType('text/plain;charset=US-ASCII');
  }

  // 15. Return a new data: URL struct whose MIME type is mimeTypeRecord
  //     and body is body.
  // https://fetch.spec.whatwg.org/#data-url-struct
  return { mimeType: mimeTypeRecord, body };
}
130+
131+
// https://url.spec.whatwg.org/#concept-url-serializer
/**
 * Serializes a URL, optionally without its fragment.
 * @param {URL} url
 * @param {boolean} [excludeFragment=false] When truthy, the `#fragment`
 *   (including a bare trailing `#`) is omitted from the result.
 * @returns {string}
 */
function URLSerializer(url, excludeFragment = false) {
  const { href } = url;

  if (!excludeFragment) {
    return href;
  }

  const hashLength = url.hash.length;

  if (hashLength !== 0) {
    // `hash` includes the leading '#', so trimming its length drops the
    // whole fragment.
    return StringPrototypeSlice(href, 0, href.length - hashLength);
  }

  // An empty fragment reports `hash === ''` while `href` may still end in a
  // bare '#', which has to be trimmed separately.
  if (href[href.length - 1] === '#') {
    return StringPrototypeSlice(href, 0, -1);
  }

  return href;
}
152+
153+
/**
 * A faster collectASequenceOfCodePoints that only works when comparing a
 * single character: collects everything from `position.position` up to (not
 * including) the next occurrence of `char`, or to the end of `input` when
 * `char` does not occur.
 * @param {string} char Delimiter to stop at.
 * @param {string} input
 * @param {{ position: number }} position Cursor; updated in place to the
 *   delimiter's index, or to `input.length` when no delimiter is found.
 * @returns {string} The collected substring.
 */
function collectASequenceOfCodePointsFast(char, input, position) {
  const start = position.position;
  const idx = StringPrototypeIndexOf(input, char, start);
  const end = idx === -1 ? input.length : idx;

  position.position = end;
  return StringPrototypeSlice(input, start, end);
}
171+
172+
// https://url.spec.whatwg.org/#string-percent-decode
/**
 * Percent-decodes a string: the input is UTF-8 encoded first, then the
 * resulting bytes are percent-decoded.
 * @param {string} input
 * @returns {Uint8Array} The decoded bytes.
 */
function stringPercentDecode(input) {
  // 1. Let bytes be the UTF-8 encoding of input.
  const bytes = lazyEncoder().encode(input);

  // 2. Return the percent-decoding of bytes.
  return percentDecode(bytes);
}
181+
182+
/**
 * Tells whether a byte is the ASCII code of a hexadecimal digit
 * (`0-9`, `A-F` or `a-f`).
 * @param {number} byte
 * @returns {boolean} `false` for anything else, including `undefined`
 *   (e.g. a read past the end of a byte array).
 */
function isHexCharByte(byte) {
  const isDigit = byte >= 0x30 && byte <= 0x39; // 0-9
  const isUpperHex = byte >= 0x41 && byte <= 0x46; // A-F
  const isLowerHex = byte >= 0x61 && byte <= 0x66; // a-f

  return isDigit || isUpperHex || isLowerHex;
}
189+
190+
/**
 * Converts the ASCII code of a hexadecimal digit to its numeric value.
 * The result is only meaningful for bytes accepted by isHexCharByte().
 * @param {number} byte ASCII code of `0-9`, `A-F` or `a-f`.
 * @returns {number} Value in the range 0-15.
 */
function hexByteToNumber(byte) {
  // 0-9
  if (byte >= 0x30 && byte <= 0x39) {
    return byte - 48;
  }

  // Clearing bit 0x20 folds a-f onto A-F; then ('A' is 65) - 65 + 10 = - 55.
  return (byte & 0xDF) - 55;
}
203+
204+
// https://url.spec.whatwg.org/#percent-decode
/**
 * Percent-decodes a byte sequence: every `%XX` (two hex digits) becomes the
 * single byte `0xXX`; all other bytes, including a `%` that is not followed
 * by two hex digits, are copied through unchanged.
 * @param {Uint8Array} input
 * @returns {Uint8Array} A new array when nothing was decoded, otherwise a
 *   subarray view over the output trimmed to the decoded length.
 */
function percentDecode(input) {
  const length = input.length;
  // 1. Let output be an empty byte sequence.
  //    Decoding never grows the data, so `length` bytes is always enough.
  /** @type {Uint8Array} */
  const output = new Uint8Array(length);
  let j = 0;

  // 2. For each byte byte in input:
  for (let i = 0; i < length; ++i) {
    const byte = input[i];

    // Reads past the end yield undefined, which isHexCharByte rejects, so a
    // truncated escape at the end of input is copied through literally.
    const isEscape = byte === 0x25 &&
      isHexCharByte(input[i + 1]) &&
      isHexCharByte(input[i + 2]);

    if (!isEscape) {
      // 1. If byte is not 0x25 (%), then append byte to output.
      // 2. Otherwise, if byte is 0x25 (%) and the next two bytes after byte
      //    in input are not hex digits, append byte to output.
      output[j++] = byte;
      continue;
    }

    // 3. Otherwise:
    //    1. Let bytePoint be the two bytes after byte in input, decoded,
    //       and then interpreted as hexadecimal number.
    //    2. Append a byte whose value is bytePoint to output.
    output[j++] = (hexByteToNumber(input[i + 1]) << 4) | hexByteToNumber(input[i + 2]);

    //    3. Skip the next two bytes in input.
    i += 2;
  }

  // 3. Return output.
  return length === j ? output : TypedArrayPrototypeSubarray(output, 0, j);
}
246+
247+
// https://infra.spec.whatwg.org/#forgiving-base64-decode
/**
 * Forgiving-base64 decodes a string: ASCII whitespace is ignored and the
 * trailing `=` padding is optional.
 * @param {string} data
 * @returns {Uint8Array | 'failure'} The decoded bytes as a view over the
 *   decoding Buffer's memory (no copy), or the string `'failure'` when the
 *   input has an impossible length or contains a non-base64 character.
 */
function forgivingBase64(data) {
  // 1. Remove all ASCII whitespace from data.
  data = RegExpPrototypeSymbolReplace(ASCII_WHITESPACE_REPLACE_REGEX, data, '');

  // Padding is "removed" by shrinking dataLength instead of slicing, so the
  // still-padded string can be handed to the decoder below.
  let dataLength = data.length;

  // 2. If data's code point length divides by 4 leaving no remainder, then:
  //    1. If data ends with one or two U+003D (=) code points, then remove
  //       them from data.
  if (dataLength % 4 === 0 && data[dataLength - 1] === '=') {
    dataLength -= data[dataLength - 2] === '=' ? 2 : 1;
  }

  // 3. If data's code point length divides by 4 leaving a remainder of 1,
  //    then return failure.
  if (dataLength % 4 === 1) {
    return 'failure';
  }

  // 4. If data contains a code point that is not one of
  //      U+002B (+)
  //      U+002F (/)
  //      ASCII alphanumeric
  //    then return failure.
  //    Only the unpadded part is checked, so any "=" left in it is rejected.
  const unpadded = data.length === dataLength ? data : StringPrototypeSlice(data, 0, dataLength);
  if (RegExpPrototypeExec(/[^+/0-9A-Za-z]/, unpadded) !== null) {
    return 'failure';
  }

  const buffer = Buffer.from(data, 'base64');
  return new Uint8Array(buffer.buffer, buffer.byteOffset, buffer.byteLength);
}
285+
286+
/**
 * Tells whether a char code is ASCII whitespace:
 * TAB, LF, FF, CR or SPACE.
 * @see https://infra.spec.whatwg.org/#ascii-whitespace
 * @param {number} char Char code to test.
 * @returns {boolean}
 */
function isASCIIWhitespace(char) {
  switch (char) {
    case 0x009: // "\t"
    case 0x00a: // "\n"
    case 0x00c: // "\f"
    case 0x00d: // "\r"
    case 0x020: // " "
      return true;
    default:
      return false;
  }
}
294+
295+
/**
 * Strips ASCII whitespace from the start and/or end of a string.
 * @see https://infra.spec.whatwg.org/#strip-leading-and-trailing-ascii-whitespace
 * @param {string} str
 * @param {boolean} [leading=true] Strip from the start of `str`.
 * @param {boolean} [trailing=true] Strip from the end of `str`.
 * @returns {string}
 */
function removeASCIIWhitespace(str, leading = true, trailing = true) {
  return removeChars(str, leading, trailing, isASCIIWhitespace);
}
304+
305+
/**
 * Removes the characters matched by `predicate` from the start and/or end of
 * a string.
 * @param {string} str
 * @param {boolean} leading Strip matching characters from the start.
 * @param {boolean} trailing Strip matching characters from the end.
 * @param {(charCode: number) => boolean} predicate Called with a char code;
 *   returns true for characters that should be removed.
 * @returns {string} `str` itself when nothing was removed, otherwise the
 *   trimmed substring.
 */
function removeChars(str, leading, trailing, predicate) {
  let lead = 0;
  let trail = str.length - 1;

  if (leading) {
    while (lead < str.length && predicate(StringPrototypeCharCodeAt(str, lead))) lead++;
  }

  if (trailing) {
    // Scan down to `lead` inclusive. Stopping at index 1 would leave the
    // first character in place when only trailing characters are stripped
    // and the whole string matches the predicate.
    while (trail >= lead && predicate(StringPrototypeCharCodeAt(str, trail))) trail--;
  }

  return lead === 0 && trail === str.length - 1 ? str : StringPrototypeSlice(str, lead, trail + 1);
}
325+
326+
/**
 * Isomorphic-decodes a byte sequence: returns a string whose length equals
 * the input's length and whose code units have the same values as the
 * input's bytes, in the same order.
 * @see https://infra.spec.whatwg.org/#isomorphic-decode
 * @param {Uint8Array} input
 * @returns {string}
 */
function isomorphicDecode(input) {
  // Upper bound on how many bytes go into a single fromCharCode call, which
  // receives them as individual arguments: (2 << 15) - 1 === 65535.
  const chunkSize = (2 << 15) - 1;
  const length = input.length;

  if (length < chunkSize) {
    return StringFromCharCodeApply(input);
  }

  // Larger inputs are converted one chunk at a time and concatenated.
  let result = '';
  for (let start = 0; start < length; start += chunkSize) {
    const end = start + chunkSize > length ? length : start + chunkSize;
    result += StringFromCharCodeApply(TypedArrayPrototypeSubarray(input, start, end));
  }
  return result;
}
349+
350+
module.exports = {
351+
dataURLProcessor,
352+
};

0 commit comments

Comments
 (0)