Skip to content

Commit 7a3a43a

Browse files
committed
fix(marshal)!: compare strings by codepoint
1 parent 193e403 commit 7a3a43a

File tree

7 files changed

+146
-4
lines changed

7 files changed

+146
-4
lines changed

packages/marshal/NEWS.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
User-visible changes in `@endo/marshal`:
22

3+
# next release
4+
5+
- JavaScript's relational comparison operators like `<` compare strings by lexicographic UTF-16 code unit order, which exposes an internal representational detail not relevant to the string's meaning as a Unicode string. Previously, `compareRank` and associated functions compared strings using this JavaScript-native comparison. Now `compareRank` and associated functions compare strings by lexicographic Unicode Code Point order. ***This change only affects strings containing so-called supplementary characters, i.e., those whose Unicode character code does not fit in 16 bits***.
6+
- This release does not change the `encodePassable` encoding. But now, when we say it is order preserving, we need to be careful about which order we mean. `encodePassable` is rank-order preserving when the encoded strings are compared using `compareRank`.
7+
- The key order of strings defined by the @endo/patterns module is still defined to be the same as the rank ordering of those strings. So this release changes key order among strings to also be lexicographic comparison of Unicode Code Points. To accommodate this change, you may need to adapt applications that relied on key-order being the same as JS native order. This could include the use of any patterns expressing key inequality tests, like `M.gte(string)`.
8+
- These string ordering changes bring Endo into conformance with any string ordering components of the OCapN standard.
9+
- To accommodate these changes, you may need to adapt applications that relied on rank-order or key-order being the same as JS native order. You may need to re-sort any data that had previously been rank sorted using the prior `compareRank` function. You may need to revisit any use of patterns like `M.gte(string)` expressing inequalities over strings.
10+
311
# v0.8.1 (2022-12-23)
412

513
- Remote objects now reflect methods present on their prototype chain.

packages/marshal/index.js

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ export {
1717

1818
export {
1919
trivialComparator,
20+
compareByCodePoints,
2021
assertRankSorted,
2122
compareRank,
2223
isRankSorted,

packages/marshal/src/rankOrder.js

+41-2
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,46 @@ const { entries, fromEntries, setPrototypeOf, is } = Object;
4646
*/
4747
const sameValueZero = (x, y) => x === y || is(x, y);
4848

49+
/**
50+
* @param {any} left
51+
* @param {any} right
52+
* @returns {RankComparison}
53+
*/
4954
export const trivialComparator = (left, right) =>
5055
// eslint-disable-next-line no-nested-ternary, @endo/restrict-comparison-operands
5156
left < right ? -1 : left === right ? 0 : 1;
57+
harden(trivialComparator);
58+
59+
// Apparently eslint is confused about whether the function can ever exit
60+
// without an explicit return.
61+
// eslint-disable-next-line jsdoc/require-returns-check
62+
/**
63+
* @param {string} left
64+
* @param {string} right
65+
* @returns {RankComparison}
66+
*/
67+
export const compareByCodePoints = (left, right) => {
68+
const leftIter = left[Symbol.iterator]();
69+
const rightIter = right[Symbol.iterator]();
70+
for (;;) {
71+
const { value: leftChar } = leftIter.next();
72+
const { value: rightChar } = rightIter.next();
73+
if (leftChar === undefined && rightChar === undefined) {
74+
return 0;
75+
} else if (leftChar === undefined) {
76+
// left is a prefix of right.
77+
return -1;
78+
} else if (rightChar === undefined) {
79+
// right is a prefix of left.
80+
return 1;
81+
}
82+
const leftCodepoint = /** @type {number} */ (leftChar.codePointAt(0));
83+
const rightCodepoint = /** @type {number} */ (rightChar.codePointAt(0));
84+
if (leftCodepoint < rightCodepoint) return -1;
85+
if (leftCodepoint > rightCodepoint) return 1;
86+
}
87+
};
88+
harden(compareByCodePoints);
5289

5390
/**
5491
* @typedef {Record<PassStyle, { index: number, cover: RankCover }>} PassStyleRanksRecord
@@ -140,8 +177,7 @@ export const makeComparatorKit = (compareRemotables = (_x, _y) => 0) => {
140177
return 0;
141178
}
142179
case 'boolean':
143-
case 'bigint':
144-
case 'string': {
180+
case 'bigint': {
145181
// Within each of these passStyles, the rank ordering agrees with
146182
// JavaScript's relational operators `<` and `>`.
147183
if (left < right) {
@@ -151,6 +187,9 @@ export const makeComparatorKit = (compareRemotables = (_x, _y) => 0) => {
151187
return 1;
152188
}
153189
}
190+
case 'string': {
191+
return compareByCodePoints(left, right);
192+
}
154193
case 'symbol': {
155194
return comparator(
156195
nameForPassableSymbol(left),

packages/marshal/test/test-encodePassable.js

+2-2
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ const encodePassableInternal = makeEncodePassable({
6767
encodeError: er => encodeThing('!', er),
6868
});
6969

70-
const encodePassable = passable => {
70+
export const encodePassable = passable => {
7171
resetBuffers();
7272
return encodePassableInternal(passable);
7373
};
@@ -78,7 +78,7 @@ const decodePassableInternal = makeDecodePassable({
7878
decodeError: e => decodeThing('!', e),
7979
});
8080

81-
const decodePassable = encoded => {
81+
export const decodePassable = encoded => {
8282
resetCursors();
8383
return decodePassableInternal(encoded);
8484
};
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import { test } from './prepare-test-env-ava.js';
2+
3+
import { compareRank } from '../src/rankOrder.js';
4+
import { encodePassable } from './test-encodePassable.js';
5+
6+
test('unicode code point order', t => {
7+
// Test case from
8+
// https://icu-project.org/docs/papers/utf16_code_point_order.html
9+
const str0 = '\u{ff61}';
10+
const str3 = '\u{d800}\u{dc02}';
11+
12+
// str1 and str2 become impossible examples once we prohibit
13+
// non-well-formed strings.
14+
// See https://github.com/endojs/endo/pull/2002
15+
const str1 = '\u{d800}X';
16+
const str2 = '\u{d800}\u{ff61}';
17+
18+
// harden to ensure it is not sorted in place, just for sanity
19+
const strs = harden([str0, str1, str2, str3]);
20+
21+
/**
22+
* @param {string} left
23+
* @param {string} right
24+
* @returns {import('../src/types.js').RankComparison}
25+
*/
26+
const nativeComp = (left, right) =>
27+
// eslint-disable-next-line no-nested-ternary
28+
left < right ? -1 : left > right ? 1 : 0;
29+
30+
const nativeSorted = strs.toSorted(nativeComp);
31+
32+
t.deepEqual(nativeSorted, [str1, str3, str2, str0]);
33+
34+
const rankSorted = strs.toSorted(compareRank);
35+
36+
t.deepEqual(rankSorted, [str1, str2, str0, str3]);
37+
38+
const nativeEncComp = (left, right) =>
39+
nativeComp(encodePassable(left), encodePassable(right));
40+
41+
const nativeEncSorted = strs.toSorted(nativeEncComp);
42+
43+
t.deepEqual(nativeEncSorted, nativeSorted);
44+
45+
const rankEncComp = (left, right) =>
46+
compareRank(encodePassable(left), encodePassable(right));
47+
48+
const rankEncSorted = strs.toSorted(rankEncComp);
49+
50+
t.deepEqual(rankEncSorted, rankSorted);
51+
});

packages/patterns/NEWS.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
User-visible changes in `@endo/patterns`:
22

3+
# next release
4+
5+
- JavaScript's relational comparison operators like `<` compare strings by lexicographic UTF-16 code unit order, which exposes an internal representational detail not relevant to the string's meaning as a Unicode string. Previously, `compareKeys` and associated functions compared strings using this JavaScript-native comparison. Now `compareKeys` and associated functions compare strings by lexicographic Unicode Code Point order. ***This change only affects strings containing so-called supplementary characters, i.e., those whose Unicode character code does not fit in 16 bits***.
6+
- See the NEWS.md of @endo/marshal for more on this change.
7+
38
# v0.2.6 (2023-09-11)
49

510
- Adds support for CopyMap patterns (e.g., `matches(specimen, makeCopyMap([]))`).
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// modeled on test-string-rank-order.js
2+
3+
import { test } from './prepare-test-env-ava.js';
4+
5+
import { compareKeys } from '../src/keys/compareKeys.js';
6+
7+
test('unicode code point order', t => {
8+
// Test case from
9+
// https://icu-project.org/docs/papers/utf16_code_point_order.html
10+
const str0 = '\u{ff61}';
11+
const str3 = '\u{d800}\u{dc02}';
12+
13+
// str1 and str2 become impossible examples once we prohibit
14+
// non-well-formed strings.
15+
// See https://github.com/endojs/endo/pull/2002
16+
const str1 = '\u{d800}X';
17+
const str2 = '\u{d800}\u{ff61}';
18+
19+
// harden to ensure it is not sorted in place, just for sanity
20+
const strs = harden([str0, str1, str2, str3]);
21+
22+
/**
23+
* @param {string} left
24+
* @param {string} right
25+
* @returns {import('../src/types.js').KeyComparison}
26+
*/
27+
const nativeComp = (left, right) =>
28+
// eslint-disable-next-line no-nested-ternary
29+
left < right ? -1 : left > right ? 1 : 0;
30+
31+
const nativeSorted = strs.toSorted(nativeComp);
32+
33+
t.deepEqual(nativeSorted, [str1, str3, str2, str0]);
34+
35+
const keySorted = strs.toSorted(compareKeys);
36+
37+
t.deepEqual(keySorted, [str1, str2, str0, str3]);
38+
});

0 commit comments

Comments
 (0)