Skip to content

Commit 467dbca

Browse files
committed
fix(marshal)!: compare strings by codepoint
1 parent 0a91fbe commit 467dbca

8 files changed

+269
-87
lines changed

packages/marshal/NEWS.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
11
User-visible changes in `@endo/marshal`:
22

3+
# Next release
4+
5+
- JavaScript's relational comparison operators like `<` compare strings by lexicographic UTF16 code unit order, which exposes an internal representational detail not relevant to the string's meaning as a Unicode string. Previously, `compareRank` and associated functions compared strings using this JavaScript-native comparison. Now `compareRank` and associated functions compare strings by lexicographic Unicode Code Point order. ***This change only affects strings containing so-called supplementary characters, i.e., those whose Unicode character code does not fit in 16 bits***.
6+
- This release does not change the `encodePassable` encoding. But now, when we say it is order preserving, we need to be careful about which order we mean. `encodePassable` is rank-order preserving when the encoded strings are compared using `compareRank`.
7+
- The key order of strings defined by the @endo/patterns module is still defined to be the same as the rank ordering of those strings. So this release changes key order among strings to also be lexicographic comparison of Unicode Code Points. To accommodate this change, you may need to adapt applications that relied on key-order being the same as JS native order. This could include the use of any patterns expressing key inequality tests, like `M.gte(string)`.
8+
- These string ordering changes bring Endo into conformance with any string ordering components of the OCapN standard.
9+
- To accommodate these changes, you may need to adapt applications that relied on rank-order or key-order being the same as JS native order. You may need to re-sort any data that had previously been rank sorted using the prior `compareRank` function. You may need to revisit any use of patterns like `M.gte(string)` expressing inequalities over strings.
10+
311
# v1.3.0 (2024-02-22)
412

513
- Sending and receiving extended errors.

packages/marshal/index.js

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ export {
1818

1919
export {
2020
trivialComparator,
21+
compareByCodePoints,
2122
assertRankSorted,
2223
compareRank,
2324
isRankSorted,

packages/marshal/src/rankOrder.js

+42-3
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import {
88

99
/**
1010
* @import {Passable, PassStyle} from '@endo/pass-style'
11-
* @import {FullCompare, RankCompare, RankCover} from './types.js'
11+
* @import {FullCompare, RankCompare, RankCover, RankComparison} from './types.js'
1212
*/
1313

1414
const { entries, fromEntries, setPrototypeOf, is } = Object;
@@ -44,9 +44,46 @@ const { entries, fromEntries, setPrototypeOf, is } = Object;
4444
*/
4545
const sameValueZero = (x, y) => x === y || is(x, y);
4646

47+
/**
48+
* @param {any} left
49+
* @param {any} right
50+
* @returns {RankComparison}
51+
*/
4752
export const trivialComparator = (left, right) =>
4853
// eslint-disable-next-line no-nested-ternary, @endo/restrict-comparison-operands
4954
left < right ? -1 : left === right ? 0 : 1;
55+
harden(trivialComparator);
56+
57+
// Apparently eslint is confused about whether the function can ever exit
58+
// without an explicit return.
59+
// eslint-disable-next-line jsdoc/require-returns-check
60+
/**
 * Compare two strings by lexicographic Unicode code point order.
 *
 * JavaScript's native `<` / `>` compare strings by UTF-16 code unit, which
 * orders supplementary characters (encoded as surrogate pairs starting at
 * 0xD800) before BMP characters in the range U+E000..U+FFFF. This comparator
 * walks both strings one code point at a time instead, so the two orders
 * differ only for strings containing supplementary characters or surrogates.
 *
 * @param {string} left
 * @param {string} right
 * @returns {RankComparison} -1 if `left` sorts first, 1 if `right` sorts
 * first, and 0 if both strings contain the same sequence of code points.
 */
export const compareByCodePoints = (left, right) => {
  // String iterators yield one code point per step: a well-formed surrogate
  // pair comes out as a single two-code-unit string, and a lone surrogate
  // comes out by itself.
  const leftIter = left[Symbol.iterator]();
  const rightIter = right[Symbol.iterator]();
  for (;;) {
    const leftStep = leftIter.next();
    const rightStep = rightIter.next();
    if (leftStep.done && rightStep.done) {
      // Both strings ended together without any difference.
      return 0;
    }
    if (leftStep.done) {
      // left is a prefix of right.
      return -1;
    }
    if (rightStep.done) {
      // right is a prefix of left.
      return 1;
    }
    // Each yielded value is a non-empty string, so codePointAt(0) is always
    // a number; the casts only inform the type checker of that.
    const leftCodepoint = /** @type {number} */ (
      leftStep.value.codePointAt(0)
    );
    const rightCodepoint = /** @type {number} */ (
      rightStep.value.codePointAt(0)
    );
    if (leftCodepoint < rightCodepoint) return -1;
    if (leftCodepoint > rightCodepoint) return 1;
    // Equal code points at this position: keep scanning.
  }
};
86+
harden(compareByCodePoints);
5087

5188
/**
5289
* @typedef {Record<PassStyle, { index: number, cover: RankCover }>} PassStyleRanksRecord
@@ -138,8 +175,7 @@ export const makeComparatorKit = (compareRemotables = (_x, _y) => 0) => {
138175
return 0;
139176
}
140177
case 'boolean':
141-
case 'bigint':
142-
case 'string': {
178+
case 'bigint': {
143179
// Within each of these passStyles, the rank ordering agrees with
144180
// JavaScript's relational operators `<` and `>`.
145181
if (left < right) {
@@ -149,6 +185,9 @@ export const makeComparatorKit = (compareRemotables = (_x, _y) => 0) => {
149185
return 1;
150186
}
151187
}
188+
case 'string': {
189+
return compareByCodePoints(left, right);
190+
}
152191
case 'symbol': {
153192
return comparator(
154193
nameForPassableSymbol(left),
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/* eslint-disable no-bitwise, @endo/restrict-comparison-operands */
2+
import { Fail, q } from '@endo/errors';
3+
4+
import {
5+
makeEncodePassable,
6+
makeDecodePassable,
7+
} from '../src/encodePassable.js';
8+
import { compareRank, makeComparatorKit } from '../src/rankOrder.js';
9+
10+
const buffers = {
11+
__proto__: null,
12+
r: [],
13+
'?': [],
14+
'!': [],
15+
};
16+
const resetBuffers = () => {
17+
buffers.r = [];
18+
buffers['?'] = [];
19+
buffers['!'] = [];
20+
};
21+
const cursors = {
22+
__proto__: null,
23+
r: 0,
24+
'?': 0,
25+
'!': 0,
26+
};
27+
const resetCursors = () => {
28+
cursors.r = 0;
29+
cursors['?'] = 0;
30+
cursors['!'] = 0;
31+
};
32+
33+
const encodeThing = (prefix, r) => {
34+
buffers[prefix].push(r);
35+
// With this encoding, all things with the same prefix have the same rank
36+
return prefix;
37+
};
38+
39+
/**
 * Test-only decoder paired with `encodeThing`: since every encoded thing is
 * represented only by its prefix, decoding returns the next not-yet-consumed
 * entry from `buffers[prefix]` and advances `cursors[prefix]`.
 *
 * @param {'r' | '?' | '!'} prefix which buffer/cursor pair to read from
 * @param {string} e the encoding, which must be exactly `prefix`
 * @returns {any} the next buffered value for `prefix`
 */
const decodeThing = (prefix, e) => {
  prefix === e ||
    Fail`expected encoding ${q(e)} to simply be the prefix ${q(prefix)}`;
  // The cursor must index an existing entry, so the bound is strict; the
  // message states the same strict inequality that is actually checked.
  (cursors[prefix] >= 0 && cursors[prefix] < buffers[prefix].length) ||
    Fail`while decoding ${q(e)}, expected cursors[${q(prefix)}], i.e., ${q(
      cursors[prefix],
    )} < ${q(buffers[prefix].length)}`;
  const thing = buffers[prefix][cursors[prefix]];
  cursors[prefix] += 1;
  return thing;
};
50+
51+
const encodePassableInternal = makeEncodePassable({
52+
encodeRemotable: r => encodeThing('r', r),
53+
encodePromise: p => encodeThing('?', p),
54+
encodeError: er => encodeThing('!', er),
55+
});
56+
57+
export const encodePassableInternal2 = makeEncodePassable({
58+
encodeRemotable: r => encodeThing('r', r),
59+
encodePromise: p => encodeThing('?', p),
60+
encodeError: er => encodeThing('!', er),
61+
format: 'compactOrdered',
62+
});
63+
64+
export const encodePassable = passable => {
65+
resetBuffers();
66+
return encodePassableInternal(passable);
67+
};
68+
69+
export const encodePassable2 = passable => {
70+
resetBuffers();
71+
return encodePassableInternal2(passable);
72+
};
73+
export const decodePassableInternal = makeDecodePassable({
74+
decodeRemotable: e => decodeThing('r', e),
75+
decodePromise: e => decodeThing('?', e),
76+
decodeError: e => decodeThing('!', e),
77+
});
78+
79+
export const decodePassable = encoded => {
80+
resetCursors();
81+
return decodePassableInternal(encoded);
82+
};
83+
84+
const compareRemotables = (x, y) =>
85+
compareRank(encodeThing('r', x), encodeThing('r', y));
86+
87+
export const { comparator: compareFull } = makeComparatorKit(compareRemotables);

packages/marshal/test/encodePassable.test.js

+11-84
Original file line numberDiff line numberDiff line change
@@ -5,91 +5,20 @@ import test from '@endo/ses-ava/prepare-endo.js';
55
import { fc } from '@fast-check/ava';
66
import { Remotable } from '@endo/pass-style';
77
import { arbPassable } from '@endo/pass-style/tools.js';
8-
import { Fail, q } from '@endo/errors';
8+
import { Fail } from '@endo/errors';
99

10-
import {
11-
makePassableKit,
12-
makeEncodePassable,
13-
makeDecodePassable,
14-
} from '../src/encodePassable.js';
15-
import { compareRank, makeComparatorKit } from '../src/rankOrder.js';
10+
import { makePassableKit, makeEncodePassable } from '../src/encodePassable.js';
11+
import { compareRank } from '../src/rankOrder.js';
1612
import { unsortedSample } from './marshal-test-data.js';
1713

18-
const buffers = {
19-
__proto__: null,
20-
r: [],
21-
'?': [],
22-
'!': [],
23-
};
24-
const resetBuffers = () => {
25-
buffers.r = [];
26-
buffers['?'] = [];
27-
buffers['!'] = [];
28-
};
29-
const cursors = {
30-
__proto__: null,
31-
r: 0,
32-
'?': 0,
33-
'!': 0,
34-
};
35-
const resetCursors = () => {
36-
cursors.r = 0;
37-
cursors['?'] = 0;
38-
cursors['!'] = 0;
39-
};
40-
41-
const encodeThing = (prefix, r) => {
42-
buffers[prefix].push(r);
43-
// With this encoding, all things with the same prefix have the same rank
44-
return prefix;
45-
};
46-
47-
const decodeThing = (prefix, e) => {
48-
prefix === e ||
49-
Fail`expected encoding ${q(e)} to simply be the prefix ${q(prefix)}`;
50-
(cursors[prefix] >= 0 && cursors[prefix] < buffers[prefix].length) ||
51-
Fail`while decoding ${q(e)}, expected cursors[${q(prefix)}], i.e., ${q(
52-
cursors[prefix],
53-
)} <= ${q(buffers[prefix].length)}`;
54-
const thing = buffers[prefix][cursors[prefix]];
55-
cursors[prefix] += 1;
56-
return thing;
57-
};
58-
59-
const compareRemotables = (x, y) =>
60-
compareRank(encodeThing('r', x), encodeThing('r', y));
61-
62-
const encodePassableInternal = makeEncodePassable({
63-
encodeRemotable: r => encodeThing('r', r),
64-
encodePromise: p => encodeThing('?', p),
65-
encodeError: er => encodeThing('!', er),
66-
});
67-
const encodePassableInternal2 = makeEncodePassable({
68-
encodeRemotable: r => encodeThing('r', r),
69-
encodePromise: p => encodeThing('?', p),
70-
encodeError: er => encodeThing('!', er),
71-
format: 'compactOrdered',
72-
});
73-
74-
const encodePassable = passable => {
75-
resetBuffers();
76-
return encodePassableInternal(passable);
77-
};
78-
const encodePassable2 = passable => {
79-
resetBuffers();
80-
return encodePassableInternal2(passable);
81-
};
82-
83-
const decodePassableInternal = makeDecodePassable({
84-
decodeRemotable: e => decodeThing('r', e),
85-
decodePromise: e => decodeThing('?', e),
86-
decodeError: e => decodeThing('!', e),
87-
});
88-
89-
const decodePassable = encoded => {
90-
resetCursors();
91-
return decodePassableInternal(encoded);
92-
};
14+
import {
15+
encodePassable,
16+
encodePassable2,
17+
encodePassableInternal2,
18+
decodePassable,
19+
decodePassableInternal,
20+
compareFull,
21+
} from './encodePassable-for-testing.js';
9322

9423
test('makePassableKit output shape', t => {
9524
const kit = makePassableKit();
@@ -133,8 +62,6 @@ test(
13362
(...args) => makePassableKit(...args).encodePassable,
13463
);
13564

136-
const { comparator: compareFull } = makeComparatorKit(compareRemotables);
137-
13865
const asNumber = new Float64Array(1);
13966
const asBits = new BigUint64Array(asNumber.buffer);
14067
const getNaN = (hexEncoding = '0008000000000000') => {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
import test from '@endo/ses-ava/prepare-endo.js';
2+
3+
import { compareRank } from '../src/rankOrder.js';
4+
import { encodePassable } from './encodePassable-for-testing.js';
5+
6+
/**
7+
* Essentially a ponyfill for Array.prototype.toSorted, for use before
8+
* we can always rely on the platform to provide it.
9+
*
10+
* @param {string[]} strings
11+
* @param {(
12+
* left: string,
13+
* right: string
14+
* ) => import('../src/types.js').RankComparison} comp
15+
* @returns {string[]}
16+
*/
17+
const sorted = (strings, comp) => [...strings].sort(comp);
18+
19+
test('unicode code point order', t => {
20+
// Test case from
21+
// https://icu-project.org/docs/papers/utf16_code_point_order.html
22+
const str0 = '\u{ff61}';
23+
const str3 = '\u{d800}\u{dc02}';
24+
25+
// str1 and str2 become impossible examples once we prohibit
26+
// non-well-formed strings.
27+
// See https://github.com/endojs/endo/pull/2002
28+
const str1 = '\u{d800}X';
29+
const str2 = '\u{d800}\u{ff61}';
30+
31+
// harden to ensure it is not sorted in place, just for sanity
32+
const strs = harden([str0, str1, str2, str3]);
33+
34+
/**
35+
* @param {string} left
36+
* @param {string} right
37+
* @returns {import('../src/types.js').RankComparison}
38+
*/
39+
const nativeComp = (left, right) =>
40+
// eslint-disable-next-line no-nested-ternary
41+
left < right ? -1 : left > right ? 1 : 0;
42+
43+
const nativeSorted = sorted(strs, nativeComp);
44+
45+
t.deepEqual(nativeSorted, [str1, str3, str2, str0]);
46+
47+
const rankSorted = sorted(strs, compareRank);
48+
49+
t.deepEqual(rankSorted, [str1, str2, str0, str3]);
50+
51+
const nativeEncComp = (left, right) =>
52+
nativeComp(encodePassable(left), encodePassable(right));
53+
54+
const nativeEncSorted = sorted(strs, nativeEncComp);
55+
56+
t.deepEqual(nativeEncSorted, nativeSorted);
57+
58+
const rankEncComp = (left, right) =>
59+
compareRank(encodePassable(left), encodePassable(right));
60+
61+
const rankEncSorted = sorted(strs, rankEncComp);
62+
63+
t.deepEqual(rankEncSorted, rankSorted);
64+
});

packages/patterns/NEWS.md

+2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ User-visible changes in `@endo/patterns`:
44

55
- `Passable` is now an accurate type instead of `any`. Downstream type checking may require changes ([example](https://github.com/Agoric/agoric-sdk/pull/8774))
66
- Some downstream types that take or return `Passable` were changed to `any` to defer downstream work to accommodate.
7+
- JavaScript's relational comparison operators like `<` compare strings by lexicographic UTF16 code unit order, which exposes an internal representational detail not relevant to the string's meaning as a Unicode string. Previously, `compareKeys` and associated functions compared strings using this JavaScript-native comparison. Now `compareKeys` and associated functions compare strings by lexicographic Unicode Code Point order. ***This change only affects strings containing so-called supplementary characters, i.e., those whose Unicode character code does not fit in 16 bits***.
8+
- See the NEWS.md of @endo/marshal for more on this change.
79

810
# v1.2.0 (2024-02-22)
911

0 commit comments

Comments
 (0)