Skip to content

Commit aa1774c

Browse files
authored
Improve lowercase mapping & add context dependent Sigma (#1113)
1 parent 8e73c0c commit aa1774c

12 files changed

+16132
-8503
lines changed

std/assembly/string.ts

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,9 @@
11
/// <reference path="./rt/index.d.ts" />
22

33
import { BLOCK, BLOCK_OVERHEAD, BLOCK_MAXSIZE } from "./rt/common";
4-
import { compareImpl, strtol, strtod, isSpace, isAscii, toLower8, toUpper8 } from "./util/string";
4+
import { compareImpl, strtol, strtod, isSpace, isAscii, isFinalSigma, toLower8, toUpper8 } from "./util/string";
55
import { SPECIALS_UPPER, casemap, bsearch } from "./util/casemap";
66
import { E_INVALIDLENGTH } from "./util/error";
7-
import { idof } from "./builtins";
87

98
@sealed export abstract class String {
109

@@ -54,7 +53,7 @@ import { idof } from "./builtins";
5453
if (<u32>pos >= <u32>len) return -1; // (undefined)
5554
var first = <i32>load<u16>(changetype<usize>(this) + (<usize>pos << 1));
5655
if ((first & 0xFC00) != 0xD800 || pos + 1 == len) return first;
57-
var second = <i32>load<u16>(changetype<usize>(this) + ((<usize>pos + 1) << 1));
56+
var second = <i32>load<u16>(changetype<usize>(this) + (<usize>pos << 1), 2);
5857
if ((second & 0xFC00) != 0xDC00) return first;
5958
return (first - 0xD800 << 10) + (second - 0xDC00) + 0x10000;
6059
}
@@ -526,6 +525,13 @@ import { idof } from "./builtins";
526525
// 0x0130 -> [0x0069, 0x0307]
527526
store<u32>(codes + (j << 1), (0x0307 << 16) | 0x0069);
528527
++j;
528+
} else if (c == 0x03A3) { // 'Σ'
529+
// Σ maps to σ, except at the end of a word, where it maps to ς
530+
let sigma = 0x03C3; // σ
531+
if (len > 1 && isFinalSigma(changetype<usize>(this), i, len)) {
532+
sigma = 0x03C2; // ς
533+
}
534+
store<u16>(codes + (j << 1), sigma);
529535
} else if (c - 0x24B6 <= 0x24CF - 0x24B6) {
530536
// Range 0x24B6 <= c <= 0x24CF is not covered by casemap and requires special early handling
531537
store<u16>(codes + (j << 1), c + 26);
@@ -552,7 +558,8 @@ import { idof } from "./builtins";
552558
var len = <usize>this.length;
553559
if (!len) return this;
554560
var codes = __alloc(len * 3 * 2, idof<String>());
555-
var specialsUpperLen = SPECIALS_UPPER.length;
561+
var specialsPtr = changetype<usize>(SPECIALS_UPPER);
562+
var specialsLen = SPECIALS_UPPER.length;
556563
var j: usize = 0;
557564
for (let i: usize = 0; i < len; ++i, ++j) {
558565
let c = <u32>load<u16>(changetype<usize>(this) + (i << 1));
@@ -578,15 +585,15 @@ import { idof } from "./builtins";
578585
// monkey patch
579586
store<u16>(codes + (j << 1), c - 26);
580587
} else {
581-
let index = -1;
588+
let index = -1 as usize;
582589
// Fast range check. See first and last rows in specialsUpper table
583590
if (c - 0x00DF <= 0xFB17 - 0x00DF) {
584-
index = <usize>bsearch(c, changetype<usize>(SPECIALS_UPPER), specialsUpperLen);
591+
index = <usize>bsearch(c, specialsPtr, specialsLen);
585592
}
586593
if (~index) {
587594
// load next 3 code points from row with `index` offset for specialsUpper table
588-
let ab = load<u32>(changetype<usize>(SPECIALS_UPPER) + (index << 1), 2);
589-
let cc = load<u16>(changetype<usize>(SPECIALS_UPPER) + (index << 1), 6);
595+
let ab = load<u32>(specialsPtr + (index << 1), 2);
596+
let cc = load<u16>(specialsPtr + (index << 1), 6);
590597
store<u32>(codes + (j << 1), ab, 0);
591598
store<u16>(codes + (j << 1), cc, 4);
592599
j += 1 + usize(cc != 0);

std/assembly/util/string.ts

Lines changed: 553 additions & 61 deletions
Large diffs are not rendered by default.

tests/compiler/std/string.js renamed to tests/compiler/std/string-casemapping.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
exports.preInstantiate = function preInstantiate(imports) {
2-
imports.string = {
2+
imports.string_casemapping = {
33
toUpperCaseFromIndex: function toUpperCaseFromIndex(index, codePointIndex) {
44
const code = String.fromCodePoint(index).toUpperCase().codePointAt(codePointIndex|0);
55
return code !== undefined ? code : -1;
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
{
2+
"asc_flags": [
3+
"--runtime half",
4+
"--explicitStart",
5+
"--use ASC_RTRACE=1"
6+
]
7+
}

tests/compiler/std/string-casemapping.optimized.wat

Lines changed: 4671 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
@external("string_casemapping", "toUpperCaseFromIndex")
2+
declare function toUpperCaseFromIndex(index: i32, codePointIndex: i32): i32;
3+
4+
@external("string_casemapping", "toLowerCaseFromIndex")
5+
declare function toLowerCaseFromIndex(index: i32, codePointIndex: i32): i32;
6+
7+
// Basic case mapping tests
8+
assert("".toUpperCase() == "");
9+
assert("".toLowerCase() == "");
10+
assert("09_AZ az.!\n".toUpperCase() == "09_AZ AZ.!\n");
11+
assert("09_AZ az.!\t".toLowerCase() == "09_az az.!\t");
12+
assert("Der Wechsel allein ist das Beständige".toUpperCase() == "DER WECHSEL ALLEIN IST DAS BESTÄNDIGE");
13+
assert("DER WECHSEL ALLEIN IST DAS BESTÄNDIGE".toLowerCase() == "der wechsel allein ist das beständige");
14+
assert("@ — Друг человека!".toUpperCase() == "@ — ДРУГ ЧЕЛОВЕКА!");
15+
assert("@ — ДРУГ ЧЕЛОВЕКА!".toLowerCase() == "@ — друг человека!");
16+
assert("∮ E⋅da = Q, n → ∞, ∑ f(i) = ∏ g(i)".toUpperCase() == "∮ E⋅DA = Q, N → ∞, ∑ F(I) = ∏ G(I)");
17+
assert("∮ E⋅DA = Q, N → ∞, ∑ F(I) = ∏ G(I)".toLowerCase() == "∮ e⋅da = q, n → ∞, ∑ f(i) = ∏ g(i)");
18+
assert("ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃn".toUpperCase() == "ÐI INTƏˈNÆƩƏNƏL FƏˈNƐTIK ƏSOƱSIˈEIƩN");
19+
assert("ÐI INTƏˈNÆƩƏNƏL FƏˈNƐTIK ƏSOƱSIˈEIƩN".toLowerCase() == "ði intəˈnæʃənəl fəˈnɛtik əsoʊsiˈeiʃn");
20+
assert("Σὲ γνωρίζω ἀπὸ τὴν κόψη".toUpperCase() == "ΣῈ ΓΝΩΡΊΖΩ ἈΠῸ ΤῊΝ ΚΌΨΗ");
21+
assert("τοῦ σπαθιοῦ τὴν τρομερή,".toUpperCase() == "ΤΟΥ͂ ΣΠΑΘΙΟΥ͂ ΤῊΝ ΤΡΟΜΕΡΉ,");
22+
assert("σὲ γνωρίζω ἀπὸ τὴν ὄψη".toUpperCase() == "ΣῈ ΓΝΩΡΊΖΩ ἈΠῸ ΤῊΝ ὌΨΗ");
23+
assert("ποὺ μὲ βία μετράει τὴ γῆ.".toUpperCase() == "ΠΟῪ ΜῈ ΒΊΑ ΜΕΤΡΆΕΙ ΤῊ ΓΗ͂.");
24+
assert("Απ᾿ τὰ κόκκαλα βγαλμένη".toUpperCase() == "ΑΠ᾿ ΤᾺ ΚΌΚΚΑΛΑ ΒΓΑΛΜΈΝΗ");
25+
assert("τῶν ῾Ελλήνων τὰ ἱερά".toUpperCase() == "ΤΩ͂Ν ῾ΕΛΛΉΝΩΝ ΤᾺ ἹΕΡΆ");
26+
assert("καὶ σὰν πρῶτα ἀνδρειωμένη".toUpperCase() == "ΚΑῚ ΣᾺΝ ΠΡΩ͂ΤΑ ἈΝΔΡΕΙΩΜΈΝΗ");
27+
assert("χαῖρε, ὦ χαῖρε, ᾿Ελευθεριά!".toUpperCase() == "ΧΑΙ͂ΡΕ, Ὦ ΧΑΙ͂ΡΕ, ᾿ΕΛΕΥΘΕΡΙΆ!");
28+
assert(
29+
"ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789abcdefghijklmnopqrstuvwxyz".toUpperCase() ==
30+
"ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
31+
);
32+
assert(
33+
"ABCDEFGHIJKLMNOPQRSTUVWXYZ /0123456789abcdefghijklmnopqrstuvwxyz".toLowerCase() ==
34+
"abcdefghijklmnopqrstuvwxyz /0123456789abcdefghijklmnopqrstuvwxyz"
35+
);
36+
assert("ß".toUpperCase() == "SS");
37+
assert("İ".toLowerCase() == "i̇"); // 0x0130
38+
assert(
39+
"£©µÀÆÖÞßéöÿ–—‘“”„†•…‰™œŠŸž€ ΑΒΓΔΩαβγδω АБВГДабвгд∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა".toUpperCase() ==
40+
"£©ΜÀÆÖÞSSÉÖŸ–—‘“”„†•…‰™ŒŠŸŽ€ ΑΒΓΔΩΑΒΓΔΩ АБВГДАБВГД∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ FI�⑀₂ἨḂӤẄⱯː⍎אԱᲐ"
41+
);
42+
assert("ß".toUpperCase().toLowerCase() == "ss");
43+
assert("fi".toUpperCase().toLowerCase() == "fi");
44+
assert(
45+
"𠜎 𠜱 𠝹 𠱓 𠱸 𠲖 𠳏 𠳕 𠴕 𠵼 𠵿 𠸎 𠸏 𠹷 𠺝 𠺢 𠻗 𠻹 𠻺 𠼭 𠼮 𠽌 𠾴 𠾼 𠿪 𡁜 𡁯 𡁵 𡁶 𡁻 𡃁"
46+
.toUpperCase().toLowerCase() ==
47+
"𠜎 𠜱 𠝹 𠱓 𠱸 𠲖 𠳏 𠳕 𠴕 𠵼 𠵿 𠸎 𠸏 𠹷 𠺝 𠺢 𠻗 𠻹 𠻺 𠼭 𠼮 𠽌 𠾴 𠾼 𠿪 𡁜 𡁯 𡁵 𡁶 𡁻 𡃁"
48+
);
49+
50+
assert(String.fromCodePoint(0x10000).toLowerCase() == "𐀀");
51+
assert(String.fromCodePoint(0x10000).toUpperCase() == "𐀀");
52+
53+
// Tests some special casing for lower case mapping
54+
assert("\u1F88".toLowerCase() == "\u1F80");
55+
assert("\u1F8F".toLowerCase() == "\u1F87");
56+
assert("\u1FFC".toLowerCase() == "\u1FF3");
57+
58+
// Tests sigma 'Σ' special cases
59+
assert("Σ".toLowerCase() == "σ");
60+
assert(" Σ".toLowerCase() == " σ");
61+
assert("Σ ".toLowerCase() == "σ ");
62+
assert(" Σ ".toLowerCase() == " σ ");
63+
assert("aΣ ".toLowerCase() == "aς ");
64+
assert("aΣ\n".toLowerCase() == "aς\n");
65+
assert("aΣ".toLowerCase() == "aς");
66+
assert("aΣb".toLowerCase() == "aσb");
67+
assert("ΣΣ ".toLowerCase() == "σς ");
68+
assert("1Σ ".toLowerCase() == "1σ ");
69+
assert(";Σ ".toLowerCase() == ";σ ");
70+
assert("\u0301Σ ".toLowerCase() == "\u0301σ ");
71+
72+
assert("Σ\u0301Σ\u0301 ".toLowerCase() == "σ́ς́ ");
73+
assert("ΣΣ-".toLowerCase() == "σς-");
74+
assert("Σ\u0301Σ\u0301-".toLowerCase() == "σ́ς́-");
75+
assert("Σ\u0301Σ\u0301猪".toLowerCase() == "σ́ς́猪");
76+
77+
// sigma tests from Test262
78+
assert("\uD835\uDCA2\u03A3".toLowerCase() == "\uD835\uDCA2\u03C2");
79+
assert("A.\u03A3".toLowerCase() == "a.\u03C2");
80+
assert("A\u00AD\u03A3".toLowerCase() == "a\u00AD\u03C2");
81+
assert("A\uD834\uDE42\u03A3".toLowerCase() == "a\uD834\uDE42\u03C2");
82+
assert("\u0345\u03A3".toLowerCase() == "\u0345\u03C3");
83+
assert("\u0391\u0345\u03A3".toLowerCase() == "\u03B1\u0345\u03C2");
84+
assert("A\u03A3B".toLowerCase() == "a\u03C3b");
85+
assert("A\u03A3\uD835\uDCA2".toLowerCase() == "a\u03C3\uD835\uDCA2");
86+
assert("A\u03A3.b".toLowerCase() == "a\u03C3.b");
87+
assert("A\u03A3\u00ADB".toLowerCase() == "a\u03C3\u00ADb");
88+
assert("A\u03A3\uD834\uDE42B".toLowerCase() == "a\u03C3\uD834\uDE42b");
89+
assert("A\u03A3\u0345".toLowerCase() == "a\u03C2\u0345");
90+
assert("A\u03A3\u0345\u0391".toLowerCase() == "a\u03C3\u0345\u03B1");
91+
assert("A\u180E\u03A3".toLowerCase() == "a\u180E\u03C2");
92+
assert("A\u180E\u03A3B".toLowerCase() == "a\u180E\u03C3b");
93+
assert("A\u03A3\u180E".toLowerCase() == "a\u03C2\u180E");
94+
assert("A\u03A3\u180EB".toLowerCase() == "a\u03C3\u180Eb");
95+
assert("A\u180E\u03A3\u180E".toLowerCase() == "a\u180E\u03C2\u180E");
96+
assert("A\u180E\u03A3\u180EB".toLowerCase() == "a\u180E\u03C3\u180Eb");
97+
98+
// Tests some special casing for upper case mapping
99+
assert("\uFB00".toUpperCase() == "FF");
100+
assert("\uFB01".toUpperCase() == "FI");
101+
assert("\uFB02".toUpperCase() == "FL");
102+
assert("\uFB03".toUpperCase() == "FFI");
103+
assert("\uFB04".toUpperCase() == "FFL");
104+
assert("\uFB05".toUpperCase() == "ST");
105+
assert("\uFB06".toUpperCase() == "ST");
106+
assert("\u01F0".toUpperCase() == "J\u030C");
107+
assert("\u1E96".toUpperCase() == "H\u0331");
108+
assert("\u1E97".toUpperCase() == "T\u0308");
109+
assert("\u1E98".toUpperCase() == "W\u030A");
110+
assert("\u1E99".toUpperCase() == "Y\u030A");
111+
assert("\u1E9A".toUpperCase() == "A\u02BE");
112+
113+
// Test the full Unicode range `0x0 - 0x10FFFF`, asserting each result against the v8 engine.
114+
for (let i = 0; i <= 0x10FFFF; i++) {
  // For every code point, lower- and upper-case it on the AssemblyScript side
  // and compare the result with what the host reports through the imported
  // `toLowerCaseFromIndex` / `toUpperCaseFromIndex` functions.
  let source = String.fromCodePoint(i);
  let origLower = source.toLowerCase();
  let origUpper = source.toUpperCase();

  // Must be signed: both `codePointAt` and the host imports signal "no code
  // point at this index" with -1. With an unsigned type the `>= 0` guards
  // below are always true and the sentinel gets packed into the result.
  let code1: i64, code2: i64;

  // Up to three code points are packed into one u64 (shifted by 0, 16 and 32
  // bits) so that a whole mapping can be compared with a single equality check.

  // collect all code points for lower case on AssemblyScript side
  let origLowerCode = <u64>origLower.codePointAt(0);
  if ((code1 = origLower.codePointAt(1)) >= 0) origLowerCode += <u64>code1 << 16;
  if ((code2 = origLower.codePointAt(2)) >= 0) origLowerCode += <u64>code2 << 32;

  // collect all code points for upper case on AssemblyScript side
  let origUpperCode = <u64>origUpper.codePointAt(0);
  if ((code1 = origUpper.codePointAt(1)) >= 0) origUpperCode += <u64>code1 << 16;
  if ((code2 = origUpper.codePointAt(2)) >= 0) origUpperCode += <u64>code2 << 32;

  // collect all code points for lower case on JavaScript side
  let expectLowerCode = <u64>toLowerCaseFromIndex(i, 0);
  if ((code1 = <i64>toLowerCaseFromIndex(i, 1)) >= 0) expectLowerCode += <u64>code1 << 16;
  if ((code2 = <i64>toLowerCaseFromIndex(i, 2)) >= 0) expectLowerCode += <u64>code2 << 32;

  // collect all code points for upper case on JavaScript side
  let expectUpperCode = <u64>toUpperCaseFromIndex(i, 0);
  if ((code1 = <i64>toUpperCaseFromIndex(i, 1)) >= 0) expectUpperCode += <u64>code1 << 16;
  if ((code2 = <i64>toUpperCaseFromIndex(i, 2)) >= 0) expectUpperCode += <u64>code2 << 32;

  // Trace the offending code point and both packed values before asserting,
  // so a failure identifies which input diverged.
  if (origLowerCode != expectLowerCode) {
    trace("origLowerCode != expectLowerCode", 3, i, <f64>origLowerCode, <f64>expectLowerCode);
  }

  if (origUpperCode != expectUpperCode) {
    trace("origUpperCode != expectUpperCode", 3, i, <f64>origUpperCode, <f64>expectUpperCode);
  }

  assert(origLowerCode == expectLowerCode);
  assert(origUpperCode == expectUpperCode);
}

0 commit comments

Comments
 (0)