Skip to content

Commit 25319d5

Browse files
committed
src: implement Windows-1252 encoding support and update related tests
1 parent e42c6c8 commit 25319d5

File tree

6 files changed

+189
-42
lines changed

6 files changed

+189
-42
lines changed

lib/internal/encoding.js

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ const kEncoding = Symbol('encoding');
2828
const kDecoder = Symbol('decoder');
2929
const kFatal = Symbol('kFatal');
3030
const kUTF8FastPath = Symbol('kUTF8FastPath');
31-
const kLatin1FastPath = Symbol('kLatin1FastPath');
31+
const kWindows1252FastPath = Symbol('kWindows1252FastPath');
3232
const kIgnoreBOM = Symbol('kIgnoreBOM');
3333

3434
const {
@@ -55,7 +55,7 @@ const {
5555
encodeIntoResults,
5656
encodeUtf8String,
5757
decodeUTF8,
58-
decodeLatin1,
58+
decodeWindows1252,
5959
} = binding;
6060

6161
const { Buffer } = require('buffer');
@@ -420,10 +420,10 @@ function makeTextDecoderICU() {
420420
this[kFatal] = Boolean(options?.fatal);
421421
// Only UTF-8 and windows-1252 have a fast path.
422422
this[kUTF8FastPath] = enc === 'utf-8';
423-
this[kLatin1FastPath] = enc === 'windows-1252';
423+
this[kWindows1252FastPath] = enc === 'windows-1252';
424424
this[kHandle] = undefined;
425425

426-
if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
426+
if (!this[kUTF8FastPath] && !this[kWindows1252FastPath]) {
427427
this.#prepareConverter();
428428
}
429429
}
@@ -440,14 +440,14 @@ function makeTextDecoderICU() {
440440
validateDecoder(this);
441441

442442
this[kUTF8FastPath] &&= !(options?.stream);
443-
this[kLatin1FastPath] &&= !(options?.stream);
443+
this[kWindows1252FastPath] &&= !(options?.stream);
444444

445445
if (this[kUTF8FastPath]) {
446446
return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
447447
}
448448

449-
if (this[kLatin1FastPath]) {
450-
return decodeLatin1(input, this[kIgnoreBOM], this[kFatal]);
449+
if (this[kWindows1252FastPath]) {
450+
return decodeWindows1252(input, this[kIgnoreBOM], this[kFatal]);
451451
}
452452

453453
this.#prepareConverter();

src/encoding_binding.cc

Lines changed: 107 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,8 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data,
221221
SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8);
222222
SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII);
223223
SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode);
224-
SetMethodNoSideEffect(isolate, target, "decodeLatin1", DecodeLatin1);
224+
SetMethodNoSideEffect(
225+
isolate, target, "decodeWindows1252", DecodeWindows1252);
225226
}
226227

227228
void BindingData::CreatePerContextProperties(Local<Object> target,
@@ -239,10 +240,10 @@ void BindingData::RegisterTimerExternalReferences(
239240
registry->Register(DecodeUTF8);
240241
registry->Register(ToASCII);
241242
registry->Register(ToUnicode);
242-
registry->Register(DecodeLatin1);
243+
registry->Register(DecodeWindows1252);
243244
}
244245

245-
void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
246+
void BindingData::DecodeWindows1252(const FunctionCallbackInfo<Value>& args) {
246247
Environment* env = Environment::GetCurrent(args);
247248

248249
CHECK_GE(args.Length(), 1);
@@ -255,7 +256,6 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
255256
}
256257

257258
bool ignore_bom = args[1]->IsTrue();
258-
bool has_fatal = args[2]->IsTrue();
259259

260260
ArrayBufferViewContents<uint8_t> buffer(args[0]);
261261
const uint8_t* data = buffer.data();
@@ -270,20 +270,115 @@ void BindingData::DecodeLatin1(const FunctionCallbackInfo<Value>& args) {
270270
return args.GetReturnValue().SetEmptyString();
271271
}
272272

273-
std::string result(length * 2, '\0');
273+
// Windows-1252 specific mapping for bytes 128-159
274+
// Most differ from Latin-1/ISO-8859-1; 0x81, 0x8D, 0x8F, 0x90, 0x9D do not
275+
static const uint16_t windows1252_mapping[32] = {
276+
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
277+
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
278+
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
279+
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
280+
};
281+
282+
std::string result;
283+
result.reserve(length * 3); // Reserve space for UTF-8 output
284+
285+
for (size_t i = 0; i < length; i++) {
286+
uint8_t byte = data[i];
287+
uint32_t codepoint;
288+
289+
// Check if byte is in the special Windows-1252 range (128-159)
290+
if (byte >= 0x80 && byte <= 0x9F) {
291+
codepoint = windows1252_mapping[byte - 0x80];
292+
} else {
293+
// For all other bytes, Windows-1252 is identical to Latin-1
294+
codepoint = byte;
295+
}
274296

275-
size_t written = simdutf::convert_latin1_to_utf8(
276-
reinterpret_cast<const char*>(data), length, result.data());
297+
// Convert codepoint to UTF-8
298+
if (codepoint < 0x80) {
299+
result.push_back(static_cast<char>(codepoint));
300+
} else if (codepoint < 0x800) {
301+
result.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
302+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
303+
} else {
304+
result.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
305+
result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
306+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
307+
}
308+
}
277309

278-
if (has_fatal && written == 0) {
279-
return node::THROW_ERR_ENCODING_INVALID_ENCODED_DATA(
280-
env->isolate(), "The encoded data was not valid for encoding latin1");
310+
Local<Value> ret;
311+
if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) {
312+
args.GetReturnValue().Set(ret);
281313
}
314+
}
282315

283-
std::string_view view(result.c_str(), written);
316+
void BindingData::DecodeWindows1252(const FunctionCallbackInfo<Value>& args) {
317+
Environment* env = Environment::GetCurrent(args);
318+
319+
CHECK_GE(args.Length(), 1);
320+
if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() ||
321+
args[0]->IsArrayBufferView())) {
322+
return node::THROW_ERR_INVALID_ARG_TYPE(
323+
env->isolate(),
324+
"The \"input\" argument must be an instance of ArrayBuffer, "
325+
"SharedArrayBuffer, or ArrayBufferView.");
326+
}
327+
328+
bool ignore_bom = args[1]->IsTrue();
329+
330+
ArrayBufferViewContents<uint8_t> buffer(args[0]);
331+
const uint8_t* data = buffer.data();
332+
size_t length = buffer.length();
333+
334+
if (ignore_bom && length > 0 && data[0] == 0xFF) {
335+
data++;
336+
length--;
337+
}
338+
339+
if (length == 0) {
340+
return args.GetReturnValue().SetEmptyString();
341+
}
342+
343+
// Windows-1252 specific mapping for bytes 128-159
344+
// Most differ from Latin-1/ISO-8859-1; 0x81, 0x8D, 0x8F, 0x90, 0x9D do not
345+
static const uint16_t windows1252_mapping[32] = {
346+
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
347+
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
348+
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
349+
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F
350+
};
351+
352+
std::string result;
353+
result.reserve(length * 3); // Reserve space for UTF-8 output
354+
355+
for (size_t i = 0; i < length; i++) {
356+
uint8_t byte = data[i];
357+
uint32_t codepoint;
358+
359+
// Check if byte is in the special Windows-1252 range (128-159)
360+
if (byte >= 0x80 && byte <= 0x9F) {
361+
codepoint = windows1252_mapping[byte - 0x80];
362+
} else {
363+
// For all other bytes, Windows-1252 is identical to Latin-1
364+
codepoint = byte;
365+
}
366+
367+
// Convert codepoint to UTF-8
368+
if (codepoint < 0x80) {
369+
result.push_back(static_cast<char>(codepoint));
370+
} else if (codepoint < 0x800) {
371+
result.push_back(static_cast<char>(0xC0 | (codepoint >> 6)));
372+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
373+
} else {
374+
result.push_back(static_cast<char>(0xE0 | (codepoint >> 12)));
375+
result.push_back(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
376+
result.push_back(static_cast<char>(0x80 | (codepoint & 0x3F)));
377+
}
378+
}
284379

285380
Local<Value> ret;
286-
if (ToV8Value(env->context(), view, env->isolate()).ToLocal(&ret)) {
381+
if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) {
287382
args.GetReturnValue().Set(ret);
288383
}
289384
}

src/encoding_binding.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ class BindingData : public SnapshotableObject {
3131
static void EncodeInto(const v8::FunctionCallbackInfo<v8::Value>& args);
3232
static void EncodeUtf8String(const v8::FunctionCallbackInfo<v8::Value>& args);
3333
static void DecodeUTF8(const v8::FunctionCallbackInfo<v8::Value>& args);
34-
static void DecodeLatin1(const v8::FunctionCallbackInfo<v8::Value>& args);
34+
static void DecodeWindows1252(
35+
const v8::FunctionCallbackInfo<v8::Value>& args);
3536

3637
static void ToASCII(const v8::FunctionCallbackInfo<v8::Value>& args);
3738
static void ToUnicode(const v8::FunctionCallbackInfo<v8::Value>& args);

test/parallel/test-internal-encoding-binding.js

Lines changed: 26 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,41 +8,46 @@ const assert = require('node:assert');
88
const { internalBinding } = require('internal/test/binding');
99
const binding = internalBinding('encoding_binding');
1010

11+
// Windows-1252 specific tests
1112
{
12-
// Valid input
13-
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
14-
assert.strictEqual(binding.decodeLatin1(buf, false, false), 'Áéó');
13+
// Test Windows-1252 special characters in 128-159 range
14+
// These differ from Latin-1
15+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€');
16+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚');
17+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ');
18+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ');
1519
}
1620

1721
{
18-
// Empty input
19-
const buf = Uint8Array.from([]);
20-
assert.strictEqual(binding.decodeLatin1(buf, false, false), '');
22+
// Test Windows-1252 characters outside 128-159 range (same as Latin-1)
23+
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
24+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó');
2125
}
2226

2327
{
24-
// Invalid input, but Latin1 has no invalid chars and should never throw.
25-
const buf = new TextEncoder().encode('Invalid Latin1 🧑‍🧑‍🧒‍🧒');
26-
assert.strictEqual(
27-
binding.decodeLatin1(buf, false, false),
28-
'Invalid Latin1 ð\x9F§\x91â\x80\x8Dð\x9F§\x91â\x80\x8Dð\x9F§\x92â\x80\x8Dð\x9F§\x92'
29-
);
28+
// Empty input
29+
const buf = Uint8Array.from([]);
30+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), '');
3031
}
3132

33+
// Windows-1252 specific tests
3234
{
33-
// IgnoreBOM with BOM
34-
const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]);
35-
assert.strictEqual(binding.decodeLatin1(buf, true, false), 'þÿÁéó');
35+
// Test Windows-1252 special characters in 128-159 range
36+
// These differ from Latin-1
37+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€');
38+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚');
39+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ');
40+
assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ');
3641
}
3742

3843
{
39-
// Fatal and InvalidInput, but Latin1 has no invalid chars and should never throw.
40-
const buf = Uint8Array.from([0xFF, 0xFF, 0xFF]);
41-
assert.strictEqual(binding.decodeLatin1(buf, false, true), 'ÿÿÿ');
44+
// Test Windows-1252 characters outside 128-159 range (same as Latin-1)
45+
const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]);
46+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó');
4247
}
4348

4449
{
45-
// IgnoreBOM and Fatal, but Latin1 has no invalid chars and should never throw.
46-
const buf = Uint8Array.from([0xFE, 0xFF, 0xC1, 0xE9, 0xF3]);
47-
assert.strictEqual(binding.decodeLatin1(buf, true, true), 'þÿÁéó');
50+
// Empty input
51+
const buf = Uint8Array.from([]);
52+
assert.strictEqual(binding.decodeWindows1252(buf, false, false), '');
4853
}

test/parallel/test-util-text-decoder.js

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,49 @@ test('TextDecoder correctly decodes windows-1252 encoded data', { skip: !common.
1515

1616
assert.strictEqual(decodedString, expectedString);
1717
});
18+
19+
// Test for the difference between Latin1 and Windows-1252 in the 128-159
20+
// range
21+
// Ref: https://github.com/nodejs/node/issues/60888
22+
test('TextDecoder correctly decodes windows-1252 special characters in ' +
23+
'128-159 range', { skip: !common.hasIntl }, () => {
24+
const decoder = new TextDecoder('windows-1252');
25+
26+
// Test specific characters that differ between Latin1 and Windows-1252.
27+
// € Euro sign
28+
assert.strictEqual(decoder.decode(Uint8Array.of(128)).codePointAt(0),
29+
8364);
30+
// ‚ Single low-9 quotation mark
31+
assert.strictEqual(decoder.decode(Uint8Array.of(130)).codePointAt(0),
32+
8218);
33+
// ƒ Latin small letter f with hook
34+
assert.strictEqual(decoder.decode(Uint8Array.of(131)).codePointAt(0),
35+
402);
36+
// Ÿ Latin capital letter Y with diaeresis
37+
assert.strictEqual(decoder.decode(Uint8Array.of(159)).codePointAt(0),
38+
376);
39+
40+
// Test the full range to ensure no character is treated as Latin1
41+
// directly.
42+
const expectedMappings = [
43+
[128, 8364], [129, 129], [130, 8218], [131, 402], [132, 8222],
44+
[133, 8230], [134, 8224], [135, 8225], [136, 710], [137, 8240],
45+
[138, 352], [139, 8249], [140, 338], [141, 141], [142, 381],
46+
[143, 143], [144, 144], [145, 8216], [146, 8217], [147, 8220],
47+
[148, 8221], [149, 8226], [150, 8211], [151, 8212], [152, 732],
48+
[153, 8482], [154, 353], [155, 8250], [156, 339], [157, 157],
49+
[158, 382], [159, 376],
50+
];
51+
52+
for (const [byte, expectedCodePoint] of expectedMappings) {
53+
const result = decoder.decode(Uint8Array.of(byte));
54+
const actualCodePoint = result.codePointAt(0);
55+
assert.strictEqual(
56+
actualCodePoint,
57+
expectedCodePoint,
58+
`Byte 0x${byte.toString(16)} should decode to ` +
59+
`U+${expectedCodePoint.toString(16)} but got ` +
60+
`U+${actualCodePoint.toString(16)}`
61+
);
62+
}
63+
});

typings/internalBinding/encoding_binding.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,5 +4,5 @@ export interface EncodingBinding {
44
decodeUTF8(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
55
toASCII(input: string): string;
66
toUnicode(input: string): string;
7-
decodeLatin1(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
7+
decodeWindows1252(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string;
88
}

0 commit comments

Comments
 (0)