Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 25 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,22 +60,37 @@ require(

## API

### `utf8.encode(string)`
### `utf8.encode(string, opts)`

Encodes any given JavaScript string (`string`) as UTF-8, and returns the UTF-8-encoded version of the string. It throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
Encodes any given JavaScript string (`string`) as UTF-8 and returns the UTF-8-encoded version of the string. The `opts` object is optional. By default, it throws an error if the input string contains a non-scalar value, i.e. a lone surrogate. (If you need to be able to encode non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)

Available options:

* `strict`: whether encountering a lone surrogate should throw an error (defaults to `true`). When set to `false`, each lone surrogate is replaced by the character U+FFFD instead of throwing.

```js
// U+00A9 COPYRIGHT SIGN; see http://codepoints.net/U+00A9
utf8.encode('\xA9');
// → '\xC2\xA9'

// U+10001 LINEAR B SYLLABLE B038 E; see http://codepoints.net/U+10001
utf8.encode('\uD800\uDC01');
// → '\xF0\x90\x80\x81'

utf8.encode('\uDC00');
// → throws 'Lone surrogate U+DC00 is not a scalar value' error

utf8.encode('\uDC00', { strict: false });
// → '\xEF\xBF\xBD'
```

### `utf8.decode(byteString)`
### `utf8.decode(byteString, opts)`

Decodes any given UTF-8-encoded string (`byteString`) as UTF-8, and returns the UTF-8-decoded version of the string. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)
Decodes any given UTF-8-encoded string (`byteString`) as UTF-8 and returns the UTF-8-decoded version of the string. The `opts` object is optional. It throws an error when malformed UTF-8 is detected. (If you need to be able to decode encoded non-scalar values as well, use [WTF-8](https://mths.be/wtf8) instead.)

Available options:

* `strict`: whether encountering a non-scalar value should throw an error (defaults to `true`). When set to `false`, each encoded non-scalar value (i.e. a lone surrogate) is decoded as U+FFFD instead of throwing.

```js
utf8.decode('\xC2\xA9');
Expand All @@ -84,6 +99,12 @@ utf8.decode('\xC2\xA9');
utf8.decode('\xF0\x90\x80\x81');
// → '\uD800\uDC01'
// → U+10001 LINEAR B SYLLABLE B038 E

utf8.decode('\xED\xB0\x80');
// → throws 'Lone surrogate U+DC00 is not a scalar value' error

utf8.decode('\xED\xB0\x80', { strict: false });
// → '\uFFFD'
```

### `utf8.version`
Expand Down
41 changes: 40 additions & 1 deletion tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,11 @@
'decoded': '\u2C3C',
'encoded': '\xE2\xB0\xBC'
},
{
'codePoint': 0xFFFD,
'decoded': '\uFFFD',
'encoded': '\xEF\xBF\xBD',
},
{
'codePoint': 0xFFFF,
'decoded': '\uFFFF',
Expand All @@ -101,74 +106,98 @@
{
'codePoint': 0xD800,
'decoded': '\uD800',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xA0\x80',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'description': 'High surrogate followed by another high surrogate',
'decoded': '\uD800\uD800',
'decodedNonStrict': '\uFFFD\uFFFD',
'encoded': '\xED\xA0\x80\xED\xA0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD',
'error': true
},
{
'description': 'High surrogate followed by a symbol that is not a surrogate',
'decoded': '\uD800A',
'decodedNonStrict': '\uFFFDA',
'encoded': '\xED\xA0\x80A',
'encodedNonStrict': '\xEF\xBF\xBDA',
'error': true
},
{
'description': 'Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate',
'decoded': '\uD800\uD834\uDF06\uD800',
'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD',
'encoded': '\xED\xA0\x80\xF0\x9D\x8C\x86\xED\xA0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xD9AF,
'decoded': '\uD9AF',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xA6\xAF',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xDBFF,
'decoded': '\uDBFF',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xAF\xBF',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
// low surrogates: 0xDC00 to 0xDFFF
{
'codePoint': 0xDC00,
'decoded': '\uDC00',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xB0\x80',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'description': 'Low surrogate followed by another low surrogate',
'decoded': '\uDC00\uDC00',
'decodedNonStrict': '\uFFFD\uFFFD',
'encoded': '\xED\xB0\x80\xED\xB0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xEF\xBF\xBD',
'error': true
},
{
'description': 'Low surrogate followed by a symbol that is not a surrogate',
'decoded': '\uDC00A',
'decodedNonStrict': '\uFFFDA',
'encoded': '\xED\xB0\x80A',
'encodedNonStrict': '\xEF\xBF\xBDA',
'error': true
},
{
'description': 'Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate',
'decoded': '\uDC00\uD834\uDF06\uDC00',
'decodedNonStrict': '\uFFFD\uD834\uDF06\uFFFD',
'encoded': '\xED\xB0\x80\xF0\x9D\x8C\x86\xED\xB0\x80',
'encodedNonStrict': '\xEF\xBF\xBD\xF0\x9D\x8C\x86\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xDEEE,
'decoded': '\uDEEE',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xBB\xAE',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},
{
'codePoint': 0xDFFF,
'decoded': '\uDFFF',
'decodedNonStrict': '\uFFFD',
'encoded': '\xED\xBF\xBF',
'encodedNonStrict': '\xEF\xBF\xBD',
'error': true
},

Expand Down Expand Up @@ -204,7 +233,7 @@
test('encode/decode', function() {
forEach(data, function(object) {
var description = object.description || 'U+' + object.codePoint.toString(16).toUpperCase();
;

if (object.error) {
raises(
function() {
Expand All @@ -220,6 +249,16 @@
Error,
'Error: non-scalar value detected'
);
equal(
object.encodedNonStrict,
utf8.encode(object.decoded, { strict: false }),
'Encoding (non-strict): ' + description
);
equal(
object.decodedNonStrict,
utf8.decode(object.encoded, { strict: false }),
'Decoding (non-strict): ' + description
);
} else {
equal(
object.encoded,
Expand Down
39 changes: 25 additions & 14 deletions utf8.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 +64,25 @@
return output;
}

function checkScalarValue(codePoint) {
function checkScalarValue(codePoint, strict) {
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
throw Error(
'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
' is not a scalar value'
);
if (strict) {
throw Error(
'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
' is not a scalar value'
);
}
return false;
}
return true;
}
/*--------------------------------------------------------------------------*/

/**
 * Builds one byte of the form 10xxxxxx from a six-bit slice of a code point.
 *
 * The code point is shifted right by `shift` bits, the low six bits of the
 * result are kept, and the high bit (0x80) is set.
 *
 * @param {number} codePoint - The code point to take the six-bit slice from.
 * @param {number} shift - How many bits to shift right before masking.
 * @returns {string} Whatever `stringFromCharCode` returns for the byte value
 *   (presumably a one-character string; the helper is defined elsewhere).
 */
function createByte(codePoint, shift) {
	// Isolate the six payload bits selected by `shift`.
	var payload = (codePoint >> shift) & 0x3F;
	// Prefix them with the `10` marker bits.
	var byteValue = payload | 0x80;
	return stringFromCharCode(byteValue);
}

function encodeCodePoint(codePoint) {
function encodeCodePoint(codePoint, strict) {
if ((codePoint & 0xFFFFFF80) == 0) { // 1-byte sequence
return stringFromCharCode(codePoint);
}
Expand All @@ -87,7 +91,9 @@
symbol = stringFromCharCode(((codePoint >> 6) & 0x1F) | 0xC0);
}
else if ((codePoint & 0xFFFF0000) == 0) { // 3-byte sequence
checkScalarValue(codePoint);
if (!checkScalarValue(codePoint, strict)) {
codePoint = 0xFFFD;
}
symbol = stringFromCharCode(((codePoint >> 12) & 0x0F) | 0xE0);
symbol += createByte(codePoint, 6);
}
Expand All @@ -100,15 +106,18 @@
return symbol;
}

function utf8encode(string) {
function utf8encode(string, opts) {
opts = opts || {};
var strict = false !== opts.strict;

var codePoints = ucs2decode(string);
var length = codePoints.length;
var index = -1;
var codePoint;
var byteString = '';
while (++index < length) {
codePoint = codePoints[index];
byteString += encodeCodePoint(codePoint);
byteString += encodeCodePoint(codePoint, strict);
}
return byteString;
}
Expand All @@ -131,7 +140,7 @@
throw Error('Invalid continuation byte');
}

function decodeSymbol() {
function decodeSymbol(strict) {
var byte1;
var byte2;
var byte3;
Expand Down Expand Up @@ -172,8 +181,7 @@
byte3 = readContinuationByte();
codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
if (codePoint >= 0x0800) {
checkScalarValue(codePoint);
return codePoint;
return checkScalarValue(codePoint, strict) ? codePoint : 0xFFFD;
} else {
throw Error('Invalid continuation byte');
}
Expand All @@ -197,13 +205,16 @@
var byteArray;
var byteCount;
var byteIndex;
function utf8decode(byteString) {
function utf8decode(byteString, opts) {
opts = opts || {};
var strict = false !== opts.strict;

byteArray = ucs2decode(byteString);
byteCount = byteArray.length;
byteIndex = 0;
var codePoints = [];
var tmp;
while ((tmp = decodeSymbol()) !== false) {
while ((tmp = decodeSymbol(strict)) !== false) {
codePoints.push(tmp);
}
return ucs2encode(codePoints);
Expand Down