Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ Thumbs.db
# Files that might appear on external disks
.Spotlight-V100
.Trashes

.idea
25 changes: 25 additions & 0 deletions tests/test-large.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
'use strict';

// Manual stress test for Node.js: pushes a string built from 50 MB of raw
// bytes through utf8.encode() and utf8.decode(), logging the process RSS
// (in megabytes, as reported by mem()) after every stage.

var utf8 = require('../utf8.js');
console.log('started. mem: %d', mem());
// Buffer.alloc() replaces the deprecated `new Buffer(size)` constructor, which
// emits a deprecation warning on current Node.js releases. Every byte is
// overwritten by the loop below, so the zero-fill does not change the data.
var buffer = Buffer.alloc(1024 * 1024 * 50);
var letters = ['a', 'ы', '6', 'ð'];
for (var i = 0; i < buffer.length; i++) {
	// Each slot holds a single byte, so only the low 8 bits of the char code
	// are kept (e.g. 'ы' is stored truncated).
	buffer[i] = letters[i % letters.length].charCodeAt(0);
}
// Reinterpret the bytes as UTF-16LE: two bytes become one code unit.
var str = buffer.toString('utf16le');
// Drop the only reference so the buffer can be collected before measuring.
buffer = null;
console.log('created string. mem: %d', mem());
str = utf8.encode(str);
console.log('encoded string (%d chars). mem: %d', str.length, mem());
str = utf8.decode(str);
console.log('converted string (%d chars). mem: %d', str.length, mem());

/**
 * Report the resident set size of the current process in whole megabytes.
 * When a `gc` function is present on `global`, it is invoked first so that
 * the reading is taken after a collection.
 * @returns {number} RSS divided by 1024 * 1024, rounded to the nearest integer
 */
function mem() {
	var BYTES_PER_MB = 1024 * 1024;
	var rssBytes;
	if (global.gc) {
		global.gc();
	}
	rssBytes = process.memoryUsage().rss;
	return Math.round(rssBytes / BYTES_PER_MB);
}
19 changes: 19 additions & 0 deletions tests/tests.js
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,18 @@
'codePoint': 0x10FFF,
'decoded': '\uDBFF\uDFFF',
'encoded': '\xF4\x8F\xBF\xBF'
},

// Long strings
{
'description': 'Long string',
'decoded': new Array(2000).join('x') + new Array(1000).join('\uDBFF\uDFFF'),
'encoded': new Array(2000).join('x') + new Array(1000).join('\xF4\x8F\xBF\xBF')
},
{
'description': 'Long string another',
'decoded': new Array(1025).join('x'),
'encoded': new Array(1025).join('x')
}
];

Expand Down Expand Up @@ -263,6 +275,13 @@
Error,
'Error: invalid byte index'
);
raises(
function() {
utf8.decode('\xC0\x0F');
},
Error,
'Error: invalid byte index'
);
});

/*--------------------------------------------------------------------------*/
Expand Down
244 changes: 133 additions & 111 deletions utf8.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,64 +15,42 @@
root = freeGlobal;
}

/*--------------------------------------------------------------------------*/

var stringFromCharCode = String.fromCharCode;

// Taken from https://mths.be/punycode
function ucs2decode(string) {
var output = [];
var counter = 0;
var length = string.length;
var value;
var extra;
while (counter < length) {
value = string.charCodeAt(counter++);
if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
// high surrogate, and there is a next character
extra = string.charCodeAt(counter++);
if ((extra & 0xFC00) == 0xDC00) { // low surrogate
output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
} else {
// unmatched surrogate; only append this code unit, in case the next
// code unit is the high surrogate of a surrogate pair
output.push(value);
counter--;
}
} else {
output.push(value);
}
}
return output;
/**
 * Sequential reader over the UTF-16 code units of a string.
 *
 * Instances carry three fields:
 *  - `source`: the string being read,
 *  - `len`: its length in code units, captured once at construction,
 *  - `pos`: the index of the next code unit to read, starting at 0.
 *
 * @param {string} source - string to read from
 * @constructor
 */
function Ucs2StreamDecoder(source) {
	var unitCount = source.length;
	this.source = source;
	this.len = unitCount;
	this.pos = 0;
}

// Taken from https://mths.be/punycode
function ucs2encode(array) {
var length = array.length;
var index = -1;
var value;
var output = '';
while (++index < length) {
value = array[index];
if (value > 0xFFFF) {
value -= 0x10000;
output += stringFromCharCode(value >>> 10 & 0x3FF | 0xD800);
value = 0xDC00 | value & 0x3FF;
}
output += stringFromCharCode(value);
/**
* Get next UCS2 char code
* Taken from https://mths.be/punycode
* @return {number} positive value meaning char code; NaN, if there's no symbol
*/
Ucs2StreamDecoder.prototype.next = function () {
if (this.pos >= this.len) {
return NaN;
}
return output;
}

function checkScalarValue(codePoint) {
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
throw Error(
'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
' is not a scalar value'
);
var value = this.source.charCodeAt(this.pos++);
if (value >= 0xD800 && value <= 0xDBFF) {
// high surrogate, and there is a next character
var extra = this.source.charCodeAt(this.pos++);
if ((extra & 0xFC00) == 0xDC00) { // low surrogate
value = ((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000;
} else {
// unmatched surrogate; only append this code unit, in case the next
// code unit is the high surrogate of a surrogate pair
this.pos--;
}
}
}
/*--------------------------------------------------------------------------*/
return value;
};

function createByte(codePoint, shift) {
return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80);
Expand Down Expand Up @@ -100,55 +78,72 @@
return symbol;
}

function utf8encode(string) {
var codePoints = ucs2decode(string);
var length = codePoints.length;
var index = -1;
var codePoint;
var byteString = '';
while (++index < length) {
codePoint = codePoints[index];
byteString += encodeCodePoint(codePoint);
function checkScalarValue(codePoint) {
if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
throw Error(
'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
' is not a scalar value'
);
}
return byteString;
}

/*--------------------------------------------------------------------------*/
/**
 * Allocate the scratch array used to collect string pieces before joining.
 * The encode/decode routines in this file fill it slot by slot and flush it
 * with `join('')` each time it becomes full.
 * @param {number} strLen - length of the string about to be converted
 * @returns {Array} empty array of length 128 when `strLen` is below 100,
 *   otherwise of length 1024
 */
function createTmpBuffer(strLen) {
	var slots = 1024;
	if (strLen < 100) {
		slots = 128;
	}
	return new Array(slots);
}

function readContinuationByte() {
if (byteIndex >= byteCount) {
throw Error('Invalid byte index');
function utf8encode(str) {
var decoder = new Ucs2StreamDecoder(str),
codePoint,
arr = createTmpBuffer(str.length),
arrIx = 0, arrLen = arr.length,
result = '';
while (true) {
codePoint = decoder.next();
if (isNaN(codePoint)) {
break;
}
arr[arrIx++] = encodeCodePoint(codePoint);
if (arrIx === arrLen) {
result += arr.join('');
arrIx = 0;
}
}

var continuationByte = byteArray[byteIndex] & 0xFF;
byteIndex++;

if ((continuationByte & 0xC0) == 0x80) {
return continuationByte & 0x3F;
if (arrIx > 0) {
arr.length = arrIx;
result += arr.join('');
}
return result;
}

// If we end up here, it’s not a continuation byte
throw Error('Invalid continuation byte');
/**
 * Code point reader layered on top of another stream.
 *
 * The constructor only stores the underlying stream; the prototype methods
 * pull values from it by calling `source.next()`.
 *
 * @param {Ucs2StreamDecoder} source - stream object exposing a `next()` method
 * @constructor
 */
function CodePointStreamDecoder(source) {
	this.source = source;
}

function decodeSymbol() {
var byte1;
var byte2;
var byte3;
var byte4;
/**
* Get next char
* @return {number} positive value meaning char code; NaN, if there's no symbol
*/
CodePointStreamDecoder.prototype.next = function () {
var byte1, byte2, byte3, byte4;
var codePoint;

if (byteIndex > byteCount) {
throw Error('Invalid byte index');
}

if (byteIndex == byteCount) {
return false;
byte1 = this.source.next();
if (isNaN(byte1)) {
return NaN;
}

// Read first byte
byte1 = byteArray[byteIndex] & 0xFF;
byteIndex++;
byte1 = byte1 & 0xFF;

// 1-byte sequence (no continuation bytes)
if ((byte1 & 0x80) == 0) {
Expand All @@ -157,7 +152,7 @@

// 2-byte sequence
if ((byte1 & 0xE0) == 0xC0) {
var byte2 = readContinuationByte();
byte2 = this.readContinuationByte();
codePoint = ((byte1 & 0x1F) << 6) | byte2;
if (codePoint >= 0x80) {
return codePoint;
Expand All @@ -168,8 +163,8 @@

// 3-byte sequence (may include unpaired surrogates)
if ((byte1 & 0xF0) == 0xE0) {
byte2 = readContinuationByte();
byte3 = readContinuationByte();
byte2 = this.readContinuationByte();
byte3 = this.readContinuationByte();
codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
if (codePoint >= 0x0800) {
checkScalarValue(codePoint);
Expand All @@ -181,36 +176,63 @@

// 4-byte sequence
if ((byte1 & 0xF8) == 0xF0) {
byte2 = readContinuationByte();
byte3 = readContinuationByte();
byte4 = readContinuationByte();
byte2 = this.readContinuationByte();
byte3 = this.readContinuationByte();
byte4 = this.readContinuationByte();
codePoint = ((byte1 & 0x0F) << 0x12) | (byte2 << 0x0C) |
(byte3 << 0x06) | byte4;
(byte3 << 0x06) | byte4;
if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
return codePoint;
}
}

throw Error('Invalid UTF-8 detected');
}

var byteArray;
var byteCount;
var byteIndex;
function utf8decode(byteString) {
byteArray = ucs2decode(byteString);
byteCount = byteArray.length;
byteIndex = 0;
var codePoints = [];
var tmp;
while ((tmp = decodeSymbol()) !== false) {
codePoints.push(tmp);
}
return ucs2encode(codePoints);
}
};

/*--------------------------------------------------------------------------*/
/**
 * Read one value from the underlying stream and interpret it as a UTF-8
 * continuation byte (bit pattern 10xxxxxx). Only the low 8 bits of the value
 * are examined.
 * @return {number} the low 6 bits of the byte
 * @throws {Error} 'Invalid continuation byte' when the stream yields NaN or
 *   the byte does not match the continuation pattern
 */
CodePointStreamDecoder.prototype.readContinuationByte = function () {
	var unit = this.source.next();
	// Reject both "no value available" (NaN) and bytes whose top two bits are not 10.
	if (isNaN(unit) || ((unit & 0xFF) & 0xC0) != 0x80) {
		throw Error('Invalid continuation byte');
	}
	return (unit & 0xFF) & 0x3F;
};

/**
 * Decode a UTF-8 byte string into a JavaScript string.
 *
 * Code points are pulled one at a time from a CodePointStreamDecoder wrapped
 * around `str`. The resulting UTF-16 code units are collected in a
 * fixed-size scratch array that is joined onto the result every time it
 * fills up, and once more at the end for the leftover part.
 *
 * @param {string} str - byte string to decode
 * @returns {string} decoded string
 */
function utf8decode(str) {
	var codePoints = new CodePointStreamDecoder(new Ucs2StreamDecoder(str));
	var chunk = createTmpBuffer(str.length);
	var capacity = chunk.length;
	var filled = 0;
	var output = '';
	var value;

	// next() signals the end of input with NaN.
	for (value = codePoints.next(); !isNaN(value); value = codePoints.next()) {
		if (value > 0xFFFF) {
			// Code point above the BMP: store the high surrogate now and
			// leave the low surrogate in `value` for the common path below.
			value -= 0x10000;
			chunk[filled++] = stringFromCharCode(value >>> 10 & 0x3FF | 0xD800);
			value = 0xDC00 | value & 0x3FF;
			if (filled === capacity) {
				output += chunk.join('');
				filled = 0;
			}
		}
		chunk[filled++] = stringFromCharCode(value);
		if (filled === capacity) {
			output += chunk.join('');
			filled = 0;
		}
	}

	if (filled > 0) {
		// Truncate so stale entries from an earlier full pass are not joined.
		chunk.length = filled;
		output += chunk.join('');
	}
	return output;
}
var utf8 = {
'version': '2.0.0',
'encode': utf8encode,
Expand Down