mathiasbynens · antelle · Jun 3, 2015 · Jun 3, 2015 · Jun 3, 2015
diff --git a/.gitignore b/.gitignore
@@ -18,3 +18,5 @@ Thumbs.db
 # Files that might appear on external disks
 .Spotlight-V100
 .Trashes
+
+.idea
diff --git a/tests/test-large.js b/tests/test-large.js
@@ -0,0 +1,25 @@
+'use strict';
+
+// Manual memory smoke test for node.js: round-trips a string built from a 50MB
+// buffer through utf8.encode/utf8.decode and logs the process RSS (in MB) after
+// each step. Run with `node --expose-gc` so mem() can force a collection first.
+
+var utf8 = require('../utf8.js');
+console.log('started. mem: %d', mem());
+var buffer = new Buffer(1024*1024*50);
+var letters = ['a', 'ы', '6', 'ð'];
+for (var i = 0; i < buffer.length; i++) {
+	buffer[i] = letters[i % letters.length].charCodeAt(0);
+}
+var str = buffer.toString('utf16le');
+buffer = null;
+console.log('created string. mem: %d', mem());
+str = utf8.encode(str);
+console.log('encoded string (%d chars). mem: %d', str.length, mem());
+str = utf8.decode(str);
+console.log('converted string (%d chars). mem: %d', str.length, mem());
+
+function mem() {
+	if (global.gc) global.gc();
+	return Math.round(process.memoryUsage().rss/1024/1024);
+}
diff --git a/tests/tests.js b/tests/tests.js
@@ -187,6 +187,18 @@
 			'codePoint': 0x10FFF,
 			'decoded': '\uDBFF\uDFFF',
 			'encoded': '\xF4\x8F\xBF\xBF'
+		},
+
+		// Long strings
+		{
+			'description': 'Long string',
+			'decoded': new Array(2000).join('x') + new Array(1000).join('\uDBFF\uDFFF'),
+			'encoded': new Array(2000).join('x') + new Array(1000).join('\xF4\x8F\xBF\xBF')
+		},
+		{
+			'description': 'Long string another',
+			'decoded': new Array(1025).join('x'),
+			'encoded': new Array(1025).join('x')
 		}
 	];
 
@@ -263,6 +275,13 @@
 			Error,
 			'Error: invalid byte index'
 		);
+		raises(
+			function() {
+				utf8.decode('\xC0\x0F');
+			},
+			Error,
+			'Error: invalid byte index'
+		);
 	});
 
 	/*--------------------------------------------------------------------------*/

diff --git a/utf8.js b/utf8.js
@@ -15,64 +15,42 @@
 		root = freeGlobal;
 	}
 
-	/*--------------------------------------------------------------------------*/
-
 	var stringFromCharCode = String.fromCharCode;
 
-	// Taken from https://mths.be/punycode
-	function ucs2decode(string) {
-		var output = [];
-		var counter = 0;
-		var length = string.length;
-		var value;
-		var extra;
-		while (counter < length) {
-			value = string.charCodeAt(counter++);
-			if (value >= 0xD800 && value <= 0xDBFF && counter < length) {
-				// high surrogate, and there is a next character
-				extra = string.charCodeAt(counter++);
-				if ((extra & 0xFC00) == 0xDC00) { // low surrogate
-					output.push(((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000);
-				} else {
-					// unmatched surrogate; only append this code unit, in case the next
-					// code unit is the high surrogate of a surrogate pair
-					output.push(value);
-					counter--;
-				}
-			} else {
-				output.push(value);
-			}
-		}
-		return output;
+	/**
+	 * UCS-2 decoder stream
+	 * @param {string} source - source string
+	 * @constructor
+	 */
+	function Ucs2StreamDecoder(source) {
+		this.source = source;
+		this.len = source.length;
+		this.pos = 0;
 	}
 
-	// Taken from https://mths.be/punycode
-	function ucs2encode(array) {
-		var length = array.length;
-		var index = -1;
-		var value;
-		var output = '';
-		while (++index < length) {
-			value = array[index];
-			if (value > 0xFFFF) {
-				value -= 0x10000;
-				output += stringFromCharCode(value >>> 10 & 0x3FF | 0xD800);
-				value = 0xDC00 | value & 0x3FF;
-			}
-			output += stringFromCharCode(value);
+	/**
+	 * Get the next code point, combining a valid surrogate pair into a single value
+	 * Taken from https://mths.be/punycode
+	 * @return {number} non-negative code point (or a lone surrogate code unit); NaN once the source is exhausted
+	 */
+	Ucs2StreamDecoder.prototype.next = function () {
+		if (this.pos >= this.len) {
+			return NaN;
 		}
-		return output;
-	}
-
-	function checkScalarValue(codePoint) {
-		if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
-			throw Error(
-				'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
-				' is not a scalar value'
-			);
+		var value = this.source.charCodeAt(this.pos++);
+		if (value >= 0xD800 && value <= 0xDBFF) {
+			// high surrogate; past the end charCodeAt yields NaN, which fails the low-surrogate test
+			var extra = this.source.charCodeAt(this.pos++);
+			if ((extra & 0xFC00) == 0xDC00) { // low surrogate
+				value = ((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000;
+			} else {
+				// unmatched surrogate; only return this code unit and step back, in case
+				// the next code unit is the high surrogate of a surrogate pair
+				this.pos--;
+			}
 		}
-	}
-	/*--------------------------------------------------------------------------*/
+		return value;
+	};
 
 	function createByte(codePoint, shift) {
 		return stringFromCharCode(((codePoint >> shift) & 0x3F) | 0x80);
@@ -100,55 +78,72 @@
 		return symbol;
 	}
 
-	function utf8encode(string) {
-		var codePoints = ucs2decode(string);
-		var length = codePoints.length;
-		var index = -1;
-		var codePoint;
-		var byteString = '';
-		while (++index < length) {
-			codePoint = codePoints[index];
-			byteString += encodeCodePoint(codePoint);
+	function checkScalarValue(codePoint) {
+		if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
+			throw Error(
+				'Lone surrogate U+' + codePoint.toString(16).toUpperCase() +
+				' is not a scalar value'
+			);
 		}
-		return byteString;
 	}
 
-	/*--------------------------------------------------------------------------*/
+	/**
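+	 * Creates a temporary buffer for faster string joining (pieces are joined in batches)
+	 * @param {number} strLen - length of the string about to be converted
+	 * @returns {Array} array preallocated to 128 slots when strLen < 100, otherwise 1024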
+	 */
+	function createTmpBuffer(strLen) {
+		return new Array(strLen < 100 ? 128 : 1024);
+	}
 
-	function readContinuationByte() {
-		if (byteIndex >= byteCount) {
-			throw Error('Invalid byte index');
+	function utf8encode(str) {
+		var decoder = new Ucs2StreamDecoder(str),
+			codePoint,
+			arr = createTmpBuffer(str.length),
+			arrIx = 0, arrLen = arr.length,
+			result = '';
+		while (true) {
+			codePoint = decoder.next();
+			if (isNaN(codePoint)) {
+				break;
+			}
+			arr[arrIx++] = encodeCodePoint(codePoint);
+			if (arrIx === arrLen) {
+				result += arr.join('');
+				arrIx = 0;
+			}
 		}
-
-		var continuationByte = byteArray[byteIndex] & 0xFF;
-		byteIndex++;
-
-		if ((continuationByte & 0xC0) == 0x80) {
-			return continuationByte & 0x3F;
+		if (arrIx > 0) {
+			arr.length = arrIx;
+			result += arr.join('');
 		}
+		return result;
+	}
 
-		// If we end up here, it’s not a continuation byte
-		throw Error('Invalid continuation byte');
+	/**
+	 * codePoint decoder stream
+	 * @param {Ucs2StreamDecoder} source - source stream
+	 * @constructor
+	 */
+	function CodePointStreamDecoder(source) {
+		this.source = source;
 	}
 
-	function decodeSymbol() {
-		var byte1;
-		var byte2;
-		var byte3;
-		var byte4;
+	/**
+	 * Get the next code point decoded from the underlying byte stream
+	 * @return {number} non-negative code point; NaN once the source is exhausted
+	 */
+	CodePointStreamDecoder.prototype.next = function () {
+		var byte1, byte2, byte3, byte4;
 		var codePoint;
 
-		if (byteIndex > byteCount) {
-			throw Error('Invalid byte index');
-		}
-
-		if (byteIndex == byteCount) {
-			return false;
+		byte1 = this.source.next();
+		if (isNaN(byte1)) {
+			return NaN;
 		}
 
 		// Read first byte
-		byte1 = byteArray[byteIndex] & 0xFF;
-		byteIndex++;
+		byte1 = byte1 & 0xFF;
 
 		// 1-byte sequence (no continuation bytes)
 		if ((byte1 & 0x80) == 0) {
@@ -157,7 +152,7 @@
 
 		// 2-byte sequence
 		if ((byte1 & 0xE0) == 0xC0) {
-			var byte2 = readContinuationByte();
+			byte2 = this.readContinuationByte();
 			codePoint = ((byte1 & 0x1F) << 6) | byte2;
 			if (codePoint >= 0x80) {
 				return codePoint;
@@ -168,8 +163,8 @@
 
 		// 3-byte sequence (may include unpaired surrogates)
 		if ((byte1 & 0xF0) == 0xE0) {
-			byte2 = readContinuationByte();
-			byte3 = readContinuationByte();
+			byte2 = this.readContinuationByte();
+			byte3 = this.readContinuationByte();
 			codePoint = ((byte1 & 0x0F) << 12) | (byte2 << 6) | byte3;
 			if (codePoint >= 0x0800) {
 				checkScalarValue(codePoint);
@@ -181,36 +176,63 @@
 
 		// 4-byte sequence
 		if ((byte1 & 0xF8) == 0xF0) {
-			byte2 = readContinuationByte();
-			byte3 = readContinuationByte();
-			byte4 = readContinuationByte();
+			byte2 = this.readContinuationByte();
+			byte3 = this.readContinuationByte();
+			byte4 = this.readContinuationByte();
 			codePoint = ((byte1 & 0x0F) << 0x12) | (byte2 << 0x0C) |
-				(byte3 << 0x06) | byte4;
+			(byte3 << 0x06) | byte4;
 			if (codePoint >= 0x010000 && codePoint <= 0x10FFFF) {
 				return codePoint;
 			}
 		}
 
 		throw Error('Invalid UTF-8 detected');
-	}
-
-	var byteArray;
-	var byteCount;
-	var byteIndex;
-	function utf8decode(byteString) {
-		byteArray = ucs2decode(byteString);
-		byteCount = byteArray.length;
-		byteIndex = 0;
-		var codePoints = [];
-		var tmp;
-		while ((tmp = decodeSymbol()) !== false) {
-			codePoints.push(tmp);
-		}
-		return ucs2encode(codePoints);
-	}
+	};
 
-	/*--------------------------------------------------------------------------*/
+	CodePointStreamDecoder.prototype.readContinuationByte = function () {
+		var ch = this.source.next();
+		if (!isNaN(ch)) {
+			var continuationByte = ch & 0xFF;
+			if ((continuationByte & 0xC0) == 0x80) {
+				return continuationByte & 0x3F;
+			}
+		}
+		// If we end up here, it’s not a continuation byte
+		throw Error('Invalid continuation byte');
+	};
 
+	function utf8decode(str) {
+		var decoder = new CodePointStreamDecoder(new Ucs2StreamDecoder(str)),
+			codePoint,
+			arr = createTmpBuffer(str.length),
+			arrIx = 0, arrLen = arr.length,
+			result = '';
+		while (true) {
+			codePoint = decoder.next();
+			if (isNaN(codePoint)) {
+				break;
+			}
+			if (codePoint > 0xFFFF) {
+				codePoint -= 0x10000;
+				arr[arrIx++] = stringFromCharCode(codePoint >>> 10 & 0x3FF | 0xD800);
+				codePoint = 0xDC00 | codePoint & 0x3FF;
+				if (arrIx === arrLen) {
+					result += arr.join('');
+					arrIx = 0;
+				}
+			}
+			arr[arrIx++] = stringFromCharCode(codePoint);
+			if (arrIx === arrLen) {
+				result += arr.join('');
+				arrIx = 0;
+			}
+		}
+		if (arrIx > 0) {
+			arr.length = arrIx;
+			result += arr.join('');
+		}
+		return result;
+	}
 	var utf8 = {
 		'version': '2.0.0',
 		'encode': utf8encode,