From 4c5ac3df760c2cfe9b244374582ab329b8f90fd3 Mon Sep 17 00:00:00 2001
From: Chris Williams <chris.a.williams@gmail.com>
Date: Mon, 29 Jul 2019 10:54:07 -0400
Subject: [PATCH] feat(node): add string_decoder module

Fixes TIMOB-27286
---
 .../ti.internal/extensions/node/index.js      |   2 +
 .../extensions/node/string_decoder.js         | 345 ++++++++++++++++++
 tests/Resources/string_decoder.addontest.js   | 172 +++++++++
 3 files changed, 519 insertions(+)
 create mode 100644 common/Resources/ti.internal/extensions/node/string_decoder.js
 create mode 100644 tests/Resources/string_decoder.addontest.js

diff --git a/common/Resources/ti.internal/extensions/node/index.js b/common/Resources/ti.internal/extensions/node/index.js
index 168f107afae..b434d485935 100644
--- a/common/Resources/ti.internal/extensions/node/index.js
+++ b/common/Resources/ti.internal/extensions/node/index.js
@@ -7,6 +7,7 @@ import util from './util';
 import assert from './assert';
 import events from './events';
 import BufferModule from './buffer';
+import StringDecoder from './string_decoder';
 
 // hook our implementations to get loaded by require
 import { register } from '../binding';
@@ -17,6 +18,7 @@ register('util', util);
 register('assert', assert);
 register('events', events);
 register('buffer', BufferModule);
+register('string_decoder', StringDecoder);
 
 // Register require('buffer').Buffer as global
 global.Buffer = BufferModule.Buffer;
diff --git a/common/Resources/ti.internal/extensions/node/string_decoder.js b/common/Resources/ti.internal/extensions/node/string_decoder.js
new file mode 100644
index 00000000000..04d035f7cda
--- /dev/null
+++ b/common/Resources/ti.internal/extensions/node/string_decoder.js
@@ -0,0 +1,345 @@
+/**
+ * @param {string} [encoding='utf8'] Character encoding to decode (matched case-insensitively); utf8, utf16le/ucs2 and base64 get buffering decoders, any other value is handed to Buffer#toString() as-is.
+ */
+function StringDecoder(encoding = 'utf8') {
+	this.encoding = encoding.toLowerCase(); // lower-cased so the alias matching below is case-insensitive
+	switch (this.encoding) {
+		case 'utf8':
+		case 'utf-8':
+			this._impl = new Utf8StringDecoder();
+			break;
+		case 'ucs2':
+		case 'ucs-2':
+		case 'utf16-le': case 'utf-16le': // 'utf-16le' is the spelling Node accepts; 'utf16-le' is kept for backward compatibility
+		case 'utf16le':
+			this._impl = new Utf16StringDecoder();
+			break;
+		case 'base64':
+			this._impl = new Base64StringDecoder();
+			break;
+		default: // everything else (ascii/hex/binary/latin1, ...) is decoded chunk by chunk without buffering
+			this._impl = new StringDecoderImpl(this.encoding);
+			break;
+	}
+}
+
+/**
+ * Returns any remaining input stored in the internal buffer as a string.
+ * An incomplete UTF-8 character is replaced with '\ufffd'; leftover UTF-16 or base64 bytes are
+ * handed to Buffer#toString() as-is, and the single-byte implementation never buffers anything.
+ *
+ * If a non-empty buffer argument is provided, one final write() on the underlying implementation is performed before returning the remaining input.
+ * @param {Buffer} [buffer] containing the bytes to decode.
+ * @returns {string} the decoded final chunk (if any) followed by whatever was still buffered
+ */
+StringDecoder.prototype.end = function end(buffer) {
+	return this._impl.end(buffer); // delegate to the encoding-specific implementation picked by the constructor
+};
+
+/**
+ * Returns a decoded string, ensuring that any incomplete multibyte characters at the end of the Buffer, or
+ * TypedArray, or DataView are omitted from the returned string and stored in an internal buffer for the
+ * next call to stringDecoder.write() or stringDecoder.end(). A string argument is returned unchanged.
+ * @param {Buffer|TypedArray|DataView} buffer containing the bytes to decode. NOTE(review): the implementations call buffer.copy()/buffer.toString(encoding), so only Buffer looks fully supported - confirm.
+ * @returns {string} the characters that could be completely decoded so far (possibly '')
+ */
+StringDecoder.prototype.write = function write(buffer) {
+	if (typeof buffer === 'string') { // already text: returned untouched, any buffered bytes stay buffered
+		return buffer;
+	}
+	// empty string for empty buffer
+	if (buffer.length === 0) {
+		return '';
+	}
+	return this._impl.write(buffer); // delegate to the encoding-specific implementation picked by the constructor
+};
+
+/**
+ * This is the base class. We override parts of it for certain encodings. For ascii/hex/binary/latin1 the impl is super-easy: every chunk is decoded on its own and nothing is buffered.
+ */
+class StringDecoderImpl {
+	constructor(encoding = 'utf8') {
+		this.encoding = encoding; // passed verbatim to Buffer#toString()
+		this.byteCount = 0; // bytes of an unfinished character currently buffered (only changed by multi-byte subclasses)
+		this.charLength = 1; // total bytes the unfinished character needs (only changed by multi-byte subclasses)
+	}
+
+	// end(): decodes one optional final chunk via write(); returns '' when there is no chunk or it is empty
+	end(buffer) {
+		if (buffer && buffer.length !== 0) {
+			return this.write(buffer); // dispatches to the subclass write() when overridden
+		}
+		return '';
+	}
+
+	write(buffer) {
+		if (buffer && buffer.length !== 0) {
+			return buffer.toString(this.encoding); // single byte character encodings are a cinch
+		}
+		return ''; // no buffer, or empty
+	}
+}
+
+// Shared logic for multi-byte encodings: buffers a trailing incomplete character between write() calls.
+class MultiByteStringDecoderImpl extends StringDecoderImpl {
+	constructor(encoding, bytesPerChar) {
+		super(encoding);
+		this.incomplete = Buffer.allocUnsafe(bytesPerChar); // temporary incomplete character buffer; only the first byteCount bytes are meaningful
+	}
+
+	/**
+	 * @typedef {Object} IncompleteCharObject
+	 * @property {integer} bytesNeeded bytes missing to complete the character (0 means nothing is incomplete)
+	 * @property {integer} charLength total bytes the complete character occupies
+	 * @property {integer} index location in the buffer where the character starts
+	 */
+
+	/**
+	 * Given a Buffer, sees if we have an incomplete "character" at the end of it.
+	 * Returns info on that:
+	 * - bytesNeeded: 0-3, number of bytes still remaining
+	 * - charLength: expected number of bytes for the incomplete character
+	 * - index: index in the buffer where the incomplete character begins
+	 * @param {Buffer} _buffer Buffer we are checking to see if it has an incomplete "character" at the end
+	 * @returns {IncompleteCharObject}
+	 */
+	_checkIncompleteBytes(_buffer) {
+		throw new Error('subclasses must override!');
+	}
+
+	_incompleteEnd() { // called by end() to turn the still-buffered bytes into output text
+		throw new Error('subclasses must override!');
+	}
+
+	_incompleteBufferEmptied() {
+		// typically we reset byte count back to 0 and character length to 1
+		this.byteCount = 0;
+		this.charLength = 1;
+	}
+
+	end(buffer) {
+		let result = super.end(buffer); // decodes the optional final chunk through this class's write()
+		if (this.byteCount !== 0) {
+			// we have incomplete characters!
+			result += this._incompleteEnd();
+		}
+		this._incompleteBufferEmptied(); // reset our internals to "wipe" the incomplete buffer
+		return result;
+	}
+
+	write(buffer) {
+		// first let's see if we had some multi-byte character we didn't finish...
+		let char = '';
+		if (this.byteCount !== 0) {
+			// we still needed some bytes to finish the character
+			// How many bytes do we still need? charLength - bytes we received
+			const left = this.charLength - this.byteCount; // need 4, have 1? then we have 3 "left"
+
+			const bytesCopied = Math.min(left, buffer.length); // copy up to that many bytes
+			// copy bytes from `buffer` to our incomplete buffer
+			buffer.copy(this.incomplete, this.byteCount, 0, bytesCopied);
+			this.byteCount += bytesCopied; // record how many more bytes we copied...
+
+			if (bytesCopied < left) { // still need more bytes to complete!
+				return '';
+			}
+
+			// we were able to complete, yay!
+			// grab the character we completed
+			char = this.incomplete.slice(0, this.charLength).toString(this.encoding);
+			// reset our counters
+			this._incompleteBufferEmptied();
+			// do we have any bytes left in this buffer?
+			if (bytesCopied === buffer.length) {
+				return char; // if not, return the character we finished!
+			}
+			// we still have more bytes, so slice the buffer up
+			buffer = buffer.slice(bytesCopied, buffer.length);
+		}
+
+		// check this buffer to see if it indicates we need more bytes?
+		const incompleteCharData = this._checkIncompleteBytes(buffer);
+		if (incompleteCharData.bytesNeeded === 0) {
+			return char + buffer.toString(this.encoding); // no incomplete bytes, return any character we completed plus the buffer
+		}
+
+		// ok so the buffer holds an incomplete character at its end
+		this.charLength = incompleteCharData.charLength; // record how many bytes we need for the 'character'
+		const incompleteCharIndex = incompleteCharData.index; // this is the index of the multibyte character that is incomplete
+
+		// copy from index of incomplete character to end of buffer
+		const bytesToCopy = buffer.length - incompleteCharIndex;
+		buffer.copy(this.incomplete, 0, incompleteCharIndex, buffer.length);
+		this.byteCount = bytesToCopy; // record how many bytes we actually copied
+
+		if (bytesToCopy < buffer.length) { // buffer had bytes before the incomplete character
+			// so smush any character we may have completed with any complete characters in the buffer
+			return char + buffer.toString(this.encoding, 0, incompleteCharIndex);
+		}
+		return char; // any now-completed character that was previously incomplete, possibly empty
+	}
+}
+
+class Utf8StringDecoder extends MultiByteStringDecoderImpl {
+	constructor() {
+		super('utf8', 4); // a UTF-8 character is at most 4 bytes long
+	}
+
+	_checkIncompleteBytes(buffer) {
+		const length = buffer.length;
+		// FIXME: In Node, they check the last character first!
+		// And they rely on Buffer#toString() to handle injecting the '\ufffd' character for busted multi-byte sequences!
+		// iOS apparently just returns undefined in that special case and
+		// Android differs here because we don't work backwards from the last char
+		// Can we cheat here and...
+		// see https://github.com/nodejs/string_decoder/blob/master/lib/string_decoder.js#L173-L198
+		// - if we see a multi-byte character start, validate the next characters are continuation chars
+		// - if they're not replace the sequence with '\ufffd', treat like that multi-byte character was "completed"
+
+		// Note that even if we do hack this, if there's some invalid multi-byte UTF-8 in the buffer that isn't at the last 3 bytes
+		// then we're at the mercy of the JS engine/platform code for handling that
+		// Here's someone's hack there: https://gist.github.com/oleganza/997155
+
+		// if buffer.length >= 3, check 3rd to last byte
+		if (length >= 3) {
+			let charLength = checkCharLengthForUTF8(buffer[length - 3]);
+			if (charLength === 4) { // lead byte of a 4-byte character with only 3 bytes present; the 2 following bytes are not validated
+				return {
+					bytesNeeded: 1, // we have 3 last bytes, need 4th
+					index: length - 3,
+					charLength: 4
+				};
+			}
+		}
+		// if buffer.length >= 2, check 2nd to last byte
+		if (length >= 2) {
+			let charLength = checkCharLengthForUTF8(buffer[length - 2]);
+			if (charLength >= 3) { // lead byte of a 3- or 4-byte character with only 2 bytes present
+				return {
+					bytesNeeded: charLength - 2, // we have 2 bytes of whatever we need
+					index: length - 2,
+					charLength
+				};
+			}
+		}
+		// if buffer.length >= 1, check last byte
+		if (length >= 1) {
+			let charLength = checkCharLengthForUTF8(buffer[length - 1]);
+			if (charLength >= 2) { // the buffer ends on the lead byte of a multi-byte character
+				return {
+					bytesNeeded: charLength - 1, // we have 1 byte of whatever we need
+					index: length - 1,
+					charLength
+				};
+			}
+		}
+		// base case, no bytes needed - ends on complete character
+		return {
+			bytesNeeded: 0,
+			index: length - 1, // not read by write() when bytesNeeded is 0
+			charLength: 1
+		};
+	}
+
+	_incompleteEnd() {
+		return '\ufffd'; // a single U+FFFD replacement character stands in for the unfinished character
+	}
+}
+
+class Utf16StringDecoder extends MultiByteStringDecoderImpl {
+	constructor() {
+		super('utf16le', 4); // up to 4 bytes are buffered: a surrogate pair is two 2-byte code units
+	}
+
+	_checkIncompleteBytes(buffer) {
+		const length = buffer.length;
+		const modulo = length % 2;
+		// ok, we have a multiple of 2 bytes
+		if (modulo === 0) {
+			// little-endian: the last byte is the high byte of the final code unit; 0xD8-0xDB marks a leading/high surrogate
+			const byte = buffer[buffer.length - 1];
+			if (byte >= 0xD8 && byte <= 0xDB) {
+				return {
+					bytesNeeded: 2, // the trailing/low surrogate code unit is still missing
+					charLength: 4,
+					index: length - 2
+				};
+			}
+
+			// we're good, not a surrogate, so we have our needed 2 bytes
+			return {
+				bytesNeeded: 0, // no index: write() does not read it when nothing is incomplete
+				charLength: 2
+			};
+		}
+
+		// ok we have 1 byte left over, assume we need 2 to form the character
+		return {
+			bytesNeeded: 1,
+			index: length - 1,
+			charLength: 2
+		};
+	}
+
+	_incompleteEnd() {
+		// Just write out the last N bytes, hopefully the engine can handle it for us?
+		return this.incomplete.toString('utf16le', 0, this.byteCount);
+	}
+}
+
+class Base64StringDecoder extends MultiByteStringDecoderImpl {
+	constructor() {
+		super('base64', 3); // input is consumed in groups of 3 bytes
+		this.charLength = 3; // always 3!
+	}
+
+	_checkIncompleteBytes(buffer) {
+		const length = buffer.length;
+		const modulo = length % 3;
+		// base64 needs 3 bytes always, so if we have that many (or a multiple), we have a complete buffer
+		if (modulo === 0) {
+			return {
+				bytesNeeded: 0, // no index: write() does not read it when nothing is incomplete
+				charLength: 3
+			};
+		}
+
+		// ok we have 1 or 2 bytes left over
+		return {
+			bytesNeeded: 3 - modulo, // always need 3, so if we have 1 left over -> need 2
+			index: length - modulo,
+			charLength: 3 // always need 3
+		};
+	}
+
+	_incompleteBufferEmptied() { // overridden so the reset keeps charLength at 3 instead of the base class's 1
+		this.byteCount = 0;
+		this.charLength = 3; // always 3!
+	}
+
+	_incompleteEnd() {
+		// Just write out the last N bytes, it should insert the '=' placeholders
+		// it's not really 'missing'/'incomplete', just needs placeholder insertion
+		return this.incomplete.toString('base64', 0, this.byteCount);
+	}
+}
+
+function checkCharLengthForUTF8(byte) { // returns how many bytes (1-4) the UTF-8 character starting with `byte` occupies
+	// 11110XXX => 11110 => 0x1E
+	if (byte >> 3 === 0x1E) {
+		return 4;
+	}
+
+	// 1110XXXX => 1110 => 0x0E
+	if (byte >> 4 === 0x0E) {
+		return 3;
+	}
+
+	// 110XXXXX => 110 => 0x06
+	if (byte >> 5 === 0x06) {
+		return 2;
+	}
+	return 1; // anything else (ASCII, continuation bytes 10XXXXXX, other values) is treated as a 1-byte character
+}
+
+export default { StringDecoder };
diff --git a/tests/Resources/string_decoder.addontest.js b/tests/Resources/string_decoder.addontest.js
new file mode 100644
index 00000000000..2a1428c4e61
--- /dev/null
+++ b/tests/Resources/string_decoder.addontest.js
@@ -0,0 +1,172 @@
+/*
+ * Appcelerator Titanium Mobile
+ * Copyright (c) 2019-Present by Appcelerator, Inc. All Rights Reserved.
+ * Licensed under the terms of the Apache Public License
+ * Please see the LICENSE included with this distribution for details.
+ */
+/* eslint-env mocha */
+/* eslint no-unused-expressions: "off" */
+'use strict';
+const should = require('./utilities/assertions');
+let StringDecoder;
+
+describe('string_decoder', () => { // NOTE: later tests rely on the first test having assigned the shared StringDecoder variable
+	it('can be loaded as a core module', () => {
+		StringDecoder = require('string_decoder').StringDecoder;
+		should(StringDecoder).exist;
+	});
+
+	it('uses \'utf8\' as the default encoding', () => {
+		const decoder = new StringDecoder();
+		should(decoder.encoding).eql('utf8');
+	});
+
+	describe('#end()', () => {
+		it('is a Function', () => {
+			const decoder = new StringDecoder();
+			should(decoder.end).be.a.Function;
+		});
+
+		it('handles base64 with single byte', () => {
+			const decoder = new StringDecoder('base64');
+			should(decoder.write(Buffer.from([ 0x61 ]))).eql(''); // a lone byte is held back until end() or until 3 bytes are available
+			should(decoder.end()).eql('YQ==');
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+		});
+
+		it('handles base64 with single byte, writing again', () => {
+			const decoder = new StringDecoder('base64');
+			should(decoder.write(Buffer.from([ 0x61 ]))).eql('');
+			should(decoder.end()).eql('YQ==');
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+
+			should(decoder.write(Buffer.from([ 0x61 ]))).eql('');
+			should(decoder.end()).eql('YQ==');
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+		});
+
+		it('handles base64 with two bytes', () => {
+			const decoder = new StringDecoder('base64');
+			should(decoder.write(Buffer.from([ 0x61, 0x61 ]))).eql('');
+			should(decoder.end()).eql('YWE=');
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+		});
+
+		it('handles base64 with two bytes, writing again', () => {
+			const decoder = new StringDecoder('base64');
+			should(decoder.write(Buffer.from([ 0x61, 0x61 ]))).eql('');
+			should(decoder.end()).eql('YWE=');
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+
+			should(decoder.write(Buffer.from([ 0x61 ]))).eql('');
+			should(decoder.end()).eql('YQ==');
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+		});
+
+		it('handles base64 with three bytes', () => {
+			const decoder = new StringDecoder('base64');
+			should(decoder.write(Buffer.from([ 0x61, 0x61, 0x61 ]))).eql('YWFh'); // we got our 3 bytes!
+			should(decoder.end()).eql(''); // don't add anything else
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+		});
+
+		it('handles base64 with three bytes, wirting again', () => {
+			const decoder = new StringDecoder('base64');
+			should(decoder.write(Buffer.from([ 0x61, 0x61, 0x61 ]))).eql('YWFh'); // we got our 3 bytes!
+			should(decoder.end()).eql(''); // don't add anything else
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+
+			should(decoder.write(Buffer.from([ 0x61 ]))).eql('');
+			should(decoder.end()).eql('YQ==');
+			// resets internals so writing empty buffer doesn't output anything anymore
+			should(decoder.write(Buffer.from([]))).eql('');
+			should(decoder.end()).eql('');
+		});
+	});
+
+	describe('#write()', () => {
+		it('is a Function', () => {
+			const decoder = new StringDecoder();
+			should(decoder.write).be.a.Function;
+		});
+
+		it('holds multi-byte utf-8 characters until end is reached', () => {
+			const decoder = new StringDecoder('utf8');
+			should(decoder.write(Buffer.from([ 0xE2 ]))).eql(''); // 0xE2 0x82 0xAC is the 3-byte encoding of the euro sign, fed one byte at a time
+			should(decoder.write(Buffer.from([ 0x82 ]))).eql('');
+			should(decoder.end(Buffer.from([ 0xAC ]))).eql('€'); // FIXME: Failing!
+		});
+	});
+
+	it('works without new keyword', () => {
+		const decoder2 = {};
+		StringDecoder.call(decoder2); // runs the constructor body against a plain object
+		should(decoder2.encoding).eql('utf8');
+		should(decoder2.write).be.a.Function;
+		should(decoder2.end).be.a.Function;
+	});
+
+	it('handles standard utf-8 buffers', () => {
+		decodeTest('utf8', Buffer.from('$', 'utf8'), '$');
+		decodeTest('utf-8', Buffer.from('¢', 'utf-8'), '¢');
+		decodeTest('utf-8', Buffer.from('€', 'utf-8'), '€');
+		decodeTest('utf-8', Buffer.from('𤭢', 'utf-8'), '𤭢');
+	});
+
+	it('handles mixed ascii and non-ascii', () => {
+		decodeTest(
+			'utf-8',
+			Buffer.from([ 0xCB, 0xA4, 0x64, 0xE1, 0x8B, 0xA4, 0x30, 0xE3, 0x81, 0x85 ]),
+			'\u02e4\u0064\u12e4\u0030\u3045'
+		);
+	});
+
+	// FIXME: Requires native fix in iOS for Ti.Buffer, see https://github.com/appcelerator/titanium_mobile/pull/11095#issue-302964559
+	it.allBroken('handles invalid utf-8 input', () => {
+		decodeTest('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA');
+		decodeTest('utf-8', Buffer.from('E2', 'hex'), '\ufffd');
+		decodeTest('utf-8', Buffer.from('E241', 'hex'), '\ufffdA'); // FIXME: Failing!
+		decodeTest('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338');
+		decodeTest('utf-8', Buffer.from('F0B841', 'hex'), '\ufffdA');
+		decodeTest('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338');
+		decodeTest('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0');
+		decodeTest('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38');
+		decodeTest('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\u0338');
+		decodeTest('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001');
+		decodeTest('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379');
+	});
+
+	// UCS-2
+	it('handles UCS-2', () => {
+		decodeTest('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc');
+	});
+
+	// UTF-16LE
+	it('handles UTF-16LE', () => {
+		decodeTest('utf16le', Buffer.from('3DD84DDC', 'hex'), '\ud83d\udc4d'); // thumbs up
+	});
+});
+
+function decodeTest(encoding, input, expected) { // helper: decodes `input` in a single write() + end() and compares with `expected`
+	const decoder = new StringDecoder(encoding); // fresh decoder per call, so no state leaks between cases
+	let output = '';
+	output += decoder.write(input);
+	output += decoder.end(); // flushes whatever write() held back as incomplete
+	should(output).eql(expected);
+}