#50 : do not split surrogate pair for astral characters (code point >…

… 0x10000) Tentative fix for #64 - supposed duplicate
Siorki · Jan 31, 2017 · 7d2ab39 · 7d2ab39
1 parent 0afdab8
commit 7d2ab39
Show file tree

Hide file tree

Showing 5 changed files with 138 additions and 9 deletions.
diff --git a/TestCases/gitHub#64-URIError.js b/TestCases/gitHub#64-URIError.js
@@ -0,0 +1 @@
+c.fillText('✪🅼🅼🅼', 5, 165);
diff --git a/regPack.js b/regPack.js
@@ -154,18 +154,28 @@ RegPack.prototype = {
 				var found=true;	// stop as soon as no substring of length t is found twice
 				for(var t=2;found;++t) {
 					found=false;
-					for(i=0;++i<s.length-t;)
-						if(!matches[x=s.substr(j=i,t)])
-						{
-							if(~(j=s.indexOf(x,j+t)))
+					for(i=0;++i<s.length-t;) {
+						var beginCode = s.charCodeAt(i);
+						var endCode = s.charCodeAt(i+t-1);
+						// #50 : if the first character is a low surrogate (second character of a surrogate pair
+						// representing an astral character), skip it - we cannot have it begin the string
+						// and thus break the pair
+						// Same issue if the last character is a high surrogate (first in surrogate pair).
+						if ((beginCode<0xDC00 || beginCode>0xDFFF)
+							&& (endCode<0xD800 || endCode>0xDBFF)) {
+							if(!matches[x=s.substr(j=i,t)])
 							{
-								found=true;
-								for(matches[x]=1;~j;matches[x]++)
+								if(~(j=s.indexOf(x,j+t)))
 								{
-									j=s.indexOf(x,j+t);
+									found=true;
+									for(matches[x]=1;~j;matches[x]++)
+									{
+										j=s.indexOf(x,j+t);
+									}
 								}
 							}
 						}
+					}
 				}
 			} else {	// only recompute the values of previously found matches
 				var newMatches={};

diff --git a/tests/allTests.js b/tests/allTests.js
@@ -12,12 +12,13 @@ var testIssue0042_patternViewer = require("./testIssue0042_patternViewer");
 var testIssue0044_setIntervalArrowFunction = require("./testIssue0044_setIntervalArrowFunction");
 var testIssue0045_closingBracket = require("./testIssue0045_closingBracket");
 var testIssue0047_EscapeInCharClass = require("./testIssue0047_EscapeInCharClass");
+var testIssue0050_unicodeSurrogate = require("./testIssue0050_unicodeSurrogate");
 var testIssue0055_stringDelimiters = require("./testIssue0055_stringDelimiters");
 var testIssue0056_setIntervalDefaultParams = require("./testIssue0056_setIntervalDefaultParams");
 var testIssue0057_replacementInString = require("./testIssue0057_replacementInString");
 var testIssue0058_numberAsLoopVariable = require("./testIssue0058_numberAsLoopVariable");
 var testIssue0063_backtickFunctionParam = require("./testIssue0063_backtickFunctionParam");
-//var testIssue0050_unicodeSurrogate = require("./testIssue0050_unicodeSurrogate");
+var testIssue0064_utf8EncodeURI = require("./testIssue0064_utf8EncodeURI");
 
 // Execute all tests in sequence
 // Recommendation : put new tests at the very beginning while debugging
@@ -36,9 +37,10 @@ testIssue0042_patternViewer();
 testIssue0044_setIntervalArrowFunction();
 testIssue0045_closingBracket();
 testIssue0047_EscapeInCharClass();
+testIssue0050_unicodeSurrogate();
 testIssue0055_stringDelimiters();
 testIssue0056_setIntervalDefaultParams();
 testIssue0057_replacementInString();
 testIssue0058_numberAsLoopVariable();
 testIssue0063_backtickFunctionParam();
-//testIssue0050_unicodeSurrogate();
+testIssue0064_utf8EncodeURI();
diff --git a/tests/testIssue0050_unicodeSurrogate.js b/tests/testIssue0050_unicodeSurrogate.js
@@ -0,0 +1,71 @@
+ï»¿var RegPack = require("../regPack")
+var assert = require("assert");
+
+function runTests() {
+	console.log("Issue #0050 - Unicode surrogate byte length : start");
+	testByteLength();
+	testSurrogatePacking();
+	console.log("Issue #0050 - Unicode surrogate byte length : done");
+}
+
+/**
+ * Github issue #50 - Support for characters in the astral plane
+ * First, make sure that the astral characters (composed of two 16-bit codes,
+ * first one in [0xD800, 0xDBFF] and second one in [0xDC00, DFFF]) are correctly read
+ *
+ */
+function testByteLength() {
+	// standard ASCII
+ 	var input = "0123456789abcdefghijklmnopqrstuvwxyz";
+	assert.equal(36, RegPack.packer.getByteLength(input));
+
+	// 2-byte UTF-8
+	input = "®";
+	assert.equal(3, RegPack.packer.getByteLength(input));
+
+	// 4-byte UTF-8 with surrogates
+	input = "\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25";
+	assert.equal(12, RegPack.packer.getByteLength(input));
+
+	input = "ðŸ”¥ðŸ”¥ðŸ”¥";
+	assert.equal(12, RegPack.packer.getByteLength(input));
+}
+
+
+/**
+ * Github issue #50 - Support for characters in the astral plane
+ * Then, check that the crusher does not attempt to break the input in between the two surrogate characters,
+ * since a string starting with the second one would yield a malformed URI
+ *
+ */
+function testSurrogatePacking() {
+	// 4-byte UTF-8 with surrogates
+	input = "\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25";
+	var options = {
+		withMath : false,
+		hash2DContext : false,
+		hashWebGLContext : false,
+		hashAudioContext : false,
+		contextVariableName : false,
+		contextType : parseInt(0),
+		reassignVars : false,
+		varsNotReassigned : [],
+		crushGainFactor : parseFloat(1),
+		crushLengthFactor : parseFloat(0),
+		crushCopiesFactor : parseFloat(0),
+		crushTiebreakerFactor : parseInt(1),
+		wrapInSetInterval : false,
+		timeVariableName : ""
+	};
+	var result = RegPack.packer.runPacker(input, options);
+
+	// Expected result : no exception thrown before, internal check successful, 
+	// and the unicode characters are excluded from the token range
+	assert(RegPack.packer.getByteLength(result[0].result[2][1]) > 0);
+	assert.notEqual(result[0].result[2][1].indexOf("uffff]"), -1);
+	assert.notEqual(result[0].result[2][2].indexOf("Final check : passed"), -1);
+}
+
+
+
+module.exports = runTests;
diff --git a/tests/testIssue0064_utf8EncodeURI.js b/tests/testIssue0064_utf8EncodeURI.js
@@ -0,0 +1,45 @@
+var RegPack = require("../regPack")
+var fs = require("fs");
+var assert = require("assert");
+
+function runTests() {
+	console.log("Issue #0064 - EncodeURI in UTF-8 : start");
+	testEncodeURI();
+	console.log("Issue #0064 - EncodeURI in UTF-8 : done");
+}
+
+
+/**
+ * Github issue #64 - Accept unicode characters
+ * Make sure the Unicode characters are explicitely filtered out
+ * by the RegExp in the negated char class
+ *
+ * Associated test file : gitHub#64-URIError.js
+ */
+function testEncodeURI() {
+	var input = fs.readFileSync("../TestCases/gitHub#64-URIError.js", { encoding:"utf8"});
+	var options = {
+			withMath : false,
+			hash2DContext : false,
+			hashWebGLContext : false,
+			hashAudioContext : false,
+			contextVariableName : false,
+			contextType : parseInt(0),
+			reassignVars : false,
+			varsNotReassigned : [],
+			crushGainFactor : parseFloat(1),
+			crushLengthFactor : parseFloat(0),
+			crushCopiesFactor : parseFloat(0),
+			crushTiebreakerFactor : parseInt(1),
+			wrapInSetInterval : false,
+			timeVariableName : ""
+		};
+	var result = RegPack.packer.runPacker(input, options);
+
+	// Expected result : internal check successful, 
+	// and the unicode characters are excluded from the token range
+	assert.notEqual(result[0].result[2][1].indexOf("uffff]"), -1);
+	assert.notEqual(result[0].result[2][2].indexOf("Final check : passed"), -1);
+}
+
+module.exports = runTests;