Skip to content

Commit

Permalink
#50 : do not split surrogate pair for astral characters (code point >…
Browse files Browse the repository at this point in the history
… 0x10000)

Tentative fix for #64 - supposed duplicate
  • Loading branch information
Siorki committed Jan 31, 2017
1 parent 0afdab8 commit 7d2ab39
Show file tree
Hide file tree
Showing 5 changed files with 138 additions and 9 deletions.
1 change: 1 addition & 0 deletions TestCases/gitHub#64-URIError.js
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
c.fillText('✪🅼🅼🅼', 5, 165);
24 changes: 17 additions & 7 deletions regPack.js
Original file line number Diff line number Diff line change
Expand Up @@ -154,18 +154,28 @@ RegPack.prototype = {
var found=true; // stop as soon as no substring of length t is found twice
for(var t=2;found;++t) {
found=false;
for(i=0;++i<s.length-t;)
if(!matches[x=s.substr(j=i,t)])
{
if(~(j=s.indexOf(x,j+t)))
for(i=0;++i<s.length-t;) {
var beginCode = s.charCodeAt(i);
var endCode = s.charCodeAt(i+t-1);
// #50 : if the first character is a low surrogate (second character of a surrogate pair
// representing an astral character), skip it - we cannot have it begin the string
// and thus break the pair
// Same issue if the last character is a high surrogate (first in surrogate pair).
if ((beginCode<0xDC00 || beginCode>0xDFFF)
&& (endCode<0xD800 || endCode>0xDBFF)) {
if(!matches[x=s.substr(j=i,t)])
{
found=true;
for(matches[x]=1;~j;matches[x]++)
if(~(j=s.indexOf(x,j+t)))
{
j=s.indexOf(x,j+t);
found=true;
for(matches[x]=1;~j;matches[x]++)
{
j=s.indexOf(x,j+t);
}
}
}
}
}
}
} else { // only recompute the values of previously found matches
var newMatches={};
Expand Down
6 changes: 4 additions & 2 deletions tests/allTests.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,13 @@ var testIssue0042_patternViewer = require("./testIssue0042_patternViewer");
var testIssue0044_setIntervalArrowFunction = require("./testIssue0044_setIntervalArrowFunction");
var testIssue0045_closingBracket = require("./testIssue0045_closingBracket");
var testIssue0047_EscapeInCharClass = require("./testIssue0047_EscapeInCharClass");
var testIssue0050_unicodeSurrogate = require("./testIssue0050_unicodeSurrogate");
var testIssue0055_stringDelimiters = require("./testIssue0055_stringDelimiters");
var testIssue0056_setIntervalDefaultParams = require("./testIssue0056_setIntervalDefaultParams");
var testIssue0057_replacementInString = require("./testIssue0057_replacementInString");
var testIssue0058_numberAsLoopVariable = require("./testIssue0058_numberAsLoopVariable");
var testIssue0063_backtickFunctionParam = require("./testIssue0063_backtickFunctionParam");
//var testIssue0050_unicodeSurrogate = require("./testIssue0050_unicodeSurrogate");
var testIssue0064_utf8EncodeURI = require("./testIssue0064_utf8EncodeURI");

// Execute all tests in sequence
// Recommendation : put new tests at the very beginning while debugging
Expand All @@ -36,9 +37,10 @@ testIssue0042_patternViewer();
testIssue0044_setIntervalArrowFunction();
testIssue0045_closingBracket();
testIssue0047_EscapeInCharClass();
testIssue0050_unicodeSurrogate();
testIssue0055_stringDelimiters();
testIssue0056_setIntervalDefaultParams();
testIssue0057_replacementInString();
testIssue0058_numberAsLoopVariable();
testIssue0063_backtickFunctionParam();
//testIssue0050_unicodeSurrogate();
testIssue0064_utf8EncodeURI();
71 changes: 71 additions & 0 deletions tests/testIssue0050_unicodeSurrogate.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
var RegPack = require("../regPack")
var assert = require("assert");

/**
 * Entry point of the issue #50 suite: logs a start banner, runs every
 * test case of this file in order, then logs a completion banner.
 */
function runTests() {
	var banner = "Issue #0050 - Unicode surrogate byte length : ";
	console.log(banner + "start");
	[testByteLength, testSurrogatePacking].forEach(function (testCase) {
		testCase();
	});
	console.log(banner + "done");
}

/**
 * Github issue #50 - Support for characters in the astral plane
 * First, make sure that the astral characters (composed of two 16-bit codes,
 * first one in [0xD800, 0xDBFF] and second one in [0xDC00, 0xDFFF]) are correctly read
 *
 * Each case feeds a string to RegPack.packer.getByteLength() and compares the
 * returned count with the expected one. Assertions are written as
 * assert.equal(actual, expected) so that a failure report names the two
 * values correctly.
 */
function testByteLength() {
	// standard ASCII
	var input = "0123456789abcdefghijklmnopqrstuvwxyz";
	assert.equal(RegPack.packer.getByteLength(input), 36);

	// non-ASCII character outside the astral plane
	// NOTE(review): this case was labelled "2-byte UTF-8" while expecting 3 - confirm which one is intended
	input = "®";
	assert.equal(RegPack.packer.getByteLength(input), 3);

	// 4-byte UTF-8 with surrogates, written as escape sequences
	input = "\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25";
	assert.equal(RegPack.packer.getByteLength(input), 12);

	// same three characters, written as literals
	input = "🔥🔥🔥";
	assert.equal(RegPack.packer.getByteLength(input), 12);
}


/**
 * Github issue #50 - Support for characters in the astral plane
 * Then, check that the crusher does not attempt to break the input in between the two surrogate characters,
 * since a string starting with the second one would yield a malformed URI
 *
 * Runs RegPack.packer.runPacker() on six consecutive astral characters and
 * inspects the returned structure.
 */
function testSurrogatePacking() {
	// 4-byte UTF-8 with surrogates
	// Declared locally : a bare assignment would create an implicit global
	// (and throws a ReferenceError in strict mode).
	var input = "\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25\uD83D\uDD25";
	var options = {
		withMath : false,
		hash2DContext : false,
		hashWebGLContext : false,
		hashAudioContext : false,
		contextVariableName : false,
		contextType : 0,
		reassignVars : false,
		varsNotReassigned : [],
		crushGainFactor : 1,
		crushLengthFactor : 0,
		crushCopiesFactor : 0,
		crushTiebreakerFactor : 1,
		wrapInSetInterval : false,
		timeVariableName : ""
	};
	var result = RegPack.packer.runPacker(input, options);

	// NOTE(review): presumably entry [1] is the packed output and entry [2] the
	// log of that stage - confirm against runPacker
	var stage = result[0].result[2];

	// Expected result : no exception thrown before, internal check successful,
	// and the unicode characters are excluded from the token range
	assert(RegPack.packer.getByteLength(stage[1]) > 0);
	assert.notEqual(stage[1].indexOf("uffff]"), -1);
	assert.notEqual(stage[2].indexOf("Final check : passed"), -1);
}



// Export the suite entry point : tests/allTests.js requires this module and calls the returned function.
module.exports = runTests;
45 changes: 45 additions & 0 deletions tests/testIssue0064_utf8EncodeURI.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
var RegPack = require("../regPack")
var fs = require("fs");
var assert = require("assert");

/**
 * Entry point of the issue #64 suite: logs a start banner, runs the single
 * test case of this file, then logs a completion banner.
 */
function runTests() {
	var suiteName = "Issue #0064 - EncodeURI in UTF-8";
	console.log(suiteName + " : start");
	testEncodeURI();
	console.log(suiteName + " : done");
}


/**
 * Github issue #64 - Accept unicode characters
 * Make sure the Unicode characters are explicitly filtered out
 * by the RegExp in the negated char class
 *
 * Associated test file : gitHub#64-URIError.js
 */
function testEncodeURI() {
	// Resolve the fixture from this file's own directory : a bare relative path
	// is resolved by fs against the process working directory, so the test
	// used to pass only when launched from inside tests/.
	var input = fs.readFileSync(__dirname + "/../TestCases/gitHub#64-URIError.js", { encoding:"utf8"});
	var options = {
		withMath : false,
		hash2DContext : false,
		hashWebGLContext : false,
		hashAudioContext : false,
		contextVariableName : false,
		contextType : 0,
		reassignVars : false,
		varsNotReassigned : [],
		crushGainFactor : 1,
		crushLengthFactor : 0,
		crushCopiesFactor : 0,
		crushTiebreakerFactor : 1,
		wrapInSetInterval : false,
		timeVariableName : ""
	};
	var result = RegPack.packer.runPacker(input, options);

	// NOTE(review): presumably entry [1] is the packed output and entry [2] the
	// log of that stage - confirm against runPacker
	var stage = result[0].result[2];

	// Expected result : internal check successful,
	// and the unicode characters are excluded from the token range
	assert.notEqual(stage[1].indexOf("uffff]"), -1);
	assert.notEqual(stage[2].indexOf("Final check : passed"), -1);
}

// Export the suite entry point : tests/allTests.js requires this module and calls the returned function.
module.exports = runTests;

0 comments on commit 7d2ab39

Please sign in to comment.