From 0820a9592e4af88bb093ff0e8a6ff31d1da823ad Mon Sep 17 00:00:00 2001
From: Robert Hafner
Date: Sat, 18 Nov 2017 09:05:36 -0800
Subject: [PATCH 1/4] Add python style unicode support (`\U0001F3B5`)

---
 src/index.js  | 14 +++++++++-----
 test/index.js |  5 +++++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/index.js b/src/index.js
index 496cd8b..fbccc1e 100644
--- a/src/index.js
+++ b/src/index.js
@@ -6,15 +6,17 @@ import 'string.fromcodepoint';
  *   u\{([0-9A-Fa-f]+)\} - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
  * |
  *   u([0-9A-Fa-f]{4}) - second alternative; matches the 4-digit hexadecimal escape sequence (\uABCD)
- * |
+ * |
  *   x([0-9A-Fa-f]{2}) - third alternative; matches the 2-digit hexadecimal escape sequence (\xA5)
- * |
+ * |
  *   ([1-7][0-7]{0,2}|[0-7]{2,3}) - fourth alternative; matches the up-to-3-digit octal escape sequence (\5 or \512)
- * |
+ * |
  *   (['"tbrnfv0\\]) - fifth alternative; matches the special escape characters (\t, \n and so on)
+ * |
+ *   \U([0-9A-Fa-f]+) - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
  * )
  */
-const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))/g;
+const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\U([0-9A-Fa-f]+)/g;
 
 const usualEscapeSequences = {
     '0': '\0',
@@ -33,7 +35,7 @@ const fromHex = (str) => String.fromCodePoint(parseInt(str, 16));
 const fromOct = (str) => String.fromCodePoint(parseInt(str, 8));
 
 export default (string) => {
-    return string.replace(jsEscapeRegex, (_, __, varHex, longHex, shortHex, octal, specialCharacter) => {
+    return string.replace(jsEscapeRegex, (_, __, varHex, longHex, shortHex, octal, specialCharacter, python) => {
         if (varHex !== undefined) {
             return fromHex(varHex);
         } else if (longHex !== undefined) {
@@ -42,6 +44,8 @@ export default (string) => {
             return fromHex(shortHex);
         } else if (octal !== undefined) {
             return fromOct(octal);
+        } else if (python !== undefined) {
+            return fromHex(python);
         } else {
             return usualEscapeSequences[specialCharacter];
         }
diff --git a/test/index.js b/test/index.js
index d60b552..017324c 100644
--- a/test/index.js
+++ b/test/index.js
@@ -39,3 +39,8 @@ test('avoids double unescape cascade', t => {
     t.is(unescapeJs('---\\\\x41---'), '---\\x41---');
     t.is(unescapeJs('---\\x5cx41---'), '---\\x41---');
 });
+
+test('python hex escape sequences', t => {
+    t.is(unescapeJs('---\U000000A9---'), '---\u00A9---');
+    t.is(unescapeJs('---\U0001F3B5---'), '---\uD83C\uDFB5---');
+});

From 8cf91520ca6342049176c0539fae4c8d2e1dec7f Mon Sep 17 00:00:00 2001
From: Robert Hafner
Date: Sat, 18 Nov 2017 09:08:42 -0800
Subject: [PATCH 2/4] fix comment

---
 src/index.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/index.js b/src/index.js
index fbccc1e..2b0a5be 100644
--- a/src/index.js
+++ b/src/index.js
@@ -13,7 +13,7 @@ import 'string.fromcodepoint';
  * |
  *   (['"tbrnfv0\\]) - fifth alternative; matches the special escape characters (\t, \n and so on)
  * |
- *   \U([0-9A-Fa-f]+) - first alternative; matches the variable-length hexadecimal escape sequence (\u{ABCD0})
+ *   \U([0-9A-Fa-f]+) - sixth alternative; matches the 8-digit hexadecimal escape sequence used by python (\U0001F3B5)
  * )
  */
 const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\U([0-9A-Fa-f]+)/g;

From 5a95b7ed9b5e90f682489804a159b29423b69b73 Mon Sep 17 00:00:00 2001
From: Robert Hafner
Date: Sat, 18 Nov 2017 09:37:41 -0800
Subject: [PATCH 3/4] make python test check for fixed character count instead of dynamic

---
 src/index.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/index.js b/src/index.js
index 2b0a5be..ae3db0d 100644
--- a/src/index.js
+++ b/src/index.js
@@ -16,7 +16,7 @@ import 'string.fromcodepoint';
  *   \U([0-9A-Fa-f]+) - sixth alternative; matches the 8-digit hexadecimal escape sequence used by python (\U0001F3B5)
  * )
  */
-const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\U([0-9A-Fa-f]+)/g;
+const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\U([0-9A-Fa-f]{8})/g;
 
 const usualEscapeSequences = {
     '0': '\0',

From 368192ec6fd0c387fab89eb946de73706b61a0ec Mon Sep 17 00:00:00 2001
From: Robert Hafner
Date: Sun, 19 Nov 2017 13:46:54 -0800
Subject: [PATCH 4/4] Correct handling of backslashes on python unescaping

---
 src/index.js  | 2 +-
 test/index.js | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/index.js b/src/index.js
index ae3db0d..5eea7c8 100644
--- a/src/index.js
+++ b/src/index.js
@@ -16,7 +16,7 @@ import 'string.fromcodepoint';
  *   \U([0-9A-Fa-f]+) - sixth alternative; matches the 8-digit hexadecimal escape sequence used by python (\U0001F3B5)
  * )
  */
-const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\U([0-9A-Fa-f]{8})/g;
+const jsEscapeRegex = /\\(u\{([0-9A-Fa-f]+)\}|u([0-9A-Fa-f]{4})|x([0-9A-Fa-f]{2})|([1-7][0-7]{0,2}|[0-7]{2,3})|(['"tbrnfv0\\]))|\\U([0-9A-Fa-f]{8})/g;
 
 const usualEscapeSequences = {
     '0': '\0',
diff --git a/test/index.js b/test/index.js
index 017324c..c854df1 100644
--- a/test/index.js
+++ b/test/index.js
@@ -41,6 +41,6 @@ test('avoids double unescape cascade', t => {
 });
 
 test('python hex escape sequences', t => {
-    t.is(unescapeJs('---\U000000A9---'), '---\u00A9---');
-    t.is(unescapeJs('---\U0001F3B5---'), '---\uD83C\uDFB5---');
+    t.is(unescapeJs('---\\U000000A9---'), '---\u00A9---');
+    t.is(unescapeJs('---\\U0001F3B5---'), '---\uD83C\uDFB5---');
 });