Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for extended Unicode escape sequences in strings and templates #2169

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
23 changes: 0 additions & 23 deletions src/compiler/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -623,29 +623,6 @@ module ts {
"\u0085": "\\u0085" // nextLine
};

/**
 * Escapes a string for emission inside a quoted literal.
 * Modeled closely on the abstract 'Quote' / 'QuoteJSONString' operation from ECMA-262 (24.3.2.2),
 * extended with a few additional characters.
 * The result is NOT wrapped in double quotes; callers add their own delimiters.
 */
export function escapeString(s: string): string {
    // First pass: '"' and '\' take priority over everything else.
    if (backslashOrDoubleQuote.test(s)) {
        s = s.replace(backslashOrDoubleQuote, replaceCharacter);
    }

    // Second pass: the remaining characters matched by escapedCharsRegExp.
    if (escapedCharsRegExp.test(s)) {
        s = s.replace(escapedCharsRegExp, replaceCharacter);
    }

    return s;

    // Uses the table entry for the matched character when there is one; otherwise
    // falls back to a '\uXXXX' escape built from the character's code unit,
    // left-padded with zeros to four hexadecimal digits.
    function replaceCharacter(matched: string) {
        var mapped = escapedCharsMap[matched];
        if (mapped) {
            return mapped;
        }

        var hex = matched.charCodeAt(0).toString(16);
        return "\\u" + ("0000" + hex).slice(-4);
    }
}

/**
 * Picks the default library declaration file name for the given options:
 * "lib.es6.d.ts" when the target is ES6, "lib.d.ts" for any other target.
 */
export function getDefaultLibFileName(options: CompilerOptions): string {
    if (options.target === ScriptTarget.ES6) {
        return "lib.es6.d.ts";
    }

    return "lib.d.ts";
}
Expand Down
2 changes: 2 additions & 0 deletions src/compiler/diagnosticInformationMap.generated.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ module ts {
Catch_clause_variable_name_must_be_an_identifier: { code: 1195, category: DiagnosticCategory.Error, key: "Catch clause variable name must be an identifier." },
Catch_clause_variable_cannot_have_a_type_annotation: { code: 1196, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have a type annotation." },
Catch_clause_variable_cannot_have_an_initializer: { code: 1197, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have an initializer." },
An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive: { code: 1198, category: DiagnosticCategory.Error, key: "An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive." },
Unterminated_Unicode_escape_sequence: { code: 1199, category: DiagnosticCategory.Error, key: "Unterminated Unicode escape sequence." },
Duplicate_identifier_0: { code: 2300, category: DiagnosticCategory.Error, key: "Duplicate identifier '{0}'." },
Initializer_of_instance_member_variable_0_cannot_reference_identifier_1_declared_in_the_constructor: { code: 2301, category: DiagnosticCategory.Error, key: "Initializer of instance member variable '{0}' cannot reference identifier '{1}' declared in the constructor." },
Static_members_cannot_reference_class_type_parameters: { code: 2302, category: DiagnosticCategory.Error, key: "Static members cannot reference class type parameters." },
Expand Down
9 changes: 8 additions & 1 deletion src/compiler/diagnosticMessages.json
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,14 @@
"category": "Error",
"code": 1197
},

"An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive.": {
"category": "Error",
"code": 1198
},
"Unterminated Unicode escape sequence.": {
"category": "Error",
"code": 1199
},
"Duplicate identifier '{0}'.": {
"category": "Error",
"code": 2300
Expand Down
66 changes: 50 additions & 16 deletions src/compiler/emitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2178,36 +2178,70 @@ module ts {
}
}

function isBinaryOrOctalIntegerLiteral(text: string): boolean {
if (text.length <= 0) {
return false;
}

if (text.charCodeAt(1) === CharacterCodes.B || text.charCodeAt(1) === CharacterCodes.b ||
text.charCodeAt(1) === CharacterCodes.O || text.charCodeAt(1) === CharacterCodes.o) {
return true;
/**
 * Returns true when the node is a numeric literal whose second character is
 * 'b', 'B', 'o' or 'O' (as in "0b101" or "0o17"); false for every other node or text.
 */
function isBinaryOrOctalIntegerLiteral(node: LiteralExpression, text: string): boolean {
    // Only numeric literals with at least two characters can carry a radix prefix.
    if (node.kind !== SyntaxKind.NumericLiteral || text.length <= 1) {
        return false;
    }

    var radixMarker = text.charCodeAt(1);
    return radixMarker === CharacterCodes.b ||
        radixMarker === CharacterCodes.B ||
        radixMarker === CharacterCodes.o ||
        radixMarker === CharacterCodes.O;
}

function emitLiteral(node: LiteralExpression) {
var text = languageVersion < ScriptTarget.ES6 && isTemplateLiteralKind(node.kind) ? getTemplateLiteralAsStringLiteral(node) :
node.parent ? getSourceTextOfNodeFromSourceFile(currentSourceFile, node) :
node.text;
var text = getLiteralText(node);

if (compilerOptions.sourceMap && (node.kind === SyntaxKind.StringLiteral || isTemplateLiteralKind(node.kind))) {
writer.writeLiteral(text);
}
// For version below ES6, emit binary integer literal and octal integer literal in canonical form
else if (languageVersion < ScriptTarget.ES6 && node.kind === SyntaxKind.NumericLiteral && isBinaryOrOctalIntegerLiteral(text)) {
// For versions below ES6, emit binary & octal literals in their canonical decimal form.
else if (languageVersion < ScriptTarget.ES6 && isBinaryOrOctalIntegerLiteral(node, text)) {
write(node.text);
}
else {
write(text);
}
}

function getTemplateLiteralAsStringLiteral(node: LiteralExpression): string {
return '"' + escapeString(node.text) + '"';

/**
 * Computes the text to emit for a literal node.
 *
 * Order of preference:
 *  1. When targeting below ES6, template literals and literals containing an extended
 *     Unicode escape (e.g. "\u{0067}") are emitted as an escaped, double-quoted string.
 *  2. Otherwise, if the node has a parent, the text exactly as written in the source file.
 *  3. Otherwise, a form rebuilt from node.text: the text itself for numbers, or an escaped
 *     form wrapped in the delimiters appropriate for the literal kind.
 */
function getLiteralText(node: LiteralExpression) {
    var mustDownlevel = languageVersion < ScriptTarget.ES6 &&
        (isTemplateLiteralKind(node.kind) || node.hasExtendedUnicodeEscape);

    if (mustDownlevel) {
        // Downleveled output is always an escaped double-quoted string literal.
        return getQuotedEscapedLiteralText('"', node.text, '"');
    }

    if (node.parent) {
        // No downleveling needed and the original source is reachable: reuse it verbatim.
        return getSourceTextOfNodeFromSourceFile(currentSourceFile, node);
    }

    // The original source text is unreachable, so rebuild the literal from its text,
    // choosing the opening and closing delimiters by literal kind.
    switch (node.kind) {
        case SyntaxKind.NumericLiteral:
            return node.text;
        case SyntaxKind.StringLiteral:
            return getQuotedEscapedLiteralText('"', node.text, '"');
        case SyntaxKind.NoSubstitutionTemplateLiteral:
            return getQuotedEscapedLiteralText('`', node.text, '`');
        case SyntaxKind.TemplateHead:
            return getQuotedEscapedLiteralText('`', node.text, '${');
        case SyntaxKind.TemplateMiddle:
            return getQuotedEscapedLiteralText('}', node.text, '${');
        case SyntaxKind.TemplateTail:
            return getQuotedEscapedLiteralText('}', node.text, '`');
    }

    Debug.fail(`Literal kind '${node.kind}' not accounted for.`);
}

/**
 * Escapes the given text (first with escapeString, then with escapeNonAsciiCharacters)
 * and surrounds the result with the supplied left and right delimiters.
 */
function getQuotedEscapedLiteralText(leftQuote: string, text: string, rightQuote: string) {
    var escapedText = escapeNonAsciiCharacters(escapeString(text));
    return leftQuote + escapedText + rightQuote;
}

function emitDownlevelRawTemplateLiteral(node: LiteralExpression) {
Expand Down
4 changes: 4 additions & 0 deletions src/compiler/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2163,6 +2163,10 @@ module ts {
var text = scanner.getTokenValue();
node.text = internName ? internIdentifier(text) : text;

if (scanner.hasExtendedUnicodeEscape()) {
node.hasExtendedUnicodeEscape = true;
}

if (scanner.isUnterminated()) {
node.isUnterminated = true;
}
Expand Down
110 changes: 97 additions & 13 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module ts {
getTokenPos(): number;
getTokenText(): string;
getTokenValue(): string;
hasExtendedUnicodeEscape(): boolean;
hasPrecedingLineBreak(): boolean;
isIdentifier(): boolean;
isReservedWord(): boolean;
Expand Down Expand Up @@ -556,6 +557,7 @@ module ts {
var token: SyntaxKind;
var tokenValue: string;
var precedingLineBreak: boolean;
var hasExtendedUnicodeEscape: boolean;
var tokenIsUnterminated: boolean;

function error(message: DiagnosticMessage, length?: number): void {
Expand Down Expand Up @@ -606,11 +608,27 @@ module ts {
}
return +(text.substring(start, pos));
}

/**
 * Scans exactly `count` hexadecimal digits from the text and returns their value,
 * or -1 when fewer than `count` digits are available.
 */
function scanExactNumberOfHexDigits(count: number): number {
    var scanAsManyAsPossible = false;
    return scanHexDigits(count, scanAsManyAsPossible);
}

/**
 * Scans every consecutive hexadecimal digit available in the text and returns their value,
 * or -1 when fewer than `count` digits were found.
 */
function scanMinimumNumberOfHexDigits(count: number): number {
    var scanAsManyAsPossible = true;
    return scanHexDigits(count, scanAsManyAsPossible);
}

function scanHexDigits(count: number, mustMatchCount?: boolean): number {
function scanHexDigits(minCount: number, scanAsManyAsPossible: boolean): number {
var digits = 0;
var value = 0;
while (digits < count || !mustMatchCount) {
while (digits < minCount || scanAsManyAsPossible) {
var ch = text.charCodeAt(pos);
if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) {
value = value * 16 + ch - CharacterCodes._0;
Expand All @@ -627,7 +645,7 @@ module ts {
pos++;
digits++;
}
if (digits < count) {
if (digits < minCount) {
value = -1;
}
return value;
Expand Down Expand Up @@ -764,16 +782,20 @@ module ts {
return "\'";
case CharacterCodes.doubleQuote:
return "\"";
case CharacterCodes.x:
case CharacterCodes.u:
var ch = scanHexDigits(ch === CharacterCodes.x ? 2 : 4, /*mustMatchCount*/ true);
if (ch >= 0) {
return String.fromCharCode(ch);
}
else {
error(Diagnostics.Hexadecimal_digit_expected);
return ""
// '\u{DDDDDDDD}'
if (pos < len && text.charCodeAt(pos) === CharacterCodes.openBrace) {
hasExtendedUnicodeEscape = true;
pos++;
return scanExtendedUnicodeEscape();
}

// '\uDDDD'
return scanHexadecimalEscape(/*numDigits*/ 4)

case CharacterCodes.x:
// '\xDD'
return scanHexadecimalEscape(/*numDigits*/ 2)

// when encountering a LineContinuation (i.e. a backslash and a line terminator sequence),
// the line terminator is interpreted to be "the empty code unit sequence".
Expand All @@ -790,14 +812,74 @@ module ts {
return String.fromCharCode(ch);
}
}

/**
 * Scans a fixed-width hexadecimal escape body (the digits of '\xDD' or '\uDDDD').
 * Returns the single-code-unit string for the scanned value; when the required number
 * of digits is not present, reports "Hexadecimal digit expected" and returns "".
 */
function scanHexadecimalEscape(numDigits: number): string {
    var codeUnit = scanExactNumberOfHexDigits(numDigits);

    if (codeUnit >= 0) {
        return String.fromCharCode(codeUnit);
    }

    // A negative result signals that too few hex digits were available.
    error(Diagnostics.Hexadecimal_digit_expected);
    return "";
}

/**
 * Scans the remainder of an extended Unicode escape ('\u{DDDDDD}') starting just after
 * the opening '{', which the caller has already consumed.
 *
 * Reports an error when:
 *  - no hexadecimal digit follows the '{';
 *  - the scanned value is greater than 0x10FFFF;
 *  - the text ends before a closing '}' is seen;
 *  - the character after the digits is not '}'.
 *
 * Returns the UTF-16 encoding of the scanned code point, or "" if any error was reported.
 */
function scanExtendedUnicodeEscape(): string {
    var escapedValue = scanMinimumNumberOfHexDigits(1);
    var isInvalidExtendedEscape = false;

    // Validate the scanned code point value.
    if (escapedValue < 0) {
        error(Diagnostics.Hexadecimal_digit_expected);
        isInvalidExtendedEscape = true;
    }
    else if (escapedValue > 0x10FFFF) {
        error(Diagnostics.An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive);
        isInvalidExtendedEscape = true;
    }

    // Validate the terminator. This runs even when the value was already rejected,
    // so a well-formed closing brace is still consumed.
    if (pos >= len) {
        error(Diagnostics.Unexpected_end_of_text);
        isInvalidExtendedEscape = true;
    }
    else if (text.charCodeAt(pos) === CharacterCodes.closeBrace) {
        // Only swallow the following character up if it's a '}'.
        pos++;
    }
    else {
        error(Diagnostics.Unterminated_Unicode_escape_sequence);
        isInvalidExtendedEscape = true;
    }

    if (isInvalidExtendedEscape) {
        return "";
    }

    return utf16EncodeAsString(escapedValue);
}

// Derived from the 10.1.1 UTF16Encoding of the ES6 Spec.
function utf16EncodeAsString(codePoint: number): string {
Debug.assert(0x0 <= codePoint && codePoint <= 0x10FFFF);

if (codePoint <= 65535) {
return String.fromCharCode(codePoint);
}

var codeUnit1 = Math.floor((codePoint - 65536) / 1024) + 0xD800;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you just reference the relevant portion of the ES6 spec?

Can you just do "|0" to floor instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's exactly the same as 10.1.1

var codeUnit2 = ((codePoint - 65536) % 1024) + 0xDC00;

return String.fromCharCode(codeUnit1, codeUnit2);
}

// Current character is known to be a backslash. Check for Unicode escape of the form '\uXXXX'
// and return code point value if valid Unicode escape is found. Otherwise return -1.
function peekUnicodeEscape(): number {
if (pos + 5 < len && text.charCodeAt(pos + 1) === CharacterCodes.u) {
var start = pos;
pos += 2;
var value = scanHexDigits(4, /*mustMatchCount*/ true);
var value = scanExactNumberOfHexDigits(4);
pos = start;
return value;
}
Expand Down Expand Up @@ -869,6 +951,7 @@ module ts {

function scan(): SyntaxKind {
startPos = pos;
hasExtendedUnicodeEscape = false;
precedingLineBreak = false;
tokenIsUnterminated = false;
while (true) {
Expand Down Expand Up @@ -1034,7 +1117,7 @@ module ts {
case CharacterCodes._0:
if (pos + 2 < len && (text.charCodeAt(pos + 1) === CharacterCodes.X || text.charCodeAt(pos + 1) === CharacterCodes.x)) {
pos += 2;
var value = scanHexDigits(1, /*mustMatchCount*/ false);
var value = scanMinimumNumberOfHexDigits(1);
if (value < 0) {
error(Diagnostics.Hexadecimal_digit_expected);
value = 0;
Expand Down Expand Up @@ -1336,6 +1419,7 @@ module ts {
getTokenPos: () => tokenPos,
getTokenText: () => text.substring(tokenPos, pos),
getTokenValue: () => tokenValue,
hasExtendedUnicodeEscape: () => hasExtendedUnicodeEscape,
hasPrecedingLineBreak: () => precedingLineBreak,
isIdentifier: () => token === SyntaxKind.Identifier || token > SyntaxKind.LastReservedWord,
isReservedWord: () => token >= SyntaxKind.FirstReservedWord && token <= SyntaxKind.LastReservedWord,
Expand Down
1 change: 1 addition & 0 deletions src/compiler/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,7 @@ module ts {
export interface LiteralExpression extends PrimaryExpression {
text: string;
isUnterminated?: boolean;
hasExtendedUnicodeEscape?: boolean;
}

export interface StringLiteralExpression extends LiteralExpression {
Expand Down
Loading