Skip to content

Commit

Permalink
Merge pull request #2169 from Microsoft/withANameLikeUnicodeYoudThink…
Browse files Browse the repository at this point in the history
…ThereWouldntBeSoManyWaysToDoIt

Add support for extended Unicode escape sequences in strings and templates
  • Loading branch information
DanielRosenwasser committed Mar 3, 2015
2 parents e5a8deb + 5c5a489 commit 7212912
Show file tree
Hide file tree
Showing 407 changed files with 2,787 additions and 77 deletions.
23 changes: 0 additions & 23 deletions src/compiler/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -623,29 +623,6 @@ module ts {
"\u0085": "\\u0085" // nextLine
};

/**
 * Escapes the characters of a string so it can be placed inside a quoted literal.
 * Modeled closely on the abstract 'Quote'/'QuoteJSONString' operation from
 * ECMA-262 (24.3.2.2), extended with a few additional characters.
 * The result is NOT wrapped in double quotes; callers add their own delimiters.
 */
export function escapeString(s: string): string {
    // '"' and '\' get their own pass, ahead of every other escaped character.
    if (backslashOrDoubleQuote.test(s)) {
        s = s.replace(backslashOrDoubleQuote, replaceCharacter);
    }

    if (escapedCharsRegExp.test(s)) {
        s = s.replace(escapedCharsRegExp, replaceCharacter);
    }

    return s;

    // Uses the table entry when there is one; otherwise produces a four-digit,
    // zero-padded '\uXXXX' escape built from the character's first code unit.
    function replaceCharacter(ch: string) {
        var mapped = escapedCharsMap[ch];
        if (mapped) {
            return mapped;
        }

        var hex = ch.charCodeAt(0).toString(16);
        return "\\u" + ("0000" + hex).slice(-4);
    }
}

/**
 * Returns the file name of the default library declaration file for the given options.
 */
export function getDefaultLibFileName(options: CompilerOptions): string {
    // Only an ES6 target selects the ES6 library; every other target uses lib.d.ts.
    if (options.target === ScriptTarget.ES6) {
        return "lib.es6.d.ts";
    }

    return "lib.d.ts";
}
Expand Down
2 changes: 2 additions & 0 deletions src/compiler/diagnosticInformationMap.generated.ts
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,8 @@ module ts {
Catch_clause_variable_name_must_be_an_identifier: { code: 1195, category: DiagnosticCategory.Error, key: "Catch clause variable name must be an identifier." },
Catch_clause_variable_cannot_have_a_type_annotation: { code: 1196, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have a type annotation." },
Catch_clause_variable_cannot_have_an_initializer: { code: 1197, category: DiagnosticCategory.Error, key: "Catch clause variable cannot have an initializer." },
An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive: { code: 1198, category: DiagnosticCategory.Error, key: "An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive." },
Unterminated_Unicode_escape_sequence: { code: 1199, category: DiagnosticCategory.Error, key: "Unterminated Unicode escape sequence." },
Duplicate_identifier_0: { code: 2300, category: DiagnosticCategory.Error, key: "Duplicate identifier '{0}'." },
Initializer_of_instance_member_variable_0_cannot_reference_identifier_1_declared_in_the_constructor: { code: 2301, category: DiagnosticCategory.Error, key: "Initializer of instance member variable '{0}' cannot reference identifier '{1}' declared in the constructor." },
Static_members_cannot_reference_class_type_parameters: { code: 2302, category: DiagnosticCategory.Error, key: "Static members cannot reference class type parameters." },
Expand Down
9 changes: 8 additions & 1 deletion src/compiler/diagnosticMessages.json
Original file line number Diff line number Diff line change
Expand Up @@ -611,7 +611,14 @@
"category": "Error",
"code": 1197
},

"An extended Unicode escape value must be between 0x0 and 0x10FFFF inclusive.": {
"category": "Error",
"code": 1198
},
"Unterminated Unicode escape sequence.": {
"category": "Error",
"code": 1199
},
"Duplicate identifier '{0}'.": {
"category": "Error",
"code": 2300
Expand Down
66 changes: 50 additions & 16 deletions src/compiler/emitter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2223,36 +2223,70 @@ module ts {
}
}

function isBinaryOrOctalIntegerLiteral(text: string): boolean {
if (text.length <= 0) {
return false;
}

if (text.charCodeAt(1) === CharacterCodes.B || text.charCodeAt(1) === CharacterCodes.b ||
text.charCodeAt(1) === CharacterCodes.O || text.charCodeAt(1) === CharacterCodes.o) {
return true;
/**
 * Reports whether the given literal is a numeric literal whose second character
 * is 'b', 'B', 'o' or 'O'.
 *
 * @param node The literal node; only numeric literals can qualify.
 * @param text The text of the literal to inspect.
 */
function isBinaryOrOctalIntegerLiteral(node: LiteralExpression, text: string): boolean {
    // Anything that is not a numeric literal, or is too short to carry a prefix, is rejected.
    if (node.kind !== SyntaxKind.NumericLiteral || text.length <= 1) {
        return false;
    }

    var prefixChar = text.charCodeAt(1);
    return prefixChar === CharacterCodes.b
        || prefixChar === CharacterCodes.B
        || prefixChar === CharacterCodes.o
        || prefixChar === CharacterCodes.O;
}

function emitLiteral(node: LiteralExpression) {
var text = languageVersion < ScriptTarget.ES6 && isTemplateLiteralKind(node.kind) ? getTemplateLiteralAsStringLiteral(node) :
node.parent ? getSourceTextOfNodeFromSourceFile(currentSourceFile, node) :
node.text;
var text = getLiteralText(node);

if (compilerOptions.sourceMap && (node.kind === SyntaxKind.StringLiteral || isTemplateLiteralKind(node.kind))) {
writer.writeLiteral(text);
}
// For version below ES6, emit binary integer literal and octal integer literal in canonical form
else if (languageVersion < ScriptTarget.ES6 && node.kind === SyntaxKind.NumericLiteral && isBinaryOrOctalIntegerLiteral(text)) {
// For versions below ES6, emit binary & octal literals in their canonical decimal form.
else if (languageVersion < ScriptTarget.ES6 && isBinaryOrOctalIntegerLiteral(node, text)) {
write(node.text);
}
else {
write(text);
}
}

function getTemplateLiteralAsStringLiteral(node: LiteralExpression): string {
return '"' + escapeString(node.text) + '"';

/**
 * Produces the text that should be emitted for a literal node.
 */
function getLiteralText(node: LiteralExpression) {
    // Below ES6, every template literal and every literal containing an extended
    // escape (e.g. "\u{0067}") must be downleveled to an escaped, double-quoted string literal.
    var mustDownlevel = languageVersion < ScriptTarget.ES6 &&
        (isTemplateLiteralKind(node.kind) || node.hasExtendedUnicodeEscape);
    if (mustDownlevel) {
        return getQuotedEscapedLiteralText('"', node.text, '"');
    }

    // No downleveling needed: when the parent reference lets us reach the original
    // source text, emit the literal exactly as it was written.
    if (node.parent) {
        return getSourceTextOfNodeFromSourceFile(currentSourceFile, node);
    }

    // The original source text is out of reach. Numbers are emitted in their canonical
    // form; string-like literals are re-escaped and wrapped in the delimiters of their kind.
    var kind = node.kind;
    if (kind === SyntaxKind.StringLiteral) {
        return wrap('"', '"');
    }
    if (kind === SyntaxKind.NoSubstitutionTemplateLiteral) {
        return wrap('`', '`');
    }
    if (kind === SyntaxKind.TemplateHead) {
        return wrap('`', '${');
    }
    if (kind === SyntaxKind.TemplateMiddle) {
        return wrap('}', '${');
    }
    if (kind === SyntaxKind.TemplateTail) {
        return wrap('}', '`');
    }
    if (kind === SyntaxKind.NumericLiteral) {
        return node.text;
    }

    Debug.fail(`Literal kind '${node.kind}' not accounted for.`);

    function wrap(opening: string, closing: string) {
        return getQuotedEscapedLiteralText(opening, node.text, closing);
    }
}

/**
 * Escapes the given text (first with escapeString, then with escapeNonAsciiCharacters)
 * and surrounds the result with the supplied delimiters.
 */
function getQuotedEscapedLiteralText(leftQuote: string, text: string, rightQuote: string) {
    var escaped = escapeString(text);
    var asciiOnly = escapeNonAsciiCharacters(escaped);
    return leftQuote + asciiOnly + rightQuote;
}

function emitDownlevelRawTemplateLiteral(node: LiteralExpression) {
Expand Down
4 changes: 4 additions & 0 deletions src/compiler/parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2163,6 +2163,10 @@ module ts {
var text = scanner.getTokenValue();
node.text = internName ? internIdentifier(text) : text;

if (scanner.hasExtendedUnicodeEscape()) {
node.hasExtendedUnicodeEscape = true;
}

if (scanner.isUnterminated()) {
node.isUnterminated = true;
}
Expand Down
110 changes: 97 additions & 13 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ module ts {
getTokenPos(): number;
getTokenText(): string;
getTokenValue(): string;
hasExtendedUnicodeEscape(): boolean;
hasPrecedingLineBreak(): boolean;
isIdentifier(): boolean;
isReservedWord(): boolean;
Expand Down Expand Up @@ -556,6 +557,7 @@ module ts {
var token: SyntaxKind;
var tokenValue: string;
var precedingLineBreak: boolean;
var hasExtendedUnicodeEscape: boolean;
var tokenIsUnterminated: boolean;

function error(message: DiagnosticMessage, length?: number): void {
Expand Down Expand Up @@ -606,11 +608,27 @@ module ts {
}
return +(text.substring(start, pos));
}

/**
 * Scans exactly `count` hexadecimal digits from the text.
 * Yields the scanned value, or -1 when that many digits are not available.
 */
function scanExactNumberOfHexDigits(count: number): number {
    var scanned = scanHexDigits(/*minCount*/ count, /*scanAsManyAsPossible*/ false);
    return scanned;
}

/**
 * Scans every consecutive hexadecimal digit available in the text, requiring at least `count`.
 * Yields the scanned value, or -1 when fewer than `count` digits were available.
 */
function scanMinimumNumberOfHexDigits(count: number): number {
    var scanned = scanHexDigits(/*minCount*/ count, /*scanAsManyAsPossible*/ true);
    return scanned;
}

function scanHexDigits(count: number, mustMatchCount?: boolean): number {
function scanHexDigits(minCount: number, scanAsManyAsPossible: boolean): number {
var digits = 0;
var value = 0;
while (digits < count || !mustMatchCount) {
while (digits < minCount || scanAsManyAsPossible) {
var ch = text.charCodeAt(pos);
if (ch >= CharacterCodes._0 && ch <= CharacterCodes._9) {
value = value * 16 + ch - CharacterCodes._0;
Expand All @@ -627,7 +645,7 @@ module ts {
pos++;
digits++;
}
if (digits < count) {
if (digits < minCount) {
value = -1;
}
return value;
Expand Down Expand Up @@ -764,16 +782,20 @@ module ts {
return "\'";
case CharacterCodes.doubleQuote:
return "\"";
case CharacterCodes.x:
case CharacterCodes.u:
var ch = scanHexDigits(ch === CharacterCodes.x ? 2 : 4, /*mustMatchCount*/ true);
if (ch >= 0) {
return String.fromCharCode(ch);
}
else {
error(Diagnostics.Hexadecimal_digit_expected);
return ""
// '\u{DDDDDDDD}'
if (pos < len && text.charCodeAt(pos) === CharacterCodes.openBrace) {
hasExtendedUnicodeEscape = true;
pos++;
return scanExtendedUnicodeEscape();
}

// '\uDDDD'
return scanHexadecimalEscape(/*numDigits*/ 4)

case CharacterCodes.x:
// '\xDD'
return scanHexadecimalEscape(/*numDigits*/ 2)

// when encountering a LineContinuation (i.e. a backslash and a line terminator sequence),
// the line terminator is interpreted to be "the empty code unit sequence".
Expand All @@ -790,14 +812,74 @@ module ts {
return String.fromCharCode(ch);
}
}

/**
 * Scans a fixed-width hexadecimal escape body (the digits of '\xDD' or '\uDDDD').
 * Returns the character for the scanned code unit, or reports
 * 'Hexadecimal digit expected' and returns "" when the digits are missing.
 */
function scanHexadecimalEscape(numDigits: number): string {
    var codeUnit = scanExactNumberOfHexDigits(numDigits);

    if (codeUnit >= 0) {
        return String.fromCharCode(codeUnit);
    }

    // A negative result means the required number of digits was not available.
    error(Diagnostics.Hexadecimal_digit_expected);
    return "";
}

/**
 * Scans the remainder of an extended Unicode escape ('\u{DDDDDDDD}'); the caller
 * has already consumed the opening brace.
 *
 * Reports an error when no hexadecimal digit is present, when the value exceeds
 * 0x10FFFF, when the text ends before the closing brace, or when the character
 * after the digits is not '}'. More than one error may be reported for one escape.
 *
 * Returns the UTF-16 encoding of the escaped code point, or "" if any error was reported.
 */
function scanExtendedUnicodeEscape(): string {
    var escapedValue = scanMinimumNumberOfHexDigits(1);
    var isInvalidExtendedEscape = false;

    // Validate the scanned code point value.
    if (escapedValue < 0) {
        error(Diagnostics.Hexadecimal_digit_expected);
        isInvalidExtendedEscape = true;
    }
    else if (escapedValue > 0x10FFFF) {
        error(Diagnostics.An_extended_Unicode_escape_value_must_be_between_0x0_and_0x10FFFF_inclusive);
        isInvalidExtendedEscape = true;
    }

    // Validate the terminator.
    if (pos >= len) {
        error(Diagnostics.Unexpected_end_of_text);
        isInvalidExtendedEscape = true;
    }
    else if (text.charCodeAt(pos) === CharacterCodes.closeBrace) {
        // Only swallow the following character up if it's a '}'.
        pos++;
    }
    else {
        error(Diagnostics.Unterminated_Unicode_escape_sequence);
        isInvalidExtendedEscape = true;
    }

    if (isInvalidExtendedEscape) {
        return "";
    }

    return utf16EncodeAsString(escapedValue);
}

// Encodes a code point as a string of one or two UTF-16 code units.
// Derived from the 10.1.1 UTF16Encoding of the ES6 Spec.
function utf16EncodeAsString(codePoint: number): string {
    Debug.assert(0x0 <= codePoint && codePoint <= 0x10FFFF);

    // Code points up to 0xFFFF fit in a single code unit.
    if (codePoint <= 0xFFFF) {
        return String.fromCharCode(codePoint);
    }

    // Everything above is split into a high and a low surrogate.
    var offset = codePoint - 0x10000;
    var highSurrogate = Math.floor(offset / 0x400) + 0xD800;
    var lowSurrogate = (offset % 0x400) + 0xDC00;

    return String.fromCharCode(highSurrogate, lowSurrogate);
}

// Current character is known to be a backslash. Check for Unicode escape of the form '\uXXXX'
// and return code point value if valid Unicode escape is found. Otherwise return -1.
function peekUnicodeEscape(): number {
if (pos + 5 < len && text.charCodeAt(pos + 1) === CharacterCodes.u) {
var start = pos;
pos += 2;
var value = scanHexDigits(4, /*mustMatchCount*/ true);
var value = scanExactNumberOfHexDigits(4);
pos = start;
return value;
}
Expand Down Expand Up @@ -869,6 +951,7 @@ module ts {

function scan(): SyntaxKind {
startPos = pos;
hasExtendedUnicodeEscape = false;
precedingLineBreak = false;
tokenIsUnterminated = false;
while (true) {
Expand Down Expand Up @@ -1034,7 +1117,7 @@ module ts {
case CharacterCodes._0:
if (pos + 2 < len && (text.charCodeAt(pos + 1) === CharacterCodes.X || text.charCodeAt(pos + 1) === CharacterCodes.x)) {
pos += 2;
var value = scanHexDigits(1, /*mustMatchCount*/ false);
var value = scanMinimumNumberOfHexDigits(1);
if (value < 0) {
error(Diagnostics.Hexadecimal_digit_expected);
value = 0;
Expand Down Expand Up @@ -1336,6 +1419,7 @@ module ts {
getTokenPos: () => tokenPos,
getTokenText: () => text.substring(tokenPos, pos),
getTokenValue: () => tokenValue,
hasExtendedUnicodeEscape: () => hasExtendedUnicodeEscape,
hasPrecedingLineBreak: () => precedingLineBreak,
isIdentifier: () => token === SyntaxKind.Identifier || token > SyntaxKind.LastReservedWord,
isReservedWord: () => token >= SyntaxKind.FirstReservedWord && token <= SyntaxKind.LastReservedWord,
Expand Down
1 change: 1 addition & 0 deletions src/compiler/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,7 @@ module ts {
export interface LiteralExpression extends PrimaryExpression {
text: string;
isUnterminated?: boolean;
hasExtendedUnicodeEscape?: boolean;
}

export interface StringLiteralExpression extends LiteralExpression {
Expand Down
Loading

0 comments on commit 7212912

Please sign in to comment.