Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Correct Regular Expressions Behavior Related to Annex B #58320

Merged
merged 12 commits into from
May 29, 2024
78 changes: 50 additions & 28 deletions src/compiler/scanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1556,9 +1556,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
tokenFlags |= TokenFlags.ContainsInvalidEscape;
if (isRegularExpression || shouldEmitInvalidEscapeError) {
const code = parseInt(text.substring(start + 1, pos), 8);
if (isRegularExpression !== "annex-b") {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this was the only place we check for "annex-b", we can make isRegularExpression a boolean again.

Copy link
Contributor Author

@graphemecluster graphemecluster May 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought the same too, but there is a isRegularExpression === true at the bottom

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This probably is worth being refactored, since shouldEmitInvalidEscapeError does not explain itself but is taken over to mean anyUnicodeMode for regular expressions. But this will be yet another PR I guess.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you use an enum for such a case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I found that I once created a enum named ClassSetExpressionType.

error(Diagnostics.Octal_escape_sequences_are_not_allowed_Use_the_syntax_0, start, pos - start, "\\x" + code.toString(16).padStart(2, "0"));
}
error(Diagnostics.Octal_escape_sequences_are_not_allowed_Use_the_syntax_0, start, pos - start, "\\x" + code.toString(16).padStart(2, "0"));
return String.fromCharCode(code);
}
return text.substring(start, pos);
Expand Down Expand Up @@ -2426,6 +2424,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
// Quickly get to the end of regex such that we know the flags
let p = tokenStart + 1;
let inEscape = false;
let namedCaptureGroups = false;
// Although nested character classes are allowed in Unicode Sets mode,
// an unescaped slash is nevertheless invalid even in a character class in Unicode mode.
// Additionally, parsing nested character classes will misinterpret regexes like `/[[]/`
Expand Down Expand Up @@ -2469,6 +2468,15 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
else if (ch === CharacterCodes.closeBracket) {
inCharacterClass = false;
}
else if (
ch === CharacterCodes.openParen
&& charCodeChecked(p + 1) === CharacterCodes.question
&& charCodeChecked(p + 2) === CharacterCodes.lessThan
&& charCodeChecked(p + 3) !== CharacterCodes.equals
&& charCodeChecked(p + 3) !== CharacterCodes.exclamation
) {
namedCaptureGroups = true;
}
p++;
}
const isUnterminated = !!(tokenFlags & TokenFlags.Unterminated);
Expand Down Expand Up @@ -2505,7 +2513,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
const saveEnd = end;
pos = tokenStart + 1;
end = endOfBody;
scanRegularExpressionWorker(regExpFlags, isUnterminated, /*annexB*/ true);
scanRegularExpressionWorker(regExpFlags, isUnterminated, /*annexB*/ true, namedCaptureGroups);
tokenStart = saveTokenStart;
tokenFlags = saveTokenFlags;
pos = savePos;
Expand All @@ -2517,7 +2525,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
return token;
}

function scanRegularExpressionWorker(regExpFlags: RegularExpressionFlags, isUnterminated: boolean, annexB: boolean) {
function scanRegularExpressionWorker(regExpFlags: RegularExpressionFlags, isUnterminated: boolean, annexB: boolean, namedCaptureGroups: boolean) {
// Why var? It avoids TDZ checks in the runtime which can be costly.
// See: https://github.com/microsoft/TypeScript/issues/52924
/* eslint-disable no-var */
Expand All @@ -2527,10 +2535,9 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
/** Grammar parameter */
var unicodeMode = !!(regExpFlags & RegularExpressionFlags.UnicodeMode);

if (unicodeMode) {
// Annex B treats any unicode mode as the strict syntax.
annexB = false;
}
// Regular expressions are checked more strictly when either in 'u' or 'v' mode, or
// when not using the looser interpretation of the syntax from ECMA-262 Annex B.
var anyUnicodeModeOrNonAnnexB = unicodeMode || !annexB;

/** @see {scanClassSetExpression} */
var mayContainStrings = false;
Expand Down Expand Up @@ -2626,7 +2633,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
case CharacterCodes.exclamation:
pos++;
// In Annex B, `(?=Disjunction)` and `(?!Disjunction)` are quantifiable
isPreviousTermQuantifiable = annexB;
isPreviousTermQuantifiable = !anyUnicodeModeOrNonAnnexB;
break;
case CharacterCodes.lessThan:
const groupNameStart = pos;
Expand Down Expand Up @@ -2675,6 +2682,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
const digitsStart = pos;
scanDigits();
const min = tokenValue;
if (!anyUnicodeModeOrNonAnnexB && !min) {
isPreviousTermQuantifiable = true;
break;
}
if (charCodeChecked(pos) === CharacterCodes.comma) {
pos++;
scanDigits();
Expand All @@ -2684,26 +2695,32 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
error(Diagnostics.Incomplete_quantifier_Digit_expected, digitsStart, 0);
}
else {
if (unicodeMode) {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch));
}
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch));
isPreviousTermQuantifiable = true;
break;
}
}
if (max && Number.parseInt(min) > Number.parseInt(max)) {
else if (max && Number.parseInt(min) > Number.parseInt(max) && (anyUnicodeModeOrNonAnnexB || charCodeChecked(pos) === CharacterCodes.closeBrace)) {
error(Diagnostics.Numbers_out_of_order_in_quantifier, digitsStart, pos - digitsStart);
}
}
else if (!min) {
if (unicodeMode) {
if (anyUnicodeModeOrNonAnnexB) {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, start, 1, String.fromCharCode(ch));
}
isPreviousTermQuantifiable = true;
break;
}
scanExpectedChar(CharacterCodes.closeBrace);
pos--;
if (charCodeChecked(pos) !== CharacterCodes.closeBrace) {
if (anyUnicodeModeOrNonAnnexB) {
error(Diagnostics._0_expected, pos, 0, String.fromCharCode(CharacterCodes.closeBrace));
pos--;
}
else {
isPreviousTermQuantifiable = true;
break;
}
}
// falls through
case CharacterCodes.asterisk:
case CharacterCodes.plus:
Expand Down Expand Up @@ -2744,7 +2761,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
// Assume what starting from the character to be outside of the regex
return;
}
if (unicodeMode || ch === CharacterCodes.closeParen) {
if (anyUnicodeModeOrNonAnnexB || ch === CharacterCodes.closeParen) {
error(Diagnostics.Unexpected_0_Did_you_mean_to_escape_it_with_backslash, pos, 1, String.fromCharCode(ch));
}
pos++;
Expand Down Expand Up @@ -2801,7 +2818,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
scanGroupName(/*isReference*/ true);
scanExpectedChar(CharacterCodes.greaterThan);
}
else if (unicodeMode) {
else if (namedCaptureGroups) {
error(Diagnostics.k_must_be_followed_by_a_capturing_group_name_enclosed_in_angle_brackets, pos - 2, 2);
}
break;
Expand Down Expand Up @@ -2851,10 +2868,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
pos++;
return String.fromCharCode(ch & 0x1f);
}
if (unicodeMode) {
if (anyUnicodeModeOrNonAnnexB) {
error(Diagnostics.c_must_be_followed_by_an_ASCII_letter, pos - 2, 2);
}
else if (atomEscape && annexB) {
else if (atomEscape) {
// Annex B treats
//
// ExtendedAtom : `\` [lookahead = `c`]
Expand Down Expand Up @@ -2887,7 +2904,7 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
return "\\";
}
pos--;
return scanEscapeSequence(/*shouldEmitInvalidEscapeError*/ unicodeMode, /*isRegularExpression*/ annexB ? "annex-b" : true);
return scanEscapeSequence(/*shouldEmitInvalidEscapeError*/ unicodeMode, /*isRegularExpression*/ anyUnicodeModeOrNonAnnexB || "annex-b");
rbuckton marked this conversation as resolved.
Show resolved Hide resolved
}
}

Expand Down Expand Up @@ -2936,12 +2953,12 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
if (isClassContentExit(ch)) {
return;
}
if (!minCharacter && !annexB) {
if (!minCharacter && anyUnicodeModeOrNonAnnexB) {
error(Diagnostics.A_character_class_range_must_not_be_bounded_by_another_character_class, minStart, pos - 1 - minStart);
}
const maxStart = pos;
const maxCharacter = scanClassAtom();
if (!maxCharacter && !annexB) {
if (!maxCharacter && anyUnicodeModeOrNonAnnexB) {
error(Diagnostics.A_character_class_range_must_not_be_bounded_by_another_character_class, maxStart, pos - maxStart);
continue;
}
Expand Down Expand Up @@ -3437,9 +3454,13 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
error(Diagnostics.Unicode_property_value_expressions_are_only_available_when_the_Unicode_u_flag_or_the_Unicode_Sets_v_flag_is_set, start, pos - start);
}
}
else if (unicodeMode) {
else if (anyUnicodeModeOrNonAnnexB) {
error(Diagnostics._0_must_be_followed_by_a_Unicode_property_value_expression_enclosed_in_braces, pos - 2, 2, String.fromCharCode(ch));
}
else {
pos--;
return false;
}
return true;
}
return false;
Expand Down Expand Up @@ -3481,9 +3502,10 @@ export function createScanner(languageVersion: ScriptTarget, skipTrivia: boolean
}
});
forEach(decimalEscapes, escape => {
// in AnnexB, if a DecimalEscape is greater than the number of capturing groups then it is treated as
// either a LegacyOctalEscapeSequence or IdentityEscape
if (!annexB && escape.value > numberOfCapturingGroups) {
// Although a DecimalEscape with a value greater than the number of capturing groups
// is treated as either a LegacyOctalEscapeSequence or an IdentityEscape in Annex B,
// an error is nevertheless reported since it's most likely a mistake.
if (escape.value > numberOfCapturingGroups) {
if (numberOfCapturingGroups) {
error(Diagnostics.This_backreference_refers_to_a_group_that_does_not_exist_There_are_only_0_capturing_groups_in_this_regular_expression, escape.pos, escape.end - escape.pos, numberOfCapturingGroups);
}
Expand Down
Loading