From f918404590b6800c5a5ef26fb358abbc28622deb Mon Sep 17 00:00:00 2001 From: littledan Date: Thu, 4 May 2017 05:33:38 -0700 Subject: [PATCH] Revert of [regexp] Support unicode capture names in non-unicode patterns (patchset #3 id:40001 of https://codereview.chromium.org/2791163003/ ) Reason for revert: The decision for the specification was to not have this syntax, and instead the syntax before this patch. Original issue's description: > [regexp] Support unicode capture names in non-unicode patterns > > This ensures that capture names containing surrogate pairs are parsed > correctly even in non-unicode RegExp patterns by introducing a new > scanning mode which unconditionally combines surrogate pairs. > > BUG=v8:5437,v8:6192 > > Review-Url: https://codereview.chromium.org/2791163003 > Cr-Commit-Position: refs/heads/master@{#44466} > Committed: https://chromium.googlesource.com/v8/v8/+/a8651c5671dd6e41595ffb438e7ea013082f2d38 R=yangguo@chromium.org,jgruber@chromium.org # Not skipping CQ checks because original CL landed more than 1 days ago. BUG=v8:5437,v8:6192 Review-Url: https://codereview.chromium.org/2859933003 Cr-Commit-Position: refs/heads/master@{#45088} --- src/regexp/regexp-parser.cc | 30 ++++++++----------- src/regexp/regexp-parser.h | 14 +++------ test/mjsunit/harmony/regexp-named-captures.js | 4 +-- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc index 319b1881695f..20f023930fcd 100644 --- a/src/regexp/regexp-parser.cc +++ b/src/regexp/regexp-parser.cc @@ -46,13 +46,13 @@ RegExpParser::RegExpParser(FlatStringReader* in, Handle* error, Advance(); } -inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) { +template +inline uc32 RegExpParser::ReadNext() { int position = next_pos_; uc32 c0 = in()->Get(position); position++; - const bool try_combine_surrogate_pairs = - (unicode() || mode == ScanMode::FORCE_COMBINE_SURROGATE_PAIRS); - if (try_combine_surrogate_pairs && position < in()->length() && + // Read the whole surrogate pair in case of unicode flag, if possible. + if (unicode() && position < in()->length() && unibrow::Utf16::IsLeadSurrogate(static_cast(c0))) { uc16 c1 = in()->Get(position); if (unibrow::Utf16::IsTrailSurrogate(c1)) { @@ -67,13 +67,13 @@ inline uc32 RegExpParser::ReadNext(bool update_position, ScanMode mode) { uc32 RegExpParser::Next() { if (has_next()) { - return ReadNext(false, ScanMode::DEFAULT); + return ReadNext(); } else { return kEndMarker; } } -void RegExpParser::Advance(ScanMode mode) { +void RegExpParser::Advance() { if (has_next()) { StackLimitCheck check(isolate()); if (check.HasOverflowed()) { @@ -83,7 +83,7 @@ void RegExpParser::Advance(ScanMode mode) { } else if (zone()->excess_allocation()) { ReportError(CStrVector("Regular expression too large")); } else { - current_ = ReadNext(true, mode); + current_ = ReadNext(); } } else { current_ = kEndMarker; @@ -101,9 +101,9 @@ void RegExpParser::Reset(int pos) { Advance(); } -void RegExpParser::Advance(int dist, ScanMode mode) { +void RegExpParser::Advance(int dist) { next_pos_ += dist - 1; - Advance(mode); + Advance(); } @@ -326,6 +326,7 @@ RegExpTree* RegExpParser::ParseDisjunction() { if (FLAG_harmony_regexp_named_captures) { has_named_captures_ = true; is_named_capture = true; + Advance(); break; } // Fall through. @@ -761,24 +762,18 @@ static void push_code_unit(ZoneVector* v, uint32_t code_unit) { const ZoneVector* RegExpParser::ParseCaptureGroupName() { DCHECK(FLAG_harmony_regexp_named_captures); - DCHECK_EQ(current(), '<'); ZoneVector* name = new (zone()->New(sizeof(ZoneVector))) ZoneVector(zone()); - // Capture names can always contain surrogate pairs, and we need to scan - // accordingly. - const ScanMode scan_mode = ScanMode::FORCE_COMBINE_SURROGATE_PAIRS; - Advance(scan_mode); - bool at_start = true; while (true) { uc32 c = current(); - Advance(scan_mode); + Advance(); // Convert unicode escapes. if (c == '\\' && current() == 'u') { - Advance(scan_mode); + Advance(); if (!ParseUnicodeEscape(&c)) { ReportError(CStrVector("Invalid Unicode escape sequence")); return nullptr; @@ -849,6 +844,7 @@ bool RegExpParser::ParseNamedBackReference(RegExpBuilder* builder, return false; } + Advance(); const ZoneVector* name = ParseCaptureGroupName(); if (name == nullptr) { return false; diff --git a/src/regexp/regexp-parser.h b/src/regexp/regexp-parser.h index b34932fa00a6..a3ef22d8b7ac 100644 --- a/src/regexp/regexp-parser.h +++ b/src/regexp/regexp-parser.h @@ -184,18 +184,11 @@ class RegExpParser BASE_EMBEDDED { // can be reparsed. bool ParseBackReferenceIndex(int* index_out); - // The default behavior is to combine surrogate pairs in unicode mode and - // don't combine them otherwise (a quantifier after a surrogate pair would - // then apply only to the trailing surrogate). Forcing combination is required - // when parsing capture names since they can always legally contain surrogate - // pairs. - enum class ScanMode { DEFAULT, FORCE_COMBINE_SURROGATE_PAIRS }; - bool ParseClassProperty(ZoneList* result); CharacterRange ParseClassAtom(uc16* char_class); RegExpTree* ReportError(Vector message); - void Advance(ScanMode mode = ScanMode::DEFAULT); - void Advance(int dist, ScanMode mode = ScanMode::DEFAULT); + void Advance(); + void Advance(int dist); void Reset(int pos); // Reports whether the pattern might be used as a literal search string. @@ -311,7 +304,8 @@ class RegExpParser BASE_EMBEDDED { bool has_more() { return has_more_; } bool has_next() { return next_pos_ < in()->length(); } uc32 Next(); - uc32 ReadNext(bool update_position, ScanMode mode); + template + uc32 ReadNext(); FlatStringReader* in() { return in_; } void ScanForCaptures(); diff --git a/test/mjsunit/harmony/regexp-named-captures.js b/test/mjsunit/harmony/regexp-named-captures.js index 4658af24be36..be90427cfac0 100644 --- a/test/mjsunit/harmony/regexp-named-captures.js +++ b/test/mjsunit/harmony/regexp-named-captures.js @@ -147,7 +147,7 @@ assertThrows('/(?<𐒤>a)/u', SyntaxError); // ID_Continue but not ID_Start. assertEquals("a", /(?<π>a)/.exec("bab").groups.π); assertEquals("a", /(?<$>a)/.exec("bab").groups.$); assertEquals("a", /(?<_>a)/.exec("bab").groups._); -assertEquals("a", /(?<$𐒤>a)/.exec("bab").groups.$𐒤); +assertThrows("/(?<$𐒤>a)/", SyntaxError); assertEquals("a", /(?<ಠ_ಠ>a)/.exec("bab").groups.ಠ_ಠ); assertThrows('/(?<❤>a)/', SyntaxError); assertThrows('/(?<𐒤>a)/', SyntaxError); // ID_Continue but not ID_Start. @@ -219,7 +219,7 @@ assertThrows("/(?.)/", SyntaxError); // Trail assertThrows("/(?<\\u{0041}>.)/", SyntaxError); // Non-surrogate assertThrows("/(?.)/", SyntaxError); // Surrogate, ID_Continue assertTrue(RegExp("(?<\u{0041}>.)").test("a")); // Non-surrogate -assertTrue(RegExp("(?.)").test("a")); // Surrogate, ID_Continue +assertThrows("(?.)", SyntaxError); // Surrogate, ID_Continue assertTrue(RegExp("(?<\\u0041>.)").test("a")); // Non-surrogate // @@replace with a callable replacement argument (no named captures).