From 353d4d6256acf02d1bf5ee2ac294d9e2f2952ba2 Mon Sep 17 00:00:00 2001
From: Yuji Sugiura <y.sugiura.0316@gmail.com>
Date: Thu, 22 Aug 2024 11:16:22 +0900
Subject: [PATCH] Refactor duplicated capturing group name

---
 .../src/body_parser/parser.rs                 | 14 ++-
 .../src/body_parser/state.rs                  | 89 +++++++++++--------
 2 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs
index 4164df49ef332f..373519852eabbd 100644
--- a/crates/oxc_regular_expression/src/body_parser/parser.rs
+++ b/crates/oxc_regular_expression/src/body_parser/parser.rs
@@ -44,7 +44,8 @@ impl<'a> PatternParser<'a> {
         // - Pros: Code is simple enough and easy to understand
         // - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups
         // We may re-consider this if we need more performance rather than simplicity.
-        self.state.initialize_with_parsing(self.source_text);
+        let duplicated_named_capturing_groups =
+            self.state.initialize_with_parsing(self.source_text);
 
         // [SS:EE] Pattern :: Disjunction
         // It is a Syntax Error if CountLeftCapturingParensWithin(Pattern) ≥ 2**32 - 1.
@@ -59,8 +60,13 @@ impl<'a> PatternParser<'a> {
         }
         // [SS:EE] Pattern :: Disjunction
         // It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
-        if self.state.num_of_named_capturing_groups as usize != self.state.found_group_names.len() {
-            return Err(OxcDiagnostic::error("Invalid regular expression: Duplicated group name"));
+        if !duplicated_named_capturing_groups.is_empty() {
+            return Err(OxcDiagnostic::error("Invalid regular expression: Duplicated group name")
+                .with_labels(
+                    duplicated_named_capturing_groups
+                        .iter()
+                        .map(|(start, end)| self.span_factory.create(*start, *end)),
+                ));
         }
 
         // Let's start parsing!
@@ -503,7 +509,7 @@ impl<'a> PatternParser<'a> {
             if let Some(name) = self.consume_group_name()? {
                 // [SS:EE] AtomEscape :: k GroupName
                 // It is a Syntax Error if GroupSpecifiersThatMatch(GroupName) is empty.
-                if !self.state.found_group_names.contains(name.as_str()) {
+                if !self.state.capturing_group_names.contains(name.as_str()) {
                     return Err(OxcDiagnostic::error(
                         "Invalid regular expression: Group specifier is empty",
                     )
diff --git a/crates/oxc_regular_expression/src/body_parser/state.rs b/crates/oxc_regular_expression/src/body_parser/state.rs
index cc6f21f134b35a..109c5acc3adf5c 100644
--- a/crates/oxc_regular_expression/src/body_parser/state.rs
+++ b/crates/oxc_regular_expression/src/body_parser/state.rs
@@ -12,8 +12,7 @@ pub struct State<'a> {
     pub named_capture_groups: bool,
     // Other states
     pub num_of_capturing_groups: u32,
-    pub num_of_named_capturing_groups: u32,
-    pub found_group_names: FxHashSet<&'a str>,
+    pub capturing_group_names: FxHashSet<&'a str>,
 }
 
 impl<'a> State<'a> {
@@ -23,33 +22,36 @@ impl<'a> State<'a> {
             unicode_sets_mode,
             named_capture_groups: false,
             num_of_capturing_groups: 0,
-            num_of_named_capturing_groups: 0,
-            found_group_names: FxHashSet::default(),
+            capturing_group_names: FxHashSet::default(),
         }
     }
 
-    pub fn initialize_with_parsing(&mut self, source_text: &'a str) {
-        let (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) =
-            parse_capturing_groups(source_text);
+    pub fn initialize_with_parsing(&mut self, source_text: &'a str) -> Vec<(usize, usize)> {
+        let (
+            num_of_left_capturing_parens,
+            capturing_group_names,
+            duplicated_named_capturing_groups,
+        ) = parse_capturing_groups(source_text);
 
         // In Annex B, this is `false` by default.
         // It is `true`
         // - if `u` or `v` flag is set
         // - or if `GroupName` is found in pattern
         self.named_capture_groups =
-            self.unicode_mode || self.unicode_sets_mode || 0 < num_of_named_capturing_groups;
+            self.unicode_mode || self.unicode_sets_mode || !capturing_group_names.is_empty();
 
-        self.num_of_capturing_groups = num_of_left_parens;
-        self.num_of_named_capturing_groups = num_of_named_capturing_groups;
-        self.found_group_names = named_capturing_groups;
+        self.num_of_capturing_groups = num_of_left_capturing_parens;
+        self.capturing_group_names = capturing_group_names;
+
+        duplicated_named_capturing_groups
     }
 }
 
-/// Returns: (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups)
-fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {
-    let mut num_of_left_parens = 0;
-    let mut num_of_named_capturing_groups = 0;
-    let mut named_capturing_groups = FxHashSet::default();
+/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups)
+fn parse_capturing_groups(source_text: &str) -> (u32, FxHashSet<&str>, Vec<(usize, usize)>) {
+    let mut num_of_left_capturing_parens = 0;
+    let mut capturing_group_names = FxHashSet::default();
+    let mut duplicated_named_capturing_groups = vec![];
 
     let mut reader = Reader::new(source_text, true);
 
@@ -84,9 +86,10 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {
                 continue;
             }
 
-            num_of_left_parens += 1;
+            // Count named or unnamed capturing groups
+            num_of_left_capturing_parens += 1;
 
-            // Named capturing group
+            // Collect capturing group names
             if reader.eat2('?', '<') {
                 let span_start = reader.offset();
                 while let Some(ch) = reader.peek() {
@@ -99,9 +102,11 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {
 
                 if reader.eat('>') {
                     let group_name = &source_text[span_start..span_end];
-                    // May be duplicated, but it's OK
-                    named_capturing_groups.insert(group_name);
-                    num_of_named_capturing_groups += 1;
+                    // May be duplicated
+                    if !capturing_group_names.insert(group_name) {
+                        // Report them with `Span`
+                        duplicated_named_capturing_groups.push((span_start, span_end));
+                    }
                     continue;
                 }
             }
@@ -110,7 +115,7 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {
         reader.advance();
     }
 
-    (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups)
+    (num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups)
 }
 
 #[cfg(test)]
@@ -119,23 +124,31 @@ mod tests {
 
     #[test]
     fn test_count_capturing_groups() {
-        for (source_text, expected_num_of_left_parens, expected_num_of_named_capturing_groups) in [
-            ("()", 1, 0),
-            (r"\1()", 1, 0),
-            ("(foo)", 1, 0),
-            ("(foo)(bar)", 2, 0),
-            ("(foo(bar))", 2, 0),
-            ("(foo)[(bar)]", 1, 0),
-            (r"(foo)\(bar\)", 1, 0),
-            ("(foo)(?<n>bar)", 2, 1),
-            ("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", 1, 0),
-            ("(foo)(?<n>bar)(?<nn>baz)", 3, 2),
-            ("(?<n>.)(?<n>..)", 2, 2),
+        for (source_text, expected) in [
+            ("()", (1, 0, false)),
+            (r"\1()", (1, 0, false)),
+            ("(foo)", (1, 0, false)),
+            ("(foo)(bar)", (2, 0, false)),
+            ("(foo(bar))", (2, 0, false)),
+            ("(foo)[(bar)]", (1, 0, false)),
+            (r"(foo)\(bar\)", (1, 0, false)),
+            ("(foo)(?<n>bar)", (2, 1, false)),
+            ("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0, false)),
+            ("(foo)(?<n>bar)(?<nn>baz)", (3, 2, false)),
+            ("(?<n>.)(?<n>..)", (2, 1, true)),
+            ("(?<n>.(?<n>..))", (2, 1, true)),
         ] {
-            let (num_of_left_parens, num_of_named_capturing_groups, _) =
-                parse_capturing_groups(source_text);
-            assert_eq!(expected_num_of_left_parens, num_of_left_parens);
-            assert_eq!(expected_num_of_named_capturing_groups, num_of_named_capturing_groups);
+            let (
+                num_of_left_capturing_parens,
+                capturing_group_names,
+                duplicated_named_capturing_groups,
+            ) = parse_capturing_groups(source_text);
+            let actual = (
+                num_of_left_capturing_parens,
+                capturing_group_names.len(),
+                !duplicated_named_capturing_groups.is_empty(),
+            );
+            assert_eq!(expected, actual, "{source_text}");
         }
     }
 }