From 353d4d6256acf02d1bf5ee2ac294d9e2f2952ba2 Mon Sep 17 00:00:00 2001 From: Yuji Sugiura Date: Thu, 22 Aug 2024 11:16:22 +0900 Subject: [PATCH] Refactor duplicated capturing group name --- .../src/body_parser/parser.rs | 14 ++- .../src/body_parser/state.rs | 89 +++++++++++-------- 2 files changed, 61 insertions(+), 42 deletions(-) diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs index 4164df49ef332f..373519852eabbd 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/body_parser/parser.rs @@ -44,7 +44,8 @@ impl<'a> PatternParser<'a> { // - Pros: Code is simple enough and easy to understand // - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups // We may re-consider this if we need more performance rather than simplicity. - self.state.initialize_with_parsing(self.source_text); + let duplicated_named_capturing_groups = + self.state.initialize_with_parsing(self.source_text); // [SS:EE] Pattern :: Disjunction // It is a Syntax Error if CountLeftCapturingParensWithin(Pattern) ≥ 2**32 - 1. @@ -59,8 +60,13 @@ impl<'a> PatternParser<'a> { } // [SS:EE] Pattern :: Disjunction // It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same. - if self.state.num_of_named_capturing_groups as usize != self.state.found_group_names.len() { - return Err(OxcDiagnostic::error("Invalid regular expression: Duplicated group name")); + if !duplicated_named_capturing_groups.is_empty() { + return Err(OxcDiagnostic::error("Invalid regular expression: Duplicated group name") + .with_labels( + duplicated_named_capturing_groups + .iter() + .map(|(start, end)| self.span_factory.create(*start, *end)), + )); } // Let's start parsing! @@ -503,7 +509,7 @@ impl<'a> PatternParser<'a> { if let Some(name) = self.consume_group_name()? { // [SS:EE] AtomEscape :: k GroupName // It is a Syntax Error if GroupSpecifiersThatMatch(GroupName) is empty. - if !self.state.found_group_names.contains(name.as_str()) { + if !self.state.capturing_group_names.contains(name.as_str()) { return Err(OxcDiagnostic::error( "Invalid regular expression: Group specifier is empty", ) diff --git a/crates/oxc_regular_expression/src/body_parser/state.rs b/crates/oxc_regular_expression/src/body_parser/state.rs index cc6f21f134b35a..109c5acc3adf5c 100644 --- a/crates/oxc_regular_expression/src/body_parser/state.rs +++ b/crates/oxc_regular_expression/src/body_parser/state.rs @@ -12,8 +12,7 @@ pub struct State<'a> { pub named_capture_groups: bool, // Other states pub num_of_capturing_groups: u32, - pub num_of_named_capturing_groups: u32, - pub found_group_names: FxHashSet<&'a str>, + pub capturing_group_names: FxHashSet<&'a str>, } impl<'a> State<'a> { @@ -23,33 +22,36 @@ impl<'a> State<'a> { unicode_sets_mode, named_capture_groups: false, num_of_capturing_groups: 0, - num_of_named_capturing_groups: 0, - found_group_names: FxHashSet::default(), + capturing_group_names: FxHashSet::default(), } } - pub fn initialize_with_parsing(&mut self, source_text: &'a str) { - let (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) = - parse_capturing_groups(source_text); + pub fn initialize_with_parsing(&mut self, source_text: &'a str) -> Vec<(usize, usize)> { + let ( + num_of_left_capturing_parens, + capturing_group_names, + duplicated_named_capturing_groups, + ) = parse_capturing_groups(source_text); // In Annex B, this is `false` by default. // It is `true` // - if `u` or `v` flag is set // - or if `GroupName` is found in pattern self.named_capture_groups = - self.unicode_mode || self.unicode_sets_mode || 0 < num_of_named_capturing_groups; + self.unicode_mode || self.unicode_sets_mode || !capturing_group_names.is_empty(); - self.num_of_capturing_groups = num_of_left_parens; - self.num_of_named_capturing_groups = num_of_named_capturing_groups; - self.found_group_names = named_capturing_groups; + self.num_of_capturing_groups = num_of_left_capturing_parens; + self.capturing_group_names = capturing_group_names; + + duplicated_named_capturing_groups } } -/// Returns: (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) -fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) { - let mut num_of_left_parens = 0; - let mut num_of_named_capturing_groups = 0; - let mut named_capturing_groups = FxHashSet::default(); +/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups) +fn parse_capturing_groups(source_text: &str) -> (u32, FxHashSet<&str>, Vec<(usize, usize)>) { + let mut num_of_left_capturing_parens = 0; + let mut capturing_group_names = FxHashSet::default(); + let mut duplicated_named_capturing_groups = vec![]; let mut reader = Reader::new(source_text, true); @@ -84,9 +86,10 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) { continue; } - num_of_left_parens += 1; + // Count named or unnamed capturing groups + num_of_left_capturing_parens += 1; - // Named capturing group + // Collect capturing group names if reader.eat2('?', '<') { let span_start = reader.offset(); while let Some(ch) = reader.peek() { @@ -99,9 +102,11 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) { if reader.eat('>') { let group_name = &source_text[span_start..span_end]; - // May be duplicated, but it's OK - named_capturing_groups.insert(group_name); - num_of_named_capturing_groups += 1; + // May be duplicated + if !capturing_group_names.insert(group_name) { + // Report them with `Span` + duplicated_named_capturing_groups.push((span_start, span_end)); + } continue; } } @@ -110,7 +115,7 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) { reader.advance(); } - (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) + (num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups) } #[cfg(test)] @@ -119,23 +124,31 @@ mod tests { #[test] fn test_count_capturing_groups() { - for (source_text, expected_num_of_left_parens, expected_num_of_named_capturing_groups) in [ - ("()", 1, 0), - (r"\1()", 1, 0), - ("(foo)", 1, 0), - ("(foo)(bar)", 2, 0), - ("(foo(bar))", 2, 0), - ("(foo)[(bar)]", 1, 0), - (r"(foo)\(bar\)", 1, 0), - ("(foo)(?bar)", 2, 1), - ("(foo)(?=...)(?!...)(?<=...)(?bar)(?baz)", 3, 2), - ("(?.)(?..)", 2, 2), + for (source_text, expected) in [ + ("()", (1, 0, false)), + (r"\1()", (1, 0, false)), + ("(foo)", (1, 0, false)), + ("(foo)(bar)", (2, 0, false)), + ("(foo(bar))", (2, 0, false)), + ("(foo)[(bar)]", (1, 0, false)), + (r"(foo)\(bar\)", (1, 0, false)), + ("(foo)(?bar)", (2, 1, false)), + ("(foo)(?=...)(?!...)(?<=...)(?bar)(?baz)", (3, 2, false)), + ("(?.)(?..)", (2, 1, true)), + ("(?.(?..))", (2, 1, true)), ] { - let (num_of_left_parens, num_of_named_capturing_groups, _) = - parse_capturing_groups(source_text); - assert_eq!(expected_num_of_left_parens, num_of_left_parens); - assert_eq!(expected_num_of_named_capturing_groups, num_of_named_capturing_groups); + let ( + num_of_left_capturing_parens, + capturing_group_names, + duplicated_named_capturing_groups, + ) = parse_capturing_groups(source_text); + let actual = ( + num_of_left_capturing_parens, + capturing_group_names.len(), + !duplicated_named_capturing_groups.is_empty(), + ); + assert_eq!(expected, actual, "{source_text}"); } } }