Skip to content

Commit

Permalink
Refactor duplicated capturing group name
Browse files Browse the repository at this point in the history
  • Loading branch information
leaysgur committed Aug 22, 2024
1 parent 195ff4d commit 353d4d6
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 42 deletions.
14 changes: 10 additions & 4 deletions crates/oxc_regular_expression/src/body_parser/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ impl<'a> PatternParser<'a> {
// - Pros: Code is simple enough and easy to understand
// - Cons: 1st pass is completely useless if the pattern does not contain any capturing groups
// We may re-consider this if we need more performance rather than simplicity.
self.state.initialize_with_parsing(self.source_text);
let duplicated_named_capturing_groups =
self.state.initialize_with_parsing(self.source_text);

// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if CountLeftCapturingParensWithin(Pattern) ≥ 2**32 - 1.
Expand All @@ -59,8 +60,13 @@ impl<'a> PatternParser<'a> {
}
// [SS:EE] Pattern :: Disjunction
// It is a Syntax Error if Pattern contains two or more GroupSpecifiers for which the CapturingGroupName of GroupSpecifier is the same.
if self.state.num_of_named_capturing_groups as usize != self.state.found_group_names.len() {
return Err(OxcDiagnostic::error("Invalid regular expression: Duplicated group name"));
if !duplicated_named_capturing_groups.is_empty() {
return Err(OxcDiagnostic::error("Invalid regular expression: Duplicated group name")
.with_labels(
duplicated_named_capturing_groups
.iter()
.map(|(start, end)| self.span_factory.create(*start, *end)),
));
}

// Let's start parsing!
Expand Down Expand Up @@ -503,7 +509,7 @@ impl<'a> PatternParser<'a> {
if let Some(name) = self.consume_group_name()? {
// [SS:EE] AtomEscape :: k GroupName
// It is a Syntax Error if GroupSpecifiersThatMatch(GroupName) is empty.
if !self.state.found_group_names.contains(name.as_str()) {
if !self.state.capturing_group_names.contains(name.as_str()) {
return Err(OxcDiagnostic::error(
"Invalid regular expression: Group specifier is empty",
)
Expand Down
89 changes: 51 additions & 38 deletions crates/oxc_regular_expression/src/body_parser/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ pub struct State<'a> {
pub named_capture_groups: bool,
// Other states
pub num_of_capturing_groups: u32,
pub num_of_named_capturing_groups: u32,
pub found_group_names: FxHashSet<&'a str>,
pub capturing_group_names: FxHashSet<&'a str>,
}

impl<'a> State<'a> {
Expand All @@ -23,33 +22,36 @@ impl<'a> State<'a> {
unicode_sets_mode,
named_capture_groups: false,
num_of_capturing_groups: 0,
num_of_named_capturing_groups: 0,
found_group_names: FxHashSet::default(),
capturing_group_names: FxHashSet::default(),
}
}

pub fn initialize_with_parsing(&mut self, source_text: &'a str) {
let (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups) =
parse_capturing_groups(source_text);
pub fn initialize_with_parsing(&mut self, source_text: &'a str) -> Vec<(usize, usize)> {
let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(source_text);

// In Annex B, this is `false` by default.
// It is `true`
// - if `u` or `v` flag is set
// - or if `GroupName` is found in pattern
self.named_capture_groups =
self.unicode_mode || self.unicode_sets_mode || 0 < num_of_named_capturing_groups;
self.unicode_mode || self.unicode_sets_mode || !capturing_group_names.is_empty();

self.num_of_capturing_groups = num_of_left_parens;
self.num_of_named_capturing_groups = num_of_named_capturing_groups;
self.found_group_names = named_capturing_groups;
self.num_of_capturing_groups = num_of_left_capturing_parens;
self.capturing_group_names = capturing_group_names;

duplicated_named_capturing_groups
}
}

/// Returns: (num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups)
fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {
let mut num_of_left_parens = 0;
let mut num_of_named_capturing_groups = 0;
let mut named_capturing_groups = FxHashSet::default();
/// Returns: (num_of_left_parens, capturing_group_names, duplicated_named_capturing_groups)
fn parse_capturing_groups(source_text: &str) -> (u32, FxHashSet<&str>, Vec<(usize, usize)>) {
let mut num_of_left_capturing_parens = 0;
let mut capturing_group_names = FxHashSet::default();
let mut duplicated_named_capturing_groups = vec![];

let mut reader = Reader::new(source_text, true);

Expand Down Expand Up @@ -84,9 +86,10 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {
continue;
}

num_of_left_parens += 1;
// Count named or unnamed capturing groups
num_of_left_capturing_parens += 1;

// Named capturing group
// Collect capturing group names
if reader.eat2('?', '<') {
let span_start = reader.offset();
while let Some(ch) = reader.peek() {
Expand All @@ -99,9 +102,11 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {

if reader.eat('>') {
let group_name = &source_text[span_start..span_end];
// May be duplicated, but it's OK
named_capturing_groups.insert(group_name);
num_of_named_capturing_groups += 1;
// May be duplicated
if !capturing_group_names.insert(group_name) {
// Report them with `Span`
duplicated_named_capturing_groups.push((span_start, span_end));
}
continue;
}
}
Expand All @@ -110,7 +115,7 @@ fn parse_capturing_groups(source_text: &str) -> (u32, u32, FxHashSet<&str>) {
reader.advance();
}

(num_of_left_parens, num_of_named_capturing_groups, named_capturing_groups)
(num_of_left_capturing_parens, capturing_group_names, duplicated_named_capturing_groups)
}

#[cfg(test)]
Expand All @@ -119,23 +124,31 @@ mod tests {

#[test]
fn test_count_capturing_groups() {
for (source_text, expected_num_of_left_parens, expected_num_of_named_capturing_groups) in [
("()", 1, 0),
(r"\1()", 1, 0),
("(foo)", 1, 0),
("(foo)(bar)", 2, 0),
("(foo(bar))", 2, 0),
("(foo)[(bar)]", 1, 0),
(r"(foo)\(bar\)", 1, 0),
("(foo)(?<n>bar)", 2, 1),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", 1, 0),
("(foo)(?<n>bar)(?<nn>baz)", 3, 2),
("(?<n>.)(?<n>..)", 2, 2),
for (source_text, expected) in [
("()", (1, 0, false)),
(r"\1()", (1, 0, false)),
("(foo)", (1, 0, false)),
("(foo)(bar)", (2, 0, false)),
("(foo(bar))", (2, 0, false)),
("(foo)[(bar)]", (1, 0, false)),
(r"(foo)\(bar\)", (1, 0, false)),
("(foo)(?<n>bar)", (2, 1, false)),
("(foo)(?=...)(?!...)(?<=...)(?<!...)(?:...)", (1, 0, false)),
("(foo)(?<n>bar)(?<nn>baz)", (3, 2, false)),
("(?<n>.)(?<n>..)", (2, 1, true)),
("(?<n>.(?<n>..))", (2, 1, true)),
] {
let (num_of_left_parens, num_of_named_capturing_groups, _) =
parse_capturing_groups(source_text);
assert_eq!(expected_num_of_left_parens, num_of_left_parens);
assert_eq!(expected_num_of_named_capturing_groups, num_of_named_capturing_groups);
let (
num_of_left_capturing_parens,
capturing_group_names,
duplicated_named_capturing_groups,
) = parse_capturing_groups(source_text);
let actual = (
num_of_left_capturing_parens,
capturing_group_names.len(),
!duplicated_named_capturing_groups.is_empty(),
);
assert_eq!(expected, actual, "{source_text}");
}
}
}

0 comments on commit 353d4d6

Please sign in to comment.