diff --git a/crates/oxc_ast/src/generated/assert_layouts.rs b/crates/oxc_ast/src/generated/assert_layouts.rs index 344a76cfbb5b4..851fc08cc497a 100644 --- a/crates/oxc_ast/src/generated/assert_layouts.rs +++ b/crates/oxc_ast/src/generated/assert_layouts.rs @@ -1507,6 +1507,7 @@ const _: () = { assert!(offset_of!(CharacterClass, span) == 0usize); assert!(offset_of!(CharacterClass, negative) == 8usize); assert!(offset_of!(CharacterClass, kind) == 9usize); + assert!(offset_of!(CharacterClass, strings) == 10usize); assert!(offset_of!(CharacterClass, body) == 16usize); assert!(size_of::() == 1usize); @@ -3061,6 +3062,7 @@ const _: () = { assert!(offset_of!(CharacterClass, span) == 0usize); assert!(offset_of!(CharacterClass, negative) == 8usize); assert!(offset_of!(CharacterClass, kind) == 9usize); + assert!(offset_of!(CharacterClass, strings) == 10usize); assert!(offset_of!(CharacterClass, body) == 12usize); assert!(size_of::() == 1usize); diff --git a/crates/oxc_regular_expression/src/ast.rs b/crates/oxc_regular_expression/src/ast.rs index 5db3e44200687..e592d14b5a4e8 100644 --- a/crates/oxc_regular_expression/src/ast.rs +++ b/crates/oxc_regular_expression/src/ast.rs @@ -238,6 +238,7 @@ pub struct CharacterClass<'a> { pub span: Span, pub negative: bool, pub kind: CharacterClassContentsKind, + pub strings: bool, pub body: Vec<'a, CharacterClassContents<'a>>, } diff --git a/crates/oxc_regular_expression/src/body_parser/mod.rs b/crates/oxc_regular_expression/src/body_parser/mod.rs index 00cc7f969bcb0..0be06b888c624 100644 --- a/crates/oxc_regular_expression/src/body_parser/mod.rs +++ b/crates/oxc_regular_expression/src/body_parser/mod.rs @@ -221,6 +221,7 @@ mod test { (r"\P{Basic_Emoji}", ParserOptions::default().with_unicode_sets_mode(), true), (r"[^\p{Basic_Emoji}]", ParserOptions::default().with_unicode_sets_mode(), true), (r"[[^\p{Basic_Emoji}]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[^\q{}]", ParserOptions::default().with_unicode_sets_mode(), true), (r"[[^\q{}]]", ParserOptions::default().with_unicode_sets_mode(), true), (r"[[^\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), true), (r"[[^\q{a|}]]", ParserOptions::default().with_unicode_sets_mode(), true), @@ -237,6 +238,12 @@ mod test { (r"[[^\q{ng}--\q{o|k}]]", ParserOptions::default().with_unicode_sets_mode(), true), (r"[[^\q{o|k}--\q{ng}]]", ParserOptions::default().with_unicode_sets_mode(), false), (r"[[z-a]]", ParserOptions::default().with_unicode_sets_mode(), true), + (r"[[[[[^[[[[\q{ng}]]]]]]]]]", ParserOptions::default().with_unicode_sets_mode(), true), + ( + r"[^[[[[[[[[[[[[[[[[\q{ng}]]]]]]]]]]]]]]]]]", + ParserOptions::default().with_unicode_sets_mode(), + true, + ), ] { assert_eq!( PatternParser::new(&allocator, source_text, *options).parse().is_err(), diff --git a/crates/oxc_regular_expression/src/body_parser/parser.rs b/crates/oxc_regular_expression/src/body_parser/parser.rs index 80f260c5d6b37..116cb7e9cff6c 100644 --- a/crates/oxc_regular_expression/src/body_parser/parser.rs +++ b/crates/oxc_regular_expression/src/body_parser/parser.rs @@ -727,20 +727,11 @@ impl<'a> PatternParser<'a> { let (kind, body) = self.parse_class_contents()?; if self.reader.eat(']') { + let strings = body.iter().any(PatternParser::may_contain_strings_in_class_contents); + // [SS:EE] CharacterClass :: [^ ClassContents ] // It is a Syntax Error if MayContainStrings of the ClassContents is true. - if negative - && body.iter().any(|item| match item { - // MayContainStrings is true - // - if ClassContents contains UnicodePropertyValueExpression - // - and the UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue - // - and it is binary property of strings(can be true only with `UnicodeSetsMode`) - ast::CharacterClassContents::UnicodePropertyEscape( - unicode_property_escape, - ) => unicode_property_escape.strings, - _ => false, - }) - { + if negative && strings { return Err(diagnostics::invalid_character_class( self.span_factory.create(span_start, self.reader.offset()), )); @@ -750,6 +741,7 @@ impl<'a> PatternParser<'a> { span: self.span_factory.create(span_start, self.reader.offset()), negative, kind, + strings, body, })); } @@ -1267,55 +1259,37 @@ impl<'a> PatternParser<'a> { let (kind, body) = self.parse_class_contents()?; if self.reader.eat(']') { + let strings = match kind { + // MayContainStrings is true + // - if ClassContents is ClassUnion + // - && ClassUnion has ClassOperands + // - && at least 1 ClassOperand has MayContainStrings: true + ast::CharacterClassContentsKind::Union => { + body.iter().any(PatternParser::may_contain_strings_in_class_contents) + } + // MayContainStrings is true + // - if ClassContents is ClassIntersection + // - && ClassIntersection has ClassOperands + // - && all ClassOperands have MayContainStrings: true + ast::CharacterClassContentsKind::Intersection => { + body.iter().all(PatternParser::may_contain_strings_in_class_contents) + } + // MayContainStrings is true + // - if ClassContents is ClassSubtraction + // - && ClassSubtraction has ClassOperands + // - && the first ClassOperand has MayContainStrings: true + ast::CharacterClassContentsKind::Subtraction => body + .iter() + .next() + .map_or(false, PatternParser::may_contain_strings_in_class_contents), + }; + // [SS:EE] NestedClass :: [^ ClassContents ] // It is a Syntax Error if MayContainStrings of the ClassContents is true. - if negative { - let may_contain_strings = |item: &ast::CharacterClassContents| match item { - // MayContainStrings is true - // - if ClassContents contains UnicodePropertyValueExpression - // - && UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue - // - && it is binary property of strings(can be true only with `UnicodeSetsMode`) - ast::CharacterClassContents::UnicodePropertyEscape( - unicode_property_escape, - ) => unicode_property_escape.strings, - // MayContainStrings is true - // - if ClassStringDisjunction is [empty] - // - || if ClassStringDisjunction contains ClassString - // - && ClassString is [empty] - // - || ClassString contains 2 more ClassSetCharacters - ast::CharacterClassContents::ClassStringDisjunction( - class_string_disjunction, - ) => class_string_disjunction.strings, - _ => false, - }; - - if match kind { - // MayContainStrings is true - // - if ClassContents is ClassUnion - // - && ClassUnion has ClassOperands - // - && at least 1 ClassOperand has MayContainStrings: true - ast::CharacterClassContentsKind::Union => { - body.iter().any(may_contain_strings) - } - // MayContainStrings is true - // - if ClassContents is ClassIntersection - // - && ClassIntersection has ClassOperands - // - && all ClassOperands have MayContainStrings: true - ast::CharacterClassContentsKind::Intersection => { - body.iter().all(may_contain_strings) - } - // MayContainStrings is true - // - if ClassContents is ClassSubtraction - // - && ClassSubtraction has ClassOperands - // - && the first ClassOperand has MayContainStrings: true - ast::CharacterClassContentsKind::Subtraction => { - body.iter().next().map_or(false, may_contain_strings) - } - } { - return Err(diagnostics::character_class_contents_invalid_operands( - self.span_factory.create(span_start, self.reader.offset()), - )); - } + if negative && strings { + return Err(diagnostics::character_class_contents_invalid_operands( + self.span_factory.create(span_start, self.reader.offset()), + )); } return Ok(Some(ast::CharacterClassContents::NestedCharacterClass(Box::new_in( @@ -1323,6 +1297,7 @@ impl<'a> PatternParser<'a> { span: self.span_factory.create(span_start, self.reader.offset()), negative, kind, + strings, body, }, self.allocator, @@ -2185,4 +2160,30 @@ impl<'a> PatternParser<'a> { Some(value) } + + // --- + + fn may_contain_strings_in_class_contents(item: &ast::CharacterClassContents) -> bool { + match item { + // MayContainStrings is true + // - if ClassContents contains UnicodePropertyValueExpression + // - && UnicodePropertyValueExpression is LoneUnicodePropertyNameOrValue + // - && it is binary property of strings(can be true only with `UnicodeSetsMode`) + ast::CharacterClassContents::UnicodePropertyEscape(unicode_property_escape) => { + unicode_property_escape.strings + } + // MayContainStrings is true + // - if ClassStringDisjunction is [empty] + // - || if ClassStringDisjunction contains ClassString + // - && ClassString is [empty] + // - || ClassString contains 2 more ClassSetCharacters + ast::CharacterClassContents::ClassStringDisjunction(class_string_disjunction) => { + class_string_disjunction.strings + } + // MayContainStrings is true + // - if NestedClass has MayContainStrings: true + ast::CharacterClassContents::NestedCharacterClass(nested_class) => nested_class.strings, + _ => false, + } + } } diff --git a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs index 6a3f9be434338..7ae6fc2676767 100644 --- a/crates/oxc_regular_expression/src/generated/derive_clone_in.rs +++ b/crates/oxc_regular_expression/src/generated/derive_clone_in.rs @@ -230,6 +230,7 @@ impl<'old_alloc, 'new_alloc> CloneIn<'new_alloc> for CharacterClass<'old_alloc> span: CloneIn::clone_in(&self.span, allocator), negative: CloneIn::clone_in(&self.negative, allocator), kind: CloneIn::clone_in(&self.kind, allocator), + strings: CloneIn::clone_in(&self.strings, allocator), body: CloneIn::clone_in(&self.body, allocator), } } diff --git a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs index 1dd6c2d5cb928..df63972e9e894 100644 --- a/crates/oxc_regular_expression/src/generated/derive_content_eq.rs +++ b/crates/oxc_regular_expression/src/generated/derive_content_eq.rs @@ -179,6 +179,7 @@ impl<'a> ContentEq for CharacterClass<'a> { fn content_eq(&self, other: &Self) -> bool { ContentEq::content_eq(&self.negative, &other.negative) && ContentEq::content_eq(&self.kind, &other.kind) + && ContentEq::content_eq(&self.strings, &other.strings) && ContentEq::content_eq(&self.body, &other.body) } } diff --git a/crates/oxc_regular_expression/src/generated/derive_content_hash.rs b/crates/oxc_regular_expression/src/generated/derive_content_hash.rs index 5d3b722899039..58bd592818d76 100644 --- a/crates/oxc_regular_expression/src/generated/derive_content_hash.rs +++ b/crates/oxc_regular_expression/src/generated/derive_content_hash.rs @@ -144,6 +144,7 @@ impl<'a> ContentHash for CharacterClass<'a> { fn content_hash(&self, state: &mut H) { ContentHash::content_hash(&self.negative, state); ContentHash::content_hash(&self.kind, state); + ContentHash::content_hash(&self.strings, state); ContentHash::content_hash(&self.body, state); } } diff --git a/tasks/coverage/parser_typescript.snap b/tasks/coverage/parser_typescript.snap index 7551f516f3908..601ddd2c60b65 100644 --- a/tasks/coverage/parser_typescript.snap +++ b/tasks/coverage/parser_typescript.snap @@ -10957,6 +10957,46 @@ Expect to Parse: tasks/coverage/typescript/tests/cases/conformance/salsa/private 43 │ /[^\p{Emoji}[[\p{RGI_Emoji}]]][^\p{Emoji}--[[\p{RGI_Emoji}]]][^\p{Emoji}&&[[\p{RGI_Emoji}]]]/v, ╰──── + × Invalid regular expression: Invalid character class with strings unicode property + ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:43:3] + 42 │ /[^\p{RGI_Emoji}\q{foo}][^\p{RGI_Emoji}--\q{foo}][^\p{RGI_Emoji}&&\q{foo}]/v, + 43 │ /[^\p{Emoji}[[\p{RGI_Emoji}]]][^\p{Emoji}--[[\p{RGI_Emoji}]]][^\p{Emoji}&&[[\p{RGI_Emoji}]]]/v, + · ───────────────────────────── + 44 │ /[^[[\p{RGI_Emoji}]]\p{Emoji}][^[[\p{RGI_Emoji}]]--\p{Emoji}][^[[\p{RGI_Emoji}]]&&\p{Emoji}]/v, + ╰──── + + × Invalid regular expression: Invalid character class with strings unicode property + ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:44:3] + 43 │ /[^\p{Emoji}[[\p{RGI_Emoji}]]][^\p{Emoji}--[[\p{RGI_Emoji}]]][^\p{Emoji}&&[[\p{RGI_Emoji}]]]/v, + 44 │ /[^[[\p{RGI_Emoji}]]\p{Emoji}][^[[\p{RGI_Emoji}]]--\p{Emoji}][^[[\p{RGI_Emoji}]]&&\p{Emoji}]/v, + · ───────────────────────────── + 45 │ /[^[[\p{RGI_Emoji}]]\q{foo}][^[[\p{RGI_Emoji}]]--\q{foo}][^[[\p{RGI_Emoji}]]&&\q{foo}]/v, + ╰──── + + × Invalid regular expression: Invalid character class with strings unicode property + ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:45:3] + 44 │ /[^[[\p{RGI_Emoji}]]\p{Emoji}][^[[\p{RGI_Emoji}]]--\p{Emoji}][^[[\p{RGI_Emoji}]]&&\p{Emoji}]/v, + 45 │ /[^[[\p{RGI_Emoji}]]\q{foo}][^[[\p{RGI_Emoji}]]--\q{foo}][^[[\p{RGI_Emoji}]]&&\q{foo}]/v, + · ─────────────────────────── + 46 │ /[^\q{foo|bar|baz}--\q{foo}--\q{bar}--\q{baz}][^\p{L}--\q{foo}--[\q{bar}]--[^[\q{baz}]]]/v, + ╰──── + + × Invalid regular expression: Invalid character class with strings unicode property + ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:46:3] + 45 │ /[^[[\p{RGI_Emoji}]]\q{foo}][^[[\p{RGI_Emoji}]]--\q{foo}][^[[\p{RGI_Emoji}]]&&\q{foo}]/v, + 46 │ /[^\q{foo|bar|baz}--\q{foo}--\q{bar}--\q{baz}][^\p{L}--\q{foo}--[\q{bar}]--[^[\q{baz}]]]/v, + · ───────────────────────────────────────────── + 47 │ /[^[[\q{foo|bar|baz}]]--\q{foo}--\q{bar}--\q{baz}][^[^[^\p{L}]]--\q{foo}--[\q{bar}]--[^[\q{baz}]]]/v, + ╰──── + + × Invalid regular expression: Invalid character class with strings unicode property + ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:47:3] + 46 │ /[^\q{foo|bar|baz}--\q{foo}--\q{bar}--\q{baz}][^\p{L}--\q{foo}--[\q{bar}]--[^[\q{baz}]]]/v, + 47 │ /[^[[\q{foo|bar|baz}]]--\q{foo}--\q{bar}--\q{baz}][^[^[^\p{L}]]--\q{foo}--[\q{bar}]--[^[\q{baz}]]]/v, + · ───────────────────────────────────────────────── + 48 │ ]; + ╰──── + × The 'u' and 'v' regular expression flags cannot be enabled at the same time ╭─[typescript/tests/cases/compiler/regularExpressionScanning.ts:3:2] 2 │ // Flags