From fe5a1f15bdd571a7679b39911cd60bb0f593e78d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Thu, 5 Dec 2024 16:49:47 +0100 Subject: [PATCH 01/12] `<regex>`: Circumflex ^ should negate character classes in basic regular expressions --- stl/inc/regex | 17 +++++++++-------- tests/std/tests/VSO_0000000_regex_use/test.cpp | 13 +++++++++++++ 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 77efc9d32f3..c88a6b02638 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1699,8 +1699,8 @@ private: void _Error(regex_constants::error_type); bool _Is_esc() const; - void _Trans(); - void _Next(); + void _Trans(bool _Beg_char_class); + void _Next(bool _Beg_char_class = false); void _Expect(_Meta_type, regex_constants::error_type); // parsing @@ -3823,7 +3823,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Is_esc() const { // assumes _Pat != _En } template -void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-character +void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans(const bool _Beg_char_class) { // map character to meta-character static constexpr char _Meta_map[] = {_Meta_lpar, _Meta_rpar, _Meta_dlr, _Meta_caret, _Meta_dot, _Meta_star, _Meta_plus, _Meta_query, _Meta_lsq, _Meta_rsq, _Meta_bar, _Meta_esc, _Meta_dash, _Meta_lbr, _Meta_rbr, _Meta_comma, _Meta_colon, _Meta_equal, _Meta_exc, _Meta_nl, _Meta_cr, _Meta_bsp, 0}; // array of meta chars @@ -3874,7 +3874,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char break; case _Meta_caret: - if ((_L_flags & _L_anch_rstr) && !_Nfa._Beg_expr()) { + if (!_Beg_char_class && (_L_flags & _L_anch_rstr) && !_Nfa._Beg_expr()) { _Mchar = _Meta_chr; } @@ -3911,7 +3911,8 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char } template -void _Parser<_FwdIt, _Elem, _RxTraits>::_Next() { // advance to next input character +void _Parser<_FwdIt, _Elem, _RxTraits>::_Next( + const bool _Beg_char_class /* = 
false */) { // advance to next input character if (_Pat != _End) { // advance if (*_Pat == _Meta_esc && _Is_esc()) { ++_Pat; @@ -3919,7 +3920,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Next() { // advance to next input chara ++_Pat; } - _Trans(); + _Trans(_Beg_char_class); } template @@ -4428,7 +4429,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte _AtomEscape(); } } else if (_Mchar == _Meta_lsq) { // add bracket expression - _Next(); + _Next(true); _CharacterClass(); _Expect(_Meta_rsq, regex_constants::error_brack); } else if (_Mchar == _Meta_lpar) { // check for valid group @@ -4614,7 +4615,7 @@ _Parser<_FwdIt, _Elem, _RxTraits>::_Parser( _Nfa._Setlong(); } - _Trans(); + _Trans(false); } #if _HAS_TR1_NAMESPACE diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index ad2efc0d4a6..e56ec280f8c 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -669,6 +669,18 @@ void test_gh_5160() { neg_regex.should_search_fail(L"xxxYxx\x2009xxxZxxx"); // U+2009 THIN SPACE } +void test_gh_5165() { + // GH-5165: circumflex ^ should negate character classes in basic regular expressions + g_regexTester.should_match("yz", "y[^x]", basic); + g_regexTester.should_match("yz", "y[^x]", grep); + g_regexTester.should_match("y^", "y[^x]", basic); + g_regexTester.should_match("y^", "y[^x]", grep); + g_regexTester.should_not_match("yx", "y[^x]", basic); + g_regexTester.should_not_match("yx", "y[^x]", grep); + g_regexTester.should_not_match("y^", "y[^x^]", basic); + g_regexTester.should_not_match("y^", "y[^x^]", grep); +} + int main() { test_dev10_449367_case_insensitivity_should_work(); test_dev11_462743_regex_collate_should_not_disable_regex_icase(); @@ -699,6 +711,7 @@ int main() { test_gh_4995(); test_gh_5058(); test_gh_5160(); + test_gh_5165(); return g_regexTester.result(); } From fbe0a0c4d648ef72e4c6650b607cc07625a74d77 Mon Sep 
17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Dec 2024 13:17:28 -0800 Subject: [PATCH 02/12] Revert product code. --- stl/inc/regex | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index c88a6b02638..77efc9d32f3 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1699,8 +1699,8 @@ private: void _Error(regex_constants::error_type); bool _Is_esc() const; - void _Trans(bool _Beg_char_class); - void _Next(bool _Beg_char_class = false); + void _Trans(); + void _Next(); void _Expect(_Meta_type, regex_constants::error_type); // parsing @@ -3823,7 +3823,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Is_esc() const { // assumes _Pat != _En } template -void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans(const bool _Beg_char_class) { // map character to meta-character +void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-character static constexpr char _Meta_map[] = {_Meta_lpar, _Meta_rpar, _Meta_dlr, _Meta_caret, _Meta_dot, _Meta_star, _Meta_plus, _Meta_query, _Meta_lsq, _Meta_rsq, _Meta_bar, _Meta_esc, _Meta_dash, _Meta_lbr, _Meta_rbr, _Meta_comma, _Meta_colon, _Meta_equal, _Meta_exc, _Meta_nl, _Meta_cr, _Meta_bsp, 0}; // array of meta chars @@ -3874,7 +3874,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans(const bool _Beg_char_class) { // break; case _Meta_caret: - if (!_Beg_char_class && (_L_flags & _L_anch_rstr) && !_Nfa._Beg_expr()) { + if ((_L_flags & _L_anch_rstr) && !_Nfa._Beg_expr()) { _Mchar = _Meta_chr; } @@ -3911,8 +3911,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans(const bool _Beg_char_class) { // } template -void _Parser<_FwdIt, _Elem, _RxTraits>::_Next( - const bool _Beg_char_class /* = false */) { // advance to next input character +void _Parser<_FwdIt, _Elem, _RxTraits>::_Next() { // advance to next input character if (_Pat != _End) { // advance if (*_Pat == _Meta_esc && _Is_esc()) { ++_Pat; @@ -3920,7 +3919,7 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Next( 
++_Pat; } - _Trans(_Beg_char_class); + _Trans(); } template @@ -4429,7 +4428,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte _AtomEscape(); } } else if (_Mchar == _Meta_lsq) { // add bracket expression - _Next(true); + _Next(); _CharacterClass(); _Expect(_Meta_rsq, regex_constants::error_brack); } else if (_Mchar == _Meta_lpar) { // check for valid group @@ -4615,7 +4614,7 @@ _Parser<_FwdIt, _Elem, _RxTraits>::_Parser( _Nfa._Setlong(); } - _Trans(false); + _Trans(); } #if _HAS_TR1_NAMESPACE From f78726515643292f163f360a6016207ca7327b03 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Dec 2024 13:52:37 -0800 Subject: [PATCH 03/12] Move (and De Morgan) `_L_anch_rstr` logic to "add bol node". --- stl/inc/regex | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 77efc9d32f3..b59e65fb809 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -3874,10 +3874,9 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char break; case _Meta_caret: - if ((_L_flags & _L_anch_rstr) && !_Nfa._Beg_expr()) { - _Mchar = _Meta_chr; - } - + // A caret can always negate a bracket expression, + // but _L_anch_rstr (used by basic/grep) restricts caret anchors to the beginning. + // We'll handle that restriction when we're about to add a bol node. break; case _Meta_dlr: @@ -4435,7 +4434,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte _Next(); _Quant = _Wrapped_disjunction(); _Expect(_Meta_rpar, regex_constants::error_paren); - } else if (_Mchar == _Meta_caret) { // add bol node + } else if (_Mchar == _Meta_caret && (!(_L_flags & _L_anch_rstr) || _Nfa._Beg_expr())) { // add bol node _Nfa._Add_bol(); _Next(); _Quant = false; From 3269019d881c0dd48497c3aaff8cc3f7121a403e Mon Sep 17 00:00:00 2001 From: "Stephan T. 
Lavavej" Date: Mon, 9 Dec 2024 13:55:50 -0800 Subject: [PATCH 04/12] circumflex => Caret --- tests/std/tests/VSO_0000000_regex_use/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index e56ec280f8c..2e7682a8e33 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -670,7 +670,7 @@ void test_gh_5160() { } void test_gh_5165() { - // GH-5165: circumflex ^ should negate character classes in basic regular expressions + // GH-5165: Caret ^ should negate character classes in basic regular expressions g_regexTester.should_match("yz", "y[^x]", basic); g_regexTester.should_match("yz", "y[^x]", grep); g_regexTester.should_match("y^", "y[^x]", basic); From fcd95cf9cc6cfbb86ccd30a1312c1f00487f1ae8 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 9 Dec 2024 14:38:04 -0800 Subject: [PATCH 05/12] Expand test coverage. 
--- .../std/tests/VSO_0000000_regex_use/test.cpp | 41 +++++++++++++++---- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 2e7682a8e33..a355fe8b15e 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -669,16 +669,41 @@ void test_gh_5160() { neg_regex.should_search_fail(L"xxxYxx\x2009xxxZxxx"); // U+2009 THIN SPACE } +void test_gh_5165_syntax_option(const syntax_option_type basic_or_grep) { + g_regexTester.should_not_match("yx", "y[^x]", basic_or_grep); + g_regexTester.should_match("yz", "y[^x]", basic_or_grep); + g_regexTester.should_match("y^", "y[^x]", basic_or_grep); + + g_regexTester.should_match("yx", "y[x^]", basic_or_grep); + g_regexTester.should_not_match("yz", "y[x^]", basic_or_grep); + g_regexTester.should_match("y^", "y[x^]", basic_or_grep); + + g_regexTester.should_not_match("yx", "y[^x^]", basic_or_grep); + g_regexTester.should_match("yz", "y[^x^]", basic_or_grep); + g_regexTester.should_not_match("y^", "y[^x^]", basic_or_grep); + + { + const test_regex no_anchor(&g_regexTester, "meo[wW]", basic_or_grep); + no_anchor.should_search_match("meow_machine", "meow"); + no_anchor.should_search_match("homeowner", "meow"); + } + { + const test_regex beginning_anchor(&g_regexTester, "^meo[wW]", basic_or_grep); + beginning_anchor.should_search_match("meow_machine", "meow"); + beginning_anchor.should_search_fail("homeowner"); + } + { + const test_regex middle_anchor(&g_regexTester, "me^o[wW]", basic_or_grep); + middle_anchor.should_search_fail("meow_machine"); + middle_anchor.should_search_fail("homeowner"); + middle_anchor.should_search_match("home^owner", "me^ow"); + } +} + void test_gh_5165() { // GH-5165: Caret ^ should negate character classes in basic regular expressions - g_regexTester.should_match("yz", "y[^x]", basic); - g_regexTester.should_match("yz", "y[^x]", grep); - 
g_regexTester.should_match("y^", "y[^x]", basic); - g_regexTester.should_match("y^", "y[^x]", grep); - g_regexTester.should_not_match("yx", "y[^x]", basic); - g_regexTester.should_not_match("yx", "y[^x]", grep); - g_regexTester.should_not_match("y^", "y[^x^]", basic); - g_regexTester.should_not_match("y^", "y[^x^]", grep); + test_gh_5165_syntax_option(basic); + test_gh_5165_syntax_option(grep); } int main() { From edada370fb3653ad2d31ab0263b2af35254170a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Sun, 15 Dec 2024 16:16:42 +0100 Subject: [PATCH 06/12] fix miscompilation of double carets and treat carets as anchors at the beginning of alternatives bonus: eliminates order-dependency between lexer tokenization and NFA additions --- stl/inc/regex | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index b59e65fb809..93501572938 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1485,7 +1485,6 @@ public: using _Difft = typename iterator_traits<_FwdIt>::difference_type; _Builder(const _RxTraits& _Tr, regex_constants::syntax_option_type); - bool _Beg_expr() const; void _Setlong(); // _Discard_pattern is an ABI zombie name void _Tidy() noexcept; @@ -1521,7 +1520,6 @@ private: static void _Insert_node(_Node_base*, _Node_base*); _Node_base* _New_node(_Node_type _Kind); void _Add_str_node(); - bool _Beg_expr(_Node_base*) const; void _Add_char_to_bitmap(_Elem _Ch); void _Add_char_to_array(_Elem _Ch); void _Add_elts(_Node_class<_Elem, _RxTraits>*, _Regex_traits_base::char_class_type, bool); @@ -2755,17 +2753,6 @@ _Node_base* _Builder<_FwdIt, _Elem, _RxTraits>::_Getmark() const { return _Current; } -template -bool _Builder<_FwdIt, _Elem, _RxTraits>::_Beg_expr(_Node_base* _Nx) const { - // test for beginning of expression or subexpression - return _Nx->_Kind == _N_begin || _Nx->_Kind == _N_group || _Nx->_Kind == _N_capture; -} - -template -bool _Builder<_FwdIt, _Elem, 
_RxTraits>::_Beg_expr() const { // test for beginning of expression or subexpression - return _Beg_expr(_Current) || (_Current->_Kind == _N_bol && _Beg_expr(_Current->_Prev)); -} - template _Node_base* _Builder<_FwdIt, _Elem, _RxTraits>::_Link_node(_Node_base* _Nx) { // insert _Nx at current location _Nx->_Prev = _Current; @@ -3867,10 +3854,10 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_Trans() { // map character to meta-char break; case _Meta_star: - if ((_L_flags & _L_star_beg) && _Nfa._Beg_expr()) { - _Mchar = _Meta_chr; - } - + // A star can always act as a quantifier outside bracket expressions, + // but _L_star_beg (used by basic/grep) allows its use as an ordinary character + // at the beginning of a (sub-)expression (potentially after an optional caret anchor). + // We'll handle that when we are parsing alternatives in disjunctions. break; case _Meta_caret: @@ -4434,15 +4421,21 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte _Next(); _Quant = _Wrapped_disjunction(); _Expect(_Meta_rpar, regex_constants::error_paren); - } else if (_Mchar == _Meta_caret && (!(_L_flags & _L_anch_rstr) || _Nfa._Beg_expr())) { // add bol node + } else if (_Mchar == _Meta_caret && (!(_L_flags & _L_anch_rstr) || !_Found)) { // add bol node _Nfa._Add_bol(); _Next(); - _Quant = false; + if ((_L_flags & _L_star_beg) && _Mchar == _Meta_star) { + _Nfa._Add_char(_Char); + _Next(); + } else { + _Quant = false; + } } else if (_Mchar == _Meta_dlr) { // add eol node _Nfa._Add_eol(); _Next(); _Quant = false; - } else if (_Mchar == _Meta_star || _Mchar == _Meta_plus || _Mchar == _Meta_query || _Mchar == _Meta_lbr) { + } else if ((_Mchar == _Meta_star && (!(_L_flags & _L_star_beg) || !_Found)) || _Mchar == _Meta_plus + || _Mchar == _Meta_query || _Mchar == _Meta_lbr) { _Error(regex_constants::error_badrepeat); } else if (_Mchar == _Meta_rbr && !(_L_flags & _L_paren_bal)) { _Error(regex_constants::error_brace); From d3de073d2d70aa91e8b13afdb867d39ec8701556 Mon 
Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Sun, 15 Dec 2024 15:28:53 +0100 Subject: [PATCH 07/12] add test coverage for double carets and caret anchors at the beginning of expressions, subexpressions and grep-mode newline alternatives --- .../std/tests/VSO_0000000_regex_use/test.cpp | 187 ++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index a355fe8b15e..df76e81a59a 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -698,12 +698,199 @@ void test_gh_5165_syntax_option(const syntax_option_type basic_or_grep) { middle_anchor.should_search_fail("homeowner"); middle_anchor.should_search_match("home^owner", "me^ow"); } + { + const test_regex double_carets(&g_regexTester, "^^meo[wW]", basic_or_grep); + double_carets.should_search_fail("meow_machine"); + double_carets.should_search_fail("homeowner"); + double_carets.should_search_match("^meow_machine", "^meow"); + double_carets.should_search_fail("^^meow_machine"); + double_carets.should_search_fail("ho^meowner"); + double_carets.should_search_fail("ho^^meowner"); + } + + g_regexTester.should_not_match("me^ow", R"-(me\(^o[wW]\))-", basic_or_grep); + g_regexTester.should_not_match("meow", R"-(me\(^o[wW]\))-", basic_or_grep); + + + { + const test_regex firstgroup_anchor(&g_regexTester, R"-(\(^meo[wW]\))-", basic_or_grep); + firstgroup_anchor.should_search_match("meow_machine", "meow"); + firstgroup_anchor.should_search_fail("homeowner"); + } + + { + const test_regex prefixedgroup_anchor(&g_regexTester, R"-(.*\(^meo[wW]\))-", basic_or_grep); + prefixedgroup_anchor.should_search_match("meow_machine", "meow"); + prefixedgroup_anchor.should_search_fail("homeowner"); + } + + { + const test_regex secondgroup_anchor(&g_regexTester, R"-(\(.*\)\(^meo[wW]\))-", basic_or_grep); + secondgroup_anchor.should_search_match("meow_machine", 
"meow"); + secondgroup_anchor.should_search_fail("homeowner"); + } + + { + const test_regex nested_anchor(&g_regexTester, R"-(.*\(^\(^meo[wW]\)\))-", basic_or_grep); + nested_anchor.should_search_match("meow_machine", "meow"); + nested_anchor.should_search_fail("homeowner"); + } + + { + const test_regex double_carets(&g_regexTester, R"-(.*\(^^meo[wW]\))-", basic_or_grep); + double_carets.should_search_fail("meow_machine"); + double_carets.should_search_fail("homeowner"); + double_carets.should_search_match("^meow_machine", "^meow"); + double_carets.should_search_fail("^^meow_machine"); + double_carets.should_search_fail("ho^meowner"); + double_carets.should_search_fail("ho^^meowner"); + } } void test_gh_5165() { // GH-5165: Caret ^ should negate character classes in basic regular expressions test_gh_5165_syntax_option(basic); test_gh_5165_syntax_option(grep); + + // test cases specific for basic regular expressions + { + const test_regex middle_bar(&g_regexTester, "^a|a", basic); + middle_bar.should_search_match("a|a", "a|a"); + middle_bar.should_search_fail("^a|a"); + middle_bar.should_search_fail("ba|a"); + middle_bar.should_search_fail("a"); + } + + { + const test_regex middle_nl(&g_regexTester, "^a\na", basic); + middle_nl.should_search_match("a\na", "a\na"); + middle_nl.should_search_fail("ba\na"); + middle_nl.should_search_fail("^a\na"); + middle_nl.should_search_fail("a"); + } + + { + const test_regex group_middle_bar(&g_regexTester, "^\\(a|a\\)", basic); + group_middle_bar.should_search_match("a|a", "a|a"); + group_middle_bar.should_search_fail("^a|a"); + group_middle_bar.should_search_fail("ba|a"); + group_middle_bar.should_search_fail("a"); + } + + { + const test_regex group_middle_nl(&g_regexTester, "^\\(a\na\\)", basic); + group_middle_nl.should_search_match("a\na", "a\na"); + group_middle_nl.should_search_fail("^a\na"); + group_middle_nl.should_search_fail("ba\na"); + group_middle_nl.should_search_fail("a"); + } + + { + const test_regex 
middle_bar_with_caret(&g_regexTester, "^a|^b", basic); + middle_bar_with_caret.should_search_match("a|^b", "a|^b"); + middle_bar_with_caret.should_search_fail("a|b"); + middle_bar_with_caret.should_search_fail("a"); + middle_bar_with_caret.should_search_fail("b"); + } + + { + const test_regex middle_bar_with_nl(&g_regexTester, "^a\n^b", basic); + middle_bar_with_nl.should_search_match("a\n^b", "a\n^b"); + middle_bar_with_nl.should_search_fail("a\nb"); + middle_bar_with_nl.should_search_fail("a"); + middle_bar_with_nl.should_search_fail("b"); + } + + { + const test_regex group_middle_bar_with_caret(&g_regexTester, "^\\(a|^b\\)", basic); + group_middle_bar_with_caret.should_search_match("a|^b", "a|^b"); + group_middle_bar_with_caret.should_search_fail("a|b"); + group_middle_bar_with_caret.should_search_fail("a"); + group_middle_bar_with_caret.should_search_fail("b"); + } + + { + const test_regex group_middle_nl_with_caret(&g_regexTester, "^\\(a\n^b\\)", basic); + group_middle_nl_with_caret.should_search_match("a\n^b", "a\n^b"); + group_middle_nl_with_caret.should_search_fail("a\nb"); + group_middle_nl_with_caret.should_search_fail("a"); + group_middle_nl_with_caret.should_search_fail("b"); + } + + // test cases specific for grep mode + { + const test_regex middle_bar(&g_regexTester, "^a|a", grep); + middle_bar.should_search_match("a|a", "a|a"); + middle_bar.should_search_fail("^a|a"); + middle_bar.should_search_fail("ba|a"); + middle_bar.should_search_fail("a"); + } + + { + const test_regex middle_nl(&g_regexTester, "^a\na", grep); + middle_nl.should_search_match("a\na", "a"); + middle_nl.should_search_match("ba\na", "a"); + middle_nl.should_search_match("^a\na", "a"); + middle_nl.should_search_match("a", "a"); + } + + { + const test_regex group_middle_bar(&g_regexTester, "^\\(a|a\\)", grep); + group_middle_bar.should_search_match("a|a", "a|a"); + group_middle_bar.should_search_fail("^a|a"); + group_middle_bar.should_search_fail("ba|a"); + 
group_middle_bar.should_search_fail("a"); + } + + { + // Regex is not accepted by POSIX grep, but the regex parser currently does not reject it. + // If parser is changed to reject it, adjust this test case. + const test_regex group_middle_nl(&g_regexTester, "^\\(a\na\\)", grep); + group_middle_nl.should_search_match("a\na", "a\na"); + group_middle_nl.should_search_fail("^a\na"); + group_middle_nl.should_search_fail("ba\na"); + group_middle_nl.should_search_fail("a"); + } + + { + const test_regex middle_bar_with_caret(&g_regexTester, "^a|^b", grep); + middle_bar_with_caret.should_search_match("a|^b", "a|^b"); + middle_bar_with_caret.should_search_fail("a|b"); + middle_bar_with_caret.should_search_fail("a"); + middle_bar_with_caret.should_search_fail("b"); + } + + { + const test_regex middle_bar_with_nl(&g_regexTester, "^a\n^b", grep); + middle_bar_with_nl.should_search_match("a\n^b", "a"); + middle_bar_with_nl.should_search_match("a\nb", "a"); + middle_bar_with_nl.should_search_match("ab", "a"); + middle_bar_with_nl.should_search_match("a", "a"); + middle_bar_with_nl.should_search_match("b", "b"); + middle_bar_with_nl.should_search_match("ba", "b"); + middle_bar_with_nl.should_search_fail("^a"); + middle_bar_with_nl.should_search_fail("^b"); + middle_bar_with_nl.should_search_fail("ca"); + middle_bar_with_nl.should_search_fail("cb"); + } + + { + const test_regex group_middle_bar_with_caret(&g_regexTester, "^\\(a|^b\\)", grep); + group_middle_bar_with_caret.should_search_match("a|^b", "a|^b"); + group_middle_bar_with_caret.should_search_fail("a|b"); + group_middle_bar_with_caret.should_search_fail("a"); + group_middle_bar_with_caret.should_search_fail("b"); + } + + { + // Regex is not accepted by POSIX grep, but the regex parser currently does not reject it. + // If parser is changed to reject it, adjust this test case. 
+ const test_regex group_middle_nl_with_caret(&g_regexTester, "^\\(a\n^b\\)", grep); + group_middle_nl_with_caret.should_search_match("a\n^b", "a\n^b"); + group_middle_nl_with_caret.should_search_fail("a\nb"); + group_middle_nl_with_caret.should_search_fail("a"); + group_middle_nl_with_caret.should_search_fail("b"); + } } int main() { From 625e1a52b674f62adf84ebc0bd317590a583c34d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Sun, 15 Dec 2024 20:08:30 +0100 Subject: [PATCH 08/12] Make _L_star_beg setting orthogonal to _L_anch_rstr --- stl/inc/regex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/regex b/stl/inc/regex index 93501572938..90947baa7f8 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -4424,7 +4424,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte } else if (_Mchar == _Meta_caret && (!(_L_flags & _L_anch_rstr) || !_Found)) { // add bol node _Nfa._Add_bol(); _Next(); - if ((_L_flags & _L_star_beg) && _Mchar == _Meta_star) { + if ((_L_flags & _L_star_beg) && _Mchar == _Meta_star && !_Found) { _Nfa._Add_char(_Char); _Next(); } else { From 3363b6d3f2e94b3b31c9db7b4774ce3fbffe3ca8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Sun, 15 Dec 2024 20:54:54 +0100 Subject: [PATCH 09/12] fix inverted condition --- stl/inc/regex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stl/inc/regex b/stl/inc/regex index 90947baa7f8..0e1a5a27031 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -4434,7 +4434,7 @@ bool _Parser<_FwdIt, _Elem, _RxTraits>::_Alternative() { // check for valid alte _Nfa._Add_eol(); _Next(); _Quant = false; - } else if ((_Mchar == _Meta_star && (!(_L_flags & _L_star_beg) || !_Found)) || _Mchar == _Meta_plus + } else if ((_Mchar == _Meta_star && (!(_L_flags & _L_star_beg) || _Found)) || _Mchar == _Meta_plus || _Mchar == _Meta_query || _Mchar == _Meta_lbr) { _Error(regex_constants::error_badrepeat); } else if (_Mchar == 
_Meta_rbr && !(_L_flags & _L_paren_bal)) { From b6c2765eaa05bea1f94017658ad03508eece7df8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Sun, 15 Dec 2024 21:18:06 +0100 Subject: [PATCH 10/12] add test coverage for initial * in expressions, subexpressions and alternatives --- .../std/tests/VSO_0000000_regex_use/test.cpp | 67 ++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index df76e81a59a..8353e9cca80 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -745,6 +745,32 @@ void test_gh_5165_syntax_option(const syntax_option_type basic_or_grep) { double_carets.should_search_fail("ho^meowner"); double_carets.should_search_fail("ho^^meowner"); } + + g_regexTester.should_match("*", "*", basic_or_grep); + g_regexTester.should_match("*aa", "*a*", basic_or_grep); + g_regexTester.should_not_match("*a*", "*a*", basic_or_grep); + g_regexTester.should_match("*", "^*", basic_or_grep); + g_regexTester.should_match("*aa", "^*a*", basic_or_grep); + g_regexTester.should_match("****", "**", basic_or_grep); + g_regexTester.should_match("****", "^**", basic_or_grep); + g_regexTester.should_not_match("*a*", "^*a*", basic_or_grep); + g_regexTester.should_not_match("^*", "^*", basic_or_grep); + g_regexTester.should_not_match("^*aa", "^*a*", basic_or_grep); + g_regexTester.should_not_match("^*a*", "^*a*", basic_or_grep); + g_regexTester.should_match("*", R"-(\(*\))-", basic_or_grep); + g_regexTester.should_match("****", R"-(\(**\))-", basic_or_grep); + g_regexTester.should_match("*aa", R"-(\(*a*\))-", basic_or_grep); + g_regexTester.should_match("*a", R"-(\(*a*\))-", basic_or_grep); + g_regexTester.should_not_match("*a*", R"-(\(*a*\))-", basic_or_grep); + g_regexTester.should_match("*", R"-(\(^*\))-", basic_or_grep); + g_regexTester.should_match("***", R"-(\(^**\))-", 
basic_or_grep); + g_regexTester.should_match("*aa", R"-(\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("^*a", R"-(\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("^*a*", R"-(\(^*a*\))-", basic_or_grep); + g_regexTester.should_match("*", R"-(.*\(^*\))-", basic_or_grep); + g_regexTester.should_match("*aa", R"-(.*\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("^*a", R"-(.*\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("^*a*", R"-(.*\(^*a*\))-", basic_or_grep); } void test_gh_5165() { @@ -817,6 +843,19 @@ void test_gh_5165() { group_middle_nl_with_caret.should_search_fail("b"); } + g_regexTester.should_match("a||b", "a|*b", basic); + g_regexTester.should_not_match("a|*b", "a|*b", basic); + g_regexTester.should_match("a||b", "^a|*b", basic); + g_regexTester.should_not_match("a|*b", "^a|^*b", basic); + g_regexTester.should_match("a|^^b", "^a|^*b", basic); + g_regexTester.should_not_match("a|^*b", "^a|^*b", basic); + g_regexTester.should_match("a\n\nb", "a\n*b", basic); + g_regexTester.should_not_match("a\n*b", "a\n*b", basic); + g_regexTester.should_match("a\n\nb", "^a\n*b", basic); + g_regexTester.should_not_match("a\n*b", "^a\n^*b", basic); + g_regexTester.should_match("a\n^^b", "^a\n^*b", basic); + g_regexTester.should_not_match("a\n^*b", "^a\n^*b", basic); + // test cases specific for grep mode { const test_regex middle_bar(&g_regexTester, "^a|a", grep); @@ -843,7 +882,7 @@ void test_gh_5165() { } { - // Regex is not accepted by POSIX grep, but the regex parser currently does not reject it. + // This regular expression is not accepted by POSIX grep, but currently the regex parser does not reject it. // If parser is changed to reject it, adjust this test case. 
const test_regex group_middle_nl(&g_regexTester, "^\\(a\na\\)", grep); group_middle_nl.should_search_match("a\na", "a\na"); @@ -883,7 +922,7 @@ void test_gh_5165() { } { - // Regex is not accepted by POSIX grep, but the regex parser currently does not reject it. + // This regular expression is not accepted by POSIX grep, but currently the regex parser does not reject it. // If parser is changed to reject it, adjust this test case. const test_regex group_middle_nl_with_caret(&g_regexTester, "^\\(a\n^b\\)", grep); group_middle_nl_with_caret.should_search_match("a\n^b", "a\n^b"); @@ -891,6 +930,30 @@ void test_gh_5165() { group_middle_nl_with_caret.should_search_fail("a"); group_middle_nl_with_caret.should_search_fail("b"); } + + g_regexTester.should_match("a||b", "a|*b", grep); + g_regexTester.should_not_match("a|*b", "a|*b", grep); + g_regexTester.should_throw("a|**b", error_badrepeat, grep); + g_regexTester.should_match("a||b", "^a|*b", grep); + g_regexTester.should_not_match("a|*b", "^a|^*b", grep); + g_regexTester.should_match("a|^^b", "^a|^*b", grep); + g_regexTester.should_not_match("a|^*b", "^a|^*b", grep); + g_regexTester.should_throw("^a|**b", error_badrepeat, grep); + g_regexTester.should_not_match("a\n\nb", "a\n*b", grep); + g_regexTester.should_not_match("a\n*b", "a\n*b", grep); + g_regexTester.should_match("a", "a\n*b", grep); + g_regexTester.should_match("*b", "a\n*b", grep); + g_regexTester.should_not_match("a\n\nb", "^a\n*b", grep); + g_regexTester.should_not_match("a\n*b", "^a\n^*b", grep); + g_regexTester.should_match("a", "^a\n*b", grep); + g_regexTester.should_match("*b", "^a\n*b", grep); + g_regexTester.should_match("a", "^a\n**b", grep); + g_regexTester.should_match("****b", "^a\n**b", grep); + g_regexTester.should_not_match("a\n^^b", "^a\n^*b", grep); + g_regexTester.should_not_match("a\nb", "^a\n^*b", grep); + g_regexTester.should_not_match("^*b", "^a\n^*b", grep); + g_regexTester.should_match("a", "^a\n^*b", grep); + 
g_regexTester.should_match("*b", "^a\n^*b", grep); } int main() { From 38ed60b76689000f6938a6ffe0b1914e7192c65e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Mon, 16 Dec 2024 10:37:00 +0100 Subject: [PATCH 11/12] adjust comment --- tests/std/tests/VSO_0000000_regex_use/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 8353e9cca80..3eb76bcab0b 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -774,7 +774,7 @@ void test_gh_5165_syntax_option(const syntax_option_type basic_or_grep) { } void test_gh_5165() { - // GH-5165: Caret ^ should negate character classes in basic regular expressions + // GH-5165: Revise caret parsing in basic and grep mode test_gh_5165_syntax_option(basic); test_gh_5165_syntax_option(grep); From f827e289e69f62ee05b61a1bee4bd395ad6ce09d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Tue, 17 Dec 2024 16:53:44 +0100 Subject: [PATCH 12/12] extend and clean up tests --- .../std/tests/VSO_0000000_regex_use/test.cpp | 251 +++++++++++------- 1 file changed, 148 insertions(+), 103 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 3eb76bcab0b..5dd644942d4 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -715,72 +715,112 @@ void test_gh_5165_syntax_option(const syntax_option_type basic_or_grep) { { const test_regex firstgroup_anchor(&g_regexTester, R"-(\(^meo[wW]\))-", basic_or_grep); firstgroup_anchor.should_search_match("meow_machine", "meow"); + firstgroup_anchor.should_search_fail("^meow_machine"); firstgroup_anchor.should_search_fail("homeowner"); + firstgroup_anchor.should_search_fail("ho^meowner"); } { const test_regex prefixedgroup_anchor(&g_regexTester, 
R"-(.*\(^meo[wW]\))-", basic_or_grep); prefixedgroup_anchor.should_search_match("meow_machine", "meow"); + prefixedgroup_anchor.should_search_fail("^meow_machine"); prefixedgroup_anchor.should_search_fail("homeowner"); + prefixedgroup_anchor.should_search_fail("ho^meowner"); } { const test_regex secondgroup_anchor(&g_regexTester, R"-(\(.*\)\(^meo[wW]\))-", basic_or_grep); secondgroup_anchor.should_search_match("meow_machine", "meow"); + secondgroup_anchor.should_search_fail("^meow_machine"); secondgroup_anchor.should_search_fail("homeowner"); + secondgroup_anchor.should_search_fail("ho^meowner"); } { const test_regex nested_anchor(&g_regexTester, R"-(.*\(^\(^meo[wW]\)\))-", basic_or_grep); nested_anchor.should_search_match("meow_machine", "meow"); + nested_anchor.should_search_fail("^meow_machine"); + nested_anchor.should_search_fail("^^meow_machine"); nested_anchor.should_search_fail("homeowner"); + nested_anchor.should_search_fail("ho^meowner"); + nested_anchor.should_search_fail("ho^^meowner"); } { const test_regex double_carets(&g_regexTester, R"-(.*\(^^meo[wW]\))-", basic_or_grep); double_carets.should_search_fail("meow_machine"); - double_carets.should_search_fail("homeowner"); double_carets.should_search_match("^meow_machine", "^meow"); double_carets.should_search_fail("^^meow_machine"); + double_carets.should_search_fail("homeowner"); double_carets.should_search_fail("ho^meowner"); double_carets.should_search_fail("ho^^meowner"); } + // Validate correct handling of star at the + // beginning of an expression (with or without optional caret). 
g_regexTester.should_match("*", "*", basic_or_grep); + g_regexTester.should_not_match("**", "*", basic_or_grep); + g_regexTester.should_match("****", "**", basic_or_grep); + g_regexTester.should_throw("***", error_badrepeat, basic_or_grep); + + g_regexTester.should_match("*", "^*", basic_or_grep); + g_regexTester.should_not_match("**", "^*", basic_or_grep); + g_regexTester.should_not_match("^*", "^*", basic_or_grep); + g_regexTester.should_match("****", "^**", basic_or_grep); + g_regexTester.should_throw("^***", error_badrepeat, basic_or_grep); + g_regexTester.should_match("*aa", "*a*", basic_or_grep); + g_regexTester.should_match("*a", "*a*", basic_or_grep); + g_regexTester.should_not_match("aa", "*a*", basic_or_grep); g_regexTester.should_not_match("*a*", "*a*", basic_or_grep); - g_regexTester.should_match("*", "^*", basic_or_grep); + g_regexTester.should_match("*aa", "^*a*", basic_or_grep); - g_regexTester.should_match("****", "**", basic_or_grep); - g_regexTester.should_match("****", "^**", basic_or_grep); + g_regexTester.should_not_match("aa", "^*a*", basic_or_grep); g_regexTester.should_not_match("*a*", "^*a*", basic_or_grep); - g_regexTester.should_not_match("^*", "^*", basic_or_grep); + g_regexTester.should_not_match("^*a", "^*a*", basic_or_grep); g_regexTester.should_not_match("^*aa", "^*a*", basic_or_grep); g_regexTester.should_not_match("^*a*", "^*a*", basic_or_grep); + g_regexTester.should_match("*", R"-(\(*\))-", basic_or_grep); + g_regexTester.should_not_match("**", R"-(\(*\))-", basic_or_grep); g_regexTester.should_match("****", R"-(\(**\))-", basic_or_grep); + g_regexTester.should_throw(R"-(\(***\))-", error_badrepeat, basic_or_grep); + + g_regexTester.should_match("*", R"-(\(^*\))-", basic_or_grep); + g_regexTester.should_not_match("**", R"-(\(^*\))-", basic_or_grep); + g_regexTester.should_not_match("^*", R"-(\(^*\))-", basic_or_grep); + g_regexTester.should_match("***", R"-(\(^**\))-", basic_or_grep); + g_regexTester.should_throw(R"-(\(^***\))-", 
error_badrepeat, basic_or_grep); + g_regexTester.should_match("*aa", R"-(\(*a*\))-", basic_or_grep); g_regexTester.should_match("*a", R"-(\(*a*\))-", basic_or_grep); + g_regexTester.should_not_match("aa", R"-(\(*a*\))-", basic_or_grep); g_regexTester.should_not_match("*a*", R"-(\(*a*\))-", basic_or_grep); - g_regexTester.should_match("*", R"-(\(^*\))-", basic_or_grep); - g_regexTester.should_match("***", R"-(\(^**\))-", basic_or_grep); + g_regexTester.should_match("*aa", R"-(\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("aa", R"-(\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("*a*", R"-(\(^*a*\))-", basic_or_grep); g_regexTester.should_not_match("^*a", R"-(\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("^*aa", R"-(\(^*a*\))-", basic_or_grep); g_regexTester.should_not_match("^*a*", R"-(\(^*a*\))-", basic_or_grep); + g_regexTester.should_match("*", R"-(.*\(^*\))-", basic_or_grep); - g_regexTester.should_match("*aa", R"-(.*\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("**", R"-(.*\(^*\))-", basic_or_grep); + g_regexTester.should_not_match("^*", R"-(.*\(^*\))-", basic_or_grep); + g_regexTester.should_match("***", R"-(.*\(^**\))-", basic_or_grep); + g_regexTester.should_throw(R"-(.*\(^***\))-", error_badrepeat, basic_or_grep); + + g_regexTester.should_match("*aa", R"-(\(.*^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("aa", R"-(.*\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("*a*", R"-(.*\(^*a*\))-", basic_or_grep); g_regexTester.should_not_match("^*a", R"-(.*\(^*a*\))-", basic_or_grep); + g_regexTester.should_not_match("^*aa", R"-(.*\(^*a*\))-", basic_or_grep); g_regexTester.should_not_match("^*a*", R"-(.*\(^*a*\))-", basic_or_grep); -} -void test_gh_5165() { - // GH-5165: Revise caret parsing in basic and grep mode - test_gh_5165_syntax_option(basic); - test_gh_5165_syntax_option(grep); - - // test cases specific for basic regular expressions + // Validate that there is no 
special behavior near bars, + // as they are alternation operators in regex modes other than basic or grep. { - const test_regex middle_bar(&g_regexTester, "^a|a", basic); + const test_regex middle_bar(&g_regexTester, "^a|a", basic_or_grep); middle_bar.should_search_match("a|a", "a|a"); middle_bar.should_search_fail("^a|a"); middle_bar.should_search_fail("ba|a"); @@ -788,15 +828,7 @@ void test_gh_5165() { } { - const test_regex middle_nl(&g_regexTester, "^a\na", basic); - middle_nl.should_search_match("a\na", "a\na"); - middle_nl.should_search_fail("ba\na"); - middle_nl.should_search_fail("^a\na"); - middle_nl.should_search_fail("a"); - } - - { - const test_regex group_middle_bar(&g_regexTester, "^\\(a|a\\)", basic); + const test_regex group_middle_bar(&g_regexTester, "^\\(a|a\\)", basic_or_grep); group_middle_bar.should_search_match("a|a", "a|a"); group_middle_bar.should_search_fail("^a|a"); group_middle_bar.should_search_fail("ba|a"); @@ -804,81 +836,104 @@ void test_gh_5165() { } { - const test_regex group_middle_nl(&g_regexTester, "^\\(a\na\\)", basic); - group_middle_nl.should_search_match("a\na", "a\na"); - group_middle_nl.should_search_fail("^a\na"); - group_middle_nl.should_search_fail("ba\na"); - group_middle_nl.should_search_fail("a"); - } - - { - const test_regex middle_bar_with_caret(&g_regexTester, "^a|^b", basic); + const test_regex middle_bar_with_caret(&g_regexTester, "^a|^b", basic_or_grep); middle_bar_with_caret.should_search_match("a|^b", "a|^b"); middle_bar_with_caret.should_search_fail("a|b"); + middle_bar_with_caret.should_search_fail("^a|^b"); + middle_bar_with_caret.should_search_fail("ca|^b"); middle_bar_with_caret.should_search_fail("a"); middle_bar_with_caret.should_search_fail("b"); } { - const test_regex middle_bar_with_nl(&g_regexTester, "^a\n^b", basic); - middle_bar_with_nl.should_search_match("a\n^b", "a\n^b"); - middle_bar_with_nl.should_search_fail("a\nb"); - middle_bar_with_nl.should_search_fail("a"); - 
middle_bar_with_nl.should_search_fail("b"); - } - - { - const test_regex group_middle_bar_with_caret(&g_regexTester, "^\\(a|^b\\)", basic); + const test_regex group_middle_bar_with_caret(&g_regexTester, "^\\(a|^b\\)", basic_or_grep); group_middle_bar_with_caret.should_search_match("a|^b", "a|^b"); group_middle_bar_with_caret.should_search_fail("a|b"); + group_middle_bar_with_caret.should_search_fail("^a|^b"); + group_middle_bar_with_caret.should_search_fail("ca|^b"); group_middle_bar_with_caret.should_search_fail("a"); group_middle_bar_with_caret.should_search_fail("b"); } + g_regexTester.should_match("ab", "a|*b", basic_or_grep); + g_regexTester.should_match("a||b", "a|*b", basic_or_grep); + g_regexTester.should_not_match("a|*b", "a|*b", basic_or_grep); + g_regexTester.should_throw("a|**b", error_badrepeat, basic_or_grep); + + g_regexTester.should_match("ab", "^a|*b", basic_or_grep); + g_regexTester.should_match("a||b", "^a|*b", basic_or_grep); + g_regexTester.should_not_match("a|*b", "^a|*b", basic_or_grep); + g_regexTester.should_throw("^a|**b", error_badrepeat, basic_or_grep); + + g_regexTester.should_match("a|b", "^a|^*b", basic_or_grep); + g_regexTester.should_match("a|^^b", "^a|^*b", basic_or_grep); + g_regexTester.should_not_match("a|*b", "^a|^*b", basic_or_grep); + g_regexTester.should_not_match("a|^*b", "^a|^*b", basic_or_grep); + g_regexTester.should_throw("^a|^**b", error_badrepeat, basic_or_grep); +} + +void test_gh_5165() { + // GH-5165: Revise caret parsing in basic and grep mode + test_gh_5165_syntax_option(basic); + test_gh_5165_syntax_option(grep); + + // test cases specific for basic regular expressions + { + const test_regex middle_nl(&g_regexTester, "^a\na", basic); + middle_nl.should_search_match("a\na", "a\na"); + middle_nl.should_search_fail("^a\na"); + middle_nl.should_search_fail("ba\na"); + middle_nl.should_search_fail("a"); + } + + { + const test_regex group_middle_nl(&g_regexTester, "^\\(a\na\\)", basic); + 
group_middle_nl.should_search_match("a\na", "a\na"); + group_middle_nl.should_search_fail("^a\na"); + group_middle_nl.should_search_fail("ba\na"); + group_middle_nl.should_search_fail("a"); + } + + { + const test_regex middle_nl_with_caret(&g_regexTester, "^a\n^b", basic); + middle_nl_with_caret.should_search_match("a\n^b", "a\n^b"); + middle_nl_with_caret.should_search_fail("a\nb"); + middle_nl_with_caret.should_search_fail("^a\n^b"); + middle_nl_with_caret.should_search_fail("ca\n^b"); + middle_nl_with_caret.should_search_fail("a"); + middle_nl_with_caret.should_search_fail("b"); + } + { const test_regex group_middle_nl_with_caret(&g_regexTester, "^\\(a\n^b\\)", basic); group_middle_nl_with_caret.should_search_match("a\n^b", "a\n^b"); group_middle_nl_with_caret.should_search_fail("a\nb"); + group_middle_nl_with_caret.should_search_fail("^a\n^b"); + group_middle_nl_with_caret.should_search_fail("ca\n^b"); group_middle_nl_with_caret.should_search_fail("a"); group_middle_nl_with_caret.should_search_fail("b"); } - g_regexTester.should_match("a||b", "a|*b", basic); - g_regexTester.should_not_match("a|*b", "a|*b", basic); - g_regexTester.should_match("a||b", "^a|*b", basic); - g_regexTester.should_not_match("a|*b", "^a|^*b", basic); - g_regexTester.should_match("a|^^b", "^a|^*b", basic); - g_regexTester.should_not_match("a|^*b", "^a|^*b", basic); + g_regexTester.should_match("ab", "a\n*b", basic); g_regexTester.should_match("a\n\nb", "a\n*b", basic); g_regexTester.should_not_match("a\n*b", "a\n*b", basic); g_regexTester.should_match("a\n\nb", "^a\n*b", basic); - g_regexTester.should_not_match("a\n*b", "^a\n^*b", basic); + g_regexTester.should_throw("^a\n**b", error_badrepeat, basic); + + g_regexTester.should_match("a\nb", "^a\n^*b", basic); g_regexTester.should_match("a\n^^b", "^a\n^*b", basic); + g_regexTester.should_not_match("a\n*b", "^a\n^*b", basic); g_regexTester.should_not_match("a\n^*b", "^a\n^*b", basic); + g_regexTester.should_throw("^a\n^**b", 
error_badrepeat, basic); // test cases specific for grep mode - { - const test_regex middle_bar(&g_regexTester, "^a|a", grep); - middle_bar.should_search_match("a|a", "a|a"); - middle_bar.should_search_fail("^a|a"); - middle_bar.should_search_fail("ba|a"); - middle_bar.should_search_fail("a"); - } - { const test_regex middle_nl(&g_regexTester, "^a\na", grep); middle_nl.should_search_match("a\na", "a"); - middle_nl.should_search_match("ba\na", "a"); middle_nl.should_search_match("^a\na", "a"); + middle_nl.should_search_match("ba\na", "a"); middle_nl.should_search_match("a", "a"); - } - - { - const test_regex group_middle_bar(&g_regexTester, "^\\(a|a\\)", grep); - group_middle_bar.should_search_match("a|a", "a|a"); - group_middle_bar.should_search_fail("^a|a"); - group_middle_bar.should_search_fail("ba|a"); - group_middle_bar.should_search_fail("a"); + middle_nl.should_search_fail("b"); } { @@ -892,33 +947,18 @@ void test_gh_5165() { } { - const test_regex middle_bar_with_caret(&g_regexTester, "^a|^b", grep); - middle_bar_with_caret.should_search_match("a|^b", "a|^b"); - middle_bar_with_caret.should_search_fail("a|b"); - middle_bar_with_caret.should_search_fail("a"); - middle_bar_with_caret.should_search_fail("b"); - } - - { - const test_regex middle_bar_with_nl(&g_regexTester, "^a\n^b", grep); - middle_bar_with_nl.should_search_match("a\n^b", "a"); - middle_bar_with_nl.should_search_match("a\nb", "a"); - middle_bar_with_nl.should_search_match("ab", "a"); - middle_bar_with_nl.should_search_match("a", "a"); - middle_bar_with_nl.should_search_match("b", "b"); - middle_bar_with_nl.should_search_match("ba", "b"); - middle_bar_with_nl.should_search_fail("^a"); - middle_bar_with_nl.should_search_fail("^b"); - middle_bar_with_nl.should_search_fail("ca"); - middle_bar_with_nl.should_search_fail("cb"); - } - - { - const test_regex group_middle_bar_with_caret(&g_regexTester, "^\\(a|^b\\)", grep); - group_middle_bar_with_caret.should_search_match("a|^b", "a|^b"); - 
group_middle_bar_with_caret.should_search_fail("a|b"); - group_middle_bar_with_caret.should_search_fail("a"); - group_middle_bar_with_caret.should_search_fail("b"); + const test_regex middle_nl_with_caret(&g_regexTester, "^a\n^b", grep); + middle_nl_with_caret.should_search_match("a\n^b", "a"); + middle_nl_with_caret.should_search_match("a\nb", "a"); + middle_nl_with_caret.should_search_match("ab", "a"); + middle_nl_with_caret.should_search_match("a", "a"); + middle_nl_with_caret.should_search_match("b", "b"); + middle_nl_with_caret.should_search_match("ba", "b"); + middle_nl_with_caret.should_search_fail("^a"); + middle_nl_with_caret.should_search_fail("ca"); + middle_nl_with_caret.should_search_fail("^b"); + middle_nl_with_caret.should_search_fail("ca"); + middle_nl_with_caret.should_search_fail("cb"); } { @@ -927,33 +967,38 @@ void test_gh_5165() { const test_regex group_middle_nl_with_caret(&g_regexTester, "^\\(a\n^b\\)", grep); group_middle_nl_with_caret.should_search_match("a\n^b", "a\n^b"); group_middle_nl_with_caret.should_search_fail("a\nb"); + group_middle_nl_with_caret.should_search_fail("^a\n^b"); + group_middle_nl_with_caret.should_search_fail("ca\n^b"); group_middle_nl_with_caret.should_search_fail("a"); group_middle_nl_with_caret.should_search_fail("b"); } - g_regexTester.should_match("a||b", "a|*b", grep); - g_regexTester.should_not_match("a|*b", "a|*b", grep); - g_regexTester.should_throw("a|**b", error_badrepeat, grep); - g_regexTester.should_match("a||b", "^a|*b", grep); - g_regexTester.should_not_match("a|*b", "^a|^*b", grep); - g_regexTester.should_match("a|^^b", "^a|^*b", grep); - g_regexTester.should_not_match("a|^*b", "^a|^*b", grep); - g_regexTester.should_throw("^a|**b", error_badrepeat, grep); + g_regexTester.should_not_match("ab", "a\n*b", grep); g_regexTester.should_not_match("a\n\nb", "a\n*b", grep); g_regexTester.should_not_match("a\n*b", "a\n*b", grep); g_regexTester.should_match("a", "a\n*b", grep); g_regexTester.should_match("*b", 
"a\n*b", grep); + g_regexTester.should_match("a", "a\n**b", grep); + g_regexTester.should_match("***b", "a\n**b", grep); + + g_regexTester.should_not_match("ab", "^a\n*b", grep); g_regexTester.should_not_match("a\n\nb", "^a\n*b", grep); g_regexTester.should_not_match("a\n*b", "^a\n^*b", grep); g_regexTester.should_match("a", "^a\n*b", grep); g_regexTester.should_match("*b", "^a\n*b", grep); g_regexTester.should_match("a", "^a\n**b", grep); g_regexTester.should_match("****b", "^a\n**b", grep); - g_regexTester.should_not_match("a\n^^b", "^a\n^*b", grep); + g_regexTester.should_not_match("a\nb", "^a\n^*b", grep); + g_regexTester.should_not_match("a\n^^b", "^a\n^*b", grep); + g_regexTester.should_not_match("a\n*b", "^a\n^*b", grep); + g_regexTester.should_not_match("a\n^*b", "^a\n^*b", grep); g_regexTester.should_not_match("^*b", "^a\n^*b", grep); g_regexTester.should_match("a", "^a\n^*b", grep); g_regexTester.should_match("*b", "^a\n^*b", grep); + g_regexTester.should_not_match("**b", "^a\n^*b", grep); + g_regexTester.should_match("a", "^a\n^**b", grep); + g_regexTester.should_match("****b", "^a\n^**b", grep); } int main() {