From e2c6bc179531f55062e9c68dd1b53b0bdc770b65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20M=C3=BCller?= Date: Sat, 18 Oct 2025 21:13:59 +0200 Subject: [PATCH 1/5] `<regex>`: Process greedy simple loops non-recursively --- stl/inc/regex | 134 ++++++------------ .../std/tests/VSO_0000000_regex_use/test.cpp | 79 +++++++++++ 2 files changed, 122 insertions(+), 91 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index d886365a2f3..3253a858212 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -1681,6 +1681,7 @@ enum class _Rx_unwind_ops { _Disjunction_eval_alt_always, _Do_nothing, _Loop_simple_nongreedy, + _Loop_simple_greedy, }; template @@ -1815,7 +1816,6 @@ private: void _Decrease_stack_usage_count(); void _Increase_complexity_count(); - bool _Do_rep0(_Node_rep*); bool _Do_rep(_Node_rep*, bool, int); void _Prepare_rep(_Node_rep*); bool _Find_first_inner_capture_group(_Node_base*, _Loop_vals_v2_t*); @@ -3413,72 +3413,6 @@ void _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Increase_complexity_coun } } -template -bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep0(_Node_rep* _Node) { - // apply repetition to loop with no nested if/do - int _Ix = _Node->_Min; - const size_t _Frame_idx = _Loop_vals[_Node->_Loop_number]._Loop_frame_idx; - _Loop_vals[_Node->_Loop_number]._Loop_idx = _Ix + 2; - - _Tgt_state_t<_It> _Final; - bool _Matched0 = false; - _It _Saved_pos = _Tgt_state._Cur; - bool _Done = false; - - if (_Match_pat(_Node->_End_rep->_Next)) { - // record an acceptable match and continue - _Final = _Tgt_state; - _Matched0 = true; - } - - if (_Ix == 0 && _Node->_Max != 0) { - _Tgt_state._Cur = _Saved_pos; - _Tgt_state._Grp_valid = _Frames[_Frame_idx]._Match_state._Grp_valid; - - if (!_Match_pat(_Node->_Next)) { // rep match failed, we are done - _Done = true; - } else if (_Saved_pos == _Tgt_state._Cur) { // match empty, try no more repetitions - _Done = true; - // we only potentially accept/try tail for POSIX - if ((_Sflags & 
regex_constants::_Any_posix) && _Match_pat(_Node->_End_rep->_Next)) { - return true; // go with current match - } - } else { - _Saved_pos = _Tgt_state._Cur; - if (_Match_pat(_Node->_End_rep->_Next)) { - // record match and continue - _Final = _Tgt_state; - _Matched0 = true; - } - } - _Ix = 1; - } - - if (!_Done) { - while (_Node->_Max == -1 || _Ix++ < _Node->_Max) { // try another rep/tail match - _Tgt_state._Cur = _Saved_pos; - _Tgt_state._Grp_valid = _Frames[_Frame_idx]._Match_state._Grp_valid; - if (!_Match_pat(_Node->_Next) || _Tgt_state._Cur == _Saved_pos) { - break; // rep match failed, quit loop - } - - // since loop is branchless, empty rep match is not possible at this point - _Saved_pos = _Tgt_state._Cur; - if (_Match_pat(_Node->_End_rep->_Next)) { - // record match and continue - _Final = _Tgt_state; - _Matched0 = true; - } - } - } - - if (_Matched0) { // record final match - _Tgt_state = _Final; - } - - return _Matched0; -} - template bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Do_rep(_Node_rep* _Node, bool _Greedy, int _Init_idx) { // apply repetition @@ -4117,7 +4051,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N break; case _N_rep: - { + { // handle start of loop auto _Node = static_cast<_Node_rep*>(_Nx); _Prepare_rep(_Node); bool _Greedy = (_Node->_Flags & _Fl_greedy) != 0; @@ -4125,14 +4059,16 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N if (_Node->_Simple_loop == 1) { auto& _Sav = _Loop_vals[_Node->_Loop_number]; _Sav._Loop_frame_idx = _Push_frame(_Rx_unwind_ops::_Do_nothing); - if (_Node->_Min > 0) { // try to match a rep - _Increase_complexity_count(); + _Increase_complexity_count(); + if (_Node->_Min > 0 || (_Greedy && !_Longest && _Node->_Max != 0)) { // try a rep first _Sav._Loop_idx = 1; // _Next is already assigned correctly for matching a rep - } else if (!_Greedy || _Longest) { // non-greedy matching - _Increase_complexity_count(); - // try tail 
first + // set up stack unwinding for greedy matching if no rep is allowed + if (_Node->_Min == 0) { + _Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Node); + } + } else { // try tail first _Sav._Loop_idx = 0; _Next = _Node->_End_rep->_Next; @@ -4140,9 +4076,6 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N if (_Node->_Max != 0) { _Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Node); } - } else { - _Failed = !_Do_rep0(_Node); - _Next = nullptr; } } else { _Failed = !_Do_rep(_Node, _Greedy, 0); @@ -4153,7 +4086,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N break; case _N_end_rep: - { + { // handle end of loop _Node_rep* _Nr = static_cast<_Node_end_rep*>(_Nx)->_Begin_rep; auto& _Sav = _Loop_vals[_Nr->_Loop_number]; bool _Greedy = (_Nr->_Flags & _Fl_greedy) != 0; @@ -4163,31 +4096,36 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N == _Frames[_Sav._Loop_frame_idx]._Match_state._Cur) { // initial match empty // loop is branchless, so it will only ever match empty strings // -> we only try tail for POSIX or if minimum number of reps is non-zero - if ((_Sflags & regex_constants::_Any_posix) || _Nr->_Min > 0) { - _Increase_complexity_count(); - // _Next is already assigned correctly for matching tail - } else { + // _Next is already assigned correctly for matching tail + + if (!(_Sflags & regex_constants::_Any_posix) && _Nr->_Min == 0) { _Failed = true; } } else if (_Sav._Loop_idx < _Nr->_Min) { // at least one more rep to reach minimum - _Increase_complexity_count(); - _Next = _Nr->_Next; // GH-5365: We have to reset the capture groups from the second iteration on. 
_Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid; ++_Sav._Loop_idx; - } else if (_Longest || !_Greedy) { - _Increase_complexity_count(); + } else if (_Greedy && !_Longest && _Sav._Loop_idx != _Nr->_Max) { // one more rep to try next + // set up stack unwinding for greedy matching + _Push_frame(_Rx_unwind_ops::_Loop_simple_greedy, _Nr); + + _Next = _Nr->_Next; + // GH-5365: We have to reset the capture groups from the second iteration on. + _Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid; + if (_Sav._Loop_idx < INT_MAX) { // avoid overfloading _Loop_idx + ++_Sav._Loop_idx; + } + } else { // non-greedy matching or greedy matching with maximum reached // set up stack unwinding for non-greedy matching if one more rep is allowed if (_Sav._Loop_idx != _Nr->_Max) { _Push_frame(_Rx_unwind_ops::_Loop_simple_nongreedy, _Nr); } // _Next is already assigned correctly for matching tail - } else if (_Sav._Loop_idx == _Nr->_Min) { // greedy and minimum number of reps reached - _Failed = !_Do_rep0(_Nr); - _Next = nullptr; - } else { // internal _Match_pat(_Node->_Next) call in _Do_rep0() - _Next = nullptr; + } + + if (!_Failed) { + _Increase_complexity_count(); } } else { _Failed = !_Do_rep(_Nr, _Greedy, _Sav._Loop_idx); @@ -4297,6 +4235,20 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N } break; + case _Rx_unwind_ops::_Loop_simple_greedy: + // try tail if matching one more rep failed + if (_Failed) { + auto _Node = static_cast<_Node_rep*>(_Frame._Node); + auto& _Sav = _Loop_vals[_Node->_Loop_number]; + + _Increase_complexity_count(); + _Nx = _Node->_End_rep->_Next; + _Tgt_state._Cur = _Frame._Match_state._Cur; + _Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid; + _Failed = false; + } + break; + default: #if _ITERATOR_DEBUG_LEVEL != 0 _STL_REPORT_ERROR("internal stack of regex matcher corrupted"); @@ -5299,7 +5251,7 @@ void _Parser2<_FwdIt, _Elem, 
_RxTraits>::_Calculate_loop_simplicity( break; case _N_rep: // _Node_rep inside another _Node_rep makes both not simple if _Outer_rep can be repeated more than once - // because _Matcher3::_Do_rep0() does not reset capture group boundaries when control is returned to it. + // because the matcher does not reset capture group boundaries when handling simple loops. // If _Outer_rep can repeat at most once, we have to analyze the structure of the inner loop. if (_Outer_rep) { _Outer_rep->_Simple_loop = 0; diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index b4fc9b2f8fa..5d61de5856a 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -2143,6 +2143,84 @@ void test_gh_5774() { g_regexTester.should_match("aaab", "a{1,3}?b"); } +void test_gh_5790() { + // GH-5790: Process greedy simple loops non-recursively. + // This extends our test coverage on (mainly greedy) simple loops. + g_regexTester.should_not_match("", "a+"); + g_regexTester.should_not_match("ab", "a{0}b"); + g_regexTester.should_match("ab", "a{0,1}b"); + g_regexTester.should_not_match("aab", "a{0,1}b"); + g_regexTester.should_match("aab", "a{0,2}b"); + g_regexTester.should_match("aab", "a{1,2}b"); + g_regexTester.should_not_match("aab", "a{1}b"); + g_regexTester.should_not_match("aaab", "a{1,2}b"); + g_regexTester.should_match("aaab", "a{1,3}b"); + + // Check that greedy and non-greedy search find the appropriate match. + // for the following regexes, greedy and leftmost-longest search yield the same matches. 
+ for (syntax_option_type options : {ECMAScript, extended}) { + { + test_regex greedy_a_star(&g_regexTester, "a*", options); + greedy_a_star.should_search_match("aaaaaaaaaa", "aaaaaaaaaa"); + } + + { + test_regex bounded_greedy_a_rep(&g_regexTester, "a{5}", options); + bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa"); + } + + { + test_regex upper_bounded_greedy_a_rep(&g_regexTester, "a{0,5}", options); + upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa"); + } + + { + test_regex lower_bounded_greedy_a_rep(&g_regexTester, "a{4,1000}", options); + lower_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaaaaaaa"); + } + + { + test_regex lower_and_upper_bounded_greedy_a_rep(&g_regexTester, "a{2,5}", options); + lower_and_upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa"); + } + + { + test_regex lower_and_upper_bounded_greedy_a_rep(&g_regexTester, "a{2,5}", options); + lower_and_upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa"); + } + + { + test_regex too_large_min_greedy_a_rep(&g_regexTester, "a{11,1000}"); + too_large_min_greedy_a_rep.should_search_fail("aaaaaaaaaa"); + } + } + + { + test_regex nongreedy_a_star(&g_regexTester, "a*?"); + nongreedy_a_star.should_search_match("aaaaaaaaaa", ""); + } + + { + test_regex bounded_nongreedy_a_rep(&g_regexTester, "a{5}?"); + bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa"); + } + + { + test_regex upper_bounded_nongreedy_a_rep(&g_regexTester, "a{0,5}?"); + upper_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", ""); + } + + { + test_regex lower_bounded_nongreedy_a_rep(&g_regexTester, "a{4,1000}?"); + lower_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aaaa"); + } + + { + test_regex too_large_min_nongreedy_a_rep(&g_regexTester, "a{11,1000}?"); + too_large_min_nongreedy_a_rep.should_search_fail("aaaaaaaaaa"); + } +} + int main() { test_dev10_449367_case_insensitivity_should_work(); 
test_dev11_462743_regex_collate_should_not_disable_regex_icase(); @@ -2195,6 +2273,7 @@ int main() { test_gh_5576(); test_gh_5672(); test_gh_5774(); + test_gh_5790(); return g_regexTester.result(); } From e34170617a3964d33223491f11b80bea32ebada0 Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 20 Oct 2025 13:13:57 -0700 Subject: [PATCH 2/5] Comment cleanups. --- stl/inc/regex | 2 +- tests/std/tests/VSO_0000000_regex_use/test.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/stl/inc/regex b/stl/inc/regex index 3253a858212..adc8749ad35 100644 --- a/stl/inc/regex +++ b/stl/inc/regex @@ -4113,7 +4113,7 @@ bool _Matcher3<_BidIt, _Elem, _RxTraits, _It, _Alloc>::_Match_pat(_Node_base* _N _Next = _Nr->_Next; // GH-5365: We have to reset the capture groups from the second iteration on. _Tgt_state._Grp_valid = _Frames[_Sav._Loop_frame_idx]._Match_state._Grp_valid; - if (_Sav._Loop_idx < INT_MAX) { // avoid overfloading _Loop_idx + if (_Sav._Loop_idx < INT_MAX) { // avoid overflowing _Loop_idx ++_Sav._Loop_idx; } } else { // non-greedy matching or greedy matching with maximum reached diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 5d61de5856a..2f2c8e0d4ea 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -2157,7 +2157,7 @@ void test_gh_5790() { g_regexTester.should_match("aaab", "a{1,3}b"); // Check that greedy and non-greedy search find the appropriate match. - // for the following regexes, greedy and leftmost-longest search yield the same matches. + // For the following regexes, greedy and leftmost-longest search yield the same matches. for (syntax_option_type options : {ECMAScript, extended}) { { test_regex greedy_a_star(&g_regexTester, "a*", options); From f0f2943084602e00fa66b0e04c60acef48650d39 Mon Sep 17 00:00:00 2001 From: "Stephan T. 
Lavavej" Date: Mon, 20 Oct 2025 13:33:22 -0700 Subject: [PATCH 3/5] Add missing `options`. --- tests/std/tests/VSO_0000000_regex_use/test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 2f2c8e0d4ea..a29e3245e63 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -2190,7 +2190,7 @@ void test_gh_5790() { } { - test_regex too_large_min_greedy_a_rep(&g_regexTester, "a{11,1000}"); + test_regex too_large_min_greedy_a_rep(&g_regexTester, "a{11,1000}", options); too_large_min_greedy_a_rep.should_search_fail("aaaaaaaaaa"); } } From 142b9c1b3827e8c72d7312d62713ed9c11b73fba Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 20 Oct 2025 13:41:47 -0700 Subject: [PATCH 4/5] Drop duplicated test. --- tests/std/tests/VSO_0000000_regex_use/test.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index a29e3245e63..39fe35631e3 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -2184,11 +2184,6 @@ void test_gh_5790() { lower_and_upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa"); } - { - test_regex lower_and_upper_bounded_greedy_a_rep(&g_regexTester, "a{2,5}", options); - lower_and_upper_bounded_greedy_a_rep.should_search_match("aaaaaaaaaa", "aaaaa"); - } - { test_regex too_large_min_greedy_a_rep(&g_regexTester, "a{11,1000}", options); too_large_min_greedy_a_rep.should_search_fail("aaaaaaaaaa"); From 84dc9aea7adbd11804532a3e26d7d68ea658867d Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Mon, 20 Oct 2025 13:32:40 -0700 Subject: [PATCH 5/5] Add test cases. 
--- tests/std/tests/VSO_0000000_regex_use/test.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/std/tests/VSO_0000000_regex_use/test.cpp b/tests/std/tests/VSO_0000000_regex_use/test.cpp index 39fe35631e3..90fe42f4c39 100644 --- a/tests/std/tests/VSO_0000000_regex_use/test.cpp +++ b/tests/std/tests/VSO_0000000_regex_use/test.cpp @@ -2133,6 +2133,7 @@ void test_gh_5774() { // GH-5774: Process non-greedy and longest-mode simple loops non-recursively. // This extends our test coverage on non-greedy simple loops with bounded number of repetitions. g_regexTester.should_not_match("", "a+?"); + g_regexTester.should_match("b", "a{0}?b"); g_regexTester.should_not_match("ab", "a{0}?b"); g_regexTester.should_match("ab", "a{0,1}?b"); g_regexTester.should_not_match("aab", "a{0,1}?b"); @@ -2147,6 +2148,7 @@ void test_gh_5790() { // GH-5790: Process greedy simple loops non-recursively. // This extends our test coverage on (mainly greedy) simple loops. g_regexTester.should_not_match("", "a+"); + g_regexTester.should_match("b", "a{0}b"); g_regexTester.should_not_match("ab", "a{0}b"); g_regexTester.should_match("ab", "a{0,1}b"); g_regexTester.should_not_match("aab", "a{0,1}b"); @@ -2210,6 +2212,11 @@ void test_gh_5790() { lower_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aaaa"); } + { + test_regex lower_and_upper_bounded_nongreedy_a_rep(&g_regexTester, "a{2,5}?"); + lower_and_upper_bounded_nongreedy_a_rep.should_search_match("aaaaaaaaaa", "aa"); + } + { test_regex too_large_min_nongreedy_a_rep(&g_regexTester, "a{11,1000}?"); too_large_min_nongreedy_a_rep.should_search_fail("aaaaaaaaaa");