Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make (*THEN) work - first draft #22215

Draft
wants to merge 14 commits into
base: blead
Choose a base branch
from
1 change: 1 addition & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,7 @@ Fergal Daly <fergal@esatclear.ie>
Fingle Nark <finglenark@gmail.com>
Firas Khalil Khana <firasuke@gmail.com>
Florent Guillaume
Florian Pensec <pensec.florian@gmail.com>
Florian Ragwitz <rafl@debian.org>
Florian Weimer <fweimer@redhat.com>
Frank Crawford
Expand Down
8 changes: 8 additions & 0 deletions regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1681,6 +1681,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
RExC_seen = 0;
RExC_maxlen = 0;
RExC_in_lookaround = 0;
RExC_has_cutgroup = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_recode_x_to_native = 0;
RExC_in_multi_char_class = 0;
Expand Down Expand Up @@ -3093,6 +3094,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
I32 num; /* numeric backreferences */
SV * max_open; /* Max number of unclosed parens */
I32 was_in_lookaround = RExC_in_lookaround;
I32 had_cutgroup = RExC_has_cutgroup;
I32 fake_eval = 0; /* matches paren */

/* The difference between the following variables can be seen with *
Expand Down Expand Up @@ -3259,6 +3261,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
if ( memEQs(start_verb, verb_len,"THEN") ) {
op = CUTGROUP;
RExC_seen |= REG_CUTGROUP_SEEN;
RExC_has_cutgroup = 1;
}
break;
case 'a':
Expand Down Expand Up @@ -4247,11 +4250,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
reginsert(pRExC_state, BRANCHJ, br, depth+1);
ARG2a_SET(REGNODE_p(br), npar_before_regbranch);
ARG2b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
BRANCH_HAS_CUTGROUP(REGNODE_p(br)) = RExC_has_cutgroup;
}
else {
reginsert(pRExC_state, BRANCH, br, depth+1);
ARG1a_SET(REGNODE_p(br), (U16)npar_before_regbranch);
ARG1b_SET(REGNODE_p(br), (U16)RExC_npar - 1);
BRANCH_HAS_CUTGROUP(REGNODE_p(br)) = RExC_has_cutgroup;
}
have_branch = 1;
}
Expand Down Expand Up @@ -4301,13 +4306,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
ARG1b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
else
ARG2b_SET(REGNODE_p(lastbr),ARG1a(REGNODE_p(br)));
BRANCH_HAS_CUTGROUP(REGNODE_p(br)) = RExC_has_cutgroup;
}
else
if (OP(REGNODE_p(br)) == BRANCHJ) {
if (OP(REGNODE_p(lastbr)) == BRANCH)
ARG1b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
else
ARG2b_SET(REGNODE_p(lastbr),ARG2a(REGNODE_p(br)));
BRANCH_HAS_CUTGROUP(REGNODE_p(br)) = RExC_has_cutgroup;
}

lastbr = br;
Expand Down Expand Up @@ -4512,6 +4519,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
RExC_logical_npar = after_freeze;

RExC_in_lookaround = was_in_lookaround;
RExC_has_cutgroup = had_cutgroup;

return(ret);
}
Expand Down
6 changes: 6 additions & 0 deletions regcomp.h
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,11 @@ struct regnode_ssc {

#define PARNO(p) ARG1u(p) /* APPLIES for OPEN and CLOSE only */

/* Only for BRANCH and BRANCHJ. Currently, those nodes don't use the flags field,
 * so it is repurposed as a bool; the flags field is written to 1 byte at a time.
 * The macro expands directly to FLAGS(p), so it can be used both to read the
 * value and as the target of an assignment.
 */
#define BRANCH_HAS_CUTGROUP(p) FLAGS(p)

#define NODE_ALIGN_FILL(node) (FLAGS(node) = 0)

/* Long strings. Currently limited to length 18 bits, which handles a 262000
Expand Down Expand Up @@ -1220,6 +1225,7 @@ typedef struct {
* zzz|abc|ab/ after matching the chars abc, the
* accepted word is #2, and the previous accepted
* word is #3 */
U8 has_cutgroup;
U32 len; /* how many chars long is this word? */
U32 accept; /* accept state for this word */
} reg_trie_wordinfo;
Expand Down
2 changes: 2 additions & 0 deletions regcomp_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,7 @@ struct RExC_state_t {
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
I32 in_lookaround;
I32 has_cutgroup;
I32 contains_locale;
I32 override_recoding;
I32 recode_x_to_native;
Expand Down Expand Up @@ -238,6 +239,7 @@ struct RExC_state_t {
#define RExC_study_chunk_recursed_bytes \
(pRExC_state->study_chunk_recursed_bytes)
#define RExC_in_lookaround (pRExC_state->in_lookaround)
#define RExC_has_cutgroup (pRExC_state->has_cutgroup)
#define RExC_contains_locale (pRExC_state->contains_locale)
#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)

Expand Down
9 changes: 5 additions & 4 deletions regcomp_trie.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,7 +449,7 @@ is the recommended Unicode-aware way of saying
TRIE_LIST_LEN( state ) = 4; \
} STMT_END

#define TRIE_HANDLE_WORD(state) STMT_START { \
#define TRIE_HANDLE_WORD(state, has_cutgroup_arg) STMT_START { \
U16 dupe= trie->states[ state ].wordnum; \
regnode * const noper_next = regnext( noper ); \
\
Expand All @@ -467,6 +467,7 @@ is the recommended Unicode-aware way of saying
trie->wordinfo[curword].prev = 0; \
trie->wordinfo[curword].len = wordlen; \
trie->wordinfo[curword].accept = state; \
trie->wordinfo[curword].has_cutgroup = (has_cutgroup_arg); \
\
if ( noper_next < tail ) { \
if (!trie->jump) { \
Expand Down Expand Up @@ -985,8 +986,8 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
*/
noper= REGNODE_AFTER(cur);
}
TRIE_HANDLE_WORD(state);

TRIE_HANDLE_WORD(state, BRANCH_HAS_CUTGROUP(cur));
} /* end second pass */

/* next alloc is the NEXT state to be allocated */
Expand Down Expand Up @@ -1199,7 +1200,7 @@ Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
noper= REGNODE_AFTER(cur);
}
accept_state = TRIE_NODENUM( state );
TRIE_HANDLE_WORD(accept_state);
TRIE_HANDLE_WORD(accept_state, BRANCH_HAS_CUTGROUP(cur));

} /* end second pass */

Expand Down
24 changes: 20 additions & 4 deletions regexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -6483,7 +6483,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
SV *sv_yes_mark = NULL; /* last mark name we have seen
during a successful match */
U32 lastopen = 0; /* last open we saw */
bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0;
bool has_cutgroup;
SV* const oreplsv = GvSVn(PL_replgv);
/* these three flags are set by various ops to signal information to
* the very next op. They have a useful lifetime of exactly one loop
Expand Down Expand Up @@ -6868,6 +6868,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
goto trie_first_try; /* jump into the fail handler */
}}
NOT_REACHED; /* NOTREACHED */

case TRIE_next:
sayYES;
NOT_REACHED; /* NOTREACHED */

case TRIE_next_fail: /* we failed - try next alternative */
{
Expand Down Expand Up @@ -6997,13 +7001,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
);
});

if ( ST.accepted > 1 || has_cutgroup || ST.jump ) {
if ( ST.accepted > 1 || ST.jump ) {
if (RE_PESSIMISTIC_PARENS) {
(void)regcppush(rex, 0, maxopenparen);
REGCP_SET(ST.lastcp);
}
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
script_run_begin);

reg_trie_data * const trie = (reg_trie_data*)rexi->data->data[ARG1u(ST.me)];
has_cutgroup = trie->wordinfo[ST.nextword].has_cutgroup;

if (has_cutgroup) {
PUSH_YES_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
script_run_begin);
}
else {
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol,
script_run_begin);
}

NOT_REACHED; /* NOTREACHED */
}
/* only one choice left - just continue */
Expand Down Expand Up @@ -9084,6 +9099,7 @@ NULL
ST.before_paren = ARG1a(scan);
ST.after_paren = ARG1b(scan);
branch_logic:
has_cutgroup = BRANCH_HAS_CUTGROUP(scan);
scan = REGNODE_AFTER_opcode(scan,state_num); /* scan now points to inner node */
assert(scan);
ST.lastparen = RXp_LASTPAREN(rex);
Expand Down
14 changes: 14 additions & 0 deletions t/re/pat_advanced.t
Original file line number Diff line number Diff line change
Expand Up @@ -2115,6 +2115,20 @@ EOP
qr/(A (.*) (?{ $count_b++ }) (CE*){0,6} (*THEN) | A D) z/x,
1,
],
[
0,
"abcde",
qr/ \A a (b (?{ $count_a++ }) | . (?{ $count_a++ })) (*THEN) (*FAIL) /x,
qr/ \A a (b (?{ $count_b++ }) | . (?{ $count_b++ })) (*THEN) z /x,
1,
],
[
1,
"abcde",
qr/ \A a (b (?{ $count_a++ }) | . (?{ $count_a++ })) (*THEN) (*FAIL) | \A (?{ $count_a++ }) a /x,
qr/ \A a (b (?{ $count_b++ }) | . (?{ $count_b++ })) (*THEN) z | \A (?{ $count_b++ }) a /x,
2,
],
) {
$c++;
$count_a = 0;
Expand Down
30 changes: 30 additions & 0 deletions t/re/re_tests
Original file line number Diff line number Diff line change
Expand Up @@ -2164,6 +2164,36 @@ AB\s+\x{100} AB \x{100}X y - -
[^\W\S]* a y $&
[^\W\S]? a y $&

# (*THEN) tests
/\A(?:.|..)(*THEN)c/ abc n - -

/ \A a (?: (?: b | bc (*ACCEPT) ) (*THEN) (*FAIL) | bcd ) /x abcde y $& abcd
/ \A a (?: (?: (??{""}) b | bc (*ACCEPT) ) (*THEN) (*FAIL) | bcd ) /x abcde y $& abcd

/ \A (?: abd | ab (?: c | (*ACCEPT)) (*THEN) z | abcd ) /x abcde y $& abcd
/ \A (?: (??{""}) abd | ab (?: c | (*ACCEPT)) (*THEN) z | abcd ) /x abcde y $& abcd

/ \A (?: abd | ab (?: c | (*ACCEPT)) (*THEN) z | a ) /x abcde y $& a
/ \A (?: abd | (??{""}) ab (?: c | (*ACCEPT)) (*THEN) z | a ) /x abcde y $& a

# testing that backtracking can happen in between the start of a branch and the cutgroup as long as the control flow doesn't yet go past the cutgroup
/ \A a (?: (?: b | bc (*ACCEPT) ) (?: z | (*ACCEPT)) (*THEN) (*FAIL) | bcd ) /x abcde y $& ab
/ \A a (?: (?: (??{""}) b | bc (*ACCEPT) ) (?: z | (*ACCEPT)) (*THEN) (*FAIL) | bcd ) /x abcde y $& ab

/ \A a (?: (?: b | bc (*ACCEPT) ) (?: . | (*FAIL)) (*THEN) (*FAIL) | bcd ) /x abcde y $& abcd
/ \A a (?: (?: (??{""}) b | bc (*ACCEPT) ) (?: . | (*FAIL)) (*THEN) (*FAIL) | bcd ) /x abcde y $& abcd

# testing that the cut leads to a backtracking to the right branch
/ \A a (?: (?: b | bc (*ACCEPT) ) (?: . | cd (*ACCEPT) ) (*THEN) (*FAIL) | bcde ) /x abcde y $& abcde
/ \A a (?: (?: (??{""}) b | bc (*ACCEPT) ) (?: . | cd (*ACCEPT) ) (*THEN) (*FAIL) | bcde ) /x abcde y $& abcde

# testing the cut doesn't remove the choice point of its innermost outer branch, when there is no additional choice point between the start of a branch and the choice point
/ \A (?: a z | abc ) /x abcde y $& abc
/ \A (?: a (*THEN) z | abc ) /x abcde y $& abc
/ \A (?: (??{""}) a z | abc ) /x abcde y $& abc
/ \A (?: (??{""}) a (*THEN) z | abc ) /x abcde y $& abc


# Keep these lines at the end of the file
# pat string y/n/etc expr expected-expr skip-reason comment
# vim: softtabstop=0 noexpandtab
Loading