Skip to content

Shrink struct RExC_state_t #23441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 23 additions & 23 deletions regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
*plen_p = d - dst;
*pat_p = (char*) dst;
SAVEFREEPV(*pat_p);
RExC_orig_utf8 = RExC_utf8 = 1;
RExC_orig_utf8 = RExC_utf8 = true;
}


Expand Down Expand Up @@ -1602,7 +1602,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
}

/* ignore the utf8ness if the pattern is 0 length */
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? false : cBOOL(SvUTF8(pat));
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);


Expand Down Expand Up @@ -1637,7 +1637,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,

if ( old_re
&& !recompile
&& cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8)
&& cBOOL(RX_UTF8(old_re)) == RExC_utf8
&& ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) )
&& RX_PRELEN(old_re) == plen
&& memEQ(RX_PRECOMP(old_re), exp, plen)
Expand Down Expand Up @@ -1669,7 +1669,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
RExC_uni_semantics = 1;
RExC_uni_semantics = true;
}

RExC_pm_flags = pm_flags;
Expand All @@ -1688,14 +1688,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
}
assert(!pRExC_state->runtime_code_qr);

RExC_sawback = 0;
RExC_sawback = false;

RExC_seen = 0;
RExC_maxlen = 0;
RExC_in_lookaround = 0;
RExC_in_lookaround = false;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_recode_x_to_native = 0;
RExC_in_multi_char_class = 0;
RExC_recode_x_to_native = false;
RExC_in_multi_char_class = false;

RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
RExC_precomp_end = RExC_end = exp + plen;
Expand Down Expand Up @@ -2910,7 +2910,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
RExC_rxi->data->data[num]=(void*)sv_dat;
SvREFCNT_inc_simple_void_NN(sv_dat);
}
RExC_sawback = 1;
RExC_sawback = true;
ret = reg2node(pRExC_state,
((! FOLD)
? REFN
Expand Down Expand Up @@ -2945,7 +2945,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state,
* If the construct is empty generates a NOTHING op and returns its
* regnode_offset, which the caller would then return to its caller.
*
* If the construct is not empty increments RExC_in_lookaround, and turns
* If the construct is not empty sets RExC_in_lookaround, and turns
* on any flags provided in RExC_seen, and then returns 0 to signify
* that parsing should continue.
*
Expand Down Expand Up @@ -2976,7 +2976,7 @@ S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
}

RExC_seen |= flags;
RExC_in_lookaround++;
RExC_in_lookaround = true;
return 0; /* keep parsing! */
}

Expand All @@ -2993,7 +2993,7 @@ S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags,
* If the construct is empty generates an OPFAIL op and returns its
* regnode_offset which the caller should then return to its caller.
*
* If the construct is not empty increments RExC_in_lookaround, and also
* If the construct is not empty sets RExC_in_lookaround, and also
* increments RExC_seen_zerolen, and turns on the flags provided in
* RExC_seen, and then returns 0 to signify that parsing should continue.
*
Expand Down Expand Up @@ -3026,7 +3026,7 @@ S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags,
* does not match ever. */
RExC_seen_zerolen++;
RExC_seen |= flags;
RExC_in_lookaround++;
RExC_in_lookaround = true;
return 0; /* keep parsing! */
}

Expand Down Expand Up @@ -3105,7 +3105,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
I32 after_freeze = 0;
I32 num; /* numeric backreferences */
SV * max_open; /* Max number of unclosed parens */
I32 was_in_lookaround = RExC_in_lookaround;
bool was_in_lookaround = RExC_in_lookaround;
I32 fake_eval = 0; /* matches paren */

/* The difference between the following variables can be seen with *
Expand Down Expand Up @@ -3427,7 +3427,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth)
}

RExC_seen_zerolen++;
RExC_in_lookaround++;
RExC_in_lookaround = true;
RExC_seen |= seen_flag_set;

RExC_parse_set(start_arg);
Expand Down Expand Up @@ -5453,7 +5453,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,

/* The values are Unicode, and therefore have to be converted to native
* on a non-Unicode (meaning non-ASCII) platform. */
SET_recode_x_to_native(1);
SET_recode_x_to_native(true);
}

/* Here, we have the string the name evaluates to, ready to be parsed,
Expand All @@ -5479,7 +5479,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
RExC_start = save_start;
RExC_parse_set(endbrace);
RExC_end = orig_end;
SET_recode_x_to_native(0);
SET_recode_x_to_native(false);

SvREFCNT_dec_NN(substitute_parse);

Expand Down Expand Up @@ -5901,7 +5901,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
RExC_seen_d_op = true;
}
else if (op == BOUNDL) {
RExC_contains_locale = 1;
RExC_contains_locale = true;
}

if (invert) {
Expand Down Expand Up @@ -6203,7 +6203,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
REQUIRE_PARENS_PASS;
}
}
RExC_sawback = 1;
RExC_sawback = true;
ret = reg2node(pRExC_state,
((! FOLD)
? REF
Expand Down Expand Up @@ -6770,7 +6770,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
* existing node, so can start a new node with this one */
if (! len) {
node_type = EXACTFL;
RExC_contains_locale = 1;
RExC_contains_locale = true;
}
else if (node_type == EXACT) {
p = oldp;
Expand Down Expand Up @@ -10822,7 +10822,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
RExC_parse_set( RExC_start );
RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len;
RExC_end = RExC_parse + len;
RExC_in_multi_char_class = 1;
RExC_in_multi_char_class = true;

ret = reg(pRExC_state, 1, &reg_flags, depth+1);

Expand All @@ -10832,7 +10832,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
RExC_parse_set(save_parse);
RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start;
RExC_end = save_end;
RExC_in_multi_char_class = 0;
RExC_in_multi_char_class = false;
SvREFCNT_dec_NN(multi_char_matches);
SvREFCNT_dec(properties);
SvREFCNT_dec(cp_list);
Expand Down Expand Up @@ -11245,7 +11245,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
}

if (anyof_flags & ANYOF_LOCALE_FLAGS) {
RExC_contains_locale = 1;
RExC_contains_locale = true;
}

if (optimizable) {
Expand Down
41 changes: 21 additions & 20 deletions regcomp_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,16 @@ struct RExC_state_t {
regnode *emit_start; /* Start of emitted-code area */
regnode_offset emit; /* Code-emit pointer */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
bool sawback; /* Did we see \1, ...? */

bool utf8; /* whether the pattern is utf8 or not */
bool orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
bool uni_semantics; /* If a d charset modifier should use unicode
rules, even if the pattern is not in
utf8 */

SSize_t size; /* Number of regnode equivalents in
pattern */
Size_t sets_depth; /* Counts recursion depth of already-
Expand Down Expand Up @@ -127,37 +136,29 @@ struct RExC_state_t {
accept */
I32 seen_zerolen;
regnode *end_op; /* END node in program */
I32 utf8; /* whether the pattern is utf8 or not */
I32 orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
I32 uni_semantics; /* If a d charset modifier should use unicode
rules, even if the pattern is not in
utf8 */

bool in_lookaround;
bool contains_locale;
bool recode_x_to_native;
bool in_multi_char_class;
I32 recurse_count; /* Number of recurse regops we have generated */
regnode **recurse; /* Recurse regops */
U8 *study_chunk_recursed; /* bitmap of which subs we have moved
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
I32 in_lookaround;
I32 contains_locale;
I32 recode_x_to_native;
I32 in_multi_char_class;
int code_index; /* next code_blocks[] slot */
struct reg_code_blocks *code_blocks;/* positions of literal (?{})
within pattern */
SSize_t maxlen; /* maximum possible number of chars in string to match */
scan_frame *frame_head;
scan_frame *frame_last;
U32 frame_count;
AV *warn_text;
HV *unlexed_names;
SV *runtime_code_qr; /* qr with the runtime code blocks */
bool seen_d_op;
bool strict;
bool study_started;
bool in_script_run;
AV *warn_text;
HV *unlexed_names;
SV *runtime_code_qr; /* qr with the runtime code blocks */
bool use_BRANCHJ;
bool sWARN_EXPERIMENTAL__VLB;
bool sWARN_EXPERIMENTAL__REGEX_SETS;
Expand All @@ -170,12 +171,12 @@ struct RExC_state_t {
* See GH Issue #21558 and also ba6e2c38aafc23cf114f3ba0d0ff3baead34328b
*/
#if defined(DEBUGGING) || !defined(USE_DYNAMIC_LOADING)
const char *lastparse;
I32 lastnum;
U32 study_chunk_recursed_count;
const char *lastparse;
AV *paren_name_list; /* idx -> name */
SV *mysv1;
SV *mysv2;
U32 study_chunk_recursed_count;
#endif
};

Expand Down Expand Up @@ -497,7 +498,7 @@ struct RExC_state_t {
STMT_START { \
if (DEPENDS_SEMANTICS) { \
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
RExC_uni_semantics = 1; \
RExC_uni_semantics = true; \
if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
/* No need to restart the parse if we haven't seen \
* anything that differs between /u and /d, and no need \
Expand Down Expand Up @@ -740,7 +741,7 @@ static const scan_data_t zero_scan_data = {



#define UTF cBOOL(RExC_utf8)
#define UTF RExC_utf8

/* The enums for all these are ordered so things work out correctly */
#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
Expand Down
Loading