From 2e4b163cba870808f5fea69086e5855d8bf203a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:03:54 +0100 Subject: [PATCH 1/8] RExC_state_t: convert in_lookaround to boolean --- regcomp.c | 14 +++++++------- regcomp_internal.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/regcomp.c b/regcomp.c index 0a84ad07606f..f69f6609e7f2 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1692,7 +1692,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen = 0; RExC_maxlen = 0; - RExC_in_lookaround = 0; + RExC_in_lookaround = false; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_recode_x_to_native = 0; RExC_in_multi_char_class = 0; @@ -2945,7 +2945,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, * If the construct is empty generates a NOTHING op and returns its * regnode_offset, which the caller would then return to its caller. * - * If the construct is not empty increments RExC_in_lookaround, and turns + * If the construct is not empty sets RExC_in_lookaround, and turns * on any flags provided in RExC_seen, and then returns 0 to signify * that parsing should continue. * @@ -2976,7 +2976,7 @@ S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags, } RExC_seen |= flags; - RExC_in_lookaround++; + RExC_in_lookaround = true; return 0; /* keep parsing! */ } @@ -2993,7 +2993,7 @@ S_reg_la_NOTHING(pTHX_ RExC_state_t *pRExC_state, U32 flags, * If the construct is empty generates an OPFAIL op and returns its * regnode_offset which the caller should then return to its caller. * - * If the construct is not empty increments RExC_in_lookaround, and also + * If the construct is not empty sets RExC_in_lookaround, and also * increments RExC_seen_zerolen, and turns on the flags provided in * RExC_seen, and then returns 0 to signify that parsing should continue. * @@ -3026,7 +3026,7 @@ S_reg_la_OPFAIL(pTHX_ RExC_state_t *pRExC_state, U32 flags, * does not match ever. 
*/ RExC_seen_zerolen++; RExC_seen |= flags; - RExC_in_lookaround++; + RExC_in_lookaround = true; return 0; /* keep parsing! */ } @@ -3105,7 +3105,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) I32 after_freeze = 0; I32 num; /* numeric backreferences */ SV * max_open; /* Max number of unclosed parens */ - I32 was_in_lookaround = RExC_in_lookaround; + bool was_in_lookaround = RExC_in_lookaround; I32 fake_eval = 0; /* matches paren */ /* The difference between the following variables can be seen with * @@ -3427,7 +3427,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } RExC_seen_zerolen++; - RExC_in_lookaround++; + RExC_in_lookaround = true; RExC_seen |= seen_flag_set; RExC_parse_set(start_arg); diff --git a/regcomp_internal.h b/regcomp_internal.h index 997c2844660b..262c04cb4f55 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -140,7 +140,7 @@ struct RExC_state_t { U8 *study_chunk_recursed; /* bitmap of which subs we have moved through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ - I32 in_lookaround; + bool in_lookaround; I32 contains_locale; I32 recode_x_to_native; I32 in_multi_char_class; From 5d7ea5c6da318e5688c55b8d4389f5014038472d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:06:33 +0100 Subject: [PATCH 2/8] RExC_state_t: convert contains_locale to boolean --- regcomp.c | 6 +++--- regcomp_internal.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regcomp.c b/regcomp.c index f69f6609e7f2..242fef2efcfd 100644 --- a/regcomp.c +++ b/regcomp.c @@ -5901,7 +5901,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_seen_d_op = true; } else if (op == BOUNDL) { - RExC_contains_locale = 1; + RExC_contains_locale = true; } if (invert) { @@ -6770,7 +6770,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * existing node, so can start a new node with this one */ if (! 
len) { node_type = EXACTFL; - RExC_contains_locale = 1; + RExC_contains_locale = true; } else if (node_type == EXACT) { p = oldp; @@ -11245,7 +11245,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } if (anyof_flags & ANYOF_LOCALE_FLAGS) { - RExC_contains_locale = 1; + RExC_contains_locale = true; } if (optimizable) { diff --git a/regcomp_internal.h b/regcomp_internal.h index 262c04cb4f55..083e6678ff4f 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -141,7 +141,7 @@ struct RExC_state_t { through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ bool in_lookaround; - I32 contains_locale; + bool contains_locale; I32 recode_x_to_native; I32 in_multi_char_class; int code_index; /* next code_blocks[] slot */ From cc8c9ac44e4086656046574838388d14ebe0cf38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:09:44 +0100 Subject: [PATCH 3/8] RExC_state_t: convert recode_x_to_native to boolean --- regcomp.c | 6 +++--- regcomp_internal.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regcomp.c b/regcomp.c index 242fef2efcfd..ea59d618b0ff 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1694,7 +1694,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_maxlen = 0; RExC_in_lookaround = false; RExC_seen_zerolen = *exp == '^' ? -1 : 0; - RExC_recode_x_to_native = 0; + RExC_recode_x_to_native = false; RExC_in_multi_char_class = 0; RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; @@ -5453,7 +5453,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, /* The values are Unicode, and therefore have to be converted to native * on a non-Unicode (meaning non-ASCII) platform. 
*/ - SET_recode_x_to_native(1); + SET_recode_x_to_native(true); } /* Here, we have the string the name evaluates to, ready to be parsed, @@ -5479,7 +5479,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, RExC_start = save_start; RExC_parse_set(endbrace); RExC_end = orig_end; - SET_recode_x_to_native(0); + SET_recode_x_to_native(false); SvREFCNT_dec_NN(substitute_parse); diff --git a/regcomp_internal.h b/regcomp_internal.h index 083e6678ff4f..d6fd586e390f 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -142,7 +142,7 @@ struct RExC_state_t { U32 study_chunk_recursed_bytes; /* bytes in bitmap */ bool in_lookaround; bool contains_locale; - I32 recode_x_to_native; + bool recode_x_to_native; I32 in_multi_char_class; int code_index; /* next code_blocks[] slot */ struct reg_code_blocks *code_blocks;/* positions of literal (?{}) From 6159afcfcfbe3c9fa5dee40daf7935e1df8021dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:10:54 +0100 Subject: [PATCH 4/8] RExC_state_t: convert in_multi_char_class to boolean --- regcomp.c | 6 +++--- regcomp_internal.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regcomp.c b/regcomp.c index ea59d618b0ff..5569201016e8 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1695,7 +1695,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_in_lookaround = false; RExC_seen_zerolen = *exp == '^' ? 
-1 : 0; RExC_recode_x_to_native = false; - RExC_in_multi_char_class = 0; + RExC_in_multi_char_class = false; RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp; RExC_precomp_end = RExC_end = exp + plen; @@ -10822,7 +10822,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse_set( RExC_start ); RExC_copy_start_in_constructed = RExC_start + constructed_prefix_len; RExC_end = RExC_parse + len; - RExC_in_multi_char_class = 1; + RExC_in_multi_char_class = true; ret = reg(pRExC_state, 1, &reg_flags, depth+1); @@ -10832,7 +10832,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse_set(save_parse); RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = save_start; RExC_end = save_end; - RExC_in_multi_char_class = 0; + RExC_in_multi_char_class = false; SvREFCNT_dec_NN(multi_char_matches); SvREFCNT_dec(properties); SvREFCNT_dec(cp_list); diff --git a/regcomp_internal.h b/regcomp_internal.h index d6fd586e390f..de3f70ccab9a 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -143,7 +143,7 @@ struct RExC_state_t { bool in_lookaround; bool contains_locale; bool recode_x_to_native; - I32 in_multi_char_class; + bool in_multi_char_class; int code_index; /* next code_blocks[] slot */ struct reg_code_blocks *code_blocks;/* positions of literal (?{}) within pattern */ From 449cfab63768dd1d462117b7123b4365bb07681b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:19:21 +0100 Subject: [PATCH 5/8] RExC_state_t: convert (orig_)utf8 to boolean --- regcomp.c | 6 +++--- regcomp_internal.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/regcomp.c b/regcomp.c index 5569201016e8..f73103d75595 100644 --- a/regcomp.c +++ b/regcomp.c @@ -595,7 +595,7 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, *plen_p = d - dst; *pat_p = (char*) dst; SAVEFREEPV(*pat_p); - RExC_orig_utf8 = 
RExC_utf8 = 1; + RExC_orig_utf8 = RExC_utf8 = true; } @@ -1602,7 +1602,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } /* ignore the utf8ness if the pattern is 0 length */ - RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); + RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? false : cBOOL(SvUTF8(pat)); RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); @@ -1637,7 +1637,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if ( old_re && !recompile - && cBOOL(RX_UTF8(old_re)) == cBOOL(RExC_utf8) + && cBOOL(RX_UTF8(old_re)) == RExC_utf8 && ( RX_COMPFLAGS(old_re) == ( orig_rx_flags & RXf_PMf_FLAGCOPYMASK ) ) && RX_PRELEN(old_re) == plen && memEQ(RX_PRECOMP(old_re), exp, plen) diff --git a/regcomp_internal.h b/regcomp_internal.h index de3f70ccab9a..91fd0d84f4ac 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -127,8 +127,8 @@ struct RExC_state_t { accept */ I32 seen_zerolen; regnode *end_op; /* END node in program */ - I32 utf8; /* whether the pattern is utf8 or not */ - I32 orig_utf8; /* whether the pattern was originally in utf8 */ + bool utf8; /* whether the pattern is utf8 or not */ + bool orig_utf8; /* whether the pattern was originally in utf8 */ /* XXX use this for future optimisation of case * where pattern must be upgraded to utf8. 
*/ I32 uni_semantics; /* If a d charset modifier should use unicode @@ -740,7 +740,7 @@ static const scan_data_t zero_scan_data = { -#define UTF cBOOL(RExC_utf8) +#define UTF RExC_utf8 /* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) From f0ca3283b59a839cf4c9efc70dbcbdd242dfd13e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:19:49 +0100 Subject: [PATCH 6/8] RExC_state_t: convert uni_semantics to boolean --- regcomp.c | 2 +- regcomp_internal.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/regcomp.c b/regcomp.c index f73103d75595..91aaa9996fd6 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1669,7 +1669,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); - RExC_uni_semantics = 1; + RExC_uni_semantics = true; } RExC_pm_flags = pm_flags; diff --git a/regcomp_internal.h b/regcomp_internal.h index 91fd0d84f4ac..bfdb45fc8ad5 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -131,7 +131,7 @@ struct RExC_state_t { bool orig_utf8; /* whether the pattern was originally in utf8 */ /* XXX use this for future optimisation of case * where pattern must be upgraded to utf8. */ - I32 uni_semantics; /* If a d charset modifier should use unicode + bool uni_semantics; /* If a d charset modifier should use unicode rules, even if the pattern is not in utf8 */ @@ -497,7 +497,7 @@ struct RExC_state_t { STMT_START { \ if (DEPENDS_SEMANTICS) { \ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ - RExC_uni_semantics = 1; \ + RExC_uni_semantics = true; \ if (RExC_seen_d_op && LIKELY(! 
IN_PARENS_PASS)) { \ /* No need to restart the parse if we haven't seen \ * anything that differs between /u and /d, and no need \ From 0bb5b35abb2d89aafaeecde44d4871b1dac74caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:22:34 +0100 Subject: [PATCH 7/8] RExC_state_t: convert sawback to boolean --- regcomp.c | 6 +++--- regcomp_internal.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/regcomp.c b/regcomp.c index 91aaa9996fd6..75b2ff6ead6e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1688,7 +1688,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } assert(!pRExC_state->runtime_code_qr); - RExC_sawback = 0; + RExC_sawback = false; RExC_seen = 0; RExC_maxlen = 0; @@ -2910,7 +2910,7 @@ S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, RExC_rxi->data->data[num]=(void*)sv_dat; SvREFCNT_inc_simple_void_NN(sv_dat); } - RExC_sawback = 1; + RExC_sawback = true; ret = reg2node(pRExC_state, ((! FOLD) ? REFN @@ -6203,7 +6203,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) REQUIRE_PARENS_PASS; } } - RExC_sawback = 1; + RExC_sawback = true; ret = reg2node(pRExC_state, ((! FOLD) ? REF diff --git a/regcomp_internal.h b/regcomp_internal.h index bfdb45fc8ad5..8e2c246836e3 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -53,7 +53,7 @@ struct RExC_state_t { regnode *emit_start; /* Start of emitted-code area */ regnode_offset emit; /* Code-emit pointer */ I32 naughty; /* How bad is this pattern? */ - I32 sawback; /* Did we see \1, ...? */ + bool sawback; /* Did we see \1, ...? 
*/ SSize_t size; /* Number of regnode equivalents in pattern */ Size_t sets_depth; /* Counts recursion depth of already- From 1a0ea8bdcec30f8d325ca7711b8aef2e95f74611 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dagfinn=20Ilmari=20Manns=C3=A5ker?= Date: Tue, 15 Jul 2025 17:33:00 +0100 Subject: [PATCH 8/8] RExC_state_t: reorder to eliminate holes Combined with the previous commits on this branch this shrinks the size of the struct on a 64-bit Linux non-debugging build from: /* size: 368, cachelines: 6, members: 63 */ /* sum members: 363, holes: 1, sum holes: 4 */ /* padding: 1 */ /* last cacheline: 48 bytes */ to: /* size: 344, cachelines: 6, members: 63 */ /* padding: 5 */ /* last cacheline: 24 bytes */ And on a debugging build from: /* size: 408, cachelines: 7, members: 69 */ /* sum members: 403, holes: 2, sum holes: 5 */ /* last cacheline: 24 bytes */ to: /* size: 384, cachelines: 6, members: 69 */ /* sum members: 379, holes: 1, sum holes: 1 */ /* padding: 4 */ --- regcomp_internal.h | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/regcomp_internal.h b/regcomp_internal.h index 8e2c246836e3..28babe4d0bf0 100644 --- a/regcomp_internal.h +++ b/regcomp_internal.h @@ -54,6 +54,15 @@ struct RExC_state_t { regnode_offset emit; /* Code-emit pointer */ I32 naughty; /* How bad is this pattern? */ bool sawback; /* Did we see \1, ...? */ + + bool utf8; /* whether the pattern is utf8 or not */ + bool orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. 
*/ + bool uni_semantics; /* If a d charset modifier should use unicode + rules, even if the pattern is not in + utf8 */ + SSize_t size; /* Number of regnode equivalents in pattern */ Size_t sets_depth; /* Counts recursion depth of already- @@ -127,23 +136,15 @@ struct RExC_state_t { accept */ I32 seen_zerolen; regnode *end_op; /* END node in program */ - bool utf8; /* whether the pattern is utf8 or not */ - bool orig_utf8; /* whether the pattern was originally in utf8 */ - /* XXX use this for future optimisation of case - * where pattern must be upgraded to utf8. */ - bool uni_semantics; /* If a d charset modifier should use unicode - rules, even if the pattern is not in - utf8 */ - + bool in_lookaround; + bool contains_locale; + bool recode_x_to_native; + bool in_multi_char_class; I32 recurse_count; /* Number of recurse regops we have generated */ regnode **recurse; /* Recurse regops */ U8 *study_chunk_recursed; /* bitmap of which subs we have moved through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ - bool in_lookaround; - bool contains_locale; - bool recode_x_to_native; - bool in_multi_char_class; int code_index; /* next code_blocks[] slot */ struct reg_code_blocks *code_blocks;/* positions of literal (?{}) within pattern */ @@ -151,13 +152,13 @@ struct RExC_state_t { scan_frame *frame_head; scan_frame *frame_last; U32 frame_count; - AV *warn_text; - HV *unlexed_names; - SV *runtime_code_qr; /* qr with the runtime code blocks */ bool seen_d_op; bool strict; bool study_started; bool in_script_run; + AV *warn_text; + HV *unlexed_names; + SV *runtime_code_qr; /* qr with the runtime code blocks */ bool use_BRANCHJ; bool sWARN_EXPERIMENTAL__VLB; bool sWARN_EXPERIMENTAL__REGEX_SETS; @@ -170,12 +171,12 @@ struct RExC_state_t { * See GH Issue #21558 and also ba6e2c38aafc23cf114f3ba0d0ff3baead34328b */ #if defined(DEBUGGING) || !defined(USE_DYNAMIC_LOADING) - const char *lastparse; I32 lastnum; - U32 study_chunk_recursed_count; + const char *lastparse; AV 
*paren_name_list; /* idx -> name */ SV *mysv1; SV *mysv2; + U32 study_chunk_recursed_count; #endif };