Skip to content

Commit 3ce9b6d

Browse files
authored
Merge pull request #260 from sysprog21/string-interning
Use string interning for identifier deduplication
2 parents 6a97bd7 + d6e1889 commit 3ce9b6d

File tree

3 files changed

+91
-17
lines changed

3 files changed

+91
-17
lines changed

src/defs.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,16 @@ typedef struct {
224224
int count;
225225
} token_buffer_t;
226226

227+
/* String pool for identifier deduplication.
 *
 * Wraps the hashmap that intern_string() consults: a lookup by string
 * content yields the pooled copy of that string. The map is created in
 * global_init() via hashmap_create(512) and freed in global_release().
 */
228+
typedef struct {
229+
hashmap_t *strings; /* Map string -> interned string */
230+
} string_pool_t;
231+
232+
/* String literal pool for deduplicating string constants.
 *
 * The map is created in global_init() via hashmap_create(256) and freed in
 * global_release().
 * NOTE(review): no insertion into or lookup from 'literals' is visible in
 * this change - confirm where the pool is actually consumed.
 */
233+
typedef struct {
234+
hashmap_t *literals; /* Map string literal -> ELF data offset */
235+
} string_literal_pool_t;
236+
227237
/* builtin types */
228238
typedef enum {
229239
TYPE_void = 0,

src/globals.c

Lines changed: 66 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313

1414
#include "defs.h"
1515

16+
/* Forward declaration for string interning */
17+
char *intern_string(char *str);
18+
1619
/* Lexer */
1720
char token_str[MAX_TOKEN_LEN];
1821
token_t next_token;
@@ -673,7 +676,8 @@ void add_alias(char *alias, char *value)
673676
printf("Failed to allocate alias_t\n");
674677
return;
675678
}
676-
strcpy(al->alias, alias);
679+
/* Use interned string for alias name */
680+
strcpy(al->alias, intern_string(alias));
677681
hashmap_put(ALIASES_MAP, alias, al);
678682
}
679683
strcpy(al->value, value);
@@ -707,7 +711,8 @@ macro_t *add_macro(char *name)
707711
printf("Failed to allocate macro_t\n");
708712
return NULL;
709713
}
710-
strcpy(ma->name, name);
714+
/* Use interned string for macro name */
715+
strcpy(ma->name, intern_string(name));
711716
hashmap_put(MACROS_MAP, name, ma);
712717
}
713718
ma->disabled = false;
@@ -733,6 +738,41 @@ bool remove_macro(char *name)
733738
}
734739

735740
void error(char *msg);
741+
742+
/* String pool global */
743+
string_pool_t *string_pool;
744+
string_literal_pool_t *string_literal_pool;
745+
746+
/* Safe string interning that works with self-hosting.
 *
 * Looks 'str' up in string_pool->strings and returns the pooled copy,
 * creating that copy in GENERAL_ARENA the first time a string is seen.
 *
 * @str: NUL-terminated string to intern; may be NULL.
 *
 * Return: NULL if 'str' is NULL; 'str' itself (not pooled) if GENERAL_ARENA
 *         or string_pool has not been set up yet; otherwise the pooled copy.
 */
747+
char *intern_string(char *str)
748+
{
749+
char *existing;
750+
char *interned;
751+
int len;
752+
753+
/* Safety: a NULL input is passed through unchanged, i.e. NULL is returned */
754+
if (!str)
755+
return NULL;
756+
757+
/* Safety: can't intern before initialization; hand back the caller's
 * pointer as-is so early callers still get a usable string */
758+
if (!GENERAL_ARENA || !string_pool)
759+
return str;
760+
761+
/* Check if already interned; a hit returns the previously stored copy */
762+
existing = hashmap_get(string_pool->strings, str);
763+
if (existing)
764+
return existing;
765+
766+
/* Allocate and store new string (len includes the terminating NUL) */
767+
len = strlen(str) + 1;
768+
interned = arena_alloc(GENERAL_ARENA, len);
769+
strcpy(interned, str);
770+
771+
/* The arena copy serves as both the key and the value of the entry */
hashmap_put(string_pool->strings, interned, interned);
772+
773+
return interned;
774+
}
775+
736776
int find_macro_param_src_idx(char *name, block_t *parent)
737777
{
738778
macro_t *macro = parent->macro;
@@ -761,7 +801,8 @@ type_t *add_type(void)
761801
type_t *add_named_type(char *name)
762802
{
763803
type_t *type = add_type();
764-
strcpy(type->type_name, name);
804+
/* Use interned string for type name */
805+
strcpy(type->type_name, intern_string(name));
765806
return type;
766807
}
767808

@@ -773,7 +814,8 @@ void add_constant(char alias[], int value)
773814
return;
774815
}
775816

776-
strcpy(constant->alias, alias);
817+
/* Use interned string for constant name */
818+
strcpy(constant->alias, intern_string(alias));
777819
constant->value = value;
778820
hashmap_put(CONSTANTS_MAP, alias, constant);
779821
}
@@ -877,7 +919,8 @@ func_t *add_func(char *func_name, bool synthesize)
877919

878920
func = arena_alloc_func();
879921
hashmap_put(FUNC_MAP, func_name, func);
880-
strcpy(func->return_def.var_name, func_name);
922+
/* Use interned string for function name */
923+
strcpy(func->return_def.var_name, intern_string(func_name));
881924
func->stack_size = 4;
882925

883926
if (synthesize)
@@ -1042,7 +1085,7 @@ void add_insn(block_t *block,
10421085
n->idx = 0;
10431086

10441087
if (str)
1045-
strcpy(n->str, str);
1088+
strcpy(n->str, intern_string(str));
10461089
else
10471090
n->str[0] = '\0';
10481091

@@ -1151,6 +1194,16 @@ void global_init(void)
11511194
TYPES = arena_alloc(GENERAL_ARENA, MAX_TYPES * sizeof(type_t));
11521195
PH2_IR_FLATTEN =
11531196
arena_alloc(GENERAL_ARENA, MAX_IR_INSTR * sizeof(ph2_ir_t *));
1197+
1198+
/* Initialize string pool for identifier deduplication */
1199+
string_pool = arena_alloc(GENERAL_ARENA, sizeof(string_pool_t));
1200+
string_pool->strings = hashmap_create(512);
1201+
1202+
/* Initialize string literal pool for deduplicating string constants */
1203+
string_literal_pool =
1204+
arena_alloc(GENERAL_ARENA, sizeof(string_literal_pool_t));
1205+
string_literal_pool->literals = hashmap_create(256);
1206+
11541207
SOURCE = strbuf_create(MAX_SOURCE);
11551208
FUNC_MAP = hashmap_create(DEFAULT_FUNCS_SIZE);
11561209
INCLUSION_MAP = hashmap_create(DEFAULT_INCLUSIONS_SIZE);
@@ -1273,6 +1326,13 @@ void global_release(void)
12731326
lexer_cleanup();
12741327

12751328
hashmap_free(MACROS_MAP);
1329+
1330+
/* Free string interning hashmaps */
1331+
if (string_pool && string_pool->strings)
1332+
hashmap_free(string_pool->strings);
1333+
if (string_literal_pool && string_literal_pool->literals)
1334+
hashmap_free(string_literal_pool->literals);
1335+
12761336
arena_free(BLOCK_ARENA);
12771337
arena_free(INSN_ARENA);
12781338
arena_free(BB_ARENA);

src/parser.c

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -554,7 +554,7 @@ bool read_preproc_directive(void)
554554
while (lex_peek(T_identifier, alias)) {
555555
lex_expect(T_identifier);
556556
strcpy(macro->param_defs[macro->num_param_defs++].var_name,
557-
alias);
557+
intern_string(alias));
558558
lex_accept(T_comma);
559559
}
560560
if (lex_accept(T_elipsis))
@@ -1192,14 +1192,18 @@ void read_inner_var_decl(var_t *vd, int anon, int is_param)
11921192
/* is it function pointer declaration? */
11931193
if (lex_accept(T_open_bracket)) {
11941194
func_t func;
1195+
char temp_name[MAX_VAR_LEN];
11951196
lex_expect(T_asterisk);
1196-
lex_ident(T_identifier, vd->var_name);
1197+
lex_ident(T_identifier, temp_name);
1198+
strcpy(vd->var_name, intern_string(temp_name));
11971199
lex_expect(T_close_bracket);
11981200
read_parameter_list_decl(&func, 1);
11991201
vd->is_func = true;
12001202
} else {
12011203
if (anon == 0) {
1202-
lex_ident(T_identifier, vd->var_name);
1204+
char temp_name[MAX_VAR_LEN];
1205+
lex_ident(T_identifier, temp_name);
1206+
strcpy(vd->var_name, intern_string(temp_name));
12031207
if (!lex_peek(T_open_bracket, NULL) && !is_param) {
12041208
if (vd->is_global) {
12051209
opstack_push(vd);
@@ -2078,7 +2082,7 @@ void read_expr_operand(block_t *parent, basic_block_t **bb)
20782082
/* indirect function pointer assignment */
20792083
vd = require_var(parent);
20802084
vd->is_func = true;
2081-
strcpy(vd->var_name, token);
2085+
strcpy(vd->var_name, intern_string(token));
20822086
opstack_push(vd);
20832087
}
20842088
} else if (lex_accept(T_open_curly)) {
@@ -4431,7 +4435,7 @@ void read_global_statement(void)
44314435
if (!type)
44324436
type = add_type();
44334437

4434-
strcpy(type->type_name, token);
4438+
strcpy(type->type_name, intern_string(token));
44354439
type->base_type = TYPE_struct;
44364440

44374441
lex_expect(T_open_curly);
@@ -4469,7 +4473,7 @@ void read_global_statement(void)
44694473
if (!type)
44704474
type = add_type();
44714475

4472-
strcpy(type->type_name, token);
4476+
strcpy(type->type_name, intern_string(token));
44734477
type->base_type = TYPE_union;
44744478

44754479
lex_expect(T_open_curly);
@@ -4520,7 +4524,7 @@ void read_global_statement(void)
45204524
} while (lex_accept(T_comma));
45214525
lex_expect(T_close_curly);
45224526
lex_ident(T_identifier, token);
4523-
strcpy(type->type_name, token);
4527+
strcpy(type->type_name, intern_string(token));
45244528
lex_expect(T_semicolon);
45254529
} else if (lex_accept(T_struct)) {
45264530
int i = 0, size = 0, has_struct_def = 0;
@@ -4535,7 +4539,7 @@ void read_global_statement(void)
45354539
if (!tag) {
45364540
tag = add_type();
45374541
tag->base_type = TYPE_struct;
4538-
strcpy(tag->type_name, token);
4542+
strcpy(tag->type_name, intern_string(token));
45394543
}
45404544
}
45414545

@@ -4574,7 +4578,7 @@ void read_global_statement(void)
45744578
strcpy(token, tag->type_name);
45754579
memcpy(tag, type, sizeof(type_t));
45764580
tag->base_type = TYPE_struct;
4577-
strcpy(tag->type_name, token);
4581+
strcpy(tag->type_name, intern_string(token));
45784582
} else {
45794583
/* If it is a forward declaration, build a connection between
45804584
* structure tag and alias. In 'find_type', it will retrieve
@@ -4597,7 +4601,7 @@ void read_global_statement(void)
45974601
if (!tag) {
45984602
tag = add_type();
45994603
tag->base_type = TYPE_union;
4600-
strcpy(tag->type_name, token);
4604+
strcpy(tag->type_name, intern_string(token));
46014605
}
46024606
}
46034607

@@ -4640,7 +4644,7 @@ void read_global_statement(void)
46404644
strcpy(token, tag->type_name);
46414645
memcpy(tag, type, sizeof(type_t));
46424646
tag->base_type = TYPE_union;
4643-
strcpy(tag->type_name, token);
4647+
strcpy(tag->type_name, intern_string(token));
46444648
} else {
46454649
/* If it is a forward declaration, build a connection between
46464650
* union tag and alias. In 'find_type', it will retrieve

0 commit comments

Comments
 (0)