8
8
#include < algorithm>
9
9
#include < cstring>
10
10
11
+ ThreadFilter::ShardHead ThreadFilter::_free_heads[ThreadFilter::kShardCount ] {};
12
+
11
13
ThreadFilter::ThreadFilter () : _enabled(false ) {
12
14
// Initialize chunk pointers to null (lazy allocation)
13
15
for (int i = 0 ; i < kMaxChunks ; ++i) {
14
16
_chunks[i].store (nullptr , std::memory_order_relaxed);
15
17
}
16
18
_free_list = std::make_unique<FreeListNode[]>(kFreeListSize );
17
-
19
+
18
20
// Initialize the first chunk
19
21
initializeChunk (0 );
20
22
clear ();
@@ -30,14 +32,14 @@ ThreadFilter::~ThreadFilter() {
30
32
31
33
void ThreadFilter::initializeChunk (int chunk_idx) {
32
34
if (chunk_idx >= kMaxChunks ) return ;
33
-
35
+
34
36
// Check if chunk already exists
35
37
ChunkStorage* existing = _chunks[chunk_idx].load (std::memory_order_acquire);
36
38
if (existing != nullptr ) return ;
37
-
39
+
38
40
// Allocate new chunk
39
41
ChunkStorage* new_chunk = new ChunkStorage ();
40
-
42
+
41
43
// Try to install it atomically
42
44
ChunkStorage* expected = nullptr ;
43
45
if (_chunks[chunk_idx].compare_exchange_strong (expected, new_chunk, std::memory_order_acq_rel)) {
@@ -68,24 +70,24 @@ ThreadFilter::SlotID ThreadFilter::registerThread() {
68
70
}
69
71
70
72
const int chunk_idx = index >> kChunkShift ;
71
-
73
+
72
74
// Ensure the chunk is initialized (lock-free)
73
75
if (chunk_idx >= _num_chunks.load (std::memory_order_acquire)) {
74
76
// Update the chunk count atomically
75
77
int expected_chunks = chunk_idx;
76
78
int desired_chunks = chunk_idx + 1 ;
77
- while (!_num_chunks.compare_exchange_weak (expected_chunks, desired_chunks,
79
+ while (!_num_chunks.compare_exchange_weak (expected_chunks, desired_chunks,
78
80
std::memory_order_acq_rel)) {
79
81
if (expected_chunks > chunk_idx) {
80
82
break ; // Another thread already updated it
81
83
}
82
84
desired_chunks = expected_chunks + 1 ;
83
85
}
84
86
}
85
-
87
+
86
88
// Initialize the chunk if needed
87
89
initializeChunk (chunk_idx);
88
-
90
+
89
91
return index;
90
92
}
91
93
@@ -100,64 +102,61 @@ void ThreadFilter::clear() {
100
102
}
101
103
}
102
104
}
103
-
105
+
104
106
// Clear the free list
105
107
for (int i = 0 ; i < kFreeListSize ; ++i) {
106
108
_free_list[i].value .store (-1 , std::memory_order_relaxed);
107
109
_free_list[i].next .store (-1 , std::memory_order_relaxed);
108
110
}
109
- _free_list_head.store (-1 , std::memory_order_relaxed);
110
- _active_slots.store (0 , std::memory_order_relaxed);
111
+
112
+ // Reset the free heads for each shard
113
+ for (int s = 0 ; s < kShardCount ; ++s) {
114
+ _free_heads[s].head .store (-1 , std::memory_order_relaxed);
115
+ }
111
116
}
112
117
113
118
bool ThreadFilter::accept (SlotID slot_id) const {
114
119
if (!_enabled) {
115
120
return true ;
116
121
}
117
122
if (slot_id < 0 ) return false ;
118
-
123
+
119
124
int chunk_idx = slot_id >> kChunkShift ;
120
125
int slot_idx = slot_id & kChunkMask ;
121
-
126
+
122
127
if (chunk_idx >= kMaxChunks ) return false ;
123
128
ChunkStorage* chunk = _chunks[chunk_idx].load (std::memory_order_relaxed);
124
129
if (chunk == nullptr ) return false ; // Fail-fast if not allocated
125
-
130
+
126
131
return chunk->slots [slot_idx].value .load (std::memory_order_acquire) != -1 ;
127
132
}
128
133
129
134
void ThreadFilter::add (int tid, SlotID slot_id) {
130
135
if (slot_id < 0 ) return ;
131
-
136
+
132
137
int chunk_idx = slot_id >> kChunkShift ;
133
138
int slot_idx = slot_id & kChunkMask ;
134
-
139
+
135
140
if (chunk_idx >= kMaxChunks ) return ;
136
141
ChunkStorage* chunk = _chunks[chunk_idx].load (std::memory_order_relaxed);
137
142
if (chunk == nullptr ) return ; // Fail-fast if not allocated
138
-
139
- // Store the tid and increment active slots if this was previously empty
140
- int old_value = chunk->slots [slot_idx].value .exchange (tid, std::memory_order_acq_rel);
141
- if (old_value == -1 ) {
142
- _active_slots.fetch_add (1 , std::memory_order_relaxed);
143
- }
143
+
144
+ // Store the tid
145
+ chunk->slots [slot_idx].value .store (tid, std::memory_order_release);
144
146
}
145
147
146
148
void ThreadFilter::remove (SlotID slot_id) {
147
149
if (slot_id < 0 ) return ;
148
-
150
+
149
151
int chunk_idx = slot_id >> kChunkShift ;
150
152
int slot_idx = slot_id & kChunkMask ;
151
-
153
+
152
154
if (chunk_idx >= kMaxChunks ) return ;
153
155
ChunkStorage* chunk = _chunks[chunk_idx].load (std::memory_order_relaxed);
154
156
if (chunk == nullptr ) return ; // Fail-fast if not allocated
155
-
156
- // Remove the tid and decrement active slots if this was previously occupied
157
- int old_value = chunk->slots [slot_idx].value .exchange (-1 , std::memory_order_acq_rel);
158
- if (old_value != -1 ) {
159
- _active_slots.fetch_sub (1 , std::memory_order_relaxed);
160
- }
157
+
158
+ // Remove the tid
159
+ chunk->slots [slot_idx].value .store (-1 , std::memory_order_acq_rel);
161
160
}
162
161
163
162
void ThreadFilter::unregisterThread (SlotID slot_id) {
@@ -171,54 +170,60 @@ void ThreadFilter::unregisterThread(SlotID slot_id) {
171
170
}
172
171
173
172
bool ThreadFilter::pushToFreeList (SlotID slot_id) {
174
- // Lock-free Treiber stack push
173
+ // Lock-free sharded Treiber stack push
174
+ const int shard = shardOfSlot (slot_id);
175
+ auto & head = _free_heads[shard].head ; // private cache-line
176
+
175
177
for (int i = 0 ; i < kFreeListSize ; ++i) {
176
178
int expected = -1 ;
177
- if (_free_list[i].value .compare_exchange_strong (expected, slot_id, std::memory_order_acq_rel)) {
178
- // Successfully stored in this slot
179
- int old_head = _free_list_head.load (std::memory_order_acquire);
179
+ if (_free_list[i].value .compare_exchange_strong (
180
+ expected, slot_id, std::memory_order_acq_rel)) {
181
+ // Link node into this shard’s Treiber stack
182
+ int old_head = head.load (std::memory_order_acquire);
180
183
do {
181
184
_free_list[i].next .store (old_head, std::memory_order_relaxed);
182
- } while (!_free_list_head.compare_exchange_weak (old_head, i, std::memory_order_acq_rel));
185
+ } while (!head.compare_exchange_weak (old_head, i,
186
+ std::memory_order_acq_rel, std::memory_order_relaxed));
183
187
return true ;
184
188
}
185
189
}
186
190
return false ; // Free list full, slot is lost but this is rare
187
191
}
188
192
189
193
ThreadFilter::SlotID ThreadFilter::popFromFreeList () {
190
- // Lock-free Treiber stack pop
191
- while (true ) {
192
- int head = _free_list_head.load (std::memory_order_acquire);
193
- if (head == -1 ) {
194
- return -1 ; // Empty list
195
- }
196
-
197
- int slot_id = _free_list[head].value .load (std::memory_order_acquire);
198
- int next = _free_list[head].next .load (std::memory_order_acquire);
199
-
200
- // Try to update the head
201
- if (_free_list_head.compare_exchange_weak (head, next, std::memory_order_acq_rel)) {
202
- // Clear the node
203
- _free_list[head].value .store (-1 , std::memory_order_relaxed);
204
- _free_list[head].next .store (-1 , std::memory_order_relaxed);
205
- return slot_id;
194
+ // Lock-free sharded Treiber stack pop
195
+ int hash = static_cast <int >(std::hash<std::thread::id>{}(std::this_thread::get_id ()));
196
+ int start = shardOf (hash);
197
+
198
+ for (int pass = 0 ; pass < kShardCount ; ++pass) {
199
+ int s = (start + pass) & (kShardCount - 1 );
200
+ auto & head = _free_heads[s].head ;
201
+
202
+ while (true ) {
203
+ int node = head.load (std::memory_order_acquire);
204
+ if (node == -1 ) break ; // shard empty → try next
205
+
206
+ int next = _free_list[node].next .load (std::memory_order_relaxed);
207
+ if (head.compare_exchange_weak (node, next,
208
+ std::memory_order_acq_rel,
209
+ std::memory_order_relaxed))
210
+ {
211
+ int id = _free_list[node].value .exchange (-1 ,
212
+ std::memory_order_relaxed);
213
+ _free_list[node].next .store (-1 , std::memory_order_relaxed);
214
+ return id;
215
+ }
206
216
}
207
- // Retry if another thread modified the head
208
217
}
218
+ return -1 ; // Empty list
209
219
}
210
220
211
221
void ThreadFilter::collect (std::vector<int >& tids) const {
212
222
tids.clear ();
213
223
214
- // Early exit if no active slots
215
- int active_count = _active_slots.load (std::memory_order_relaxed);
216
- if (active_count == 0 ) {
217
- return ;
218
- }
219
-
220
224
// Reserve space for efficiency
221
- tids.reserve (active_count);
225
+ // The eventual resize is not the bottleneck, so we reserve a reasonable size
226
+ tids.reserve (512 );
222
227
223
228
// Scan only initialized chunks
224
229
int num_chunks = _num_chunks.load (std::memory_order_relaxed);
0 commit comments