Exterminate the last remnants of false sharing

jbachorik · jbachorik · commit e0ac246d7ce4 · 2025-07-10T14:43:55.000+02:00
diff --git a/ddprof-lib/src/main/cpp/threadFilter.cpp b/ddprof-lib/src/main/cpp/threadFilter.cpp
@@ -8,13 +8,15 @@
 #include <algorithm>
 #include <cstring>
 
+ThreadFilter::ShardHead ThreadFilter::_free_heads[ThreadFilter::kShardCount] {};
+
 ThreadFilter::ThreadFilter() : _enabled(false) {
     // Initialize chunk pointers to null (lazy allocation)
     for (int i = 0; i < kMaxChunks; ++i) {
         _chunks[i].store(nullptr, std::memory_order_relaxed);
     }
     _free_list = std::make_unique<FreeListNode[]>(kFreeListSize);
-    
+
     // Initialize the first chunk
     initializeChunk(0);
     clear();
@@ -30,14 +32,14 @@ ThreadFilter::~ThreadFilter() {
 
 void ThreadFilter::initializeChunk(int chunk_idx) {
     if (chunk_idx >= kMaxChunks) return;
-    
+
     // Check if chunk already exists
     ChunkStorage* existing = _chunks[chunk_idx].load(std::memory_order_acquire);
     if (existing != nullptr) return;
-    
+
     // Allocate new chunk
     ChunkStorage* new_chunk = new ChunkStorage();
-    
+
     // Try to install it atomically
     ChunkStorage* expected = nullptr;
     if (_chunks[chunk_idx].compare_exchange_strong(expected, new_chunk, std::memory_order_acq_rel)) {
@@ -68,24 +70,24 @@ ThreadFilter::SlotID ThreadFilter::registerThread() {
     }
 
     const int chunk_idx = index >> kChunkShift;
-    
+
     // Ensure the chunk is initialized (lock-free)
     if (chunk_idx >= _num_chunks.load(std::memory_order_acquire)) {
         // Update the chunk count atomically
         int expected_chunks = chunk_idx;
         int desired_chunks = chunk_idx + 1;
-        while (!_num_chunks.compare_exchange_weak(expected_chunks, desired_chunks, 
+        while (!_num_chunks.compare_exchange_weak(expected_chunks, desired_chunks,
                                                    std::memory_order_acq_rel)) {
             if (expected_chunks > chunk_idx) {
                 break; // Another thread already updated it
             }
             desired_chunks = expected_chunks + 1;
         }
     }
-    
+
     // Initialize the chunk if needed
     initializeChunk(chunk_idx);
-    
+
     return index;
 }
 
@@ -100,64 +102,61 @@ void ThreadFilter::clear() {
             }
         }
     }
-    
+
     // Clear the free list
     for (int i = 0; i < kFreeListSize; ++i) {
         _free_list[i].value.store(-1, std::memory_order_relaxed);
         _free_list[i].next.store(-1, std::memory_order_relaxed);
     }
-    _free_list_head.store(-1, std::memory_order_relaxed);
-    _active_slots.store(0, std::memory_order_relaxed);
+
+    // Reset the free heads for each shard
+    for (int s = 0; s < kShardCount; ++s) {
+        _free_heads[s].head.store(-1, std::memory_order_relaxed);
+    }
 }
 
 bool ThreadFilter::accept(SlotID slot_id) const {
     if (!_enabled) {
         return true;
     }
     if (slot_id < 0) return false;
-    
+
     int chunk_idx = slot_id >> kChunkShift;
     int slot_idx = slot_id & kChunkMask;
-    
+
     if (chunk_idx >= kMaxChunks) return false;
     ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
     if (chunk == nullptr) return false;  // Fail-fast if not allocated
-    
+
     return chunk->slots[slot_idx].value.load(std::memory_order_acquire) != -1;
 }
 
 void ThreadFilter::add(int tid, SlotID slot_id) {
     if (slot_id < 0) return;
-    
+
     int chunk_idx = slot_id >> kChunkShift;
     int slot_idx = slot_id & kChunkMask;
-    
+
     if (chunk_idx >= kMaxChunks) return;
     ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
     if (chunk == nullptr) return;  // Fail-fast if not allocated
-    
-    // Store the tid and increment active slots if this was previously empty
-    int old_value = chunk->slots[slot_idx].value.exchange(tid, std::memory_order_acq_rel);
-    if (old_value == -1) {
-        _active_slots.fetch_add(1, std::memory_order_relaxed);
-    }
+
+    // Store the tid with release ordering; accept() reads this slot with an acquire load
+    chunk->slots[slot_idx].value.store(tid, std::memory_order_release);
 }
 
 void ThreadFilter::remove(SlotID slot_id) {
     if (slot_id < 0) return;
-    
+
     int chunk_idx = slot_id >> kChunkShift;
     int slot_idx = slot_id & kChunkMask;
-    
+
     if (chunk_idx >= kMaxChunks) return;
     ChunkStorage* chunk = _chunks[chunk_idx].load(std::memory_order_relaxed);
     if (chunk == nullptr) return;  // Fail-fast if not allocated
-    
-    // Remove the tid and decrement active slots if this was previously occupied
-    int old_value = chunk->slots[slot_idx].value.exchange(-1, std::memory_order_acq_rel);
-    if (old_value != -1) {
-        _active_slots.fetch_sub(1, std::memory_order_relaxed);
-    }
+
+    // Clear the slot (-1 = empty); a plain store must be relaxed/release/seq_cst, acq_rel is not a valid store order
+    chunk->slots[slot_idx].value.store(-1, std::memory_order_release);
 }
 
 void ThreadFilter::unregisterThread(SlotID slot_id) {
@@ -171,54 +170,60 @@ void ThreadFilter::unregisterThread(SlotID slot_id) {
 }
 
 bool ThreadFilter::pushToFreeList(SlotID slot_id) {
-    // Lock-free Treiber stack push
+    // Lock-free sharded Treiber stack push: claim a free node, then link it onto the shard selected by slot_id
+    const int shard = shardOfSlot(slot_id);
+    auto& head      = _free_heads[shard].head; // ShardHead is alignas(64), one head per shard
+
     for (int i = 0; i < kFreeListSize; ++i) {
         int expected = -1;
-        if (_free_list[i].value.compare_exchange_strong(expected, slot_id, std::memory_order_acq_rel)) {
-            // Successfully stored in this slot
-            int old_head = _free_list_head.load(std::memory_order_acquire);
+        if (_free_list[i].value.compare_exchange_strong(
+                expected, slot_id, std::memory_order_acq_rel)) {
+            // Node i was free (value == -1) and now holds slot_id; link it in as this shard's new head
+            int old_head = head.load(std::memory_order_acquire);
             do {
                 _free_list[i].next.store(old_head, std::memory_order_relaxed);
-            } while (!_free_list_head.compare_exchange_weak(old_head, i, std::memory_order_acq_rel));
+            } while (!head.compare_exchange_weak(old_head, i,
+                       std::memory_order_acq_rel, std::memory_order_relaxed));
             return true;
         }
     }
     return false; // Free list full, slot is lost but this is rare
 }
 
 ThreadFilter::SlotID ThreadFilter::popFromFreeList() {
-    // Lock-free Treiber stack pop
-    while (true) {
-        int head = _free_list_head.load(std::memory_order_acquire);
-        if (head == -1) {
-            return -1; // Empty list
-        }
-        
-        int slot_id = _free_list[head].value.load(std::memory_order_acquire);
-        int next = _free_list[head].next.load(std::memory_order_acquire);
-        
-        // Try to update the head
-        if (_free_list_head.compare_exchange_weak(head, next, std::memory_order_acq_rel)) {
-            // Clear the node
-            _free_list[head].value.store(-1, std::memory_order_relaxed);
-            _free_list[head].next.store(-1, std::memory_order_relaxed);
-            return slot_id;
+    // Lock-free sharded Treiber stack pop: start at a shard derived from the caller's thread-id hash, probe each shard once
+    int hash = static_cast<int>(std::hash<std::thread::id>{}(std::this_thread::get_id()));
+    int start = shardOf(hash);
+
+    for (int pass = 0; pass < kShardCount; ++pass) {
+        int s      = (start + pass) & (kShardCount - 1);
+        auto& head = _free_heads[s].head;
+
+        while (true) {
+            int node = head.load(std::memory_order_acquire);
+            if (node == -1) break;                 // shard empty → try next
+
+            int next = _free_list[node].next.load(std::memory_order_relaxed);
+            if (head.compare_exchange_weak(node, next,
+                                           std::memory_order_acq_rel,
+                                           std::memory_order_relaxed))
+            {
+                // Reset the link BEFORE freeing the node: once value is -1 a pusher may claim it and set next itself
+                _free_list[node].next.store(-1, std::memory_order_relaxed);
+                int id = _free_list[node].value.exchange(-1, std::memory_order_acq_rel);
+                return id;
+            }
         }
-        // Retry if another thread modified the head
     }
+    return -1; // Every shard was empty
 }
 
 void ThreadFilter::collect(std::vector<int>& tids) const {
     tids.clear();
     
-    // Early exit if no active slots
-    int active_count = _active_slots.load(std::memory_order_relaxed);
-    if (active_count == 0) {
-        return;
-    }
-    
     // Reserve space for efficiency
-    tids.reserve(active_count);
+    // The eventual resize is not the bottleneck, so we reserve a reasonable size
+    tids.reserve(512);
     
     // Scan only initialized chunks
     int num_chunks = _num_chunks.load(std::memory_order_relaxed);
diff --git a/ddprof-lib/src/main/cpp/threadFilter.h b/ddprof-lib/src/main/cpp/threadFilter.h
@@ -17,7 +17,10 @@ class ThreadFilter {
     static constexpr int kChunkMask = kChunkSize - 1;
     static constexpr int kMaxThreads = 2048;
     static constexpr int kMaxChunks = (kMaxThreads + kChunkSize - 1) / kChunkSize;  // = 8 chunks
-    
+    // High-performance free list using Treiber stack, 64 shards
+    static constexpr int kFreeListSize  = 1024;
+    static constexpr int kShardCount    = 64;          // power-of-two
+
     ThreadFilter();
     ~ThreadFilter();
 
@@ -52,21 +55,21 @@ class ThreadFilter {
     };
 
     bool _enabled = false;
-    
+
     // Lazily allocated storage for chunks
     std::atomic<ChunkStorage*> _chunks[kMaxChunks];
     std::atomic<int> _num_chunks{1};
-    
+
     // Lock-free slot allocation
     std::atomic<SlotID> _next_index{0};
-    
-    // High-performance free list using Treiber stack
-    static constexpr int kFreeListSize = 1024;  // Increased from 128
+
     std::unique_ptr<FreeListNode[]> _free_list;
-    std::atomic<int> _free_list_head{-1};
-    
-    // Active slot tracking for efficient collect()
-    std::atomic<int> _active_slots{0};
+
+    struct alignas(64) ShardHead { std::atomic<int> head{-1}; };
+    static ShardHead _free_heads[kShardCount];         // one cache-line each
+
+    static inline int shardOf(int tid)  { return tid & (kShardCount - 1); }
+    static inline int shardOfSlot(int s){ return s  & (kShardCount - 1); }
     
     // Helper methods for lock-free operations
     void initializeChunk(int chunk_idx);
diff --git a/ddprof-lib/src/main/cpp/threadIdTable.h b/ddprof-lib/src/main/cpp/threadIdTable.h
@@ -32,7 +32,7 @@ class ThreadIdTable {
         
         int start_slot = hash(tid);
         for (int probe = 0; probe < TABLE_SIZE; probe++) {
-            int slot = (start_slot + probe) % TABLE_SIZE;
+            int slot = (start_slot + (probe * 7)) % TABLE_SIZE;
             int expected = 0;
             
             // Try to claim empty slot
diff --git a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/Main.java b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/Main.java
@@ -8,6 +8,7 @@
 import org.openjdk.jmh.runner.options.OptionsBuilder;
 import org.openjdk.jmh.runner.options.TimeValue;
 
+import java.util.Arrays;
 import java.util.Collection;
 import java.util.concurrent.TimeUnit;
 
@@ -17,11 +18,9 @@ public class Main {
 
     public static void main(String... args) throws Exception {
         String filter = "*";
-        if (args.length == 1) {
+        if (args.length >= 1) {
             filter = args[0];
-        } else if (args.length > 1) {
-            System.err.println("Usage: java -jar ddprof-stresstest.jar [scenario filter]");
-            System.exit(1);
+            args = Arrays.copyOfRange(args, 1, args.length);
         }
         CommandLineOptions commandLineOptions = new CommandLineOptions(args);
         Mode mode = Mode.AverageTime;
diff --git a/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java b/ddprof-stresstest/src/jmh/java/com/datadoghq/profiler/stresstest/scenarios/throughput/ThreadFilterBenchmark.java