DataDog
diff --git a/‎ddprof-lib/src/main/cpp/lockFree.h‎
Lines changed: 51 additions & 0 deletions b/‎ddprof-lib/src/main/cpp/lockFree.h‎
Lines changed: 51 additions & 0 deletions
diff --git a/‎ddprof-lib/src/main/cpp/os_linux_dd.cpp‎
Lines changed: 11 additions & 2 deletions b/‎ddprof-lib/src/main/cpp/os_linux_dd.cpp‎
Lines changed: 11 additions & 2 deletions
diff --git a/‎ddprof-lib/src/main/cpp/thread.cpp‎
Lines changed: 58 additions & 12 deletions b/‎ddprof-lib/src/main/cpp/thread.cpp‎
Lines changed: 58 additions & 12 deletions
diff --git a/‎ddprof-lib/src/main/cpp/thread.h‎
Lines changed: 12 additions & 1 deletion b/‎ddprof-lib/src/main/cpp/thread.h‎
Lines changed: 12 additions & 1 deletion
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2025, Datadog, Inc.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef _LOCKFREE_H
+#define _LOCKFREE_H
+
+#include <atomic>
+#include <cstddef>
+
+/**
+ * Lock-free atomic primitives and utilities.
+ *
+ * This header provides building blocks for lock-free data structures:
+ * - PaddedAtomic: Cache-line padded atomics to prevent false sharing
+ * - Planned additions (not provided by this header yet): atomic counters, sequence locks, etc.
+ *
+ * For complete synchronization classes (SpinLock, mutexes), see spinLock.h
+ */
+
+// Cache line size for preventing false sharing (typical for x86/ARM)
+// Note: This duplicates DEFAULT_CACHE_LINE_SIZE from arch_dd.h for standalone use
+constexpr size_t CACHE_LINE_SIZE = 64;
+
+/**
+ * Atomic value padded to its own cache line to prevent false sharing.
+ *
+ * Use this when you have an array of atomics that are frequently accessed
+ * by different threads. Without padding, atomics in adjacent array elements
+ * may share a cache line, causing false sharing that degrades performance.
+ *
+ * False sharing occurs when:
+ * - Thread A modifies atomic at index 0
+ * - Thread B modifies atomic at index 1
+ * - Both atomics are on the same cache line
+ * - CPU must invalidate entire cache line, forcing both threads to reload
+ *
+ * Example usage:
+ *   static PaddedAtomic<uint64_t> counters[128];  // Each counter on own cache line
+ *   counters[i].value.fetch_add(1, std::memory_order_relaxed);
+ *
+ * @tparam T The atomic value type (e.g., uint64_t, int, bool)
+ */
+template<typename T>
+struct alignas(CACHE_LINE_SIZE) PaddedAtomic {  // alignas makes every instance start on a CACHE_LINE_SIZE boundary
+  std::atomic<T> value;  // the wrapped atomic; users access it directly (e.g. counters[i].value)
+  // No explicit pad member is needed: alignas rounds sizeof(PaddedAtomic) up to a multiple of CACHE_LINE_SIZE
+};
+
+#endif // _LOCKFREE_H
@@ -1,6 +1,7 @@
 #ifdef __linux__
 
 #include "os_dd.h"
+#include "thread.h"
 #include "common.h"
 #include <signal.h>
 #include <unistd.h>
@@ -164,7 +165,7 @@ int ddprof::OS::getThreadCount() {
   if (!status) {
     return -1;
   }
-  
+
   char line[256];
   int thread_count = -1;
   while (fgets(line, sizeof(line), status)) {
@@ -292,7 +293,15 @@ static void* threadDirectoryWatcherLoop(void* arg) {
         int tid = atoi(event->name);
         if (tid > 0) {
           if (event->mask & (IN_CREATE | IN_MOVED_TO)) {
-            if (g_on_new_thread) g_on_new_thread(tid);
+            // Best-effort 20ms delay so the JVMTI ThreadStart callback has time to register Java threads.
+            // This narrows, but cannot fully close, the race between thread creation and that callback.
+            struct timespec delay = {0, 20000000}; // 20ms
+            nanosleep(&delay, nullptr);
+
+            // Skip sending signal to likely Java threads
+            if (!ProfiledThread::isLikelyJavaThread(tid) && g_on_new_thread) {
+              g_on_new_thread(tid);
+            }
           } else if (event->mask & (IN_DELETE | IN_MOVED_FROM)) {
             if (g_on_dead_thread) g_on_dead_thread(tid);
           }
 
@@ -1,3 +1,5 @@
+#include "arch_dd.h"
+#include "lockFree.h"
 #include "thread.h"
 #include "os_dd.h"
 #include "profiler.h"
@@ -11,6 +13,15 @@
 // TLS priming signal number
 static int g_tls_prime_signal = -1;
 
+// Out-of-class definition of ProfiledThread's static Java-thread bitset (JAVA_THREAD_BITSET_WORDS padded 64-bit words)
+PaddedAtomic<uint64_t> ProfiledThread::_java_thread_bitset[ProfiledThread::JAVA_THREAD_BITSET_WORDS];
+
+void ProfiledThread::initJavaThreadBitset() {  // Zeroes every word of _java_thread_bitset so that no tid is marked as a Java thread.
+  for (size_t i = 0; i < JAVA_THREAD_BITSET_WORDS; i++) {  // NOTE(review): bits set by registerJavaThread before this runs are discarded - confirm call order
+    _java_thread_bitset[i].value.store(0, std::memory_order_relaxed);  // relaxed: the reset itself requests no ordering with other memory operations
+  }
+}
+
 pthread_key_t ProfiledThread::_tls_key;
 int ProfiledThread::_buffer_size = 0;
 volatile int ProfiledThread::_running_buffer_pos = 0;
@@ -35,7 +46,8 @@ inline void ProfiledThread::freeKey(void *key) {
       // Buffer-allocated: reset and return to buffer for reuse
       tls_ref->releaseFromBuffer();
     } else {
-      // Non-buffer (JVMTI-allocated): delete the instance
+      // Non-buffer (JVMTI-allocated): unregister Java thread and delete the instance
+      ProfiledThread::unregisterJavaThread(tls_ref->_tid);
       delete tls_ref;
     }
   }
@@ -54,6 +66,9 @@ void ProfiledThread::initCurrentThread() {
   int tid = OS::threadId();
   ProfiledThread *tls = ProfiledThread::forTid(tid);
   pthread_setspecific(_tls_key, (const void *)tls);
+
+  // Register this thread as a Java thread for TLS priming optimization
+  ProfiledThread::registerJavaThread(tid);
 }
 
 void ProfiledThread::initExistingThreads() {
@@ -131,6 +146,9 @@ void ProfiledThread::doInitExistingThreads() {
     return; // Avoid double initialization
   }
 
+  // Initialize Java thread bitset
+  initJavaThreadBitset();
+
   // Register fork handler to prevent issues in forked child processes
   ensureTlsForkHandlerRegistered();
 
@@ -152,17 +170,17 @@ void ProfiledThread::doInitExistingThreads() {
   // Set DD_PROFILER_TLS_WATCHER=1 to enable for native thread priming
   // Supports both environment variable and system property (for JMH forked JVMs)
   const char* watcher_env = std::getenv("DD_PROFILER_TLS_WATCHER");
-  bool watcher_enabled = (watcher_env != nullptr && std::strcmp(watcher_env, "1") == 0);
-
-  // If not set via environment variable, check system property (for JMH compatibility)
-  if (!watcher_enabled) {
-    char* watcher_prop = nullptr;
-    jvmtiEnv *jvmti = VM::jvmti();
-    if (jvmti != nullptr && jvmti->GetSystemProperty("DD_PROFILER_TLS_WATCHER", &watcher_prop) == 0 && watcher_prop != nullptr) {
-      watcher_enabled = (std::strcmp(watcher_prop, "1") == 0);
-      jvmti->Deallocate((unsigned char*)watcher_prop);
-    }
-  }
+  bool watcher_enabled = false; //(watcher_env == nullptr || std::strcmp(watcher_env, "1") == 0);
+
+  // // If not set via environment variable, check system property (for JMH compatibility)
+  // if (watcher_enabled) {
+  //   char* watcher_prop = nullptr;
+  //   jvmtiEnv *jvmti = VM::jvmti();
+  //   if (jvmti != nullptr && jvmti->GetSystemProperty("DD_PROFILER_TLS_WATCHER", &watcher_prop) == 0 && watcher_prop != nullptr) {
+  //     watcher_enabled = (std::strcmp(watcher_prop, "1") != 0);
+  //     jvmti->Deallocate((unsigned char*)watcher_prop);
+  //   }
+  // }
 
   if (watcher_enabled) {
     // Start thread directory watcher to prime new threads (no mass-priming of existing threads)
@@ -390,3 +408,31 @@ void ProfiledThread::simpleTlsSignalHandler(int signo) {
     initCurrentThreadWithBuffer();
   }
 }
+
+void ProfiledThread::registerJavaThread(int tid) {  // Marks tid as a Java thread by setting the single bitset bit that tid maps to.
+  // Slot selection: multiply tid by KNUTH_MULTIPLICATIVE_CONSTANT, then reduce modulo the bitset size
+  size_t hash = static_cast<size_t>(tid) * KNUTH_MULTIPLICATIVE_CONSTANT;
+  size_t bit_index = hash % JAVA_THREAD_BITSET_SIZE;  // in [0, JAVA_THREAD_BITSET_SIZE); distinct tids can land on the same index
+  size_t word_index = bit_index / 64;  // which 64-bit word holds the bit
+  uint64_t bit_mask = 1ULL << (bit_index % 64);  // position of the bit inside that word
+  _java_thread_bitset[word_index].value.fetch_or(bit_mask, std::memory_order_release);  // atomic OR: the other 63 bits of the word are left untouched
+}
+
+bool ProfiledThread::isLikelyJavaThread(int tid) {  // Returns true when the bitset bit that tid maps to is currently set.
+  // Same slot selection as registerJavaThread/unregisterJavaThread; the answer is approximate because distinct tids can share a bit
+  size_t hash = static_cast<size_t>(tid) * KNUTH_MULTIPLICATIVE_CONSTANT;
+  size_t bit_index = hash % JAVA_THREAD_BITSET_SIZE;  // in [0, JAVA_THREAD_BITSET_SIZE)
+  size_t word_index = bit_index / 64;  // which 64-bit word holds the bit
+  uint64_t bit_mask = 1ULL << (bit_index % 64);  // position of the bit inside that word
+  uint64_t word = _java_thread_bitset[word_index].value.load(std::memory_order_acquire);  // acquire load, the counterpart of the release updates in register/unregister
+  return (word & bit_mask) != 0;
+}
+
+void ProfiledThread::unregisterJavaThread(int tid) {  // Clears the bitset bit that tid maps to.
+  // Same slot selection as registerJavaThread; a bit shared with another registered tid is cleared for that tid as well
+  size_t hash = static_cast<size_t>(tid) * KNUTH_MULTIPLICATIVE_CONSTANT;
+  size_t bit_index = hash % JAVA_THREAD_BITSET_SIZE;  // in [0, JAVA_THREAD_BITSET_SIZE)
+  size_t word_index = bit_index / 64;  // which 64-bit word holds the bit
+  uint64_t bit_mask = 1ULL << (bit_index % 64);  // position of the bit inside that word
+  _java_thread_bitset[word_index].value.fetch_and(~bit_mask, std::memory_order_release);  // atomic AND with the inverted mask: only this one bit is cleared
+}
@@ -9,6 +9,7 @@
 #include "os_dd.h"
 #include "threadLocalData.h"
 #include "unwindStats.h"
+#include "lockFree.h"
 #include <atomic>
 #include <cstdint>
 #include <jvmti.h>
@@ -184,8 +185,18 @@ class ProfiledThread : public ThreadLocalData {
   inline bool isContextTlsInitialized() {
     return _ctx_tls_initialized;
   }
-  
+
+  // Java thread tracking for TLS priming optimization
+  static void registerJavaThread(int tid);
+  static void unregisterJavaThread(int tid);
+  static bool isLikelyJavaThread(int tid);
+
 private:
+  // Lock-free bitset for Java thread tracking
+  static constexpr size_t JAVA_THREAD_BITSET_SIZE = 8192;
+  static constexpr size_t JAVA_THREAD_BITSET_WORDS = JAVA_THREAD_BITSET_SIZE / 64;
+  static PaddedAtomic<uint64_t> _java_thread_bitset[JAVA_THREAD_BITSET_WORDS];
+  static void initJavaThreadBitset();
   // Atomic flag for signal handler reentrancy protection within the same thread
   // Must be atomic because a signal handler can interrupt normal execution mid-instruction,
   // and both contexts may attempt to enter the critical section. Without atomic exchange(),