Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,25 @@ With separate debug symbol packages for production debugging support.

- This ensures the full build log is captured to a file and only a summary is shown in the main session.

## GitHub Operations

### Updating PR Descriptions

The `gh pr edit` command may fail with a GraphQL error about "Projects (classic)" deprecation. Use the GitHub API directly instead:

```bash
# 1. Write the PR body to a file (e.g., pr-body.md)

# 2. Convert to JSON and update via API
jq -Rs '{body: .}' pr-body.md > /tmp/pr-update.json
gh api repos/DataDog/java-profiler/pulls/<PR_NUMBER> -X PATCH --input /tmp/pr-update.json

# 3. Verify the update
gh pr view <PR_NUMBER> --json body -q '.body' | head -30
```

This workaround properly escapes the markdown content and avoids the GraphQL Projects deprecation error.

## Ground rules
- Never replace the code you work on with stubs
- Never 'fix' the tests by testing constants against constants
Expand Down
5 changes: 5 additions & 0 deletions ddprof-lib/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,11 @@ tasks.register('sourcesJar', Jar) {
archiveVersion = component_version
}

javadoc {
// Exclude classes that use internal JDK APIs not visible to javadoc
exclude '**/BufferWriter8.java'
}

tasks.register('javadocJar', Jar) {
dependsOn javadoc
archiveBaseName = libraryName
Expand Down
2 changes: 1 addition & 1 deletion ddprof-lib/src/main/cpp/callTraceStorage.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ uint64_t HazardPointer::occupied_bitmap[HazardPointer::BITMAP_WORDS] = {};
int HazardPointer::getThreadHazardSlot() {
// Signal-safe collision resolution: use OS::threadId() with semi-random prime step probing
// This avoids thread_local allocation issues
ProfiledThread* thrd = ProfiledThread::currentSignalSafe();
ProfiledThread* thrd = ProfiledThread::get();
int tid = thrd != nullptr ? thrd->tid() : OS::threadId();

// Semi-random prime step probing to eliminate secondary clustering
Expand Down
7 changes: 4 additions & 3 deletions ddprof-lib/src/main/cpp/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,16 @@
DLLEXPORT thread_local Context context_tls_v1;

Context& Contexts::initializeContextTls() {
// ProfiledThread::current() will never return nullptr
Context& ctx = context_tls_v1;
// Store pointer for signal-safe access
ProfiledThread::current()->markContextTlsInitialized(&ctx);
// Use getOrCreate() because this can be called before profiling starts
// (e.g., context TLS init during library loading, before onThreadStart callback)
ProfiledThread::getOrCreate()->markContextTlsInitialized(&ctx);
return ctx;
}

Context& Contexts::get() {
ProfiledThread* thrd = ProfiledThread::currentSignalSafe();
ProfiledThread* thrd = ProfiledThread::get();
if (thrd == nullptr || !thrd->isContextTlsInitialized()) {
return DD_EMPTY_CONTEXT;
}
Expand Down
4 changes: 2 additions & 2 deletions ddprof-lib/src/main/cpp/criticalSection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
uint64_t CriticalSection::_fallback_bitmap[CriticalSection::FALLBACK_BITMAP_WORDS] = {};

CriticalSection::CriticalSection() : _entered(false), _using_fallback(false), _word_index(0), _bit_mask(0) {
ProfiledThread* current = ProfiledThread::currentSignalSafe();
ProfiledThread* current = ProfiledThread::get();
if (current != nullptr) {
// Primary path: Use ProfiledThread storage (fast and memory-efficient)
_entered = current->tryEnterCriticalSection();
Expand All @@ -39,7 +39,7 @@ CriticalSection::~CriticalSection() {
__atomic_fetch_and(&_fallback_bitmap[_word_index], ~_bit_mask, __ATOMIC_RELAXED);
} else {
// Release ProfiledThread flag
ProfiledThread* current = ProfiledThread::currentSignalSafe();
ProfiledThread* current = ProfiledThread::get();
if (current != nullptr) {
current->exitCriticalSection();
}
Expand Down
11 changes: 3 additions & 8 deletions ddprof-lib/src/main/cpp/ctimer_linux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,11 @@ static int pthread_setspecific_hook(pthread_key_t key, const void *value) {
}

if (value != NULL) {
ProfiledThread::initCurrentThread();
int result = pthread_setspecific(key, value);
Profiler::registerThread(ProfiledThread::currentTid());
return result;
Profiler::registerCurrentThread();
return pthread_setspecific(key, value);
} else {
int tid = ProfiledThread::currentTid();
Profiler::unregisterThread(tid);
ProfiledThread::release();
return pthread_setspecific(key, value);
}
}
Expand Down Expand Up @@ -88,8 +85,6 @@ int CTimer::_signal;

int CTimer::registerThread(int tid) {
if (tid >= _max_timers) {
Log::warn("tid[%d] > pid_max[%d]. Restart profiler after changing pid_max",
tid, _max_timers);
return -1;
}

Expand Down Expand Up @@ -210,7 +205,7 @@ void CTimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
if (!__atomic_load_n(&_enabled, __ATOMIC_ACQUIRE))
return;
int tid = 0;
ProfiledThread *current = ProfiledThread::currentSignalSafe();
ProfiledThread *current = ProfiledThread::get();
assert(current == nullptr || !current->isDeepCrashHandler());
if (current != NULL) {
current->noteCPUSample(Profiler::instance()->recordingEpoch());
Expand Down
2 changes: 1 addition & 1 deletion ddprof-lib/src/main/cpp/itimer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ void ITimer::signalHandler(int signo, siginfo_t *siginfo, void *ucontext) {
return; // Another critical section is active, defer profiling
}
int tid = 0;
ProfiledThread *current = ProfiledThread::currentSignalSafe();
ProfiledThread *current = ProfiledThread::get();
if (current != NULL) {
current->noteCPUSample(Profiler::instance()->recordingEpoch());
tid = current->tid();
Expand Down
6 changes: 4 additions & 2 deletions ddprof-lib/src/main/cpp/javaApi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ Java_com_datadoghq_profiler_JavaProfiler_getSamples(JNIEnv *env,
// still compatible in the event of signature changes in the future.
extern "C" DLLEXPORT void JNICALL
JavaCritical_com_datadoghq_profiler_JavaProfiler_filterThreadAdd0() {
ProfiledThread *current = ProfiledThread::current();
// TLS is guaranteed to be set up by onThreadStart() before any Java code runs
ProfiledThread *current = ProfiledThread::get();
if (unlikely(current == nullptr)) {
return;
}
Expand Down Expand Up @@ -163,7 +164,8 @@ JavaCritical_com_datadoghq_profiler_JavaProfiler_filterThreadAdd0() {

extern "C" DLLEXPORT void JNICALL
JavaCritical_com_datadoghq_profiler_JavaProfiler_filterThreadRemove0() {
ProfiledThread *current = ProfiledThread::current();
// TLS is guaranteed to be set up by onThreadStart() before any Java code runs
ProfiledThread *current = ProfiledThread::get();
if (unlikely(current == nullptr)) {
return;
}
Expand Down
238 changes: 238 additions & 0 deletions ddprof-lib/src/main/cpp/lockFree.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
/*
* Copyright 2025, Datadog, Inc.
* SPDX-License-Identifier: Apache-2.0
*/

#ifndef _LOCKFREE_H
#define _LOCKFREE_H

#include "common.h"

#include <atomic>
#include <cstddef>
#include <cstdint>

/**
* Lock-free atomic primitives and utilities.
*
* This header provides building blocks for lock-free data structures:
* - PaddedAtomic: Cache-line padded atomics to prevent false sharing
* - LockFreeBitset: Lock-free bitset for concurrent membership tracking
*
* For complete synchronization classes (SpinLock, mutexes), see spinLock.h
*/

// Cache line size for preventing false sharing (typical for x86/ARM)
// Note: This duplicates DEFAULT_CACHE_LINE_SIZE from arch_dd.h for standalone use
constexpr size_t CACHE_LINE_SIZE = 64;

/**
* Atomic value padded to its own cache line to prevent false sharing.
*
* Use this when you have an array of atomics that are frequently accessed
* by different threads. Without padding, atomics in adjacent array elements
* may share a cache line, causing false sharing that degrades performance.
*
* False sharing occurs when:
* - Thread A modifies atomic at index 0
* - Thread B modifies atomic at index 1
* - Both atomics are on the same cache line
* - CPU must invalidate entire cache line, forcing both threads to reload
*
* Example usage:
* static PaddedAtomic<uint64_t> counters[128]; // Each counter on own cache line
* counters[i].value.fetch_add(1, std::memory_order_relaxed);
*
* @tparam T The atomic value type (e.g., uint64_t, int, bool)
*/
template<typename T>
struct alignas(CACHE_LINE_SIZE) PaddedAtomic {
std::atomic<T> value;
// Padding is automatic due to alignas - ensures this struct occupies full cache line
};

/**
* Lock-free bitset for concurrent membership tracking.
*
* A fixed-size bitset that supports lock-free set, clear, and test operations.
* Uses cache-line padded atomic words to prevent false sharing between threads
* operating on different portions of the bitset.
*
* Hash-based operations use double-hashing with two independent hash functions
* to minimize false positives. A key is considered "set" only if both
* corresponding bits (from both hash functions) are set. This reduces the
* false positive probability from p to p² compared to single-hash approaches.
*
* Thread safety:
* - All operations are lock-free and async-signal-safe
* - Uses atomic operations with appropriate memory ordering
* - Safe to call from signal handlers
*
* Example usage:
* static LockFreeBitset<8192> threadSet;
*
* // Hash-based operations (for integer keys like thread IDs)
* threadSet.set(tid); // Mark thread as member
* if (threadSet.test(tid)) { ... } // Check membership
* threadSet.clear(tid); // Remove from set
*
* // Raw bit operations (when you manage indexing yourself)
* threadSet.setRaw(42); // Set bit 42
* threadSet.clearRaw(42); // Clear bit 42
*
* @tparam NumBits Total number of bits per array (should be power of 2 for efficient hashing)
*/
template<size_t NumBits>
class LockFreeBitset {
public:
static constexpr size_t NUM_BITS = NumBits;
static constexpr size_t BITS_PER_WORD = 64;
static constexpr size_t NUM_WORDS = (NumBits + BITS_PER_WORD - 1) / BITS_PER_WORD;

/**
* Initializes the bitset with all bits cleared in both arrays.
*/
void init() {
for (size_t i = 0; i < NUM_WORDS * 2; i++) {
_words[i].value.store(0, std::memory_order_relaxed);
}
}

/**
* Sets the bits for the given key using double-hash indexing.
* Sets bits in both arrays using two independent hash functions.
*
* @param key Integer key to hash and set
*/
void set(size_t key) {
setBit(hashKey1(key), 0); // Array 1 at even indices
setBit(hashKey2(key), 1); // Array 2 at odd indices
}

/**
* Clears the bits for the given key using double-hash indexing.
* Clears bits in both arrays using two independent hash functions.
*
* @param key Integer key to hash and clear
*/
void clear(size_t key) {
clearBit(hashKey1(key), 0); // Array 1 at even indices
clearBit(hashKey2(key), 1); // Array 2 at odd indices
}

/**
* Tests if the key is set using double-hash indexing.
* Returns true only if BOTH bits (from both hash functions) are set.
* This minimizes false positives compared to single-hash approaches.
*
* @param key Integer key to hash and test
* @return true if both bits are set, false otherwise
*/
bool test(size_t key) const {
return testBit(hashKey1(key), 0) && testBit(hashKey2(key), 1);
}

/**
* Sets the bit at the given raw index in the primary array (no hashing).
*
* @param bit_index Raw bit index (0 to NumBits-1)
*/
void setRaw(size_t bit_index) {
setBit(bit_index, 0); // Use array 1 (even indices)
}

/**
* Clears the bit at the given raw index in the primary array (no hashing).
*
* @param bit_index Raw bit index (0 to NumBits-1)
*/
void clearRaw(size_t bit_index) {
clearBit(bit_index, 0); // Use array 1 (even indices)
}

/**
* Tests if the bit at the given raw index is set in the primary array (no hashing).
*
* @param bit_index Raw bit index (0 to NumBits-1)
* @return true if the bit is set, false otherwise
*/
bool testRaw(size_t bit_index) const {
return testBit(bit_index, 0); // Use array 1 (even indices)
}

/**
* Clears all bits in both arrays.
*/
void clearAll() {
init();
}

private:
// Second hash constant - FNV offset basis provides good independence from Knuth constant
static constexpr size_t HASH2_CONSTANT = 0x517cc1b727220a95ULL;

// Interleaved array layout for L1 cache optimization.
// Layout: [word1_0, word2_0, word1_1, word2_1, ..., word1_N-1, word2_N-1]
// When test() accesses both hash positions, if they map to similar word indices,
// they'll be on adjacent cache lines, improving cache hit rate.
// Total memory: NUM_WORDS * 2 * 64 bytes (e.g., 256 * 2 * 64 = 32 KB for 16384 bits)
PaddedAtomic<uint64_t> _words[NUM_WORDS * 2];

/**
* Primary hash function using Knuth multiplicative hash.
*/
static size_t hashKey1(size_t key) {
return (key * KNUTH_MULTIPLICATIVE_CONSTANT) % NumBits;
}

/**
* Secondary hash function using upper bits of multiplication.
* While hash1 uses lower bits (via modulo), hash2 uses upper bits
* to provide true independence between the two hash functions.
*/
static size_t hashKey2(size_t key) {
// Use upper 32 bits of the multiplication result
// This provides independence from hash1 which uses lower bits via modulo
size_t product = key * HASH2_CONSTANT;
return (product >> 32) % NumBits;
}

/**
* Sets a bit in the interleaved array.
* @param bit_index The bit index within the logical array
* @param array_offset 0 for array1 (even indices), 1 for array2 (odd indices)
*/
void setBit(size_t bit_index, size_t array_offset) {
size_t word_index = bit_index / BITS_PER_WORD;
size_t interleaved_index = word_index * 2 + array_offset;
uint64_t bit_mask = 1ULL << (bit_index % BITS_PER_WORD);
_words[interleaved_index].value.fetch_or(bit_mask, std::memory_order_release);
}

/**
* Clears a bit in the interleaved array.
* @param bit_index The bit index within the logical array
* @param array_offset 0 for array1 (even indices), 1 for array2 (odd indices)
*/
void clearBit(size_t bit_index, size_t array_offset) {
size_t word_index = bit_index / BITS_PER_WORD;
size_t interleaved_index = word_index * 2 + array_offset;
uint64_t bit_mask = 1ULL << (bit_index % BITS_PER_WORD);
_words[interleaved_index].value.fetch_and(~bit_mask, std::memory_order_release);
}

/**
* Tests a bit in the interleaved array.
* @param bit_index The bit index within the logical array
* @param array_offset 0 for array1 (even indices), 1 for array2 (odd indices)
*/
bool testBit(size_t bit_index, size_t array_offset) const {
size_t word_index = bit_index / BITS_PER_WORD;
size_t interleaved_index = word_index * 2 + array_offset;
uint64_t bit_mask = 1ULL << (bit_index % BITS_PER_WORD);
uint64_t word = _words[interleaved_index].value.load(std::memory_order_acquire);
return (word & bit_mask) != 0;
}
};

#endif // _LOCKFREE_H
Loading
Loading