clayborg · dmpots · Oct 17, 2025 · Sep 12, 2025 · Oct 10, 2025 · Oct 10, 2025
diff --git a/lldb/include/lldb/Host/common/NativeThreadProtocol.h b/lldb/include/lldb/Host/common/NativeThreadProtocol.h
@@ -56,6 +56,8 @@ class NativeThreadProtocol {
     return llvm::make_error<UnimplementedError>();
   }
 
+  bool HasValidStopReason();
+
 protected:
   NativeProcessProtocol &m_process;
   lldb::tid_t m_tid;

diff --git a/lldb/source/Host/common/NativeThreadProtocol.cpp b/lldb/source/Host/common/NativeThreadProtocol.cpp
@@ -17,3 +17,13 @@ using namespace lldb_private;
 NativeThreadProtocol::NativeThreadProtocol(NativeProcessProtocol &process,
                                            lldb::tid_t tid)
     : m_process(process), m_tid(tid) {}
+
+bool NativeThreadProtocol::HasValidStopReason() {
+  ThreadStopInfo stop_info;
+  std::string stop_description;
+  if (!GetStopReason(stop_info, stop_description))
+    return false;
+
+  return stop_info.reason != lldb::eStopReasonInvalid &&
+         stop_info.reason != lldb::eStopReasonNone;
+}
diff --git a/lldb/test/API/gpu/amd/basic/TestBasicAmdGpuPlugin.py b/lldb/test/API/gpu/amd/basic/TestBasicAmdGpuPlugin.py
@@ -8,6 +8,8 @@
 from lldbsuite.test.lldbtest import *
 from amdgpu_testcase import *
 
+SHADOW_THREAD_NAME = "AMD Native Shadow Thread"
+
 
 class BasicAmdGpuTestCase(AmdGpuTestCaseBase):
     def test_gpu_target_created_on_demand(self):
@@ -27,7 +29,9 @@ def test_gpu_target_created_on_demand(self):
         # Make sure the GPU target was created and has the default thread.
         self.assertEqual(self.dbg.GetNumTargets(), 2, "There are two targets")
         gpu_thread = self.gpu_process.GetThreadAtIndex(0)
-        self.assertEqual(gpu_thread.GetName(), "AMD Native Shadow Thread", "GPU thread has the right name")
+        self.assertEqual(
+            gpu_thread.GetName(), SHADOW_THREAD_NAME, "GPU thread has the right name"
+        )
 
         # The target should have the triple set correctly.
         self.assertIn("amdgcn-amd-amdhsa", self.gpu_target.GetTriple())
@@ -45,6 +49,31 @@ def test_gpu_breakpoint_hit(self):
         )
         self.assertNotEqual(None, gpu_threads, "GPU should be stopped at breakpoint")
 
+    def test_num_threads(self):
+        """Test that we get the expected number of threads."""
+        self.build()
+
+        # GPU breakpoint should get hit by at least one thread.
+        source = "hello_world.hip"
+        gpu_threads_at_bp = self.run_to_gpu_breakpoint(
+            source, "// GPU BREAKPOINT", "// CPU BREAKPOINT - BEFORE LAUNCH"
+        )
+        self.assertNotEqual(
+            None, gpu_threads_at_bp, "GPU should be stopped at breakpoint"
+        )
+
+        # We launch one thread for each character in the output string.
+        gpu_threads = self.gpu_process.threads
+        num_expected_threads = len("Hello, world!")
+        self.assertEqual(len(gpu_threads), num_expected_threads)
+
+        # The shadow thread should not be listed once we have real threads
+        for thread in gpu_threads:
+            self.assertNotEqual(SHADOW_THREAD_NAME, thread.GetName())
+
+        # All threads should be stopped at the breakpoint.
+        self.assertEqual(len(gpu_threads_at_bp), num_expected_threads)
+
     def test_no_unexpected_stop(self):
         """Test that we do not unexpectedly hit a stop in the debugger when
         No breakpoints are set."""

diff --git a/lldb/test/API/gpu/amd/multi-wave/Makefile b/lldb/test/API/gpu/amd/multi-wave/Makefile
@@ -0,0 +1,3 @@
+HIP_SOURCES := multi-wave.hip
+
+include Makefile.rules
diff --git a/lldb/test/API/gpu/amd/multi-wave/TestMultiWaveAmdGpuPlugin.py b/lldb/test/API/gpu/amd/multi-wave/TestMultiWaveAmdGpuPlugin.py
@@ -0,0 +1,42 @@
+"""
+Basic tests for the AMDGPU plugin with a multi-wave kernel.
+"""
+
+
+import lldb
+import lldbsuite.test.lldbutil as lldbutil
+from lldbsuite.test.lldbtest import *
+from amdgpu_testcase import *
+
+SHADOW_THREAD_NAME = "AMD Native Shadow Thread"
+
+
+class BasicAmdGpuTestCase(AmdGpuTestCaseBase):
+    def run_to_breakpoint(self):
+        # GPU breakpoint should get hit by at least one thread.
+        source = "multi-wave.hip"
+        gpu_threads_at_bp = self.run_to_gpu_breakpoint(
+            source, "// GPU BREAKPOINT", "// CPU BREAKPOINT - BEFORE LAUNCH"
+        )
+        self.assertNotEqual(
+            None, gpu_threads_at_bp, "GPU should be stopped at breakpoint"
+        )
+
+        return gpu_threads_at_bp
+
+
+    def test_num_threads(self):
+        """Test that we get the expected number of threads."""
+        self.build()
+
+        gpu_threads_at_breakpoint = self.run_to_breakpoint()
+
+        # We launch 960 total threads (8 blocks * 120 threads per block).
+        gpu_threads = self.gpu_process.threads
+        self.assertEqual(len(gpu_threads), 960)
+
+        # But not all waves may reach the breakpoint at the same time.
+        # So here we check that we have at least one wave's worth of threads
+        # stopped at the breakpoint. With wave size of 64, this means we should
+        # have at least 56 threads (120 = 64 + 56) hitting the breakpoint.
+        self.assertGreaterEqual(len(gpu_threads_at_breakpoint), 56)
diff --git a/lldb/test/API/gpu/amd/multi-wave/multi-wave.hip b/lldb/test/API/gpu/amd/multi-wave/multi-wave.hip
@@ -0,0 +1,151 @@
+#include <cstddef>
+#include <hip/hip_runtime.h>
+
+#include <iostream>
+#include <limits>
+
+/// \brief Checks if the provided error code is \p hipSuccess and if not,
+/// prints an error message to the standard error output and terminates the
+/// program with an error code.
+constexpr int error_exit_code = -1;
+#define HIP_CHECK(condition)                                              \
+  {                                                                       \
+    const hipError_t error = condition;                                   \
+    if (error != hipSuccess) {                                            \
+      std::cerr << "An error encountered: \"" << hipGetErrorString(error) \
+                << "\" at " << __FILE__ << ':' << __LINE__ << std::endl;  \
+      std::exit(error_exit_code);                                         \
+    }                                                                     \
+  }
+
+static constexpr unsigned divideCeil(unsigned Numerator, unsigned Denominator) {
+  assert(Denominator && "Division by zero");
+  uint64_t Bias = (Numerator != 0);
+  return (Numerator - Bias) / Denominator + Bias;
+}
+
+static std::string get_num_threads_in_wave(unsigned num_threads_in_block, unsigned wave_size ) {
+  std::vector<unsigned> num_threads_in_wave(divideCeil(num_threads_in_block, wave_size), wave_size);
+  if (num_threads_in_block % wave_size != 0) {
+    num_threads_in_wave.back() = num_threads_in_block % wave_size;
+  }
+
+  std::string result = "[";
+  for (size_t i = 0; i < num_threads_in_wave.size(); ++i) {
+    result += std::to_string(num_threads_in_wave[i]);
+    if (i != num_threads_in_wave.size() - 1) {
+      result += ", ";
+    }
+  }
+  result += "]";
+  return result;
+}
+
+__device__ unsigned block_counter = 0;
+__device__ void wait_for_all_threads() {
+  // Wait until all threads in the block have reached this point.
+  __syncthreads();
+
+  // Wait until all blocks have reached this point.
+  const unsigned int num_blocks = gridDim.x * gridDim.y * gridDim.z;
+  const bool is_first_thread_in_block = (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0);
+  if (is_first_thread_in_block) {
+    atomicAdd(&block_counter, 1);
+    while (atomicAdd(&block_counter, 0) < num_blocks) {
+    }
+  }
+
+  // Wait until all threads in the block have reached this point.
+  // This is to ensure that all blocks have incremented the block counter
+  // before continuing.
+  __syncthreads();
+}
+
+__device__ unsigned int get_global_idx() {
+  // Calculate block ID in the grid
+  unsigned block_id =
+      blockIdx.x + blockIdx.y * gridDim.y + blockIdx.z * gridDim.x * gridDim.y;
+
+  // Calculate thread ID within the block
+  unsigned thread_id_in_block = threadIdx.x + threadIdx.y * blockDim.x +
+                                threadIdx.z * blockDim.x * blockDim.y;
+
+  // Calculate global thread ID
+  unsigned threads_per_block = blockDim.x * blockDim.y * blockDim.z;
+  unsigned global_thread_id = block_id * threads_per_block + thread_id_in_block;
+
+  return global_thread_id;
+}
+
+// Simple kernel that assigns each thread a unique ID
+// and stores it in the output buffer.
+__global__ void multi_wave_kernel(unsigned *buf, bool sync_threads) {
+  if (sync_threads) {
+    wait_for_all_threads();
+  }
+  unsigned int thread_idx = get_global_idx();
+  buf[thread_idx] = thread_idx; // GPU BREAKPOINT
+}
+
+int main() {
+  const dim3 blocks(2, 2, 2);  // 3D grid specifying number of blocks to launch
+  const dim3 threads(5, 4, 6); // 3D grid specifying number of threads to launch
+  const size_t num_blocks = blocks.x * blocks.y * blocks.z;
+  const size_t threads_per_block = threads.x * threads.y * threads.z;
+  const size_t num_threads = num_blocks * threads_per_block;
+  const size_t buffer_size = num_threads * sizeof(unsigned);
+
+  int device;
+  HIP_CHECK(hipGetDevice(&device));
+  hipDeviceProp_t deviceProp;
+  HIP_CHECK(hipGetDeviceProperties(&deviceProp, device));
+  unsigned wave_size = deviceProp.warpSize;
+  printf("===================TEST CONFIGURATION===================\n");
+  printf("Running on device:       %s\n", deviceProp.name);
+  printf("Wave size:               %d\n", wave_size);
+  printf("Total number of threads: %zu\n", num_threads);
+  printf("Total number of blocks:  %zu\n", num_blocks);
+  printf("Total number of waves:   %d\n", divideCeil(num_threads, wave_size));
+  printf("Threads per block:       %zu\n", threads_per_block);
+  printf("Waves per block:         %d\n",
+         divideCeil(threads_per_block, wave_size));
+  printf("Threads per wave:        %s\n",
+         get_num_threads_in_wave(threads_per_block, wave_size).c_str());
+  printf("===================TEST CONFIGURATION===================\n");
+  printf("\n\n");
+
+  // Allocate host vectors
+  std::vector<unsigned> h_buf(num_threads,
+                              std::numeric_limits<unsigned>::max());
+
+  // Allocate device memory for the output data
+  unsigned *d_buf;
+  HIP_CHECK(hipMalloc(&d_buf, buffer_size));
+
+  // Copy data from host to device
+  printf("Copying data to device...\n");
+  HIP_CHECK(hipMemcpy(d_buf, h_buf.data(), buffer_size, hipMemcpyHostToDevice));
+
+  // Launch the kernel.
+  printf("Launching multi-wave kernel with %zu threads...\n",
+         num_threads); // CPU BREAKPOINT - BEFORE LAUNCH
+  multi_wave_kernel<<<blocks, threads, 0, hipStreamDefault>>>(d_buf, true);
+
+  // Copy data from device to host
+  printf("Copying data to host...\n");   // CPU BREAKPOINT - AFTER LAUNCH
+  HIP_CHECK(hipMemcpy(h_buf.data(), d_buf, buffer_size, hipMemcpyDeviceToHost));
+
+  // Free device memory
+  HIP_CHECK(hipFree(d_buf)); // CPU BREAKPOINT - AFTER FINISH
+
+  // Print the output
+  printf("Validating output...\n");
+  for (size_t i = 0; i < num_threads; ++i) { 
+    if (h_buf[i] != i) {
+      std::cerr << "Error: Expected " << i << " but got " << h_buf[i] << std::endl;
+      return error_exit_code;
+    }
+  }
+  printf("Output matches expected values!\n");
+  return 0;
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		HIP_SOURCES := multi-wave.hip

		include Makefile.rules