7 changes: 6 additions & 1 deletion .clang-format
@@ -1,3 +1,8 @@
+IndentWidth: 2
BasedOnStyle: 'google'
ColumnLimit: 100
-SortIncludes: false
+SortIncludes: false
+QualifierAlignment: Right
+UseTab: Never
+DerivePointerAlignment: false
+PointerAlignment: Left
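
A note on the new options (illustrative snippet, not from this repo): QualifierAlignment: Right rewrites "west const" declarations as "east const", and DerivePointerAlignment: false with PointerAlignment: Left pins the asterisk to the type. Both spellings name the same type, so the const-shuffles in the hunks below are behavior-neutral:

// illustrative only -- hypothetical declarations, not from this PR
const char* s0;  // west const: the style accepted before this change
char const* s1;  // east const: what QualifierAlignment: Right now produces
float *p0;       // '*' bound to the name
float* p1;       // what PointerAlignment: Left now produces
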
2 changes: 1 addition & 1 deletion inc/mkn/gpu/cuda.hpp
@@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
-inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
+inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) std::abort();
4 changes: 2 additions & 2 deletions inc/mkn/gpu/launchers.hpp
@@ -48,7 +48,7 @@ auto as_values(Args&... args) {

template <bool _sync = true>
struct GDLauncher : public GLauncher {
-  GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {}
+  GDLauncher(std::size_t const s, size_t const dev = 0) : GLauncher{s, dev} {}

template <typename F, typename... Args>
auto operator()(F&& f, Args&&... args) {
@@ -69,7 +69,7 @@

template <bool _sync = true>
struct DLauncher : public Launcher {
-  DLauncher(size_t /*dev*/ = 0) : Launcher{{}, {}} {}
+  DLauncher(size_t const /*dev*/ = 0) : Launcher{{}, {}} {}

template <typename F, typename... Args>
auto operator()(F&& f, Args&&... args) {
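Side note on the constructor changes above (general C++, not specific to this PR): top-level const on a by-value parameter does not participate in the function's signature, so these edits are style-only with no overload or ABI impact. A hypothetical launch() for illustration:

// the two lines below declare the same function
void launch(std::size_t n);
void launch(std::size_t const n);  // top-level const: only protects the definition's local copy
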
2 changes: 1 addition & 1 deletion inc/mkn/gpu/rocm.hpp
@@ -46,7 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
-inline void gpuAssert(hipError_t code, const char* file, int line, bool abort = true) {
+inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = true) {
if (code != hipSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
if (abort) std::abort();
2 changes: 1 addition & 1 deletion inc/mkn/gpu/tuple.hpp
@@ -121,7 +121,7 @@ struct SpanSet : ASpanSet<T, SIZE, GPU> {
curr_pos += sv->sizes[curr_ptr++];
return *this;
}
-  bool operator!=(const iterator& /*other*/) const __device__ {
+  bool operator!=(iterator const& /*other*/) const __device__ {
return curr_ptr != sv->sizes.size();
}
Span<T, SIZE> operator*() const {
2 changes: 1 addition & 1 deletion test/any/array.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::idx();
a[i] = b[i] + c[i];
}
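For orientation, a sketch of how such a kernel might be driven by the launchers touched earlier in this PR, assuming GDLauncher's operator()(F&&, Args&&...) from inc/mkn/gpu/launchers.hpp and device-visible buffers a, b, c (none of this appears in the hunk above):

// hypothetical usage sketch, not from this diff
mkn::gpu::GDLauncher<>{NUM}(vectoradd<float>, a, b, c);  // one thread per element, NUM total
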
41 changes: 39 additions & 2 deletions test/any/async_streaming.cpp
@@ -2,6 +2,7 @@
#include <thread>
#include <algorithm>

#include "mkn/gpu.hpp"
#include "mkn/kul/dbg.hpp"
#include "mkn/kul/time.hpp"
#include "mkn/gpu/multi_launch.hpp"
@@ -179,7 +180,7 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
.dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; })();

std::size_t val = 5;
-  for (std::size_t i = 0; i < vecs.size(); i++) {
+  for (std::size_t i = 0; i < vecs.size(); ++i) {
if (i % group_size == 0) {
for (auto const& e : vecs[i])
if (e != val + 1) return 1;
@@ -193,12 +194,48 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
return 0;
}

+std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2) {
+  using T = double;
+  KUL_DBG_FUNC_ENTER;
+
+  std::vector<ManagedVector<T>> vecs(C, ManagedVector<T>(NUM, 0));
+  for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
+
+  ManagedVector<T*> datas(C);
+  for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
+  auto views = datas.data();
+
+  ThreadedStreamLauncher launcher{vecs, nthreads};
+  launcher
+      .host([&](auto i) mutable {
+        launcher.streams[i].sync();  // wait for first kernel per stream
+        for (auto& e : vecs[i]) e += 1;
+      })
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; });
+
+  for (std::size_t i = 0; i < datas.size(); ++i) {
+    mkn::gpu::GDLauncher<false>{NUM}.stream(
+        launcher.streams[i], [=, idx = i] __device__() { views[idx][mkn::gpu::idx()] += 1; });
+  }
+
+  launcher();
+
+  std::size_t val = 5;
+  for (std::size_t i = 0; i < vecs.size(); ++i) {
+    for (std::size_t j = 0; j < vecs[i].size(); ++j)
+      if (val != vecs[i][j]) return 1;
+    ++val;
+  };
+
+  return 0;
+}
+
int main() {
KOUT(NON) << __FILE__;
return test() //
+ test_threaded() //
+ test_threaded(6) //
+ test_threaded_group_barrier() //
+ test_threaded_host_group_mutex() //
-         + test_threaded_host_group_idx();
+         + test_threaded_host_group_idx() + test_threaded_detached_stream_fns();
}
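
Reading the new test_threaded_detached_stream_fns: each vector is filled with its own index, one detached kernel per stream is queued via GDLauncher<false>::stream before the launcher runs, the host step syncs its stream and then increments, and the device step adds 3. The val check walks exactly that arithmetic:

// expected final value of every element of vecs[i]:
//   i (initial fill) + 1 (detached stream kernel) + 1 (host step) + 3 (dev step) == i + 5
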
2 changes: 1 addition & 1 deletion test/cpu/namespace.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::cpu::idx();
a[i] = b[i] + c[i];
}
2 changes: 1 addition & 1 deletion test/cuda/add.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::cuda::idx();
a[i] = b[i] + c[i];
}
20 changes: 10 additions & 10 deletions test/cuda/async.cpp
@@ -43,15 +43,15 @@ inline cudaError_t checkCuda(cudaError_t result) {
return result;
}

-__global__ void kernel(float *a, int offset) {
+__global__ void kernel(float* a, int offset) {
int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s * s + c * c);
}

-float maxError(float *a, int n) {
+float maxError(float* a, int n) {
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i] - 1.0f);
@@ -60,12 +60,12 @@ float maxError(float *a, int n) {
return maxE;
}

-int main(int argc, char **argv) {
-  const int blockSize = 256, nStreams = 4;
-  const int n = 4 * 1024 * blockSize * nStreams;
-  const int streamSize = n / nStreams;
-  const int streamBytes = streamSize * sizeof(float);
-  const int bytes = n * sizeof(float);
+int main(int argc, char** argv) {
+  int const blockSize = 256, nStreams = 4;
+  int const n = 4 * 1024 * blockSize * nStreams;
+  int const streamSize = n / nStreams;
+  int const streamBytes = streamSize * sizeof(float);
+  int const bytes = n * sizeof(float);

int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
@@ -77,8 +77,8 @@ int main(int argc, char **argv) {

// allocate pinned host memory and device memory
float *a, *d_a;
-  checkCuda(cudaMallocHost((void **)&a, bytes));  // host pinned
-  checkCuda(cudaMalloc((void **)&d_a, bytes));    // device
+  checkCuda(cudaMallocHost((void**)&a, bytes));  // host pinned
+  checkCuda(cudaMalloc((void**)&d_a, bytes));    // device

float ms; // elapsed time in milliseconds

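The collapsed middle of this file is the classic copy/compute overlap benchmark; a minimal sketch of the per-stream loop it presumably contains, assuming cudaStream_t stream[nStreams] handles created in the elided region (the other names match declarations visible above):

// assumed shape of the chunked overlap loop -- not shown in this diff
for (int i = 0; i < nStreams; ++i) {
  int const offset = i * streamSize;
  checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]));
  kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
  checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]));
}
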
2 changes: 1 addition & 1 deletion test/cuda/atomic.cpp
@@ -16,7 +16,7 @@ struct S {
} // namespace NS0

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::cuda::idx();

NS0::S<T>{a[i]} += b[i] + c[i];
2 changes: 1 addition & 1 deletion test/hip/add.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::hip::idx();
a[i] = b[i] + c[i];
}
20 changes: 10 additions & 10 deletions test/hip/async.cpp
@@ -43,15 +43,15 @@ inline void checkHip([[maybe_unused]] hipError_t result) {
// return result;
}

-__global__ void kernel(float *a, int offset) {
+__global__ void kernel(float* a, int offset) {
int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s * s + c * c);
}

-float maxError(float *a, int n) {
+float maxError(float* a, int n) {
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i] - 1.0f);
Expand All @@ -60,12 +60,12 @@ float maxError(float *a, int n) {
return maxE;
}

-int main(int argc, char **argv) {
-  const int blockSize = 256, nStreams = 4;
-  const int n = 4 * 1024 * blockSize * nStreams;
-  const int streamSize = n / nStreams;
-  const int streamBytes = streamSize * sizeof(float);
-  const int bytes = n * sizeof(float);
+int main(int argc, char** argv) {
+  int const blockSize = 256, nStreams = 4;
+  int const n = 4 * 1024 * blockSize * nStreams;
+  int const streamSize = n / nStreams;
+  int const streamBytes = streamSize * sizeof(float);
+  int const bytes = n * sizeof(float);

int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
@@ -77,8 +77,8 @@ int main(int argc, char **argv) {

// allocate pinned host memory and device memory
float *a, *d_a;
-  checkHip(hipHostMalloc((void **)&a, bytes));  // host pinned
-  checkHip(hipMalloc((void **)&d_a, bytes));    // device
+  checkHip(hipHostMalloc((void**)&a, bytes));  // host pinned
+  checkHip(hipMalloc((void**)&d_a, bytes));    // device

float ms; // elapsed time in milliseconds
