Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@

# mkn.gpu

# mkn.gpu
CUDA/HIP C++20 convenience wrappers

[README](https://raw.githubusercontent.com/mkn/mkn.gpu/master/README.noformat)
7 changes: 6 additions & 1 deletion README.noformat
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
mkn.gpu

CUDA/HIP C++17 convenience wrappers
CUDA/HIP C++20 convenience wrappers

======

Expand All @@ -27,6 +27,11 @@ Description expose functions explicitly via
mkn::gpu::hip::*
mkn::gpu::cuda::*

Key _MKN_GPU_WARP_SIZE_
Type uint
Default use manufacturer provided (eg warpSize), usually 32
Description override use if defined

Key _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_
Type uint
Default 1
Expand Down
11 changes: 8 additions & 3 deletions inc/mkn/gpu/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "mkn/kul/span.hpp"
#include "mkn/kul/tuple.hpp"
#include "mkn/kul/assert.hpp"
#include "mkn/kul/threads.hpp"

#include "mkn/gpu/cli.hpp"
#include "mkn/gpu/def.hpp"

#include <algorithm>
#include <cassert>
#include <cstring>
#include <algorithm>

#define MKN_GPU_ASSERT(x) (KASSERT((x)))

Expand Down Expand Up @@ -90,12 +88,14 @@ struct dim3 {
};

dim3 static inline threadIdx, blockIdx;
static constexpr int warpSize = 1;

#endif // MKN_CPU_DO_NOT_DEFINE_DIM3

//

namespace MKN_GPU_NS {
static constexpr int warp_size = warpSize;

void inline setLimitMallocHeapSize(std::size_t const& /*bytes*/) {} /*noop*/

Expand Down Expand Up @@ -280,6 +280,11 @@ void fill(Container& c, T const val) {
fill(c, c.size(), val);
}

template <typename T>
void zero(T* const t, std::size_t const size) {
std::fill(t, t + size, 0);
}

void inline prinfo(std::size_t /*dev*/ = 0) { KOUT(NON) << "Psuedo GPU in use"; }

} // namespace MKN_GPU_NS
Expand Down
13 changes: 7 additions & 6 deletions inc/mkn/gpu/cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef _MKN_GPU_CUDA_HPP_
#define _MKN_GPU_CUDA_HPP_

#include <vector>

#include "mkn/kul/log.hpp"
#include "mkn/kul/span.hpp"
#include "mkn/kul/tuple.hpp"
#include "mkn/kul/assert.hpp"

#include "mkn/gpu/def.hpp"
#include "mkn/gpu/cli.hpp"

#include <cuda_runtime.h>

Expand Down Expand Up @@ -76,8 +72,6 @@ __device__ SIZE block_idx_x() {

} // namespace mkn::gpu::cuda

//

#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
#define MKN_GPU_NS mkn::gpu::cuda
#else
Expand All @@ -86,6 +80,12 @@ __device__ SIZE block_idx_x() {

namespace MKN_GPU_NS {

#ifdef _MKN_GPU_WARP_SIZE_
static constexpr int warp_size = _MKN_GPU_WARP_SIZE_;
#else
static constexpr int warp_size = warpSize;
#endif /*_MKN_GPU_WARP_SIZE_ */

void inline setLimitMallocHeapSize(std::size_t const& bytes) {
MKN_GPU_ASSERT(cudaDeviceSetLimit(cudaLimitMallocHeapSize, bytes));
}
Expand Down Expand Up @@ -310,6 +310,7 @@ __global__ static void global_d_kernel(F f, Args... args) {
}

#include "launchers.hpp"
#include "devfunc.hpp"

template <typename T, typename V>
__global__ void _vector_fill(T* a, V t, std::size_t s) {
Expand Down
44 changes: 44 additions & 0 deletions inc/mkn/gpu/devfunc.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// IWYU pragma: private, include "mkn/gpu.hpp"
/**
Copyright (c) 2025, Philip Deegan.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Philip Deegan nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _MKN_GPU_DEVFUNC_HPP_
#define _MKN_GPU_DEVFUNC_HPP_

template <bool sync = true, typename T>
void __device__ zero(T* const t, std::size_t const size) {
std::size_t chunk = 0;
auto const each = size / warpSize;
for (; chunk < each; ++chunk) t[chunk * warpSize + threadIdx.x] = 0;
if (threadIdx.x < size - (warpSize * each)) t[chunk * warpSize + threadIdx.x] = 0;
if constexpr (sync) __syncthreads();
}

#endif /* _MKN_GPU_DEVFUNC_HPP_ */
8 changes: 7 additions & 1 deletion inc/mkn/gpu/launchers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,13 @@ struct GDLauncher : public GLauncher {

template <bool _sync = true>
struct DLauncher : public Launcher {
DLauncher(size_t const /*dev*/ = 0) : Launcher{{}, {}} {}
DLauncher() : Launcher{dim3{1}, dim3{warp_size}} {}
DLauncher(size_t const /*dev*/) : Launcher{{}, {}} {}

template <typename... Args>
DLauncher(Args&&... args)
requires(sizeof...(Args) > 0)
: Launcher{args...} {}

template <typename F, typename... Args>
auto operator()(F&& f, Args&&... args) {
Expand Down
23 changes: 17 additions & 6 deletions inc/mkn/gpu/multi_launch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef _MKN_GPU_MULTI_LAUNCH_HPP_
#define _MKN_GPU_MULTI_LAUNCH_HPP_

#include "mkn/gpu.hpp"
#include "mkn/kul/log.hpp"
#include "mkn/kul/time.hpp"
#include "mkn/kul/except.hpp"

#include <mutex>
#include <chrono>
#include <thread>
Expand All @@ -41,10 +46,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <fstream>
#include <stdexcept>

#include "mkn/gpu.hpp"
#include "mkn/kul/log.hpp"
#include "mkn/kul/time.hpp"

namespace mkn::gpu::detail {
template <typename Type>
auto& deref(Type&& type) {
Expand All @@ -57,6 +58,13 @@ auto& deref(Type&& type) {

namespace mkn::gpu {

class StreamLauncherException : public kul::Exception {
public:
StreamLauncherException(char const* f, std::uint16_t const& l, std::string const& s)
: Exception{f, l, s} {}
StreamLauncherException(StreamLauncherException const& e) : Exception{e} {}
};

enum class StreamFunctionMode { HOST_WAIT = 0, DEVICE_WAIT, BARRIER };
enum class StreamFunctionStatus { HOST_BUSY = 0, DEVICE_BUSY };

Expand Down Expand Up @@ -390,10 +398,10 @@ struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLaunc
constexpr static std::size_t wait_add_ms = _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_ADD_;
constexpr static std::size_t wait_max_ms = _MKN_GPU_THREADED_STREAM_LAUNCHER_WAIT_MS_MAX_;

ThreadedStreamLauncher(Datas& datas, std::size_t const _n_threads = 1,
ThreadedStreamLauncher(Datas& datas, std::size_t const _n_threads = 0,
std::size_t const device = 0)
: Super{datas}, n_threads{_n_threads}, device_id{device} {
thread_status.resize(n_threads, SFP::NEXT);
thread_status.resize(_n_threads + 1, SFP::NEXT);
status.resize(datas.size(), SFS::FIRST);
}

Expand Down Expand Up @@ -500,6 +508,9 @@ struct ThreadedStreamLauncher : public StreamLauncher<Datas, ThreadedStreamLaunc
}

This& join(bool const work = true, bool const clear = false) {
if (n_threads == 0 and !work) {
KEXCEPT(StreamLauncherException, "no available threads, join must work");
}
if (!started) start();
if (joined) return *this;
joined = true;
Expand Down
37 changes: 26 additions & 11 deletions inc/mkn/gpu/rocm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "mkn/kul/tuple.hpp"
#include "mkn/kul/assert.hpp"

#include "mkn/gpu/def.hpp"
#include "mkn/gpu/cli.hpp"
#include "mkn/gpu/def.hpp"

#include "hip/hip_runtime.h"

Expand Down Expand Up @@ -68,8 +68,6 @@ __device__ SIZE idx() {

} // namespace mkn::gpu::hip

//

#if defined(MKN_GPU_FN_PER_NS) && MKN_GPU_FN_PER_NS
#define MKN_GPU_NS mkn::gpu::hip
#else
Expand All @@ -78,6 +76,12 @@ __device__ SIZE idx() {

namespace MKN_GPU_NS {

#ifdef _MKN_GPU_WARP_SIZE_
static constexpr int warp_size = _MKN_GPU_WARP_SIZE_;
#else
static constexpr int warp_size = warpSize;
#endif /*_MKN_GPU_WARP_SIZE_ */

void inline setLimitMallocHeapSize(std::size_t const& bytes) {
MKN_GPU_ASSERT(hipDeviceSetLimit(hipLimitMallocHeapSize, bytes));
}
Expand Down Expand Up @@ -310,6 +314,7 @@ __global__ static void global_d_kernel(F f, Args... args) {
}

#include "launchers.hpp"
#include "devfunc.hpp"

template <typename T, typename V>
__global__ void _vector_fill(T* a, V t, std::size_t s) {
Expand All @@ -330,14 +335,24 @@ void fill(Container& c, T val) {
void inline prinfo(size_t dev = 0) {
hipDeviceProp_t devProp;
MKN_GPU_ASSERT(hipGetDeviceProperties(&devProp, dev));
KOUT(NON) << " System version " << devProp.major << "." << devProp.minor;
KOUT(NON) << " agent name " << devProp.name;
KOUT(NON) << " cores " << devProp.multiProcessorCount;
KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor;
KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB";
KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB";
KOUT(NON) << " warpSize " << devProp.warpSize;
KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock;
KOUT(NON) << " System version " << devProp.major << "." << devProp.minor;
KOUT(NON) << " agent name " << devProp.name;
KOUT(NON) << " cores " << devProp.multiProcessorCount;
KOUT(NON) << " threadsPCore " << devProp.maxThreadsPerMultiProcessor;
KOUT(NON) << " TotalMem " << (devProp.totalGlobalMem / 1000000) << " MB";
KOUT(NON) << " BlockMem " << (devProp.sharedMemPerBlock / 1000) << " KB";
KOUT(NON) << " device warpSize " << devProp.warpSize;
KOUT(NON) << " threadsPBlock " << devProp.maxThreadsPerBlock;

#ifdef _MKN_GPU_WARP_SIZE_
KOUT(NON) << " warpSize used " << _MKN_GPU_WARP_SIZE_;
#else
KOUT(NON) << " warpSize used " << warp_size;
if (warp_size != devProp.warpSize) {
KOUT(NON) << " warpSize MISMATCH!!! " << warp_size << " vs " << devProp.warpSize;
KOUT(NON) << " SEE mkn.gpu README for -D_MKN_GPU_WARP_SIZE_=###";
}
#endif
}

void inline print_gpu_mem_used() {
Expand Down
4 changes: 3 additions & 1 deletion mkn.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,20 @@ profile:

- name: rocm
parent: headers
arg: -DMKN_GPU_ROCM=1
test: test/any/(\w).cpp
test/hip/(\w).cpp

- name: cuda
parent: headers
arg: -DMKN_GPU_CUDA=1
test: test/any/(\w).cpp
test/cuda/(\w).cpp

# if you have no GPU but want to test your code
- name: cpu
parent: headers
arg: -DMKN_GPU_CPU
arg: -DMKN_GPU_CPU=1
test: test/any/(\w).cpp
test/cpu/(\w).cpp

Expand Down
10 changes: 5 additions & 5 deletions test/any/async_streaming.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,10 +232,10 @@ std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2)

int main() {
KOUT(NON) << __FILE__;
return test() //
+ test_threaded() //
+ test_threaded(6) //
+ test_threaded_group_barrier() //
+ test_threaded_host_group_mutex() //
return test() //
+ test_threaded(0) + test_threaded() //
+ test_threaded(6) //
+ test_threaded_group_barrier() //
+ test_threaded_host_group_mutex() //
+ test_threaded_host_group_idx() + test_threaded_detached_stream_fns();
}
21 changes: 20 additions & 1 deletion test/any/managed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,28 @@ std::uint32_t test_lambda_ref_copy_capture_views() {
return 0;
}

std::uint32_t test_zero() {
auto const size = 1000; // not warp size divisible!
ManagedVector<float> mem0(size, 1);
ManagedVector<float> mem1(size, 2);

auto* view0 = mem0.data();
auto* view1 = mem1.data();

mkn::gpu::DLauncher()([=] __device__() {
mkn::gpu::zero(view0, size);
mkn::gpu::zero(view1, size);
});

for (std::uint32_t i = 0; i < size; ++i)
if (mem0[i] + mem1[i] != 0) return 1;

return 0;
}

int main() {
KOUT(NON) << __FILE__;
return test() + //
return test() + test_zero() + //
test_guess() + //
test_lambda_copy_capture_views() + //
test_lambda_ref_copy_capture_views();
Expand Down