7 changes: 6 additions & 1 deletion .clang-format
@@ -1,3 +1,8 @@
+IndentWidth: 2
BasedOnStyle: 'google'
ColumnLimit: 100
-SortIncludes: false
+SortIncludes: false
+QualifierAlignment: Right
+UseTab: Never
+DerivePointerAlignment: false
+PointerAlignment: Left
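
A note on the new options (illustrative snippet, not from this repo): QualifierAlignment: Right rewrites "west const" declarations as "east const", and DerivePointerAlignment: false with PointerAlignment: Left pins the asterisk to the type. Both spellings name the same type, so the const-shuffles in the hunks below are behavior-neutral:

// illustrative only -- hypothetical declarations, not from this PR
const char* s0;  // west const: the style accepted before this change
char const* s1;  // east const: what QualifierAlignment: Right now produces
float *p0;       // '*' bound to the name
float* p1;       // what PointerAlignment: Left now produces
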
2 changes: 1 addition & 1 deletion inc/mkn/gpu/cuda.hpp
@@ -50,7 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
-inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true) {
+inline void gpuAssert(cudaError_t code, char const* file, int line, bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) std::abort();
4 changes: 2 additions & 2 deletions inc/mkn/gpu/launchers.hpp
@@ -48,7 +48,7 @@ auto as_values(Args&... args) {

template <bool _sync = true>
struct GDLauncher : public GLauncher {
-  GDLauncher(std::size_t s, size_t dev = 0) : GLauncher{s, dev} {}
+  GDLauncher(std::size_t const s, size_t const dev = 0) : GLauncher{s, dev} {}

template <typename F, typename... Args>
auto operator()(F&& f, Args&&... args) {
@@ -69,7 +69,7 @@

template <bool _sync = true>
struct DLauncher : public Launcher {
-  DLauncher(size_t /*dev*/ = 0) : Launcher{{}, {}} {}
+  DLauncher(size_t const /*dev*/ = 0) : Launcher{{}, {}} {}

template <typename F, typename... Args>
auto operator()(F&& f, Args&&... args) {
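Side note on the constructor changes above (general C++, not specific to this PR): top-level const on a by-value parameter does not participate in the function's signature, so these edits are style-only with no overload or ABI impact. A hypothetical launch() for illustration:

// the two lines below declare the same function
void launch(std::size_t n);
void launch(std::size_t const n);  // top-level const: only protects the definition's local copy
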
2 changes: 1 addition & 1 deletion inc/mkn/gpu/rocm.hpp
@@ -46,7 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
{ \
gpuAssert((ans), __FILE__, __LINE__); \
}
-inline void gpuAssert(hipError_t code, const char* file, int line, bool abort = true) {
+inline void gpuAssert(hipError_t code, char const* file, int line, bool abort = true) {
if (code != hipSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", hipGetErrorString(code), file, line);
if (abort) std::abort();
2 changes: 1 addition & 1 deletion inc/mkn/gpu/tuple.hpp
@@ -121,7 +121,7 @@ struct SpanSet : ASpanSet<T, SIZE, GPU> {
curr_pos += sv->sizes[curr_ptr++];
return *this;
}
-  bool operator!=(const iterator& /*other*/) const __device__ {
+  bool operator!=(iterator const& /*other*/) const __device__ {
return curr_ptr != sv->sizes.size();
}
Span<T, SIZE> operator*() const {
2 changes: 1 addition & 1 deletion test/any/array.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::idx();
a[i] = b[i] + c[i];
}
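For orientation, a sketch of how such a kernel might be driven by the launchers touched earlier in this PR, assuming GDLauncher's operator()(F&&, Args&&...) from inc/mkn/gpu/launchers.hpp and device-visible buffers a, b, c (none of this appears in the hunk above):

// hypothetical usage sketch, not from this diff
mkn::gpu::GDLauncher<>{NUM}(vectoradd<float>, a, b, c);  // one thread per element, NUM total
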
41 changes: 39 additions & 2 deletions test/any/async_streaming.cpp
@@ -2,6 +2,7 @@
#include <thread>
#include <algorithm>

#include "mkn/gpu.hpp"
#include "mkn/kul/dbg.hpp"
#include "mkn/kul/time.hpp"
#include "mkn/gpu/multi_launch.hpp"
@@ -179,7 +180,7 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
.dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; })();

std::size_t val = 5;
-  for (std::size_t i = 0; i < vecs.size(); i++) {
+  for (std::size_t i = 0; i < vecs.size(); ++i) {
if (i % group_size == 0) {
for (auto const& e : vecs[i])
if (e != val + 1) return 1;
@@ -193,12 +194,48 @@ std::uint32_t test_threaded_host_group_idx(std::size_t const& nthreads = 2) {
return 0;
}

+std::uint32_t test_threaded_detached_stream_fns(std::size_t const& nthreads = 2) {
+  using T = double;
+  KUL_DBG_FUNC_ENTER;
+
+  std::vector<ManagedVector<T>> vecs(C, ManagedVector<T>(NUM, 0));
+  for (std::size_t i = 0; i < vecs.size(); ++i) std::fill_n(vecs[i].data(), NUM, i);
+
+  ManagedVector<T*> datas(C);
+  for (std::size_t i = 0; i < vecs.size(); ++i) datas[i] = vecs[i].data();
+  auto views = datas.data();
+
+  ThreadedStreamLauncher launcher{vecs, nthreads};
+  launcher
+      .host([&](auto i) mutable {
+        launcher.streams[i].sync();  // wait for first kernel per stream
+        for (auto& e : vecs[i]) e += 1;
+      })
+      .dev([=] __device__(auto const& i) { views[i][mkn::gpu::idx()] += 3; });
+
+  for (std::size_t i = 0; i < datas.size(); ++i) {
+    mkn::gpu::GDLauncher<false>{NUM}.stream(
+        launcher.streams[i], [=, idx = i] __device__() { views[idx][mkn::gpu::idx()] += 1; });
+  }
+
+  launcher();
+
+  std::size_t val = 5;
+  for (std::size_t i = 0; i < vecs.size(); ++i) {
+    for (std::size_t j = 0; j < vecs[i].size(); ++j)
+      if (val != vecs[i][j]) return 1;
+    ++val;
+  };
+
+  return 0;
+}
+
int main() {
KOUT(NON) << __FILE__;
return test() //
+ test_threaded() //
+ test_threaded(6) //
+ test_threaded_group_barrier() //
+ test_threaded_host_group_mutex() //
-         + test_threaded_host_group_idx();
+         + test_threaded_host_group_idx() + test_threaded_detached_stream_fns();
}
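
Reading the new test_threaded_detached_stream_fns: each vector is filled with its own index, one detached kernel per stream is queued via GDLauncher<false>::stream before the launcher runs, the host step syncs its stream and then increments, and the device step adds 3. The val check walks exactly that arithmetic:

// expected final value of every element of vecs[i]:
//   i (initial fill) + 1 (detached stream kernel) + 1 (host step) + 3 (dev step) == i + 5
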
2 changes: 1 addition & 1 deletion test/cpu/namespace.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::cpu::idx();
a[i] = b[i] + c[i];
}
2 changes: 1 addition & 1 deletion test/cuda/add.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::cuda::idx();
a[i] = b[i] + c[i];
}
20 changes: 10 additions & 10 deletions test/cuda/async.cpp
@@ -43,15 +43,15 @@ inline cudaError_t checkCuda(cudaError_t result) {
return result;
}

-__global__ void kernel(float *a, int offset) {
+__global__ void kernel(float* a, int offset) {
int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s * s + c * c);
}

-float maxError(float *a, int n) {
+float maxError(float* a, int n) {
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i] - 1.0f);
@@ -60,12 +60,12 @@ float maxError(float *a, int n) {
return maxE;
}

-int main(int argc, char **argv) {
-  const int blockSize = 256, nStreams = 4;
-  const int n = 4 * 1024 * blockSize * nStreams;
-  const int streamSize = n / nStreams;
-  const int streamBytes = streamSize * sizeof(float);
-  const int bytes = n * sizeof(float);
+int main(int argc, char** argv) {
+  int const blockSize = 256, nStreams = 4;
+  int const n = 4 * 1024 * blockSize * nStreams;
+  int const streamSize = n / nStreams;
+  int const streamBytes = streamSize * sizeof(float);
+  int const bytes = n * sizeof(float);

int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
@@ -77,8 +77,8 @@ int main(int argc, char **argv) {

// allocate pinned host memory and device memory
float *a, *d_a;
-  checkCuda(cudaMallocHost((void **)&a, bytes));  // host pinned
-  checkCuda(cudaMalloc((void **)&d_a, bytes));    // device
+  checkCuda(cudaMallocHost((void**)&a, bytes));  // host pinned
+  checkCuda(cudaMalloc((void**)&d_a, bytes));    // device

float ms; // elapsed time in milliseconds

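The collapsed middle of this file is the classic copy/compute overlap benchmark; a minimal sketch of the per-stream loop it presumably contains, assuming cudaStream_t stream[nStreams] handles created in the elided region (the other names match declarations visible above):

// assumed shape of the chunked overlap loop -- not shown in this diff
for (int i = 0; i < nStreams; ++i) {
  int const offset = i * streamSize;
  checkCuda(cudaMemcpyAsync(&d_a[offset], &a[offset], streamBytes, cudaMemcpyHostToDevice, stream[i]));
  kernel<<<streamSize / blockSize, blockSize, 0, stream[i]>>>(d_a, offset);
  checkCuda(cudaMemcpyAsync(&a[offset], &d_a[offset], streamBytes, cudaMemcpyDeviceToHost, stream[i]));
}
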
2 changes: 1 addition & 1 deletion test/cuda/atomic.cpp
@@ -16,7 +16,7 @@ struct S {
} // namespace NS0

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::cuda::idx();

NS0::S<T>{a[i]} += b[i] + c[i];
2 changes: 1 addition & 1 deletion test/hip/add.cpp
@@ -6,7 +6,7 @@ static constexpr uint32_t NUM = WIDTH * HEIGHT;
static constexpr uint32_t THREADS_PER_BLOCK_X = 16, THREADS_PER_BLOCK_Y = 16;

template <typename T>
-__global__ void vectoradd(T* a, const T* b, const T* c) {
+__global__ void vectoradd(T* a, T const* b, T const* c) {
auto i = mkn::gpu::hip::idx();
a[i] = b[i] + c[i];
}
20 changes: 10 additions & 10 deletions test/hip/async.cpp
@@ -43,15 +43,15 @@ inline void checkHip([[maybe_unused]] hipError_t result) {
// return result;
}

-__global__ void kernel(float *a, int offset) {
+__global__ void kernel(float* a, int offset) {
int i = offset + threadIdx.x + blockIdx.x * blockDim.x;
float x = (float)i;
float s = sinf(x);
float c = cosf(x);
a[i] = a[i] + sqrtf(s * s + c * c);
}

-float maxError(float *a, int n) {
+float maxError(float* a, int n) {
float maxE = 0;
for (int i = 0; i < n; i++) {
float error = fabs(a[i] - 1.0f);
Expand All @@ -60,12 +60,12 @@ float maxError(float *a, int n) {
return maxE;
}

-int main(int argc, char **argv) {
-  const int blockSize = 256, nStreams = 4;
-  const int n = 4 * 1024 * blockSize * nStreams;
-  const int streamSize = n / nStreams;
-  const int streamBytes = streamSize * sizeof(float);
-  const int bytes = n * sizeof(float);
+int main(int argc, char** argv) {
+  int const blockSize = 256, nStreams = 4;
+  int const n = 4 * 1024 * blockSize * nStreams;
+  int const streamSize = n / nStreams;
+  int const streamBytes = streamSize * sizeof(float);
+  int const bytes = n * sizeof(float);

int devId = 0;
if (argc > 1) devId = atoi(argv[1]);
@@ -77,8 +77,8 @@ int main(int argc, char **argv) {

// allocate pinned host memory and device memory
float *a, *d_a;
-  checkHip(hipHostMalloc((void **)&a, bytes));  // host pinned
-  checkHip(hipMalloc((void **)&d_a, bytes));    // device
+  checkHip(hipHostMalloc((void**)&a, bytes));  // host pinned
+  checkHip(hipMalloc((void**)&d_a, bytes));    // device

float ms; // elapsed time in milliseconds
