Skip to content

Commit 378a3fc

Browse files
committed
Initial cuda port
1 parent 668f489 commit 378a3fc

File tree

3 files changed

+213
-0
lines changed

3 files changed

+213
-0
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Build lib${name}.so from ${name}.cu with nvcc, forwarding a strict warning
# set to the host compiler (GCC/Clang).

# Fail fast: abort on any error, on unset variables, and on pipeline failures.
set -euo pipefail

name="finegrained_allocator"

# Flags are kept in arrays so each option stays a distinct word; the quoted
# "${array[@]}" expansions below are robust where the original relied on
# unquoted word-splitting of flat strings.
basic_warnings=(-Xcompiler=-Wall -Xcompiler=-Wextra)

strict_warnings=(
    -Xcompiler=-Wshadow
    -Xcompiler=-Wnon-virtual-dtor
    -Xcompiler=-Wold-style-cast
    -Xcompiler=-Wcast-align
    -Xcompiler=-Woverloaded-virtual
    -Xcompiler=-Wconversion
    -Xcompiler=-Wsign-conversion
    -Xcompiler=-Wnull-dereference
    -Xcompiler=-Wdouble-promotion
    -Xcompiler=-Wformat=2
)

# NVCC supports -std=c++17 directly
std_flags=(-std=c++17)

# Output settings: position-independent host code, shared-library output.
output_flags=(-Xcompiler=-fPIC -shared -o "lib${name}.so")

nvcc -arch=sm_90 "${basic_warnings[@]}" "${strict_warnings[@]}" "${std_flags[@]}" "${output_flags[@]}" "${name}.cu"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// SPDX-License-Identifier: MIT
2+
// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
4+
#include <sys/types.h>
5+
6+
#include <cstdlib> // For getenv
7+
#include <iostream>
8+
9+
// #include "hip/hip_runtime.h"
10+
11+
#include "cuda_runtime.h"
12+
13+
// Check the result of a CUDA runtime call; print a diagnostic and abort the
// process on failure.
//
// Fixes vs. the original:
//  - The argument is evaluated exactly once (the original re-evaluated it
//    inside cudaGetErrorString on the error path, re-issuing the API call).
//  - do { } while (0) makes the macro safe under if/else without braces.
//  - Message says "CUDA error" (the port left the old "Hip error" text).
#define hip_try(error)                                                                       \
    do {                                                                                     \
        const cudaError_t hip_try_err_ = (error);                                            \
        if (hip_try_err_ != cudaSuccess) {                                                   \
            std::cerr << "[finegrained_allocator] CUDA error: "                              \
                      << cudaGetErrorString(hip_try_err_) << " at line " << __LINE__         \
                      << std::endl;                                                          \
            std::exit(EXIT_FAILURE);                                                         \
        }                                                                                    \
    } while (0)
19+
20+
// True when the LOG_FINEGRAINED_ALLOCATOR environment variable is set (to any value).
inline bool is_logging_enabled() {
    const char* const flag = std::getenv("LOG_FINEGRAINED_ALLOCATOR");
    return flag != nullptr;
}
21+
22+
// Emit a one-line trace of an allocator event to stdout when logging is
// enabled (see is_logging_enabled).
//   operation: human-readable event name (e.g. "Allocation")
//   ptr/size/device: pointer, byte count, and device ordinal being reported
inline void log_allocation(const char* operation, void* ptr, ssize_t size, int device) {
    if (!is_logging_enabled()) {
        return;
    }
    std::cout << "[finegrained_allocator] " << operation << ": ptr=" << ptr << ", size=" << size
              << " bytes, device=" << device << std::endl;
}
28+
29+
extern "C" {

// Allocate `size` bytes of device memory on `device` and return the pointer.
// The caller's current device is saved and restored around the allocation;
// any CUDA failure terminates the process via hip_try.
// NOTE: stream is accepted for allocator-API compatibility but unused. The
// HIP original passed hipDeviceMallocFinegrained (see the retained comment);
// plain cudaMalloc has no equivalent flag.
void* finegrained_hipMalloc(ssize_t size, int device, cudaStream_t stream [[maybe_unused]]) {
    // Remember the caller's device so it can be restored before returning.
    int previous_device;
    hip_try(cudaGetDevice(&previous_device));
    hip_try(cudaSetDevice(device));

    // const auto flags = hipDeviceMallocFinegrained;
    void* ptr;
    hip_try(cudaMalloc(&ptr, static_cast<size_t>(size)));
    log_allocation("Allocation", ptr, size, device);

    hip_try(cudaSetDevice(previous_device));
    return ptr;
}

// Free a pointer previously returned by finegrained_hipMalloc.
// Switches to `device` for the free, then restores the caller's device.
// NOTE: size and stream exist only for allocator-API compatibility; size is
// still forwarded to the optional log line.
void finegrained_hipFree(void* ptr,
                         ssize_t size [[maybe_unused]],
                         int device,
                         cudaStream_t stream [[maybe_unused]]) {
    int previous_device;
    hip_try(cudaGetDevice(&previous_device));
    hip_try(cudaSetDevice(device));

    log_allocation("Deallocation", ptr, size, device);
    hip_try(cudaFree(ptr));

    hip_try(cudaSetDevice(previous_device));
}

}  // extern "C"

iris/cuda.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
4+
import ctypes
5+
import numpy as np
6+
import sys
7+
8+
# Load the CUDA runtime shared library once at import time; every wrapper in
# this module calls through this handle. Raises OSError if libcudart.so is not
# resolvable via the system loader path.
rt_path = "libcudart.so"
cuda_runtime = ctypes.cdll.LoadLibrary(rt_path)
10+
11+
12+
def cuda_try(err):
    """Raise RuntimeError with the CUDA error string if ``err`` is nonzero.

    ``err`` is the integer status returned by a CUDA runtime call; 0 means
    cudaSuccess and is a no-op.
    """
    if err == 0:
        return
    # Resolve the error code to its human-readable string via the runtime.
    cuda_runtime.cudaGetErrorString.restype = ctypes.c_char_p
    error_string = cuda_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8")
    raise RuntimeError(f"cuda error code {err}: {error_string}")
17+
18+
19+
class cudaIpcMemHandle_t(ctypes.Structure):
    """ctypes mirror of the handle passed to cudaIpcGetMemHandle / cudaIpcOpenMemHandle.

    NOTE(review): the CUDA headers define CUDA_IPC_HANDLE_SIZE as 64 bytes,
    while this struct reserves 128 — presumably to match the 128-byte handle
    buffers the rest of this module traffics in (see open_ipc_handle). The
    first 64 bytes are what the runtime reads; confirm before shrinking.
    """

    # Opaque handle bytes; only the CUDA runtime interprets their contents.
    _fields_ = [("internal", ctypes.c_byte * 128)]
21+
22+
23+
def open_ipc_handle(ipc_handle_data, rank):
    """Map a remote device allocation into this process via CUDA IPC.

    Args:
        ipc_handle_data: 128-element uint8 numpy array containing the raw
            handle bytes produced by get_ipc_handle in the exporting process.
        rank: caller's rank; accepted for API symmetry, not used here.

    Returns:
        int: the device pointer value valid in this process.

    Raises:
        TypeError: if ipc_handle_data is not a numpy array.
        ValueError: if it is not a 128-element uint8 array.
        RuntimeError: if cudaIpcOpenMemHandle fails.
    """
    # Validate before touching the CUDA runtime so bad input fails cleanly.
    if not isinstance(ipc_handle_data, np.ndarray):
        raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 128 elements")
    if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 128:
        raise ValueError("ipc_handle_data must be a 128-element uint8 numpy array")

    cuda_runtime.cudaIpcOpenMemHandle.argtypes = [
        ctypes.POINTER(ctypes.c_void_p),
        cudaIpcMemHandle_t,  # passed by value
        ctypes.c_uint,
    ]

    # Single copy of the handle bytes into the ctypes struct. (The original
    # made three intermediate copies via c_char buffers, from_buffer, and
    # memmove; from_buffer_copy is the direct equivalent.)
    ipc_handle_struct = cudaIpcMemHandle_t.from_buffer_copy(ipc_handle_data.tobytes())

    ptr = ctypes.c_void_p()
    cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1)
    cuda_try(
        cuda_runtime.cudaIpcOpenMemHandle(
            ctypes.byref(ptr),
            ipc_handle_struct,
            cudaIpcMemLazyEnablePeerAccess,
        )
    )

    return ptr.value
54+
55+
56+
def get_ipc_handle(ptr, rank):
    """Export device pointer ``ptr`` as a CUDA IPC handle (``rank`` is unused)."""
    handle = cudaIpcMemHandle_t()
    status = cuda_runtime.cudaIpcGetMemHandle(ctypes.byref(handle), ptr)
    cuda_try(status)
    return handle
60+
61+
62+
def count_devices():
    """Return the number of CUDA devices visible to this process."""
    n = ctypes.c_int()
    status = cuda_runtime.cudaGetDeviceCount(ctypes.byref(n))
    cuda_try(status)
    return n.value
66+
67+
68+
def set_device(gpu_id):
    """Make ``gpu_id`` the current CUDA device for this host thread."""
    status = cuda_runtime.cudaSetDevice(gpu_id)
    cuda_try(status)
70+
71+
72+
def get_device_id():
    """Return the ordinal of the currently active CUDA device."""
    current = ctypes.c_int()
    status = cuda_runtime.cudaGetDevice(ctypes.byref(current))
    cuda_try(status)
    return current.value
76+
77+
78+
def get_cu_count(device_id=None):
    """Return the multiprocessor count of ``device_id`` (default: current device)."""
    if device_id is None:
        device_id = get_device_id()

    # Attribute 16 per the variable name: multiprocessor count.
    cudaDeviceAttributeMultiprocessorCount = 16

    cu_count = ctypes.c_int()
    status = cuda_runtime.cudaDeviceGetAttribute(
        ctypes.byref(cu_count),
        cudaDeviceAttributeMultiprocessorCount,
        device_id,
    )
    cuda_try(status)
    return cu_count.value
88+
89+
90+
# Starting ROCm 6.5
91+
# def get_xcc_count(device_id=None):
92+
# if device_id is None:
93+
# device_id = get_device()
94+
95+
# cudaDeviceAttributeNumberOfXccs = ??
96+
# xcc_count = ctypes.c_int()
97+
98+
# cuda_try(cuda_runtime.cudaDeviceGetAttribute(
99+
# ctypes.byref(xcc_count),
100+
# cudaDeviceAttributeNumberOfXccs,
101+
# device_id
102+
# ))
103+
104+
# return xcc_count
105+
106+
107+
def get_wall_clock_rate(device_id):
    """Return the clock-rate attribute queried below for ``device_id``.

    NOTE(review): despite the function name, the attribute queried is
    ``cudaDevAttrMemoryClockRate`` (36), i.e. the device *memory* clock, not a
    wall/SM clock. The HIP original this was ported from likely used a
    wall-clock attribute — confirm which rate callers actually expect.
    """
    # Attribute id 36 — named cudaDevAttrMemoryClockRate here.
    cudaDevAttrMemoryClockRate = 36
    wall_clock_rate = ctypes.c_int()
    status = cuda_runtime.cudaDeviceGetAttribute(
        ctypes.byref(wall_clock_rate), cudaDevAttrMemoryClockRate, device_id
    )
    cuda_try(status)
    return wall_clock_rate.value
115+
116+
117+
def malloc_fine_grained(size):
    """Allocate ``size`` bytes of device memory.

    Presumably selected fine-grained memory in the HIP original; the CUDA
    port simply delegates to the plain cuda_malloc wrapper.
    """
    ptr = cuda_malloc(size)
    return ptr
119+
120+
121+
def cuda_malloc(size):
    """Allocate ``size`` bytes of device memory; returns a ctypes.c_void_p."""
    device_ptr = ctypes.c_void_p()
    status = cuda_runtime.cudaMalloc(ctypes.byref(device_ptr), size)
    cuda_try(status)
    return device_ptr
125+
126+
127+
def cuda_free(ptr):
    """Release device memory previously allocated with cuda_malloc."""
    status = cuda_runtime.cudaFree(ptr)
    cuda_try(status)

0 commit comments

Comments
 (0)