Skip to content

Commit 378a3fc

Browse files
committed
Initial cuda port
1 parent 668f489 commit 378a3fc

File tree

3 files changed

+213
-0
lines changed

3 files changed

+213
-0
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash
# SPDX-License-Identifier: MIT
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Build lib${name}.so from ${name}.cu with nvcc, forwarding a strict warning
# set to the host compiler (GCC/Clang).

# Fail fast: abort on any error, on unset variables, and on pipeline failures.
set -euo pipefail

name="finegrained_allocator"

# Flags are kept in arrays so each option stays a distinct word; the quoted
# "${array[@]}" expansions below are robust where the original relied on
# unquoted word-splitting of flat strings.
basic_warnings=(-Xcompiler=-Wall -Xcompiler=-Wextra)

strict_warnings=(
    -Xcompiler=-Wshadow
    -Xcompiler=-Wnon-virtual-dtor
    -Xcompiler=-Wold-style-cast
    -Xcompiler=-Wcast-align
    -Xcompiler=-Woverloaded-virtual
    -Xcompiler=-Wconversion
    -Xcompiler=-Wsign-conversion
    -Xcompiler=-Wnull-dereference
    -Xcompiler=-Wdouble-promotion
    -Xcompiler=-Wformat=2
)

# NVCC supports -std=c++17 directly
std_flags=(-std=c++17)

# Output settings: position-independent host code, shared-library output.
output_flags=(-Xcompiler=-fPIC -shared -o "lib${name}.so")

nvcc -arch=sm_90 "${basic_warnings[@]}" "${strict_warnings[@]}" "${std_flags[@]}" "${output_flags[@]}" "${name}.cu"
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// SPDX-License-Identifier: MIT
2+
// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
4+
#include <sys/types.h>
5+
6+
#include <cstdlib> // For getenv
7+
#include <iostream>
8+
9+
// #include "hip/hip_runtime.h"
10+
11+
#include "cuda_runtime.h"
12+
13+
// Check the result of a CUDA runtime call; print a diagnostic and abort the
// process on failure.
//
// Fixes vs. the original:
//  - The argument is evaluated exactly once (the original re-evaluated it
//    inside cudaGetErrorString on the error path, re-issuing the API call).
//  - do { } while (0) makes the macro safe under if/else without braces.
//  - Message says "CUDA error" (the port left the old "Hip error" text).
#define hip_try(error)                                                                       \
    do {                                                                                     \
        const cudaError_t hip_try_err_ = (error);                                            \
        if (hip_try_err_ != cudaSuccess) {                                                   \
            std::cerr << "[finegrained_allocator] CUDA error: "                              \
                      << cudaGetErrorString(hip_try_err_) << " at line " << __LINE__         \
                      << std::endl;                                                          \
            std::exit(EXIT_FAILURE);                                                         \
        }                                                                                    \
    } while (0)
19+
20+
// True when the LOG_FINEGRAINED_ALLOCATOR environment variable is set (to any value).
inline bool is_logging_enabled() {
    const char* const flag = std::getenv("LOG_FINEGRAINED_ALLOCATOR");
    return flag != nullptr;
}
21+
22+
// Emit a one-line trace of an allocator event to stdout when logging is
// enabled (see is_logging_enabled).
//   operation: human-readable event name (e.g. "Allocation")
//   ptr/size/device: pointer, byte count, and device ordinal being reported
inline void log_allocation(const char* operation, void* ptr, ssize_t size, int device) {
    if (!is_logging_enabled()) {
        return;
    }
    std::cout << "[finegrained_allocator] " << operation << ": ptr=" << ptr << ", size=" << size
              << " bytes, device=" << device << std::endl;
}
28+
29+
extern "C" {

// Allocate `size` bytes of device memory on `device` and return the pointer.
// The caller's current device is saved and restored around the allocation;
// any CUDA failure terminates the process via hip_try.
// NOTE: stream is accepted for allocator-API compatibility but unused. The
// HIP original passed hipDeviceMallocFinegrained (see the retained comment);
// plain cudaMalloc has no equivalent flag.
void* finegrained_hipMalloc(ssize_t size, int device, cudaStream_t stream [[maybe_unused]]) {
    // Remember the caller's device so it can be restored before returning.
    int previous_device;
    hip_try(cudaGetDevice(&previous_device));
    hip_try(cudaSetDevice(device));

    // const auto flags = hipDeviceMallocFinegrained;
    void* ptr;
    hip_try(cudaMalloc(&ptr, static_cast<size_t>(size)));
    log_allocation("Allocation", ptr, size, device);

    hip_try(cudaSetDevice(previous_device));
    return ptr;
}

// Free a pointer previously returned by finegrained_hipMalloc.
// Switches to `device` for the free, then restores the caller's device.
// NOTE: size and stream exist only for allocator-API compatibility; size is
// still forwarded to the optional log line.
void finegrained_hipFree(void* ptr,
                         ssize_t size [[maybe_unused]],
                         int device,
                         cudaStream_t stream [[maybe_unused]]) {
    int previous_device;
    hip_try(cudaGetDevice(&previous_device));
    hip_try(cudaSetDevice(device));

    log_allocation("Deallocation", ptr, size, device);
    hip_try(cudaFree(ptr));

    hip_try(cudaSetDevice(previous_device));
}

}  // extern "C"

iris/cuda.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# SPDX-License-Identifier: MIT
2+
# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
3+
4+
import ctypes
5+
import numpy as np
6+
import sys
7+
8+
# Load the CUDA runtime shared library once at import time; every wrapper in
# this module calls through this handle. Raises OSError if libcudart.so is not
# resolvable via the system loader path.
rt_path = "libcudart.so"
cuda_runtime = ctypes.cdll.LoadLibrary(rt_path)
10+
11+
12+
def cuda_try(err):
    """Raise RuntimeError with the CUDA error string if ``err`` is nonzero.

    ``err`` is the integer status returned by a CUDA runtime call; 0 means
    cudaSuccess and is a no-op.
    """
    if err == 0:
        return
    # Resolve the error code to its human-readable string via the runtime.
    cuda_runtime.cudaGetErrorString.restype = ctypes.c_char_p
    error_string = cuda_runtime.cudaGetErrorString(ctypes.c_int(err)).decode("utf-8")
    raise RuntimeError(f"cuda error code {err}: {error_string}")
17+
18+
19+
class cudaIpcMemHandle_t(ctypes.Structure):
    """ctypes mirror of the handle passed to cudaIpcGetMemHandle / cudaIpcOpenMemHandle.

    NOTE(review): the CUDA headers define CUDA_IPC_HANDLE_SIZE as 64 bytes,
    while this struct reserves 128 — presumably to match the 128-byte handle
    buffers the rest of this module traffics in (see open_ipc_handle). The
    first 64 bytes are what the runtime reads; confirm before shrinking.
    """

    # Opaque handle bytes; only the CUDA runtime interprets their contents.
    _fields_ = [("internal", ctypes.c_byte * 128)]
21+
22+
23+
def open_ipc_handle(ipc_handle_data, rank):
    """Map a remote device allocation into this process via CUDA IPC.

    Args:
        ipc_handle_data: 128-element uint8 numpy array containing the raw
            handle bytes produced by get_ipc_handle in the exporting process.
        rank: caller's rank; accepted for API symmetry, not used here.

    Returns:
        int: the device pointer value valid in this process.

    Raises:
        TypeError: if ipc_handle_data is not a numpy array.
        ValueError: if it is not a 128-element uint8 array.
        RuntimeError: if cudaIpcOpenMemHandle fails.
    """
    # Validate before touching the CUDA runtime so bad input fails cleanly.
    if not isinstance(ipc_handle_data, np.ndarray):
        raise TypeError("ipc_handle_data must be a numpy.ndarray of dtype uint8 with 128 elements")
    if ipc_handle_data.dtype != np.uint8 or ipc_handle_data.size != 128:
        raise ValueError("ipc_handle_data must be a 128-element uint8 numpy array")

    cuda_runtime.cudaIpcOpenMemHandle.argtypes = [
        ctypes.POINTER(ctypes.c_void_p),
        cudaIpcMemHandle_t,  # passed by value
        ctypes.c_uint,
    ]

    # Single copy of the handle bytes into the ctypes struct. (The original
    # made three intermediate copies via c_char buffers, from_buffer, and
    # memmove; from_buffer_copy is the direct equivalent.)
    ipc_handle_struct = cudaIpcMemHandle_t.from_buffer_copy(ipc_handle_data.tobytes())

    ptr = ctypes.c_void_p()
    cudaIpcMemLazyEnablePeerAccess = ctypes.c_uint(1)
    cuda_try(
        cuda_runtime.cudaIpcOpenMemHandle(
            ctypes.byref(ptr),
            ipc_handle_struct,
            cudaIpcMemLazyEnablePeerAccess,
        )
    )

    return ptr.value
54+
55+
56+
def get_ipc_handle(ptr, rank):
    """Export device pointer ``ptr`` as a CUDA IPC handle (``rank`` is unused)."""
    handle = cudaIpcMemHandle_t()
    status = cuda_runtime.cudaIpcGetMemHandle(ctypes.byref(handle), ptr)
    cuda_try(status)
    return handle
60+
61+
62+
def count_devices():
    """Return the number of CUDA devices visible to this process."""
    n = ctypes.c_int()
    status = cuda_runtime.cudaGetDeviceCount(ctypes.byref(n))
    cuda_try(status)
    return n.value
66+
67+
68+
def set_device(gpu_id):
    """Make ``gpu_id`` the current CUDA device for this host thread."""
    status = cuda_runtime.cudaSetDevice(gpu_id)
    cuda_try(status)
70+
71+
72+
def get_device_id():
    """Return the ordinal of the currently active CUDA device."""
    current = ctypes.c_int()
    status = cuda_runtime.cudaGetDevice(ctypes.byref(current))
    cuda_try(status)
    return current.value
76+
77+
78+
def get_cu_count(device_id=None):
    """Return the multiprocessor count of ``device_id`` (default: current device)."""
    if device_id is None:
        device_id = get_device_id()

    # Attribute 16 per the variable name: multiprocessor count.
    cudaDeviceAttributeMultiprocessorCount = 16

    cu_count = ctypes.c_int()
    status = cuda_runtime.cudaDeviceGetAttribute(
        ctypes.byref(cu_count),
        cudaDeviceAttributeMultiprocessorCount,
        device_id,
    )
    cuda_try(status)
    return cu_count.value
88+
89+
90+
# Starting ROCm 6.5
91+
# def get_xcc_count(device_id=None):
92+
# if device_id is None:
93+
# device_id = get_device()
94+
95+
# cudaDeviceAttributeNumberOfXccs = ??
96+
# xcc_count = ctypes.c_int()
97+
98+
# cuda_try(cuda_runtime.cudaDeviceGetAttribute(
99+
# ctypes.byref(xcc_count),
100+
# cudaDeviceAttributeNumberOfXccs,
101+
# device_id
102+
# ))
103+
104+
# return xcc_count
105+
106+
107+
def get_wall_clock_rate(device_id):
    """Return the clock-rate attribute queried below for ``device_id``.

    NOTE(review): despite the function name, the attribute queried is
    ``cudaDevAttrMemoryClockRate`` (36), i.e. the device *memory* clock, not a
    wall/SM clock. The HIP original this was ported from likely used a
    wall-clock attribute — confirm which rate callers actually expect.
    """
    # Attribute id 36 — named cudaDevAttrMemoryClockRate here.
    cudaDevAttrMemoryClockRate = 36
    wall_clock_rate = ctypes.c_int()
    status = cuda_runtime.cudaDeviceGetAttribute(
        ctypes.byref(wall_clock_rate), cudaDevAttrMemoryClockRate, device_id
    )
    cuda_try(status)
    return wall_clock_rate.value
115+
116+
117+
def malloc_fine_grained(size):
    """Allocate ``size`` bytes of device memory.

    Presumably selected fine-grained memory in the HIP original; the CUDA
    port simply delegates to the plain cuda_malloc wrapper.
    """
    ptr = cuda_malloc(size)
    return ptr
119+
120+
121+
def cuda_malloc(size):
    """Allocate ``size`` bytes of device memory; returns a ctypes.c_void_p."""
    device_ptr = ctypes.c_void_p()
    status = cuda_runtime.cudaMalloc(ctypes.byref(device_ptr), size)
    cuda_try(status)
    return device_ptr
125+
126+
127+
def cuda_free(ptr):
    """Release device memory previously allocated with cuda_malloc."""
    status = cuda_runtime.cudaFree(ptr)
    cuda_try(status)

0 commit comments

Comments
 (0)