Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ members = [

"xtask",

"examples/cuda/async_api",
"examples/cuda/async_api/kernels",
"examples/cuda/vecadd",
"examples/cuda/vecadd/kernels",
"examples/cuda/gemm",
Expand Down
11 changes: 11 additions & 0 deletions examples/cuda/async_api/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[package]
name = "async_api"
version = "0.1.0"
edition = "2024"

[dependencies]
cust = { path = "../../../crates/cust" }
nanorand = "0.7"

[build-dependencies]
cuda_builder = { workspace = true, default-features = false }
17 changes: 17 additions & 0 deletions examples/cuda/async_api/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use std::env;
use std::path;

use cuda_builder::CudaBuilder;

fn main() {
println!("cargo::rerun-if-changed=build.rs");
println!("cargo::rerun-if-changed=kernels");

let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());

CudaBuilder::new(manifest_dir.join("kernels"))
.copy_to(out_path.join("kernels.ptx"))
.build()
.unwrap();
}
10 changes: 10 additions & 0 deletions examples/cuda/async_api/kernels/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
[package]
name = "async_api-kernels"
version = "0.1.0"
edition = "2024"

[dependencies]
cuda_std = { path = "../../../../crates/cuda_std" }

[lib]
crate-type = ["cdylib", "rlib"]
17 changes: 17 additions & 0 deletions examples/cuda/async_api/kernels/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
use cuda_std::prelude::*;

#[kernel]
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
pub unsafe fn increment(g_data: *mut u32, inc_value: u32) {
// This can also be obtained directly as
//
// let idx: usize = cuda_std::thread::index() as usize;
let idx: usize = (
cuda_std::thread::block_dim().x
* cuda_std::thread::block_idx().x
+ cuda_std::thread::thread_idx().x
) as usize;

let elem: &mut u32 = unsafe { &mut *g_data.add(idx) };
*elem = *elem + inc_value;
}
129 changes: 129 additions & 0 deletions examples/cuda/async_api/src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@

use cust::memory::{DeviceBuffer, LockedBuffer, AsyncCopyDestination};
use cust::event::{Event, EventFlags};
use cust::prelude::EventStatus;
use cust::stream::{Stream, StreamFlags};
use cust::module::Module;
use cust::context::Context;
use cust::{launch, CudaFlags};
use cust::device::Device;
use cust::function::{GridSize, BlockSize};
use std::time::Instant;

static PTX: &str = include_str!(concat!(env!("OUT_DIR"), "/kernels.ptx"));

fn correct_output(data: &[u32], x: u32) -> bool {
let not_matching_element = data
.iter()
.enumerate()
.find(|&(_, &elem)| elem != x);

match not_matching_element {
Some((index, elem)) => println!("Error! data[{index}] = {elem}, ref = {x}"),
None => println!("All elements of the array match the value!")
}

not_matching_element.is_none()
}

fn main() -> Result<(), cust::error::CudaError> {
cust::init(CudaFlags::empty()).expect("Couldn't initialize CUDA environment!");

let device = Device::get_device(0)
.expect("Couldn't find Cuda supported devices!");

println!("Device Name: {}", device.name().unwrap());

// Set up the context, load the module, and create a stream to run kernels in.
let _ctx = Context::new(device);
let module = Module::from_ptx(PTX, &[]).expect("Module couldn't be init!");
let increment = module.get_function("increment").expect("Kernel function not found!");
let stream = Stream::new(StreamFlags::NON_BLOCKING, None).expect("Stream couldn't be init!");

const N: usize = 16 * 1024 * 1024;
const N_BYTES: usize = N * (i32::BITS as usize);
let value = 26;

let blocks = BlockSize::xy(512, 1);
let grids = GridSize::xy((N / (blocks.x as usize)).try_into().unwrap(), 1);

let start_event = Event::new(EventFlags::DEFAULT)?;
let stop_event = Event::new(EventFlags::DEFAULT)?;

// Create buffers for data on host-side
// Ideally should be page-locked for efficiency
let mut host_a = LockedBuffer::new(&0u32, N).expect("host array couldn't be initialized!");
let mut device_a = DeviceBuffer::from_slice(&[u32::MAX; N]).expect("device array couldn't be initialized!");

start_event.record(&stream).expect("Failed to record start_event in the CUDA stream!");
let start = Instant::now();

// SAFETY: until the stop_event is triggered:
// 1. `host_a` is not being modified
// 2. Both `device_a` and `host_a` are not deallocated
// 3. Until `stop_query` yields `EventStatus::Ready`, `device_a` is not involved in any other operation
// other than those of the operations in the stream.
unsafe {
device_a.async_copy_from(&host_a, &stream).expect("Could not copy from host to device!");
}

// SAFETY: number of threads * number of blocks = total number of elements.
// Hence there will not be any out-of-bounds issues.
unsafe {
let result = launch!(increment<<<grids, blocks, 0, stream>>>(
device_a.as_device_ptr(),
value
));
result.expect("Result of `increment` kernel did not process!");
}

// SAFETY: until the stop_event is triggered:
// 1. `device_a` is not being modified
// 2. Both `device_a` and `host_a` are not deallocated
// 3. At this point, until `stop_query` yields `EventStatus::Ready`,
// `host_a` is not involved in any other operation.
unsafe {
device_a.async_copy_to(&mut host_a, &stream).expect("Could not copy from device to host!");
}

stop_event.record(&stream).expect("Failed to record stop_event in the CUDA stream!");
let cpu_time: u128 = start.elapsed().as_micros();

let mut counter: u64 = 0;
while stop_event.query() != Ok(EventStatus::Ready) { counter += 1 }

let gpu_time: u128 = stop_event
.elapsed(&start_event)
.expect("Failed to calculate duration of GPU operations!")
.as_micros();

println!("Time spent executing by the GPU: {gpu_time} microseconds");
println!("Time spent by CPU in CUDA calls: {cpu_time} microseconds");
println!("CPU executed {counter} iterations while waiting for GPU to finish.");

assert!(correct_output(host_a.as_slice(), value));

// Stream is synchronized as a safety measure
stream.synchronize().expect("Stream couldn't synchronize!");

// Events and buffers can be safely dropped now
match Event::drop(start_event) {
Ok(()) => println!("Successfully destroyed start_event"),
Err((cuda_error, _event)) => {
println!("Failed to destroy start_event: {:?}", cuda_error);
},
}

match Event::drop(stop_event) {
Ok(()) => println!("Successfully destroyed stop_event"),
Err((cuda_error, _event)) => {
println!("Failed to destroy stop_event: {:?}", cuda_error);
},
}

DeviceBuffer::drop(device_a).expect("Couldn't drop device array!");
LockedBuffer::drop(host_a).expect("Couldn't drop host array!");

println!("test PASSED");
Ok(())
}
Loading