Skip to content

Commit 416411d

Browse files
committed
Emulate i128 ops
LLVM 7.1 (and thus nvvm) lacks native 128-bit integer intrinsics, so emulate them using 64-bit operations. I also added an example with `sha2` to confirm it works and show folks that crates.io crates can often be used directly unmodified. Fixes #207.
1 parent 9fa4487 commit 416411d

File tree

9 files changed

+893
-15
lines changed

9 files changed

+893
-15
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ members = [
1414
"examples/cuda/gemm/kernels",
1515
"examples/cuda/path_tracer",
1616
"examples/cuda/path_tracer/kernels",
17+
"examples/cuda/sha2_crates_io",
18+
"examples/cuda/sha2_crates_io/kernels",
1719

1820
"examples/optix/*",
1921
"tests/compiletests",

crates/rustc_codegen_nvvm/src/builder.rs

Lines changed: 518 additions & 7 deletions
Large diffs are not rendered by default.

crates/rustc_codegen_nvvm/src/intrinsic.rs

Lines changed: 71 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,67 @@ use rustc_target::callconv::PassMode;
1717
use tracing::trace;
1818

1919
use crate::abi::LlvmType;
20-
use crate::builder::Builder;
20+
use crate::builder::{Builder, CountZerosKind};
2121
use crate::context::CodegenCx;
2222
use crate::llvm::{self, Type, Value};
2323
use crate::ty::LayoutLlvmExt;
2424

25-
// libnvvm does not support some advanced intrinsics for i128 so we just abort on them for now. In the future
26-
// we should emulate them in software.
27-
fn handle_128_bit_intrinsic<'ll>(b: &mut Builder<'_, 'll, '_>) -> &'ll Value {
28-
b.abort_and_ret_i128()
25+
fn handle_128_bit_intrinsic<'ll>(
26+
b: &mut Builder<'_, 'll, '_>,
27+
name: Symbol,
28+
args: &[OperandRef<'_, &'ll Value>],
29+
) -> &'ll Value {
30+
match name {
31+
sym::ctlz | sym::cttz => {
32+
// TODO(@LegNeato): LLVM 7.1 doesn't have llvm.ctlz.i128/llvm.cttz.i128
33+
// When we upgrade NVVM, we can call the real intrinsic directly
34+
let kind = if name == sym::ctlz {
35+
CountZerosKind::Leading
36+
} else {
37+
CountZerosKind::Trailing
38+
};
39+
b.emulate_i128_count_zeros(args[0].immediate(), kind, false)
40+
}
41+
sym::ctlz_nonzero | sym::cttz_nonzero => {
42+
// TODO(@LegNeato): LLVM 7.1 doesn't have llvm.ctlz.i128/llvm.cttz.i128
43+
// When we upgrade NVVM, we can call the real intrinsic directly
44+
let kind = if name == sym::ctlz_nonzero {
45+
CountZerosKind::Leading
46+
} else {
47+
CountZerosKind::Trailing
48+
};
49+
b.emulate_i128_count_zeros(args[0].immediate(), kind, true)
50+
}
51+
sym::ctpop => {
52+
// TODO(@LegNeato): LLVM 7.1 doesn't have llvm.ctpop.i128
53+
// When we upgrade NVVM, we can call the real intrinsic directly
54+
b.emulate_i128_ctpop(args[0].immediate())
55+
}
56+
sym::bswap => {
57+
// TODO(@LegNeato): LLVM 7.1 doesn't have llvm.bswap.i128 (added in LLVM 9.0)
58+
// When we upgrade NVVM, we can call the real intrinsic directly
59+
// For now, emulate it by swapping the two i64 halves and byte-swapping each
60+
b.emulate_i128_bswap(args[0].immediate())
61+
}
62+
sym::bitreverse => {
63+
// TODO(@LegNeato): LLVM 7.1 doesn't have llvm.bitreverse.i128
64+
// When we upgrade NVVM, we can call the real intrinsic directly
65+
b.emulate_i128_bitreverse(args[0].immediate())
66+
}
67+
sym::rotate_left | sym::rotate_right => {
68+
// TODO(@LegNeato): LLVM 7.1 doesn't have llvm.fshl.i128/llvm.fshr.i128
69+
// When we upgrade NVVM, we can call the real intrinsic directly
70+
let is_left = name == sym::rotate_left;
71+
let val = args[0].immediate();
72+
let shift = args[1].immediate();
73+
b.emulate_i128_rotate(val, shift, is_left)
74+
}
75+
_ => {
76+
// For any unsupported 128-bit intrinsics, return a fatal error
77+
// This shouldn't happen with the current set of intrinsics
78+
b.fatal(format!("unsupported 128-bit intrinsic: {}", name))
79+
}
80+
}
2981
}
3082

3183
// llvm 7 does not have saturating intrinsics, so we reimplement them right here.
@@ -55,6 +107,19 @@ fn saturating_intrinsic_impl<'ll, 'tcx>(
55107
_ => unreachable!(),
56108
};
57109

110+
// The saturation bounds below are built as i64 constants, which cannot represent 128-bit limits, so width 128 is handled separately
111+
if width == 128 {
112+
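// For 128-bit saturating operations, call the `llvm.{s,u}{add,sub}.sat.i128` intrinsic by name. NOTE(review): LLVM 7 has no native saturating intrinsics, so this presumably relies on `call_intrinsic` resolving/emulating these names - confirm in builder.rs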
113+
let lhs = args[0].immediate();
114+
let rhs = args[1].immediate();
115+
let llvm_name = format!(
116+
"llvm.{}{}.sat.i128",
117+
if signed { 's' } else { 'u' },
118+
if is_add { "add" } else { "sub" }
119+
);
120+
return b.call_intrinsic(&llvm_name, &[lhs, rhs]);
121+
}
122+
58123
let unsigned_max_value = match width {
59124
8 => u8::MAX as i64,
60125
16 => u16::MAX as i64,
@@ -400,7 +465,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
400465
args,
401466
)
402467
} else if width == 128 {
403-
handle_128_bit_intrinsic(self)
468+
handle_128_bit_intrinsic(self, name, args)
404469
} else {
405470
match name {
406471
sym::ctlz | sym::cttz => {

examples/cuda/README.md

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,19 @@
22

33
The examples in here showcase both the GPU side and the CPU side of writing a tool which uses the GPU.
44

5-
## [Interactive Path Tracer](cpu/path_tracer)
5+
## Available Examples
66

7-
This example showcases a very simple interactive Path Tracer inspired by [Ray Tracing In One Weekend](https://raytracing.github.io/books/RayTracingInOneWeekend.html)
7+
### [vecadd](vecadd)
8+
A simple vector addition example demonstrating basic CUDA kernel usage.
9+
10+
### [gemm](gemm)
11+
General Matrix Multiplication (GEMM) implementation showing more complex CUDA operations.
12+
13+
### [sha2_crates_io](sha2_crates_io)
14+
Demonstrates using an existing Rust crate ([`sha2`](https://crates.io/crates/sha2) from crates.io) on both CPU and GPU without modification. Shows that the same cryptographic hashing code can run on CUDA, producing identical results to the CPU implementation.
15+
16+
### [Interactive Path Tracer](path_tracer)
17+
A very simple interactive Path Tracer inspired by [Ray Tracing In One Weekend](https://raytracing.github.io/books/RayTracingInOneWeekend.html)
818
which runs on CPU or GPU, with the additional option of running OptiX denoising.
919

1020
![Path Tracer](assets/path_tracer.png)
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[package]
2+
name = "sha2_crates_io"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cust = { path = "../../../crates/cust" }
8+
sha2_crates_io_kernels = { path = "kernels" }
9+
sha2 = "0.10"
10+
11+
[build-dependencies]
12+
cuda_builder = { path = "../../../crates/cuda_builder" }
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
use std::env;
2+
use std::path;
3+
4+
use cuda_builder::CudaBuilder;
5+
6+
fn main() {
7+
println!("cargo::rerun-if-changed=build.rs");
8+
println!("cargo::rerun-if-changed=kernels");
9+
10+
let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
11+
let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
12+
13+
CudaBuilder::new(manifest_dir.join("kernels"))
14+
.copy_to(out_path.join("kernels.ptx"))
15+
.build()
16+
.unwrap();
17+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[package]
2+
name = "sha2_crates_io_kernels"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
cuda_std = { path = "../../../../crates/cuda_std" }
8+
sha2 = { version = "0.10", default-features = false }
9+
10+
[lib]
11+
crate-type = ["cdylib", "rlib"]
Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
use cuda_std::prelude::*;
2+
use sha2::{Digest, Sha256, Sha512};
3+
4+
// One-shot API for SHA256
5+
#[kernel]
6+
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
7+
pub unsafe fn sha256_oneshot(input: &[u8], output: *mut [u8; 32]) {
8+
let idx = thread::index_1d() as usize;
9+
10+
if idx == 0 {
11+
let hash = Sha256::digest(input);
12+
13+
unsafe {
14+
let output_slice = &mut *output;
15+
output_slice.copy_from_slice(&hash);
16+
}
17+
}
18+
}
19+
20+
// Incremental API for SHA256
21+
#[kernel]
22+
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
23+
pub unsafe fn sha256_incremental(input1: &[u8], input2: &[u8], output: *mut [u8; 32]) {
24+
let idx = thread::index_1d() as usize;
25+
26+
if idx == 0 {
27+
let mut hasher = Sha256::new();
28+
hasher.update(input1);
29+
hasher.update(input2);
30+
let hash = hasher.finalize();
31+
32+
unsafe {
33+
let output_slice = &mut *output;
34+
output_slice.copy_from_slice(&hash);
35+
}
36+
}
37+
}
38+
39+
// One-shot API for SHA512
40+
#[kernel]
41+
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
42+
pub unsafe fn sha512_oneshot(input: &[u8], output: *mut [u8; 64]) {
43+
let idx = thread::index_1d() as usize;
44+
45+
if idx == 0 {
46+
let hash = Sha512::digest(input);
47+
48+
unsafe {
49+
let output_slice = &mut *output;
50+
output_slice.copy_from_slice(&hash);
51+
}
52+
}
53+
}
54+
55+
// Incremental API for SHA512
56+
#[kernel]
57+
#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
58+
pub unsafe fn sha512_incremental(input: &[u8], output: *mut [u8; 64]) {
59+
let idx = thread::index_1d() as usize;
60+
61+
if idx == 0 {
62+
let mut hasher = Sha512::new();
63+
hasher.update(input);
64+
let hash = hasher.finalize();
65+
66+
unsafe {
67+
let output_slice = &mut *output;
68+
output_slice.copy_from_slice(&hash);
69+
}
70+
}
71+
}

0 commit comments

Comments
 (0)