Emulate i128 ops

LegNeato · LegNeato · commit 416411dcc445 · 2025-08-07T23:46:44.000-05:00
LLVM 7.1 (and thus nvvm) lacks native 128-bit integer intrinsics, so emulate them using 64-bit operations. I also added an example with `sha2` to confirm it works and show folks that crates.io crates can often be used directly unmodified. Fixes #207.
diff --git a/Cargo.toml b/Cargo.toml
@@ -14,6 +14,8 @@ members = [
   "examples/cuda/gemm/kernels",
   "examples/cuda/path_tracer",
   "examples/cuda/path_tracer/kernels",
+  "examples/cuda/sha2_crates_io",
+  "examples/cuda/sha2_crates_io/kernels",
 
   "examples/optix/*",
   "tests/compiletests",
diff --git a/crates/rustc_codegen_nvvm/src/builder.rs b/crates/rustc_codegen_nvvm/src/builder.rs
diff --git a/crates/rustc_codegen_nvvm/src/intrinsic.rs b/crates/rustc_codegen_nvvm/src/intrinsic.rs
@@ -17,15 +17,67 @@ use rustc_target::callconv::PassMode;
 use tracing::trace;
 
 use crate::abi::LlvmType;
-use crate::builder::Builder;
+use crate::builder::{Builder, CountZerosKind};
 use crate::context::CodegenCx;
 use crate::llvm::{self, Type, Value};
 use crate::ty::LayoutLlvmExt;
 
-// libnvvm does not support some advanced intrinsics for i128 so we just abort on them for now. In the future
-// we should emulate them in software.
-fn handle_128_bit_intrinsic<'ll>(b: &mut Builder<'_, 'll, '_>) -> &'ll Value {
-    b.abort_and_ret_i128()
+fn handle_128_bit_intrinsic<'ll>(
+    b: &mut Builder<'_, 'll, '_>,
+    name: Symbol,
+    args: &[OperandRef<'_, &'ll Value>],
+) -> &'ll Value {
+    match name {
+        sym::ctlz | sym::cttz => {
+            // TODO(@LegNeato): LLVM 7.1 doesn't have llvm.ctlz.i128/llvm.cttz.i128
+            // When we upgrade NVVM, we can call the real intrinsic directly
+            let kind = if name == sym::ctlz {
+                CountZerosKind::Leading
+            } else {
+                CountZerosKind::Trailing
+            };
+            b.emulate_i128_count_zeros(args[0].immediate(), kind, false)
+        }
+        sym::ctlz_nonzero | sym::cttz_nonzero => {
+            // TODO(@LegNeato): LLVM 7.1 doesn't have llvm.ctlz.i128/llvm.cttz.i128
+            // When we upgrade NVVM, we can call the real intrinsic directly
+            let kind = if name == sym::ctlz_nonzero {
+                CountZerosKind::Leading
+            } else {
+                CountZerosKind::Trailing
+            };
+            b.emulate_i128_count_zeros(args[0].immediate(), kind, true)
+        }
+        sym::ctpop => {
+            // TODO(@LegNeato): LLVM 7.1 doesn't have llvm.ctpop.i128
+            // When we upgrade NVVM, we can call the real intrinsic directly
+            b.emulate_i128_ctpop(args[0].immediate())
+        }
+        sym::bswap => {
+            // TODO(@LegNeato): LLVM 7.1 doesn't have llvm.bswap.i128 (added in LLVM 9.0)
+            // When we upgrade NVVM, we can call the real intrinsic directly
+            // For now, emulate it by swapping the two i64 halves and byte-swapping each
+            b.emulate_i128_bswap(args[0].immediate())
+        }
+        sym::bitreverse => {
+            // TODO(@LegNeato): LLVM 7.1 doesn't have llvm.bitreverse.i128
+            // When we upgrade NVVM, we can call the real intrinsic directly
+            b.emulate_i128_bitreverse(args[0].immediate())
+        }
+        sym::rotate_left | sym::rotate_right => {
+            // TODO(@LegNeato): LLVM 7.1 doesn't have llvm.fshl.i128/llvm.fshr.i128
+            // When we upgrade NVVM, we can call the real intrinsic directly
+            let is_left = name == sym::rotate_left;
+            let val = args[0].immediate();
+            let shift = args[1].immediate();
+            b.emulate_i128_rotate(val, shift, is_left)
+        }
+        _ => {
+            // For any unsupported 128-bit intrinsics, return a fatal error
+            // This shouldn't happen with the current set of intrinsics
+            b.fatal(format!("unsupported 128-bit intrinsic: {}", name))
+        }
+    }
 }
 
 // llvm 7 does not have saturating intrinsics, so we reimplement them right here.
@@ -55,6 +107,19 @@ fn saturating_intrinsic_impl<'ll, 'tcx>(
         _ => unreachable!(),
     };
 
+    // For 128-bit, we need to handle the constants differently
+    if width == 128 {
+        // For 128-bit saturating operations, use LLVM's saturating intrinsics directly
+        let lhs = args[0].immediate();
+        let rhs = args[1].immediate();
+        let llvm_name = format!(
+            "llvm.{}{}.sat.i128",
+            if signed { 's' } else { 'u' },
+            if is_add { "add" } else { "sub" }
+        );
+        return b.call_intrinsic(&llvm_name, &[lhs, rhs]);
+    }
+
     let unsigned_max_value = match width {
         8 => u8::MAX as i64,
         16 => u16::MAX as i64,
@@ -400,7 +465,7 @@ impl<'ll, 'tcx> IntrinsicCallBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
                         args,
                     )
                 } else if width == 128 {
-                    handle_128_bit_intrinsic(self)
+                    handle_128_bit_intrinsic(self, name, args)
                 } else {
                     match name {
                         sym::ctlz | sym::cttz => {
diff --git a/examples/cuda/README.md b/examples/cuda/README.md
@@ -2,9 +2,19 @@
 
 The examples in here showcase both the GPU side and the CPU side of writing a tool which uses the GPU.
 
-## [Interactive Path Tracer](cpu/path_tracer)
+## Available Examples
 
-This example showcases a very simple interactive Path Tracer inspired by [Ray Tracing In One Weekend](https://raytracing.github.io/books/RayTracingInOneWeekend.html)
+### [vecadd](vecadd)
+A simple vector addition example demonstrating basic CUDA kernel usage.
+
+### [gemm](gemm)
+General Matrix Multiplication (GEMM) implementation showing more complex CUDA operations.
+
+### [sha2_crates_io](sha2_crates_io)
+Demonstrates using an existing Rust crate ([`sha2`](https://crates.io/crates/sha2) from crates.io) on both CPU and GPU without modification. Shows that the same cryptographic hashing code can run on CUDA, producing identical results to the CPU implementation.
+
+### [Interactive Path Tracer](path_tracer)
+A very simple interactive Path Tracer inspired by [Ray Tracing In One Weekend](https://raytracing.github.io/books/RayTracingInOneWeekend.html)
 which runs on CPU or GPU, with the additional option of running OptiX denoising.
 
 ![Path Tracer](assets/path_tracer.png)
diff --git a/examples/cuda/sha2_crates_io/Cargo.toml b/examples/cuda/sha2_crates_io/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "sha2_crates_io"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cust = { path = "../../../crates/cust" }
+sha2_crates_io_kernels = { path = "kernels" }
+sha2 = "0.10"
+
+[build-dependencies]
+cuda_builder = { path = "../../../crates/cuda_builder" }
diff --git a/examples/cuda/sha2_crates_io/build.rs b/examples/cuda/sha2_crates_io/build.rs
@@ -0,0 +1,17 @@
+use std::env;
+use std::path;
+
+use cuda_builder::CudaBuilder;
+
+fn main() {
+    println!("cargo::rerun-if-changed=build.rs");
+    println!("cargo::rerun-if-changed=kernels");
+
+    let out_path = path::PathBuf::from(env::var("OUT_DIR").unwrap());
+    let manifest_dir = path::PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
+
+    CudaBuilder::new(manifest_dir.join("kernels"))
+        .copy_to(out_path.join("kernels.ptx"))
+        .build()
+        .unwrap();
+}
diff --git a/examples/cuda/sha2_crates_io/kernels/Cargo.toml b/examples/cuda/sha2_crates_io/kernels/Cargo.toml
@@ -0,0 +1,11 @@
+[package]
+name = "sha2_crates_io_kernels"
+version = "0.1.0"
+edition = "2024"
+
+[dependencies]
+cuda_std = { path = "../../../../crates/cuda_std" }
+sha2 = { version = "0.10", default-features = false }
+
+[lib]
+crate-type = ["cdylib", "rlib"]
diff --git a/examples/cuda/sha2_crates_io/kernels/src/lib.rs b/examples/cuda/sha2_crates_io/kernels/src/lib.rs
@@ -0,0 +1,71 @@
+use cuda_std::prelude::*;
+use sha2::{Digest, Sha256, Sha512};
+
+// One-shot API for SHA256
+#[kernel]
+#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
+pub unsafe fn sha256_oneshot(input: &[u8], output: *mut [u8; 32]) {
+    let idx = thread::index_1d() as usize;
+
+    if idx == 0 {
+        let hash = Sha256::digest(input);
+
+        unsafe {
+            let output_slice = &mut *output;
+            output_slice.copy_from_slice(&hash);
+        }
+    }
+}
+
+// Incremental API for SHA256
+#[kernel]
+#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
+pub unsafe fn sha256_incremental(input1: &[u8], input2: &[u8], output: *mut [u8; 32]) {
+    let idx = thread::index_1d() as usize;
+
+    if idx == 0 {
+        let mut hasher = Sha256::new();
+        hasher.update(input1);
+        hasher.update(input2);
+        let hash = hasher.finalize();
+
+        unsafe {
+            let output_slice = &mut *output;
+            output_slice.copy_from_slice(&hash);
+        }
+    }
+}
+
+// One-shot API for SHA512
+#[kernel]
+#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
+pub unsafe fn sha512_oneshot(input: &[u8], output: *mut [u8; 64]) {
+    let idx = thread::index_1d() as usize;
+
+    if idx == 0 {
+        let hash = Sha512::digest(input);
+
+        unsafe {
+            let output_slice = &mut *output;
+            output_slice.copy_from_slice(&hash);
+        }
+    }
+}
+
+// Incremental API for SHA512
+#[kernel]
+#[allow(improper_ctypes_definitions, clippy::missing_safety_doc)]
+pub unsafe fn sha512_incremental(input: &[u8], output: *mut [u8; 64]) {
+    let idx = thread::index_1d() as usize;
+
+    if idx == 0 {
+        let mut hasher = Sha512::new();
+        hasher.update(input);
+        let hash = hasher.finalize();
+
+        unsafe {
+            let output_slice = &mut *output;
+            output_slice.copy_from_slice(&hash);
+        }
+    }
+}
diff --git a/examples/cuda/sha2_crates_io/src/main.rs b/examples/cuda/sha2_crates_io/src/main.rs