Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,11 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/)"

test_list_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest list --cargo-profile $(CARGO_PROFILE) \
--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/.*gpu.*/)"

test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
Expand Down
3 changes: 1 addition & 2 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <vector>

extern "C" {

Expand Down Expand Up @@ -141,4 +139,5 @@ bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);

#endif
143 changes: 142 additions & 1 deletion backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <variant>
#include <vector>

#include "integer/integer.h"

extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU;
Expand Down Expand Up @@ -37,10 +39,149 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
}
}

int get_active_gpu_count(int num_inputs, int gpu_count);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);

int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);

// A set of GPU streams and the GPUs they are associated with.
// Can be constructed from the FFI struct CudaStreamsFFI, which is only used
// to pass the streams/gpus at the Rust/C interface. This class should only
// be constructed from the FFI struct, through class methods or through the
// copy constructor. The class can also be constructed as an empty set.
struct CudaStreams {
private:
  // Stream handles, one per entry of _gpu_indexes. Borrowed from the FFI
  // side, except when _owns_streams is true (see create_on_same_gpus).
  cudaStream_t const *_streams;
  // GPU index on which each corresponding stream lives.
  uint32_t const *_gpu_indexes;
  // Number of valid entries in _streams/_gpu_indexes.
  // (uint32_t)-1 marks an empty, not-yet-initialized set.
  uint32_t _gpu_count;
  // True only for the instance whose arrays were allocated by
  // create_on_same_gpus; that instance must call release() before it is
  // destroyed.
  bool _owns_streams;

  // Prevent the construction of a CudaStreams class from user-code.
  CudaStreams(cudaStream_t const *streams, uint32_t const *gpu_indexes,
              uint32_t gpu_count)
      : _streams(streams), _gpu_indexes(gpu_indexes), _gpu_count(gpu_count),
        _owns_streams(false) {}

public:
  // Construct an empty set. Invalid use of an empty set should raise an error
  // right away through asserts or because of a nullptr dereference.
  CudaStreams()
      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
        _owns_streams(false) {}

  // Returns a subset of this set as an active subset. An active subset is one
  // that is temporarily used to perform some computation on
  // `num_radix_blocks` inputs.
  CudaStreams active_gpu_subset(int num_radix_blocks) {
    return CudaStreams(_streams, _gpu_indexes,
                       get_active_gpu_count(num_radix_blocks, _gpu_count));
  }

  // Returns a subset containing only the first GPU of this set. It
  // is used to create a subset of streams for mono-GPU functions.
  CudaStreams subset_first_gpu() const {
    return CudaStreams(_streams, _gpu_indexes, 1);
  }

  // Synchronize all the streams in the set (blocks the host until every
  // stream has drained).
  void synchronize() const {
    for (uint32_t i = 0; i < _gpu_count; i++) {
      cuda_synchronize_stream(_streams[i], _gpu_indexes[i]);
    }
  }

  // Bounds-checked accessor for the idx-th stream.
  cudaStream_t stream(uint32_t idx) const {
    PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
    return _streams[idx];
  }
  // Bounds-checked accessor for the idx-th GPU index.
  uint32_t gpu_index(uint32_t idx) const {
    PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
    return _gpu_indexes[idx];
  }
  // Number of streams/GPUs in the set.
  uint32_t count() const { return _gpu_count; }

  // Construct from the Rust FFI stream set. Streams are created in Rust
  // using the bindings; this instance merely borrows them.
  CudaStreams(CudaStreamsFFI &ffi)
      : _streams((cudaStream_t *)ffi.streams), _gpu_indexes(ffi.gpu_indexes),
        _gpu_count(ffi.gpu_count), _owns_streams(false) {}

  // Create a new set of streams on the same GPUs as those of `other`.
  // Can be used to parallelize computation by issuing kernels on multiple
  // streams on the same GPU. Must be called on an empty set; the resulting
  // instance owns its streams and requires a matching release() call.
  void create_on_same_gpus(const CudaStreams &other) {
    PANIC_IF_FALSE(_streams == nullptr,
                   "Assign clone to non-empty cudastreams");

    cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];

    // BUGFIX: size this from `other`, not from `this`. `this` is required to
    // be an empty set here, whose _gpu_count is (uint32_t)-1, so sizing from
    // it requested a ~4-billion-element allocation (and any other mismatch
    // would overflow the buffer in the copy loop below, which is bounded by
    // other._gpu_count).
    uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
    for (uint32_t i = 0; i < other._gpu_count; ++i) {
      new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
      gpu_indexes_clone[i] = other._gpu_indexes[i];
    }

    this->_streams = new_streams;
    this->_gpu_indexes = gpu_indexes_clone;
    this->_gpu_count = other._gpu_count;

    // Flag this instance as owning streams so that we can destroy
    // the streams when they aren't needed anymore
    this->_owns_streams = true;
  }

  // Copy constructor, setting the own flag to false.
  // Only the initial instance of CudaStreams created with
  // create_on_same_gpus owns streams; all copies of it do not own the
  // streams.
  CudaStreams(const CudaStreams &src)
      : _streams(src._streams), _gpu_indexes(src._gpu_indexes),
        _gpu_count(src._gpu_count), _owns_streams(false) {}

  // Copy assignment; only allowed onto an empty set (or a self-assignment),
  // to avoid silently dropping a previously held stream set.
  CudaStreams &operator=(CudaStreams const &other) {
    PANIC_IF_FALSE(this->_streams == nullptr ||
                       this->_streams == other._streams,
                   "Assigning an already initialized CudaStreams");
    this->_streams = other._streams;
    this->_gpu_indexes = other._gpu_indexes;
    this->_gpu_count = other._gpu_count;

    // Only the initial instance of CudaStreams created with
    // create_on_same_gpus owns streams; all copies of it do not own the
    // streams.
    this->_owns_streams = false;
    return *this;
  }

  // Destroy the streams if they were created by create_on_same_gpus.
  // We require the developer to call `release` on all instances
  // of created streams.
  void release() {
    // If this instance doesn't own streams, there's nothing to do
    // as the streams were created on the Rust side.
    if (_owns_streams) {
      for (uint32_t i = 0; i < _gpu_count; ++i) {
        cuda_destroy_stream(_streams[i], _gpu_indexes[i]);
      }
      delete[] _streams;
      _streams = nullptr;
      delete[] _gpu_indexes;
      _gpu_indexes = nullptr;
    }
  }

  // The destructor checks that streams created with create_on_same_gpus
  // were destroyed manually with `release`.
  ~CudaStreams() {
    // Ensure streams are destroyed
    PANIC_IF_FALSE(
        !_owns_streams || _streams == nullptr,
        "release (this=%p) was not called on a CudaStreams object that "
        "owns its streams (created with create_on_same_gpus), %p",
        this, this->_streams);
  }
};

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H

#include "../../pbs/pbs_enums.h"
#include "../integer.h"

typedef struct {
void *ptr;
Expand All @@ -25,77 +26,65 @@ typedef struct {

extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log,
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);

void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);

void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
uint32_t carry_modulus, bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);

void cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, int8_t *mem_ptr);

void cleanup_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

void cleanup_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
CudaStreamsFFI streams, int8_t **mem_ptr_void);
}

#endif
Loading
Loading