Commit 71f427d

chore(gpu): add assert macro
1 parent 451458d commit 71f427d

12 files changed, +109 -67 lines changed

backends/tfhe-cuda-backend/cuda/include/device.h

Lines changed: 29 additions & 0 deletions
@@ -19,13 +19,42 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort();
   }
 }
+
+// The PANIC macro should be used to validate user inputs to GPU functions;
+// it executes in all targets, including production settings. E.g., a
+// cudaMemcpy to the device should check that the destination pointer is
+// a device pointer.
 #define PANIC(format, ...)                                                    \
   {                                                                           \
     std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,        \
                  __LINE__, __func__, ##__VA_ARGS__);                          \
     std::abort();                                                             \
   }
 
+// This is a generic assertion-checking macro with a user-defined
+// printf-style message.
+#define PANIC_IF_FALSE(cond, format, ...)                                     \
+  do {                                                                        \
+    if (!(cond)) {                                                            \
+      PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond);                        \
+    }                                                                         \
+  } while (0)
+
+#ifndef GPU_ASSERTS_DISABLE
+// The GPU assert should be used to validate assumptions in algorithms:
+// for example, checking that two user-provided quantities have a certain
+// relationship, or that the size of the buffer provided to a function is
+// sufficient when it is filled by an algorithm that depends on
+// user-provided inputs. E.g., the OPRF corrections buffer should not be
+// larger than the number of blocks in the datatype being generated.
+#define GPU_ASSERT(cond, format, ...)                                         \
+  PANIC_IF_FALSE(cond, format, ##__VA_ARGS__)
+#else
+// Disabled variant: it must accept the same arguments as the active one so
+// that call sites still compile when GPU_ASSERTS_DISABLE is defined.
+#define GPU_ASSERT(cond, format, ...)                                         \
+  do {                                                                        \
+  } while (0)
+#endif
+
 uint32_t cuda_get_device();
 void cuda_set_device(uint32_t gpu_index);
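
For illustration, here is a minimal standalone sketch (plain C++, no CUDA required) of how the two new macros compose. The check_buffer function and its arguments are hypothetical, not part of the backend: the do/while (0) wrapper makes PANIC_IF_FALSE behave as a single statement, and #cond stringifies the failed expression so it is printed after the user-supplied message.

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Copies of the definitions above, so the sketch is self-contained.
#define PANIC(format, ...)                                                    \
  {                                                                           \
    std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,        \
                 __LINE__, __func__, ##__VA_ARGS__);                          \
    std::abort();                                                             \
  }

#define PANIC_IF_FALSE(cond, format, ...)                                     \
  do {                                                                        \
    if (!(cond)) {                                                            \
      PANIC(format "\n\n %s\n", ##__VA_ARGS__, #cond);                        \
    }                                                                         \
  } while (0)

// Hypothetical validation helper, illustrative only.
void check_buffer(const void *ptr, uint32_t num_blocks) {
  // do/while (0) makes each check a single statement, safe in unbraced ifs.
  PANIC_IF_FALSE(ptr != nullptr, "buffer pointer must not be null");
  PANIC_IF_FALSE(num_blocks > 0, "num_blocks (%u) must be positive",
                 num_blocks);
}

int main() {
  uint32_t blocks = 4;
  check_buffer(&blocks, blocks); // passes silently
  check_buffer(nullptr, blocks); // prints the panic message and aborts
}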

backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh

Lines changed: 5 additions & 3 deletions
@@ -54,9 +54,11 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
                            int8_t *d_mem, uint32_t r, uint32_t glwe_dim,
                            uint32_t polynomial_size, uint32_t level_count,
                            uint32_t max_shared_memory) {
-  if (gpu_count != 1)
-    PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
-          "supported yet.")
+  PANIC_IF_FALSE(gpu_count == 1,
+                 "GPU error (batch_fft_ggsw_vector): multi-GPU execution on "
+                 "%d gpus is not supported yet.",
+                 gpu_count);
+
   cuda_set_device(gpu_indexes[0]);
 
   int shared_memory_size = sizeof(double) * polynomial_size;
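
Given the PANIC definition above, a call reaching this check with gpu_count == 2 would print something like the following to stderr before aborting (file and line are illustrative):

ggsw.cuh::57::batch_fft_ggsw_vector: panic.
GPU error (batch_fft_ggsw_vector): multi-GPU execution on 2 gpus is not supported yet.

 gpu_count == 1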

backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh

Lines changed: 4 additions & 2 deletions
@@ -124,8 +124,10 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
       num_blocks_per_sample, num_threads_x);
 
   int shared_mem = sizeof(Torus) * num_threads_y * num_threads_x;
-  if (num_blocks_per_sample > 65536)
-    PANIC("Cuda error (Keyswitch): number of blocks per sample is too large");
+  PANIC_IF_FALSE(
+      num_blocks_per_sample <= 65536,
+      "Cuda error (Keyswitch): number of blocks per sample (%d) is too large",
+      num_blocks_per_sample);
 
   // In multiplication of large integers (512, 1024, 2048), the number of
   // samples can be larger than 65536, so we need to set it in the first

backends/tfhe-cuda-backend/cuda/src/crypto/packing_keyswitch.cuh

Lines changed: 3 additions & 2 deletions
@@ -204,8 +204,9 @@ __host__ void host_packing_keyswitch_lwe_list_to_glwe(
 
   // Shared memory requirement is 8192 bytes for 64-bit Torus elements
   uint32_t shared_mem_size = get_shared_mem_size_tgemm<Torus>();
-  if (shared_mem_size > 8192)
-    PANIC("GEMM kernel error: shared memory required might be too large");
+  // Sanity check: the shared memory size is a constant defined by the algorithm
+  GPU_ASSERT(shared_mem_size <= 8192,
+             "GEMM kernel error: shared memory required might be too large");
 
   tgemm<Torus><<<grid_gemm, threads_gemm, shared_mem_size, stream>>>(
       num_lwes, glwe_accumulator_size, lwe_dimension, d_mem_0, fp_ksk_array,
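
Since this invariant does not depend on user input, the check uses GPU_ASSERT, which compiles away when GPU_ASSERTS_DISABLE is defined. A hypothetical translation unit, with illustrative build flags (the real build system may differ):

// checks.cu -- hypothetical file, illustrative only.
//
//   nvcc -c checks.cu                        -> GPU_ASSERT is active
//   nvcc -DGPU_ASSERTS_DISABLE -c checks.cu  -> GPU_ASSERT is a no-op
#include "device.h"
#include <cstdint>

void sanity_check(uint32_t shared_mem_size) {
  // In active builds this aborts with the message below if the invariant is
  // broken; with GPU_ASSERTS_DISABLE it expands to an empty do { } while (0).
  GPU_ASSERT(shared_mem_size <= 8192,
             "GEMM kernel error: shared memory required might be too large");
}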

backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh

Lines changed: 7 additions & 8 deletions
@@ -298,15 +298,14 @@ __host__ void host_improve_noise_modulus_switch(
     const double input_variance, const double r_sigma, const double bound,
     uint32_t log_modulus) {
 
-  if (lwe_size < 512) {
-    PANIC("The lwe_size is less than 512, this is not supported\n");
-    return;
-  }
+  PANIC_IF_FALSE(lwe_size >= 512,
+                 "The lwe_size (%d) is less than 512, this is not supported\n",
+                 lwe_size);
+  PANIC_IF_FALSE(
+      lwe_size <= 1024,
+      "The lwe_size (%d) is greater than 1024, this is not supported\n",
+      lwe_size);
 
-  if (lwe_size > 1024) {
-    PANIC("The lwe_size is greater than 1024, this is not supported\n");
-    return;
-  }
   cuda_set_device(gpu_index);
 
   // This reduction requires a power of two num of threads
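
The deleted return; statements were dead code: PANIC expands to a block ending in std::abort(), which never returns, so execution cannot reach the return. A minimal sketch with a hypothetical function, assuming device.h is on the include path:

#include "device.h" // for PANIC
#include <cstdint>

// Hypothetical function, illustrative only.
void reject_small_lwe(uint32_t lwe_size) {
  if (lwe_size < 512) {
    PANIC("The lwe_size is less than 512, this is not supported\n")
    return; // unreachable: std::abort() has already terminated the process
  }
}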

backends/tfhe-cuda-backend/cuda/src/device.cu

Lines changed: 12 additions & 12 deletions
@@ -196,14 +196,14 @@ void cuda_memcpy_with_size_tracking_async_gpu_to_gpu(
     return;
   cudaPointerAttributes attr_dest;
   check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
   cudaPointerAttributes attr_src;
   check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
   cuda_set_device(gpu_index);
   if (attr_src.device == attr_dest.device) {
     check_cuda_error(
@@ -227,14 +227,14 @@ void cuda_memcpy_gpu_to_gpu(void *dest, void const *src, uint64_t size,
     return;
   cudaPointerAttributes attr_dest;
   check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest));
-  if (attr_dest.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_dest.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid dest device pointer in copy from GPU to GPU.");
   cudaPointerAttributes attr_src;
   check_cuda_error(cudaPointerGetAttributes(&attr_src, src));
-  if (attr_src.type != cudaMemoryTypeDevice) {
-    PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU.")
-  }
+  PANIC_IF_FALSE(
+      attr_src.type == cudaMemoryTypeDevice,
+      "Cuda error: invalid src device pointer in copy from GPU to GPU.");
   cuda_set_device(gpu_index);
   if (attr_src.device == attr_dest.device) {
     check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToDevice));
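
The same dest/src validation now appears four times across these two copy wrappers; a hypothetical helper (not part of this commit) could factor out the pattern using the standard cudaPointerGetAttributes API:

#include <cuda_runtime.h>
#include "device.h" // for check_cuda_error and PANIC_IF_FALSE

// Hypothetical helper, illustrative only; `role` is "dest" or "src".
static void assert_device_pointer(const void *ptr, const char *role) {
  cudaPointerAttributes attr;
  check_cuda_error(cudaPointerGetAttributes(&attr, ptr));
  // cudaMemoryTypeDevice excludes host, managed, and unregistered memory.
  PANIC_IF_FALSE(attr.type == cudaMemoryTypeDevice,
                 "Cuda error: invalid %s device pointer in copy from GPU to "
                 "GPU.",
                 role);
}

// Usage inside the wrappers:
//   assert_device_pointer(dest, "dest");
//   assert_device_pointer(src, "src");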

backends/tfhe-cuda-backend/cuda/src/integer/bitwise_ops.cuh

Lines changed: 9 additions & 6 deletions
@@ -20,12 +20,15 @@ __host__ void host_integer_radix_bitop_kb(
     void *const *bsks, Torus *const *ksks,
     CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
 
-  if (lwe_array_out->num_radix_blocks != lwe_array_1->num_radix_blocks ||
-      lwe_array_out->num_radix_blocks != lwe_array_2->num_radix_blocks)
-    PANIC("Cuda error: input and output num radix blocks must be equal")
-  if (lwe_array_out->lwe_dimension != lwe_array_1->lwe_dimension ||
-      lwe_array_out->lwe_dimension != lwe_array_2->lwe_dimension)
-    PANIC("Cuda error: input and output lwe dimension must be equal")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks == lwe_array_1->num_radix_blocks &&
+          lwe_array_out->num_radix_blocks == lwe_array_2->num_radix_blocks,
+      "Cuda error: input and output num radix blocks must be equal");
+
+  PANIC_IF_FALSE(lwe_array_out->lwe_dimension == lwe_array_1->lwe_dimension &&
+                     lwe_array_out->lwe_dimension == lwe_array_2->lwe_dimension,
+                 "Cuda error: input and output lwe dimension must be equal");
+
   auto lut = mem_ptr->lut;
   uint64_t degrees[lwe_array_1->num_radix_blocks];
   if (mem_ptr->op == BITOP_TYPE::BITAND) {

backends/tfhe-cuda-backend/cuda/src/integer/cast.cuh

Lines changed: 5 additions & 7 deletions
@@ -25,10 +25,10 @@ __host__ void host_trim_radix_blocks_lsb(CudaRadixCiphertextFFI *output,
   const uint32_t input_start_lwe_index =
       input->num_radix_blocks - output->num_radix_blocks;
 
-  if (input->num_radix_blocks <= output->num_radix_blocks) {
-    PANIC("Cuda error: input num blocks should be greater than output num "
-          "blocks");
-  }
+  PANIC_IF_FALSE(input->num_radix_blocks > output->num_radix_blocks,
+                 "Cuda error: input num blocks (%d) should be greater than "
+                 "output num blocks (%d)",
+                 input->num_radix_blocks, output->num_radix_blocks);
 
   copy_radix_ciphertext_slice_async<Torus>(
       streams[0], gpu_indexes[0], output, 0, output->num_radix_blocks, input,
@@ -70,9 +70,7 @@ __host__ void host_extend_radix_with_sign_msb(
   PUSH_RANGE("cast/extend")
   const uint32_t input_blocks = input->num_radix_blocks;
 
-  if (input_blocks == 0) {
-    PANIC("Cuda error: input blocks cannot be zero");
-  }
+  PANIC_IF_FALSE(input_blocks > 0, "Cuda error: input blocks cannot be zero");
 
   copy_radix_ciphertext_slice_async<Torus>(streams[0], gpu_indexes[0], output,
                                            0, input_blocks, input, 0,

backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh

Lines changed: 12 additions & 8 deletions
@@ -15,14 +15,18 @@ zero_out_if(cudaStream_t const *streams, uint32_t const *gpu_indexes,
             Torus *const *ksks,
             CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key,
             uint32_t num_radix_blocks) {
-  if (lwe_array_out->num_radix_blocks < num_radix_blocks ||
-      lwe_array_input->num_radix_blocks < num_radix_blocks)
-    PANIC("Cuda error: input or output radix ciphertexts does not have enough "
-          "blocks")
-  if (lwe_array_out->lwe_dimension != lwe_array_input->lwe_dimension ||
-      lwe_array_input->lwe_dimension != lwe_condition->lwe_dimension)
-    PANIC("Cuda error: input and output radix ciphertexts must have the same "
-          "lwe dimension")
+  PANIC_IF_FALSE(
+      lwe_array_out->num_radix_blocks >= num_radix_blocks &&
+          lwe_array_input->num_radix_blocks >= num_radix_blocks,
+      "Cuda error: input or output radix ciphertexts do not have enough "
+      "blocks");
+
+  PANIC_IF_FALSE(
+      lwe_array_out->lwe_dimension == lwe_array_input->lwe_dimension &&
+          lwe_array_input->lwe_dimension == lwe_condition->lwe_dimension,
+      "Cuda error: input and output radix ciphertexts must have the same "
+      "lwe dimension");
+
   cuda_set_device(gpu_indexes[0]);
   auto params = mem_ptr->params;

backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh

Lines changed: 14 additions & 11 deletions
@@ -101,17 +101,20 @@ __host__ void host_radix_blocks_rotate_right(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, CudaRadixCiphertextFFI *dst,
     CudaRadixCiphertextFFI *src, uint32_t rotations, uint32_t num_blocks) {
-  if (src == dst) {
-    PANIC("Cuda error (blocks_rotate_right): the source and destination "
-          "pointers should be different");
-  }
-  if (dst->lwe_dimension != src->lwe_dimension)
-    PANIC("Cuda error: input and output should have the same "
-          "lwe dimension")
-  if (dst->num_radix_blocks < num_blocks || src->num_radix_blocks < num_blocks)
-    PANIC("Cuda error: input and output should have more blocks than asked for "
-          "in the "
-          "function call")
+  PANIC_IF_FALSE(src != dst,
+                 "Cuda error (blocks_rotate_right): the source and destination "
+                 "pointers should be different");
+
+  PANIC_IF_FALSE(dst->lwe_dimension == src->lwe_dimension,
+                 "Cuda error: input and output should have the same "
+                 "lwe dimension");
+
+  PANIC_IF_FALSE(dst->num_radix_blocks >= num_blocks &&
+                     src->num_radix_blocks >= num_blocks,
+                 "Cuda error: input and output should have at least as many "
+                 "blocks as requested in the function call");
+
   auto lwe_size = src->lwe_dimension + 1;
