Skip to content

Commit 022cb3b

Browse files
fix(gpu): avoid out of memory when benchmarking throughput
1 parent c4feabb commit 022cb3b

File tree

2 files changed

+53
-23
lines changed

2 files changed

+53
-23
lines changed

backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Lines changed: 36 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -756,18 +756,20 @@ template <typename Torus> struct int_radix_lut {
756756
CudaStreams streams, uint64_t max_num_radix_blocks,
757757
uint64_t &size_tracker, bool allocate_gpu_memory) {
758758
// We need to create the auxiliary array only in GPU 0
759-
lwe_aligned_vec.resize(active_streams.count());
760-
for (uint i = 0; i < active_streams.count(); i++) {
761-
uint64_t size_tracker_on_array_i = 0;
762-
auto inputs_on_gpu = std::max(
763-
THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(max_num_radix_blocks, i,
764-
active_streams.count()));
765-
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
766-
inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
767-
streams.stream(0), streams.gpu_index(0), size_tracker_on_array_i,
768-
allocate_gpu_memory);
769-
lwe_aligned_vec[i] = d_array;
770-
size_tracker += size_tracker_on_array_i;
759+
if (active_streams.count() > 1) {
760+
lwe_aligned_vec.resize(active_streams.count());
761+
for (uint i = 0; i < active_streams.count(); i++) {
762+
uint64_t size_tracker_on_array_i = 0;
763+
auto inputs_on_gpu = std::max(
764+
THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(max_num_radix_blocks, i,
765+
active_streams.count()));
766+
Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
767+
inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
768+
streams.stream(0), streams.gpu_index(0), size_tracker_on_array_i,
769+
allocate_gpu_memory);
770+
lwe_aligned_vec[i] = d_array;
771+
size_tracker += size_tracker_on_array_i;
772+
}
771773
}
772774
}
773775

@@ -1632,8 +1634,19 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
16321634
luts_message_carry = new int_radix_lut<Torus>(
16331635
streams, params, 2, pbs_count, true, size_tracker);
16341636
allocated_luts_message_carry = true;
1637+
uint64_t message_modulus_bits =
1638+
(uint64_t)std::log2(params.message_modulus);
1639+
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
1640+
uint64_t total_bits_per_block =
1641+
message_modulus_bits + carry_modulus_bits;
1642+
uint64_t denominator =
1643+
(uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
1644+
(pow(2, message_modulus_bits) - 1));
1645+
1646+
uint64_t upper_bound_num_blocks =
1647+
max_total_blocks_in_vec * 2 / denominator;
16351648
luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
1636-
streams, this->max_total_blocks_in_vec, size_tracker, true);
1649+
streams, upper_bound_num_blocks, size_tracker, true);
16371650
}
16381651
}
16391652
if (allocated_luts_message_carry) {
@@ -1731,9 +1744,17 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
17311744
this->current_blocks = current_blocks;
17321745
this->small_lwe_vector = small_lwe_vector;
17331746
this->luts_message_carry = reused_lut;
1747+
1748+
uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
1749+
uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
1750+
uint64_t total_bits_per_block = message_modulus_bits + carry_modulus_bits;
1751+
uint64_t denominator =
1752+
(uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
1753+
(pow(2, message_modulus_bits) - 1));
1754+
1755+
uint64_t upper_bound_num_blocks = max_total_blocks_in_vec * 2 / denominator;
17341756
this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
1735-
streams, this->max_total_blocks_in_vec, size_tracker,
1736-
allocate_gpu_memory);
1757+
streams, upper_bound_num_blocks, size_tracker, allocate_gpu_memory);
17371758
setup_index_buffers(streams, size_tracker);
17381759
}
17391760

tfhe-benchmark/src/utilities.rs

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -421,23 +421,32 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
421421
let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
422422
// Some operations with a high serial workload (e.g. division) would yield an operation
423423
// loading value so low that the number of elements in the end wouldn't be meaningful.
424-
let minimum_loading = if num_block < 64 { 0.2 } else { 0.01 };
424+
let minimum_loading = if num_block < 64 { 1.0 } else { 0.015 };
425425

426426
#[cfg(feature = "gpu")]
427427
{
428428
let num_sms_per_gpu = get_number_of_sms();
429429
let total_num_sm = num_sms_per_gpu * get_number_of_gpus();
430430

431-
let total_blocks_per_sm = 4u32; // Assume each SM can handle 4 blocks concurrently
432-
let total_num_sm = total_blocks_per_sm * total_num_sm;
431+
let total_blocks_per_sm = 4u64; // Assume each SM can handle 4 blocks concurrently
433432
let min_num_waves = 4u64; //Enforce at least 4 waves in the GPU
434-
let elements_per_wave = total_num_sm as u64 / (num_block as u64);
435-
433+
let block_factor = ((2.0f64 * num_block as f64) / 4.0f64).ceil() as u64;
434+
let elements_per_wave = total_blocks_per_sm * total_num_sm as u64 / block_factor;
435+
// We need to enable the new load for PBS benches, and for sizes larger than 16 blocks in
436+
// demanding operations; for the rest of the operations we maintain a minimum of 200
437+
// elements
438+
let min_elements = if op_pbs_count == 1
439+
|| (op_pbs_count > (num_block * num_block) as u64 && num_block >= 16)
440+
{
441+
elements_per_wave * min_num_waves
442+
} else {
443+
200u64
444+
};
436445
let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
437446
let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
438-
elements.min(elements_per_wave * min_num_waves) // This threshold is useful for operation
439-
// with both a small number of
440-
// block and low PBs count.
447+
elements.min(min_elements) // This threshold is useful for operations
448+
// with both a small number of
449+
// blocks and a low PBS count.
441450
}
442451
#[cfg(feature = "hpu")]
443452
{

0 commit comments

Comments
 (0)