fix(gpu): avoid out of memory when benchmarking throughput

guillermo-oyarzun · guillermo-oyarzun · commit 978011b26bb1 · 2025-09-10T11:58:31.000+02:00
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -1681,8 +1681,19 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
             new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params, 2,
                                      pbs_count, true, size_tracker);
         allocated_luts_message_carry = true;
+        uint64_t message_modulus_bits =
+            (uint64_t)std::log2(params.message_modulus);
+        uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
+        uint64_t total_bits_per_block =
+            message_modulus_bits + carry_modulus_bits;
+        uint64_t denominator =
+            (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
+                                (pow(2, message_modulus_bits) - 1));
+
+        uint64_t upper_bound_num_blocks =
+            num_blocks_in_radix * num_blocks_in_radix * 2 / denominator;
         luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
-            streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
+            streams, gpu_indexes, gpu_count, upper_bound_num_blocks,
             size_tracker, true);
       }
     }
@@ -1781,9 +1792,19 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     this->current_blocks = current_blocks;
     this->small_lwe_vector = small_lwe_vector;
     this->luts_message_carry = reused_lut;
+
+    uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
+    uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
+    uint64_t total_bits_per_block = message_modulus_bits + carry_modulus_bits;
+    uint64_t denominator =
+        (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
+                            (pow(2, message_modulus_bits) - 1));
+
+    uint64_t upper_bound_num_blocks =
+        num_blocks_in_radix * num_blocks_in_radix * 2 / denominator;
     this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
-        streams, gpu_indexes, gpu_count, this->max_total_blocks_in_vec,
-        size_tracker, allocate_gpu_memory);
+        streams, gpu_indexes, gpu_count, upper_bound_num_blocks, size_tracker,
+        allocate_gpu_memory);
     setup_index_buffers(streams, gpu_indexes, size_tracker);
   }
 
diff --git a/tfhe-benchmark/src/utilities.rs b/tfhe-benchmark/src/utilities.rs
@@ -437,12 +437,18 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
         let total_num_sm = total_blocks_per_sm * total_num_sm;
         let min_num_waves = 4u64; //Enforce at least 4 waves in the GPU
         let elements_per_wave = total_num_sm as u64 / (num_block as u64);
-
+        // Operations whose PBS count exceeds the number of blocks squared are capped at
+        // 200 elements instead of the wave-based limit.
+        let min_elements = if op_pbs_count > num_block as u64 * num_block as u64 {
+            200u64
+        } else {
+            elements_per_wave * min_num_waves
+        };
         let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
         let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
-        elements.min(elements_per_wave * min_num_waves) // This threshold is useful for operation
-                                                        // with both a small number of
-                                                        // block and low PBs count.
+        elements.min(min_elements) // This threshold is useful for operation
+                                   // with both a small number of
+                                   // blocks and a low PBS count.
     }
     #[cfg(feature = "hpu")]
     {