Skip to content

Commit ef26e7a

Browse files
committed
fix(gpu): fix overflow sub and comparison issues
1 parent 0da79f9 commit ef26e7a

File tree

2 files changed: +164 additions, −22 deletions

backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4342,6 +4342,11 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
43424342
CudaRadixCiphertextFFI *q2; // single block
43434343
CudaRadixCiphertextFFI *q3; // single block
43444344

4345+
Torus **first_indexes_for_overflow_sub;
4346+
Torus **second_indexes_for_overflow_sub;
4347+
Torus **scalars_for_overflow_sub;
4348+
uint32_t max_indexes_to_erase;
4349+
43454350
// allocate and initialize if needed, temporary arrays used to calculate
43464351
// cuda integer div_rem_2_2 operation
43474352
void init_temporary_buffers(cudaStream_t const *streams,
@@ -4610,6 +4615,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
46104615
overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
46114616
streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
46124617
allocate_gpu_memory, size_tracker);
4618+
uint32_t group_size = overflow_sub_mem_1->group_size;
4619+
bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
4620+
->use_sequential_algorithm_to_resolve_group_carries;
4621+
create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks,
4622+
group_size, use_seq, allocate_gpu_memory,
4623+
size_tracker);
46134624
comparison_buffer_1 = new int_comparison_buffer<Torus>(
46144625
streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
46154626
num_blocks, false, allocate_gpu_memory, size_tracker);
@@ -4665,6 +4676,102 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
46654676
}
46664677
}
46674678

4679+
// Precomputes, for every possible radix block count in [1, num_blocks], the
// device-side LUT-index arrays and scalar arrays consumed by the overflowing
// subtraction used inside the div_rem_2_2 algorithm. Slot (nb - 1) of each
// member array holds the data to use when operating on nb radix blocks, so
// the hot path only has to pick a precomputed pointer instead of rebuilding
// and re-uploading indexes on every call.
//
// Parameters:
//   streams / gpu_indexes - CUDA streams and device ids; only streams[0] /
//                           gpu_indexes[0] are used (all uploads are async
//                           on that stream).
//   num_blocks            - maximum radix block count to precompute for;
//                           also recorded in max_indexes_to_erase so
//                           release() knows how many slots to free.
//   group_size            - carry-propagation grouping width used to derive
//                           each block's LUT index (presumably matches
//                           overflow_sub_mem_*->group_size — set by caller).
//   use_seq               - true when the sequential algorithm resolves
//                           group carries; changes the LUT index / scalar
//                           chosen for the last block of each non-first
//                           grouping.
//   allocate_gpu_memory   - when false, allocations/copies only account
//                           sizes in size_tracker without touching the GPU.
//   size_tracker          - accumulates the bytes of GPU memory requested.
//
// NOTE(review): the exact meaning of the index constants (e.g. the
// 2 * group_size sentinel for the single-block case) follows the LUT layout
// of int_borrow_prop_memory — confirm against that struct if it changes.
void create_indexes_for_overflow_sub(cudaStream_t const *streams,
                                     uint32_t const *gpu_indexes,
                                     uint32_t num_blocks, uint32_t group_size,
                                     bool use_seq, bool allocate_gpu_memory,
                                     uint64_t &size_tracker) {
  max_indexes_to_erase = num_blocks;

  // One device pointer per possible block count.
  first_indexes_for_overflow_sub =
      (Torus **)malloc(num_blocks * sizeof(Torus *));
  second_indexes_for_overflow_sub =
      (Torus **)malloc(num_blocks * sizeof(Torus *));
  scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *));

  // Host staging buffers, reused for every block count and freed at the end.
  // Sized for the largest case (num_blocks entries).
  Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
  Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));

  // Extra indexes for the luts in the first step.
  // Use unsigned counters: the original `int` counters were compared against
  // unsigned bounds (sign-compare hazard).
  for (uint32_t nb = 1; nb <= num_blocks; nb++) {
    first_indexes_for_overflow_sub[nb - 1] =
        (Torus *)cuda_malloc_with_size_tracking_async(
            nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
            allocate_gpu_memory);
    for (uint32_t index = 0; index < nb; index++) {
      uint32_t grouping_index = index / group_size;
      bool is_in_first_grouping = (grouping_index == 0);
      uint32_t index_in_grouping = index % group_size;
      bool is_last_index = (index == (nb - 1));
      if (is_last_index) {
        if (nb == 1) {
          h_lut_indexes[index] = 2 * group_size;
        } else {
          h_lut_indexes[index] = 2;
        }
      } else if (is_in_first_grouping) {
        h_lut_indexes[index] = index_in_grouping;
      } else {
        h_lut_indexes[index] = index_in_grouping + group_size;
      }
    }
    // Async upload on streams[0]; safe because h_lut_indexes is only
    // overwritten by the next iteration's writes on the same host thread
    // after this copy is enqueued — TODO confirm the helper synchronizes or
    // stages the host buffer before returning.
    cuda_memcpy_with_size_tracking_async_to_gpu(
        first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
        nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
  }
  // Extra indexes for the luts in the second step.
  for (uint32_t nb = 1; nb <= num_blocks; nb++) {
    second_indexes_for_overflow_sub[nb - 1] =
        (Torus *)cuda_malloc_with_size_tracking_async(
            nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
            allocate_gpu_memory);
    scalars_for_overflow_sub[nb - 1] =
        (Torus *)cuda_malloc_with_size_tracking_async(
            nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
            allocate_gpu_memory);

    for (uint32_t index = 0; index < nb; index++) {
      uint32_t grouping_index = index / group_size;
      bool is_in_first_grouping = (grouping_index == 0);
      uint32_t index_in_grouping = index % group_size;

      if (is_in_first_grouping) {
        h_lut_indexes[index] = index_in_grouping;
      } else if (index_in_grouping == (group_size - 1)) {
        // Last block of a non-first grouping: the sequential algorithm needs
        // a distinct LUT per position inside the grouping chain.
        if (use_seq) {
          uint32_t inner_index = (grouping_index - 1) % (group_size - 1);
          h_lut_indexes[index] = inner_index + 2 * group_size;
        } else {
          h_lut_indexes[index] = 2 * group_size;
        }
      } else {
        h_lut_indexes[index] = index_in_grouping + group_size;
      }

      bool may_have_its_padding_bit_set =
          !is_in_first_grouping && (index_in_grouping == group_size - 1);

      if (may_have_its_padding_bit_set) {
        if (use_seq) {
          // Shift in Torus width: the original `1 << ...` shifted a plain
          // int before widening, which overflows for shift amounts >= 31.
          h_scalar[index] = (Torus)1
                            << ((grouping_index - 1) % (group_size - 1));
        } else {
          h_scalar[index] = 1;
        }
      } else {
        h_scalar[index] = 0;
      }
    }
    cuda_memcpy_with_size_tracking_async_to_gpu(
        second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
        nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
    cuda_memcpy_with_size_tracking_async_to_gpu(
        scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
        streams[0], gpu_indexes[0], allocate_gpu_memory);
  }
  free(h_lut_indexes);
  free(h_scalar);
}
4774+
46684775
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
46694776
uint32_t gpu_count) {
46704777
// release and delete integer ops memory objects
@@ -4793,6 +4900,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
47934900
delete q1;
47944901
delete q2;
47954902
delete q3;
4903+
4904+
for (int i = 0; i < max_indexes_to_erase; i++) {
4905+
cuda_drop_with_size_tracking_async(first_indexes_for_overflow_sub[i],
4906+
streams[0], gpu_indexes[0],
4907+
gpu_memory_allocated);
4908+
cuda_drop_with_size_tracking_async(second_indexes_for_overflow_sub[i],
4909+
streams[0], gpu_indexes[0],
4910+
gpu_memory_allocated);
4911+
cuda_drop_with_size_tracking_async(scalars_for_overflow_sub[i],
4912+
streams[0], gpu_indexes[0],
4913+
gpu_memory_allocated);
4914+
}
4915+
free(first_indexes_for_overflow_sub);
4916+
free(second_indexes_for_overflow_sub);
4917+
free(scalars_for_overflow_sub);
47964918
}
47974919
};
47984920

backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh

Lines changed: 42 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -113,20 +113,32 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
113113
copy_radix_ciphertext_slice_async<Torus>(
114114
streams[0], gpu_indexes[0], mem_ptr->rem, 0, slice_len, remainder,
115115
block_index, num_blocks);
116-
uint32_t compute_borrow = 1;
116+
uint32_t compute_overflow = 1;
117117
uint32_t uses_input_borrow = 0;
118+
auto first_indexes =
119+
mem_ptr->first_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
120+
1];
121+
auto second_indexes =
122+
mem_ptr
123+
->second_indexes_for_overflow_sub[mem_ptr->rem->num_radix_blocks -
124+
1];
125+
auto scalar_indexes =
126+
mem_ptr->scalars_for_overflow_sub[mem_ptr->rem->num_radix_blocks - 1];
118127
auto sub_result_f = [&](cudaStream_t const *streams,
119128
uint32_t const *gpu_indexes, uint32_t gpu_count,
120129
CudaRadixCiphertextFFI *sub_result,
121130
CudaRadixCiphertextFFI *sub_overflowed,
122131
int_borrow_prop_memory<Torus> *overflow_sub_mem,
123132
CudaRadixCiphertextFFI *low) {
124133
sub_result->num_radix_blocks = low->num_radix_blocks;
134+
overflow_sub_mem->update_lut_indexes(streams, gpu_indexes, first_indexes,
135+
second_indexes, scalar_indexes,
136+
mem_ptr->rem->num_radix_blocks);
125137
host_integer_overflowing_sub<uint64_t>(
126138
streams, gpu_indexes, gpu_count, sub_result, mem_ptr->rem, low,
127139
sub_overflowed, (const CudaRadixCiphertextFFI *)nullptr,
128-
overflow_sub_mem, bsks, ksks, ms_noise_reduction_key, compute_borrow,
129-
uses_input_borrow);
140+
overflow_sub_mem, bsks, ksks, ms_noise_reduction_key,
141+
compute_overflow, uses_input_borrow);
130142
};
131143

132144
auto cmp_f = [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -139,25 +151,33 @@ __host__ void host_unsigned_integer_div_rem_kb_block_by_block_2_2(
139151
uint32_t slice_start = num_blocks - block_index;
140152
uint32_t slice_end = d->num_radix_blocks;
141153
as_radix_ciphertext_slice<Torus>(d_msb, d, slice_start, slice_end);
142-
host_compare_blocks_with_zero<Torus>(
143-
streams, gpu_indexes, gpu_count, comparison_blocks, d_msb,
144-
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
145-
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
146-
are_all_comparisons_block_true(
147-
streams, gpu_indexes, gpu_count, out_boolean_block, comparison_blocks,
148-
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
149-
comparison_blocks->num_radix_blocks);
150-
151-
host_negation<Torus>(
152-
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
153-
(Torus *)out_boolean_block->ptr, radix_params.big_lwe_dimension, 1);
154-
// we calculate encoding because this block works only for message_modulus
155-
// = 4 and carry_modulus = 4.
156-
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
157-
host_addition_plaintext_scalar<Torus>(
158-
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
159-
(Torus *)out_boolean_block->ptr, encoded_scalar,
160-
radix_params.big_lwe_dimension, 1);
154+
comparison_blocks->num_radix_blocks = d_msb->num_radix_blocks;
155+
if (d_msb->num_radix_blocks == 0) {
156+
cuda_memset_async((Torus *)out_boolean_block->ptr, 0,
157+
sizeof(Torus) *
158+
(out_boolean_block->lwe_dimension + 1),
159+
streams[0], gpu_indexes[0]);
160+
} else {
161+
host_compare_blocks_with_zero<Torus>(
162+
streams, gpu_indexes, gpu_count, comparison_blocks, d_msb,
163+
comparison_buffer, bsks, ksks, ms_noise_reduction_key,
164+
d_msb->num_radix_blocks, comparison_buffer->is_zero_lut);
165+
are_all_comparisons_block_true(
166+
streams, gpu_indexes, gpu_count, out_boolean_block,
167+
comparison_blocks, comparison_buffer, bsks, ksks,
168+
ms_noise_reduction_key, comparison_blocks->num_radix_blocks);
169+
170+
host_negation<Torus>(
171+
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
172+
(Torus *)out_boolean_block->ptr, radix_params.big_lwe_dimension, 1);
173+
// we calculate encoding because this block works only for
174+
// message_modulus = 4 and carry_modulus = 4.
175+
const Torus encoded_scalar = 1ULL << (sizeof(Torus) * 8 - 5);
176+
host_addition_plaintext_scalar<Torus>(
177+
streams[0], gpu_indexes[0], (Torus *)out_boolean_block->ptr,
178+
(Torus *)out_boolean_block->ptr, encoded_scalar,
179+
radix_params.big_lwe_dimension, 1);
180+
}
161181
delete d_msb;
162182
};
163183

0 commit comments

Comments
 (0)