@@ -4342,6 +4342,11 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
   CudaRadixCiphertextFFI *q2; // single block
   CudaRadixCiphertextFFI *q3; // single block
 
+  Torus **first_indexes_for_overflow_sub;
+  Torus **second_indexes_for_overflow_sub;
+  Torus **scalars_for_overflow_sub;
+  uint32_t max_indexes_to_erase;
+
   // allocate and initialize if needed, temporary arrays used to calculate
   // cuda integer div_rem_2_2 operation
   void init_temporary_buffers(cudaStream_t const *streams,
@@ -4610,6 +4615,12 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     overflow_sub_mem_3 = new int_borrow_prop_memory<Torus>(
         streams, gpu_indexes, gpu_count, params, num_blocks, compute_overflow,
         allocate_gpu_memory, size_tracker);
+    uint32_t group_size = overflow_sub_mem_1->group_size;
+    bool use_seq = overflow_sub_mem_1->prop_simu_group_carries_mem
+                       ->use_sequential_algorithm_to_resolve_group_carries;
+    create_indexes_for_overflow_sub(streams, gpu_indexes, num_blocks,
+                                    group_size, use_seq, allocate_gpu_memory,
+                                    size_tracker);
     comparison_buffer_1 = new int_comparison_buffer<Torus>(
         streams, gpu_indexes, gpu_count, COMPARISON_TYPE::EQ, params,
         num_blocks, false, allocate_gpu_memory, size_tracker);
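Note (not part of the diff): the three index families created by `create_indexes_for_overflow_sub` are precomputed once per radix size, presumably so the LUT indexes and scalars used by the overflowing subtractions inside div_rem_2_2 do not have to be rebuilt on every call. Since the table for block count nb holds nb entries, each family stores num_blocks * (num_blocks + 1) / 2 Torus values in total on the GPU; for example, with num_blocks = 64 and assuming Torus is uint64_t, that is 64 * 65 / 2 = 2080 entries, roughly 16 KiB per family.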
@@ -4665,6 +4676,102 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     }
   }
 
+  void create_indexes_for_overflow_sub(cudaStream_t const *streams,
+                                       uint32_t const *gpu_indexes,
+                                       uint32_t num_blocks, uint32_t group_size,
+                                       bool use_seq, bool allocate_gpu_memory,
+                                       uint64_t &size_tracker) {
+    max_indexes_to_erase = num_blocks;
+
+    first_indexes_for_overflow_sub =
+        (Torus **)malloc(num_blocks * sizeof(Torus *));
+    second_indexes_for_overflow_sub =
+        (Torus **)malloc(num_blocks * sizeof(Torus *));
+    scalars_for_overflow_sub = (Torus **)malloc(num_blocks * sizeof(Torus *));
+
+    Torus *h_lut_indexes = (Torus *)malloc(num_blocks * sizeof(Torus));
+    Torus *h_scalar = (Torus *)malloc(num_blocks * sizeof(Torus));
+
+    // Extra indexes for the luts in first step
+    for (int nb = 1; nb <= num_blocks; nb++) {
+      first_indexes_for_overflow_sub[nb - 1] =
+          (Torus *)cuda_malloc_with_size_tracking_async(
+              nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
+              allocate_gpu_memory);
+      for (int index = 0; index < nb; index++) {
+        uint32_t grouping_index = index / group_size;
+        bool is_in_first_grouping = (grouping_index == 0);
+        uint32_t index_in_grouping = index % group_size;
+        bool is_last_index = (index == (nb - 1));
+        if (is_last_index) {
+          if (nb == 1) {
+            h_lut_indexes[index] = 2 * group_size;
+          } else {
+            h_lut_indexes[index] = 2;
+          }
+        } else if (is_in_first_grouping) {
+          h_lut_indexes[index] = index_in_grouping;
+        } else {
+          h_lut_indexes[index] = index_in_grouping + group_size;
+        }
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          first_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
+          nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
+    }
+    // Extra indexes for the luts in second step
+    for (int nb = 1; nb <= num_blocks; nb++) {
+      second_indexes_for_overflow_sub[nb - 1] =
+          (Torus *)cuda_malloc_with_size_tracking_async(
+              nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
+              allocate_gpu_memory);
+      scalars_for_overflow_sub[nb - 1] =
+          (Torus *)cuda_malloc_with_size_tracking_async(
+              nb * sizeof(Torus), streams[0], gpu_indexes[0], size_tracker,
+              allocate_gpu_memory);
+
+      for (int index = 0; index < nb; index++) {
+        uint32_t grouping_index = index / group_size;
+        bool is_in_first_grouping = (grouping_index == 0);
+        uint32_t index_in_grouping = index % group_size;
+
+        if (is_in_first_grouping) {
+          h_lut_indexes[index] = index_in_grouping;
+        } else if (index_in_grouping == (group_size - 1)) {
+          if (use_seq) {
+            int inner_index = (grouping_index - 1) % (group_size - 1);
+            h_lut_indexes[index] = inner_index + 2 * group_size;
+          } else {
+            h_lut_indexes[index] = 2 * group_size;
+          }
+        } else {
+          h_lut_indexes[index] = index_in_grouping + group_size;
+        }
+
+        bool may_have_its_padding_bit_set =
+            !is_in_first_grouping && (index_in_grouping == group_size - 1);
+
+        if (may_have_its_padding_bit_set) {
+          if (use_seq) {
+            h_scalar[index] = 1 << ((grouping_index - 1) % (group_size - 1));
+          } else {
+            h_scalar[index] = 1;
+          }
+        } else {
+          h_scalar[index] = 0;
+        }
+      }
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          second_indexes_for_overflow_sub[nb - 1], h_lut_indexes,
+          nb * sizeof(Torus), streams[0], gpu_indexes[0], allocate_gpu_memory);
+      cuda_memcpy_with_size_tracking_async_to_gpu(
+          scalars_for_overflow_sub[nb - 1], h_scalar, nb * sizeof(Torus),
+          streams[0], gpu_indexes[0], allocate_gpu_memory);
+    }
+    free(h_lut_indexes);
+    free(h_scalar);
+  };
+
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
     // release and delete integer ops memory objects
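Note (not part of the diff): the sketch below is a standalone host-side illustration of the first-step index rule in create_indexes_for_overflow_sub above, so the resulting LUT-index layout can be inspected without a GPU. It uses uint64_t as a stand-in for Torus, mirrors only the first loop (no CUDA allocations or copies), and the values num_blocks = 4 and group_size = 2 are illustrative only.

// first_step_indexes.cpp -- host-only sketch of the first-step LUT index rule
#include <cstdint>
#include <cstdio>
#include <vector>

// Reproduces the index computation of the first loop for one block count `nb`.
static std::vector<uint64_t> first_step_indexes(uint32_t nb,
                                                uint32_t group_size) {
  std::vector<uint64_t> lut_indexes(nb);
  for (uint32_t index = 0; index < nb; index++) {
    uint32_t grouping_index = index / group_size;
    bool is_in_first_grouping = (grouping_index == 0);
    uint32_t index_in_grouping = index % group_size;
    bool is_last_index = (index == nb - 1);
    if (is_last_index) {
      // The last block uses the dedicated overflow LUT: index 2 when several
      // blocks are present, 2 * group_size in the single-block case.
      lut_indexes[index] = (nb == 1) ? 2 * group_size : 2;
    } else if (is_in_first_grouping) {
      lut_indexes[index] = index_in_grouping;
    } else {
      lut_indexes[index] = index_in_grouping + group_size;
    }
  }
  return lut_indexes;
}

int main() {
  const uint32_t group_size = 2; // illustrative value
  for (uint32_t nb = 1; nb <= 4; nb++) {
    printf("nb = %u:", nb);
    for (uint64_t v : first_step_indexes(nb, group_size))
      printf(" %llu", (unsigned long long)v);
    printf("\n");
  }
  return 0;
}

With these illustrative values the tables come out as nb = 1: 4; nb = 2: 0 2; nb = 3: 0 1 2; nb = 4: 0 1 2 2, which is exactly what the diff uploads into first_indexes_for_overflow_sub[nb - 1].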
@@ -4793,6 +4900,21 @@ template <typename Torus> struct unsigned_int_div_rem_2_2_memory {
     delete q1;
     delete q2;
     delete q3;
+
+    for (int i = 0; i < max_indexes_to_erase; i++) {
+      cuda_drop_with_size_tracking_async(first_indexes_for_overflow_sub[i],
+                                         streams[0], gpu_indexes[0],
+                                         gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(second_indexes_for_overflow_sub[i],
+                                         streams[0], gpu_indexes[0],
+                                         gpu_memory_allocated);
+      cuda_drop_with_size_tracking_async(scalars_for_overflow_sub[i],
+                                         streams[0], gpu_indexes[0],
+                                         gpu_memory_allocated);
+    }
+    free(first_indexes_for_overflow_sub);
+    free(second_indexes_for_overflow_sub);
+    free(scalars_for_overflow_sub);
   }
 };
 