@@ -244,8 +244,6 @@ struct int_radix_params {
244244 uint32_t carry_modulus;
245245 PBS_MS_REDUCTION_T noise_reduction_type;
246246
247- int_radix_params (){};
248-
249247 int_radix_params (PBS_TYPE pbs_type, uint32_t glwe_dimension,
250248 uint32_t polynomial_size, uint32_t big_lwe_dimension,
251249 uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -262,6 +260,8 @@ struct int_radix_params {
262260 message_modulus(message_modulus), carry_modulus(carry_modulus),
263261 noise_reduction_type(noise_reduction_type){};
264262
263+ int_radix_params () = default;
264+
265265 void print () {
266266 printf (" pbs_type: %u, glwe_dimension: %u, "
267267 " polynomial_size: %u, "
@@ -2404,8 +2404,7 @@ template <typename Torus> struct int_sc_prop_memory {
24042404
24052405 int_sc_prop_memory (CudaStreams streams, int_radix_params params,
24062406 uint32_t num_radix_blocks, uint32_t requested_flag_in,
2407- uint32_t uses_carry, bool allocate_gpu_memory,
2408- uint64_t &size_tracker) {
2407+ bool allocate_gpu_memory, uint64_t &size_tracker) {
24092408 gpu_memory_allocated = allocate_gpu_memory;
24102409 this ->params = params;
24112410 auto glwe_dimension = params.glwe_dimension ;
@@ -3127,11 +3126,10 @@ template <typename Torus> struct int_mul_memory {
31273126 streams, params, num_radix_blocks, 2 * num_radix_blocks,
31283127 vector_result_sb, small_lwe_vector, luts_array, true ,
31293128 allocate_gpu_memory, size_tracker);
3130- uint32_t uses_carry = 0 ;
31313129 uint32_t requested_flag = outputFlag::FLAG_NONE;
31323130 sc_prop_mem = new int_sc_prop_memory<Torus>(
3133- streams, params, num_radix_blocks, requested_flag, uses_carry ,
3134- allocate_gpu_memory, size_tracker);
3131+ streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory ,
3132+ size_tracker);
31353133 }
31363134
31373135 void release (CudaStreams streams) {
@@ -3731,36 +3729,56 @@ template <typename Torus> struct int_comparison_eq_buffer {
37313729 gpu_memory_allocated = allocate_gpu_memory;
37323730 this ->params = params;
37333731 this ->op = op;
3732+ Torus total_modulus = params.message_modulus * params.carry_modulus ;
37343733
37353734 are_all_block_true_buffer = new int_are_all_block_true_buffer<Torus>(
37363735 streams, op, params, num_radix_blocks, allocate_gpu_memory,
37373736 size_tracker);
3737+ auto active_streams = streams.active_gpu_subset (num_radix_blocks);
37383738
3739- // Operator LUT
3740- auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
3741- if (op == COMPARISON_TYPE::EQ) {
3742- // EQ
3743- return (lhs == rhs);
3744- } else {
3745- // NE
3746- return (lhs != rhs);
3739+ if (op == COMPARISON_TYPE::EQ || COMPARISON_TYPE::NE) {
3740+ // Operator LUT
3741+ auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
3742+ if (op == COMPARISON_TYPE::EQ) {
3743+ return (lhs == rhs);
3744+ } else if (op == COMPARISON_TYPE::NE) {
3745+ return (lhs != rhs);
3746+ PANIC (" Cuda error (eq/ne): invalid comparison type" )
3747+ }
3748+ };
3749+ // Scalar may have up to num_radix_blocks blocks
3750+ scalar_comparison_luts = new int_radix_lut<Torus>(
3751+ streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
3752+ size_tracker);
3753+
3754+ for (int i = 0 ; i < total_modulus; i++) {
3755+ auto lut_f = [i, operator_f](Torus x) -> Torus {
3756+ return operator_f (i, x);
3757+ };
3758+
3759+ generate_device_accumulator<Torus>(
3760+ streams.stream (0 ), streams.gpu_index (0 ),
3761+ scalar_comparison_luts->get_lut (0 , i),
3762+ scalar_comparison_luts->get_degree (i),
3763+ scalar_comparison_luts->get_max_degree (i), params.glwe_dimension ,
3764+ params.polynomial_size , params.message_modulus ,
3765+ params.carry_modulus , lut_f, gpu_memory_allocated);
37473766 }
3748- } ;
3749- operator_lut =
3750- new int_radix_lut<Torus>(streams, params, 1 , num_radix_blocks,
3751- allocate_gpu_memory, size_tracker);
3767+ scalar_comparison_luts-> broadcast_lut (active_streams) ;
3768+ operator_lut =
3769+ new int_radix_lut<Torus>(streams, params, 1 , num_radix_blocks,
3770+ allocate_gpu_memory, size_tracker);
37523771
3753- generate_device_accumulator_bivariate<Torus>(
3754- streams.stream (0 ), streams.gpu_index (0 ), operator_lut->get_lut (0 , 0 ),
3755- operator_lut->get_degree (0 ), operator_lut->get_max_degree (0 ),
3756- params.glwe_dimension , params.polynomial_size , params.message_modulus ,
3757- params.carry_modulus , operator_f, gpu_memory_allocated);
3772+ generate_device_accumulator_bivariate<Torus>(
3773+ streams.stream (0 ), streams.gpu_index (0 ), operator_lut->get_lut (0 , 0 ),
3774+ operator_lut->get_degree (0 ), operator_lut->get_max_degree (0 ),
3775+ params.glwe_dimension , params.polynomial_size , params.message_modulus ,
3776+ params.carry_modulus , operator_f, gpu_memory_allocated);
37583777
3759- auto active_streams = streams. active_gpu_subset (num_radix_blocks );
3760- operator_lut-> broadcast_lut (active_streams);
3778+ operator_lut-> broadcast_lut (active_streams );
3779+ }
37613780
37623781 // f(x) -> x == 0
3763- Torus total_modulus = params.message_modulus * params.carry_modulus ;
37643782 auto is_non_zero_lut_f = [total_modulus](Torus x) -> Torus {
37653783 return (x % total_modulus) != 0 ;
37663784 };
@@ -3776,35 +3794,17 @@ template <typename Torus> struct int_comparison_eq_buffer {
37763794 params.carry_modulus , is_non_zero_lut_f, gpu_memory_allocated);
37773795
37783796 is_non_zero_lut->broadcast_lut (active_streams);
3779-
3780- // Scalar may have up to num_radix_blocks blocks
3781- scalar_comparison_luts = new int_radix_lut<Torus>(
3782- streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
3783- size_tracker);
3784-
3785- for (int i = 0 ; i < total_modulus; i++) {
3786- auto lut_f = [i, operator_f](Torus x) -> Torus {
3787- return operator_f (i, x);
3788- };
3789-
3790- generate_device_accumulator<Torus>(
3791- streams.stream (0 ), streams.gpu_index (0 ),
3792- scalar_comparison_luts->get_lut (0 , i),
3793- scalar_comparison_luts->get_degree (i),
3794- scalar_comparison_luts->get_max_degree (i), params.glwe_dimension ,
3795- params.polynomial_size , params.message_modulus , params.carry_modulus ,
3796- lut_f, gpu_memory_allocated);
3797- }
3798- scalar_comparison_luts->broadcast_lut (active_streams);
37993797 }
38003798
38013799 void release (CudaStreams streams) {
3802- operator_lut->release (streams);
3803- delete operator_lut;
3800+ if (op == COMPARISON_TYPE::EQ || COMPARISON_TYPE::NE) {
3801+ operator_lut->release (streams);
3802+ delete operator_lut;
3803+ scalar_comparison_luts->release (streams);
3804+ delete scalar_comparison_luts;
3805+ }
38043806 is_non_zero_lut->release (streams);
38053807 delete is_non_zero_lut;
3806- scalar_comparison_luts->release (streams);
3807- delete scalar_comparison_luts;
38083808 are_all_block_true_buffer->release (streams);
38093809 delete are_all_block_true_buffer;
38103810 }
@@ -3926,8 +3926,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
39263926 case LE:
39273927 return (x == IS_INFERIOR) || (x == IS_EQUAL);
39283928 default :
3929- // We don't need a default case but we need to return something
3930- return 42 ;
3929+ PANIC (" Cuda error (comparisons): unknown comparison type" )
39313930 }
39323931 };
39333932
@@ -4922,11 +4921,10 @@ template <typename Torus> struct int_scalar_mul_buffer {
49224921 streams, params, num_radix_blocks, num_ciphertext_bits, true ,
49234922 allocate_gpu_memory, last_step_mem);
49244923 }
4925- uint32_t uses_carry = 0 ;
49264924 uint32_t requested_flag = outputFlag::FLAG_NONE;
49274925 sc_prop_mem = new int_sc_prop_memory<Torus>(
4928- streams, params, num_radix_blocks, requested_flag, uses_carry ,
4929- allocate_gpu_memory, last_step_mem);
4926+ streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory ,
4927+ last_step_mem);
49304928 if (anticipated_buffer_drop) {
49314929 size_tracker += std::max (anticipated_drop_mem, last_step_mem);
49324930 } else {
@@ -4982,10 +4980,9 @@ template <typename Torus> struct int_abs_buffer {
49824980 streams, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks,
49834981 allocate_gpu_memory, size_tracker);
49844982 uint32_t requested_flag = outputFlag::FLAG_NONE;
4985- uint32_t uses_carry = 0 ;
49864983 scp_mem = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
4987- requested_flag, uses_carry ,
4988- allocate_gpu_memory, size_tracker);
4984+ requested_flag, allocate_gpu_memory ,
4985+ size_tracker);
49894986 bitxor_mem = new int_bitop_buffer<Torus>(streams, BITOP_TYPE::BITXOR,
49904987 params, num_radix_blocks,
49914988 allocate_gpu_memory, size_tracker);
@@ -5061,13 +5058,12 @@ template <typename Torus> struct int_div_rem_memory {
50615058 abs_mem_2 = new int_abs_buffer<Torus>(streams, params, num_blocks,
50625059 allocate_gpu_memory, size_tracker);
50635060 uint32_t requested_flag = outputFlag::FLAG_NONE;
5064- uint32_t uses_carry = 0 ;
50655061 scp_mem_1 = new int_sc_prop_memory<Torus>(
5066- streams, params, num_blocks, requested_flag, uses_carry ,
5067- allocate_gpu_memory, size_tracker);
5062+ streams, params, num_blocks, requested_flag, allocate_gpu_memory ,
5063+ size_tracker);
50685064 scp_mem_2 = new int_sc_prop_memory<Torus>(
5069- streams, params, num_blocks, requested_flag, uses_carry ,
5070- allocate_gpu_memory, size_tracker);
5065+ streams, params, num_blocks, requested_flag, allocate_gpu_memory ,
5066+ size_tracker);
50715067
50725068 std::function<uint64_t (uint64_t )> quotient_predicate_lut_f =
50735069 [](uint64_t x) -> uint64_t { return x == 1 ; };
@@ -5251,7 +5247,7 @@ template <typename Torus> struct int_sub_and_propagate {
52515247 this ->allocate_gpu_memory = allocate_gpu_memory;
52525248
52535249 this ->sc_prop_mem = new int_sc_prop_memory<Torus>(
5254- streams, params, num_radix_blocks, requested_flag_in, ( uint32_t ) 0 ,
5250+ streams, params, num_radix_blocks, requested_flag_in,
52555251 allocate_gpu_memory, size_tracker);
52565252
52575253 this ->neg_rhs_array = new CudaRadixCiphertextFFI;
@@ -5391,8 +5387,8 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
53915387 streams, params, num_radix_blocks, scalar_divisor_ffi->active_bits ,
53925388 allocate_gpu_memory, size_tracker);
53935389 scp_mem = new int_sc_prop_memory<Torus>(
5394- streams, params, num_radix_blocks, FLAG_NONE, ( uint32_t ) 0 ,
5395- allocate_gpu_memory, size_tracker);
5390+ streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory ,
5391+ size_tracker);
53965392 sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
53975393 streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
53985394 size_tracker);
@@ -5545,8 +5541,8 @@ template <typename Torus> struct int_signed_scalar_div_mem {
55455541 streams, RIGHT_SHIFT, params, num_radix_blocks,
55465542 allocate_gpu_memory, size_tracker);
55475543 scp_mem = new int_sc_prop_memory<Torus>(
5548- streams, params, num_radix_blocks, FLAG_NONE, ( uint32_t ) 0 ,
5549- allocate_gpu_memory, size_tracker);
5544+ streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory ,
5545+ size_tracker);
55505546
55515547 } else {
55525548
@@ -5567,7 +5563,7 @@ template <typename Torus> struct int_signed_scalar_div_mem {
55675563
55685564 if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator ) {
55695565 scp_mem = new int_sc_prop_memory<Torus>(
5570- streams, params, num_radix_blocks, FLAG_NONE, ( uint32_t ) 0 ,
5566+ streams, params, num_radix_blocks, FLAG_NONE,
55715567 allocate_gpu_memory, size_tracker);
55725568 }
55735569 }
@@ -5711,8 +5707,8 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
57115707 allocate_gpu_memory, size_tracker);
57125708
57135709 this ->scp_mem = new int_sc_prop_memory<Torus>(
5714- streams, params, num_radix_blocks, FLAG_NONE, ( uint32_t ) 0 ,
5715- allocate_gpu_memory, size_tracker);
5710+ streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory ,
5711+ size_tracker);
57165712
57175713 bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
57185714 !scalar_divisor_ffi->is_divisor_negative ;
@@ -5907,7 +5903,7 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
59075903 allocate_gpu_memory, size_tracker);
59085904
59095905 this ->propagate_mem =
5910- new int_sc_prop_memory<Torus>(streams, params, counter_num_blocks, 0 , 0 ,
5906+ new int_sc_prop_memory<Torus>(streams, params, counter_num_blocks, 0 ,
59115907 allocate_gpu_memory, size_tracker);
59125908 }
59135909
0 commit comments