diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index eac7ac8e1c..8e3a653edf 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -333,8 +333,8 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
     uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
     uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
     uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
 
 uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
     CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
@@ -342,8 +342,8 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
     uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
     uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
     uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type);
 
 void cuda_propagate_single_carry_kb_64_inplace(
     CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 689d42ebe3..736aff79be 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -244,8 +244,6 @@ struct int_radix_params {
   uint32_t carry_modulus;
   PBS_MS_REDUCTION_T noise_reduction_type;
 
-  int_radix_params(){};
-
   int_radix_params(PBS_TYPE pbs_type, uint32_t glwe_dimension,
                    uint32_t polynomial_size, uint32_t big_lwe_dimension,
                    uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -262,6 +260,8 @@ struct int_radix_params {
         message_modulus(message_modulus), carry_modulus(carry_modulus),
         noise_reduction_type(noise_reduction_type){};
 
+  int_radix_params() = default;
+
   void print() {
     printf("pbs_type: %u, glwe_dimension: %u, "
            "polynomial_size: %u, "
@@ -2404,8 +2404,7 @@ template <typename Torus> struct int_sc_prop_memory {
 
   int_sc_prop_memory(CudaStreams streams, int_radix_params params,
                      uint32_t num_radix_blocks, uint32_t requested_flag_in,
-                     uint32_t uses_carry, bool allocate_gpu_memory,
-                     uint64_t &size_tracker) {
+                     bool allocate_gpu_memory, uint64_t &size_tracker) {
     gpu_memory_allocated = allocate_gpu_memory;
     this->params = params;
     auto glwe_dimension = params.glwe_dimension;
@@ -3127,11 +3126,10 @@ template <typename Torus> struct int_mul_memory {
         streams, params, num_radix_blocks, 2 * num_radix_blocks,
         vector_result_sb, small_lwe_vector, luts_array, true,
         allocate_gpu_memory, size_tracker);
-    uint32_t uses_carry = 0;
     uint32_t requested_flag = outputFlag::FLAG_NONE;
     sc_prop_mem = new int_sc_prop_memory<Torus>(
-        streams, params, num_radix_blocks, requested_flag, uses_carry,
-        allocate_gpu_memory, size_tracker);
+        streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
+        size_tracker);
   }
 
   void release(CudaStreams streams) {
@@ -3731,36 +3729,13 @@ template <typename Torus> struct int_comparison_eq_buffer {
     gpu_memory_allocated = allocate_gpu_memory;
     this->params = params;
     this->op = op;
+    Torus total_modulus = params.message_modulus * params.carry_modulus;
 
     are_all_block_true_buffer = new int_are_all_block_true_buffer<Torus>(
         streams, op, params, num_radix_blocks, allocate_gpu_memory,
         size_tracker);
 
-    // Operator LUT
-    auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
-      if (op == COMPARISON_TYPE::EQ) {
-        // EQ
-        return (lhs == rhs);
-      } else {
-        // NE
-        return (lhs != rhs);
-      }
-    };
-    operator_lut =
-        new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
-                                 allocate_gpu_memory, size_tracker);
-
-    generate_device_accumulator_bivariate<Torus>(
-        streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
-        operator_lut->get_degree(0), operator_lut->get_max_degree(0),
-        params.glwe_dimension, params.polynomial_size, params.message_modulus,
-        params.carry_modulus, operator_f, gpu_memory_allocated);
-
-    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
-    operator_lut->broadcast_lut(active_streams);
-
     // f(x) -> x == 0
-    Torus total_modulus = params.message_modulus * params.carry_modulus;
     auto is_non_zero_lut_f = [total_modulus](Torus x) -> Torus {
       return (x % total_modulus) != 0;
     };
@@ -3775,38 +3750,74 @@ template <typename Torus> struct int_comparison_eq_buffer {
         params.glwe_dimension, params.polynomial_size, params.message_modulus,
         params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
 
+    auto active_streams = streams.active_gpu_subset(num_radix_blocks);
     is_non_zero_lut->broadcast_lut(active_streams);
 
-    // Scalar may have up to num_radix_blocks blocks
-    scalar_comparison_luts = new int_radix_lut<Torus>(
-        streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
-        size_tracker);
-
-    for (int i = 0; i < total_modulus; i++) {
-      auto lut_f = [i, operator_f](Torus x) -> Torus {
-        return operator_f(i, x);
+    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
+      // Operator LUT
+      auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
+        if (op == COMPARISON_TYPE::EQ) {
+          return (lhs == rhs);
+        } else if (op == COMPARISON_TYPE::NE) {
+          return (lhs != rhs);
+        }
+        PANIC("Cuda error (eq/ne): invalid comparison type")
       };
+      // Scalar may have up to num_radix_blocks blocks
+      scalar_comparison_luts = new int_radix_lut<Torus>(
+          streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
+          size_tracker);
 
-      generate_device_accumulator<Torus>(
-          streams.stream(0), streams.gpu_index(0),
-          scalar_comparison_luts->get_lut(0, i),
-          scalar_comparison_luts->get_degree(i),
-          scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus, params.carry_modulus,
-          lut_f, gpu_memory_allocated);
+      for (int i = 0; i < total_modulus; i++) {
+        auto lut_f = [i, operator_f](Torus x) -> Torus {
+          return operator_f(i, x);
+        };
+
+        generate_device_accumulator<Torus>(
+            streams.stream(0), streams.gpu_index(0),
+            scalar_comparison_luts->get_lut(0, i),
+            scalar_comparison_luts->get_degree(i),
+            scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
+            params.polynomial_size, params.message_modulus,
+            params.carry_modulus, lut_f, gpu_memory_allocated);
+      }
+      scalar_comparison_luts->broadcast_lut(active_streams);
+      operator_lut =
+          new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+                                   allocate_gpu_memory, size_tracker);
+
+      generate_device_accumulator_bivariate<Torus>(
+          streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
+          operator_lut->get_degree(0), operator_lut->get_max_degree(0),
+          params.glwe_dimension, params.polynomial_size, params.message_modulus,
+          params.carry_modulus, operator_f, gpu_memory_allocated);
+
+      operator_lut->broadcast_lut(active_streams);
+    } else {
+      scalar_comparison_luts = nullptr;
+      operator_lut = nullptr;
     }
-    scalar_comparison_luts->broadcast_lut(active_streams);
   }
 
   void release(CudaStreams streams) {
-    operator_lut->release(streams);
-    delete operator_lut;
+    if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
+      PANIC_IF_FALSE(operator_lut != nullptr,
+                     "Cuda error: no operator lut was created");
+      operator_lut->release(streams);
+      delete operator_lut;
+      operator_lut = nullptr;
+      PANIC_IF_FALSE(scalar_comparison_luts != nullptr,
+                     "Cuda error: no scalar comparison luts were created");
+      scalar_comparison_luts->release(streams);
+      delete scalar_comparison_luts;
+      scalar_comparison_luts = nullptr;
+    }
     is_non_zero_lut->release(streams);
     delete is_non_zero_lut;
-    scalar_comparison_luts->release(streams);
-    delete scalar_comparison_luts;
+    is_non_zero_lut = nullptr;
     are_all_block_true_buffer->release(streams);
     delete are_all_block_true_buffer;
+    are_all_block_true_buffer = nullptr;
   }
 };
 
@@ -3926,8 +3937,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
       case LE:
         return (x == IS_INFERIOR) || (x == IS_EQUAL);
       default:
-        // We don't need a default case but we need to return something
-        return 42;
+        PANIC("Cuda error (comparisons): unknown comparison type")
       }
     };
 
@@ -4922,11 +4932,10 @@ template <typename Torus> struct int_scalar_mul_buffer {
           streams, params, num_radix_blocks, num_ciphertext_bits, true,
           allocate_gpu_memory, last_step_mem);
     }
-    uint32_t uses_carry = 0;
     uint32_t requested_flag = outputFlag::FLAG_NONE;
     sc_prop_mem = new int_sc_prop_memory<Torus>(
-        streams, params, num_radix_blocks, requested_flag, uses_carry,
-        allocate_gpu_memory, last_step_mem);
+        streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
+        last_step_mem);
     if (anticipated_buffer_drop) {
       size_tracker += std::max(anticipated_drop_mem, last_step_mem);
     } else {
@@ -4982,10 +4991,9 @@ template <typename Torus> struct int_abs_buffer {
         streams, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks,
         allocate_gpu_memory, size_tracker);
     uint32_t requested_flag = outputFlag::FLAG_NONE;
-    uint32_t uses_carry = 0;
     scp_mem = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
-                                            requested_flag, uses_carry,
-                                            allocate_gpu_memory, size_tracker);
+                                            requested_flag, allocate_gpu_memory,
+                                            size_tracker);
     bitxor_mem = new int_bitop_buffer<Torus>(streams, BITOP_TYPE::BITXOR, params,
                                              num_radix_blocks, allocate_gpu_memory,
                                              size_tracker);
@@ -5061,13 +5069,12 @@ template <typename Torus> struct int_div_rem_memory {
     abs_mem_2 = new int_abs_buffer<Torus>(streams, params, num_blocks,
                                           allocate_gpu_memory, size_tracker);
     uint32_t requested_flag = outputFlag::FLAG_NONE;
-    uint32_t uses_carry = 0;
     scp_mem_1 = new int_sc_prop_memory<Torus>(
-        streams, params, num_blocks, requested_flag, uses_carry,
-        allocate_gpu_memory, size_tracker);
+        streams, params, num_blocks, requested_flag, allocate_gpu_memory,
+        size_tracker);
     scp_mem_2 = new int_sc_prop_memory<Torus>(
-        streams, params, num_blocks, requested_flag, uses_carry,
-        allocate_gpu_memory, size_tracker);
+        streams, params, num_blocks, requested_flag, allocate_gpu_memory,
+        size_tracker);
 
     std::function<uint64_t(uint64_t)> quotient_predicate_lut_f =
         [](uint64_t x) -> uint64_t { return x == 1; };
@@ -5251,7 +5258,7 @@ template <typename Torus> struct int_sub_and_propagate {
     this->allocate_gpu_memory = allocate_gpu_memory;
 
     this->sc_prop_mem = new int_sc_prop_memory<Torus>(
-        streams, params, num_radix_blocks, requested_flag_in, (uint32_t)0,
+        streams, params, num_radix_blocks, requested_flag_in,
         allocate_gpu_memory, size_tracker);
 
     this->neg_rhs_array = new CudaRadixCiphertextFFI;
@@ -5391,8 +5398,8 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
           streams, params, num_radix_blocks, scalar_divisor_ffi->active_bits,
           allocate_gpu_memory, size_tracker);
       scp_mem = new int_sc_prop_memory<Torus>(
-          streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
-          allocate_gpu_memory, size_tracker);
+          streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
+          size_tracker);
       sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
           streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
           size_tracker);
@@ -5545,8 +5552,8 @@ template <typename Torus> struct int_signed_scalar_div_mem {
           streams, RIGHT_SHIFT, params, num_radix_blocks, allocate_gpu_memory,
           size_tracker);
       scp_mem = new int_sc_prop_memory<Torus>(
-          streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
-          allocate_gpu_memory, size_tracker);
+          streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
+          size_tracker);
 
     } else {
 
@@ -5567,7 +5574,7 @@ template <typename Torus> struct int_signed_scalar_div_mem {
 
       if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
         scp_mem = new int_sc_prop_memory<Torus>(
-            streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
+            streams, params, num_radix_blocks, FLAG_NONE,
             allocate_gpu_memory, size_tracker);
       }
     }
@@ -5711,8 +5718,8 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
         allocate_gpu_memory, size_tracker);
 
     this->scp_mem = new int_sc_prop_memory<Torus>(
-        streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
-        allocate_gpu_memory, size_tracker);
+        streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
+        size_tracker);
 
     bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
                           !scalar_divisor_ffi->is_divisor_negative;
@@ -5906,9 +5913,9 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
         streams, params, counter_num_blocks, num_radix_blocks, true,
         allocate_gpu_memory, size_tracker);
 
-    this->propagate_mem =
-        new int_sc_prop_memory<Torus>(streams, params, counter_num_blocks, 0, 0,
-                                      allocate_gpu_memory, size_tracker);
+    this->propagate_mem = new int_sc_prop_memory<Torus>(
+        streams, params, counter_num_blocks, FLAG_NONE, allocate_gpu_memory,
+        size_tracker);
   }
 
   void release(CudaStreams streams) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
index eb3a4d70fa..5127a6e8c8 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -430,7 +430,6 @@ __host__ void tree_sign_reduction(
       "than the number of blocks to operate on")
 
   auto params = tree_buffer->params;
-  auto big_lwe_dimension = params.big_lwe_dimension;
   auto glwe_dimension = params.glwe_dimension;
   auto polynomial_size = params.polynomial_size;
   auto message_modulus = params.message_modulus;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
index ff9a30c1ad..3384688321 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -51,8 +51,8 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
     uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
     uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
     uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -60,7 +60,7 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
 
   return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
      CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
-      num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
+      num_blocks, params, requested_flag, allocate_gpu_memory);
 }
 
 uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -69,8 +69,8 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
     uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
     uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
     uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
-    PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
-    bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+    PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+    PBS_MS_REDUCTION_T noise_reduction_type) {
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           big_lwe_dimension, small_lwe_dimension, ks_level,
                           ks_base_log, pbs_level, pbs_base_log, grouping_factor,
@@ -78,7 +78,7 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
 
   return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
       CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
-      num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
+      num_blocks, params, requested_flag, allocate_gpu_memory);
 }
 
 uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 34b144680e..4944de550d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -242,8 +242,8 @@ __host__ void host_radix_cumulative_sum_in_groups(cudaStream_t stream,
   auto lwe_size = dest->lwe_dimension + 1;
   cuda_set_device(gpu_index);
   // Each CUDA block is responsible for a single group
-  int num_blocks = (num_radix_blocks + group_size - 1) / group_size,
-      num_threads = 512;
+  int num_blocks = CEIL_DIV(num_radix_blocks, group_size);
+  int num_threads = 512;
   device_radix_cumulative_sum_in_groups<Torus>
       <<<num_blocks, num_threads, 0, stream>>>(
           (Torus *)dest->ptr, (Torus *)src->ptr, num_radix_blocks, lwe_size,
@@ -1566,9 +1566,6 @@ void host_full_propagate_inplace(
     void *const *bsks, uint32_t num_blocks) {
   auto params = mem_ptr->lut->params;
 
-  int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
-  int small_lwe_size = (params.small_lwe_dimension + 1);
-
   // In the case of extracting a single LWE this parameters are dummy
   uint32_t num_many_lut = 1;
   uint32_t lut_stride = 0;
@@ -1969,12 +1966,12 @@ template <typename Torus>
 uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
     CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
     uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
-    uint32_t uses_carry, bool allocate_gpu_memory) {
+    bool allocate_gpu_memory) {
   PUSH_RANGE("scratch add & propagate sc")
   uint64_t size_tracker = 0;
   *mem_ptr = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
-                                           requested_flag, uses_carry,
-                                           allocate_gpu_memory, size_tracker);
+                                           requested_flag, allocate_gpu_memory,
+                                           size_tracker);
   POP_RANGE()
   return size_tracker;
 }
@@ -2116,9 +2113,6 @@ void host_add_and_propagate_single_carry(
   auto num_radix_blocks = lhs_array->num_radix_blocks;
   auto params = mem->params;
 
-  auto glwe_dimension = params.glwe_dimension;
-  auto polynomial_size = params.polynomial_size;
-  uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
   auto lut_stride = mem->lut_stride;
   auto num_many_lut = mem->num_many_lut;
   CudaRadixCiphertextFFI output_flag;
@@ -2390,7 +2384,6 @@ __host__ void integer_radix_apply_noise_squashing_kb(
   PUSH_RANGE("apply noise squashing")
 
   auto params = lut->params;
-  auto pbs_type = params.pbs_type;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto small_lwe_dimension = params.small_lwe_dimension;
   auto ks_level = params.ks_level;
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 91660f4db2..f97368f1f3 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -808,7 +808,6 @@ unsafe extern "C" {
         carry_modulus: u32,
         pbs_type: PBS_TYPE,
         requested_flag: u32,
-        uses_carry: u32,
         allocate_gpu_memory: bool,
         noise_reduction_type: PBS_MS_REDUCTION_T,
     ) -> u64;
@@ -831,7 +830,6 @@ unsafe extern "C" {
         carry_modulus: u32,
         pbs_type: PBS_TYPE,
         requested_flag: u32,
-        uses_carry: u32,
         allocate_gpu_memory: bool,
         noise_reduction_type: PBS_MS_REDUCTION_T,
     ) -> u64;
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 80380b8104..1ce53781a2 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -2323,7 +2323,6 @@ pub(crate) unsafe fn propagate_single_carry_assign_async
     ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) -> u64 {
     let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);
@@ -2385,7 +2383,6 @@ pub(crate) fn get_propagate_single_carry_assign_async_size_on_gpu(
         carry_modulus.0 as u32,
         pbs_type as u32,
         requested_flag as u32,
-        uses_carry,
         false,
         noise_reduction_type as u32,
     )
@@ -2412,7 +2409,6 @@ pub(crate) fn get_add_and_propagate_single_carry_assign_async_size_on_gpu(
     pbs_type: PBSType,
     grouping_factor: LweBskGroupingFactor,
     requested_flag: OutputFlag,
-    uses_carry: u32,
     ms_noise_reduction_configuration: Option<&CudaModulusSwitchNoiseReductionConfiguration>,
 ) -> u64 {
     let noise_reduction_type = resolve_noise_reduction_type(ms_noise_reduction_configuration);
@@ -2437,7 +2433,6 @@ pub(crate) fn get_add_and_propagate_single_carry_assign_async_size_on_gpu(
         carry_modulus.0 as u32,
         pbs_type as u32,
         requested_flag as u32,
-        uses_carry,
         false,
         noise_reduction_type as u32,
     )
@@ -2759,7 +2754,6 @@ pub(crate) unsafe fn add_and_propagate_single_carry_assign_async
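
Caller-side sketch (not part of the diff): the only change for code that allocates carry-propagation scratch memory is that the unused `uses_carry` argument disappears. This is a minimal illustration mirroring the call sites updated above; `streams`, `params` and `num_blocks` are assumed to be provided by the surrounding backend code, and the variable names are illustrative only.

    // Hypothetical caller, matching the post-change int_sc_prop_memory
    // constructor: (streams, params, num_radix_blocks, requested_flag,
    // allocate_gpu_memory, size_tracker).
    uint64_t size_tracker = 0;
    uint32_t requested_flag = outputFlag::FLAG_NONE;
    auto *sc_prop_mem = new int_sc_prop_memory<uint64_t>(
        streams, params, num_blocks, requested_flag,
        /*allocate_gpu_memory=*/true, size_tracker);

    // ... run the carry propagation kernels using sc_prop_mem ...

    sc_prop_mem->release(streams);
    delete sc_prop_mem;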