8 changes: 4 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -333,17 +333,17 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
- PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
- bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+ PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+ PBS_MS_REDUCTION_T noise_reduction_type);

uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
- PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
- bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
+ PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+ PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_propagate_single_carry_kb_64_inplace(
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,
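Call sites only need to drop the removed uses_carry argument; everything else is unchanged. A minimal sketch of an updated caller, assuming streams and noise_reduction_type are already in scope, and using placeholder (not recommended) parameter values:

// Hypothetical call site after this change; uses_carry is simply dropped.
int8_t *mem_ptr = nullptr;
uint64_t scratch_size = scratch_cuda_propagate_single_carry_kb_64_inplace(
    streams, &mem_ptr, /*glwe_dimension=*/1, /*polynomial_size=*/2048,
    /*big_lwe_dimension=*/2048, /*small_lwe_dimension=*/742, /*ks_level=*/5,
    /*ks_base_log=*/3, /*pbs_level=*/1, /*pbs_base_log=*/23,
    /*grouping_factor=*/0, /*num_blocks=*/4, /*message_modulus=*/4,
    /*carry_modulus=*/4, CLASSICAL, /*requested_flag=*/outputFlag::FLAG_NONE,
    /*allocate_gpu_memory=*/true, noise_reduction_type);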
157 changes: 82 additions & 75 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -244,8 +244,6 @@ struct int_radix_params {
uint32_t carry_modulus;
PBS_MS_REDUCTION_T noise_reduction_type;

- int_radix_params(){};

int_radix_params(PBS_TYPE pbs_type, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t big_lwe_dimension,
uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -262,6 +260,8 @@ struct int_radix_params {
message_modulus(message_modulus), carry_modulus(carry_modulus),
noise_reduction_type(noise_reduction_type){};

+ int_radix_params() = default;

void print() {
printf("pbs_type: %u, glwe_dimension: %u, "
"polynomial_size: %u, "
@@ -2404,8 +2404,7 @@ template <typename Torus> struct int_sc_prop_memory {

int_sc_prop_memory(CudaStreams streams, int_radix_params params,
uint32_t num_radix_blocks, uint32_t requested_flag_in,
- uint32_t uses_carry, bool allocate_gpu_memory,
- uint64_t &size_tracker) {
+ bool allocate_gpu_memory, uint64_t &size_tracker) {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
auto glwe_dimension = params.glwe_dimension;
@@ -3127,11 +3126,10 @@ template <typename Torus> struct int_mul_memory {
streams, params, num_radix_blocks, 2 * num_radix_blocks,
vector_result_sb, small_lwe_vector, luts_array, true,
allocate_gpu_memory, size_tracker);
- uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
- streams, params, num_radix_blocks, requested_flag, uses_carry,
- allocate_gpu_memory, size_tracker);
+ streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
+ size_tracker);
}

void release(CudaStreams streams) {
@@ -3731,36 +3729,13 @@ template <typename Torus> struct int_comparison_eq_buffer {
gpu_memory_allocated = allocate_gpu_memory;
this->params = params;
this->op = op;
- Torus total_modulus = params.message_modulus * params.carry_modulus;

are_all_block_true_buffer = new int_are_all_block_true_buffer<Torus>(
streams, op, params, num_radix_blocks, allocate_gpu_memory,
size_tracker);

- // Operator LUT
- auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
- if (op == COMPARISON_TYPE::EQ) {
- // EQ
- return (lhs == rhs);
- } else {
- // NE
- return (lhs != rhs);
- }
- };
- operator_lut =
- new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
- allocate_gpu_memory, size_tracker);

- generate_device_accumulator_bivariate<Torus>(
- streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
- operator_lut->get_degree(0), operator_lut->get_max_degree(0),
- params.glwe_dimension, params.polynomial_size, params.message_modulus,
- params.carry_modulus, operator_f, gpu_memory_allocated);

- auto active_streams = streams.active_gpu_subset(num_radix_blocks);
- operator_lut->broadcast_lut(active_streams);

// f(x) -> x == 0
+ Torus total_modulus = params.message_modulus * params.carry_modulus;
auto is_non_zero_lut_f = [total_modulus](Torus x) -> Torus {
return (x % total_modulus) != 0;
};
@@ -3775,38 +3750,74 @@ template <typename Torus> struct int_comparison_eq_buffer {
params.glwe_dimension, params.polynomial_size, params.message_modulus,
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);

+ auto active_streams = streams.active_gpu_subset(num_radix_blocks);
is_non_zero_lut->broadcast_lut(active_streams);

- // Scalar may have up to num_radix_blocks blocks
- scalar_comparison_luts = new int_radix_lut<Torus>(
- streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
- size_tracker);

- for (int i = 0; i < total_modulus; i++) {
- auto lut_f = [i, operator_f](Torus x) -> Torus {
- return operator_f(i, x);
+ if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
+ // Operator LUT
+ auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
+ if (op == COMPARISON_TYPE::EQ) {
+ return (lhs == rhs);
+ } else if (op == COMPARISON_TYPE::NE) {
+ return (lhs != rhs);
+ } else {
+ PANIC("Cuda error (eq/ne): invalid comparison type")
+ }
+ };
+ // Scalar may have up to num_radix_blocks blocks
+ scalar_comparison_luts = new int_radix_lut<Torus>(
+ streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
+ size_tracker);

- generate_device_accumulator<Torus>(
- streams.stream(0), streams.gpu_index(0),
- scalar_comparison_luts->get_lut(0, i),
- scalar_comparison_luts->get_degree(i),
- scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
- params.polynomial_size, params.message_modulus, params.carry_modulus,
- lut_f, gpu_memory_allocated);
+ for (int i = 0; i < total_modulus; i++) {
+ auto lut_f = [i, operator_f](Torus x) -> Torus {
+ return operator_f(i, x);
+ };

+ generate_device_accumulator<Torus>(
+ streams.stream(0), streams.gpu_index(0),
+ scalar_comparison_luts->get_lut(0, i),
+ scalar_comparison_luts->get_degree(i),
+ scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
+ params.polynomial_size, params.message_modulus,
+ params.carry_modulus, lut_f, gpu_memory_allocated);
+ }
+ scalar_comparison_luts->broadcast_lut(active_streams);
+ operator_lut =
+ new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
+ allocate_gpu_memory, size_tracker);

+ generate_device_accumulator_bivariate<Torus>(
+ streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
+ operator_lut->get_degree(0), operator_lut->get_max_degree(0),
+ params.glwe_dimension, params.polynomial_size, params.message_modulus,
+ params.carry_modulus, operator_f, gpu_memory_allocated);

+ operator_lut->broadcast_lut(active_streams);
+ } else {
+ scalar_comparison_luts = nullptr;
+ operator_lut = nullptr;
+ }
- scalar_comparison_luts->broadcast_lut(active_streams);
}

void release(CudaStreams streams) {
- operator_lut->release(streams);
- delete operator_lut;
+ if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
+ PANIC_IF_FALSE(operator_lut != nullptr,
+ "Cuda error: no operator lut was created");
+ operator_lut->release(streams);
+ delete operator_lut;
+ operator_lut = nullptr;
+ PANIC_IF_FALSE(scalar_comparison_luts != nullptr,
+ "Cuda error: no scalar comparison luts were created");
+ scalar_comparison_luts->release(streams);
+ delete scalar_comparison_luts;
+ scalar_comparison_luts = nullptr;
+ }
is_non_zero_lut->release(streams);
delete is_non_zero_lut;
- scalar_comparison_luts->release(streams);
- delete scalar_comparison_luts;
+ is_non_zero_lut = nullptr;
are_all_block_true_buffer->release(streams);
delete are_all_block_true_buffer;
+ are_all_block_true_buffer = nullptr;
}
};
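Since operator_lut and scalar_comparison_luts are now allocated only for EQ/NE, release() mirrors the constructor's branch, panics if an expected LUT is missing, and nulls each pointer after deletion. The same guard-and-null idiom as a reusable sketch (this helper is hypothetical, not part of the backend):

// Hypothetical helper showing the release pattern used above.
template <typename T>
void release_and_null(T *&ptr, CudaStreams streams) {
  if (ptr != nullptr) {
    ptr->release(streams);
    delete ptr;
    ptr = nullptr; // a second release() call is then harmless
  }
}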

@@ -3926,8 +3937,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
case LE:
return (x == IS_INFERIOR) || (x == IS_EQUAL);
default:
- // We don't need a default case but we need to return something
- return 42;
+ PANIC("Cuda error (comparisons): unknown comparison type")
}
};
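Panicking beats the old return 42 here: this lambda is evaluated for every possible input while the lookup table is generated, so a sentinel value would be silently baked into the accumulator and corrupt every later comparison, whereas PANIC fails fast at scratch time. A self-contained sketch of that baking step (illustrative, not backend code):

#include <cstdint>
#include <vector>

int main() {
  const uint64_t total_modulus = 16; // e.g. message_modulus * carry_modulus
  auto sign_f = [](uint64_t x) -> uint64_t {
    return (x == 1 || x == 2); // e.g. x == IS_INFERIOR || x == IS_EQUAL
    // A default branch returning 42 would be copied into every table slot
    // it hits, with no error at generation time.
  };
  std::vector<uint64_t> lut(total_modulus);
  for (uint64_t x = 0; x < total_modulus; ++x)
    lut[x] = sign_f(x); // values are frozen into the accumulator here
  return 0;
}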

@@ -4922,11 +4932,10 @@ template <typename Torus> struct int_scalar_mul_buffer {
streams, params, num_radix_blocks, num_ciphertext_bits, true,
allocate_gpu_memory, last_step_mem);
}
- uint32_t uses_carry = 0;
uint32_t requested_flag = outputFlag::FLAG_NONE;
sc_prop_mem = new int_sc_prop_memory<Torus>(
- streams, params, num_radix_blocks, requested_flag, uses_carry,
- allocate_gpu_memory, last_step_mem);
+ streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
+ last_step_mem);
if (anticipated_buffer_drop) {
size_tracker += std::max(anticipated_drop_mem, last_step_mem);
} else {
@@ -4982,10 +4991,9 @@ template <typename Torus> struct int_abs_buffer {
streams, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
uint32_t requested_flag = outputFlag::FLAG_NONE;
- uint32_t uses_carry = 0;
scp_mem = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
- requested_flag, uses_carry,
- allocate_gpu_memory, size_tracker);
+ requested_flag, allocate_gpu_memory,
+ size_tracker);
bitxor_mem = new int_bitop_buffer<Torus>(streams, BITOP_TYPE::BITXOR,
params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
@@ -5061,13 +5069,12 @@ template <typename Torus> struct int_div_rem_memory {
abs_mem_2 = new int_abs_buffer<Torus>(streams, params, num_blocks,
allocate_gpu_memory, size_tracker);
uint32_t requested_flag = outputFlag::FLAG_NONE;
- uint32_t uses_carry = 0;
scp_mem_1 = new int_sc_prop_memory<Torus>(
- streams, params, num_blocks, requested_flag, uses_carry,
- allocate_gpu_memory, size_tracker);
+ streams, params, num_blocks, requested_flag, allocate_gpu_memory,
+ size_tracker);
scp_mem_2 = new int_sc_prop_memory<Torus>(
- streams, params, num_blocks, requested_flag, uses_carry,
- allocate_gpu_memory, size_tracker);
+ streams, params, num_blocks, requested_flag, allocate_gpu_memory,
+ size_tracker);

std::function<uint64_t(uint64_t)> quotient_predicate_lut_f =
[](uint64_t x) -> uint64_t { return x == 1; };
@@ -5251,7 +5258,7 @@ template <typename Torus> struct int_sub_and_propagate {
this->allocate_gpu_memory = allocate_gpu_memory;

this->sc_prop_mem = new int_sc_prop_memory<Torus>(
- streams, params, num_radix_blocks, requested_flag_in, (uint32_t)0,
+ streams, params, num_radix_blocks, requested_flag_in,
allocate_gpu_memory, size_tracker);

this->neg_rhs_array = new CudaRadixCiphertextFFI;
@@ -5391,8 +5398,8 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
streams, params, num_radix_blocks, scalar_divisor_ffi->active_bits,
allocate_gpu_memory, size_tracker);
scp_mem = new int_sc_prop_memory<Torus>(
- streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
- allocate_gpu_memory, size_tracker);
+ streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
+ size_tracker);
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
size_tracker);
@@ -5545,8 +5552,8 @@ template <typename Torus> struct int_signed_scalar_div_mem {
streams, RIGHT_SHIFT, params, num_radix_blocks,
allocate_gpu_memory, size_tracker);
scp_mem = new int_sc_prop_memory<Torus>(
- streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
- allocate_gpu_memory, size_tracker);
+ streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
+ size_tracker);

} else {

@@ -5567,7 +5574,7 @@

if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
scp_mem = new int_sc_prop_memory<Torus>(
- streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
+ streams, params, num_radix_blocks, FLAG_NONE,
allocate_gpu_memory, size_tracker);
}
}
@@ -5711,8 +5718,8 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
allocate_gpu_memory, size_tracker);

this->scp_mem = new int_sc_prop_memory<Torus>(
- streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
- allocate_gpu_memory, size_tracker);
+ streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
+ size_tracker);

bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
!scalar_divisor_ffi->is_divisor_negative;
@@ -5906,9 +5913,9 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
streams, params, counter_num_blocks, num_radix_blocks, true,
allocate_gpu_memory, size_tracker);

- this->propagate_mem =
- new int_sc_prop_memory<Torus>(streams, params, counter_num_blocks, 0, 0,
- allocate_gpu_memory, size_tracker);
+ this->propagate_mem = new int_sc_prop_memory<Torus>(
+ streams, params, counter_num_blocks, FLAG_NONE, allocate_gpu_memory,
+ size_tracker);
}

void release(CudaStreams streams) {
@@ -430,7 +430,6 @@ __host__ void tree_sign_reduction(
"than the number of blocks to operate on")

auto params = tree_buffer->params;
- auto big_lwe_dimension = params.big_lwe_dimension;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto message_modulus = params.message_modulus;
12 changes: 6 additions & 6 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -51,16 +51,16 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
- PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
- bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+ PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+ PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);

return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
- num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
+ num_blocks, params, requested_flag, allocate_gpu_memory);
}

uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -69,16 +69,16 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
- PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
- bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
+ PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
+ PBS_MS_REDUCTION_T noise_reduction_type) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
big_lwe_dimension, small_lwe_dimension, ks_level,
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
message_modulus, carry_modulus, noise_reduction_type);

return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
- num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
+ num_blocks, params, requested_flag, allocate_gpu_memory);
}

uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(
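Both wrappers forward to the templated scratch helper, which also lost its uses_carry parameter. A sketch of its assumed shape after this change, inferred from the call sites above rather than copied from the source:

// Inferred, not verbatim: allocates the carry-propagation scratch buffer
// and returns the GPU memory footprint recorded by the size tracker.
template <typename Torus>
uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
    CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
    uint32_t num_blocks, int_radix_params params, uint32_t requested_flag,
    bool allocate_gpu_memory) {
  uint64_t size_tracker = 0;
  *mem_ptr = new int_sc_prop_memory<Torus>(streams, params, num_blocks,
                                           requested_flag, allocate_gpu_memory,
                                           size_tracker);
  return size_tracker;
}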