Skip to content

Commit 04fa3ca

Browse files
committed
chore(gpu): post hackathon cleanup
1 parent 1dcc3c8 commit 04fa3ca

File tree

11 files changed

+87
-112
lines changed

11 files changed

+87
-112
lines changed

backends/tfhe-cuda-backend/cuda/include/integer/integer.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -333,17 +333,17 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
333333
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
334334
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
335335
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
336-
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
337-
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
336+
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
337+
PBS_MS_REDUCTION_T noise_reduction_type);
338338

339339
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
340340
CudaStreamsFFI streams, int8_t **mem_ptr, uint32_t glwe_dimension,
341341
uint32_t polynomial_size, uint32_t big_lwe_dimension,
342342
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
343343
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
344344
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
345-
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
346-
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type);
345+
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
346+
PBS_MS_REDUCTION_T noise_reduction_type);
347347

348348
void cuda_propagate_single_carry_kb_64_inplace(
349349
CudaStreamsFFI streams, CudaRadixCiphertextFFI *lwe_array,

backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Lines changed: 69 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,6 @@ struct int_radix_params {
244244
uint32_t carry_modulus;
245245
PBS_MS_REDUCTION_T noise_reduction_type;
246246

247-
int_radix_params(){};
248-
249247
int_radix_params(PBS_TYPE pbs_type, uint32_t glwe_dimension,
250248
uint32_t polynomial_size, uint32_t big_lwe_dimension,
251249
uint32_t small_lwe_dimension, uint32_t ks_level,
@@ -262,6 +260,8 @@ struct int_radix_params {
262260
message_modulus(message_modulus), carry_modulus(carry_modulus),
263261
noise_reduction_type(noise_reduction_type){};
264262

263+
int_radix_params() = default;
264+
265265
void print() {
266266
printf("pbs_type: %u, glwe_dimension: %u, "
267267
"polynomial_size: %u, "
@@ -2404,8 +2404,7 @@ template <typename Torus> struct int_sc_prop_memory {
24042404

24052405
int_sc_prop_memory(CudaStreams streams, int_radix_params params,
24062406
uint32_t num_radix_blocks, uint32_t requested_flag_in,
2407-
uint32_t uses_carry, bool allocate_gpu_memory,
2408-
uint64_t &size_tracker) {
2407+
bool allocate_gpu_memory, uint64_t &size_tracker) {
24092408
gpu_memory_allocated = allocate_gpu_memory;
24102409
this->params = params;
24112410
auto glwe_dimension = params.glwe_dimension;
@@ -3127,11 +3126,10 @@ template <typename Torus> struct int_mul_memory {
31273126
streams, params, num_radix_blocks, 2 * num_radix_blocks,
31283127
vector_result_sb, small_lwe_vector, luts_array, true,
31293128
allocate_gpu_memory, size_tracker);
3130-
uint32_t uses_carry = 0;
31313129
uint32_t requested_flag = outputFlag::FLAG_NONE;
31323130
sc_prop_mem = new int_sc_prop_memory<Torus>(
3133-
streams, params, num_radix_blocks, requested_flag, uses_carry,
3134-
allocate_gpu_memory, size_tracker);
3131+
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
3132+
size_tracker);
31353133
}
31363134

31373135
void release(CudaStreams streams) {
@@ -3731,36 +3729,56 @@ template <typename Torus> struct int_comparison_eq_buffer {
37313729
gpu_memory_allocated = allocate_gpu_memory;
37323730
this->params = params;
37333731
this->op = op;
3732+
Torus total_modulus = params.message_modulus * params.carry_modulus;
37343733

37353734
are_all_block_true_buffer = new int_are_all_block_true_buffer<Torus>(
37363735
streams, op, params, num_radix_blocks, allocate_gpu_memory,
37373736
size_tracker);
3737+
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
37383738

3739-
// Operator LUT
3740-
auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
3741-
if (op == COMPARISON_TYPE::EQ) {
3742-
// EQ
3743-
return (lhs == rhs);
3744-
} else {
3745-
// NE
3746-
return (lhs != rhs);
3739+
if (op == COMPARISON_TYPE::EQ || op == COMPARISON_TYPE::NE) {
3740+
// Operator LUT
3741+
auto operator_f = [op](Torus lhs, Torus rhs) -> Torus {
3742+
if (op == COMPARISON_TYPE::EQ) {
3743+
return (lhs == rhs);
3744+
} else if (op == COMPARISON_TYPE::NE) {
3745+
return (lhs != rhs);
3746+
} else {
PANIC("Cuda error (eq/ne): invalid comparison type")
3747+
}
3748+
};
3749+
// Scalar may have up to num_radix_blocks blocks
3750+
scalar_comparison_luts = new int_radix_lut<Torus>(
3751+
streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
3752+
size_tracker);
3753+
3754+
for (int i = 0; i < total_modulus; i++) {
3755+
auto lut_f = [i, operator_f](Torus x) -> Torus {
3756+
return operator_f(i, x);
3757+
};
3758+
3759+
generate_device_accumulator<Torus>(
3760+
streams.stream(0), streams.gpu_index(0),
3761+
scalar_comparison_luts->get_lut(0, i),
3762+
scalar_comparison_luts->get_degree(i),
3763+
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
3764+
params.polynomial_size, params.message_modulus,
3765+
params.carry_modulus, lut_f, gpu_memory_allocated);
37473766
}
3748-
};
3749-
operator_lut =
3750-
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
3751-
allocate_gpu_memory, size_tracker);
3767+
scalar_comparison_luts->broadcast_lut(active_streams);
3768+
operator_lut =
3769+
new int_radix_lut<Torus>(streams, params, 1, num_radix_blocks,
3770+
allocate_gpu_memory, size_tracker);
37523771

3753-
generate_device_accumulator_bivariate<Torus>(
3754-
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
3755-
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
3756-
params.glwe_dimension, params.polynomial_size, params.message_modulus,
3757-
params.carry_modulus, operator_f, gpu_memory_allocated);
3772+
generate_device_accumulator_bivariate<Torus>(
3773+
streams.stream(0), streams.gpu_index(0), operator_lut->get_lut(0, 0),
3774+
operator_lut->get_degree(0), operator_lut->get_max_degree(0),
3775+
params.glwe_dimension, params.polynomial_size, params.message_modulus,
3776+
params.carry_modulus, operator_f, gpu_memory_allocated);
37583777

3759-
auto active_streams = streams.active_gpu_subset(num_radix_blocks);
3760-
operator_lut->broadcast_lut(active_streams);
3778+
operator_lut->broadcast_lut(active_streams);
3779+
}
37613780

37623781
// f(x) -> x == 0
3763-
Torus total_modulus = params.message_modulus * params.carry_modulus;
37643782
auto is_non_zero_lut_f = [total_modulus](Torus x) -> Torus {
37653783
return (x % total_modulus) != 0;
37663784
};
@@ -3776,35 +3794,17 @@ template <typename Torus> struct int_comparison_eq_buffer {
37763794
params.carry_modulus, is_non_zero_lut_f, gpu_memory_allocated);
37773795

37783796
is_non_zero_lut->broadcast_lut(active_streams);
3779-
3780-
// Scalar may have up to num_radix_blocks blocks
3781-
scalar_comparison_luts = new int_radix_lut<Torus>(
3782-
streams, params, total_modulus, num_radix_blocks, allocate_gpu_memory,
3783-
size_tracker);
3784-
3785-
for (int i = 0; i < total_modulus; i++) {
3786-
auto lut_f = [i, operator_f](Torus x) -> Torus {
3787-
return operator_f(i, x);
3788-
};
3789-
3790-
generate_device_accumulator<Torus>(
3791-
streams.stream(0), streams.gpu_index(0),
3792-
scalar_comparison_luts->get_lut(0, i),
3793-
scalar_comparison_luts->get_degree(i),
3794-
scalar_comparison_luts->get_max_degree(i), params.glwe_dimension,
3795-
params.polynomial_size, params.message_modulus, params.carry_modulus,
3796-
lut_f, gpu_memory_allocated);
3797-
}
3798-
scalar_comparison_luts->broadcast_lut(active_streams);
37993797
}
38003798

38013799
void release(CudaStreams streams) {
3802-
operator_lut->release(streams);
3803-
delete operator_lut;
3800+
if (op == COMPARISON_TYPE::EQ || COMPARISON_TYPE::NE) {
3801+
operator_lut->release(streams);
3802+
delete operator_lut;
3803+
scalar_comparison_luts->release(streams);
3804+
delete scalar_comparison_luts;
3805+
}
38043806
is_non_zero_lut->release(streams);
38053807
delete is_non_zero_lut;
3806-
scalar_comparison_luts->release(streams);
3807-
delete scalar_comparison_luts;
38083808
are_all_block_true_buffer->release(streams);
38093809
delete are_all_block_true_buffer;
38103810
}
@@ -3926,8 +3926,7 @@ template <typename Torus> struct int_comparison_diff_buffer {
39263926
case LE:
39273927
return (x == IS_INFERIOR) || (x == IS_EQUAL);
39283928
default:
3929-
// We don't need a default case but we need to return something
3930-
return 42;
3929+
PANIC("Cuda error (comparisons): unknown comparison type")
39313930
}
39323931
};
39333932

@@ -4922,11 +4921,10 @@ template <typename Torus> struct int_scalar_mul_buffer {
49224921
streams, params, num_radix_blocks, num_ciphertext_bits, true,
49234922
allocate_gpu_memory, last_step_mem);
49244923
}
4925-
uint32_t uses_carry = 0;
49264924
uint32_t requested_flag = outputFlag::FLAG_NONE;
49274925
sc_prop_mem = new int_sc_prop_memory<Torus>(
4928-
streams, params, num_radix_blocks, requested_flag, uses_carry,
4929-
allocate_gpu_memory, last_step_mem);
4926+
streams, params, num_radix_blocks, requested_flag, allocate_gpu_memory,
4927+
last_step_mem);
49304928
if (anticipated_buffer_drop) {
49314929
size_tracker += std::max(anticipated_drop_mem, last_step_mem);
49324930
} else {
@@ -4982,10 +4980,9 @@ template <typename Torus> struct int_abs_buffer {
49824980
streams, SHIFT_OR_ROTATE_TYPE::RIGHT_SHIFT, params, num_radix_blocks,
49834981
allocate_gpu_memory, size_tracker);
49844982
uint32_t requested_flag = outputFlag::FLAG_NONE;
4985-
uint32_t uses_carry = 0;
49864983
scp_mem = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
4987-
requested_flag, uses_carry,
4988-
allocate_gpu_memory, size_tracker);
4984+
requested_flag, allocate_gpu_memory,
4985+
size_tracker);
49894986
bitxor_mem = new int_bitop_buffer<Torus>(streams, BITOP_TYPE::BITXOR,
49904987
params, num_radix_blocks,
49914988
allocate_gpu_memory, size_tracker);
@@ -5061,13 +5058,12 @@ template <typename Torus> struct int_div_rem_memory {
50615058
abs_mem_2 = new int_abs_buffer<Torus>(streams, params, num_blocks,
50625059
allocate_gpu_memory, size_tracker);
50635060
uint32_t requested_flag = outputFlag::FLAG_NONE;
5064-
uint32_t uses_carry = 0;
50655061
scp_mem_1 = new int_sc_prop_memory<Torus>(
5066-
streams, params, num_blocks, requested_flag, uses_carry,
5067-
allocate_gpu_memory, size_tracker);
5062+
streams, params, num_blocks, requested_flag, allocate_gpu_memory,
5063+
size_tracker);
50685064
scp_mem_2 = new int_sc_prop_memory<Torus>(
5069-
streams, params, num_blocks, requested_flag, uses_carry,
5070-
allocate_gpu_memory, size_tracker);
5065+
streams, params, num_blocks, requested_flag, allocate_gpu_memory,
5066+
size_tracker);
50715067

50725068
std::function<uint64_t(uint64_t)> quotient_predicate_lut_f =
50735069
[](uint64_t x) -> uint64_t { return x == 1; };
@@ -5251,7 +5247,7 @@ template <typename Torus> struct int_sub_and_propagate {
52515247
this->allocate_gpu_memory = allocate_gpu_memory;
52525248

52535249
this->sc_prop_mem = new int_sc_prop_memory<Torus>(
5254-
streams, params, num_radix_blocks, requested_flag_in, (uint32_t)0,
5250+
streams, params, num_radix_blocks, requested_flag_in,
52555251
allocate_gpu_memory, size_tracker);
52565252

52575253
this->neg_rhs_array = new CudaRadixCiphertextFFI;
@@ -5391,8 +5387,8 @@ template <typename Torus> struct int_unsigned_scalar_div_mem {
53915387
streams, params, num_radix_blocks, scalar_divisor_ffi->active_bits,
53925388
allocate_gpu_memory, size_tracker);
53935389
scp_mem = new int_sc_prop_memory<Torus>(
5394-
streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
5395-
allocate_gpu_memory, size_tracker);
5390+
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
5391+
size_tracker);
53965392
sub_and_propagate_mem = new int_sub_and_propagate<Torus>(
53975393
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
53985394
size_tracker);
@@ -5545,8 +5541,8 @@ template <typename Torus> struct int_signed_scalar_div_mem {
55455541
streams, RIGHT_SHIFT, params, num_radix_blocks,
55465542
allocate_gpu_memory, size_tracker);
55475543
scp_mem = new int_sc_prop_memory<Torus>(
5548-
streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
5549-
allocate_gpu_memory, size_tracker);
5544+
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
5545+
size_tracker);
55505546

55515547
} else {
55525548

@@ -5567,7 +5563,7 @@ template <typename Torus> struct int_signed_scalar_div_mem {
55675563

55685564
if (scalar_divisor_ffi->is_chosen_multiplier_geq_two_pow_numerator) {
55695565
scp_mem = new int_sc_prop_memory<Torus>(
5570-
streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
5566+
streams, params, num_radix_blocks, FLAG_NONE,
55715567
allocate_gpu_memory, size_tracker);
55725568
}
55735569
}
@@ -5711,8 +5707,8 @@ template <typename Torus> struct int_signed_scalar_div_rem_buffer {
57115707
allocate_gpu_memory, size_tracker);
57125708

57135709
this->scp_mem = new int_sc_prop_memory<Torus>(
5714-
streams, params, num_radix_blocks, FLAG_NONE, (uint32_t)0,
5715-
allocate_gpu_memory, size_tracker);
5710+
streams, params, num_radix_blocks, FLAG_NONE, allocate_gpu_memory,
5711+
size_tracker);
57165712

57175713
bool is_divisor_one = scalar_divisor_ffi->is_abs_divisor_one &&
57185714
!scalar_divisor_ffi->is_divisor_negative;
@@ -5907,7 +5903,7 @@ template <typename Torus> struct int_count_of_consecutive_bits_buffer {
59075903
allocate_gpu_memory, size_tracker);
59085904

59095905
this->propagate_mem =
5910-
new int_sc_prop_memory<Torus>(streams, params, counter_num_blocks, 0, 0,
5906+
new int_sc_prop_memory<Torus>(streams, params, counter_num_blocks, 0,
59115907
allocate_gpu_memory, size_tracker);
59125908
}
59135909

backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,6 @@ __host__ void tree_sign_reduction(
430430
"than the number of blocks to operate on")
431431

432432
auto params = tree_buffer->params;
433-
auto big_lwe_dimension = params.big_lwe_dimension;
434433
auto glwe_dimension = params.glwe_dimension;
435434
auto polynomial_size = params.polynomial_size;
436435
auto message_modulus = params.message_modulus;

backends/tfhe-cuda-backend/cuda/src/integer/integer.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,16 +51,16 @@ uint64_t scratch_cuda_propagate_single_carry_kb_64_inplace(
5151
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
5252
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
5353
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
54-
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
55-
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
54+
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
55+
PBS_MS_REDUCTION_T noise_reduction_type) {
5656
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
5757
big_lwe_dimension, small_lwe_dimension, ks_level,
5858
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
5959
message_modulus, carry_modulus, noise_reduction_type);
6060

6161
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
6262
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
63-
num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
63+
num_blocks, params, requested_flag, allocate_gpu_memory);
6464
}
6565

6666
uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
@@ -69,16 +69,16 @@ uint64_t scratch_cuda_add_and_propagate_single_carry_kb_64_inplace(
6969
uint32_t small_lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
7070
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
7171
uint32_t num_blocks, uint32_t message_modulus, uint32_t carry_modulus,
72-
PBS_TYPE pbs_type, uint32_t requested_flag, uint32_t uses_carry,
73-
bool allocate_gpu_memory, PBS_MS_REDUCTION_T noise_reduction_type) {
72+
PBS_TYPE pbs_type, uint32_t requested_flag, bool allocate_gpu_memory,
73+
PBS_MS_REDUCTION_T noise_reduction_type) {
7474
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
7575
big_lwe_dimension, small_lwe_dimension, ks_level,
7676
ks_base_log, pbs_level, pbs_base_log, grouping_factor,
7777
message_modulus, carry_modulus, noise_reduction_type);
7878

7979
return scratch_cuda_propagate_single_carry_kb_inplace<uint64_t>(
8080
CudaStreams(streams), (int_sc_prop_memory<uint64_t> **)mem_ptr,
81-
num_blocks, params, requested_flag, uses_carry, allocate_gpu_memory);
81+
num_blocks, params, requested_flag, allocate_gpu_memory);
8282
}
8383

8484
uint64_t scratch_cuda_integer_overflowing_sub_kb_64_inplace(

backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -242,8 +242,7 @@ __host__ void host_radix_cumulative_sum_in_groups(cudaStream_t stream,
242242
auto lwe_size = dest->lwe_dimension + 1;
243243
cuda_set_device(gpu_index);
244244
// Each CUDA block is responsible for a single group
245-
int num_blocks = (num_radix_blocks + group_size - 1) / group_size,
246-
num_threads = 512;
245+
int num_blocks = CEIL_DIV(num_radix_blocks, group_size), num_threads = 512;
247246
device_radix_cumulative_sum_in_groups<Torus>
248247
<<<num_blocks, num_threads, 0, stream>>>(
249248
(Torus *)dest->ptr, (Torus *)src->ptr, num_radix_blocks, lwe_size,
@@ -1566,9 +1565,6 @@ void host_full_propagate_inplace(
15661565
void *const *bsks, uint32_t num_blocks) {
15671566
auto params = mem_ptr->lut->params;
15681567

1569-
int big_lwe_size = (params.glwe_dimension * params.polynomial_size + 1);
1570-
int small_lwe_size = (params.small_lwe_dimension + 1);
1571-
15721568
// In the case of extracting a single LWE this parameters are dummy
15731569
uint32_t num_many_lut = 1;
15741570
uint32_t lut_stride = 0;
@@ -1969,12 +1965,12 @@ template <typename Torus>
19691965
uint64_t scratch_cuda_propagate_single_carry_kb_inplace(
19701966
CudaStreams streams, int_sc_prop_memory<Torus> **mem_ptr,
19711967
uint32_t num_radix_blocks, int_radix_params params, uint32_t requested_flag,
1972-
uint32_t uses_carry, bool allocate_gpu_memory) {
1968+
bool allocate_gpu_memory) {
19731969
PUSH_RANGE("scratch add & propagate sc")
19741970
uint64_t size_tracker = 0;
19751971
*mem_ptr = new int_sc_prop_memory<Torus>(streams, params, num_radix_blocks,
1976-
requested_flag, uses_carry,
1977-
allocate_gpu_memory, size_tracker);
1972+
requested_flag, allocate_gpu_memory,
1973+
size_tracker);
19781974
POP_RANGE()
19791975
return size_tracker;
19801976
}
@@ -2116,9 +2112,6 @@ void host_add_and_propagate_single_carry(
21162112

21172113
auto num_radix_blocks = lhs_array->num_radix_blocks;
21182114
auto params = mem->params;
2119-
auto glwe_dimension = params.glwe_dimension;
2120-
auto polynomial_size = params.polynomial_size;
2121-
uint32_t big_lwe_size = glwe_dimension * polynomial_size + 1;
21222115
auto lut_stride = mem->lut_stride;
21232116
auto num_many_lut = mem->num_many_lut;
21242117
CudaRadixCiphertextFFI output_flag;
@@ -2390,7 +2383,6 @@ __host__ void integer_radix_apply_noise_squashing_kb(
23902383

23912384
PUSH_RANGE("apply noise squashing")
23922385
auto params = lut->params;
2393-
auto pbs_type = params.pbs_type;
23942386
auto big_lwe_dimension = params.big_lwe_dimension;
23952387
auto small_lwe_dimension = params.small_lwe_dimension;
23962388
auto ks_level = params.ks_level;

0 commit comments

Comments
 (0)