@@ -756,18 +756,20 @@ template <typename Torus> struct int_radix_lut {
       CudaStreams streams, uint64_t max_num_radix_blocks,
       uint64_t &size_tracker, bool allocate_gpu_memory) {
     // We need to create the auxiliary array only in GPU 0
-    lwe_aligned_vec.resize(active_streams.count());
-    for (uint i = 0; i < active_streams.count(); i++) {
-      uint64_t size_tracker_on_array_i = 0;
-      auto inputs_on_gpu = std::max(
-          THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(max_num_radix_blocks, i,
-                                                     active_streams.count()));
-      Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
-          inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
-          streams.stream(0), streams.gpu_index(0), size_tracker_on_array_i,
-          allocate_gpu_memory);
-      lwe_aligned_vec[i] = d_array;
-      size_tracker += size_tracker_on_array_i;
+    if (active_streams.count() > 1) {
+      lwe_aligned_vec.resize(active_streams.count());
+      for (uint i = 0; i < active_streams.count(); i++) {
+        uint64_t size_tracker_on_array_i = 0;
+        auto inputs_on_gpu = std::max(
+            THRESHOLD_MULTI_GPU, get_num_inputs_on_gpu(max_num_radix_blocks, i,
+                                                       active_streams.count()));
+        Torus *d_array = (Torus *)cuda_malloc_with_size_tracking_async(
+            inputs_on_gpu * (params.big_lwe_dimension + 1) * sizeof(Torus),
+            streams.stream(0), streams.gpu_index(0), size_tracker_on_array_i,
+            allocate_gpu_memory);
+        lwe_aligned_vec[i] = d_array;
+        size_tracker += size_tracker_on_array_i;
+      }
     }
   }
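As a rough sense of what the new single-GPU guard avoids allocating, the sketch below computes the per-GPU staging-buffer size from the expression above; the dimension and input count are illustrative assumptions, not values taken from this PR.

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative values only: the real ones come from the parameter set and
  // from how get_num_inputs_on_gpu splits the radix blocks across GPUs.
  const uint64_t big_lwe_dimension = 2048; // assumed
  const uint64_t inputs_on_gpu = 128;      // assumed
  const uint64_t bytes =
      inputs_on_gpu * (big_lwe_dimension + 1) * sizeof(uint64_t);
  // 128 * 2049 * 8 = 2,098,176 bytes, roughly 2 MiB per GPU.
  std::printf("per-GPU staging buffer: %llu bytes\n",
              (unsigned long long)bytes);
  return 0;
}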

@@ -1632,8 +1634,19 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
         luts_message_carry = new int_radix_lut<Torus>(
             streams, params, 2, pbs_count, true, size_tracker);
         allocated_luts_message_carry = true;
+        uint64_t message_modulus_bits =
+            (uint64_t)std::log2(params.message_modulus);
+        uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
+        uint64_t total_bits_per_block =
+            message_modulus_bits + carry_modulus_bits;
+        uint64_t denominator =
+            (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
+                                (pow(2, message_modulus_bits) - 1));
+
+        uint64_t upper_bound_num_blocks =
+            max_total_blocks_in_vec * 2 / denominator;
         luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
-            streams, this->max_total_blocks_in_vec, size_tracker, true);
+            streams, upper_bound_num_blocks, size_tracker, true);
       }
     }
     if (allocated_luts_message_carry) {
@@ -1731,9 +1744,17 @@ template <typename Torus> struct int_sum_ciphertexts_vec_memory {
     this->current_blocks = current_blocks;
     this->small_lwe_vector = small_lwe_vector;
     this->luts_message_carry = reused_lut;
+
+    uint64_t message_modulus_bits = (uint64_t)std::log2(params.message_modulus);
+    uint64_t carry_modulus_bits = (uint64_t)std::log2(params.carry_modulus);
+    uint64_t total_bits_per_block = message_modulus_bits + carry_modulus_bits;
+    uint64_t denominator =
+        (uint64_t)std::ceil((pow(2, total_bits_per_block) - 1) /
+                            (pow(2, message_modulus_bits) - 1));
+
+    uint64_t upper_bound_num_blocks = max_total_blocks_in_vec * 2 / denominator;
     this->luts_message_carry->allocate_lwe_vector_for_non_trivial_indexes(
-        streams, this->max_total_blocks_in_vec, size_tracker,
-        allocate_gpu_memory);
+        streams, upper_bound_num_blocks, size_tracker, allocate_gpu_memory);
     setup_index_buffers(streams, size_tracker);
   }
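As a quick sanity check of the new bound used in both hunks above, here is a minimal sketch assuming 2 message bits and 2 carry bits per block; the parameter values and the block count are illustrative assumptions, not something fixed by this diff.

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Assumed parameters for illustration: message_modulus = carry_modulus = 4.
  const uint64_t message_bits = 2;
  const uint64_t carry_bits = 2;
  const uint64_t total_bits = message_bits + carry_bits;
  const uint64_t max_total_blocks_in_vec = 100; // illustrative
  // ceil((2^4 - 1) / (2^2 - 1)) = ceil(15 / 3) = 5
  const uint64_t denominator =
      (uint64_t)std::ceil((std::pow(2.0, (double)total_bits) - 1.0) /
                          (std::pow(2.0, (double)message_bits) - 1.0));
  // 100 * 2 / 5 = 40: the LWE vector for non-trivial indexes is sized for 40
  // blocks instead of the full 100, which is where the memory saving comes from.
  const uint64_t upper_bound_num_blocks =
      max_total_blocks_in_vec * 2 / denominator;
  std::printf("upper bound: %llu blocks\n",
              (unsigned long long)upper_bound_num_blocks);
  return 0;
}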

25 changes: 17 additions & 8 deletions tfhe-benchmark/src/utilities.rs
@@ -421,23 +421,32 @@ pub fn throughput_num_threads(num_block: usize, op_pbs_count: u64) -> u64 {
     let block_multiplicator = (ref_block_count as f64 / num_block as f64).ceil().min(1.0);
     // Some operations with a high serial workload (e.g. division) would yield an operation
     // loading value so low that the number of elements in the end wouldn't be meaningful.
-    let minimum_loading = if num_block < 64 { 0.2 } else { 0.01 };
+    let minimum_loading = if num_block < 64 { 1.0 } else { 0.015 };
 
     #[cfg(feature = "gpu")]
     {
         let num_sms_per_gpu = get_number_of_sms();
         let total_num_sm = num_sms_per_gpu * get_number_of_gpus();
 
-        let total_blocks_per_sm = 4u32; // Assume each SM can handle 4 blocks concurrently
-        let total_num_sm = total_blocks_per_sm * total_num_sm;
+        let total_blocks_per_sm = 4u64; // Assume each SM can handle 4 blocks concurrently
         let min_num_waves = 4u64; //Enforce at least 4 waves in the GPU
-        let elements_per_wave = total_num_sm as u64 / (num_block as u64);
-
+        let block_factor = ((2.0f64 * num_block as f64) / 4.0f64).ceil() as u64;
+        let elements_per_wave = total_blocks_per_sm * total_num_sm as u64 / block_factor;
+        // We need to enable the new load for pbs benches and for sizes larger than 16 blocks in
+        // demanding operations for the rest of operations we maintain a minimum of 200
+        // elements
+        let min_elements = if op_pbs_count == 1
+            || (op_pbs_count > (num_block * num_block) as u64 && num_block >= 16)
+        {
+            elements_per_wave * min_num_waves
+        } else {
+            200u64
+        };
         let operation_loading = ((total_num_sm as u64 / op_pbs_count) as f64).max(minimum_loading);
         let elements = (total_num_sm as f64 * block_multiplicator * operation_loading) as u64;
-        elements.min(elements_per_wave * min_num_waves) // This threshold is useful for operation
-                                                        // with both a small number of
-                                                        // block and low PBs count.
+        elements.min(min_elements) // This threshold is useful for operation
+                                   // with both a small number of
+                                   // block and low PBs count.
     }
     #[cfg(feature = "hpu")]
     {
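To make the new gating concrete with assumed hardware numbers (a single GPU with 108 SMs; nothing here is measured by this PR): for a pure PBS benchmark with num_block = 32 and op_pbs_count = 1, block_factor = ceil(2 * 32 / 4) = 16, elements_per_wave = 4 * 108 / 16 = 27, and min_elements = 27 * 4 = 108, so the element count is capped at 108 via elements.min(min_elements). For a lighter case such as num_block = 4 with op_pbs_count = 8, neither condition holds (op_pbs_count != 1, and num_block < 16), so the cap stays at the fixed 200 elements.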