Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1004,6 +1004,11 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/high_level_api::.*gpu.*/)"

test_list_gpu: install_rs_build_toolchain install_cargo_nextest
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest list --cargo-profile $(CARGO_PROFILE) \
--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
-E "test(/.*gpu.*/)"

test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
ifeq ($(HPU_CONFIG), v80)
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \
Expand Down
3 changes: 1 addition & 2 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,7 @@
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cuda_runtime.h>
#include <vector>

extern "C" {

Expand Down Expand Up @@ -141,4 +139,5 @@ bool cuda_check_support_thread_block_clusters();
template <typename Torus>
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
Torus *d_array, Torus value, Torus n);

#endif
143 changes: 142 additions & 1 deletion backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include <variant>
#include <vector>

#include "integer/integer.h"

extern std::mutex m;
extern bool p2p_enabled;
extern const int THRESHOLD_MULTI_GPU;
Expand Down Expand Up @@ -37,10 +39,149 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
}
}

int get_active_gpu_count(int num_inputs, int gpu_count);
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);

int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);

int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);

// A set of GPU streams and the GPUs they are associated with.
// Can be constructed from the FFI struct CudaStreamsFFI, which is only used
// to pass the streams/gpus at the Rust/C interface. This class should only
// be constructed from the FFI struct, through class methods or through the
// copy constructor. The class can also be constructed as an empty set.
struct CudaStreams {
private:
  // Stream handles, one per entry of _gpu_indexes. Borrowed from the FFI
  // side, except when _owns_streams is true (see create_on_same_gpus).
  cudaStream_t const *_streams;
  // GPU index on which each corresponding stream lives.
  uint32_t const *_gpu_indexes;
  // Number of valid entries in _streams/_gpu_indexes.
  // (uint32_t)-1 marks an empty, not-yet-initialized set.
  uint32_t _gpu_count;
  // True only for the instance whose arrays were allocated by
  // create_on_same_gpus; that instance must call release() before it is
  // destroyed.
  bool _owns_streams;

  // Prevent the construction of a CudaStreams class from user-code.
  CudaStreams(cudaStream_t const *streams, uint32_t const *gpu_indexes,
              uint32_t gpu_count)
      : _streams(streams), _gpu_indexes(gpu_indexes), _gpu_count(gpu_count),
        _owns_streams(false) {}

public:
  // Construct an empty set. Invalid use of an empty set should raise an error
  // right away through asserts or because of a nullptr dereference.
  CudaStreams()
      : _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
        _owns_streams(false) {}

  // Returns a subset of this set as an active subset. An active subset is one
  // that is temporarily used to perform some computation on
  // `num_radix_blocks` inputs.
  CudaStreams active_gpu_subset(int num_radix_blocks) {
    return CudaStreams(_streams, _gpu_indexes,
                       get_active_gpu_count(num_radix_blocks, _gpu_count));
  }

  // Returns a subset containing only the first GPU of this set. It
  // is used to create a subset of streams for mono-GPU functions.
  CudaStreams subset_first_gpu() const {
    return CudaStreams(_streams, _gpu_indexes, 1);
  }

  // Synchronize all the streams in the set (blocks the host until every
  // stream has drained).
  void synchronize() const {
    for (uint32_t i = 0; i < _gpu_count; i++) {
      cuda_synchronize_stream(_streams[i], _gpu_indexes[i]);
    }
  }

  // Bounds-checked accessor for the idx-th stream.
  cudaStream_t stream(uint32_t idx) const {
    PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
    return _streams[idx];
  }
  // Bounds-checked accessor for the idx-th GPU index.
  uint32_t gpu_index(uint32_t idx) const {
    PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
    return _gpu_indexes[idx];
  }
  // Number of streams/GPUs in the set.
  uint32_t count() const { return _gpu_count; }

  // Construct from the Rust FFI stream set. Streams are created in Rust
  // using the bindings; this instance merely borrows them.
  CudaStreams(CudaStreamsFFI &ffi)
      : _streams((cudaStream_t *)ffi.streams), _gpu_indexes(ffi.gpu_indexes),
        _gpu_count(ffi.gpu_count), _owns_streams(false) {}

  // Create a new set of streams on the same GPUs as those of `other`.
  // Can be used to parallelize computation by issuing kernels on multiple
  // streams on the same GPU. Must be called on an empty set; the resulting
  // instance owns its streams and requires a matching release() call.
  void create_on_same_gpus(const CudaStreams &other) {
    PANIC_IF_FALSE(_streams == nullptr,
                   "Assign clone to non-empty cudastreams");

    cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];

    // BUGFIX: size this from `other`, not from `this`. `this` is required to
    // be an empty set here, whose _gpu_count is (uint32_t)-1, so sizing from
    // it requested a ~4-billion-element allocation (and any other mismatch
    // would overflow the buffer in the copy loop below, which is bounded by
    // other._gpu_count).
    uint32_t *gpu_indexes_clone = new uint32_t[other._gpu_count];
    for (uint32_t i = 0; i < other._gpu_count; ++i) {
      new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
      gpu_indexes_clone[i] = other._gpu_indexes[i];
    }

    this->_streams = new_streams;
    this->_gpu_indexes = gpu_indexes_clone;
    this->_gpu_count = other._gpu_count;

    // Flag this instance as owning streams so that we can destroy
    // the streams when they aren't needed anymore
    this->_owns_streams = true;
  }

  // Copy constructor, setting the own flag to false.
  // Only the initial instance of CudaStreams created with
  // create_on_same_gpus owns streams; all copies of it do not own the
  // streams.
  CudaStreams(const CudaStreams &src)
      : _streams(src._streams), _gpu_indexes(src._gpu_indexes),
        _gpu_count(src._gpu_count), _owns_streams(false) {}

  // Copy assignment; only allowed onto an empty set (or a self-assignment),
  // to avoid silently dropping a previously held stream set.
  CudaStreams &operator=(CudaStreams const &other) {
    PANIC_IF_FALSE(this->_streams == nullptr ||
                       this->_streams == other._streams,
                   "Assigning an already initialized CudaStreams");
    this->_streams = other._streams;
    this->_gpu_indexes = other._gpu_indexes;
    this->_gpu_count = other._gpu_count;

    // Only the initial instance of CudaStreams created with
    // create_on_same_gpus owns streams; all copies of it do not own the
    // streams.
    this->_owns_streams = false;
    return *this;
  }

  // Destroy the streams if they were created by create_on_same_gpus.
  // We require the developer to call `release` on all instances
  // of created streams.
  void release() {
    // If this instance doesn't own streams, there's nothing to do
    // as the streams were created on the Rust side.
    if (_owns_streams) {
      for (uint32_t i = 0; i < _gpu_count; ++i) {
        cuda_destroy_stream(_streams[i], _gpu_indexes[i]);
      }
      delete[] _streams;
      _streams = nullptr;
      delete[] _gpu_indexes;
      _gpu_indexes = nullptr;
    }
  }

  // The destructor checks that streams created with create_on_same_gpus
  // were destroyed manually with `release`.
  ~CudaStreams() {
    // Ensure streams are destroyed
    PANIC_IF_FALSE(
        !_owns_streams || _streams == nullptr,
        "release (this=%p) was not called on a CudaStreams object that "
        "owns its streams (created with create_on_same_gpus), %p",
        this, this->_streams);
  }
};

#endif
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define CUDA_INTEGER_COMPRESSION_H

#include "../../pbs/pbs_enums.h"
#include "../integer.h"

typedef struct {
void *ptr;
Expand All @@ -25,77 +26,65 @@ typedef struct {

extern "C" {
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t pbs_level, uint32_t pbs_base_log,
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
PBS_MS_REDUCTION_T noise_reduction_type);

void cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);

void cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);

void cleanup_cuda_integer_compress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

void cleanup_cuda_integer_decompress_radix_ciphertext_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);

uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
bool allocate_gpu_memory);
CudaStreamsFFI streams, int8_t **mem_ptr,
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
uint32_t carry_modulus, bool allocate_gpu_memory);

void cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
int8_t *mem_ptr);

void cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaLweCiphertextListFFI *lwe_array_out,
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
CudaPackedGlweCiphertextListFFI const *glwe_in,
uint32_t const *indexes_array, int8_t *mem_ptr);

void cleanup_cuda_integer_compress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
int8_t **mem_ptr_void);

void cleanup_cuda_integer_decompress_radix_ciphertext_128(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);
CudaStreamsFFI streams, int8_t **mem_ptr_void);
}

#endif
Loading
Loading