Skip to content

Commit 1dcc3c8

Browse files
chore(gpu): structure to encapsulate streams
1 parent 1a2643d commit 1dcc3c8

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

65 files changed

+4858
-6146
lines changed

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -999,6 +999,11 @@ test_high_level_api_gpu: install_rs_build_toolchain install_cargo_nextest
999999
--test-threads=4 --features=integer,internal-keycache,gpu,zk-pok -p tfhe \
10001000
-E "test(/high_level_api::.*gpu.*/)"
10011001

1002+
test_list_gpu: install_rs_build_toolchain install_cargo_nextest
1003+
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest list --cargo-profile $(CARGO_PROFILE) \
1004+
--features=integer,internal-keycache,gpu,zk-pok -p tfhe \
1005+
-E "test(/.*gpu.*/)"
1006+
10021007
test_high_level_api_hpu: install_rs_build_toolchain install_cargo_nextest
10031008
ifeq ($(HPU_CONFIG), v80)
10041009
RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) nextest run --cargo-profile $(CARGO_PROFILE) \

backends/tfhe-cuda-backend/cuda/include/device.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,7 @@
44
#include <cstdint>
55
#include <cstdio>
66
#include <cstdlib>
7-
#include <cstring>
87
#include <cuda_runtime.h>
9-
#include <vector>
108

119
extern "C" {
1210

@@ -141,4 +139,5 @@ bool cuda_check_support_thread_block_clusters();
141139
template <typename Torus>
142140
void cuda_set_value_async(cudaStream_t stream, uint32_t gpu_index,
143141
Torus *d_array, Torus value, Torus n);
142+
144143
#endif

backends/tfhe-cuda-backend/cuda/include/helper_multi_gpu.h

Lines changed: 142 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
#include <variant>
55
#include <vector>
66

7+
#include "integer/integer.h"
8+
79
extern std::mutex m;
810
extern bool p2p_enabled;
911
extern const int THRESHOLD_MULTI_GPU;
@@ -37,10 +39,149 @@ get_variant_element(const std::variant<std::vector<Torus>, Torus> &variant,
3739
}
3840
}
3941

40-
int get_active_gpu_count(int num_inputs, int gpu_count);
42+
uint32_t get_active_gpu_count(uint32_t num_inputs, uint32_t gpu_count);
4143

4244
int get_num_inputs_on_gpu(int total_num_inputs, int gpu_index, int gpu_count);
4345

4446
int get_gpu_offset(int total_num_inputs, int gpu_index, int gpu_count);
4547

48+
// A Set of GPU Streams and associated GPUs
49+
// Can be constructed from the FFI struct CudaStreamsFFI which
50+
// is only used to pass the streams/gpus at the rust/C interface
51+
// This class should only be constructed from the FFI struct,
52+
// through class methods or through the copy constructor. The class
53+
// can also be constructed as an empty set
54+
struct CudaStreams {
55+
private:
56+
cudaStream_t const *_streams;
57+
uint32_t const *_gpu_indexes;
58+
uint32_t _gpu_count;
59+
bool _owns_streams;
60+
61+
// Prevent the construction of a CudaStreams class from user-code
62+
CudaStreams(cudaStream_t const *streams, uint32_t const *gpu_indexes,
63+
uint32_t gpu_count)
64+
: _streams(streams), _gpu_indexes(gpu_indexes), _gpu_count(gpu_count),
65+
_owns_streams(false) {}
66+
67+
public:
68+
// Construct an empty set. Invalid use of an empty set should raise an error
69+
// right away through asserts or because of a nullptr dereference
70+
CudaStreams()
71+
: _streams(nullptr), _gpu_indexes(nullptr), _gpu_count((uint32_t)-1),
72+
_owns_streams(false) {}
73+
74+
// Returns a subset of this set as an active subset. An active subset is one
75+
// that is temporarily used to perform some computation
76+
CudaStreams active_gpu_subset(int num_radix_blocks) {
77+
return CudaStreams(_streams, _gpu_indexes,
78+
get_active_gpu_count(num_radix_blocks, _gpu_count));
79+
}
80+
81+
// Returns a subset containing only the first gpu of this set. It
82+
// is used to create subset of streams for mono-GPU functions
83+
CudaStreams subset_first_gpu() const {
84+
return CudaStreams(_streams, _gpu_indexes, 1);
85+
}
86+
87+
// Synchronize all the streams in the set
88+
void synchronize() const {
89+
for (uint32_t i = 0; i < _gpu_count; i++) {
90+
cuda_synchronize_stream(_streams[i], _gpu_indexes[i]);
91+
}
92+
}
93+
94+
cudaStream_t stream(uint32_t idx) const {
95+
PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
96+
return _streams[idx];
97+
}
98+
uint32_t gpu_index(uint32_t idx) const {
99+
PANIC_IF_FALSE(idx < _gpu_count, "Invalid GPU index");
100+
return _gpu_indexes[idx];
101+
}
102+
uint32_t count() const { return _gpu_count; }
103+
104+
// Construct from the rust FFI stream set. Streams are created in rust
105+
// using the bindings.
106+
CudaStreams(CudaStreamsFFI &ffi)
107+
: _streams((cudaStream_t *)ffi.streams), _gpu_indexes(ffi.gpu_indexes),
108+
_gpu_count(ffi.gpu_count), _owns_streams(false) {}
109+
110+
// Create a new set of streams on the same gpus as those of the current stream
111+
// set Can be used to parallelize computation by issuing kernels on multiple
112+
// streams on the same GPU
113+
void create_on_same_gpus(const CudaStreams &other) {
114+
PANIC_IF_FALSE(_streams == nullptr,
115+
"Assign clone to non-empty cudastreams");
116+
117+
cudaStream_t *new_streams = new cudaStream_t[other._gpu_count];
118+
119+
uint32_t *gpu_indexes_clone = new uint32_t[_gpu_count];
120+
for (uint32_t i = 0; i < other._gpu_count; ++i) {
121+
new_streams[i] = cuda_create_stream(other._gpu_indexes[i]);
122+
gpu_indexes_clone[i] = other._gpu_indexes[i];
123+
}
124+
125+
this->_streams = new_streams;
126+
this->_gpu_indexes = gpu_indexes_clone;
127+
this->_gpu_count = other._gpu_count;
128+
129+
// Flag this instance as owning streams so that we can destroy
130+
// the streams when they aren't needed anymore
131+
this->_owns_streams = true;
132+
}
133+
134+
// Copy constructor, setting the own flag to false
135+
// Only the initial instance of CudaStreams created with
136+
// assign_clone owns streams, all copies of it do not own the
137+
// streams
138+
CudaStreams(const CudaStreams &src)
139+
: _streams(src._streams), _gpu_indexes(src._gpu_indexes),
140+
_gpu_count(src._gpu_count), _owns_streams(false) {}
141+
142+
CudaStreams &operator=(CudaStreams const &other) {
143+
PANIC_IF_FALSE(this->_streams == nullptr ||
144+
this->_streams == other._streams,
145+
"Assigning an already initialized CudaStreams");
146+
this->_streams = other._streams;
147+
this->_gpu_indexes = other._gpu_indexes;
148+
this->_gpu_count = other._gpu_count;
149+
150+
// Only the initial instance of CudaStreams created with
151+
// assign_clone owns streams, all copies of it do not own the
152+
// streams
153+
this->_owns_streams = false;
154+
return *this;
155+
}
156+
157+
// Destroy the streams if they are created by assign_clone.
158+
// We require the developer to call `destroy` on all instances
159+
// of cloned streams.
160+
void release() {
161+
// If this instance doesn't own streams, there's nothing to do
162+
// as the streams were created on the Rust side.
163+
if (_owns_streams) {
164+
for (uint32_t i = 0; i < _gpu_count; ++i) {
165+
cuda_destroy_stream(_streams[i], _gpu_indexes[i]);
166+
}
167+
delete[] _streams;
168+
_streams = nullptr;
169+
delete[] _gpu_indexes;
170+
_gpu_indexes = nullptr;
171+
}
172+
}
173+
174+
// The destructor checks that streams created with assign_clone
175+
// were destroyed manually with `destroy`.
176+
~CudaStreams() {
177+
// Ensure streams are destroyed
178+
PANIC_IF_FALSE(
179+
!_owns_streams || _streams == nullptr,
180+
"Destroy (this=%p) was not called on a CudaStreams object that "
181+
"is a clone "
182+
"of another one, %p",
183+
this, this->_streams);
184+
}
185+
};
186+
46187
#endif

backends/tfhe-cuda-backend/cuda/include/integer/compression/compression.h

Lines changed: 30 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define CUDA_INTEGER_COMPRESSION_H
33

44
#include "../../pbs/pbs_enums.h"
5+
#include "../integer.h"
56

67
typedef struct {
78
void *ptr;
@@ -25,77 +26,65 @@ typedef struct {
2526

2627
extern "C" {
2728
uint64_t scratch_cuda_integer_compress_radix_ciphertext_64(
28-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
29-
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
30-
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
31-
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
32-
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
33-
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
29+
CudaStreamsFFI streams, int8_t **mem_ptr,
30+
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
31+
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
32+
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
33+
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
3434

3535
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_64(
36-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
37-
int8_t **mem_ptr, uint32_t encryption_glwe_dimension,
38-
uint32_t encryption_polynomial_size, uint32_t compression_glwe_dimension,
39-
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
40-
uint32_t pbs_level, uint32_t pbs_base_log,
36+
CudaStreamsFFI streams, int8_t **mem_ptr,
37+
uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size,
38+
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
39+
uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log,
4140
uint32_t num_blocks_to_decompress, uint32_t message_modulus,
4241
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory,
4342
PBS_MS_REDUCTION_T noise_reduction_type);
4443

4544
void cuda_integer_compress_radix_ciphertext_64(
46-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
47-
CudaPackedGlweCiphertextListFFI *glwe_array_out,
45+
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
4846
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
4947
int8_t *mem_ptr);
5048

5149
void cuda_integer_decompress_radix_ciphertext_64(
52-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
53-
CudaLweCiphertextListFFI *lwe_array_out,
50+
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
5451
CudaPackedGlweCiphertextListFFI const *glwe_in,
5552
uint32_t const *indexes_array, void *const *bsks, int8_t *mem_ptr);
5653

57-
void cleanup_cuda_integer_compress_radix_ciphertext_64(
58-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
59-
int8_t **mem_ptr_void);
54+
void cleanup_cuda_integer_compress_radix_ciphertext_64(CudaStreamsFFI streams,
55+
int8_t **mem_ptr_void);
6056

61-
void cleanup_cuda_integer_decompress_radix_ciphertext_64(
62-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
63-
int8_t **mem_ptr_void);
57+
void cleanup_cuda_integer_decompress_radix_ciphertext_64(CudaStreamsFFI streams,
58+
int8_t **mem_ptr_void);
6459

6560
uint64_t scratch_cuda_integer_compress_radix_ciphertext_128(
66-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
67-
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
68-
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
69-
uint32_t ks_level, uint32_t ks_base_log, uint32_t num_radix_blocks,
70-
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
71-
uint32_t lwe_per_glwe, bool allocate_gpu_memory);
61+
CudaStreamsFFI streams, int8_t **mem_ptr,
62+
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
63+
uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log,
64+
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
65+
PBS_TYPE pbs_type, uint32_t lwe_per_glwe, bool allocate_gpu_memory);
7266

7367
uint64_t scratch_cuda_integer_decompress_radix_ciphertext_128(
74-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
75-
int8_t **mem_ptr, uint32_t compression_glwe_dimension,
76-
uint32_t compression_polynomial_size, uint32_t lwe_dimension,
77-
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
78-
bool allocate_gpu_memory);
68+
CudaStreamsFFI streams, int8_t **mem_ptr,
69+
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size,
70+
uint32_t lwe_dimension, uint32_t num_radix_blocks, uint32_t message_modulus,
71+
uint32_t carry_modulus, bool allocate_gpu_memory);
7972

8073
void cuda_integer_compress_radix_ciphertext_128(
81-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
82-
CudaPackedGlweCiphertextListFFI *glwe_array_out,
74+
CudaStreamsFFI streams, CudaPackedGlweCiphertextListFFI *glwe_array_out,
8375
CudaLweCiphertextListFFI const *lwe_array_in, void *const *fp_ksk,
8476
int8_t *mem_ptr);
8577

8678
void cuda_integer_decompress_radix_ciphertext_128(
87-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
88-
CudaLweCiphertextListFFI *lwe_array_out,
79+
CudaStreamsFFI streams, CudaLweCiphertextListFFI *lwe_array_out,
8980
CudaPackedGlweCiphertextListFFI const *glwe_in,
9081
uint32_t const *indexes_array, int8_t *mem_ptr);
9182

92-
void cleanup_cuda_integer_compress_radix_ciphertext_128(
93-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
94-
int8_t **mem_ptr_void);
83+
void cleanup_cuda_integer_compress_radix_ciphertext_128(CudaStreamsFFI streams,
84+
int8_t **mem_ptr_void);
9585

9686
void cleanup_cuda_integer_decompress_radix_ciphertext_128(
97-
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
98-
int8_t **mem_ptr_void);
87+
CudaStreamsFFI streams, int8_t **mem_ptr_void);
9988
}
10089

10190
#endif

0 commit comments

Comments
 (0)