Commit b2624d1

chore(gpu): refactor the indexing logic for the LWE expand
1 parent 9fb7b56 commit b2624d1

7 files changed: +219 −177 lines changed

backends/tfhe-cuda-backend/cuda/include/zk/zk.h

Lines changed: 0 additions & 7 deletions
@@ -6,13 +6,6 @@
 #include <stdint.h>
 
 extern "C" {
-
-void cuda_lwe_expand_64(void *const stream, uint32_t gpu_index,
-                        void *lwe_array_out, const void *lwe_compact_array_in,
-                        uint32_t lwe_dimension, uint32_t num_lwe,
-                        const uint32_t *lwe_compact_input_indexes,
-                        const uint32_t *output_body_id_per_compact_list);
-
 uint64_t scratch_cuda_expand_without_verification_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
     int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size,

backends/tfhe-cuda-backend/cuda/include/zk/zk_utilities.h

Lines changed: 116 additions & 61 deletions
@@ -5,6 +5,96 @@
 #include "integer/integer.cuh"
 #include <cstdint>
 
+////////////////////////////////////
+// Helper structures used in expand
+template <typename Torus> struct lwe_mask {
+  Torus *mask;
+
+  lwe_mask(Torus *mask) : mask{mask} {}
+};
+
+template <typename Torus> struct compact_lwe_body {
+  Torus *body;
+  uint64_t monomial_degree;
+
+  /* Body id is the index of the body in the compact ciphertext list.
+   * It's used to compute the rotation.
+   */
+  compact_lwe_body(Torus *body, const uint64_t body_id)
+      : body{body}, monomial_degree{body_id} {}
+};
+
+template <typename Torus> struct compact_lwe_list {
+  Torus *ptr;
+  uint32_t lwe_dimension;
+  uint32_t total_num_lwes;
+
+  compact_lwe_list(Torus *ptr, uint32_t lwe_dimension, uint32_t total_num_lwes)
+      : ptr{ptr}, lwe_dimension{lwe_dimension}, total_num_lwes{total_num_lwes} {
+  }
+
+  lwe_mask<Torus> get_mask() { return lwe_mask(ptr); }
+
+  // Returns the index-th body
+  compact_lwe_body<Torus> get_body(uint32_t index) {
+    if (index >= total_num_lwes) {
+      PANIC("index out of range in compact_lwe_list::get_body");
+    }
+
+    return compact_lwe_body(&ptr[lwe_dimension + index], uint64_t(index));
+  }
+};
+
+template <typename Torus> struct flattened_compact_lwe_lists {
+  Torus *d_ptr;
+  Torus **d_ptr_to_compact_list;
+  const uint32_t *h_num_lwes_per_compact_list;
+  uint32_t num_compact_lists;
+  uint32_t lwe_dimension;
+  uint32_t total_num_lwes;
+
+  flattened_compact_lwe_lists(Torus *d_ptr,
+                              const uint32_t *h_num_lwes_per_compact_list,
+                              uint32_t num_compact_lists,
+                              uint32_t lwe_dimension)
+      : d_ptr(d_ptr), h_num_lwes_per_compact_list(h_num_lwes_per_compact_list),
+        num_compact_lists(num_compact_lists), lwe_dimension(lwe_dimension) {
+    d_ptr_to_compact_list =
+        static_cast<Torus **>(malloc(num_compact_lists * sizeof(Torus **)));
+    total_num_lwes = 0;
+    auto curr_list = d_ptr;
+    for (auto i = 0; i < num_compact_lists; ++i) {
+      total_num_lwes += h_num_lwes_per_compact_list[i];
+      d_ptr_to_compact_list[i] = curr_list;
+      curr_list += lwe_dimension + h_num_lwes_per_compact_list[i];
+    }
+  }
+
+  compact_lwe_list<Torus> get_device_compact_list(uint32_t compact_list_index) {
+    if (compact_list_index >= num_compact_lists) {
+      PANIC("index out of range in flattened_compact_lwe_lists::get");
+    }
+
+    return compact_lwe_list(d_ptr_to_compact_list[compact_list_index],
+                            lwe_dimension,
+                            h_num_lwes_per_compact_list[compact_list_index]);
+  }
+};
+
+/*
+ * An expand_job tells the expand kernel exactly which input mask and body to use
+ * and what rotation to apply
+ */
+template <typename Torus> struct expand_job {
+  lwe_mask<Torus> mask_to_use;
+  compact_lwe_body<Torus> body_to_use;
+
+  expand_job(lwe_mask<Torus> mask_to_use, compact_lwe_body<Torus> body_to_use)
+      : mask_to_use{mask_to_use}, body_to_use{body_to_use} {}
+};
+
+////////////////////////////////////
+
 template <typename Torus> struct zk_expand_mem {
   int_radix_params computing_params;
   int_radix_params casting_params;
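The structures added above are host-side views over device memory: each compact list is laid out as one shared mask of lwe_dimension elements followed by its bodies, and flattened_compact_lwe_lists records where each list starts inside the flattened allocation. A minimal sketch of how they compose (the d_flattened device pointer and the sizes are illustrative; the returned pointers are device addresses that are only carried around, never dereferenced, on the host):

// Two compact lists of 3 and 2 LWEs over a flattened device buffer laid out
// as [mask_0 | bodies_0 | mask_1 | bodies_1].
uint32_t h_num_lwes_per_compact_list[2] = {3, 2};
flattened_compact_lwe_lists<uint64_t> lists(
    d_flattened, h_num_lwes_per_compact_list,
    /*num_compact_lists=*/2, /*lwe_dimension=*/2048);

// lists.total_num_lwes == 5; the second list starts at d_flattened + 2048 + 3.
auto list_1 = lists.get_device_compact_list(1);
auto mask_1 = list_1.get_mask();  // the mask shared by every LWE in list 1
auto body_0 = list_1.get_body(0); // device pointer to body 0, monomial_degree == 0
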
@@ -17,11 +107,12 @@ template <typename Torus> struct zk_expand_mem {
   Torus *tmp_expanded_lwes;
   Torus *tmp_ksed_small_to_big_expanded_lwes;
 
-  uint32_t *d_lwe_compact_input_indexes;
-
-  uint32_t *d_body_id_per_compact_list;
   bool gpu_memory_allocated;
 
+  uint32_t *num_lwes_per_compact_list;
+  expand_job<Torus> *d_expand_jobs;
+  expand_job<Torus> *h_expand_jobs;
+
   zk_expand_mem(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                 uint32_t gpu_count, int_radix_params computing_params,
                 int_radix_params casting_params, KS_TYPE casting_key_type,
@@ -33,9 +124,17 @@ template <typename Torus> struct zk_expand_mem {
       casting_key_type(casting_key_type) {
 
     gpu_memory_allocated = allocate_gpu_memory;
+
+    // We copy num_lwes_per_compact_list so we get protection against
+    // num_lwes_per_compact_list being freed while this buffer is still in use
+    this->num_lwes_per_compact_list =
+        (uint32_t *)malloc(num_compact_lists * sizeof(uint32_t));
+    memcpy(this->num_lwes_per_compact_list, num_lwes_per_compact_list,
+           num_compact_lists * sizeof(uint32_t));
+
     num_lwes = 0;
     for (int i = 0; i < num_compact_lists; i++) {
-      num_lwes += num_lwes_per_compact_list[i];
+      num_lwes += this->num_lwes_per_compact_list[i];
     }
 
     if (computing_params.carry_modulus != computing_params.message_modulus) {
@@ -121,49 +220,14 @@ template <typename Torus> struct zk_expand_mem {
         malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
     auto h_lut_indexes = static_cast<Torus *>(
         malloc(num_packed_msgs * num_lwes * sizeof(Torus)));
-    auto h_body_id_per_compact_list =
-        static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
-    auto h_lwe_compact_input_indexes =
-        static_cast<uint32_t *>(malloc(num_lwes * sizeof(uint32_t)));
-
-    d_body_id_per_compact_list =
-        static_cast<uint32_t *>(cuda_malloc_with_size_tracking_async(
-            num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
-            size_tracker, allocate_gpu_memory));
-    d_lwe_compact_input_indexes =
-        static_cast<uint32_t *>(cuda_malloc_with_size_tracking_async(
-            num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
+
+    d_expand_jobs =
+        static_cast<expand_job<Torus> *>(cuda_malloc_with_size_tracking_async(
+            num_lwes * sizeof(expand_job<Torus>), streams[0], gpu_indexes[0],
             size_tracker, allocate_gpu_memory));
 
-    auto compact_list_id = 0;
-    auto idx = 0;
-    auto count = 0;
-    // During flattening, all num_lwes LWEs from all compact lists are stored
-    // sequentially on a Torus array. h_lwe_compact_input_indexes stores the
-    // index of the first LWE related to the compact list that contains the i-th
-    // LWE
-    for (int i = 0; i < num_lwes; i++) {
-      h_lwe_compact_input_indexes[i] = idx;
-      count++;
-      if (count == num_lwes_per_compact_list[compact_list_id]) {
-        compact_list_id++;
-        idx += casting_params.big_lwe_dimension + count;
-        count = 0;
-      }
-    }
-
-    // Stores the index of the i-th LWE (within each compact list) related to
-    // the k-th compact list.
-    auto offset = 0;
-    for (int k = 0; k < num_compact_lists; k++) {
-      auto num_lwes_in_kth_compact_list = num_lwes_per_compact_list[k];
-      uint32_t body_count = 0;
-      for (int i = 0; i < num_lwes_in_kth_compact_list; i++) {
-        h_body_id_per_compact_list[i + offset] = body_count;
-        body_count++;
-      }
-      offset += num_lwes_in_kth_compact_list;
-    }
+    h_expand_jobs = static_cast<expand_job<Torus> *>(
+        malloc(num_lwes * sizeof(expand_job<Torus>)));
 
     /*
      * Each LWE contains encrypted data in both carry and message spaces
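With the two index arrays gone, a single expand_job per output LWE carries everything the kernel needs: a pointer to its list's shared mask and a pointer to the body whose index doubles as the rotation degree. The code that fills h_expand_jobs is not part of this hunk; a hypothetical filling loop inside the constructor, assuming a flattened_compact_lwe_lists instance named flattened_inputs, could look like:

uint32_t job_id = 0;
for (uint32_t k = 0; k < num_compact_lists; k++) {
  auto list = flattened_inputs.get_device_compact_list(k);
  for (uint32_t i = 0; i < list.total_num_lwes; i++) {
    // Pair the k-th list's mask with its i-th body; the body index is the
    // monomial degree the kernel will rotate the mask by.
    h_expand_jobs[job_id++] = expand_job<Torus>(list.get_mask(), list.get_body(i));
  }
}
// The jobs are then pushed to d_expand_jobs (e.g. with
// cuda_memcpy_with_size_tracking_async_to_gpu) before lwe_expand is launched.
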
@@ -198,9 +262,9 @@ template <typename Torus> struct zk_expand_mem {
      * num_packed_msgs to use the sanitization LUT (which ensures output is
      * exactly 0 or 1).
      */
-    offset = 0;
+    auto offset = 0;
     for (int k = 0; k < num_compact_lists; k++) {
-      auto num_lwes_in_kth = num_lwes_per_compact_list[k];
+      auto num_lwes_in_kth = this->num_lwes_per_compact_list[k];
       for (int i = 0; i < num_packed_msgs * num_lwes_in_kth; i++) {
         auto lwe_index = i + num_packed_msgs * offset;
         auto lwe_index_in_list = i % num_lwes_in_kth;
@@ -220,17 +284,9 @@ template <typename Torus> struct zk_expand_mem {
         streams[0], gpu_indexes[0], h_indexes_in, h_indexes_out);
     auto lut_indexes = message_and_carry_extract_luts->get_lut_indexes(0, 0);
 
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        d_lwe_compact_input_indexes, h_lwe_compact_input_indexes,
-        num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
-        allocate_gpu_memory);
     cuda_memcpy_with_size_tracking_async_to_gpu(
         lut_indexes, h_lut_indexes, num_packed_msgs * num_lwes * sizeof(Torus),
         streams[0], gpu_indexes[0], allocate_gpu_memory);
-    cuda_memcpy_with_size_tracking_async_to_gpu(
-        d_body_id_per_compact_list, h_body_id_per_compact_list,
-        num_lwes * sizeof(uint32_t), streams[0], gpu_indexes[0],
-        allocate_gpu_memory);
 
     auto active_gpu_count = get_active_gpu_count(2 * num_lwes, gpu_count);
     message_and_carry_extract_luts->broadcast_lut(streams, gpu_indexes,
@@ -253,8 +309,6 @@ template <typename Torus> struct zk_expand_mem {
     free(h_indexes_in);
     free(h_indexes_out);
     free(h_lut_indexes);
-    free(h_body_id_per_compact_list);
-    free(h_lwe_compact_input_indexes);
   }
 
   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
@@ -263,15 +317,16 @@ template <typename Torus> struct zk_expand_mem {
     message_and_carry_extract_luts->release(streams, gpu_indexes, gpu_count);
     delete message_and_carry_extract_luts;
 
-    cuda_drop_with_size_tracking_async(d_body_id_per_compact_list, streams[0],
-                                       gpu_indexes[0], gpu_memory_allocated);
-    cuda_drop_with_size_tracking_async(d_lwe_compact_input_indexes, streams[0],
-                                       gpu_indexes[0], gpu_memory_allocated);
     cuda_drop_with_size_tracking_async(tmp_expanded_lwes, streams[0],
                                        gpu_indexes[0], gpu_memory_allocated);
     cuda_drop_with_size_tracking_async(tmp_ksed_small_to_big_expanded_lwes,
                                        streams[0], gpu_indexes[0],
                                        gpu_memory_allocated);
+    cuda_drop_with_size_tracking_async(d_expand_jobs, streams[0],
+                                       gpu_indexes[0], gpu_memory_allocated);
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+    free(num_lwes_per_compact_list);
+    free(h_expand_jobs);
   }
 };
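Note the ordering in release: the device-side job buffer is dropped stream-asynchronously, then the stream is synchronized before the host-side buffers are freed, since asynchronous copies issued from h_expand_jobs may still be in flight. The same pattern expressed with the plain CUDA runtime (a generic sketch, not the backend's size-tracking wrappers):

cudaMemcpyAsync(d_buf, h_buf, bytes, cudaMemcpyHostToDevice, stream);
// ... kernels enqueued on the stream consume d_buf ...
cudaFreeAsync(d_buf, stream);   // stream-ordered device free
cudaStreamSynchronize(stream);  // ensure no pending operation still reads h_buf
free(h_buf);                    // only now is the host staging buffer safe to free
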

backends/tfhe-cuda-backend/cuda/src/zk/expand.cu

Lines changed: 0 additions & 65 deletions
This file was deleted.

backends/tfhe-cuda-backend/cuda/src/zk/expand.cuh

Lines changed: 12 additions & 19 deletions
@@ -5,39 +5,35 @@
 #include "polynomial/functions.cuh"
 #include "polynomial/polynomial_math.cuh"
 #include "zk/zk.h"
+#include "zk/zk_utilities.h"
 #include <cstdint>
 
-#include "utils/helper.cuh"
-
 // Expand a LweCompactCiphertextList into a LweCiphertextList
 // - Each x-block computes one output ciphertext
 template <typename Torus, class params>
-__global__ void lwe_expand(Torus const *lwe_compact_array_in,
-                           Torus *lwe_array_out,
-                           const uint32_t *lwe_compact_input_indexes,
-                           const uint32_t *output_body_id_per_compact_list) {
+__global__ void lwe_expand(const expand_job<Torus> *jobs,
+                           Torus *lwe_array_out) {
   const auto lwe_output_id = blockIdx.x;
   const auto lwe_dimension = params::degree;
 
-  const auto body_id = output_body_id_per_compact_list[lwe_output_id];
+  const auto job = jobs[lwe_output_id];
 
-  const auto input_mask =
-      &lwe_compact_array_in[lwe_compact_input_indexes[lwe_output_id]];
-  const auto input_body = &input_mask[lwe_dimension + body_id];
+  const lwe_mask<Torus> input_mask = job.mask_to_use;
+  const compact_lwe_body<Torus> input_body = job.body_to_use;
 
   auto output_mask = &lwe_array_out[(lwe_dimension + 1) * lwe_output_id];
   auto output_body = &output_mask[lwe_dimension];
 
   // We rotate the input mask by i to calculate the mask related to the i-th
   // output
-  const auto monomial_degree = body_id;
+  const auto monomial_degree = input_body.monomial_degree;
   polynomial_accumulate_monic_monomial_mul<Torus>(
-      output_mask, input_mask, monomial_degree, threadIdx.x, lwe_dimension,
+      output_mask, input_mask.mask, monomial_degree, threadIdx.x, lwe_dimension,
       params::opt, true);
 
   // The output body is just copied
   if (threadIdx.x == 0)
-    *output_body = *input_body;
+    *output_body = *input_body.body;
 }
 
 template <typename Torus> bool is_power_of_2(Torus value) {
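Each output mask is the compact list's shared mask multiplied by the monic monomial X^monomial_degree in the negacyclic ring Z[X]/(X^N + 1), which is what polynomial_accumulate_monic_monomial_mul computes across the threads of the block. A single-threaded CPU reference of that rotation (a sketch; the device routine in polynomial_math.cuh splits the work over threadIdx.x and may differ in details such as accumulation):

template <typename Torus>
void rotate_mask_by_monomial(Torus *out, const Torus *mask, uint64_t degree,
                             uint32_t lwe_dimension) {
  for (uint32_t i = 0; i < lwe_dimension; i++) {
    uint64_t j = (i + degree) % (2 * lwe_dimension);
    if (j < lwe_dimension)
      out[j] = mask[i];                   // plain coefficient rotation
    else
      out[j - lwe_dimension] = -mask[i];  // wrap-around picks up a sign flip
  }
}
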
@@ -46,9 +42,7 @@ template <typename Torus> bool is_power_of_2(Torus value) {
 
 template <typename Torus, class params>
 void host_lwe_expand(cudaStream_t stream, int gpu_index, Torus *lwe_array_out,
-                     const Torus *lwe_compact_array_in, uint32_t num_lwes,
-                     const uint32_t *lwe_compact_input_indexes,
-                     const uint32_t *output_body_id_per_compact_list) {
+                     const expand_job<Torus> *d_jobs, uint32_t num_lwes) {
   // Set the GPU device
   cudaSetDevice(gpu_index);
 
@@ -63,9 +57,8 @@ void host_lwe_expand(cudaStream_t stream, int gpu_index, Torus *lwe_array_out,
     PANIC("Error: lwe_dimension must be a power of 2");
 
   // Launch the `lwe_expand` kernel
-  lwe_expand<Torus, params><<<num_blocks, threads_per_block, 0, stream>>>(
-      lwe_compact_array_in, lwe_array_out, lwe_compact_input_indexes,
-      output_body_id_per_compact_list);
+  lwe_expand<Torus, params>
+      <<<num_blocks, threads_per_block, 0, stream>>>(d_jobs, lwe_array_out);
   check_cuda_error(cudaGetLastError());
 }
 #endif // EXPAND_CUH
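A hypothetical call site for the new signature (the params class below is illustrative only; the kernel reads its static degree and opt members, and the real caller dispatches on the casting LWE dimension):

struct expand_params {
  static constexpr int degree = 2048; // casting LWE dimension (illustrative)
  static constexpr int opt = 4;       // coefficients handled per thread (illustrative)
};

host_lwe_expand<uint64_t, expand_params>(stream, gpu_index, d_lwe_array_out,
                                         d_expand_jobs, num_lwes);
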
