zama-ai
diff --git a/‎Makefile‎
Lines changed: 2 additions & 2 deletions b/‎Makefile‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h‎
Lines changed: 35 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h‎
Lines changed: 25 additions & 0 deletions b/‎backends/tfhe-cuda-backend/cuda/include/integer/rerand_utilities.h‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h‎
Lines changed: 10 additions & 1 deletion b/‎backends/tfhe-cuda-backend/cuda/include/keyswitch/keyswitch.h‎
Lines changed: 10 additions & 1 deletion
diff --git a/‎backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu‎
Lines changed: 32 additions & 6 deletions b/‎backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu‎
Lines changed: 32 additions & 6 deletions
@@ -773,7 +773,7 @@ build_debug_integer_short_run_gpu: install_rs_check_toolchain install_cargo_next
 	RUSTFLAGS="$(RUSTFLAGS)" cargo $(CARGO_RS_BUILD_TOOLCHAIN) test --profile debug_lto_off \
 		--features=integer,gpu-debug-fake-multi-gpu -p tfhe -- integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence::test_gpu_short_random --list
 	@echo "To debug fake-multi-gpu short run tests run:"
-	@echo "TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE <executable> integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence::test_gpu_short_random_op_sequence_param_gpu_multi_bit_group_4_message_2_carry_2_ks_pbs_tuniform_2m128 --nocapture"
+	@echo "TFHE_RS_LONGRUN_TESTS_SEED=<SEED_FROM_CI> TFHE_RS_TEST_LONG_TESTS_MINIMAL=TRUE <executable> integer::gpu::server_key::radix::tests_long_run::test_random_op_sequence::test_gpu_short_random_op_sequence_param_gpu_multi_bit_group_4_message_2_carry_2_ks_pbs_tuniform_2m128 --nocapture"
 	@echo "Where <executable> = the one printed in the () in the 'Running unittests src/lib.rs ()' line above"
 
 .PHONY: test_integer_compression
@@ -806,7 +806,7 @@ test_unsigned_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 	NIGHTLY_TESTS="$(NIGHTLY_TESTS)" \
 		./scripts/integer-tests.sh --rust-toolchain $(CARGO_RS_CHECK_TOOLCHAIN) \
 		--cargo-profile "$(CARGO_PROFILE)" --backend "gpu" \
-		--unsigned-only --tfhe-package "tfhe"
+		--unsigned-only --tfhe-package "tfhe" -- --nocapture
 
 .PHONY: test_signed_integer_gpu_ci # Run the tests for signed integer ci on gpu backend
 test_signed_integer_gpu_ci: install_rs_check_toolchain install_cargo_nextest
 
@@ -13,6 +13,8 @@
 
 #include <stdio.h>
 
+#include "crypto/keyswitch.cuh"
+
 class NoiseLevel {
 public:
   // Constants equivalent to the Rust code
@@ -336,7 +338,11 @@ struct int_radix_lut_custom_input_output {
   std::vector<InputTorus *> lwe_after_ks_vec;
   std::vector<OutputTorus *> lwe_after_pbs_vec;
   std::vector<InputTorus *> lwe_trivial_indexes_vec;
+  std::vector<ks_mem<InputTorus> *>
+      ks_tmp_buf_vec; // buffers on each GPU to store keyswitch temporary data
+
   std::vector<InputTorus *> lwe_aligned_vec;
+  uint64_t numSamplesKsTmp = 0;
 
   bool gpu_memory_allocated;
 
@@ -439,6 +445,26 @@ struct int_radix_lut_custom_input_output {
     multi_gpu_copy_array_async(active_streams, lwe_trivial_indexes_vec,
                                lwe_trivial_indexes, num_radix_blocks,
                                allocate_gpu_memory);
+
+    auto inputs_on_gpu = std::max(
+        THRESHOLD_MULTI_GPU,
+        get_num_inputs_on_gpu(num_radix_blocks, 0, active_streams.count()));
+
+    this->numSamplesKsTmp = inputs_on_gpu;
+    if (inputs_on_gpu >= 144) {
+      for (auto i = 0; i < active_streams.count(); ++i) {
+        ks_mem<InputTorus> *ks_buffer;
+        uint64_t sub_size_tracker = scratch_cuda_keyswitch<InputTorus>(
+            active_streams.stream(i), active_streams.gpu_index(i), &ks_buffer,
+            params.small_lwe_dimension, params.big_lwe_dimension, num_blocks,
+            allocate_gpu_memory);
+
+        if (i == 0) {
+          size_tracker += sub_size_tracker;
+        }
+        ks_tmp_buf_vec.push_back(ks_buffer);
+      }
+    }
   }
 
   void setup_mem_reuse(uint32_t num_radix_blocks,
@@ -459,6 +485,8 @@ struct int_radix_lut_custom_input_output {
     lwe_after_pbs_vec = base_lut_object->lwe_after_pbs_vec;
     lwe_trivial_indexes_vec = base_lut_object->lwe_trivial_indexes_vec;
 
+    ks_tmp_buf_vec = base_lut_object->ks_tmp_buf_vec;
+
     mem_reuse = true;
   }
 
@@ -861,6 +889,13 @@ struct int_radix_lut_custom_input_output {
         }
         lwe_aligned_vec.clear();
       }
+
+      for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
+        cleanup_cuda_keyswitch(active_streams.stream(i),
+                               active_streams.gpu_index(i), ks_tmp_buf_vec[i],
+                               gpu_memory_allocated);
+      }
+      ks_tmp_buf_vec.clear();
     }
     free(h_lut_indexes);
     free(degrees);
 
@@ -15,6 +15,9 @@ template <typename Torus> struct int_rerand_mem {
 
   bool gpu_memory_allocated;
 
+  std::vector<ks_mem<Torus> *>
+      ks_tmp_buf_vec; // buffers on each GPU to store keyswitch temporary data
+
   expand_job<Torus> *d_expand_jobs;
   expand_job<Torus> *h_expand_jobs;
 
@@ -56,6 +59,21 @@ template <typename Torus> struct int_rerand_mem {
 
     cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
 
+    for (auto i = 0; i < streams.count(); ++i) {
+      ks_mem<Torus> *ks_buffer;
+      uint64_t sub_size_tracker = scratch_cuda_keyswitch<Torus>(
+          streams.stream(i), streams.gpu_index(i), &ks_buffer,
+          params.small_lwe_dimension, params.big_lwe_dimension, num_lwes,
+          allocate_gpu_memory);
+
+      if (i == 0) {
+        size_tracker += sub_size_tracker;
+      }
+      ks_tmp_buf_vec.push_back(ks_buffer);
+    }
+
+    streams.synchronize();
+
     free(h_lwe_trivial_indexes);
   }
 
@@ -72,6 +90,13 @@ template <typename Torus> struct int_rerand_mem {
     cuda_drop_with_size_tracking_async(d_expand_jobs, streams.stream(0),
                                        streams.gpu_index(0),
                                        gpu_memory_allocated);
+
+    for (auto i = 0; i < ks_tmp_buf_vec.size(); i++) {
+      cleanup_cuda_keyswitch(streams.stream(i), streams.gpu_index(i),
+                             ks_tmp_buf_vec[i], gpu_memory_allocated);
+    }
+    ks_tmp_buf_vec.clear();
+
     cuda_synchronize_stream(streams.stream(0), streams.gpu_index(0));
     free(h_expand_jobs);
   }
 
@@ -17,13 +17,22 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void const *lwe_output_indexes, void const *lwe_array_in,
     void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples);
+    uint32_t num_samples, const void *ks_tmp_buffer, bool uses_trivial_indexes);
 
 uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
     void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t num_lwes, bool allocate_gpu_memory);
 
+uint64_t scratch_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                                   void **ks_tmp_memory,
+                                   uint32_t lwe_dimension_in,
+                                   uint32_t lwe_dimension_out,
+                                   uint32_t num_lwes, bool allocate_gpu_memory);
+
+void cleanup_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                               void **ks_tmp_memory, bool allocate_gpu_memory);
+
 void cuda_packing_keyswitch_lwe_list_to_glwe_64(
     void *stream, uint32_t gpu_index, void *glwe_array_out,
     void const *lwe_array_in, void const *fp_ksk_array, int8_t *fp_ks_buffer,
 
@@ -9,14 +9,16 @@ void cuda_keyswitch_lwe_ciphertext_vector_32(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
     void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes,
     void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out,
-    uint32_t base_log, uint32_t level_count, uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector<uint32_t>(
+    uint32_t base_log, uint32_t level_count, uint32_t num_samples,
+    void *ksk_tmp_buffer, bool uses_trivial_indices) {
+  host_gemm_keyswitch_lwe_ciphertext_vector<uint32_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint32_t *>(lwe_array_out),
       static_cast<uint32_t *>(lwe_output_indexes),
       static_cast<uint32_t *>(lwe_array_in),
       static_cast<uint32_t *>(lwe_input_indexes), static_cast<uint32_t *>(ksk),
-      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples);
+      lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples,
+      static_cast<uint32_t *>(ksk_tmp_buffer), uses_trivial_indices);
 }
 
 /* Perform keyswitch on a batch of 64 bits input LWE ciphertexts.
@@ -40,15 +42,19 @@ void cuda_keyswitch_lwe_ciphertext_vector_64(
     void const *lwe_output_indexes, void const *lwe_array_in,
     void const *lwe_input_indexes, void const *ksk, uint32_t lwe_dimension_in,
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples) {
-  host_keyswitch_lwe_ciphertext_vector<uint64_t>(
+    uint32_t num_samples, const void *ks_tmp_buffer,
+    bool uses_trivial_indices) {
+
+  host_gemm_keyswitch_lwe_ciphertext_vector<uint64_t>(
       static_cast<cudaStream_t>(stream), gpu_index,
       static_cast<uint64_t *>(lwe_array_out),
       static_cast<const uint64_t *>(lwe_output_indexes),
       static_cast<const uint64_t *>(lwe_array_in),
       static_cast<const uint64_t *>(lwe_input_indexes),
       static_cast<const uint64_t *>(ksk), lwe_dimension_in, lwe_dimension_out,
-      base_log, level_count, num_samples);
+      base_log, level_count, num_samples,
+      static_cast<const ks_mem<uint64_t> *>(ks_tmp_buffer)->d_buffer,
+      uses_trivial_indices);
 }
 
 uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
@@ -60,6 +66,26 @@ uint64_t scratch_packing_keyswitch_lwe_list_to_glwe_64(
       glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory);
 }
 
+/* Untyped (void *) wrapper that prepares the temporary memory used by the
+ * 64-bit LWE keyswitch.
+ *
+ * The stream and the output handle are converted to their typed forms, then
+ * every argument is forwarded unchanged to scratch_cuda_keyswitch<uint64_t>.
+ * On return, *ks_tmp_buffer refers to the ks_mem<uint64_t> object produced by
+ * that call.
+ *
+ * Returns whatever scratch_cuda_keyswitch<uint64_t> returns (presumably the
+ * tracked allocation size in bytes -- confirm against its definition).
+ */
+uint64_t scratch_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                                   void **ks_tmp_buffer,
+                                   uint32_t lwe_dimension_in,
+                                   uint32_t lwe_dimension_out,
+                                   uint32_t num_lwes,
+                                   bool allocate_gpu_memory) {
+  auto typed_stream = static_cast<cudaStream_t>(stream);
+  // void ** -> ks_mem<uint64_t> ** is a pointer reinterpretation; this is
+  // exactly what the C-style cast performed.
+  auto typed_handle = reinterpret_cast<ks_mem<uint64_t> **>(ks_tmp_buffer);
+
+  return scratch_cuda_keyswitch<uint64_t>(
+      typed_stream, gpu_index, typed_handle, lwe_dimension_in,
+      lwe_dimension_out, num_lwes, allocate_gpu_memory);
+}
+
+/* Untyped (void *) wrapper that releases the 64-bit keyswitch temporary
+ * memory referenced by *ks_tmp_buffer.
+ *
+ * The handle is forwarded to cleanup_cuda_keyswitch<uint64_t> together with
+ * the stream, GPU index and allocate_gpu_memory flag, and is then reset to
+ * nullptr so the caller cannot reuse a released pointer.
+ *
+ * Calling this with a null handle, or with a handle that has already been
+ * cleaned up, is a no-op.
+ */
+void cleanup_cuda_keyswitch_64(void *stream, uint32_t gpu_index,
+                               void **ks_tmp_buffer, bool allocate_gpu_memory) {
+  // Nothing to release: either no handle was supplied or it was already
+  // cleared by a previous cleanup call.
+  if (ks_tmp_buffer == nullptr || *ks_tmp_buffer == nullptr)
+    return;
+
+  cleanup_cuda_keyswitch<uint64_t>(
+      static_cast<cudaStream_t>(stream), gpu_index,
+      static_cast<ks_mem<uint64_t> *>(*ks_tmp_buffer), allocate_gpu_memory);
+
+  // Clear the caller's handle so a repeated cleanup hits the guard above.
+  *ks_tmp_buffer = nullptr;
+}
+
 /* Perform functional packing keyswitch on a batch of 64 bits input LWE
  * ciphertexts.
  */