Skip to content

Edm/prepare count of consecutive bits #2362

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };

// Flag selector with three values: FLAG_NONE (0), FLAG_OVERFLOW (1) and
// FLAG_CARRY (2).
// NOTE(review): presumably tells an arithmetic routine which extra output
// (none, overflow, or carry) to produce — confirm against the call sites.
enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };

// Direction used by the "prepare count of consecutive bits" operation.
// The host implementation calls host_radix_blocks_reverse_inplace on its
// intermediate ciphertext when the buffer's direction is LEADING, and skips
// that call for TRAILING.
enum Direction { TRAILING = 0, LEADING = 1 };

// Bit value passed to the "prepare count of consecutive bits" scratch
// function together with a Direction.
// NOTE(review): presumably the bit (0 or 1) whose consecutive run is being
// counted — confirm against int_prepare_count_of_consecutive_bits_buffer.
enum BitValue { ZERO = 0, ONE = 1 };

extern "C" {

typedef struct {
Expand Down Expand Up @@ -538,5 +542,26 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

// C entry point that creates the scratch buffer for the "prepare count of
// consecutive bits" operation (64-bit variant).
//
// The new buffer is stored through `mem_ptr` as an opaque int8_t handle; the
// same handle is later passed to the matching host_ and cleanup_ functions.
// `dir` and `bit_value` configure the buffer; the remaining scalar arguments
// are the cryptographic parameters used to build the radix parameters.
// Returns a uint64_t size figure reported by the buffer's size tracker.
// NOTE(review): presumably the number of bytes of GPU memory required or
// allocated — confirm against the buffer constructor.
uint64_t scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
int8_t **mem_ptr, const uint32_t num_radix_blocks, const Direction dir,
const BitValue bit_value, const bool allocate_gpu_memory,
const uint32_t glwe_dimension, const uint32_t polynomial_size,
const uint32_t lwe_dimension, const uint32_t ks_level,
const uint32_t ks_base_log, const uint32_t pbs_level,
const uint32_t pbs_base_log, const uint32_t grouping_factor,
const uint32_t message_modulus, const uint32_t carry_modulus,
const PBS_TYPE pbs_type, const bool allocate_ms_array);

// C entry point that runs the "prepare count of consecutive bits" operation
// (64-bit variant) on `input` and writes the result to `output`.
//
// `mem_ptr` is the opaque handle produced by the matching scratch_ function.
// `bsks` and `ksks` are key arrays, and `ms_noise_reduction_key` is forwarded
// unchanged to the underlying lookup-table and prefix-sum routines.
// NOTE(review): `bsks`/`ksks` are presumably the bootstrapping and
// keyswitching keys, one per GPU — confirm against the callers.
void host_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
int8_t **mem_ptr, void *const *bsks, void *const *ksks,
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);

// C entry point that releases the scratch buffer created by
// scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb_64.
// `mem_ptr_void` is the opaque handle that the scratch function filled in.
void cleanup_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

} // extern C
#endif // CUDA_INTEGER_H
335 changes: 262 additions & 73 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Large diffs are not rendered by default.

46 changes: 46 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include "ilog2.cuh"

/*
 * 64-bit C wrapper around
 * scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb<uint64_t>.
 *
 * Packs the individual cryptographic parameters into an int_radix_params
 * (whose fourth constructor argument is glwe_dimension * polynomial_size),
 * reinterprets the opaque stream and buffer handles as their typed
 * counterparts, and forwards everything to the templated implementation.
 *
 * Returns whatever the templated scratch function returns.
 */
uint64_t scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
    int8_t **mem_ptr, const uint32_t num_radix_blocks, const Direction dir,
    const BitValue bit_value, const bool allocate_gpu_memory,
    const uint32_t glwe_dimension, const uint32_t polynomial_size,
    const uint32_t lwe_dimension, const uint32_t ks_level,
    const uint32_t ks_base_log, const uint32_t pbs_level,
    const uint32_t pbs_base_log, const uint32_t grouping_factor,
    const uint32_t message_modulus, const uint32_t carry_modulus,
    const PBS_TYPE pbs_type, const bool allocate_ms_array) {

  using buffer_t = int_prepare_count_of_consecutive_bits_buffer<uint64_t>;

  // Product handed to int_radix_params as its fourth argument.
  const uint32_t glwe_poly_product = glwe_dimension * polynomial_size;

  int_radix_params radix_params(
      pbs_type, glwe_dimension, polynomial_size, glwe_poly_product,
      lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log,
      grouping_factor, message_modulus, carry_modulus, allocate_ms_array);

  // The C ABI only exposes untyped handles; recover the typed views here.
  cudaStream_t *cuda_streams = (cudaStream_t *)streams;
  buffer_t **typed_mem = (buffer_t **)mem_ptr;

  return scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, typed_mem, num_radix_blocks,
      radix_params, dir, bit_value, allocate_gpu_memory);
}

/*
 * 64-bit C wrapper around
 * host_cuda_prepare_count_of_consecutive_bits_buffer_kb<uint64_t>.
 *
 * Reinterprets the untyped stream, buffer and keyswitch-key handles as their
 * typed counterparts and forwards every argument, in the same order, to the
 * templated implementation. `bsks` and `ms_noise_reduction_key` are passed
 * through untouched.
 */
void host_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
    CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
    int8_t **mem_ptr, void *const *bsks, void *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  using buffer_t = int_prepare_count_of_consecutive_bits_buffer<uint64_t>;

  // Typed views over the opaque handles received through the C ABI.
  cudaStream_t *cuda_streams = (cudaStream_t *)streams;
  buffer_t **typed_mem = (buffer_t **)mem_ptr;
  uint64_t **typed_ksks = (uint64_t **)ksks;

  host_cuda_prepare_count_of_consecutive_bits_buffer_kb<uint64_t>(
      cuda_streams, gpu_indexes, gpu_count, output, input, typed_mem, bsks,
      typed_ksks, ms_noise_reduction_key);
}

/*
 * C entry point that tears down the buffer created by
 * scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb_64.
 *
 * streams / gpu_indexes / gpu_count: forwarded to the buffer's release().
 * mem_ptr: address of the opaque handle filled in by the scratch function.
 *          On return the handle is reset to nullptr so that a stale pointer
 *          cannot be reused or cleaned up a second time.
 *
 * Calling this with a null handle (or a null handle address) is a no-op.
 */
void cleanup_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr) {

  // Nothing to release if the buffer was never created or was already
  // cleaned up.
  if (mem_ptr == nullptr || *mem_ptr == nullptr)
    return;

  auto *buf =
      (int_prepare_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr);

  buf->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);

  // The scratch function allocates the buffer object itself with `new`, so
  // it has to be destroyed here; release() alone would leak the host object.
  // NOTE(review): assumes release() frees only the resources owned by the
  // buffer and does not destroy the object itself — confirm.
  delete buf;
  *mem_ptr = nullptr;
}
49 changes: 49 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/ilog2.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#ifndef ILOG2_CUH
#define ILOG2_CUH

#include "device.h"
#include "integer.cuh"
#include "integer/integer_utilities.h"
#include <inttypes.h>

/*
 * Allocates an int_prepare_count_of_consecutive_bits_buffer<Torus> on the
 * host heap and stores it in *mem_ptr.
 *
 * The constructor receives the streams, GPU indexes and count, the direction
 * and bit value, the radix parameters, the block count, the
 * allocate_gpu_memory flag and a pointer to a size tracker that starts at 0.
 *
 * Returns the value left in the size tracker once construction is done.
 */
template <typename Torus>
__host__ uint64_t scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count,
    int_prepare_count_of_consecutive_bits_buffer<Torus> **mem_ptr,
    uint32_t num_radix_blocks, int_radix_params params, Direction dir,
    BitValue bit_value, bool allocate_gpu_memory) {

  using buffer_t = int_prepare_count_of_consecutive_bits_buffer<Torus>;

  // Filled in by the buffer constructor.
  uint64_t tracked_size = 0;

  buffer_t *buffer =
      new buffer_t(streams, gpu_indexes, gpu_count, dir, bit_value, params,
                   num_radix_blocks, allocate_gpu_memory, &tracked_size);
  *mem_ptr = buffer;

  return tracked_size;
}

/*
 * Runs the "prepare count of consecutive bits" pipeline in three steps:
 *   1. apply the buffer's univariate lookup table to `input`, writing the
 *      result into the buffer's `copy_ct`;
 *   2. if the buffer's direction is LEADING, call
 *      host_radix_blocks_reverse_inplace on `copy_ct`;
 *   3. call host_compute_prefix_sum_hillis_steele on `copy_ct` with the
 *      buffer's bivariate lookup table, writing into `output`.
 *
 * Steps 1 and 3 are both given output->num_radix_blocks as the block count.
 */
template <typename Torus>
__host__ void host_cuda_prepare_count_of_consecutive_bits_buffer_kb(
    cudaStream_t const *streams, uint32_t const *gpu_indexes,
    uint32_t gpu_count, CudaRadixCiphertextFFI *output,
    CudaRadixCiphertextFFI const *input,
    int_prepare_count_of_consecutive_bits_buffer<Torus> **mem_ptr,
    void *const *bsks, Torus *const *ksks,
    CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {

  // Single dereference of the handle; every step below works on this buffer.
  int_prepare_count_of_consecutive_bits_buffer<Torus> *scratch = *mem_ptr;

  // Step 1: univariate LUT, input -> copy_ct.
  integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, scratch->copy_ct, input, bsks, ksks,
      ms_noise_reduction_key, scratch->uni_lut, output->num_radix_blocks);

  // Step 2: only the LEADING direction goes through the in-place reversal.
  const bool is_leading = (scratch->dir == Direction::LEADING);
  if (is_leading) {
    host_radix_blocks_reverse_inplace<Torus>((cudaStream_t *)streams,
                                             gpu_indexes, scratch->copy_ct);
  }

  // Step 3: prefix sum, copy_ct -> output.
  host_compute_prefix_sum_hillis_steele<Torus>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output,
      scratch->copy_ct, scratch->bi_lut, bsks, ksks, ms_noise_reduction_key,
      output->num_radix_blocks);
}

#endif
5 changes: 0 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
Original file line number Diff line number Diff line change
Expand Up @@ -234,11 +234,6 @@ void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
if (radix_lwe_vec->num_radix_blocks % radix_lwe_out->num_radix_blocks != 0)
PANIC("Cuda error: input vector length should be a multiple of the "
"output's number of radix blocks")
// FIXME: this should not be necessary, we should make sure sum_ctxt works in
// the general case
for (int i = 0; i < radix_lwe_vec->num_radix_blocks; i++) {
radix_lwe_vec->degrees[i] = mem->params.message_modulus - 1;
}
switch (mem->params.polynomial_size) {
case 512:
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
Expand Down
Loading
Loading