Skip to content

Commit 875c957

Browse files
committed
chore(gpu): refactor prepare_count_of_consecutive_bits
1 parent 02465c0 commit 875c957

File tree

8 files changed

+529
-79
lines changed

8 files changed

+529
-79
lines changed

backends/tfhe-cuda-backend/cuda/include/integer/integer.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
3737

3838
enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };
3939

40+
enum Direction { TRAILING = 0, LEADING = 1 };
41+
42+
enum BitValue { ZERO = 0, ONE = 1 };
43+
4044
extern "C" {
4145

4246
typedef struct {
@@ -538,5 +542,26 @@ void cleanup_cuda_integer_is_at_least_one_comparisons_block_true(
538542
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
539543
int8_t **mem_ptr_void);
540544

545+
uint64_t scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
546+
void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
547+
int8_t **mem_ptr, const uint32_t num_radix_blocks, const Direction dir,
548+
const BitValue bit_value, const bool allocate_gpu_memory,
549+
const uint32_t glwe_dimension, const uint32_t polynomial_size,
550+
const uint32_t lwe_dimension, const uint32_t ks_level,
551+
const uint32_t ks_base_log, const uint32_t pbs_level,
552+
const uint32_t pbs_base_log, const uint32_t grouping_factor,
553+
const uint32_t message_modulus, const uint32_t carry_modulus,
554+
const PBS_TYPE pbs_type, const bool allocate_ms_array);
555+
556+
void host_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
557+
void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
558+
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
559+
int8_t **mem_ptr, void *const *bsks, void *const *ksks,
560+
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key);
561+
562+
void cleanup_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
563+
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
564+
int8_t **mem_ptr_void);
565+
541566
} // extern C
542567
#endif // CUDA_INTEGER_H

backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
#ifndef CUDA_INTEGER_UTILITIES_H
22
#define CUDA_INTEGER_UTILITIES_H
33

4+
#include <cinttypes>
5+
46
#include "integer.h"
57
#include "integer/radix_ciphertext.cuh"
68
#include "integer/radix_ciphertext.h"
@@ -17,6 +19,10 @@ class NoiseLevel {
1719
static const uint64_t UNKNOWN = std::numeric_limits<uint64_t>::max();
1820
};
1921

22+
constexpr BitValue opposite(const BitValue bit) noexcept {
23+
return (bit == BitValue::ZERO) ? BitValue::ONE : BitValue::ZERO;
24+
}
25+
2026
template <typename Torus>
2127
__global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
2228
uint32_t value, uint32_t blocks_count,
@@ -4400,6 +4406,94 @@ template <typename Torus> struct unsigned_int_div_rem_memory {
44004406
}
44014407
};
44024408

4409+
template <typename Torus> struct int_prepare_count_of_consecutive_bits_buffer {
4410+
int_radix_params params;
4411+
int_radix_lut<Torus> *uni_lut;
4412+
int_radix_lut<Torus> *bi_lut;
4413+
CudaRadixCiphertextFFI *copy_ct;
4414+
Direction dir;
4415+
BitValue bit_val;
4416+
bool gpu_memory_allocated;
4417+
4418+
int_prepare_count_of_consecutive_bits_buffer(
4419+
cudaStream_t const *streams, uint32_t const *gpu_indexes,
4420+
uint32_t gpu_count, Direction dir, BitValue bit_val,
4421+
int_radix_params params, uint32_t num_radix_blocks,
4422+
bool allocate_gpu_memory, uint64_t *size_tracker) {
4423+
this->params = params;
4424+
this->dir = dir;
4425+
this->bit_val = bit_val;
4426+
this->gpu_memory_allocated = allocate_gpu_memory;
4427+
this->copy_ct = new CudaRadixCiphertextFFI;
4428+
4429+
const uint32_t bits = (uint32_t)std::log2(params.message_modulus);
4430+
4431+
create_zero_radix_ciphertext_async<Torus>(
4432+
streams[0], gpu_indexes[0], copy_ct, num_radix_blocks,
4433+
params.big_lwe_dimension, size_tracker, allocate_gpu_memory);
4434+
4435+
this->uni_lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
4436+
params, 1, num_radix_blocks,
4437+
allocate_gpu_memory, size_tracker);
4438+
4439+
auto lut_fn = [params, dir, bit_val, bits](Torus x) -> Torus {
4440+
x %= params.message_modulus;
4441+
Torus count = 0;
4442+
4443+
const int start = (dir == TRAILING ? 0 : (int)bits - 1);
4444+
const int end = (dir == TRAILING ? (int)bits : -1);
4445+
const int step = (dir == TRAILING ? 1 : -1);
4446+
4447+
for (uint32_t i = start; i != end; i += step) {
4448+
if (((x >> i) & (Torus)1) == (Torus)opposite(bit_val))
4449+
break;
4450+
++count;
4451+
}
4452+
return count;
4453+
};
4454+
4455+
generate_device_accumulator<Torus>(
4456+
streams[0], gpu_indexes[0], uni_lut->get_lut(0, 0),
4457+
uni_lut->get_degree(0), uni_lut->get_max_degree(0),
4458+
params.glwe_dimension, params.polynomial_size, params.message_modulus,
4459+
params.carry_modulus, lut_fn, gpu_memory_allocated);
4460+
4461+
uni_lut->broadcast_lut(streams, gpu_indexes, 0);
4462+
4463+
this->bi_lut = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count,
4464+
params, 1, num_radix_blocks,
4465+
allocate_gpu_memory, size_tracker);
4466+
4467+
auto sum_lut = [bits](Torus block_num_bit_count,
4468+
Torus more_significant_block_bit_count) -> Torus {
4469+
return (more_significant_block_bit_count == bits) ? block_num_bit_count
4470+
: 0;
4471+
};
4472+
4473+
generate_device_accumulator_bivariate<Torus>(
4474+
streams[0], gpu_indexes[0], bi_lut->get_lut(0, 0),
4475+
bi_lut->get_degree(0), bi_lut->get_max_degree(0), params.glwe_dimension,
4476+
params.polynomial_size, params.message_modulus, params.carry_modulus,
4477+
sum_lut, gpu_memory_allocated);
4478+
4479+
bi_lut->broadcast_lut(streams, gpu_indexes, 0);
4480+
}
4481+
4482+
void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
4483+
uint32_t gpu_count) {
4484+
4485+
release_radix_ciphertext_async(streams[0], gpu_indexes[0], copy_ct,
4486+
gpu_memory_allocated);
4487+
delete copy_ct;
4488+
4489+
uni_lut->release(streams, gpu_indexes, gpu_count);
4490+
delete uni_lut;
4491+
4492+
bi_lut->release(streams, gpu_indexes, gpu_count);
4493+
delete bi_lut;
4494+
}
4495+
};
4496+
44034497
template <typename Torus> struct int_bitop_buffer {
44044498

44054499
int_radix_params params;
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#include "ilog2.cuh"
2+
3+
uint64_t scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
4+
void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
5+
int8_t **mem_ptr, const uint32_t num_radix_blocks, const Direction dir,
6+
const BitValue bit_value, const bool allocate_gpu_memory,
7+
const uint32_t glwe_dimension, const uint32_t polynomial_size,
8+
const uint32_t lwe_dimension, const uint32_t ks_level,
9+
const uint32_t ks_base_log, const uint32_t pbs_level,
10+
const uint32_t pbs_base_log, const uint32_t grouping_factor,
11+
const uint32_t message_modulus, const uint32_t carry_modulus,
12+
const PBS_TYPE pbs_type, const bool allocate_ms_array) {
13+
14+
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
15+
glwe_dimension * polynomial_size, lwe_dimension,
16+
ks_level, ks_base_log, pbs_level, pbs_base_log,
17+
grouping_factor, message_modulus, carry_modulus,
18+
allocate_ms_array);
19+
20+
return scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb<uint64_t>(
21+
(cudaStream_t *)streams, gpu_indexes, gpu_count,
22+
(int_prepare_count_of_consecutive_bits_buffer<uint64_t> **)mem_ptr,
23+
num_radix_blocks, params, dir, bit_value, allocate_gpu_memory);
24+
}
25+
26+
void host_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
27+
void *const *streams, uint32_t const *gpu_indexes, const uint32_t gpu_count,
28+
CudaRadixCiphertextFFI *output, CudaRadixCiphertextFFI const *input,
29+
int8_t **mem_ptr, void *const *bsks, void *const *ksks,
30+
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
31+
32+
host_cuda_prepare_count_of_consecutive_bits_buffer_kb<uint64_t>(
33+
(cudaStream_t *)streams, gpu_indexes, gpu_count, output, input,
34+
(int_prepare_count_of_consecutive_bits_buffer<uint64_t> **)mem_ptr, bsks,
35+
(uint64_t **)ksks, ms_noise_reduction_key);
36+
}
37+
38+
void cleanup_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
39+
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
40+
int8_t **mem_ptr) {
41+
42+
auto *buf =
43+
(int_prepare_count_of_consecutive_bits_buffer<uint64_t> *)(*mem_ptr);
44+
45+
buf->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
46+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#ifndef ILOG2_CUH
2+
#define ILOG2_CUH
3+
4+
#include "device.h"
5+
#include "integer.cuh"
6+
#include "integer/integer_utilities.h"
7+
#include <inttypes.h>
8+
9+
template <typename Torus>
10+
__host__ uint64_t scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb(
11+
cudaStream_t const *streams, uint32_t const *gpu_indexes,
12+
uint32_t gpu_count,
13+
int_prepare_count_of_consecutive_bits_buffer<Torus> **mem_ptr,
14+
uint32_t num_radix_blocks, int_radix_params params, Direction dir,
15+
BitValue bit_value, bool allocate_gpu_memory) {
16+
17+
uint64_t size_tracker = 0;
18+
*mem_ptr = new int_prepare_count_of_consecutive_bits_buffer<Torus>(
19+
streams, gpu_indexes, gpu_count, dir, bit_value, params, num_radix_blocks,
20+
allocate_gpu_memory, &size_tracker);
21+
22+
return size_tracker;
23+
}
24+
25+
template <typename Torus>
26+
__host__ void host_cuda_prepare_count_of_consecutive_bits_buffer_kb(
27+
cudaStream_t const *streams, uint32_t const *gpu_indexes,
28+
uint32_t gpu_count, CudaRadixCiphertextFFI *output,
29+
CudaRadixCiphertextFFI const *input,
30+
int_prepare_count_of_consecutive_bits_buffer<Torus> **mem_ptr,
31+
void *const *bsks, Torus *const *ksks,
32+
CudaModulusSwitchNoiseReductionKeyFFI const *ms_noise_reduction_key) {
33+
34+
integer_radix_apply_univariate_lookup_table_kb<Torus>(
35+
streams, gpu_indexes, gpu_count, (*mem_ptr)->copy_ct, input, bsks, ksks,
36+
ms_noise_reduction_key, (*mem_ptr)->uni_lut, output->num_radix_blocks);
37+
38+
if ((*mem_ptr)->dir == Direction::LEADING) {
39+
host_radix_blocks_reverse_inplace<Torus>((cudaStream_t *)streams,
40+
gpu_indexes, (*mem_ptr)->copy_ct);
41+
}
42+
43+
host_compute_prefix_sum_hillis_steele<Torus>(
44+
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output,
45+
(*mem_ptr)->copy_ct, (*mem_ptr)->bi_lut, bsks, ksks,
46+
ms_noise_reduction_key, output->num_radix_blocks);
47+
}
48+
49+
#endif

backends/tfhe-cuda-backend/src/bindings.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,12 @@ pub const COMPARISON_TYPE_LE: COMPARISON_TYPE = 5;
210210
pub const COMPARISON_TYPE_MAX: COMPARISON_TYPE = 6;
211211
pub const COMPARISON_TYPE_MIN: COMPARISON_TYPE = 7;
212212
pub type COMPARISON_TYPE = ffi::c_uint;
213+
pub const Direction_TRAILING: Direction = 0;
214+
pub const Direction_LEADING: Direction = 1;
215+
pub type Direction = ffi::c_uint;
216+
pub const BitValue_ZERO: BitValue = 0;
217+
pub const BitValue_ONE: BitValue = 1;
218+
pub type BitValue = ffi::c_uint;
213219
#[repr(C)]
214220
#[derive(Debug, Copy, Clone)]
215221
pub struct CudaRadixCiphertextFFI {
@@ -1316,6 +1322,51 @@ unsafe extern "C" {
13161322
mem_ptr_void: *mut *mut i8,
13171323
);
13181324
}
1325+
unsafe extern "C" {
1326+
pub fn scratch_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
1327+
streams: *const *mut ffi::c_void,
1328+
gpu_indexes: *const u32,
1329+
gpu_count: u32,
1330+
mem_ptr: *mut *mut i8,
1331+
num_radix_blocks: u32,
1332+
dir: Direction,
1333+
bit_value: BitValue,
1334+
allocate_gpu_memory: bool,
1335+
glwe_dimension: u32,
1336+
polynomial_size: u32,
1337+
lwe_dimension: u32,
1338+
ks_level: u32,
1339+
ks_base_log: u32,
1340+
pbs_level: u32,
1341+
pbs_base_log: u32,
1342+
grouping_factor: u32,
1343+
message_modulus: u32,
1344+
carry_modulus: u32,
1345+
pbs_type: PBS_TYPE,
1346+
allocate_ms_array: bool,
1347+
) -> u64;
1348+
}
1349+
unsafe extern "C" {
1350+
pub fn host_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
1351+
streams: *const *mut ffi::c_void,
1352+
gpu_indexes: *const u32,
1353+
gpu_count: u32,
1354+
output: *mut CudaRadixCiphertextFFI,
1355+
input: *const CudaRadixCiphertextFFI,
1356+
mem_ptr: *mut *mut i8,
1357+
bsks: *const *mut ffi::c_void,
1358+
ksks: *const *mut ffi::c_void,
1359+
ms_noise_reduction_key: *const CudaModulusSwitchNoiseReductionKeyFFI,
1360+
);
1361+
}
1362+
unsafe extern "C" {
1363+
pub fn cleanup_cuda_prepare_count_of_consecutive_bits_buffer_kb_64(
1364+
streams: *const *mut ffi::c_void,
1365+
gpu_indexes: *const u32,
1366+
gpu_count: u32,
1367+
mem_ptr_void: *mut *mut i8,
1368+
);
1369+
}
13191370
pub const KS_TYPE_BIG_TO_SMALL: KS_TYPE = 0;
13201371
pub const KS_TYPE_SMALL_TO_BIG: KS_TYPE = 1;
13211372
pub type KS_TYPE = ffi::c_uint;

0 commit comments

Comments
 (0)