Skip to content
64 changes: 59 additions & 5 deletions benches/bloom.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
use criterion::{criterion_group, criterion_main, Criterion};

// Benchmarks adding a single freshly generated key to a standard bloom filter builder.
fn filter_construction(c: &mut Criterion) {
fn standard_filter_construction(c: &mut Criterion) {
use lsm_tree::segment::filter::standard_bloom::Builder;

// One builder, sized for 500 million items at a 0.01 false positive rate,
// is shared by every benchmark iteration.
let mut filter = Builder::with_fp_rate(500_000_000, 0.01);

c.bench_function("bloom filter add key", |b| {
c.bench_function("standard bloom filter add key", |b| {
b.iter(|| {
// A new random key is created on every iteration, so the measured time
// covers ID generation and hashing in addition to the insert itself.
let key = nanoid::nanoid!();
filter.set_with_hash(Builder::get_hash(key.as_bytes()));
});
});
}

fn filter_contains(c: &mut Criterion) {
use lsm_tree::segment::filter::standard_bloom::Builder;
fn standard_filter_contains(c: &mut Criterion) {
use lsm_tree::segment::filter::{standard_bloom::Builder, AMQ};

let keys = (0..100_000u128)
.map(|x| x.to_be_bytes().to_vec())
Expand Down Expand Up @@ -49,5 +49,59 @@ fn filter_contains(c: &mut Criterion) {
}
}

criterion_group!(benches, filter_construction, filter_contains,);
/// Benchmarks adding a single freshly generated key to a blocked bloom filter builder.
///
/// The builder is created once (500 million items, 0.01 false positive rate) and
/// reused across iterations; every iteration generates, hashes and inserts a new key.
fn blocked_filter_construction(c: &mut Criterion) {
    use lsm_tree::segment::filter::blocked_bloom::Builder;

    let mut builder = Builder::with_fp_rate(500_000_000, 0.01);

    c.bench_function("blocked bloom filter add key", |bencher| {
        bencher.iter(|| {
            // ID generation and hashing are part of the measured work.
            let id = nanoid::nanoid!();
            let hash = Builder::get_hash(id.as_bytes());
            builder.set_with_hash(hash);
        });
    });
}

/// Benchmarks true-positive lookups in a blocked bloom filter at several
/// false positive rates.
///
/// For every rate a filter is built from the same 100,000 big-endian `u128`
/// keys, then each iteration picks one of those keys at random, hashes it and
/// asserts that the filter reports it as present.
fn blocked_filter_contains(c: &mut Criterion) {
    use lsm_tree::segment::filter::{blocked_bloom::Builder, AMQ};
    use rand::seq::IndexedRandom;

    let keys: Vec<_> = (0..100_000u128)
        .map(|n| n.to_be_bytes().to_vec())
        .collect();

    for fpr in [0.01, 0.001, 0.0001, 0.00001] {
        let mut builder = Builder::with_fp_rate(100_000_000, fpr);
        keys.iter()
            .for_each(|key| builder.set_with_hash(Builder::get_hash(key)));

        let filter = builder.build();
        let mut rng = rand::rng();

        let bench_name = format!(
            "blocked bloom filter contains key, true positive ({}%)",
            fpr * 100.0,
        );

        c.bench_function(&bench_name, |bencher| {
            bencher.iter(|| {
                let sample = keys.choose(&mut rng).unwrap();
                let hash = Builder::get_hash(sample);

                // Every sampled key was inserted above, so it must be found.
                assert!(filter.contains_hash(hash));
            });
        });
    }
}
// Collects the standard and blocked bloom filter benchmarks into the `benches` group.
criterion_group!(
benches,
standard_filter_construction,
standard_filter_contains,
blocked_filter_construction,
blocked_filter_contains,
);
// Expands to the benchmark binary's entry point, which runs the `benches` group.
criterion_main!(benches);
138 changes: 138 additions & 0 deletions src/segment/filter/blocked_bloom/builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Copyright (c) 2024-present, fjall-rs
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

use super::{super::bit_array::Builder as BitArrayBuilder, BlockedBloomFilter};
use crate::segment::filter::{bit_array::BitArrayReader, CACHE_LINE_BYTES};

/// Two hashes that are used for double hashing.
///
/// `Builder::get_hash` produces the pair as the upper and lower 64 bits
/// (in that order) of a single 128-bit xxh3 hash of the key.
pub type CompositeHash = (u64, u64);

/// Write-side of the blocked bloom filter.
///
/// Keys are added by hash via `set_with_hash`; `build` then turns the
/// accumulated bits into a `BlockedBloomFilter`.
#[derive(Debug, Eq, PartialEq)]
#[allow(clippy::module_name_repetitions)]
pub struct Builder {
/// Raw bytes exposed as bit array
///
/// Allocated with `num_blocks * CACHE_LINE_BYTES` bytes of capacity.
inner: BitArrayBuilder,

/// Number of hash functions
///
/// Derived as `floor(bits_per_key * ln 2)`, but never less than 1.
k: usize,

/// Number of blocks in the blocked bloom filter
///
/// Each block spans `CACHE_LINE_BYTES * 8` bits; all bits of a single key
/// are placed in the same block.
num_blocks: usize,
}

impl Builder {
    /// Number of bits in one block (a block is `CACHE_LINE_BYTES` bytes wide).
    const BITS_PER_BLOCK: usize = CACHE_LINE_BYTES * 8;

    /// Consumes the builder and returns the finished, read-only filter.
    ///
    /// The bit array bytes, the number of hash functions and the number of
    /// blocks are carried over unchanged.
    #[must_use]
    pub fn build(self) -> BlockedBloomFilter {
        BlockedBloomFilter {
            inner: BitArrayReader::new(self.inner.bytes().into()),
            k: self.k,
            num_blocks: self.num_blocks,
        }
    }

    /// Constructs a bloom filter that can hold `n` items
    /// while maintaining a certain false positive rate `fpr`.
    ///
    /// `fpr` is clamped to a minimum of `0.000_001`.
    ///
    /// # Panics
    ///
    /// Panics if `n` is 0.
    #[must_use]
    pub fn with_fp_rate(n: usize, fpr: f32) -> Self {
        assert!(n > 0);

        // NOTE: Some sensible minimum
        let fpr = fpr.max(0.000_001);

        // TODO: m and k is still calculated by traditional standard bloom filter formula
        let m = Self::calculate_m(n, fpr);

        Self::with_layout(m, m / n)
    }

    /// Constructs a bloom filter that can hold `n` items
    /// with `bpk` bits per key.
    ///
    /// 10 bits per key is a sensible default.
    ///
    /// # Panics
    ///
    /// Panics if `n` or `bpk` is 0.
    #[must_use]
    pub fn with_bpk(n: usize, bpk: u8) -> Self {
        assert!(bpk > 0);
        assert!(n > 0);

        let bpk = usize::from(bpk);

        Self::with_layout(n * bpk, bpk)
    }

    /// Builds an empty filter with room for `m` bits, deriving the number of
    /// hash functions from `bpk` (bits per key).
    fn with_layout(m: usize, bpk: usize) -> Self {
        use std::f32::consts::LN_2;

        // k = floor(bpk * ln 2), but at least one hash function
        let k = (((bpk as f32) * LN_2) as usize).max(1);

        // Round up to whole blocks. At least one block is always allocated:
        // `m` can be 0 (e.g. `fpr >= 1.0`), and `set_with_hash` takes the hash
        // modulo `num_blocks`, which would otherwise divide by zero.
        let num_blocks = m.div_ceil(Self::BITS_PER_BLOCK).max(1);

        Self {
            inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES),
            k,
            num_blocks,
        }
    }

    /// Computes the number of bits `m = -n * ln(fp_rate) / (ln 2)^2` for `n`
    /// items, rounded up to a multiple of 8.
    fn calculate_m(n: usize, fp_rate: f32) -> usize {
        use std::f32::consts::LN_2;

        let n = n as f32;
        let ln2_squared = LN_2.powi(2);

        let numerator = n * fp_rate.ln();
        let m = -(numerator / ln2_squared);

        // Round up to next byte
        ((m / 8.0).ceil() * 8.0) as usize
    }

    /// Adds the key to the filter.
    ///
    /// The first hash selects one block; `k` bits are then enabled inside that
    /// block, with positions derived from the hash pair by double hashing.
    ///
    /// NOTE(review): the matching lookup code is not part of this file; it has
    /// to probe the same position sequence (at most these `k` positions).
    pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) {
        let block_idx = (h1 % (self.num_blocks as u64)) as usize;

        // Inclusive range: exactly `k` probes, so `k == 1` still sets a bit.
        for i in 1..=(self.k as u64) {
            let idx_in_block = (h1 % (Self::BITS_PER_BLOCK as u64)) as usize;

            self.inner
                .enable_bit(Self::get_bit_idx(block_idx, idx_in_block));

            h1 = h1.wrapping_add(h2);
            h2 = h2.wrapping_mul(i);
        }
    }

    /// Translates a position inside a block into an index into the whole bit array.
    ///
    /// `idx_in_block` is used as passed; it is not reduced to the block width here.
    #[must_use]
    pub fn get_bit_idx(block_idx: usize, idx_in_block: usize) -> usize {
        block_idx * CACHE_LINE_BYTES * 8 + idx_in_block
    }

    /// Gets the hash of a key.
    ///
    /// The key is hashed once with 128-bit xxh3; the upper 64 bits become the
    /// first hash and the lower 64 bits the second.
    #[must_use]
    pub fn get_hash(key: &[u8]) -> CompositeHash {
        let h0 = xxhash_rust::xxh3::xxh3_128(key);
        let h1 = (h0 >> 64) as u64;
        let h2 = h0 as u64;
        (h1, h2)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use test_log::test;

    /// `calculate_m` must return the expected byte-aligned bit counts.
    #[test]
    fn bloom_calculate_m() {
        // (item count, false positive rate, expected number of bits)
        let cases = [
            (1_000, 0.01, 9_592),
            (1_000, 0.1, 4_800),
            (1_000_000, 0.1, 4_792_536),
        ];

        for (n, fpr, expected) in cases {
            assert_eq!(expected, Builder::calculate_m(n, fpr));
        }
    }
}
Loading