Skip to content

Implement BlockedBloomFilter #127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: 3.0.0
Choose a base branch
from
64 changes: 59 additions & 5 deletions benches/bloom.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,20 @@
use criterion::{criterion_group, criterion_main, Criterion};

fn filter_construction(c: &mut Criterion) {
fn standard_filter_construction(c: &mut Criterion) {
use lsm_tree::segment::filter::standard_bloom::Builder;

let mut filter = Builder::with_fp_rate(500_000_000, 0.01);

c.bench_function("bloom filter add key", |b| {
c.bench_function("standard bloom filter add key", |b| {
b.iter(|| {
let key = nanoid::nanoid!();
filter.set_with_hash(Builder::get_hash(key.as_bytes()));
});
});
}

fn filter_contains(c: &mut Criterion) {
use lsm_tree::segment::filter::standard_bloom::Builder;
fn standard_filter_contains(c: &mut Criterion) {
use lsm_tree::segment::filter::{standard_bloom::Builder, AMQ};

let keys = (0..100_000u128)
.map(|x| x.to_be_bytes().to_vec())
Expand Down Expand Up @@ -49,5 +49,59 @@ fn filter_contains(c: &mut Criterion) {
}
}

criterion_group!(benches, filter_construction, filter_contains,);
/// Benchmarks adding keys to a blocked bloom filter builder created with
/// `with_fp_rate(500_000_000, 0.01)`.
///
/// Key generation and hashing happen inside the measured closure.
fn blocked_filter_construction(c: &mut Criterion) {
    use lsm_tree::segment::filter::blocked_bloom::Builder;

    let mut builder = Builder::with_fp_rate(500_000_000, 0.01);

    c.bench_function("blocked bloom filter add key", |bencher| {
        bencher.iter(|| {
            let random_key = nanoid::nanoid!();
            let hash = Builder::get_hash(random_key.as_bytes());
            builder.set_with_hash(hash);
        });
    });
}

/// Benchmarks true-positive lookups against blocked bloom filters built with
/// several false positive rates.
///
/// For each rate, the same 100,000 keys (big-endian encoded `u128` counters)
/// are inserted, then randomly chosen members are looked up; every lookup is
/// asserted to hit.
fn blocked_filter_contains(c: &mut Criterion) {
    use lsm_tree::segment::filter::{blocked_bloom::Builder, AMQ};
    use rand::seq::IndexedRandom;

    let keys: Vec<_> = (0..100_000u128)
        .map(|x| x.to_be_bytes().to_vec())
        .collect();

    for fpr in [0.01, 0.001, 0.0001, 0.00001] {
        let mut builder = Builder::with_fp_rate(100_000_000, fpr);
        keys.iter()
            .for_each(|key| builder.set_with_hash(Builder::get_hash(key)));
        let filter = builder.build();

        let mut rng = rand::rng();

        let bench_name = format!(
            "blocked bloom filter contains key, true positive ({}%)",
            fpr * 100.0,
        );

        c.bench_function(&bench_name, |bencher| {
            bencher.iter(|| {
                // Only keys that were inserted are sampled, so a miss is a bug
                let member = keys.choose(&mut rng).unwrap();
                assert!(filter.contains_hash(Builder::get_hash(member)));
            });
        });
    }
}
// Collects the standard and blocked bloom filter benchmarks into the
// `benches` group, which `criterion_main!` then runs.
criterion_group!(
benches,
standard_filter_construction,
standard_filter_contains,
blocked_filter_construction,
blocked_filter_contains,
);
criterion_main!(benches);
138 changes: 138 additions & 0 deletions src/segment/filter/blocked_bloom/builder.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
// Copyright (c) 2024-present, fjall-rs
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

use super::{super::bit_array::Builder as BitArrayBuilder, BlockedBloomFilter};
use crate::segment::filter::{bit_array::BitArrayReader, CACHE_LINE_BYTES};

/// Two hashes that are used for double hashing.
///
/// `Builder::get_hash` produces the pair as the upper and lower 64 bits
/// of one 128-bit hash; `Builder::set_with_hash` consumes it.
pub type CompositeHash = (u64, u64);

/// Builder for a [`BlockedBloomFilter`].
///
/// Created through `with_fp_rate` or `with_bpk`, filled with
/// `set_with_hash`, and turned into the final filter with `build`.
#[derive(Debug, Eq, PartialEq)]
// NOTE(review): presumably allowed because the type lives in a `builder` module - confirm
#[allow(clippy::module_name_repetitions)]
pub struct Builder {
/// Raw bytes exposed as bit array
///
/// Allocated with a capacity of `num_blocks * CACHE_LINE_BYTES`.
inner: BitArrayBuilder,

/// Number of hash functions
k: usize,

/// Number of blocks in the blocked bloom filter
///
/// A key's hash selects exactly one block; all of its bits are set inside it.
num_blocks: usize,
}

#[allow(clippy::len_without_is_empty)]
impl Builder {
/// Consumes the builder and turns it into a [`BlockedBloomFilter`].
///
/// The accumulated bytes are wrapped in a [`BitArrayReader`]; `k` and
/// `num_blocks` are carried over unchanged.
#[must_use]
pub fn build(self) -> BlockedBloomFilter {
    let reader = BitArrayReader::new(self.inner.bytes().into());

    BlockedBloomFilter {
        inner: reader,
        k: self.k,
        num_blocks: self.num_blocks,
    }
}

/// Constructs a bloom filter that can hold `n` items
/// while maintaining a certain false positive rate `fpr`.
///
/// `fpr` is raised to at least `0.000_001`. The bit count and the number of
/// hash functions are derived from it, and the bit count is then rounded up
/// to whole blocks of `CACHE_LINE_BYTES` bytes. At least one block is always
/// allocated, even for a degenerate `fpr` of `1.0` or more.
///
/// # Panics
///
/// Panics if `n` is 0.
#[must_use]
pub fn with_fp_rate(n: usize, fpr: f32) -> Self {
    use std::f32::consts::LN_2;

    assert!(n > 0);

    // NOTE: Some sensible minimum
    let fpr = fpr.max(0.000_001);

    // TODO: m and k are still calculated with the traditional (non-blocked)
    // standard bloom filter formulas
    let m = Self::calculate_m(n, fpr);
    let bpk = m / n;
    let k = (((bpk as f32) * LN_2) as usize).max(1);

    // NOTE: For fpr >= 1.0, ln(fpr) is non-negative, so m comes out as 0.
    // Keep at least one block, otherwise `set_with_hash` would compute
    // `h1 % 0` and panic.
    let num_blocks = m.div_ceil(CACHE_LINE_BYTES * 8).max(1);

    Self {
        inner: BitArrayBuilder::with_capacity(num_blocks * CACHE_LINE_BYTES),
        k,
        num_blocks,
    }
}

/// Constructs a bloom filter that can hold `n` items
/// with `bpk` bits per key.
///
/// 10 bits per key is a sensible default.
///
/// The total of `n * bpk` bits is rounded up to whole blocks of
/// `CACHE_LINE_BYTES` bytes.
///
/// # Panics
///
/// Panics if `bpk` or `n` is 0.
#[must_use]
pub fn with_bpk(n: usize, bpk: u8) -> Self {
    use std::f32::consts::LN_2;

    assert!(bpk > 0);
    assert!(n > 0);

    let bits_per_key = bpk as usize;
    let total_bits = n * bits_per_key;

    // Number of hash functions: floor(bpk * ln 2), but never less than one
    let hash_count = (((bits_per_key as f32) * LN_2) as usize).max(1);

    let block_count = total_bits.div_ceil(CACHE_LINE_BYTES * 8);
    let storage = BitArrayBuilder::with_capacity(block_count * CACHE_LINE_BYTES);

    Self {
        inner: storage,
        k: hash_count,
        num_blocks: block_count,
    }
}

/// Calculates the number of bits `m` for `n` items at false positive rate
/// `fp_rate`, using `m = -n * ln(fp_rate) / (ln 2)^2`.
///
/// The result is rounded up to a multiple of 8, so it always describes
/// whole bytes.
fn calculate_m(n: usize, fp_rate: f32) -> usize {
    use std::f32::consts::LN_2;

    let item_count = n as f32;
    let raw_bits = -((item_count * fp_rate.ln()) / LN_2.powi(2));

    // Round up to next byte
    let whole_bytes = (raw_bits / 8.0).ceil();
    (whole_bytes * 8.0) as usize
}

/// Adds the key to the filter.
///
/// The first hash selects a single block (`h1 % num_blocks`); `k` bits are
/// then set inside that block, with the probe positions derived from both
/// hashes by double hashing.
///
/// # Panics
///
/// Panics if the builder has zero blocks (remainder by zero).
pub fn set_with_hash(&mut self, (mut h1, mut h2): CompositeHash) {
    let bits_per_block = CACHE_LINE_BYTES as u64 * 8;
    let block_idx = h1 % (self.num_blocks as u64);

    // NOTE: The range is inclusive so that exactly `k` bits are set.
    // An exclusive upper bound would only set `k - 1` bits - and none at all
    // for `k == 1`, leaving the key unrecorded.
    for i in 1..=(self.k as u64) {
        let idx = h1 % bits_per_block;

        self.inner
            .enable_bit(Self::get_bit_idx(block_idx as usize, idx as usize));

        h1 = h1.wrapping_add(h2);
        h2 = h2.wrapping_mul(i);
    }
}

/// Translates a block index plus a bit offset inside that block into an
/// absolute bit index, with every block spanning `CACHE_LINE_BYTES * 8` bits.
pub fn get_bit_idx(block_idx: usize, idx_in_block: usize) -> usize {
    let block_start = block_idx * CACHE_LINE_BYTES * 8;
    block_start + idx_in_block
}

/// Gets the hash of a key.
///
/// The key is hashed once with 128-bit XXH3; the upper 64 bits become the
/// first hash and the lower 64 bits the second.
#[must_use]
pub fn get_hash(key: &[u8]) -> CompositeHash {
    let digest = xxhash_rust::xxh3::xxh3_128(key);

    ((digest >> 64) as u64, digest as u64)
}
}

#[cfg(test)]
mod tests {
    use super::*;
    use test_log::test;

    /// Pins `calculate_m` against known bit counts.
    #[test]
    fn bloom_calculate_m() {
        // (item count, false positive rate, expected bit count)
        let cases = [
            (1_000, 0.01, 9_592),
            (1_000, 0.1, 4_800),
            (1_000_000, 0.1, 4_792_536),
        ];

        for (n, fp_rate, expected) in cases {
            assert_eq!(expected, Builder::calculate_m(n, fp_rate));
        }
    }
}
Loading