Skip to content

Commit 2bdf3e6

Browse files
authored
Merge pull request #112 from fjall-rs/perf/replace-partition-point
perf: replace std partition_point
2 parents 2c6a6f7 + 1751e48 commit 2bdf3e6

File tree

14 files changed

+197
-32
lines changed

14 files changed

+197
-32
lines changed

.gitignore

-3
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,5 @@ Cargo.lock
1414
*.pdb
1515

1616
.lsm.data
17-
.data
18-
/old_*
1917
.test*
20-
.block_index_test
2118
.bench

Cargo.toml

+6
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,9 @@ name = "fd_table"
101101
harness = false
102102
path = "benches/fd_table.rs"
103103
required-features = []
104+
105+
[[bench]]
106+
name = "partition_point"
107+
harness = false
108+
path = "benches/partition_point.rs"
109+
required-features = []

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ A K.I.S.S. implementation of log-structured merge trees (LSM-trees/LSMTs) in Rus
1919
This is the most feature-rich LSM-tree implementation in Rust! It features:
2020

2121
- Thread-safe BTreeMap-like API
22-
- 100% safe & stable Rust
22+
- [99.9% safe](./UNSAFE.md) & stable Rust
2323
- Block-based tables with compression support
2424
- Range & prefix searching with forward and reverse iteration
2525
- Size-tiered, (concurrent) Leveled and FIFO compaction

UNSAFE.md

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Unsafe usage
2+
3+
Currently, the project itself uses only **one** unsafe block (ignoring dependencies, which are tested separately):
4+
5+
- https://github.com/fjall-rs/lsm-tree/blob/2d8686e873369bd9c4ff2b562ed988c1cea38331/src/binary_search.rs#L23-L25

benches/partition_point.rs

+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
use criterion::{criterion_group, criterion_main, Criterion};
2+
use lsm_tree::binary_search::partition_point;
3+
4+
fn bench_partition_point(c: &mut Criterion) {
5+
let mut group = c.benchmark_group("partition_point");
6+
7+
for item_count in [10, 100, 1_000, 10_000, 100_000, 1_000_000] {
8+
let items = (0..item_count).collect::<Vec<_>>();
9+
10+
// TODO: replace search key with random integer
11+
12+
group.bench_function(format!("native {item_count}"), |b| {
13+
b.iter(|| items.partition_point(|&x| x <= 5_000))
14+
});
15+
16+
group.bench_function(format!("rewrite {item_count}"), |b| {
17+
b.iter(|| partition_point(&items, |&x| x <= 5_000))
18+
});
19+
}
20+
}
21+
22+
criterion_group!(benches, bench_partition_point);
23+
criterion_main!(benches);

fuzz/.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
corpus
2+
artifacts

fuzz/Cargo.toml

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
[package]
2+
name = "lsm-tree-fuzz"
3+
version = "0.0.0"
4+
publish = false
5+
edition = "2021"
6+
7+
[package.metadata]
8+
cargo-fuzz = true
9+
10+
[dependencies]
11+
libfuzzer-sys = "0.4"
12+
lsm-tree = { path = ".." }
13+
14+
[[bin]]
15+
name = "partition_point"
16+
path = "fuzz_targets/partition_point.rs"
17+
test = false
18+
doc = false
19+
bench = false

fuzz/fuzz_targets/partition_point.rs

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#![no_main]
2+
use libfuzzer_sys::{
3+
arbitrary::{Arbitrary, Unstructured},
4+
fuzz_target,
5+
};
6+
use lsm_tree::binary_search::partition_point;
7+
8+
// Differential fuzz target: the crate's `partition_point` must agree with
// std's `slice::partition_point` on every sorted, deduplicated byte vector.
fuzz_target!(|data: &[u8]| {
    let mut input = Unstructured::new(data);

    if let Ok(mut haystack) = Vec::<u8>::arbitrary(&mut input) {
        // Both implementations require the slice to be partitioned by the
        // predicate; sorting guarantees that for `byte < 128`.
        haystack.sort_unstable();
        haystack.dedup();

        let expected = haystack.partition_point(|&byte| byte < 128);
        let actual = partition_point(&haystack, |&byte| byte < 128);
        assert_eq!(expected, actual);
    }
});

src/binary_search.rs

+91
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
// Copyright (c) 2024-present, fjall-rs
2+
// This source code is licensed under both the Apache 2.0 and MIT License
3+
// (found in the LICENSE-* files in the repository)
4+
5+
/// Returns the index of the partition point according to the given predicate
/// (the index of the first element of the second partition).
///
/// The slice is expected to be partitioned by `pred`: every element for which
/// `pred` returns `true` must come before every element for which it returns
/// `false`. For an empty slice, `0` is returned. If all elements satisfy
/// `pred`, `slice.len()` is returned.
///
/// This seems to be faster than std's `partition_point`, see
/// <https://github.com/rust-lang/rust/issues/138796>.
pub fn partition_point<T, F>(slice: &[T], pred: F) -> usize
where
    F: Fn(&T) -> bool,
{
    let mut left = 0;
    let mut right = slice.len();

    // An empty slice never enters the loop, so no separate check is needed.
    while left < right {
        // Written as `left + (right - left) / 2` because `(left + right) / 2`
        // can overflow for slices of zero-sized types, whose length may
        // exceed `isize::MAX`.
        let mid = left + (right - left) / 2;

        // SAFETY: The loop condition guarantees `left < right`, and `right`
        // starts at `slice.len()` and only ever shrinks, so
        // `left <= mid < right <= slice.len()` and `mid` is always in bounds.
        #[warn(unsafe_code)]
        let item = unsafe { slice.get_unchecked(mid) };

        if pred(item) {
            left = mid + 1;
        } else {
            right = mid;
        }
    }

    left
}
36+
37+
#[cfg(test)]
mod tests {
    use super::partition_point;
    use test_log::test;

    /// Searches `items` for the first element that is not `< pivot`, then
    /// checks the result against both the hand-computed `expected` index
    /// and std's `slice::partition_point`.
    #[track_caller]
    fn check(items: &[i32], pivot: i32, expected: usize) {
        let idx = partition_point(items, |&x| x < pivot);
        assert_eq!(expected, idx);

        let std_pp_idx = items.partition_point(|&x| x < pivot);
        assert_eq!(std_pp_idx, idx);
    }

    #[test]
    fn binary_search_first() {
        check(&[1, 2, 3, 4, 5], 1, 0);
    }

    #[test]
    fn binary_search_last() {
        check(&[1, 2, 3, 4, 5], 5, 4);
    }

    #[test]
    fn binary_search_middle() {
        check(&[1, 2, 3, 4, 5], 3, 2);
    }

    #[test]
    fn binary_search_none() {
        check(&[1, 2, 3, 4, 5], 10, 5);
    }

    #[test]
    fn binary_search_empty() {
        check(&[], 10, 0);
    }
}

src/level_manifest/level.rs

+15-15
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
// This source code is licensed under both the Apache 2.0 and MIT License
33
// (found in the LICENSE-* files in the repository)
44

5-
use crate::{segment::meta::SegmentId, HashSet, KeyRange, Segment, UserKey};
5+
use crate::{
6+
binary_search::partition_point, segment::meta::SegmentId, HashSet, KeyRange, Segment, UserKey,
7+
};
68
use std::ops::Bound;
79

810
/// Level of an LSM-tree
@@ -175,13 +177,11 @@ pub struct DisjointLevel<'a>(&'a Level);
175177
impl<'a> DisjointLevel<'a> {
176178
/// Returns the segment that possibly contains the key.
177179
pub fn get_segment_containing_key(&self, key: &[u8]) -> Option<Segment> {
178-
let level = &self.0;
179-
180-
let idx = level
181-
.segments
182-
.partition_point(|x| x.metadata.key_range.max() < &key);
180+
let idx = partition_point(&self.0.segments, |segment| {
181+
segment.metadata.key_range.max() < &key
182+
});
183183

184-
level
184+
self.0
185185
.segments
186186
.get(idx)
187187
.filter(|x| x.metadata.key_range.min() <= &key)
@@ -197,12 +197,12 @@ impl<'a> DisjointLevel<'a> {
197197

198198
let lo = match &key_range.0 {
199199
Bound::Unbounded => 0,
200-
Bound::Included(start_key) => {
201-
level.partition_point(|segment| segment.metadata.key_range.max() < start_key)
202-
}
203-
Bound::Excluded(start_key) => {
204-
level.partition_point(|segment| segment.metadata.key_range.max() <= start_key)
205-
}
200+
Bound::Included(start_key) => partition_point(level, |segment| {
201+
segment.metadata.key_range.max() < start_key
202+
}),
203+
Bound::Excluded(start_key) => partition_point(level, |segment| {
204+
segment.metadata.key_range.max() <= start_key
205+
}),
206206
};
207207

208208
if lo >= level.len() {
@@ -213,7 +213,7 @@ impl<'a> DisjointLevel<'a> {
213213
Bound::Unbounded => level.len() - 1,
214214
Bound::Included(end_key) => {
215215
let idx =
216-
level.partition_point(|segment| segment.metadata.key_range.min() <= end_key);
216+
partition_point(level, |segment| segment.metadata.key_range.min() <= end_key);
217217

218218
if idx == 0 {
219219
return None;
@@ -223,7 +223,7 @@ impl<'a> DisjointLevel<'a> {
223223
}
224224
Bound::Excluded(end_key) => {
225225
let idx =
226-
level.partition_point(|segment| segment.metadata.key_range.min() < end_key);
226+
partition_point(level, |segment| segment.metadata.key_range.min() < end_key);
227227

228228
if idx == 0 {
229229
return None;

src/lib.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@
9090
9191
#![doc(html_logo_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")]
9292
#![doc(html_favicon_url = "https://raw.githubusercontent.com/fjall-rs/lsm-tree/main/logo.png")]
93-
#![forbid(unsafe_code)]
93+
#![deny(unsafe_code)]
9494
#![deny(clippy::all, missing_docs, clippy::cargo)]
9595
#![deny(clippy::unwrap_used)]
9696
#![deny(clippy::indexing_slicing)]
@@ -124,6 +124,9 @@ mod any_tree;
124124

125125
mod r#abstract;
126126

127+
#[doc(hidden)]
128+
pub mod binary_search;
129+
127130
#[doc(hidden)]
128131
pub mod blob_tree;
129132

src/segment/block_index/mod.rs

+5-4
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use super::{
1212
block::{offset::BlockOffset, Block},
1313
value_block::CachePolicy,
1414
};
15+
use crate::binary_search::partition_point;
1516
use block_handle::KeyedBlockHandle;
1617
use full_index::FullBlockIndex;
1718
use two_level_index::TwoLevelBlockIndex;
@@ -44,7 +45,7 @@ impl KeyedBlockIndex for [KeyedBlockHandle] {
4445
key: &[u8],
4546
_: CachePolicy,
4647
) -> crate::Result<Option<&KeyedBlockHandle>> {
47-
let idx = self.partition_point(|x| &*x.end_key < key);
48+
let idx = partition_point(self, |item| item.end_key < key);
4849
Ok(self.get(idx))
4950
}
5051

@@ -53,7 +54,7 @@ impl KeyedBlockIndex for [KeyedBlockHandle] {
5354
key: &[u8],
5455
_: CachePolicy,
5556
) -> crate::Result<Option<&KeyedBlockHandle>> {
56-
let idx = self.partition_point(|x| &*x.end_key <= key);
57+
let idx = partition_point(self, |x| &*x.end_key <= key);
5758

5859
if idx == 0 {
5960
return Ok(self.first());
@@ -129,10 +130,10 @@ pub enum BlockIndexImpl {
129130
#[allow(clippy::expect_used)]
130131
mod tests {
131132
use super::*;
132-
use crate::Slice;
133+
use crate::{segment::block::offset::BlockOffset, UserKey};
133134
use test_log::test;
134135

135-
fn bh<K: Into<Slice>>(end_key: K, offset: BlockOffset) -> KeyedBlockHandle {
136+
fn bh<K: Into<UserKey>>(end_key: K, offset: BlockOffset) -> KeyedBlockHandle {
136137
KeyedBlockHandle {
137138
end_key: end_key.into(),
138139
offset,

src/segment/value_block.rs

+5-5
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
// This source code is licensed under both the Apache 2.0 and MIT License
33
// (found in the LICENSE-* files in the repository)
44

5-
use super::{
6-
block::{offset::BlockOffset, Block},
7-
id::GlobalSegmentId,
5+
use super::{block::Block, id::GlobalSegmentId};
6+
use crate::{
7+
binary_search::partition_point, descriptor_table::FileDescriptorTable,
8+
segment::block::offset::BlockOffset, value::InternalValue, Cache,
89
};
9-
use crate::{cache::Cache, descriptor_table::FileDescriptorTable, value::InternalValue};
1010
use std::sync::Arc;
1111

1212
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
@@ -28,7 +28,7 @@ pub type ValueBlock = Block<InternalValue>;
2828
impl ValueBlock {
2929
#[must_use]
3030
pub fn get_latest(&self, key: &[u8]) -> Option<&InternalValue> {
31-
let idx = self.items.partition_point(|item| &*item.key.user_key < key);
31+
let idx = partition_point(&self.items, |item| &*item.key.user_key < key);
3232

3333
self.items
3434
.get(idx)

src/segment/value_block_consumer.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// (found in the LICENSE-* files in the repository)
44

55
use super::value_block::ValueBlock;
6-
use crate::value::InternalValue;
6+
use crate::{binary_search::partition_point, value::InternalValue};
77
use std::sync::Arc;
88

99
pub struct ValueBlockConsumer {
@@ -25,13 +25,13 @@ impl ValueBlockConsumer {
2525
end_key: Option<&[u8]>,
2626
) -> Self {
2727
let mut lo = start_key.as_ref().map_or(0, |key| {
28-
inner.items.partition_point(|x| &*x.key.user_key < *key)
28+
partition_point(&inner.items, |x| &*x.key.user_key < *key)
2929
});
3030

3131
let hi = end_key.as_ref().map_or_else(
3232
|| inner.items.len() - 1,
3333
|key| {
34-
let idx = inner.items.partition_point(|x| &*x.key.user_key <= *key);
34+
let idx = partition_point(&inner.items, |x| &*x.key.user_key <= *key);
3535

3636
if idx == 0 {
3737
let first = inner

0 commit comments

Comments
 (0)