Skip to content

Commit 3f5dcf1

Browse files
emmaling27Convex, Inc.
authored and
Convex, Inc.
committed
Search deleted terms tracker (#25254)
This PR adds the logic for the search deleted terms tracker. We'll write three tracker files (id tracker, deleted bitset, and deleted terms), since not all of them are needed everywhere. The deleted terms file format is similar to the id tracker's, but instead of using perfect hashing, we store the term ordinals and the number of deleted documents corresponding to each term in separate structures: `EliasFano` for the `TermOrdinal`s, because they are monotonically increasing integers, and `DacsOpt` for the counts of deleted documents, because they will be small positive integers.
GitOrigin-RevId: 8e42c994af566b27d0a16219898e36459ec19af1
1 parent e61ca14 commit 3f5dcf1

File tree

13 files changed

+550
-123
lines changed

13 files changed

+550
-123
lines changed

Cargo.lock

Lines changed: 26 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,8 +109,8 @@ sourcemap = "7"
109109
strum = { version = "0.26", features = [ "derive" ] }
110110
sucds = { version = "0.8.1", features = [ "intrinsics" ] }
111111
syn = { version = "2.0", features = [ "full" ] }
112-
tantivy = { git = "https://github.com/get-convex/tantivy", branch = "rakeeb/reexport-tantivy-fst" }
113-
tantivy-common = { git = "https://github.com/get-convex/tantivy", branch = "rakeeb/reexport-tantivy-fst" }
112+
tantivy = { git = "https://github.com/get-convex/tantivy", rev = "a0f4d3c3843dd255ac6a7210830b3f91cf16db8f" }
113+
tantivy-common = { git = "https://github.com/get-convex/tantivy", rev = "a0f4d3c3843dd255ac6a7210830b3f91cf16db8f" }
114114
tempfile = "3"
115115
thiserror = "1"
116116
thousands = "0.2.0"

crates/common/src/deleted_bitset.rs

Lines changed: 9 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,10 @@ use std::{
77
Write,
88
},
99
mem,
10-
path::PathBuf,
10+
path::{
11+
Path,
12+
PathBuf,
13+
},
1114
};
1215

1316
use bitvec::{
@@ -30,7 +33,7 @@ use byteorder::{
3033
/// - bitset blocks (dense array of little-endian u64s): bitset contents
3134
pub const DELETED_BITSET_VERSION: u8 = 1;
3235

33-
#[derive(Clone)]
36+
#[derive(Clone, Default)]
3437
pub struct DeletedBitset {
3538
deleted: BitVec,
3639
num_deleted: usize,
@@ -121,7 +124,10 @@ impl DeletedBitset {
121124
Ok(())
122125
}
123126

124-
pub fn load_from_path(path: PathBuf) -> anyhow::Result<Self> {
127+
pub fn load_from_path<P>(path: P) -> anyhow::Result<Self>
128+
where
129+
P: AsRef<Path>,
130+
{
125131
let deleted_file = File::open(path)?;
126132
Self::load(
127133
deleted_file.metadata()?.len() as usize,

crates/search/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ sucds = { workspace = true }
4040
tantivy = { workspace = true }
4141
tantivy-common = { workspace = true }
4242
tempfile = { workspace = true }
43+
text_search = { path = "../text_search" }
4344
tokio = { workspace = true }
4445
tracing = { workspace = true }
4546
uuid = { workspace = true }
@@ -60,6 +61,7 @@ proptest-derive = { workspace = true }
6061
rand = { workspace = true }
6162
runtime = { path = "../runtime", features = ["testing"] }
6263
storage = { path = "../storage", features = ["testing"] }
64+
text_search = { path = "../text_search", features = ["testing"] }
6365
value = { path = "../value", features = ["testing"] }
6466
vector = { path = "../vector", features = ["testing"] }
6567

@@ -72,6 +74,7 @@ testing = [
7274
"proptest",
7375
"proptest-derive",
7476
"storage/testing",
77+
"text_search/testing",
7578
"value/testing",
7679
"vector/testing",
7780
]

crates/search/src/archive/cache.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -377,6 +377,9 @@ fn is_immutable(search_file_type: SearchFileType) -> bool {
377377
SearchFileType::VectorIdTracker => true,
378378
// Text indexes do not appear to be read in readonly mode.
379379
SearchFileType::Text => false,
380+
SearchFileType::TextIdTracker => true,
381+
SearchFileType::TextDeletedBitset => true,
382+
SearchFileType::TextDeletedTerms => true,
380383
}
381384
}
382385

crates/search/src/convex_query.rs

Lines changed: 5 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@ use std::{
33
fmt,
44
};
55

6+
use common::deleted_bitset::DeletedBitset;
67
use tantivy::{
78
query::{
89
intersect_scorers,
@@ -25,7 +26,6 @@ use tantivy::{
2526
Term,
2627
TERMINATED,
2728
};
28-
use tantivy_common::ReadOnlyBitSet;
2929

3030
/// A query for documents that:
3131
/// 1. Contain at least one of the OR terms.
@@ -151,29 +151,26 @@ impl Weight for ConvexSearchWeight {
151151
#[derive(Clone)]
152152
pub struct DeletedDocuments {
153153
pub memory_deleted: BTreeSet<DocId>,
154-
pub segment_deleted: ReadOnlyBitSet,
155-
// NB: `ReadOnlyBitSet::len` is linear time, so use our precomputed count
156-
// of the number of documents deleted.
157-
pub num_segment_deleted: usize,
154+
pub segment_deleted: DeletedBitset,
158155
}
159156

160157
impl fmt::Debug for DeletedDocuments {
161158
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
162159
f.debug_struct("DeletedDocuments")
163160
.field("memory_deleted", &self.memory_deleted)
164161
.field("segment_deleted", &"<bitset>")
165-
.field("num_segment_deleted", &self.num_segment_deleted)
162+
.field("num_segment_deleted", &self.segment_deleted.num_deleted())
166163
.finish()
167164
}
168165
}
169166

170167
impl DeletedDocuments {
171168
pub fn contains(&self, doc: DocId) -> bool {
172-
self.memory_deleted.contains(&doc) || self.segment_deleted.contains(doc)
169+
self.memory_deleted.contains(&doc) || self.segment_deleted.is_deleted(doc)
173170
}
174171

175172
pub fn len(&self) -> usize {
176-
self.memory_deleted.len() + self.num_segment_deleted
173+
self.memory_deleted.len() + self.segment_deleted.num_deleted()
177174
}
178175
}
179176

crates/search/src/lib.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -875,6 +875,9 @@ pub enum SearchFileType {
875875
VectorDeletedBitset,
876876
VectorIdTracker,
877877
Text,
878+
TextIdTracker,
879+
TextDeletedBitset,
880+
TextDeletedTerms,
878881
}
879882

880883
#[cfg(test)]

crates/search/src/metrics.rs

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -425,6 +425,9 @@ impl SearchFileType {
425425
SearchFileType::VectorDeletedBitset => "vector_deleted_bitset",
426426
SearchFileType::VectorIdTracker => "vector_id_tracker",
427427
SearchFileType::Text => "text",
428+
SearchFileType::TextIdTracker => "text_id_tracker",
429+
SearchFileType::TextDeletedBitset => "text_deleted_bitset",
430+
SearchFileType::TextDeletedTerms => "text_deleted_terms",
428431
SearchFileType::FragmentedVectorSegment => "fragmented_vector_segment",
429432
};
430433
StaticMetricLabel::new(SEARCH_FILE_TYPE, search_type_str)

0 commit comments

Comments
 (0)