Skip to content

Commit 992ae8f

Browse files
emmaling27 authored and Convex, Inc. committed
Use tantivy's AliveBitSet instead of our DeletedBitset for storing deleted tantivy ids (#25520)
Using tantivy's `AliveBitSet` instead of the `DeletedBitset` we use for vector indexes lets us reuse tantivy's segment merge logic. This commit also splits `MemoryIdAndDeletionTracker` into `SearchMemoryIdTracker`, which is in charge of building the id tracker, and `MemoryDeletionTracker`, which tracks deletes in other segments.

GitOrigin-RevId: ec770f176d56c847ce4968fad01ddf1cd72e0188
1 parent 657ceea commit 992ae8f

File tree

10 files changed

+192
-185
lines changed

10 files changed

+192
-185
lines changed

Cargo.lock

Lines changed: 10 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -109,8 +109,8 @@ sourcemap = "7"
109109
strum = { version = "0.26", features = [ "derive" ] }
110110
sucds = { version = "0.8.1", features = [ "intrinsics" ] }
111111
syn = { version = "2.0", features = [ "full" ] }
112-
tantivy = { git = "https://github.com/get-convex/tantivy", rev = "a0f4d3c3843dd255ac6a7210830b3f91cf16db8f" }
113-
tantivy-common = { git = "https://github.com/get-convex/tantivy", rev = "a0f4d3c3843dd255ac6a7210830b3f91cf16db8f" }
112+
tantivy = { git = "https://github.com/get-convex/tantivy", rev = "b0bc7a40d54c92c8446098f20082756f218cdd9d" }
113+
tantivy-common = { git = "https://github.com/get-convex/tantivy", rev = "b0bc7a40d54c92c8446098f20082756f218cdd9d" }
114114
tempfile = "3"
115115
thiserror = "1"
116116
thousands = "0.2.0"

crates/common/src/id_tracker.rs

Lines changed: 6 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -16,8 +16,6 @@ use byteorder::{
1616
};
1717
use csf::ls::Map as CsfMap;
1818

19-
use crate::deleted_bitset::DeletedBitset;
20-
2119
/// Version 1 of the id table has the following format:
2220
/// ```
2321
/// [ version ] [ count ] [ index_len ] [ ID ]* [ index ]
@@ -42,35 +40,23 @@ pub struct StaticIdTracker {
4240
/// Convex IDs in search/vector index id order.
4341
id_buf: Vec<u8>,
4442
csf_map: CsfMap,
45-
deleted: DeletedBitset,
4643
}
4744

4845
impl StaticIdTracker {
49-
pub fn load_from_path(
50-
id_table_path: PathBuf,
51-
deleted_bitset: DeletedBitset,
52-
) -> anyhow::Result<Self> {
46+
pub fn load_from_path(id_table_path: PathBuf) -> anyhow::Result<Self> {
5347
let uuid_file = File::open(id_table_path)?;
54-
StaticIdTracker::load(
55-
(
56-
uuid_file.metadata()?.len() as usize,
57-
BufReader::new(uuid_file),
58-
),
59-
deleted_bitset,
60-
)
48+
StaticIdTracker::load((
49+
uuid_file.metadata()?.len() as usize,
50+
BufReader::new(uuid_file),
51+
))
6152
}
6253

63-
pub fn load(
64-
uuid_file: (usize, impl Read),
65-
deleted_bitset: DeletedBitset,
66-
) -> anyhow::Result<Self> {
54+
pub fn load(uuid_file: (usize, impl Read)) -> anyhow::Result<Self> {
6755
let (count, uuid_buf, csf_map) = Self::load_ids(uuid_file.0, uuid_file.1)?;
68-
anyhow::ensure!(count == deleted_bitset.len());
6956
Ok(Self {
7057
count,
7158
id_buf: uuid_buf,
7259
csf_map,
73-
deleted: deleted_bitset,
7460
})
7561
}
7662

@@ -120,10 +106,6 @@ impl StaticIdTracker {
120106
self.count
121107
}
122108

123-
pub fn deleted(&self) -> &DeletedBitset {
124-
&self.deleted
125-
}
126-
127109
pub fn lookup(&self, convex_id: [u8; 16]) -> Option<u32> {
128110
let index_id = self.csf_map.get(&convex_id);
129111
let found_convex_id = self.get_convex_id(index_id as usize)?;

crates/search/src/convex_query.rs

Lines changed: 12 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,8 @@ use std::{
33
fmt,
44
};
55

6-
use common::deleted_bitset::DeletedBitset;
76
use tantivy::{
7+
fastfield::AliveBitSet,
88
query::{
99
intersect_scorers,
1010
BooleanQuery,
@@ -151,26 +151,31 @@ impl Weight for ConvexSearchWeight {
151151
#[derive(Clone)]
152152
pub struct DeletedDocuments {
153153
pub memory_deleted: BTreeSet<DocId>,
154-
pub segment_deleted: DeletedBitset,
154+
pub segment_alive_bitset: AliveBitSet,
155155
}
156156

157157
impl fmt::Debug for DeletedDocuments {
158158
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
159159
f.debug_struct("DeletedDocuments")
160160
.field("memory_deleted", &self.memory_deleted)
161161
.field("segment_deleted", &"<bitset>")
162-
.field("num_segment_deleted", &self.segment_deleted.num_deleted())
162+
.field(
163+
"num_segment_alive",
164+
&self.segment_alive_bitset.num_alive_docs(),
165+
)
163166
.finish()
164167
}
165168
}
166169

167170
impl DeletedDocuments {
168171
pub fn contains(&self, doc: DocId) -> bool {
169-
self.memory_deleted.contains(&doc) || self.segment_deleted.is_deleted(doc)
172+
self.memory_deleted.contains(&doc) || self.segment_alive_bitset.is_deleted(doc)
170173
}
171174

172-
pub fn len(&self) -> usize {
173-
self.memory_deleted.len() + self.segment_deleted.num_deleted()
175+
pub fn approximate_num_alive_docs(&self) -> usize {
176+
self.segment_alive_bitset
177+
.num_alive_docs()
178+
.saturating_sub(self.memory_deleted.len())
174179
}
175180
}
176181

@@ -225,9 +230,7 @@ impl<T: DocSet> DocSet for ExcludeDeleted<T> {
225230
}
226231

227232
fn size_hint(&self) -> u32 {
228-
self.docset
229-
.size_hint()
230-
.saturating_sub(self.deleted_documents.len() as u32)
233+
self.deleted_documents.approximate_num_alive_docs() as u32
231234
}
232235
}
233236

crates/search/src/fragmented_segment.rs

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -292,18 +292,18 @@ impl MutableFragmentedSegmentMetadata {
292292
download_single_file_zip(&original.deleted_bitset_key, &deleted_bitset_path, storage)
293293
.await?;
294294

295-
let deleted_bitset = DeletedBitset::load_from_path(deleted_bitset_path)?;
295+
let deleted = DeletedBitset::load_from_path(deleted_bitset_path)?;
296296

297297
// Clone is a bit of a hack here because these two deleted bitsets may become
298298
// inconsistent if one or more vectors are deleted via maybe_delete.
299299
// For now we don't care about the inconsistency because the loaded id tracker
300300
// is only used as part of maybe_delete, which is idempotent.
301-
let id_tracker = VectorStaticIdTracker(StaticIdTracker::load_from_path(
302-
id_tracker_path,
303-
deleted_bitset.clone(),
304-
)?);
301+
let id_tracker = VectorStaticIdTracker {
302+
id_tracker: StaticIdTracker::load_from_path(id_tracker_path)?,
303+
deleted_bitset: deleted.clone(),
304+
};
305305

306-
Ok(Self::new(original, id_tracker, deleted_bitset))
306+
Ok(Self::new(original, id_tracker, deleted))
307307
}
308308

309309
pub async fn upload_deleted_bitset(

crates/search/src/incremental_index.rs

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,10 @@ use tantivy::{
99
IndexBuilder,
1010
SingleSegmentIndexWriter,
1111
};
12-
use text_search::tracker::MemoryIdAndDeletionTracker;
12+
use text_search::tracker::{
13+
MemoryDeletionTracker,
14+
SearchMemoryIdTracker,
15+
};
1316

1417
use crate::{
1518
constants::CONVEX_EN_TOKENIZER,
@@ -24,7 +27,7 @@ const SEGMENT_MAX_SIZE_BYTES: usize = 10_000_000;
2427
#[allow(dead_code)]
2528
pub(crate) const ID_TRACKER_PATH: &str = "id_tracker";
2629
#[allow(dead_code)]
27-
pub(crate) const DELETED_TANTIVY_IDS_PATH: &str = "deleted_tantivy_ids";
30+
pub(crate) const ALIVE_BITSET_PATH: &str = "tantivy_alive_bitset";
2831
#[allow(dead_code)]
2932
pub(crate) const DELETED_TERMS_PATH: &str = "deleted_terms";
3033

@@ -42,7 +45,7 @@ pub async fn build_index(
4245
.tokenizers()
4346
.register(CONVEX_EN_TOKENIZER, convex_en());
4447
let mut segment_writer = SingleSegmentIndexWriter::new(index, SEGMENT_MAX_SIZE_BYTES)?;
45-
let mut tracker = MemoryIdAndDeletionTracker::default();
48+
let mut id_tracker = SearchMemoryIdTracker::default();
4649
futures::pin_mut!(revision_stream);
4750
// Keep track of the document IDs we've seen so we can check for duplicates.
4851
// We'll discard revisions to documents that we've already seen because we are
@@ -58,13 +61,14 @@ pub async fn build_index(
5861
let tantivy_document =
5962
tantivy_schema.index_into_tantivy_document(new_document, revision_pair.ts());
6063
let doc_id = segment_writer.add_document(tantivy_document)?;
61-
tracker.set_link(convex_id, doc_id)?;
64+
id_tracker.set_link(convex_id, doc_id)?;
6265
}
6366
}
6467
segment_writer.finalize()?;
68+
id_tracker.write(dir.to_path_buf().join(ID_TRACKER_PATH))?;
69+
let tracker = MemoryDeletionTracker::new(document_ids_seen.len() as u32);
6570
tracker.write(
66-
dir.to_path_buf().join(ID_TRACKER_PATH),
67-
dir.to_path_buf().join(DELETED_TANTIVY_IDS_PATH),
71+
dir.to_path_buf().join(ALIVE_BITSET_PATH),
6872
dir.to_path_buf().join(DELETED_TERMS_PATH),
6973
)?;
7074
Ok(())

0 commit comments

Comments
 (0)