Skip to content

Commit 637c1d4

Browse files
emmaling27Convex, Inc.
authored and
Convex, Inc.
committed
TermMetadata RPC (#26203)
This PR adds a new trait `TermMetadataFetcher` that will get the term metadata for a segment given term values and the number of documents containing that term that were deleted. This PR doesn't hook it up to the `build_new_segment` code, but the end goal is to get rid of `TermDictionary` lookups in backend. The searcher downloads the relevant segment, gets term ordinals based on the term value, and counts the number of terms that have been completely deleted (num_deleted_docs = doc frequency in the segment). GitOrigin-RevId: f911381c2b17272575c1bc672343e15d5c5f2723
1 parent 2b8c07c commit 637c1d4

File tree

3 files changed

+168
-1
lines changed

3 files changed

+168
-1
lines changed

crates/pb/protos/searchlight.proto

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ service Searchlight {
1212
rpc QueueVectorPrefetch(VectorPrefetchRequest) returns (VectorPrefetchResponse);
1313

1414
rpc NumberOfSegments(SegmentRequest) returns (SegmentResponse);
15+
rpc SegmentTermMetadata(SegmentTermMetadataRequest) returns (SegmentTermMetadataResponse);
1516

1617
// Query a set of tokens against the term dictionary, optionally allowing
1718
// for fuzzy matching and prefix matching. Take the top `K` results with
@@ -225,6 +226,27 @@ message SegmentResponse {
225226
uint32 number_of_segments = 1;
226227
}
227228

229+
message SegmentTermMetadataRequest {
230+
StorageType storage_type = 1;
231+
StorageKey segment = 2;
232+
repeated TermValueDeleteCount term_values_and_delete_counts = 3;
233+
}
234+
235+
message TermValueDeleteCount {
236+
optional bytes term_value = 1;
237+
optional uint32 num_docs_deleted = 2;
238+
}
239+
240+
message SegmentTermMetadataResponse {
241+
repeated TermOrdDeleteCount term_ords_and_delete_counts = 1;
242+
optional uint64 num_terms_deleted = 2;
243+
}
244+
245+
message TermOrdDeleteCount {
246+
optional uint64 term_ord = 1;
247+
optional uint32 num_docs_deleted = 2;
248+
}
249+
228250
message QueryTokensRequest {
229251
StorageType storage_type = 1;
230252
FragmentedTextSegmentPaths segment = 2;

crates/search/src/searcher/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ pub use searcher::{
1616
PostingListQuery,
1717
Searcher,
1818
SearcherImpl,
19+
SegmentTermMetadata,
20+
SegmentTermMetadataFetcher,
1921
Term,
22+
TermValue,
2023
TextStorageKeys,
2124
TokenMatch,
2225
TokenQuery,

crates/search/src/searcher/searcher.rs

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,10 @@ use pb::searchlight::{
4646
FragmentedVectorSegmentPaths,
4747
MultiSegmentMetadata,
4848
QueryBm25StatsResponse,
49+
SegmentTermMetadataResponse,
4950
SingleSegmentMetadata,
5051
StorageKey,
52+
TermOrdDeleteCount,
5153
};
5254
use storage::Storage;
5355
pub use tantivy::Term;
@@ -61,6 +63,7 @@ use tantivy::{
6163
EnableScoring,
6264
},
6365
schema::Field,
66+
termdict::TermOrdinal,
6467
SegmentReader,
6568
};
6669
use text_search::tracker::{
@@ -128,6 +131,7 @@ use crate::{
128131
TantivySearchIndexSchema,
129132
CREATION_TIME_FIELD_NAME,
130133
INTERNAL_ID_FIELD_NAME,
134+
SEARCH_FIELD_ID,
131135
TS_FIELD_NAME,
132136
};
133137

@@ -177,6 +181,87 @@ pub trait Searcher: VectorSearcher + Send + Sync + 'static {
177181
) -> anyhow::Result<Vec<PostingListMatch>>;
178182
}
179183

184+
#[cfg_attr(
185+
any(test, feature = "testing"),
186+
derive(proptest_derive::Arbitrary, PartialEq, Debug, Clone)
187+
)]
188+
/// Metadata about terms for a specific segment.
189+
pub struct SegmentTermMetadata {
190+
/// The number of documents containing the term that have been deleted, by
191+
/// term ordinal.
192+
pub term_documents_deleted: BTreeMap<TermOrdinal, u32>,
193+
/// The number of terms that have been completely deleted from the segment.
194+
pub num_terms_deleted: u64,
195+
}
196+
197+
impl TryFrom<SegmentTermMetadataResponse> for SegmentTermMetadata {
198+
type Error = anyhow::Error;
199+
200+
fn try_from(
201+
SegmentTermMetadataResponse {
202+
term_ords_and_delete_counts,
203+
num_terms_deleted,
204+
}: SegmentTermMetadataResponse,
205+
) -> Result<Self, Self::Error> {
206+
let term_documents_deleted = term_ords_and_delete_counts
207+
.into_iter()
208+
.map(
209+
|TermOrdDeleteCount {
210+
term_ord,
211+
num_docs_deleted,
212+
}| {
213+
let term_ord = term_ord.context("Missing term ord")?;
214+
let num_docs_deleted = num_docs_deleted.context("Missing term delete count")?;
215+
anyhow::Ok::<(TermOrdinal, u32)>((term_ord, num_docs_deleted))
216+
},
217+
)
218+
.try_collect()?;
219+
let num_terms_deleted = num_terms_deleted.context("Missing num terms deleted")?;
220+
Ok(SegmentTermMetadata {
221+
term_documents_deleted,
222+
num_terms_deleted,
223+
})
224+
}
225+
}
226+
227+
impl From<SegmentTermMetadata> for SegmentTermMetadataResponse {
228+
fn from(
229+
SegmentTermMetadata {
230+
term_documents_deleted,
231+
num_terms_deleted,
232+
}: SegmentTermMetadata,
233+
) -> Self {
234+
let term_ords_and_delete_counts = term_documents_deleted
235+
.into_iter()
236+
.map(|(term_ord, num_docs_deleted)| TermOrdDeleteCount {
237+
term_ord: Some(term_ord),
238+
num_docs_deleted: Some(num_docs_deleted),
239+
})
240+
.collect();
241+
SegmentTermMetadataResponse {
242+
term_ords_and_delete_counts,
243+
num_terms_deleted: Some(num_terms_deleted),
244+
}
245+
}
246+
}
247+
248+
/// The value of a tantivy `Term`, should only be constructed from
249+
/// `term.value_bytes()` or protos that contain the same bytes.
250+
pub type TermValue = Vec<u8>;
251+
252+
#[async_trait]
253+
pub trait SegmentTermMetadataFetcher {
254+
/// Gets the term ordinal from term values and determines how many terms
255+
/// have been completely deleted from a segment, given the number of
256+
/// documents deleted containing each term.
257+
async fn segment_term_metadata(
258+
&self,
259+
search_storage: Arc<dyn Storage>,
260+
segment: ObjectKey,
261+
terms: BTreeMap<TermValue, u32>,
262+
) -> anyhow::Result<SegmentTermMetadata>;
263+
}
264+
180265
pub struct SearcherImpl<RT: Runtime> {
181266
pub(crate) archive_cache: ArchiveCacheManager<RT>,
182267
segment_cache: SegmentCache<RT>,
@@ -535,6 +620,43 @@ impl<RT: Runtime> Searcher for SearcherImpl<RT> {
535620
}
536621
}
537622

623+
#[async_trait]
624+
impl<RT: Runtime> SegmentTermMetadataFetcher for SearcherImpl<RT> {
625+
async fn segment_term_metadata(
626+
&self,
627+
search_storage: Arc<dyn Storage>,
628+
segment: ObjectKey,
629+
terms: BTreeMap<TermValue, u32>,
630+
) -> anyhow::Result<SegmentTermMetadata> {
631+
let segment_path = self
632+
.archive_cache
633+
.get(search_storage, &segment, SearchFileType::Text)
634+
.await?;
635+
let reader = index_reader_for_directory(segment_path)?;
636+
let searcher = reader.searcher();
637+
// Multisegment indexes only write to one segment.
638+
let segment = searcher.segment_reader(0);
639+
let inverted_index = segment.inverted_index(Field::from_field_id(SEARCH_FIELD_ID))?;
640+
let term_dict = inverted_index.terms();
641+
let mut term_documents_deleted = BTreeMap::new();
642+
let mut num_terms_deleted = 0;
643+
for (term, num_documents_deleted) in terms {
644+
let term_ord = term_dict
645+
.term_ord(term)?
646+
.context("Segment must contain term")?;
647+
let doc_freq = term_dict.term_info_from_ord(term_ord).doc_freq;
648+
if doc_freq == num_documents_deleted {
649+
num_terms_deleted += 1;
650+
}
651+
term_documents_deleted.insert(term_ord, num_documents_deleted);
652+
}
653+
Ok(SegmentTermMetadata {
654+
term_documents_deleted,
655+
num_terms_deleted,
656+
})
657+
}
658+
}
659+
538660
#[async_trait]
539661
impl<RT: Runtime> VectorSearcher for SearcherImpl<RT> {
540662
async fn execute_multi_segment_vector_query(
@@ -1399,6 +1521,12 @@ mod tests {
13991521
types::Timestamp,
14001522
};
14011523
use futures::StreamExt;
1524+
use pb::searchlight::SegmentTermMetadataResponse;
1525+
use proptest::{
1526+
arbitrary::any,
1527+
prelude::*,
1528+
proptest,
1529+
};
14021530
use runtime::testing::TestRuntime;
14031531
use tantivy::{
14041532
Index,
@@ -1411,13 +1539,17 @@ mod tests {
14111539
};
14121540
use value::{
14131541
assert_obj,
1542+
testing::assert_roundtrips,
14141543
FieldPath,
14151544
InternalId,
14161545
ResolvedDocumentId,
14171546
TabletIdAndTableNumber,
14181547
};
14191548

1420-
use super::PostingListMatch;
1549+
use super::{
1550+
PostingListMatch,
1551+
SegmentTermMetadata,
1552+
};
14211553
use crate::{
14221554
convex_query::OrTerm,
14231555
disk_index::index_reader_for_directory,
@@ -2036,4 +2168,14 @@ mod tests {
20362168
id_tracker: StaticIdTracker::load_from_path(paths.id_tracker_path.clone())?,
20372169
})
20382170
}
2171+
2172+
proptest! {
2173+
#![proptest_config(
2174+
ProptestConfig { failure_persistence: None, ..ProptestConfig::default() }
2175+
)]
2176+
#[test]
2177+
fn term_metadata_roundtrips(term_metadata in any::<SegmentTermMetadata>()) {
2178+
assert_roundtrips::<SegmentTermMetadata, SegmentTermMetadataResponse>(term_metadata);
2179+
}
2180+
}
20392181
}

0 commit comments

Comments
 (0)