@@ -46,8 +46,10 @@ use pb::searchlight::{
46
46
FragmentedVectorSegmentPaths ,
47
47
MultiSegmentMetadata ,
48
48
QueryBm25StatsResponse ,
49
+ SegmentTermMetadataResponse ,
49
50
SingleSegmentMetadata ,
50
51
StorageKey ,
52
+ TermOrdDeleteCount ,
51
53
} ;
52
54
use storage:: Storage ;
53
55
pub use tantivy:: Term ;
@@ -61,6 +63,7 @@ use tantivy::{
61
63
EnableScoring ,
62
64
} ,
63
65
schema:: Field ,
66
+ termdict:: TermOrdinal ,
64
67
SegmentReader ,
65
68
} ;
66
69
use text_search:: tracker:: {
@@ -128,6 +131,7 @@ use crate::{
128
131
TantivySearchIndexSchema ,
129
132
CREATION_TIME_FIELD_NAME ,
130
133
INTERNAL_ID_FIELD_NAME ,
134
+ SEARCH_FIELD_ID ,
131
135
TS_FIELD_NAME ,
132
136
} ;
133
137
@@ -177,6 +181,87 @@ pub trait Searcher: VectorSearcher + Send + Sync + 'static {
177
181
) -> anyhow:: Result < Vec < PostingListMatch > > ;
178
182
}
179
183
184
+ #[ cfg_attr(
185
+ any( test, feature = "testing" ) ,
186
+ derive( proptest_derive:: Arbitrary , PartialEq , Debug , Clone )
187
+ ) ]
188
+ /// Metadata about terms for a specific segment.
189
+ pub struct SegmentTermMetadata {
190
+ /// The number of documents containing the term that have been deleted, by
191
+ /// term ordinal.
192
+ pub term_documents_deleted : BTreeMap < TermOrdinal , u32 > ,
193
+ /// The number of terms that have been completely deleted from the segment.
194
+ pub num_terms_deleted : u64 ,
195
+ }
196
+
197
+ impl TryFrom < SegmentTermMetadataResponse > for SegmentTermMetadata {
198
+ type Error = anyhow:: Error ;
199
+
200
+ fn try_from (
201
+ SegmentTermMetadataResponse {
202
+ term_ords_and_delete_counts,
203
+ num_terms_deleted,
204
+ } : SegmentTermMetadataResponse ,
205
+ ) -> Result < Self , Self :: Error > {
206
+ let term_documents_deleted = term_ords_and_delete_counts
207
+ . into_iter ( )
208
+ . map (
209
+ |TermOrdDeleteCount {
210
+ term_ord,
211
+ num_docs_deleted,
212
+ } | {
213
+ let term_ord = term_ord. context ( "Missing term ord" ) ?;
214
+ let num_docs_deleted = num_docs_deleted. context ( "Missing term delete count" ) ?;
215
+ anyhow:: Ok :: < ( TermOrdinal , u32 ) > ( ( term_ord, num_docs_deleted) )
216
+ } ,
217
+ )
218
+ . try_collect ( ) ?;
219
+ let num_terms_deleted = num_terms_deleted. context ( "Missing num terms deleted" ) ?;
220
+ Ok ( SegmentTermMetadata {
221
+ term_documents_deleted,
222
+ num_terms_deleted,
223
+ } )
224
+ }
225
+ }
226
+
227
+ impl From < SegmentTermMetadata > for SegmentTermMetadataResponse {
228
+ fn from (
229
+ SegmentTermMetadata {
230
+ term_documents_deleted,
231
+ num_terms_deleted,
232
+ } : SegmentTermMetadata ,
233
+ ) -> Self {
234
+ let term_ords_and_delete_counts = term_documents_deleted
235
+ . into_iter ( )
236
+ . map ( |( term_ord, num_docs_deleted) | TermOrdDeleteCount {
237
+ term_ord : Some ( term_ord) ,
238
+ num_docs_deleted : Some ( num_docs_deleted) ,
239
+ } )
240
+ . collect ( ) ;
241
+ SegmentTermMetadataResponse {
242
+ term_ords_and_delete_counts,
243
+ num_terms_deleted : Some ( num_terms_deleted) ,
244
+ }
245
+ }
246
+ }
247
+
248
/// The raw bytes of a tantivy `Term`'s value.
///
/// Only construct one from `term.value_bytes()`, or from a proto that carries
/// those same bytes.
pub type TermValue = Vec<u8>;
251
+
252
+ #[ async_trait]
253
+ pub trait SegmentTermMetadataFetcher {
254
+ /// Gets the term ordinal from term values and determines how many terms
255
+ /// have been completely deleted from a segment, given the number of
256
+ /// documents deleted containing each term.
257
+ async fn segment_term_metadata (
258
+ & self ,
259
+ search_storage : Arc < dyn Storage > ,
260
+ segment : ObjectKey ,
261
+ terms : BTreeMap < TermValue , u32 > ,
262
+ ) -> anyhow:: Result < SegmentTermMetadata > ;
263
+ }
264
+
180
265
pub struct SearcherImpl < RT : Runtime > {
181
266
pub ( crate ) archive_cache : ArchiveCacheManager < RT > ,
182
267
segment_cache : SegmentCache < RT > ,
@@ -535,6 +620,43 @@ impl<RT: Runtime> Searcher for SearcherImpl<RT> {
535
620
}
536
621
}
537
622
623
+ #[ async_trait]
624
+ impl < RT : Runtime > SegmentTermMetadataFetcher for SearcherImpl < RT > {
625
+ async fn segment_term_metadata (
626
+ & self ,
627
+ search_storage : Arc < dyn Storage > ,
628
+ segment : ObjectKey ,
629
+ terms : BTreeMap < TermValue , u32 > ,
630
+ ) -> anyhow:: Result < SegmentTermMetadata > {
631
+ let segment_path = self
632
+ . archive_cache
633
+ . get ( search_storage, & segment, SearchFileType :: Text )
634
+ . await ?;
635
+ let reader = index_reader_for_directory ( segment_path) ?;
636
+ let searcher = reader. searcher ( ) ;
637
+ // Multisegment indexes only write to one segment.
638
+ let segment = searcher. segment_reader ( 0 ) ;
639
+ let inverted_index = segment. inverted_index ( Field :: from_field_id ( SEARCH_FIELD_ID ) ) ?;
640
+ let term_dict = inverted_index. terms ( ) ;
641
+ let mut term_documents_deleted = BTreeMap :: new ( ) ;
642
+ let mut num_terms_deleted = 0 ;
643
+ for ( term, num_documents_deleted) in terms {
644
+ let term_ord = term_dict
645
+ . term_ord ( term) ?
646
+ . context ( "Segment must contain term" ) ?;
647
+ let doc_freq = term_dict. term_info_from_ord ( term_ord) . doc_freq ;
648
+ if doc_freq == num_documents_deleted {
649
+ num_terms_deleted += 1 ;
650
+ }
651
+ term_documents_deleted. insert ( term_ord, num_documents_deleted) ;
652
+ }
653
+ Ok ( SegmentTermMetadata {
654
+ term_documents_deleted,
655
+ num_terms_deleted,
656
+ } )
657
+ }
658
+ }
659
+
538
660
#[ async_trait]
539
661
impl < RT : Runtime > VectorSearcher for SearcherImpl < RT > {
540
662
async fn execute_multi_segment_vector_query (
@@ -1399,6 +1521,12 @@ mod tests {
1399
1521
types:: Timestamp ,
1400
1522
} ;
1401
1523
use futures:: StreamExt ;
1524
+ use pb:: searchlight:: SegmentTermMetadataResponse ;
1525
+ use proptest:: {
1526
+ arbitrary:: any,
1527
+ prelude:: * ,
1528
+ proptest,
1529
+ } ;
1402
1530
use runtime:: testing:: TestRuntime ;
1403
1531
use tantivy:: {
1404
1532
Index ,
@@ -1411,13 +1539,17 @@ mod tests {
1411
1539
} ;
1412
1540
use value:: {
1413
1541
assert_obj,
1542
+ testing:: assert_roundtrips,
1414
1543
FieldPath ,
1415
1544
InternalId ,
1416
1545
ResolvedDocumentId ,
1417
1546
TabletIdAndTableNumber ,
1418
1547
} ;
1419
1548
1420
- use super :: PostingListMatch ;
1549
+ use super :: {
1550
+ PostingListMatch ,
1551
+ SegmentTermMetadata ,
1552
+ } ;
1421
1553
use crate :: {
1422
1554
convex_query:: OrTerm ,
1423
1555
disk_index:: index_reader_for_directory,
@@ -2036,4 +2168,14 @@ mod tests {
2036
2168
id_tracker : StaticIdTracker :: load_from_path ( paths. id_tracker_path . clone ( ) ) ?,
2037
2169
} )
2038
2170
}
2171
+
2172
+ proptest ! {
2173
+ #![ proptest_config(
2174
+ ProptestConfig { failure_persistence: None , ..ProptestConfig :: default ( ) }
2175
+ ) ]
2176
+ #[ test]
2177
+ fn term_metadata_roundtrips( term_metadata in any:: <SegmentTermMetadata >( ) ) {
2178
+ assert_roundtrips:: <SegmentTermMetadata , SegmentTermMetadataResponse >( term_metadata) ;
2179
+ }
2180
+ }
2039
2181
}
0 commit comments