Skip to content

Commit aaa82ef

Browse files
emmaling27Convex, Inc.
authored and
Convex, Inc.
committed
Use search2 query path with a single segment (#26037)
This PR adds a knob for using the `search2` query path and adds support for using `search2` with the existing single-segment search indexes. For the single search index, the id tracker is optional and we construct an empty deletion tracker. GitOrigin-RevId: 03734183805da31a713217fd7250e32cfa75e64c
1 parent 05c9f64 commit aaa82ef

File tree

10 files changed

+356
-128
lines changed

10 files changed

+356
-128
lines changed

crates/common/src/knobs.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1018,3 +1018,7 @@ pub static BUILD_MULTI_SEGMENT_TEXT_INDEXES: LazyLock<bool> =
10181018
/// the "backend_startup" domain keyed by db cluster name.
10191019
pub static STARTUP_RATE_LIMIT_ENABLED: LazyLock<bool> =
10201020
LazyLock::new(|| env_config("STARTUP_RATE_LIMIT_ENABLED", false));
1021+
1022+
/// Use the multi-segment search algorithm for search queries.
1023+
pub static USE_MULTI_SEGMENT_SEARCH_QUERY: LazyLock<bool> =
1024+
LazyLock::new(|| env_config("USE_MULTI_SEGMENT_SEARCH_QUERY", false));

crates/database/src/tests/randomized_search_tests.rs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,11 +71,11 @@ use search::{
7171
scoring::Bm25StatisticsDiff,
7272
searcher::{
7373
Bm25Stats,
74-
FragmentedTextSegmentStorageKeys,
7574
InProcessSearcher,
7675
PostingListMatch,
7776
PostingListQuery,
7877
Term,
78+
TextStorageKeys,
7979
TokenMatch,
8080
TokenQuery,
8181
},
@@ -963,10 +963,18 @@ impl Searcher for BrokenSearcher {
963963
anyhow::bail!("要");
964964
}
965965

966+
async fn number_of_segments(
967+
&self,
968+
_search_storage: Arc<dyn Storage>,
969+
_storage_key: ObjectKey,
970+
) -> anyhow::Result<usize> {
971+
anyhow::bail!("wut")
972+
}
973+
966974
async fn query_tokens(
967975
&self,
968976
_: Arc<dyn Storage>,
969-
_: FragmentedTextSegmentStorageKeys,
977+
_: TextStorageKeys,
970978
_: Vec<TokenQuery>,
971979
_: usize,
972980
) -> anyhow::Result<Vec<TokenMatch>> {
@@ -976,7 +984,7 @@ impl Searcher for BrokenSearcher {
976984
async fn query_bm25_stats(
977985
&self,
978986
_: Arc<dyn Storage>,
979-
_: FragmentedTextSegmentStorageKeys,
987+
_: TextStorageKeys,
980988
_: Vec<Term>,
981989
) -> anyhow::Result<Bm25Stats> {
982990
anyhow::bail!("plein")
@@ -985,7 +993,7 @@ impl Searcher for BrokenSearcher {
985993
async fn query_posting_lists(
986994
&self,
987995
_: Arc<dyn Storage>,
988-
_: FragmentedTextSegmentStorageKeys,
996+
_: TextStorageKeys,
989997
_: PostingListQuery,
990998
) -> anyhow::Result<Vec<PostingListMatch>> {
991999
anyhow::bail!("texte");

crates/database/src/tests/vector_test_utils.rs

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,12 @@ use search::{
5151
scoring::Bm25StatisticsDiff,
5252
searcher::{
5353
Bm25Stats,
54-
FragmentedTextSegmentStorageKeys,
5554
InProcessSearcher,
5655
PostingListMatch,
5756
PostingListQuery,
5857
Searcher,
5958
Term,
59+
TextStorageKeys,
6060
TokenMatch,
6161
TokenQuery,
6262
},
@@ -576,10 +576,20 @@ impl<RT: Runtime> Searcher for DeleteOnCompactSearchlight<RT> {
576576
.await
577577
}
578578

579+
async fn number_of_segments(
580+
&self,
581+
search_storage: Arc<dyn Storage>,
582+
storage_key: ObjectKey,
583+
) -> anyhow::Result<usize> {
584+
self.searcher
585+
.number_of_segments(search_storage, storage_key)
586+
.await
587+
}
588+
579589
async fn query_tokens(
580590
&self,
581591
search_storage: Arc<dyn Storage>,
582-
storage_keys: FragmentedTextSegmentStorageKeys,
592+
storage_keys: TextStorageKeys,
583593
queries: Vec<TokenQuery>,
584594
max_results: usize,
585595
) -> anyhow::Result<Vec<TokenMatch>> {
@@ -591,7 +601,7 @@ impl<RT: Runtime> Searcher for DeleteOnCompactSearchlight<RT> {
591601
async fn query_bm25_stats(
592602
&self,
593603
search_storage: Arc<dyn Storage>,
594-
storage_keys: FragmentedTextSegmentStorageKeys,
604+
storage_keys: TextStorageKeys,
595605
terms: Vec<Term>,
596606
) -> anyhow::Result<Bm25Stats> {
597607
self.searcher
@@ -602,7 +612,7 @@ impl<RT: Runtime> Searcher for DeleteOnCompactSearchlight<RT> {
602612
async fn query_posting_lists(
603613
&self,
604614
search_storage: Arc<dyn Storage>,
605-
storage_keys: FragmentedTextSegmentStorageKeys,
615+
storage_keys: TextStorageKeys,
606616
query: PostingListQuery,
607617
) -> anyhow::Result<Vec<PostingListMatch>> {
608618
self.searcher

crates/pb/protos/searchlight.proto

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ service Searchlight {
1111
rpc ExecuteVectorCompaction(VectorCompactionRequest) returns (VectorCompactionResponse);
1212
rpc QueueVectorPrefetch(VectorPrefetchRequest) returns (VectorPrefetchResponse);
1313

14+
rpc NumberOfSegments(SegmentRequest) returns (SegmentResponse);
15+
1416
// Query a set of tokens against the term dictionary, optionally allowing
1517
// for fuzzy matching and prefix matching. Take the top `K` results with
1618
// respect to `(edit distance, term)` lexicographical order.
@@ -214,6 +216,15 @@ message StorageType {
214216
}
215217
}
216218

219+
message SegmentRequest {
220+
StorageKey segment = 1;
221+
StorageType storage_type = 2;
222+
}
223+
224+
message SegmentResponse {
225+
uint32 number_of_segments = 1;
226+
}
227+
217228
message QueryTokensRequest {
218229
StorageType storage_type = 1;
219230
FragmentedTextSegmentPaths segment = 2;
@@ -223,8 +234,20 @@ message QueryTokensRequest {
223234

224235
message FragmentedTextSegmentPaths {
225236
StorageKey segment = 1;
226-
StorageKey id_tracker = 2;
227-
StorageKey deletions = 3;
237+
oneof segment_metadata {
238+
SingleSegmentMetadata single_segment = 2;
239+
MultiSegmentMetadata multi_segment = 3;
240+
}
241+
}
242+
243+
message SingleSegmentMetadata {
244+
optional uint32 segment_ord = 1;
245+
}
246+
247+
message MultiSegmentMetadata {
248+
StorageKey id_tracker = 1;
249+
StorageKey deleted_terms_table = 2;
250+
StorageKey alive_bitset = 3;
228251
}
229252

230253
message TokenQuery {

crates/search/src/lib.rs

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ use common::{
4747
},
4848
document::ResolvedDocument,
4949
index::IndexKeyBytes,
50+
knobs::USE_MULTI_SEGMENT_SEARCH_QUERY,
5051
query::{
5152
search_value_to_bytes,
5253
InternalSearch,
@@ -84,7 +85,7 @@ use query::{
8485
RevisionWithKeys,
8586
TextQueryTerm,
8687
};
87-
use searcher::FragmentedTextSegmentStorageKeys;
88+
use searcher::TextStorageKeys;
8889
use storage::Storage;
8990
pub use tantivy::Document as TantivyDocument;
9091
use tantivy::{
@@ -410,7 +411,7 @@ impl TantivySearchIndexSchema {
410411
compiled_query: CompiledQuery,
411412
memory_index: &MemorySearchIndex,
412413
search_storage: Arc<dyn Storage>,
413-
segments: Vec<FragmentedTextSegmentStorageKeys>,
414+
segments: Vec<TextStorageKeys>,
414415
disk_index_ts: Timestamp,
415416
searcher: Arc<dyn Searcher>,
416417
) -> anyhow::Result<RevisionWithKeys> {
@@ -438,11 +439,11 @@ impl TantivySearchIndexSchema {
438439
// and merge the results to get the top terms.
439440
let mut match_aggregator = TokenMatchAggregator::new(MAX_UNIQUE_QUERY_TERMS);
440441
memory_index.query_tokens(&token_queries, &mut match_aggregator)?;
441-
for segment in &segments {
442+
for segment in segments.clone() {
442443
let segment_token_matches = searcher
443444
.query_tokens(
444445
search_storage.clone(),
445-
segment.clone(),
446+
segment,
446447
token_queries.clone(),
447448
MAX_UNIQUE_QUERY_TERMS,
448449
)
@@ -482,9 +483,9 @@ impl TantivySearchIndexSchema {
482483
// Step 3: Given the terms we decided on, query BM25 statistics across all of
483484
// the indexes and merge their results.
484485
let mut bm25_stats = Bm25Stats::empty();
485-
for segment in &segments {
486+
for segment in segments.clone() {
486487
bm25_stats += searcher
487-
.query_bm25_stats(search_storage.clone(), segment.clone(), terms.clone())
488+
.query_bm25_stats(search_storage.clone(), segment, terms.clone())
488489
.await?;
489490
}
490491
bm25_stats = memory_index.update_bm25_stats(disk_index_ts, &terms, bm25_stats)?;
@@ -545,9 +546,9 @@ impl TantivySearchIndexSchema {
545546
&mut match_aggregator,
546547
)?;
547548
}
548-
for segment in &segments {
549+
for segment in segments {
549550
let segment_matches = searcher
550-
.query_posting_lists(search_storage.clone(), segment.clone(), query.clone())
551+
.query_posting_lists(search_storage.clone(), segment, query.clone())
551552
.await?;
552553
for m in segment_matches {
553554
if !match_aggregator.insert(m) {
@@ -590,6 +591,27 @@ impl TantivySearchIndexSchema {
590591
disk_index_ts: Timestamp,
591592
searcher: Arc<dyn Searcher>,
592593
) -> anyhow::Result<RevisionWithKeys> {
594+
if *USE_MULTI_SEGMENT_SEARCH_QUERY {
595+
let number_of_segments = searcher
596+
.number_of_segments(search_storage.clone(), disk_index.clone())
597+
.await?;
598+
let segments = (0..number_of_segments)
599+
.map(|i| TextStorageKeys::SingleSegment {
600+
storage_key: disk_index.clone(),
601+
segment_ord: i as u32,
602+
})
603+
.collect();
604+
return self
605+
.search2(
606+
compiled_query,
607+
memory_index,
608+
search_storage,
609+
segments,
610+
disk_index_ts,
611+
searcher,
612+
)
613+
.await;
614+
}
593615
// 1. Fetch the memory index matches for each QueryTerm in the query and bound.
594616
let (term_shortlist, term_shortlist_ids) =
595617
memory_index.bound_and_evaluate_query_terms(&compiled_query.text_query);

crates/search/src/search_index_manager.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,10 @@ use crate::{
4949
CompiledQuery,
5050
RevisionWithKeys,
5151
},
52-
searcher::FragmentedTextSegmentStorageKeys,
52+
searcher::{
53+
FragmentedTextStorageKeys,
54+
TextStorageKeys,
55+
},
5356
QueryResults,
5457
Searcher,
5558
TantivySearchIndexSchema,
@@ -292,7 +295,11 @@ impl SearchIndexManager {
292295
segments
293296
.iter()
294297
.cloned()
295-
.map(FragmentedTextSegmentStorageKeys::from)
298+
.map(|segment| {
299+
TextStorageKeys::MultiSegment(FragmentedTextStorageKeys::from(
300+
segment,
301+
))
302+
})
296303
.collect(),
297304
*disk_index_ts,
298305
searcher,

crates/search/src/searcher/in_process.rs

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,9 @@ use vector::{
2525

2626
use super::searcher::{
2727
Bm25Stats,
28-
FragmentedTextSegmentStorageKeys,
2928
PostingListMatch,
3029
PostingListQuery,
30+
TextStorageKeys,
3131
TokenMatch,
3232
TokenQuery,
3333
};
@@ -61,10 +61,18 @@ impl Searcher for SearcherStub {
6161
Ok(SearchQueryResult::empty())
6262
}
6363

64+
async fn number_of_segments(
65+
&self,
66+
_search_storage: Arc<dyn Storage>,
67+
_storage_key: ObjectKey,
68+
) -> anyhow::Result<usize> {
69+
Ok(1)
70+
}
71+
6472
async fn query_tokens(
6573
&self,
6674
_search_storage: Arc<dyn Storage>,
67-
_storage_keys: FragmentedTextSegmentStorageKeys,
75+
_storage_keys: TextStorageKeys,
6876
_queries: Vec<TokenQuery>,
6977
_max_results: usize,
7078
) -> anyhow::Result<Vec<TokenMatch>> {
@@ -74,7 +82,7 @@ impl Searcher for SearcherStub {
7482
async fn query_bm25_stats(
7583
&self,
7684
_search_storage: Arc<dyn Storage>,
77-
_storage_keys: FragmentedTextSegmentStorageKeys,
85+
_storage_keys: TextStorageKeys,
7886
_terms: Vec<Term>,
7987
) -> anyhow::Result<Bm25Stats> {
8088
Ok(Bm25Stats {
@@ -87,7 +95,7 @@ impl Searcher for SearcherStub {
8795
async fn query_posting_lists(
8896
&self,
8997
_search_storage: Arc<dyn Storage>,
90-
_storage_keys: FragmentedTextSegmentStorageKeys,
98+
_storage_keys: TextStorageKeys,
9199
_query: PostingListQuery,
92100
) -> anyhow::Result<Vec<PostingListMatch>> {
93101
Ok(vec![])
@@ -161,10 +169,20 @@ impl<RT: Runtime> Searcher for InProcessSearcher<RT> {
161169
.await
162170
}
163171

172+
async fn number_of_segments(
173+
&self,
174+
search_storage: Arc<dyn Storage>,
175+
storage_key: ObjectKey,
176+
) -> anyhow::Result<usize> {
177+
self.searcher
178+
.number_of_segments(search_storage, storage_key)
179+
.await
180+
}
181+
164182
async fn query_tokens(
165183
&self,
166184
search_storage: Arc<dyn Storage>,
167-
storage_keys: FragmentedTextSegmentStorageKeys,
185+
storage_keys: TextStorageKeys,
168186
queries: Vec<TokenQuery>,
169187
max_results: usize,
170188
) -> anyhow::Result<Vec<TokenMatch>> {
@@ -176,7 +194,7 @@ impl<RT: Runtime> Searcher for InProcessSearcher<RT> {
176194
async fn query_bm25_stats(
177195
&self,
178196
search_storage: Arc<dyn Storage>,
179-
storage_keys: FragmentedTextSegmentStorageKeys,
197+
storage_keys: TextStorageKeys,
180198
terms: Vec<Term>,
181199
) -> anyhow::Result<Bm25Stats> {
182200
self.searcher
@@ -187,7 +205,7 @@ impl<RT: Runtime> Searcher for InProcessSearcher<RT> {
187205
async fn query_posting_lists(
188206
&self,
189207
search_storage: Arc<dyn Storage>,
190-
storage_keys: FragmentedTextSegmentStorageKeys,
208+
storage_keys: TextStorageKeys,
191209
query: PostingListQuery,
192210
) -> anyhow::Result<Vec<PostingListMatch>> {
193211
self.searcher

crates/search/src/searcher/mod.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,13 @@ pub use in_process::{
1111
};
1212
pub use searcher::{
1313
Bm25Stats,
14-
FragmentedTextSegmentStorageKeys,
14+
FragmentedTextStorageKeys,
1515
PostingListMatch,
1616
PostingListQuery,
1717
Searcher,
1818
SearcherImpl,
1919
Term,
20+
TextStorageKeys,
2021
TokenMatch,
2122
TokenQuery,
2223
};

0 commit comments

Comments
 (0)