Skip to content

Commit c896bb7

Browse files
Sujay JayakarConvex, Inc.
Sujay Jayakar
authored and
Convex, Inc.
committed
[RFC] Service definition for incremental text search Searchlight endpoints (#24870)
GitOrigin-RevId: db65000854c945963627648bdfefb86c37f91804
1 parent 6702e86 commit c896bb7

File tree

7 files changed

+1177
-3
lines changed

7 files changed

+1177
-3
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ strum = { version = "0.26", features = [ "derive" ] }
109109
sucds = { version = "0.8.1", features = [ "intrinsics" ] }
110110
syn = { version = "2.0", features = [ "full" ] }
111111
tantivy = { git = "https://github.com/get-convex/tantivy", branch = "rakeeb/reexport-tantivy-fst" }
112+
tantivy-common = { git = "https://github.com/get-convex/tantivy", branch = "rakeeb/reexport-tantivy-fst" }
112113
tempfile = "3"
113114
thiserror = "1"
114115
thousands = "0.2.0"

crates/pb/protos/searchlight.proto

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,3 +198,99 @@ message StorageType {
198198
LocalStorage local = 2;
199199
}
200200
}
201+
202+
// Service definition for the incremental text search endpoints. It exposes
// three RPCs: token lookup, BM25 statistics, and posting list queries.
service IncrementalSearchlight {
203+
// Query a set of tokens against the term dictionary, optionally allowing
204+
// for fuzzy matching and prefix matching. Take the top `K` results with
205+
// respect to `(edit distance, term)` lexicographical order.
206+
rpc QueryTokens(QueryTokensRequest) returns (QueryTokensResponse);
207+
208+
// For the given index, compute the total number of documents and terms
209+
// in the index. Also, given a list of pointers to terms within the index,
210+
// compute the document frequency of each term.
211+
rpc QueryBm25Stats(QueryBm25StatsRequest) returns (QueryBm25StatsResponse);
212+
213+
// Given an AND + OR query of term pointers and BM25 statistics for the OR
214+
// terms, return the top `K` results with respect to BM25 score.
215+
rpc QueryPostingLists(QueryPostingListsRequest) returns (QueryPostingListsResponse);
216+
}
217+
218+
// Request message for `IncrementalSearchlight.QueryTokens`.
message QueryTokensRequest {
219+
StorageType storage_type = 1;
220+
FragmentedTextSegmentPaths segment = 2;
221+
SearchIndexConfig index_config = 3;
222+
// The token queries to run. `TokenMatch.token_ord` is an offset into this
// list.
repeated TokenQuery token_queries = 4;
223+
// NOTE(review): presumably the `K` in the RPC's "top `K` results" — confirm.
uint32 max_results = 5;
224+
}
225+
226+
// Groups the three `StorageKey`s that the query requests pass as `segment`:
// one for the segment, one for its ID tracker, and one for its deletions.
message FragmentedTextSegmentPaths {
227+
StorageKey segment = 1;
228+
StorageKey id_tracker = 2;
229+
StorageKey deletions = 3;
230+
}
231+
232+
// A single entry of `QueryTokensRequest.token_queries`.
message TokenQuery {
233+
convex_token.FieldPath field_path = 1;
234+
bytes token = 2;
235+
// NOTE(review): presumably the maximum edit distance allowed for fuzzy
// matching (0 meaning exact match) — confirm.
uint32 max_distance = 3;
236+
// NOTE(review): presumably enables prefix matching for this token — confirm.
bool prefix = 4;
237+
}
238+
239+
// Response message for `IncrementalSearchlight.QueryTokens`.
//
// NOTE(review): the only field uses number 2 and number 1 is unused in this
// message — confirm whether that is intentional or should be `reserved`.
message QueryTokensResponse {
240+
repeated TokenMatch token_matches = 2;
241+
}
242+
243+
// A single entry of `QueryTokensResponse.token_matches`.
message TokenMatch {
244+
// NOTE(review): presumably the edit distance between the queried token and
// the matched term — confirm.
uint32 distance = 1;
245+
bool prefix = 2;
246+
// NOTE(review): presumably the matched term in Tantivy's serialized form —
// confirm the encoding.
bytes tantivy_bytes = 3;
247+
// Offset into `QueryTokensRequest.token_queries`.
248+
uint32 token_ord = 4;
249+
}
250+
251+
// Request message for `IncrementalSearchlight.QueryBm25Stats`.
message QueryBm25StatsRequest {
252+
StorageType storage_type = 1;
253+
FragmentedTextSegmentPaths segment = 2;
254+
// NOTE(review): presumably the "pointers to terms within the index" whose
// document frequencies are requested — confirm the byte encoding.
repeated bytes terms = 3;
255+
}
256+
257+
// Response message for `IncrementalSearchlight.QueryBm25Stats`. Per the RPC
// description, it reports the total number of terms and documents in the
// index and the document frequency of each requested term.
message QueryBm25StatsResponse {
258+
uint64 num_terms = 1;
259+
uint64 num_documents = 2;
260+
repeated DocFrequency doc_frequencies = 3;
261+
}
262+
263+
// Pairs a term with a frequency count. Used as the element type of
// `QueryBm25StatsResponse.doc_frequencies` and
// `PostingListQuery.doc_frequencies`.
message DocFrequency {
264+
bytes term = 1;
265+
uint64 frequency = 2;
266+
}
267+
268+
// Request message for `IncrementalSearchlight.QueryPostingLists`.
message QueryPostingListsRequest {
269+
StorageType storage_type = 1;
270+
FragmentedTextSegmentPaths segment = 2;
271+
PostingListQuery query = 3;
272+
}
273+
274+
// The query carried in `QueryPostingListsRequest.query`. Per the RPC
// description, this is an AND + OR query of term pointers together with BM25
// statistics for the OR terms.
message PostingListQuery {
275+
// NOTE(review): presumably internal IDs of documents to exclude from the
// results — confirm.
repeated bytes deleted_internal_ids = 1;
276+
277+
repeated bytes or_terms = 2;
278+
repeated bytes and_terms = 3;
279+
280+
// NOTE(review): the next three fields mirror `QueryBm25StatsResponse`;
// presumably they carry the BM25 statistics used for scoring — confirm.
uint64 num_terms = 4;
281+
uint64 num_documents = 5;
282+
repeated DocFrequency doc_frequencies = 6;
283+
284+
// NOTE(review): presumably the `K` in the RPC's "top `K` results" — confirm.
uint32 max_results = 7;
285+
}
286+
287+
// Response message for `IncrementalSearchlight.QueryPostingLists`.
message QueryPostingListsResponse {
288+
repeated PostingListMatch matches = 1;
289+
}
290+
291+
// A single entry of `QueryPostingListsResponse.matches`.
//
// NOTE(review): the units and epoch of `ts` and `creation_time` are not
// stated in this file — confirm and document them.
message PostingListMatch {
292+
bytes internal_id = 1;
293+
uint64 ts = 2;
294+
double creation_time = 3;
295+
// NOTE(review): presumably the BM25 score that results are ranked by —
// confirm.
float bm25_score = 4;
296+
}

crates/search/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ serde_json = { workspace = true }
3838
storage = { path = "../storage" }
3939
sucds = { workspace = true }
4040
tantivy = { workspace = true }
41+
tantivy-common = { workspace = true }
4142
tempfile = { workspace = true }
4243
tokio = { workspace = true }
4344
tracing = { workspace = true }

0 commit comments

Comments
 (0)