Skip to content

Commit 2b121fb

Browse files
emmaling27Convex, Inc.
authored and
Convex, Inc.
committed
Fix Bm25Stats underflow bug (#26435)
GitOrigin-RevId: ca9793d438b2b5a215cd0288f5f42b5259893a76
1 parent 99468ef commit 2b121fb

File tree

2 files changed

+33
-4
lines changed

2 files changed

+33
-4
lines changed

crates/database/src/tests/randomized_search_tests.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,6 +755,26 @@ async fn test_filtering_match_updates(rt: TestRuntime) -> anyhow::Result<()> {
755755
anyhow::Ok(())
756756
}
757757

758+
#[convex_macro::test_runtime]
759+
async fn test_bm25_stats_no_underflow(rt: TestRuntime) -> anyhow::Result<()> {
760+
let mut scenario = Scenario::new(rt).await?;
761+
scenario
762+
.patch(TestKey::C, vec![TestValue::D], TestValue::A)
763+
.await?;
764+
scenario.execute(TestAction::Backfill).await?;
765+
scenario.execute(TestAction::Delete(TestKey::C)).await?;
766+
// This query doens't use the filter field, so the BM25 stats will not include
767+
// the filter field while the commit statistics will in the memory index from
768+
// the delete.
769+
scenario
770+
.execute(TestAction::QueryAndCheckScores(TestQuery {
771+
search: vec![TestValue::D],
772+
filter: None,
773+
}))
774+
.await?;
775+
anyhow::Ok(())
776+
}
777+
758778
// Regression test: We had a bug where we were computing the index of a matching
759779
// union term incorrectly.
760780
//

crates/search/src/memory_index/mod.rs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -464,10 +464,19 @@ impl MemorySearchIndex {
464464
.checked_add_signed(commit_stats.total_docs_diff as i64)
465465
.context("num_documents underflow")?;
466466
for (field, total_term_diff) in &commit_stats.total_term_diff_by_field {
467-
let term_diff = stats.num_terms_by_field.entry(*field).or_insert(0);
468-
*term_diff = term_diff
469-
.checked_add_signed(*total_term_diff as i64)
470-
.context("num_terms underflow")?;
467+
// It's possible some fields are present in the commit statistics but not the
468+
// Bm25Stats because the Bm25Stats are query-specific, and the commit statistics
469+
// are not. e.g. a filter field that isn't in the query might not appear in the
470+
// Bm25 stats. We only need to update the fields that are in the Bm25Stats.
471+
if let Some(term_diff) = stats.num_terms_by_field.get_mut(field) {
472+
*term_diff = term_diff
473+
.checked_add_signed(*total_term_diff as i64)
474+
.context("num_terms underflow")?;
475+
} else if field == &Field::from_field_id(SEARCH_FIELD_ID) {
476+
stats
477+
.num_terms_by_field
478+
.insert(*field, (*total_term_diff as i64).try_into()?);
479+
}
471480
}
472481
for (term, term_id) in &term_ids {
473482
let Some(&increment) = commit_stats.term_freq_diffs.get(term_id) else {

0 commit comments

Comments
 (0)