Skip to content

Commit 2308de6

Browse files
sjudd (Convex, Inc.)
authored and
Convex, Inc.
committed
Avoid building empty text segments, more backfilling tests. (#26180)
GitOrigin-RevId: e9363f1e98116df8f7a10f2a1614d110ce75e9b6
1 parent 69d2f8f commit 2308de6

File tree

5 files changed

+218
-26
lines changed

5 files changed

+218
-26
lines changed

crates/database/src/bootstrap_model/index.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,8 @@ impl<'a, RT: Runtime> IndexModel<'a, RT> {
172172
let metadata = self
173173
.pending_index_metadata(namespace, index)?
174174
.ok_or_else(|| anyhow::anyhow!("Failed to find pending index: {}", index))?;
175-
self.enable_index(&metadata.into_value()).await
175+
self.enable_index(&metadata.into_value()).await?;
176+
Ok(())
176177
}
177178

178179
async fn enable_index(&mut self, backfilled_index: &TabletIndexMetadata) -> anyhow::Result<()> {

crates/database/src/text_index_worker/flusher2.rs

Lines changed: 167 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ impl<RT: Runtime> TextIndexFlusher2<RT> {
249249
let snapshot = TextIndexSnapshot {
250250
data: TextIndexSnapshotData::MultiSegment(segments),
251251
ts: backfill_ts,
252-
version: TextSnapshotVersion::V0,
252+
version: TextSnapshotVersion::V2UseStringIds,
253253
};
254254
let is_snapshotted = matches!(on_disk_state, SearchOnDiskState::SnapshottedAt(_));
255255
if is_snapshotted {
@@ -297,11 +297,16 @@ impl<RT: Runtime> TextIndexFlusher2<RT> {
297297
#[cfg(test)]
298298
mod tests {
299299
use common::{
300-
bootstrap_model::index::IndexMetadata,
300+
bootstrap_model::index::{
301+
text_index::TextIndexState,
302+
IndexConfig,
303+
IndexMetadata,
304+
},
301305
runtime::testing::TestRuntime,
302306
types::TabletIndexName,
303307
};
304308
use maplit::btreemap;
309+
use must_let::must_let;
305310
use value::TableNamespace;
306311

307312
use crate::tests::text_test_utils::{
@@ -408,6 +413,30 @@ mod tests {
408413
Ok(())
409414
}
410415

416+
#[convex_macro::test_runtime]
417+
async fn backfill_with_two_documents_leaves_document_backfilling_after_first_flush(
418+
rt: TestRuntime,
419+
) -> anyhow::Result<()> {
420+
let fixtures = TextFixtures::new(rt).await?;
421+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
422+
423+
fixtures.add_document("cat").await?;
424+
fixtures.add_document("dog").await?;
425+
426+
let mut flusher = fixtures
427+
.new_search_flusher_builder()
428+
.set_incremental_multipart_threshold_bytes(0)
429+
.build();
430+
// Build the first segment, which stops because the document size is > 0
431+
flusher.step().await?;
432+
let metadata = fixtures.get_index_metadata(name).await?;
433+
must_let!(let IndexConfig::Search { on_disk_state, .. }= &metadata.config);
434+
must_let!(let TextIndexState::Backfilling(backfilling_meta) = on_disk_state);
435+
assert_eq!(backfilling_meta.segments.len(), 1);
436+
437+
Ok(())
438+
}
439+
411440
#[convex_macro::test_runtime]
412441
async fn backfill_with_two_documents_0_max_segment_size_includes_both_documents(
413442
rt: TestRuntime,
@@ -437,4 +466,140 @@ mod tests {
437466

438467
Ok(())
439468
}
469+
470+
#[convex_macro::test_runtime]
471+
async fn backfill_with_empty_index_adds_no_segments(rt: TestRuntime) -> anyhow::Result<()> {
472+
let fixtures = TextFixtures::new(rt).await?;
473+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
474+
let mut flusher = fixtures.new_search_flusher2();
475+
flusher.step().await?;
476+
477+
let segments = fixtures.get_segments_metadata(name).await?;
478+
assert_eq!(0, segments.len());
479+
480+
Ok(())
481+
}
482+
483+
#[convex_macro::test_runtime]
484+
async fn backfill_with_empty_backfilled_index_new_document_adds_document(
485+
rt: TestRuntime,
486+
) -> anyhow::Result<()> {
487+
let fixtures = TextFixtures::new(rt).await?;
488+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
489+
let mut flusher = fixtures.new_search_flusher2();
490+
flusher.step().await?;
491+
492+
let doc_id = fixtures.add_document("cat").await?;
493+
494+
flusher.step().await?;
495+
496+
fixtures.enable_index(&name).await?;
497+
let results = fixtures.search(name, "cat").await?;
498+
assert_eq!(doc_id, results.first().unwrap().id());
499+
500+
Ok(())
501+
}
502+
503+
#[convex_macro::test_runtime]
504+
async fn backfill_with_non_empty_backfilled_index_new_document_adds_document(
505+
rt: TestRuntime,
506+
) -> anyhow::Result<()> {
507+
let fixtures = TextFixtures::new(rt).await?;
508+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
509+
fixtures.add_document("dog").await?;
510+
let mut flusher = fixtures.new_search_flusher2();
511+
flusher.step().await?;
512+
513+
let doc_id = fixtures.add_document("cat").await?;
514+
515+
flusher.step().await?;
516+
517+
fixtures.enable_index(&name).await?;
518+
let results = fixtures.search(name, "cat").await?;
519+
assert_eq!(doc_id, results.first().unwrap().id());
520+
521+
Ok(())
522+
}
523+
524+
#[convex_macro::test_runtime]
525+
async fn backfill_with_empty_enabled_index_new_document_adds_document(
526+
rt: TestRuntime,
527+
) -> anyhow::Result<()> {
528+
let fixtures = TextFixtures::new(rt).await?;
529+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
530+
let mut flusher = fixtures.new_search_flusher2();
531+
flusher.step().await?;
532+
fixtures.enable_index(&name).await?;
533+
534+
let doc_id = fixtures.add_document("cat").await?;
535+
536+
flusher.step().await?;
537+
538+
let results = fixtures.search(name, "cat").await?;
539+
assert_eq!(doc_id, results.first().unwrap().id());
540+
541+
Ok(())
542+
}
543+
544+
#[convex_macro::test_runtime]
545+
async fn backfill_with_non_empty_enabled_index_new_document_adds_document(
546+
rt: TestRuntime,
547+
) -> anyhow::Result<()> {
548+
let fixtures = TextFixtures::new(rt).await?;
549+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
550+
fixtures.add_document("dog").await?;
551+
let mut flusher = fixtures.new_search_flusher2();
552+
flusher.step().await?;
553+
fixtures.enable_index(&name).await?;
554+
555+
let doc_id = fixtures.add_document("cat").await?;
556+
557+
flusher.step().await?;
558+
559+
let results = fixtures.search(name, "cat").await?;
560+
assert_eq!(doc_id, results.first().unwrap().id());
561+
562+
Ok(())
563+
}
564+
565+
#[convex_macro::test_runtime]
566+
async fn backfill_with_non_empty_enabled_index_new_document_adds_new_segment(
567+
rt: TestRuntime,
568+
) -> anyhow::Result<()> {
569+
let fixtures = TextFixtures::new(rt).await?;
570+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
571+
fixtures.add_document("dog").await?;
572+
let mut flusher = fixtures.new_search_flusher2();
573+
flusher.step().await?;
574+
fixtures.enable_index(&name).await?;
575+
576+
fixtures.add_document("cat").await?;
577+
578+
flusher.step().await?;
579+
580+
let segments = fixtures.get_segments_metadata(name).await?;
581+
assert_eq!(segments.len(), 2);
582+
583+
Ok(())
584+
}
585+
#[convex_macro::test_runtime]
586+
async fn backfill_with_non_empty_backfilled_index_new_document_adds_new_segment(
587+
rt: TestRuntime,
588+
) -> anyhow::Result<()> {
589+
let fixtures = TextFixtures::new(rt).await?;
590+
let IndexMetadata { name, .. } = fixtures.insert_backfilling_text_index().await?;
591+
fixtures.add_document("dog").await?;
592+
let mut flusher = fixtures.new_search_flusher2();
593+
flusher.step().await?;
594+
595+
fixtures.add_document("cat").await?;
596+
597+
flusher.step().await?;
598+
599+
fixtures.enable_index(&name).await?;
600+
let segments = fixtures.get_segments_metadata(name).await?;
601+
assert_eq!(segments.len(), 2);
602+
603+
Ok(())
604+
}
440605
}

crates/database/src/text_index_worker/text_meta.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -165,17 +165,13 @@ impl SearchIndex for TextSearchIndex {
165165
previous_segments: &mut Self::PreviousSegments,
166166
) -> anyhow::Result<Option<Self::NewSegment>> {
167167
let revision_stream = Box::pin(stream_revision_pairs(documents, &reader));
168-
// TODO(CX-6496): Make build_segment return None if there are no new documents
169-
// to index.
170-
Ok(Some(
171-
build_new_segment(
172-
revision_stream,
173-
schema.clone(),
174-
index_path,
175-
previous_segments,
176-
)
177-
.await?,
178-
))
168+
build_new_segment(
169+
revision_stream,
170+
schema.clone(),
171+
index_path,
172+
previous_segments,
173+
)
174+
.await
179175
}
180176

181177
async fn upload_new_segment<RT: Runtime>(

crates/search/src/incremental_index.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,7 +275,7 @@ pub async fn build_new_segment(
275275
tantivy_schema: TantivySearchIndexSchema,
276276
dir: &Path,
277277
previous_segments: &mut PreviousTextSegments,
278-
) -> anyhow::Result<NewTextSegment> {
278+
) -> anyhow::Result<Option<NewTextSegment>> {
279279
let index_path = dir.join("index_path");
280280
std::fs::create_dir(&index_path)?;
281281
let index = IndexBuilder::new()
@@ -297,6 +297,8 @@ pub async fn build_new_segment(
297297

298298
let mut num_indexed_documents = 0;
299299

300+
let mut is_at_least_one_document_indexed = false;
301+
300302
while let Some(revision_pair) = revision_stream.try_next().await? {
301303
let convex_id = revision_pair.id.internal_id();
302304
// Skip documents we have already added to the segment, but update dangling
@@ -335,6 +337,7 @@ pub async fn build_new_segment(
335337
}
336338
// Addition
337339
if let Some(new_document) = revision_pair.document() {
340+
is_at_least_one_document_indexed = true;
338341
num_indexed_documents += 1;
339342
dangling_deletes.remove(&convex_id);
340343
let tantivy_document =
@@ -348,6 +351,9 @@ pub async fn build_new_segment(
348351
"Dangling deletes is not empty. A document was deleted that is not present in other \
349352
segments nor in this stream"
350353
);
354+
if !is_at_least_one_document_indexed {
355+
return Ok(None);
356+
}
351357
segment_writer.finalize()?;
352358

353359
let new_deletion_tracker = MemoryDeletionTracker::new(new_id_tracker.num_ids() as u32);
@@ -363,10 +369,10 @@ pub async fn build_new_segment(
363369
alive_bit_set_path,
364370
deleted_terms_path,
365371
};
366-
Ok(NewTextSegment {
372+
Ok(Some(NewTextSegment {
367373
paths,
368374
num_indexed_documents,
369-
})
375+
}))
370376
}
371377

372378
pub struct SearchSegmentForMerge {

crates/search/src/searcher/searcher.rs

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1506,7 +1506,8 @@ mod tests {
15061506
test_dir.path(),
15071507
&mut previous_segments,
15081508
)
1509-
.await?;
1509+
.await?
1510+
.unwrap();
15101511
let updated_segments = previous_segments.finalize();
15111512
assert!(updated_segments.is_empty());
15121513
println!("Indexed {dataset_path} in {:?}", start.elapsed());
@@ -1637,8 +1638,11 @@ mod tests {
16371638

16381639
#[derive(Clone)]
16391640
struct TestIndex {
1641+
/// Only used for printing debug info.
16401642
strings_by_id: BTreeMap<ResolvedDocumentId, Option<String>>,
1641-
segment_paths: TextSegmentPaths,
1643+
/// Note - this is only the latest segment. This struct and tests don't
1644+
/// support querying multiple segments within an index.
1645+
segment_paths: Option<TextSegmentPaths>,
16421646
#[allow(dead_code)]
16431647
previous_segment_dirs: Vec<PathBuf>,
16441648
}
@@ -1703,7 +1707,7 @@ mod tests {
17031707
}
17041708
Ok(TestIndex {
17051709
strings_by_id,
1706-
segment_paths: new_segment.paths,
1710+
segment_paths: new_segment.map(|segment| segment.paths),
17071711
previous_segment_dirs,
17081712
})
17091713
}
@@ -1738,6 +1742,10 @@ mod tests {
17381742
test_index: TestIndex,
17391743
) -> anyhow::Result<Vec<(PostingListMatch, String)>> {
17401744
let segment_paths = test_index.segment_paths;
1745+
let Some(segment_paths) = segment_paths else {
1746+
println!("Empty segment!");
1747+
return Ok(vec![]);
1748+
};
17411749

17421750
let index_reader = index_reader_for_directory(&segment_paths.index_path)?;
17431751
let searcher = index_reader.searcher();
@@ -1925,13 +1933,29 @@ mod tests {
19251933
let (posting_list_match, s) = posting_list_matches.first().unwrap();
19261934
assert_eq!(posting_list_match.internal_id, id);
19271935
assert_eq!(s, "emma is awesome!");
1928-
let previous_segments =
1929-
PreviousTextSegments(vec![UpdatableTextSegment::load(&test_index.segment_paths)?]);
1936+
let previous_segments = PreviousTextSegments(vec![UpdatableTextSegment::load(
1937+
&test_index.segment_paths.clone().unwrap(),
1938+
)?]);
19301939
let test_dir = TempDir::new()?;
19311940
let delete_document: Vec<_> = vec![(id, Some("emma is awesome!"), None)];
19321941

1933-
let test_index =
1942+
let new_test_index =
19341943
build_test_index(delete_document.into(), test_dir.path(), previous_segments).await?;
1944+
1945+
let previous_segment_paths = new_test_index.previous_segment_dirs.first().unwrap();
1946+
let alive_bitset_path = previous_segment_paths.join(ALIVE_BITSET_PATH);
1947+
let deleted_terms_path = previous_segment_paths.join(DELETED_TERMS_PATH);
1948+
1949+
let test_index = TestIndex {
1950+
strings_by_id: new_test_index.strings_by_id,
1951+
segment_paths: Some(TextSegmentPaths {
1952+
alive_bit_set_path: alive_bitset_path,
1953+
deleted_terms_path,
1954+
..test_index.segment_paths.unwrap()
1955+
}),
1956+
previous_segment_dirs: new_test_index.previous_segment_dirs,
1957+
};
1958+
19351959
let posting_list_matches =
19361960
incremental_search_with_deletions_helper(query, test_index.clone()).await?;
19371961
assert_eq!(posting_list_matches.len(), 0);
@@ -1978,8 +2002,8 @@ mod tests {
19782002
assert_eq!(s, "emma is awesome!");
19792003

19802004
let segments = vec![
1981-
search_segment_from_path(&test_index_1.segment_paths)?,
1982-
search_segment_from_path(&test_index_2.segment_paths)?,
2005+
search_segment_from_path(&test_index_1.segment_paths.unwrap())?,
2006+
search_segment_from_path(&test_index_2.segment_paths.unwrap())?,
19832007
];
19842008

19852009
let merged_dir = TempDir::new()?;
@@ -1989,7 +2013,7 @@ mod tests {
19892013
merged_strings_by_id.append(&mut test_index_2.strings_by_id.clone());
19902014
let merged_index = TestIndex {
19912015
strings_by_id: merged_strings_by_id,
1992-
segment_paths: merged_paths,
2016+
segment_paths: Some(merged_paths),
19932017
previous_segment_dirs: vec![],
19942018
};
19952019
let mut posting_list_matches =

0 commit comments

Comments
 (0)