Refactor ReadTest to inline Query fields; add IN-list predicate

alamb · alamb · commit 55aa92e41a24 · 2025-05-09T15:24:50.000-04:00
diff --git a/parquet/benches/arrow_reader_clickbench.rs b/parquet/benches/arrow_reader_clickbench.rs
@@ -27,7 +27,7 @@
 //! [ClickBench]: https://benchmark.clickhouse.com/
 
 use arrow::compute::kernels::cmp::{eq, neq};
-use arrow::compute::{like, nlike};
+use arrow::compute::{like, nlike, or};
 use arrow_array::types::{Int16Type, Int32Type, Int64Type};
 use arrow_array::{ArrayRef, ArrowPrimitiveType, BooleanArray, PrimitiveArray, StringViewArray};
 use arrow_schema::{ArrowError, DataType, Schema};
@@ -96,9 +96,7 @@ struct Query {
     filter_columns: Vec<&'static str>,
     /// Which columns will by projected (decoded after filter)
     projection_columns: Vec<&'static str>,
-    /// Returns a Vec of `RunPredicateFn` that filter the data. The
-    /// `RecordBatch` passed to the fn has the columns specified in
-    /// `filter_columns`
+    /// Predicates to apply; each one selects its input column by index
     predicates: Vec<ClickBenchPredicate>,
     /// How many rows are expected to pass the predicate. This serves
     /// as a sanity check that the benchmark is working correctly.
@@ -420,8 +418,7 @@ fn all_queries() -> Vec<Query> {
                 // ClickBenchPredicate::gt_eq_literal::<Int16Type>(1, str_to_i16_date("2013-07-01")),
                 // ClickBenchPredicate::lt_eq_literal::<Int16Type>(1, str_to_i16_date("2013-07-31")),
                 ClickBenchPredicate::eq_literal::<Int16Type>(2, 0),
-                // TODO implement IN predicate
-                ClickBenchPredicate::eq_literal::<Int16Type>(3, -1), // IN -1, 6
+                ClickBenchPredicate::in_list::<Int16Type>(3, (-1, 6)), // IN -1, 6
                 ClickBenchPredicate::eq_literal::<Int64Type>(4, 3594120000172545465),
             ],
             expected_row_count: 24793,
@@ -512,6 +509,23 @@ impl ClickBenchPredicate {
         })
     }
 
+    /// Create predicate: `col IN (lit1, lit2)`, computed as `col = lit1 OR col = lit2`
+    fn in_list<T: ArrowPrimitiveType>(
+        column_index: usize,
+        literal_values: (T::Native, T::Native),
+    ) -> Self {
+        Self::new(column_index, move || {
+            let literal_1 = PrimitiveArray::<T>::new_scalar(literal_values.0);
+            let literal_2 = PrimitiveArray::<T>::new_scalar(literal_values.1);
+            Box::new(move |col| {
+                // compare the column against each literal, then OR the two boolean masks
+                let match1 = eq(&col, &literal_1)?;
+                let match2 = eq(&col, &literal_2)?;
+                or(&match1, &match2)
+            })
+        })
+    }
+
     /// Create predicate: col != ''
     fn neq_literal<T: ArrowPrimitiveType>(column_index: usize, literal_value: T::Native) -> Self {
         Self::new(column_index, move || {
@@ -660,19 +674,33 @@ impl FilterIndices {
 
 /// Encapsulates the test parameters for a single benchmark
 struct ReadTest {
+    /// Human-readable name of the query, reported when the row count check fails
+    name: &'static str,
+    /// Arrow reader metadata loaded from the Parquet file in `ReadTest::new`
     arrow_reader_metadata: ArrowReaderMetadata,
-    // TODO keep only fields needed (inline Query field)
-    query: Query,
     /// Which columns in the file should be projected (decoded after filter)
     projection_mask: ProjectionMask,
     /// Which columns in the file should be passed to the filter.
     filter_mask: ProjectionMask,
     /// Mapping from column selected in filter mask to Query::filter_columns
     filter_indices: FilterIndices,
+    /// Predicates to apply (converted to `ArrowPredicateFn` for the `RowFilter` API)
+    predicates: Vec<ClickBenchPredicate>,
+    /// How many rows are expected to pass the predicate. This serves
+    /// as a sanity check that the benchmark is working correctly.
+    expected_row_count: usize,
 }
 
 impl ReadTest {
     fn new(query: Query) -> Self {
+        let Query {
+            name,
+            filter_columns,
+            projection_columns,
+            predicates,
+            expected_row_count,
+        } = query;
+
         let arrow_reader_metadata = load_metadata(hits_1());
         let schema_descr = arrow_reader_metadata
             .metadata()
@@ -685,27 +713,27 @@ impl ReadTest {
         // Determine the correct selection ("ProjectionMask")
         //ProjectionMask::columns(schema, projection_columns)
 
-        let projection_columns = &query.projection_columns;
         let projection_mask = if projection_columns.iter().any(|&name| name == "*") {
             // * means all columns
             ProjectionMask::all()
         } else {
-            let projection_schema_indices = column_indices(schema_descr, &query.projection_columns);
+            let projection_schema_indices = column_indices(schema_descr, &projection_columns);
             ProjectionMask::leaves(schema_descr, projection_schema_indices)
         };
 
-        let filter_columns = &query.filter_columns;
-        let filter_schema_indices = column_indices(schema_descr, filter_columns);
+        let filter_schema_indices = column_indices(schema_descr, &filter_columns);
         let filter_mask =
             ProjectionMask::leaves(schema_descr, filter_schema_indices.iter().cloned());
         let filter_indices = FilterIndices::new(schema_descr, filter_schema_indices);
 
         Self {
+            name,
             arrow_reader_metadata,
-            query,
             projection_mask,
             filter_mask,
             filter_indices,
+            predicates,
+            expected_row_count,
         }
     }
 
@@ -774,7 +802,6 @@ impl ReadTest {
         //let run_predicate_fns = (self.query.predicate)();
         // Convert the predicates to ArrowPredicateFn to conform to the RowFilter API
         let arrow_predicates: Vec<_> = self
-            .query
             .predicates
             .iter()
             .map(|pred| {
@@ -795,11 +822,11 @@ impl ReadTest {
     }
 
     fn check_row_count(&self, row_count: usize) {
-        let expected_row_count = self.query.expected_row_count;
+        let expected_row_count = self.expected_row_count;
         assert_eq!(
             row_count, expected_row_count,
             "Expected {} rows, but got {} in {}",
-            expected_row_count, row_count, self.query,
+            expected_row_count, row_count, self.name,
         );
     }
 }