Skip to content

Commit 9ae41b1

Browse files
authored
Update tpch, clickbench, sort_tpch to mark failed queries (#16182)
* Move struct QueryResult to util/run.rs
* Modify benches to continue query execution even on failure
* Mark benchmark query success in output json
1 parent bf7859e commit 9ae41b1

File tree

8 files changed

+132
-59
lines changed

8 files changed

+132
-59
lines changed

benchmarks/compare.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,15 @@ class QueryRun:
4747
query: int
4848
iterations: List[QueryResult]
4949
start_time: int
50+
success: bool = True
5051

5152
@classmethod
5253
def load_from(cls, data: Dict[str, Any]) -> QueryRun:
5354
return cls(
5455
query=data["query"],
5556
iterations=[QueryResult(**iteration) for iteration in data["iterations"]],
5657
start_time=data["start_time"],
58+
success=data["success"],
5759
)
5860

5961
@property
@@ -125,11 +127,26 @@ def compare(
125127
faster_count = 0
126128
slower_count = 0
127129
no_change_count = 0
130+
failure_count = 0
128131
total_baseline_time = 0
129132
total_comparison_time = 0
130133

131134
for baseline_result, comparison_result in zip(baseline.queries, comparison.queries):
132135
assert baseline_result.query == comparison_result.query
136+
137+
base_failed = not baseline_result.success
138+
comp_failed = not comparison_result.success
139+
# If a query fails, its execution time is excluded from the performance comparison
140+
if base_failed or comp_failed:
141+
change_text = "incomparable"
142+
failure_count += 1
143+
table.add_row(
144+
f"Q{baseline_result.query}",
145+
"FAIL" if base_failed else f"{baseline_result.execution_time:.2f}ms",
146+
"FAIL" if comp_failed else f"{comparison_result.execution_time:.2f}ms",
147+
change_text,
148+
)
149+
continue
133150

134151
total_baseline_time += baseline_result.execution_time
135152
total_comparison_time += comparison_result.execution_time
@@ -156,8 +173,12 @@ def compare(
156173
console.print(table)
157174

158175
# Calculate averages
159-
avg_baseline_time = total_baseline_time / len(baseline.queries)
160-
avg_comparison_time = total_comparison_time / len(comparison.queries)
176+
avg_baseline_time = 0.0
177+
avg_comparison_time = 0.0
178+
if len(baseline.queries) - failure_count > 0:
179+
avg_baseline_time = total_baseline_time / (len(baseline.queries) - failure_count)
180+
if len(comparison.queries) - failure_count > 0:
181+
avg_comparison_time = total_comparison_time / (len(comparison.queries) - failure_count)
161182

162183
# Summary table
163184
summary_table = Table(show_header=True, header_style="bold magenta")
@@ -171,6 +192,7 @@ def compare(
171192
summary_table.add_row("Queries Faster", str(faster_count))
172193
summary_table.add_row("Queries Slower", str(slower_count))
173194
summary_table.add_row("Queries with No Change", str(no_change_count))
195+
summary_table.add_row("Queries with Failure", str(failure_count))
174196

175197
console.print(summary_table)
176198

benchmarks/src/bin/external_aggr.rs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ use datafusion::execution::SessionStateBuilder;
4040
use datafusion::physical_plan::display::DisplayableExecutionPlan;
4141
use datafusion::physical_plan::{collect, displayable};
4242
use datafusion::prelude::*;
43-
use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt};
43+
use datafusion_benchmarks::util::{BenchmarkRun, CommonOpt, QueryResult};
4444
use datafusion_common::instant::Instant;
4545
use datafusion_common::utils::get_available_parallelism;
4646
use datafusion_common::{exec_err, DEFAULT_PARQUET_EXTENSION};
@@ -77,11 +77,6 @@ struct ExternalAggrConfig {
7777
output_path: Option<PathBuf>,
7878
}
7979

80-
struct QueryResult {
81-
elapsed: std::time::Duration,
82-
row_count: usize,
83-
}
84-
8580
/// Query Memory Limits
8681
/// Map query id to predefined memory limits
8782
///

benchmarks/src/clickbench.rs

Lines changed: 48 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
use std::path::Path;
1919
use std::path::PathBuf;
2020

21-
use crate::util::{BenchmarkRun, CommonOpt};
21+
use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
2222
use datafusion::{
2323
error::{DataFusionError, Result},
2424
prelude::SessionContext,
@@ -128,36 +128,58 @@ impl RunOpt {
128128
let ctx = SessionContext::new_with_config_rt(config, rt_builder.build_arc()?);
129129
self.register_hits(&ctx).await?;
130130

131-
let iterations = self.common.iterations;
132131
let mut benchmark_run = BenchmarkRun::new();
133132
for query_id in query_range {
134-
let mut millis = Vec::with_capacity(iterations);
135133
benchmark_run.start_new_case(&format!("Query {query_id}"));
136-
let sql = queries.get_query(query_id)?;
137-
println!("Q{query_id}: {sql}");
138-
139-
for i in 0..iterations {
140-
let start = Instant::now();
141-
let results = ctx.sql(sql).await?.collect().await?;
142-
let elapsed = start.elapsed();
143-
let ms = elapsed.as_secs_f64() * 1000.0;
144-
millis.push(ms);
145-
let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
146-
println!(
147-
"Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
148-
);
149-
benchmark_run.write_iter(elapsed, row_count);
134+
let query_run = self.benchmark_query(&queries, query_id, &ctx).await;
135+
match query_run {
136+
Ok(query_results) => {
137+
for iter in query_results {
138+
benchmark_run.write_iter(iter.elapsed, iter.row_count);
139+
}
140+
}
141+
Err(e) => {
142+
benchmark_run.mark_failed();
143+
eprintln!("Query {query_id} failed: {e}");
144+
}
150145
}
151-
if self.common.debug {
152-
ctx.sql(sql).await?.explain(false, false)?.show().await?;
153-
}
154-
let avg = millis.iter().sum::<f64>() / millis.len() as f64;
155-
println!("Query {query_id} avg time: {avg:.2} ms");
156146
}
157147
benchmark_run.maybe_write_json(self.output_path.as_ref())?;
148+
benchmark_run.maybe_print_failures();
158149
Ok(())
159150
}
160151

152+
async fn benchmark_query(
153+
&self,
154+
queries: &AllQueries,
155+
query_id: usize,
156+
ctx: &SessionContext,
157+
) -> Result<Vec<QueryResult>> {
158+
let sql = queries.get_query(query_id)?;
159+
println!("Q{query_id}: {sql}");
160+
161+
let mut millis = Vec::with_capacity(self.iterations());
162+
let mut query_results = vec![];
163+
for i in 0..self.iterations() {
164+
let start = Instant::now();
165+
let results = ctx.sql(sql).await?.collect().await?;
166+
let elapsed = start.elapsed();
167+
let ms = elapsed.as_secs_f64() * 1000.0;
168+
millis.push(ms);
169+
let row_count: usize = results.iter().map(|b| b.num_rows()).sum();
170+
println!(
171+
"Query {query_id} iteration {i} took {ms:.1} ms and returned {row_count} rows"
172+
);
173+
query_results.push(QueryResult { elapsed, row_count })
174+
}
175+
if self.common.debug {
176+
ctx.sql(sql).await?.explain(false, false)?.show().await?;
177+
}
178+
let avg = millis.iter().sum::<f64>() / millis.len() as f64;
179+
println!("Query {query_id} avg time: {avg:.2} ms");
180+
Ok(query_results)
181+
}
182+
161183
/// Registers the `hits.parquet` as a table named `hits`
162184
async fn register_hits(&self, ctx: &SessionContext) -> Result<()> {
163185
let options = Default::default();
@@ -171,4 +193,8 @@ impl RunOpt {
171193
)
172194
})
173195
}
196+
197+
fn iterations(&self) -> usize {
198+
self.common.iterations
199+
}
174200
}

benchmarks/src/imdb/run.rs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use std::path::PathBuf;
1919
use std::sync::Arc;
2020

2121
use super::{get_imdb_table_schema, get_query_sql, IMDB_TABLES};
22-
use crate::util::{BenchmarkRun, CommonOpt};
22+
use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
2323

2424
use arrow::record_batch::RecordBatch;
2525
use arrow::util::pretty::{self, pretty_format_batches};
@@ -475,11 +475,6 @@ impl RunOpt {
475475
}
476476
}
477477

478-
struct QueryResult {
479-
elapsed: std::time::Duration,
480-
row_count: usize,
481-
}
482-
483478
#[cfg(test)]
484479
// Only run with "ci" mode when we have the data
485480
#[cfg(feature = "ci")]

benchmarks/src/sort_tpch.rs

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ use datafusion_common::instant::Instant;
4040
use datafusion_common::utils::get_available_parallelism;
4141
use datafusion_common::DEFAULT_PARQUET_EXTENSION;
4242

43-
use crate::util::{BenchmarkRun, CommonOpt};
43+
use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
4444

4545
#[derive(Debug, StructOpt)]
4646
pub struct RunOpt {
@@ -74,11 +74,6 @@ pub struct RunOpt {
7474
limit: Option<usize>,
7575
}
7676

77-
struct QueryResult {
78-
elapsed: std::time::Duration,
79-
row_count: usize,
80-
}
81-
8277
impl RunOpt {
8378
const SORT_TABLES: [&'static str; 1] = ["lineitem"];
8479

@@ -179,7 +174,7 @@ impl RunOpt {
179174
/// If query is specified from command line, run only that query.
180175
/// Otherwise, run all queries.
181176
pub async fn run(&self) -> Result<()> {
182-
let mut benchmark_run = BenchmarkRun::new();
177+
let mut benchmark_run: BenchmarkRun = BenchmarkRun::new();
183178

184179
let query_range = match self.query {
185180
Some(query_id) => query_id..=query_id,
@@ -189,14 +184,22 @@ impl RunOpt {
189184
for query_id in query_range {
190185
benchmark_run.start_new_case(&format!("{query_id}"));
191186

192-
let query_results = self.benchmark_query(query_id).await?;
193-
for iter in query_results {
194-
benchmark_run.write_iter(iter.elapsed, iter.row_count);
187+
let query_results = self.benchmark_query(query_id).await;
188+
match query_results {
189+
Ok(query_results) => {
190+
for iter in query_results {
191+
benchmark_run.write_iter(iter.elapsed, iter.row_count);
192+
}
193+
}
194+
Err(e) => {
195+
benchmark_run.mark_failed();
196+
eprintln!("Query {query_id} failed: {e}");
197+
}
195198
}
196199
}
197200

198201
benchmark_run.maybe_write_json(self.output_path.as_ref())?;
199-
202+
benchmark_run.maybe_print_failures();
200203
Ok(())
201204
}
202205

@@ -294,7 +297,7 @@ impl RunOpt {
294297

295298
let mut stream = execute_stream(physical_plan.clone(), state.task_ctx())?;
296299
while let Some(batch) = stream.next().await {
297-
row_count += batch.unwrap().num_rows();
300+
row_count += batch?.num_rows();
298301
}
299302

300303
if debug {

benchmarks/src/tpch/run.rs

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::sync::Arc;
2121
use super::{
2222
get_query_sql, get_tbl_tpch_table_schema, get_tpch_table_schema, TPCH_TABLES,
2323
};
24-
use crate::util::{BenchmarkRun, CommonOpt};
24+
use crate::util::{BenchmarkRun, CommonOpt, QueryResult};
2525

2626
use arrow::record_batch::RecordBatch;
2727
use arrow::util::pretty::{self, pretty_format_batches};
@@ -121,12 +121,21 @@ impl RunOpt {
121121

122122
for query_id in query_range {
123123
benchmark_run.start_new_case(&format!("Query {query_id}"));
124-
let query_run = self.benchmark_query(query_id, &ctx).await?;
125-
for iter in query_run {
126-
benchmark_run.write_iter(iter.elapsed, iter.row_count);
124+
let query_run = self.benchmark_query(query_id, &ctx).await;
125+
match query_run {
126+
Ok(query_results) => {
127+
for iter in query_results {
128+
benchmark_run.write_iter(iter.elapsed, iter.row_count);
129+
}
130+
}
131+
Err(e) => {
132+
benchmark_run.mark_failed();
133+
eprintln!("Query {query_id} failed: {e}");
134+
}
127135
}
128136
}
129137
benchmark_run.maybe_write_json(self.output_path.as_ref())?;
138+
benchmark_run.maybe_print_failures();
130139
Ok(())
131140
}
132141

@@ -320,11 +329,6 @@ impl RunOpt {
320329
}
321330
}
322331

323-
struct QueryResult {
324-
elapsed: std::time::Duration,
325-
row_count: usize,
326-
}
327-
328332
#[cfg(test)]
329333
// Only run with "ci" mode when we have the data
330334
#[cfg(feature = "ci")]

benchmarks/src/util/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,4 +22,4 @@ mod run;
2222

2323
pub use access_log::AccessLogOpt;
2424
pub use options::CommonOpt;
25-
pub use run::{BenchQuery, BenchmarkRun};
25+
pub use run::{BenchQuery, BenchmarkRun, QueryResult};

benchmarks/src/util/run.rs

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,8 +90,13 @@ pub struct BenchQuery {
9090
iterations: Vec<QueryIter>,
9191
#[serde(serialize_with = "serialize_start_time")]
9292
start_time: SystemTime,
93+
success: bool,
94+
}
95+
/// Internal representation of a single benchmark query iteration result.
96+
pub struct QueryResult {
97+
pub elapsed: Duration,
98+
pub row_count: usize,
9399
}
94-
95100
/// collects benchmark run data and then serializes it at the end
96101
pub struct BenchmarkRun {
97102
context: RunContext,
@@ -120,6 +125,7 @@ impl BenchmarkRun {
120125
query: id.to_owned(),
121126
iterations: vec![],
122127
start_time: SystemTime::now(),
128+
success: true,
123129
});
124130
if let Some(c) = self.current_case.as_mut() {
125131
*c += 1;
@@ -138,6 +144,28 @@ impl BenchmarkRun {
138144
}
139145
}
140146

147+
/// Print the names of failed queries, if any
148+
pub fn maybe_print_failures(&self) {
149+
let failed_queries: Vec<&str> = self
150+
.queries
151+
.iter()
152+
.filter_map(|q| (!q.success).then_some(q.query.as_str()))
153+
.collect();
154+
155+
if !failed_queries.is_empty() {
156+
println!("Failed Queries: {}", failed_queries.join(", "));
157+
}
158+
}
159+
160+
/// Mark the current query (the current case) as failed
161+
pub fn mark_failed(&mut self) {
162+
if let Some(idx) = self.current_case {
163+
self.queries[idx].success = false;
164+
} else {
165+
unreachable!("Cannot mark failure: no current case");
166+
}
167+
}
168+
141169
/// Stringify data into formatted json
142170
pub fn to_json(&self) -> String {
143171
let mut output = HashMap::<&str, Value>::new();

0 commit comments

Comments
 (0)