Skip to content

Commit 077c595

Browse files
Dandandan and claude committed
Use generic evaluate_bounds for statistics propagation instead of CAST-specific code
Replace the CAST-specific statistics propagation with a generic approach using PhysicalExpr::evaluate_bounds(). This works for any expression that implements evaluate_bounds — including CAST, negation, and arithmetic with literals. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 38736c2 commit 077c595

File tree

1 file changed

+67
-34
lines changed

1 file changed

+67
-34
lines changed

datafusion/physical-expr/src/projection.rs

Lines changed: 67 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use std::ops::Deref;
2121
use std::sync::Arc;
2222

2323
use crate::PhysicalExpr;
24-
use crate::expressions::{CastExpr, Column, Literal};
24+
use crate::expressions::{Column, Literal};
2525
use crate::scalar_function::ScalarFunctionExpr;
2626
use crate::utils::collect_columns;
2727

@@ -33,6 +33,7 @@ use datafusion_common::{
3333
Result, ScalarValue, Statistics, assert_or_internal_err, internal_datafusion_err,
3434
plan_err,
3535
};
36+
use datafusion_expr_common::interval_arithmetic::Interval;
3637

3738
use datafusion_physical_expr_common::metrics::ExecutionPlanMetricsSet;
3839
use datafusion_physical_expr_common::metrics::ExpressionEvaluatorMetrics;
@@ -715,10 +716,11 @@ impl ProjectionExprs {
715716
}
716717
}
717718
} else {
718-
// Try to propagate statistics through expressions like CAST
719+
// Propagate statistics through expressions (CAST, arithmetic, etc.)
720+
// using the interval arithmetic system (evaluate_bounds).
719721
project_column_statistics_through_expr(
720722
expr.as_ref(),
721-
&mut stats.column_statistics,
723+
&stats.column_statistics,
722724
)
723725
};
724726
column_statistics.push(col_stats);
@@ -729,43 +731,70 @@ impl ProjectionExprs {
729731
}
730732
}
731733

732-
/// Propagate column statistics through expressions that preserve order.
733-
///
734-
/// Currently handles:
735-
/// - `Column` references (direct passthrough)
736-
/// - `CAST` expressions (casts min/max values to the target type)
737-
///
738-
/// For other expressions, returns unknown statistics.
734+
/// Propagate min/max statistics through an expression using
735+
/// [`PhysicalExpr::evaluate_bounds`]. Works for any expression that
736+
/// implements `evaluate_bounds` (CAST, negation, arithmetic with literals, etc.).
///
/// In the added (`+`) version, only `min_value` and `max_value` are derived:
/// they come from the lower and upper ends of the interval returned by
/// `compute_bounds_and_exactness`, wrapped via `to_precision`. `null_count`,
/// `distinct_count`, `sum_value` and `byte_size` are always set to
/// `Precision::Absent`. When no interval can be computed, the result is
/// `ColumnStatistics::new_unknown()`.
739737
fn project_column_statistics_through_expr(
740738
expr: &dyn PhysicalExpr,
741-
column_stats: &mut [ColumnStatistics],
739+
column_stats: &[ColumnStatistics],
742740
) -> ColumnStatistics {
743-
if let Some(col) = expr.as_any().downcast_ref::<Column>() {
744-
std::mem::take(&mut column_stats[col.index()])
745-
} else if let Some(cast_expr) = expr.as_any().downcast_ref::<CastExpr>() {
746-
let inner_stats =
747-
project_column_statistics_through_expr(cast_expr.expr.as_ref(), column_stats);
748-
let target_type = cast_expr.cast_type();
749-
ColumnStatistics {
750-
min_value: inner_stats
751-
.min_value
752-
.cast_to(target_type)
753-
.unwrap_or(Precision::Absent),
754-
max_value: inner_stats
755-
.max_value
756-
.cast_to(target_type)
757-
.unwrap_or(Precision::Absent),
758-
null_count: inner_stats.null_count,
759-
distinct_count: inner_stats.distinct_count,
760-
// Sum and byte size change under CAST, don't propagate
741+
match compute_bounds_and_exactness(expr, column_stats) {
742+
Some((interval, all_exact)) => ColumnStatistics {
743+
min_value: to_precision(interval.lower().clone(), all_exact),
744+
max_value: to_precision(interval.upper().clone(), all_exact),
745+
null_count: Precision::Absent,
746+
distinct_count: Precision::Absent,
761747
sum_value: Precision::Absent,
762748
byte_size: Precision::Absent,
763-
}
749+
},
750+
None => ColumnStatistics::new_unknown(),
751+
}
752+
}
753+
754+
/// Convert a bound value to the appropriate [`Precision`] level.
///
/// In the added (`+`) version: a null `value` maps to `Precision::Absent`;
/// otherwise the value is wrapped in `Precision::Exact` when `exact` is true
/// and in `Precision::Inexact` when it is false.
755+
fn to_precision(value: ScalarValue, exact: bool) -> Precision<ScalarValue> {
756+
if value.is_null() {
757+
Precision::Absent
758+
} else if exact {
759+
Precision::Exact(value)
764760
} else {
765-
ColumnStatistics::new_unknown()
761+
Precision::Inexact(value)
766762
}
767763
}
768764

765+
/// Recursively compute the output [`Interval`] and whether all leaf
766+
/// statistics are exact, in a single traversal of the expression tree.
767+
fn compute_bounds_and_exactness(
768+
expr: &dyn PhysicalExpr,
769+
column_stats: &[ColumnStatistics],
770+
) -> Option<(Interval, bool)> {
771+
if let Some(col) = expr.as_any().downcast_ref::<Column>() {
772+
let stats = &column_stats[col.index()];
773+
let min = stats.min_value.get_value()?;
774+
let max = stats.max_value.get_value()?;
775+
let exact = stats.min_value.is_exact().unwrap_or(false)
776+
&& stats.max_value.is_exact().unwrap_or(false);
777+
return Some((Interval::try_new(min.clone(), max.clone()).ok()?, exact));
778+
}
779+
780+
if let Some(lit) = expr.as_any().downcast_ref::<Literal>() {
781+
let val = lit.value();
782+
return Some((Interval::try_new(val.clone(), val.clone()).ok()?, true));
783+
}
784+
785+
let children = expr.children();
786+
let mut child_intervals = Vec::with_capacity(children.len());
787+
let mut all_exact = true;
788+
for child in &children {
789+
let (interval, exact) =
790+
compute_bounds_and_exactness(child.as_ref(), column_stats)?;
791+
child_intervals.push(interval);
792+
all_exact &= exact;
793+
}
794+
let child_refs: Vec<&Interval> = child_intervals.iter().collect();
795+
Some((expr.evaluate_bounds(&child_refs).ok()?, all_exact))
796+
}
797+
769798
impl<'a> IntoIterator for &'a ProjectionExprs {
770799
type Item = &'a ProjectionExpr;
771800
type IntoIter = std::slice::Iter<'a, ProjectionExpr>;
@@ -2824,13 +2853,17 @@ pub(crate) mod tests {
28242853
// Should have 2 column statistics
28252854
assert_eq!(output_stats.column_statistics.len(), 2);
28262855

2827-
// First column (expression) should have unknown statistics
2856+
// First column (col0 + 1) should have propagated min/max via evaluate_bounds
28282857
assert_eq!(
2829-
output_stats.column_statistics[0].distinct_count,
2830-
Precision::Absent
2858+
output_stats.column_statistics[0].min_value,
2859+
Precision::Exact(ScalarValue::Int64(Some(-3)))
28312860
);
28322861
assert_eq!(
28332862
output_stats.column_statistics[0].max_value,
2863+
Precision::Exact(ScalarValue::Int64(Some(22)))
2864+
);
2865+
assert_eq!(
2866+
output_stats.column_statistics[0].distinct_count,
28342867
Precision::Absent
28352868
);
28362869

0 commit comments

Comments
 (0)