Skip to content

Commit ee2dc83

Browse files
authored
Add an example of boundary analysis simple expressions. (apache#14688)
* feat(examples): Add an example of boundary analysis for AND/OR exprs The goal of this change is to add an example to explain data flow during boundary analysis of AND and OR expressions. * fix(examples): refine demo code for the example and cut the number of cases * fix(examples): remove left-over * fix(examples): address linting issues
1 parent ece4555 commit ee2dc83

File tree

1 file changed

+73
-1
lines changed

1 file changed

+73
-1
lines changed

datafusion-examples/examples/expr_api.rs

+73-1
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,9 @@ use arrow::array::{BooleanArray, Int32Array, Int8Array};
2222
use arrow::record_batch::RecordBatch;
2323

2424
use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
25+
use datafusion::common::stats::Precision;
2526
use datafusion::common::tree_node::{Transformed, TreeNode};
26-
use datafusion::common::DFSchema;
27+
use datafusion::common::{ColumnStatistics, DFSchema};
2728
use datafusion::common::{ScalarValue, ToDFSchema};
2829
use datafusion::error::Result;
2930
use datafusion::functions_aggregate::first_last::first_value_udaf;
@@ -80,6 +81,9 @@ async fn main() -> Result<()> {
8081
// See how to analyze ranges in expressions
8182
range_analysis_demo()?;
8283

84+
// See how to analyze boundaries in different kinds of expressions.
85+
boundary_analysis_and_selectivity_demo()?;
86+
8387
// See how to determine the data types of expressions
8488
expression_type_demo()?;
8589

@@ -275,6 +279,74 @@ fn range_analysis_demo() -> Result<()> {
275279
Ok(())
276280
}
277281

282+
// DataFusion's analysis can infer boundary statistics and selectivity in
283+
// various situations which can be helpful in building more efficient
284+
// query plans.
285+
fn boundary_analysis_and_selectivity_demo() -> Result<()> {
286+
// Consider the example where we want all rows with an `id` greater than
287+
// or equal to 5000 (note the use of `gt_eq` below).
288+
let id_greater_5000 = col("id").gt_eq(lit(5000i64));
289+
290+
// As in most examples, we must tell DataFusion the type of the column.
291+
let schema = Arc::new(Schema::new(vec![make_field("id", DataType::Int64)]));
292+
293+
// DataFusion is able to do cardinality estimation on various column types.
294+
// These estimates, represented by the `ColumnStatistics` type, describe
295+
// properties such as the maximum and minimum value, the number of distinct
296+
// values and the number of null values.
297+
let column_stats = ColumnStatistics {
298+
null_count: Precision::Exact(0),
299+
max_value: Precision::Exact(ScalarValue::Int64(Some(10000))),
300+
min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
301+
sum_value: Precision::Absent,
302+
distinct_count: Precision::Absent,
303+
};
304+
305+
// We can then build our expression boundaries from the column statistics
306+
// allowing the analysis to be more precise.
307+
let initial_boundaries =
308+
vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?];
309+
310+
// With the above we can perform the boundary analysis similar to the previous
311+
// example.
312+
let df_schema = DFSchema::try_from(schema.clone())?;
313+
314+
// Analysis case id >= 5000
315+
let physical_expr1 =
316+
SessionContext::new().create_physical_expr(id_greater_5000, &df_schema)?;
317+
let analysis = analyze(
318+
&physical_expr1,
319+
AnalysisContext::new(initial_boundaries.clone()),
320+
df_schema.as_ref(),
321+
)?;
322+
323+
// The analysis will return better bounds thanks to the column statistics.
324+
assert_eq!(
325+
analysis.boundaries.first().map(|boundary| boundary
326+
.interval
327+
.clone()
328+
.unwrap()
329+
.into_bounds()),
330+
Some((
331+
ScalarValue::Int64(Some(5000)),
332+
ScalarValue::Int64(Some(10000))
333+
))
334+
);
335+
336+
// We can also infer selectivity from the column statistics by assuming
337+
// that the column is uniformly distributed and using the following
338+
// estimation formula:
339+
// Assuming the original range is [a, b] and the new range: [a', b']
340+
//
341+
// (b' - a' + 1) / (b - a)
342+
// (10000 - 5000 + 1) / (10000 - 1)
343+
assert!(analysis
344+
.selectivity
345+
.is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity)));
346+
347+
Ok(())
348+
}
349+
278350
fn make_field(name: &str, data_type: DataType) -> Field {
279351
let nullable = false;
280352
Field::new(name, data_type, nullable)

0 commit comments

Comments
 (0)