@@ -22,8 +22,9 @@ use arrow::array::{BooleanArray, Int32Array, Int8Array};
 use arrow::record_batch::RecordBatch;
 
 use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit};
+use datafusion::common::stats::Precision;
 use datafusion::common::tree_node::{Transformed, TreeNode};
-use datafusion::common::DFSchema;
+use datafusion::common::{ColumnStatistics, DFSchema};
 use datafusion::common::{ScalarValue, ToDFSchema};
 use datafusion::error::Result;
 use datafusion::functions_aggregate::first_last::first_value_udaf;
@@ -80,6 +81,9 @@ async fn main() -> Result<()> {
     // See how to analyze ranges in expressions
     range_analysis_demo()?;
 
+    // See how to analyze boundaries in different kinds of expressions.
+    boundary_analysis_and_selectivity_demo()?;
+
     // See how to determine the data types of expressions
     expression_type_demo()?;
 
@@ -275,6 +279,74 @@ fn range_analysis_demo() -> Result<()> {
     Ok(())
 }
 
+// DataFusion's analysis can infer boundary statistics and selectivity in
+// various situations which can be helpful in building more efficient
+// query plans.
+fn boundary_analysis_and_selectivity_demo() -> Result<()> {
+    // Consider the example where we want all rows with an `id` greater than
+    // 5000.
+    let id_greater_5000 = col("id").gt_eq(lit(5000i64));
+
+    // As in most examples we must tell DataFusion the type of the column.
+    let schema = Arc::new(Schema::new(vec![make_field("id", DataType::Int64)]));
+
+    // DataFusion is able to do cardinality estimation on various column types.
+    // These estimates, represented by the `ColumnStatistics` type, describe
+    // properties such as the maximum and minimum value, the number of distinct
+    // values and the number of null values.
+    let column_stats = ColumnStatistics {
+        null_count: Precision::Exact(0),
+        max_value: Precision::Exact(ScalarValue::Int64(Some(10000))),
+        min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
+        sum_value: Precision::Absent,
+        distinct_count: Precision::Absent,
+    };
+
+    // We can then build our expression boundaries from the column statistics,
+    // allowing the analysis to be more precise.
+    let initial_boundaries =
+        vec![ExprBoundaries::try_from_column(&schema, &column_stats, 0)?];
+
+    // With the above we can perform the boundary analysis similar to the previous
+    // example.
+    let df_schema = DFSchema::try_from(schema.clone())?;
+
+    // Analysis case: id >= 5000
+    let physical_expr1 =
+        SessionContext::new().create_physical_expr(id_greater_5000, &df_schema)?;
+    let analysis = analyze(
+        &physical_expr1,
+        AnalysisContext::new(initial_boundaries.clone()),
+        df_schema.as_ref(),
+    )?;
+
+    // The analysis will return better bounds thanks to the column statistics.
+    assert_eq!(
+        analysis.boundaries.first().map(|boundary| boundary
+            .interval
+            .clone()
+            .unwrap()
+            .into_bounds()),
+        Some((
+            ScalarValue::Int64(Some(5000)),
+            ScalarValue::Int64(Some(10000))
+        ))
+    );
+
+    // We can also infer selectivity from the column statistics by assuming
+    // that the column is uniformly distributed and using the following
+    // estimation formula.
+    // Assuming the original range is [a, b] and the new range is [a', b']:
+    //
+    // (b' - a' + 1) / (b - a)
+    // (10000 - 5000 + 1) / (10000 - 1)
+    assert!(analysis
+        .selectivity
+        .is_some_and(|selectivity| (0.5..=0.6).contains(&selectivity)));
+
+    Ok(())
+}
+
 
 fn make_field(name: &str, data_type: DataType) -> Field {
     let nullable = false;
     Field::new(name, data_type, nullable)
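
As a side note (not part of the patch), the selectivity assertion above follows directly from the uniform-distribution estimate described in the comments. Below is a minimal standalone sketch of that arithmetic; `uniform_selectivity` is a hypothetical helper for illustration only, not a DataFusion API.

```rust
// Illustrative sketch: reproduces the uniform-distribution selectivity
// estimate used in the example's comments.
fn uniform_selectivity(original: (i64, i64), filtered: (i64, i64)) -> f64 {
    let (a, b) = original;
    let (a2, b2) = filtered;
    // Fraction of the original value range covered by the filtered range,
    // assuming values are uniformly distributed over [a, b].
    (b2 - a2 + 1) as f64 / (b - a) as f64
}

fn main() {
    // `id` is known to lie in [1, 10000]; the predicate `id >= 5000`
    // narrows the range to [5000, 10000].
    let selectivity = uniform_selectivity((1, 10_000), (5_000, 10_000));
    // (10000 - 5000 + 1) / (10000 - 1) ≈ 0.50015, matching the assertion
    // in `boundary_analysis_and_selectivity_demo`.
    assert!((0.5..=0.6).contains(&selectivity));
    println!("estimated selectivity = {selectivity:.5}");
}
```

The uniformity assumption keeps the estimate cheap to compute from min/max statistics alone, at the cost of accuracy on skewed data.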