@@ -31,9 +31,7 @@ use arrow::{
31
31
buffer:: Buffer ,
32
32
datatypes:: { ArrowNativeType , DataType , Field , Schema , SchemaRef , UInt16Type } ,
33
33
} ;
34
- use datafusion_common:: {
35
- exec_err, stats:: Precision , ColumnStatistics , Constraints , Result , Statistics ,
36
- } ;
34
+ use datafusion_common:: { exec_err, ColumnStatistics , Constraints , Result , Statistics } ;
37
35
use datafusion_common:: { DataFusionError , ScalarValue } ;
38
36
use datafusion_execution:: {
39
37
object_store:: ObjectStoreUrl , SendableRecordBatchStream , TaskContext ,
@@ -86,20 +84,22 @@ use crate::{
86
84
/// # Field::new("c4", DataType::Int32, false),
87
85
/// # ]));
88
86
/// # // Note: crate mock ParquetSource, as ParquetSource is not in the datasource crate
89
- /// # struct ParquetSource {};
87
+ /// # struct ParquetSource {
88
+ /// # projected_statistics: Option<Statistics>
89
+ /// # };
90
90
/// # impl FileSource for ParquetSource {
91
91
/// # fn create_file_opener(&self, _: Arc<dyn ObjectStore>, _: &FileScanConfig, _: usize) -> Arc<dyn FileOpener> { unimplemented!() }
92
92
/// # fn as_any(&self) -> &dyn Any { self }
93
93
/// # fn with_batch_size(&self, _: usize) -> Arc<dyn FileSource> { unimplemented!() }
94
94
/// # fn with_schema(&self, _: SchemaRef) -> Arc<dyn FileSource> { unimplemented!() }
95
95
/// # fn with_projection(&self, _: &FileScanConfig) -> Arc<dyn FileSource> { unimplemented!() }
96
- /// # fn with_statistics(&self, _ : Statistics) -> Arc<dyn FileSource> { Arc::new(Self::new() ) }
96
+ /// # fn with_statistics(&self, statistics : Statistics) -> Arc<dyn FileSource> { Arc::new(Self {projected_statistics: Some(statistics)} ) }
97
97
/// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() }
98
- /// # fn statistics(&self) -> datafusion_common::Result<Statistics> { unimplemented!( ) }
98
+ /// # fn statistics(&self) -> datafusion_common::Result<Statistics> { Ok(self.projected_statistics.clone().expect("projected_statistics should be set") ) }
99
99
/// # fn file_type(&self) -> &str { "parquet" }
100
100
/// # }
101
101
/// # impl ParquetSource {
102
- /// # fn new() -> Self { Self{ } }
102
+ /// # fn new() -> Self { Self {projected_statistics: None } }
103
103
/// # }
104
104
/// // create FileScan config for reading parquet files from file://
105
105
/// let object_store_url = ObjectStoreUrl::local_filesystem();
@@ -244,7 +244,7 @@ impl DataSource for FileScanConfig {
244
244
}
245
245
246
246
/// Returns the statistics of this scan as computed by `projected_stats`.
///
/// This call itself never produces an error; the value is always wrapped in `Ok`.
fn statistics(&self) -> Result<Statistics> {
    let projected = self.projected_stats();
    Ok(projected)
}
249
249
250
250
fn with_fetch ( & self , limit : Option < usize > ) -> Option < Arc < dyn DataSource > > {
@@ -324,13 +324,7 @@ impl FileScanConfig {
324
324
325
325
/// Set the file source
326
326
pub fn with_source ( mut self , file_source : Arc < dyn FileSource > ) -> Self {
327
- let (
328
- _projected_schema,
329
- _constraints,
330
- projected_statistics,
331
- _projected_output_ordering,
332
- ) = self . project ( ) ;
333
- self . file_source = file_source. with_statistics ( projected_statistics) ;
327
+ self . file_source = file_source. with_statistics ( self . statistics . clone ( ) ) ;
334
328
self
335
329
}
336
330
@@ -342,10 +336,75 @@ impl FileScanConfig {
342
336
343
337
/// Set the statistics of the files
344
338
pub fn with_statistics ( mut self , statistics : Statistics ) -> Self {
345
- self . statistics = statistics;
339
+ self . statistics = statistics. clone ( ) ;
340
+ self . file_source = self . file_source . with_statistics ( statistics) ;
346
341
self
347
342
}
348
343
344
+ fn projection_indices ( & self ) -> Vec < usize > {
345
+ match & self . projection {
346
+ Some ( proj) => proj. clone ( ) ,
347
+ None => ( 0 ..self . file_schema . fields ( ) . len ( )
348
+ + self . table_partition_cols . len ( ) )
349
+ . collect ( ) ,
350
+ }
351
+ }
352
+
353
+ fn projected_stats ( & self ) -> Statistics {
354
+ let statistics = self
355
+ . file_source
356
+ . statistics ( )
357
+ . unwrap_or ( self . statistics . clone ( ) ) ;
358
+
359
+ let table_cols_stats = self
360
+ . projection_indices ( )
361
+ . into_iter ( )
362
+ . map ( |idx| {
363
+ if idx < self . file_schema . fields ( ) . len ( ) {
364
+ statistics. column_statistics [ idx] . clone ( )
365
+ } else {
366
+ // TODO provide accurate stat for partition column (#1186)
367
+ ColumnStatistics :: new_unknown ( )
368
+ }
369
+ } )
370
+ . collect ( ) ;
371
+
372
+ Statistics {
373
+ num_rows : statistics. num_rows ,
374
+ // TODO correct byte size: https://github.com/apache/datafusion/issues/14936
375
+ total_byte_size : statistics. total_byte_size ,
376
+ column_statistics : table_cols_stats,
377
+ }
378
+ }
379
+
380
+ fn projected_schema ( & self ) -> Arc < Schema > {
381
+ let table_fields: Vec < _ > = self
382
+ . projection_indices ( )
383
+ . into_iter ( )
384
+ . map ( |idx| {
385
+ if idx < self . file_schema . fields ( ) . len ( ) {
386
+ self . file_schema . field ( idx) . clone ( )
387
+ } else {
388
+ let partition_idx = idx - self . file_schema . fields ( ) . len ( ) ;
389
+ self . table_partition_cols [ partition_idx] . clone ( )
390
+ }
391
+ } )
392
+ . collect ( ) ;
393
+
394
+ Arc :: new ( Schema :: new_with_metadata (
395
+ table_fields,
396
+ self . file_schema . metadata ( ) . clone ( ) ,
397
+ ) )
398
+ }
399
+
400
+ fn projected_constraints ( & self ) -> Constraints {
401
+ let indexes = self . projection_indices ( ) ;
402
+
403
+ self . constraints
404
+ . project ( & indexes)
405
+ . unwrap_or_else ( Constraints :: empty)
406
+ }
407
+
349
408
/// Set the projection of the files
350
409
pub fn with_projection ( mut self , projection : Option < Vec < usize > > ) -> Self {
351
410
self . projection = projection;
@@ -433,54 +492,13 @@ impl FileScanConfig {
433
492
) ;
434
493
}
435
494
436
- let proj_indices = if let Some ( proj) = & self . projection {
437
- proj
438
- } else {
439
- let len = self . file_schema . fields ( ) . len ( ) + self . table_partition_cols . len ( ) ;
440
- & ( 0 ..len) . collect :: < Vec < _ > > ( )
441
- } ;
442
-
443
- let mut table_fields = vec ! [ ] ;
444
- let mut table_cols_stats = vec ! [ ] ;
445
- for idx in proj_indices {
446
- if * idx < self . file_schema . fields ( ) . len ( ) {
447
- let field = self . file_schema . field ( * idx) ;
448
- table_fields. push ( field. clone ( ) ) ;
449
- table_cols_stats. push ( self . statistics . column_statistics [ * idx] . clone ( ) )
450
- } else {
451
- let partition_idx = idx - self . file_schema . fields ( ) . len ( ) ;
452
- table_fields. push ( self . table_partition_cols [ partition_idx] . to_owned ( ) ) ;
453
- // TODO provide accurate stat for partition column (#1186)
454
- table_cols_stats. push ( ColumnStatistics :: new_unknown ( ) )
455
- }
456
- }
457
-
458
- let table_stats = Statistics {
459
- num_rows : self . statistics . num_rows ,
460
- // TODO correct byte size?
461
- total_byte_size : Precision :: Absent ,
462
- column_statistics : table_cols_stats,
463
- } ;
464
-
465
- let projected_schema = Arc :: new ( Schema :: new_with_metadata (
466
- table_fields,
467
- self . file_schema . metadata ( ) . clone ( ) ,
468
- ) ) ;
495
+ let schema = self . projected_schema ( ) ;
496
+ let constraints = self . projected_constraints ( ) ;
497
+ let stats = self . projected_stats ( ) ;
469
498
470
- let projected_constraints = self
471
- . constraints
472
- . project ( proj_indices)
473
- . unwrap_or_else ( Constraints :: empty) ;
499
+ let output_ordering = get_projected_output_ordering ( self , & schema) ;
474
500
475
- let projected_output_ordering =
476
- get_projected_output_ordering ( self , & projected_schema) ;
477
-
478
- (
479
- projected_schema,
480
- projected_constraints,
481
- table_stats,
482
- projected_output_ordering,
483
- )
501
+ ( schema, constraints, stats, output_ordering)
484
502
}
485
503
486
504
#[ cfg_attr( not( feature = "avro" ) , allow( unused) ) ] // Only used by avro
@@ -1048,6 +1066,7 @@ mod tests {
1048
1066
compute:: SortOptions ,
1049
1067
} ;
1050
1068
1069
+ use datafusion_common:: stats:: Precision ;
1051
1070
use datafusion_common:: { assert_batches_eq, DFSchema } ;
1052
1071
use datafusion_expr:: { execution_props:: ExecutionProps , SortExpr } ;
1053
1072
use datafusion_physical_expr:: create_physical_expr;
@@ -1203,6 +1222,12 @@ mod tests {
1203
1222
) ,
1204
1223
] ;
1205
1224
// create a projected schema
1225
+ let statistics = Statistics {
1226
+ num_rows : Precision :: Inexact ( 3 ) ,
1227
+ total_byte_size : Precision :: Absent ,
1228
+ column_statistics : Statistics :: unknown_column ( & file_batch. schema ( ) ) ,
1229
+ } ;
1230
+
1206
1231
let conf = config_for_projection (
1207
1232
file_batch. schema ( ) ,
1208
1233
// keep all cols from file and 2 from partitioning
@@ -1213,9 +1238,23 @@ mod tests {
1213
1238
file_batch. schema( ) . fields( ) . len( ) ,
1214
1239
file_batch. schema( ) . fields( ) . len( ) + 2 ,
1215
1240
] ) ,
1216
- Statistics :: new_unknown ( & file_batch . schema ( ) ) ,
1241
+ statistics . clone ( ) ,
1217
1242
to_partition_cols ( partition_cols. clone ( ) ) ,
1218
1243
) ;
1244
+
1245
+ let source_statistics = conf. file_source . statistics ( ) . unwrap ( ) ;
1246
+ let conf_stats = conf. statistics ( ) . unwrap ( ) ;
1247
+
1248
+ // projection should be reflected in the config's statistics
1249
+ assert_eq ! ( conf_stats. num_rows, Precision :: Inexact ( 3 ) ) ;
1250
+
1251
+ // 3 original statistics + 2 partition statistics
1252
+ assert_eq ! ( conf_stats. column_statistics. len( ) , 5 ) ;
1253
+
1254
+ // file source statistics should not be modified
1255
+ assert_eq ! ( source_statistics, statistics) ;
1256
+ assert_eq ! ( source_statistics. column_statistics. len( ) , 3 ) ;
1257
+
1219
1258
let ( proj_schema, ..) = conf. project ( ) ;
1220
1259
// created a projector for that projected schema
1221
1260
let mut proj = PartitionColumnProjector :: new (
0 commit comments