Skip to content

Commit 2fd558f

Browse files
blaginin and alamb authored
Set projection before configuring the source (#14685)
Set projection before configuring the source (#14685)

* Set projection before configuring the source
* Refresh source manually
* Update statistics on proj / partition columns update
* Use `FileScanConfig` own `source`
* Extend test to ensure stats are different
* Unify names
* Comment `total_byte_size` in tests
* Use source stats as a base
* Return correct stats in the `ParquetSource` mock
* Revert test changes, add follow on ticket
* Revert statistics total_byte_count change
* Update test

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent cf2b7e6 commit 2fd558f

File tree

7 files changed

+177
-100
lines changed

7 files changed

+177
-100
lines changed

datafusion-testing

Submodule datafusion-testing updated 257 files

datafusion/core/src/datasource/file_format/parquet.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1934,6 +1934,7 @@ mod tests {
19341934

19351935
// test metadata
19361936
assert_eq!(exec.statistics()?.num_rows, Precision::Exact(8));
1937+
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
19371938
assert_eq!(exec.statistics()?.total_byte_size, Precision::Exact(671));
19381939

19391940
Ok(())
@@ -1976,6 +1977,7 @@ mod tests {
19761977

19771978
// note: even if the limit is set, the executor rounds up to the batch size
19781979
assert_eq!(exec.statistics()?.num_rows, Precision::Exact(8));
1980+
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
19791981
assert_eq!(exec.statistics()?.total_byte_size, Precision::Exact(671));
19801982
let batches = collect(exec, task_ctx).await?;
19811983
assert_eq!(1, batches.len());

datafusion/core/src/datasource/listing/table.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1239,6 +1239,7 @@ mod tests {
12391239

12401240
let exec = table.scan(&state, None, &[], None).await?;
12411241
assert_eq!(exec.statistics()?.num_rows, Precision::Exact(8));
1242+
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
12421243
assert_eq!(exec.statistics()?.total_byte_size, Precision::Exact(671));
12431244

12441245
Ok(())

datafusion/core/tests/parquet/file_statistics.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,8 @@ async fn load_table_stats_with_session_level_cache() {
8282
assert_eq!(exec1.statistics().unwrap().num_rows, Precision::Exact(8));
8383
assert_eq!(
8484
exec1.statistics().unwrap().total_byte_size,
85-
Precision::Exact(671)
85+
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
86+
Precision::Exact(671),
8687
);
8788
assert_eq!(get_static_cache_size(&state1), 1);
8889

@@ -93,7 +94,8 @@ async fn load_table_stats_with_session_level_cache() {
9394
assert_eq!(exec2.statistics().unwrap().num_rows, Precision::Exact(8));
9495
assert_eq!(
9596
exec2.statistics().unwrap().total_byte_size,
96-
Precision::Exact(671)
97+
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
98+
Precision::Exact(671),
9799
);
98100
assert_eq!(get_static_cache_size(&state2), 1);
99101

@@ -104,7 +106,8 @@ async fn load_table_stats_with_session_level_cache() {
104106
assert_eq!(exec3.statistics().unwrap().num_rows, Precision::Exact(8));
105107
assert_eq!(
106108
exec3.statistics().unwrap().total_byte_size,
107-
Precision::Exact(671)
109+
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
110+
Precision::Exact(671),
108111
);
109112
// List same file no increase
110113
assert_eq!(get_static_cache_size(&state1), 1);

datafusion/datasource/src/file_scan_config.rs

Lines changed: 102 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,7 @@ use arrow::{
3131
buffer::Buffer,
3232
datatypes::{ArrowNativeType, DataType, Field, Schema, SchemaRef, UInt16Type},
3333
};
34-
use datafusion_common::{
35-
exec_err, stats::Precision, ColumnStatistics, Constraints, Result, Statistics,
36-
};
34+
use datafusion_common::{exec_err, ColumnStatistics, Constraints, Result, Statistics};
3735
use datafusion_common::{DataFusionError, ScalarValue};
3836
use datafusion_execution::{
3937
object_store::ObjectStoreUrl, SendableRecordBatchStream, TaskContext,
@@ -86,20 +84,22 @@ use crate::{
8684
/// # Field::new("c4", DataType::Int32, false),
8785
/// # ]));
8886
/// # // Note: crate mock ParquetSource, as ParquetSource is not in the datasource crate
89-
/// # struct ParquetSource {};
87+
/// # struct ParquetSource {
88+
/// # projected_statistics: Option<Statistics>
89+
/// # };
9090
/// # impl FileSource for ParquetSource {
9191
/// # fn create_file_opener(&self, _: Arc<dyn ObjectStore>, _: &FileScanConfig, _: usize) -> Arc<dyn FileOpener> { unimplemented!() }
9292
/// # fn as_any(&self) -> &dyn Any { self }
9393
/// # fn with_batch_size(&self, _: usize) -> Arc<dyn FileSource> { unimplemented!() }
9494
/// # fn with_schema(&self, _: SchemaRef) -> Arc<dyn FileSource> { unimplemented!() }
9595
/// # fn with_projection(&self, _: &FileScanConfig) -> Arc<dyn FileSource> { unimplemented!() }
96-
/// # fn with_statistics(&self, _: Statistics) -> Arc<dyn FileSource> { Arc::new(Self::new()) }
96+
/// # fn with_statistics(&self, statistics: Statistics) -> Arc<dyn FileSource> { Arc::new(Self {projected_statistics: Some(statistics)} ) }
9797
/// # fn metrics(&self) -> &ExecutionPlanMetricsSet { unimplemented!() }
98-
/// # fn statistics(&self) -> datafusion_common::Result<Statistics> { unimplemented!() }
98+
/// # fn statistics(&self) -> datafusion_common::Result<Statistics> { Ok(self.projected_statistics.clone().expect("projected_statistics should be set")) }
9999
/// # fn file_type(&self) -> &str { "parquet" }
100100
/// # }
101101
/// # impl ParquetSource {
102-
/// # fn new() -> Self { Self{} }
102+
/// # fn new() -> Self { Self {projected_statistics: None} }
103103
/// # }
104104
/// // create FileScan config for reading parquet files from file://
105105
/// let object_store_url = ObjectStoreUrl::local_filesystem();
@@ -244,7 +244,7 @@ impl DataSource for FileScanConfig {
244244
}
245245

246246
fn statistics(&self) -> Result<Statistics> {
247-
self.file_source.statistics()
247+
Ok(self.projected_stats())
248248
}
249249

250250
fn with_fetch(&self, limit: Option<usize>) -> Option<Arc<dyn DataSource>> {
@@ -324,13 +324,7 @@ impl FileScanConfig {
324324

325325
/// Set the file source
326326
pub fn with_source(mut self, file_source: Arc<dyn FileSource>) -> Self {
327-
let (
328-
_projected_schema,
329-
_constraints,
330-
projected_statistics,
331-
_projected_output_ordering,
332-
) = self.project();
333-
self.file_source = file_source.with_statistics(projected_statistics);
327+
self.file_source = file_source.with_statistics(self.statistics.clone());
334328
self
335329
}
336330

@@ -342,10 +336,75 @@ impl FileScanConfig {
342336

343337
/// Set the statistics of the files
344338
pub fn with_statistics(mut self, statistics: Statistics) -> Self {
345-
self.statistics = statistics;
339+
self.statistics = statistics.clone();
340+
self.file_source = self.file_source.with_statistics(statistics);
346341
self
347342
}
348343

344+
fn projection_indices(&self) -> Vec<usize> {
345+
match &self.projection {
346+
Some(proj) => proj.clone(),
347+
None => (0..self.file_schema.fields().len()
348+
+ self.table_partition_cols.len())
349+
.collect(),
350+
}
351+
}
352+
353+
fn projected_stats(&self) -> Statistics {
354+
let statistics = self
355+
.file_source
356+
.statistics()
357+
.unwrap_or(self.statistics.clone());
358+
359+
let table_cols_stats = self
360+
.projection_indices()
361+
.into_iter()
362+
.map(|idx| {
363+
if idx < self.file_schema.fields().len() {
364+
statistics.column_statistics[idx].clone()
365+
} else {
366+
// TODO provide accurate stat for partition column (#1186)
367+
ColumnStatistics::new_unknown()
368+
}
369+
})
370+
.collect();
371+
372+
Statistics {
373+
num_rows: statistics.num_rows,
374+
// TODO correct byte size: https://github.com/apache/datafusion/issues/14936
375+
total_byte_size: statistics.total_byte_size,
376+
column_statistics: table_cols_stats,
377+
}
378+
}
379+
380+
fn projected_schema(&self) -> Arc<Schema> {
381+
let table_fields: Vec<_> = self
382+
.projection_indices()
383+
.into_iter()
384+
.map(|idx| {
385+
if idx < self.file_schema.fields().len() {
386+
self.file_schema.field(idx).clone()
387+
} else {
388+
let partition_idx = idx - self.file_schema.fields().len();
389+
self.table_partition_cols[partition_idx].clone()
390+
}
391+
})
392+
.collect();
393+
394+
Arc::new(Schema::new_with_metadata(
395+
table_fields,
396+
self.file_schema.metadata().clone(),
397+
))
398+
}
399+
400+
fn projected_constraints(&self) -> Constraints {
401+
let indexes = self.projection_indices();
402+
403+
self.constraints
404+
.project(&indexes)
405+
.unwrap_or_else(Constraints::empty)
406+
}
407+
349408
/// Set the projection of the files
350409
pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
351410
self.projection = projection;
@@ -433,54 +492,13 @@ impl FileScanConfig {
433492
);
434493
}
435494

436-
let proj_indices = if let Some(proj) = &self.projection {
437-
proj
438-
} else {
439-
let len = self.file_schema.fields().len() + self.table_partition_cols.len();
440-
&(0..len).collect::<Vec<_>>()
441-
};
442-
443-
let mut table_fields = vec![];
444-
let mut table_cols_stats = vec![];
445-
for idx in proj_indices {
446-
if *idx < self.file_schema.fields().len() {
447-
let field = self.file_schema.field(*idx);
448-
table_fields.push(field.clone());
449-
table_cols_stats.push(self.statistics.column_statistics[*idx].clone())
450-
} else {
451-
let partition_idx = idx - self.file_schema.fields().len();
452-
table_fields.push(self.table_partition_cols[partition_idx].to_owned());
453-
// TODO provide accurate stat for partition column (#1186)
454-
table_cols_stats.push(ColumnStatistics::new_unknown())
455-
}
456-
}
457-
458-
let table_stats = Statistics {
459-
num_rows: self.statistics.num_rows,
460-
// TODO correct byte size?
461-
total_byte_size: Precision::Absent,
462-
column_statistics: table_cols_stats,
463-
};
464-
465-
let projected_schema = Arc::new(Schema::new_with_metadata(
466-
table_fields,
467-
self.file_schema.metadata().clone(),
468-
));
495+
let schema = self.projected_schema();
496+
let constraints = self.projected_constraints();
497+
let stats = self.projected_stats();
469498

470-
let projected_constraints = self
471-
.constraints
472-
.project(proj_indices)
473-
.unwrap_or_else(Constraints::empty);
499+
let output_ordering = get_projected_output_ordering(self, &schema);
474500

475-
let projected_output_ordering =
476-
get_projected_output_ordering(self, &projected_schema);
477-
478-
(
479-
projected_schema,
480-
projected_constraints,
481-
table_stats,
482-
projected_output_ordering,
483-
)
501+
(schema, constraints, stats, output_ordering)
484502
}
485503

486504
#[cfg_attr(not(feature = "avro"), allow(unused))] // Only used by avro
@@ -1048,6 +1066,7 @@ mod tests {
10481066
compute::SortOptions,
10491067
};
10501068

1069+
use datafusion_common::stats::Precision;
10511070
use datafusion_common::{assert_batches_eq, DFSchema};
10521071
use datafusion_expr::{execution_props::ExecutionProps, SortExpr};
10531072
use datafusion_physical_expr::create_physical_expr;
@@ -1203,6 +1222,12 @@ mod tests {
12031222
),
12041223
];
12051224
// create a projected schema
1225+
let statistics = Statistics {
1226+
num_rows: Precision::Inexact(3),
1227+
total_byte_size: Precision::Absent,
1228+
column_statistics: Statistics::unknown_column(&file_batch.schema()),
1229+
};
1230+
12061231
let conf = config_for_projection(
12071232
file_batch.schema(),
12081233
// keep all cols from file and 2 from partitioning
@@ -1213,9 +1238,23 @@ mod tests {
12131238
file_batch.schema().fields().len(),
12141239
file_batch.schema().fields().len() + 2,
12151240
]),
1216-
Statistics::new_unknown(&file_batch.schema()),
1241+
statistics.clone(),
12171242
to_partition_cols(partition_cols.clone()),
12181243
);
1244+
1245+
let source_statistics = conf.file_source.statistics().unwrap();
1246+
let conf_stats = conf.statistics().unwrap();
1247+
1248+
// projection should be reflected in the file source statistics
1249+
assert_eq!(conf_stats.num_rows, Precision::Inexact(3));
1250+
1251+
// 3 original statistics + 2 partition statistics
1252+
assert_eq!(conf_stats.column_statistics.len(), 5);
1253+
1254+
// file statics should not be modified
1255+
assert_eq!(source_statistics, statistics);
1256+
assert_eq!(source_statistics.column_statistics.len(), 3);
1257+
12191258
let (proj_schema, ..) = conf.project();
12201259
// created a projector for that projected schema
12211260
let mut proj = PartitionColumnProjector::new(

0 commit comments

Comments (0)