Skip to content

Commit b7af342

Browse files
committed
wire it in for real!
1 parent 3221378 commit b7af342

File tree

4 files changed

+41
-35
lines changed

4 files changed

+41
-35
lines changed

datafusion-examples/examples/parquet_index_advanced.rs

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -366,9 +366,6 @@ impl ParquetScanBuilder {
366366
.with_limit(limit)
367367
.with_projection(projection);
368368

369-
// need a set of files that matches the file scan config groups exactly
370-
let mut row_group_sets = vec![];
371-
372369
// Transform to the format needed to pass to ParquetExec
373370
// Create one file group per file (default to scanning them all in parallel)
374371
for (file_name, scanned_file) in files {
@@ -379,12 +376,12 @@ impl ParquetScanBuilder {
379376

380377
let path = dir.join(file_name);
381378
let canonical_path = fs::canonicalize(path)?;
382-
// TODO add the row group indexes somehow
383-
file_scan_config = file_scan_config.with_file(PartitionedFile::new(
384-
canonical_path.display().to_string(),
385-
file_size,
386-
));
387-
row_group_sets.push(vec![row_group_set]);
379+
let partitioned_file = PartitionedFile::new(canonical_path.display().to_string(), file_size)
380+
// add the row group set as an extension
381+
.with_extensions(Arc::new(row_group_set) as _);
382+
383+
384+
file_scan_config = file_scan_config.with_file(partitioned_file);
388385
}
389386

390387
let Some(parquet_file_reader_factory) = parquet_file_reader_factory else {
@@ -393,8 +390,7 @@ impl ParquetScanBuilder {
393390

394391
// build the actual parquet exec
395392
let mut builder = ParquetExec::builder(file_scan_config)
396-
.with_parquet_file_reader_factory(parquet_file_reader_factory)
397-
.with_row_groups(row_group_sets);
393+
.with_parquet_file_reader_factory(parquet_file_reader_factory);
398394

399395
if let Some(predicate) = predicate {
400396
builder = builder.with_predicate(predicate);

datafusion/core/src/datasource/listing/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,12 @@ impl PartitionedFile {
134134
self.range = Some(FileRange { start, end });
135135
self
136136
}
137+
138+
/// Update the file with optional user metadata
139+
pub fn with_extensions(mut self, extensions: Arc<dyn std::any::Any + Send + Sync>) -> Self {
140+
self.extensions = Some(extensions);
141+
self
142+
}
137143
}
138144

139145
impl From<ObjectMeta> for PartitionedFile {

datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ use parquet::basic::{ConvertedType, LogicalType};
6262
use parquet::file::{metadata::ParquetMetaData, properties::WriterProperties};
6363
use parquet::schema::types::ColumnDescriptor;
6464
use tokio::task::JoinSet;
65+
use datafusion_common::internal_err;
6566

6667
mod metrics;
6768
mod page_filter;
@@ -199,10 +200,6 @@ pub struct ParquetExec {
199200
table_parquet_options: TableParquetOptions,
200201
/// Optional user defined schema adapter
201202
schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
202-
/// Optional starting RowGroupSets for each file in the file groups
203-
/// TODO encapsulate into some sort of struct that can also have
204-
/// page filters / selections
205-
row_groups: Vec<Vec<RowGroupSet>>,
206203
}
207204

208205
/// [`ParquetExecBuilder`]`, builder for [`ParquetExec`].
@@ -215,8 +212,6 @@ pub struct ParquetExecBuilder {
215212
table_parquet_options: TableParquetOptions,
216213
parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>,
217214
schema_adapter_factory: Option<Arc<dyn SchemaAdapterFactory>>,
218-
/// Optional starting RowGroupSets for each file in the file groups
219-
row_groups: Vec<Vec<RowGroupSet>>,
220215
}
221216

222217
impl ParquetExecBuilder {
@@ -238,7 +233,6 @@ impl ParquetExecBuilder {
238233
table_parquet_options,
239234
parquet_file_reader_factory: None,
240235
schema_adapter_factory: None,
241-
row_groups: vec![],
242236
}
243237
}
244238

@@ -295,20 +289,6 @@ impl ParquetExecBuilder {
295289
self
296290
}
297291

298-
/// Set the row group filter for the scan
299-
///
300-
/// The ParquetExec will only scan row groups specified
301-
/// the format is a vec of of row group indexes
302-
/// for each file in the file groups
303-
/// For example
304-
/// ```
305-
/// fooo
306-
/// ```
307-
pub fn with_row_groups(mut self, row_groups: Vec<Vec<RowGroupSet>>) -> Self {
308-
self.row_groups = row_groups;
309-
self
310-
}
311-
312292
/// Set optional schema adapter factory.
313293
///
314294
/// [`SchemaAdapterFactory`] allows user to specify how fields from the
@@ -338,7 +318,6 @@ impl ParquetExecBuilder {
338318
table_parquet_options,
339319
parquet_file_reader_factory,
340320
schema_adapter_factory,
341-
row_groups,
342321
} = self;
343322

344323
let base_config = file_scan_config;
@@ -397,7 +376,6 @@ impl ParquetExecBuilder {
397376
cache,
398377
table_parquet_options,
399378
schema_adapter_factory,
400-
row_groups,
401379
}
402380
}
403381
}
@@ -749,6 +727,7 @@ impl FileOpener for ParquetOpener {
749727
file_meta.location().as_ref(),
750728
&self.metrics,
751729
);
730+
let extensions = file_meta.extensions.clone();
752731

753732
let reader: Box<dyn AsyncFileReader> =
754733
self.parquet_file_reader_factory.create_reader(
@@ -823,7 +802,7 @@ impl FileOpener for ParquetOpener {
823802
let predicate = pruning_predicate.as_ref().map(|p| p.as_ref());
824803
let rg_metadata = file_metadata.row_groups();
825804
// track which row groups to actually read
826-
let mut row_groups = RowGroupSet::new(rg_metadata.len());
805+
let mut row_groups = create_row_group_set(extensions, rg_metadata.len())?;
827806
// if there is a range restricting what parts of the file to read
828807
if let Some(range) = file_range.as_ref() {
829808
row_groups.prune_by_range(rg_metadata, range);
@@ -890,6 +869,26 @@ impl FileOpener for ParquetOpener {
890869
}
891870
}
892871

872+
/// Return a `RowGroupSet` to read from a parquet file. If there is a
873+
/// RowGroupSet on the metadata, uses that; otherwise creates a new one.
874+
fn create_row_group_set(extensions: Option<Arc<dyn Any + Send + Sync>>, num_row_groups: usize) -> Result<RowGroupSet> {
875+
if let Some(extensions) = extensions {
876+
println!("Had extensions");
877+
if let Some(initial_row_group_set) = extensions.downcast_ref::<RowGroupSet>() {
878+
// use the row group set from the metadata
879+
println!("using row group set from metadata: {:?}", initial_row_group_set);
880+
if initial_row_group_set.len() != num_row_groups {
881+
return internal_err!(
882+
"Provided RowGroupSet length ({}) does not match number of row groups in file: {num_row_groups}",
883+
initial_row_group_set.len());
884+
}
885+
return Ok(initial_row_group_set.clone());
886+
}
887+
}
888+
// default to scanning all row groups
889+
Ok(RowGroupSet::new(num_row_groups))
890+
}
891+
893892
fn should_enable_page_index(
894893
enable_page_index: bool,
895894
page_pruning_predicate: &Option<Arc<PagePruningPredicate>>,

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ impl RowGroupSet {
6464
}
6565
}
6666

67+
/// Provide a reference to this struct as an `Any` reference
68+
pub fn as_any(&self) -> &dyn std::any::Any {
69+
self
70+
}
71+
6772
/// Set the i-th row group to true (should scan)
6873
pub fn do_scan(&mut self, idx: usize) {
6974
self.row_groups[idx] = true;

0 commit comments

Comments
 (0)