@@ -62,6 +62,7 @@ use parquet::basic::{ConvertedType, LogicalType};
62
62
use parquet:: file:: { metadata:: ParquetMetaData , properties:: WriterProperties } ;
63
63
use parquet:: schema:: types:: ColumnDescriptor ;
64
64
use tokio:: task:: JoinSet ;
65
+ use datafusion_common:: internal_err;
65
66
66
67
mod metrics;
67
68
mod page_filter;
@@ -199,10 +200,6 @@ pub struct ParquetExec {
199
200
table_parquet_options : TableParquetOptions ,
200
201
/// Optional user defined schema adapter
201
202
schema_adapter_factory : Option < Arc < dyn SchemaAdapterFactory > > ,
202
- /// Optional starting RowGroupSets for each file in the file groups
203
- /// TODO encapsulate into some sort of struct that can also have
204
- /// page filters / selections
205
- row_groups : Vec < Vec < RowGroupSet > > ,
206
203
}
207
204
208
205
/// [`ParquetExecBuilder`], builder for [`ParquetExec`].
@@ -215,8 +212,6 @@ pub struct ParquetExecBuilder {
215
212
table_parquet_options : TableParquetOptions ,
216
213
parquet_file_reader_factory : Option < Arc < dyn ParquetFileReaderFactory > > ,
217
214
schema_adapter_factory : Option < Arc < dyn SchemaAdapterFactory > > ,
218
- /// Optional starting RowGroupSets for each file in the file groups
219
- row_groups : Vec < Vec < RowGroupSet > > ,
220
215
}
221
216
222
217
impl ParquetExecBuilder {
@@ -238,7 +233,6 @@ impl ParquetExecBuilder {
238
233
table_parquet_options,
239
234
parquet_file_reader_factory : None ,
240
235
schema_adapter_factory : None ,
241
- row_groups : vec ! [ ] ,
242
236
}
243
237
}
244
238
@@ -295,20 +289,6 @@ impl ParquetExecBuilder {
295
289
self
296
290
}
297
291
298
- /// Set the row group filter for the scan
299
- ///
300
- /// The ParquetExec will only scan row groups specified
301
- /// the format is a vec of row group indexes
302
- /// for each file in the file groups
303
- /// For example
304
- /// ```
305
- /// fooo
306
- /// ```
307
- pub fn with_row_groups ( mut self , row_groups : Vec < Vec < RowGroupSet > > ) -> Self {
308
- self . row_groups = row_groups;
309
- self
310
- }
311
-
312
292
/// Set optional schema adapter factory.
313
293
///
314
294
/// [`SchemaAdapterFactory`] allows user to specify how fields from the
@@ -338,7 +318,6 @@ impl ParquetExecBuilder {
338
318
table_parquet_options,
339
319
parquet_file_reader_factory,
340
320
schema_adapter_factory,
341
- row_groups,
342
321
} = self ;
343
322
344
323
let base_config = file_scan_config;
@@ -397,7 +376,6 @@ impl ParquetExecBuilder {
397
376
cache,
398
377
table_parquet_options,
399
378
schema_adapter_factory,
400
- row_groups,
401
379
}
402
380
}
403
381
}
@@ -749,6 +727,7 @@ impl FileOpener for ParquetOpener {
749
727
file_meta. location ( ) . as_ref ( ) ,
750
728
& self . metrics ,
751
729
) ;
730
+ let extensions = file_meta. extensions . clone ( ) ;
752
731
753
732
let reader: Box < dyn AsyncFileReader > =
754
733
self . parquet_file_reader_factory . create_reader (
@@ -823,7 +802,7 @@ impl FileOpener for ParquetOpener {
823
802
let predicate = pruning_predicate. as_ref ( ) . map ( |p| p. as_ref ( ) ) ;
824
803
let rg_metadata = file_metadata. row_groups ( ) ;
825
804
// track which row groups to actually read
826
- let mut row_groups = RowGroupSet :: new ( rg_metadata. len ( ) ) ;
805
+ let mut row_groups = create_row_group_set ( extensions , rg_metadata. len ( ) ) ? ;
827
806
// if there is a range restricting what parts of the file to read
828
807
if let Some ( range) = file_range. as_ref ( ) {
829
808
row_groups. prune_by_range ( rg_metadata, range) ;
@@ -890,6 +869,26 @@ impl FileOpener for ParquetOpener {
890
869
}
891
870
}
892
871
872
+ /// Return a `RowGroupSet` to read from a parquet file. If there is a
873
+ /// RowGroupSet on the metadata, uses that, otherwise creates a new one.
874
+ fn create_row_group_set ( extensions : Option < Arc < dyn Any + Send + Sync > > , num_row_groups : usize ) -> Result < RowGroupSet > {
875
+ if let Some ( extensions) = extensions {
876
+ println ! ( "Had extensions" ) ;
877
+ if let Some ( initial_row_group_set) = extensions. downcast_ref :: < RowGroupSet > ( ) {
878
+ // use the row group set from the metadata
879
+ println ! ( "using row group set from metadata: {:?}" , initial_row_group_set) ;
880
+ if initial_row_group_set. len ( ) != num_row_groups {
881
+ return internal_err ! (
882
+ "Provided RowGroupSet length ({}) does not match number of row groups in file: {num_row_groups}" ,
883
+ initial_row_group_set. len( ) ) ;
884
+ }
885
+ return Ok ( initial_row_group_set. clone ( ) ) ;
886
+ }
887
+ }
888
+ // default to scanning all row groups
889
+ Ok ( RowGroupSet :: new ( num_row_groups) )
890
+ }
891
+
893
892
fn should_enable_page_index (
894
893
enable_page_index : bool ,
895
894
page_pruning_predicate : & Option < Arc < PagePruningPredicate > > ,
0 commit comments