@@ -46,7 +46,7 @@ use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::*;
 use datafusion_common::config::TableParquetOptions;
 use datafusion_common::{
-    internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
+    internal_datafusion_err, internal_err, DFSchema, DataFusionError, Result, ScalarValue,
 };
 use datafusion_expr::utils::conjunction;
 use datafusion_expr::{TableProviderFilterPushDown, TableType};
@@ -259,27 +259,15 @@ impl TableProvider for IndexTableProvider {
         // Use the index to find the row groups that might have data that matches the
         // predicate. Any file that can not have data that matches the predicate
         // will not be returned.
-        let scan_builder = self.index.get_row_groups(predicate.clone())?;
-        let file_scan_config = scan_builder
-            .build(self.schema(), &self.dir)?
+        let exec = self
+            .index
+            .get_row_groups(predicate.clone())?
             .with_limit(limit)
-            .with_projection(projection.cloned());
+            .with_projection(projection.cloned())
+            .with_predicate(predicate)
+            .with_parquet_file_reader_factory(Arc::clone(&self.index.parquet_factory))
+            .build(self.schema(), &self.dir)?;

-        // build the actual parquet exec
-        let metadata_size_hint = None;
-        let table_parquet_options = TableParquetOptions::default();
-
-        // configure a Parquet opener that can provide the metadata for the
-        // files that are being scanned
-
-        // TODO make a builder for parquet exec
-        let exec = ParquetExec::new(
-            file_scan_config,
-            Some(predicate),
-            metadata_size_hint,
-            table_parquet_options,
-        )
-        .with_parquet_file_reader_factory(Arc::clone(&self.index.parquet_factory));
         Ok(Arc::new(exec))
     }
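The change above collapses the old two-phase setup (build a FileScanConfig, then wrap it in a ParquetExec) into a single consuming-builder chain on the value returned by get_row_groups. Below is a minimal, dependency-free sketch of that pattern; ScanBuilder and its fields are illustrative stand-ins, not code from this PR:

// Illustrative stand-in for the ParquetScanBuilder style used above:
// every setter takes `mut self` and returns `Self`, so configuration
// chains left to right and a final `build` consumes the builder.
#[derive(Default)]
struct ScanBuilder {
    limit: Option<usize>,
    projection: Option<Vec<usize>>,
}

impl ScanBuilder {
    fn with_limit(mut self, limit: Option<usize>) -> Self {
        self.limit = limit;
        self
    }

    fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
        self.projection = projection;
        self
    }

    // Stand-in for build(): consumes the accumulated configuration.
    fn build(self) -> String {
        format!("scan limit={:?} projection={:?}", self.limit, self.projection)
    }
}

fn main() {
    // Mirrors the chained call shape of the new `scan` body.
    let plan = ScanBuilder::default()
        .with_limit(Some(10))
        .with_projection(Some(vec![0, 2]))
        .build();
    println!("{plan}");
}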
@@ -300,6 +288,16 @@ impl TableProvider for IndexTableProvider {
 struct ParquetScanBuilder {
     /// Files to scan. Use btree map for deterministic order
     files: BTreeMap<String, ScannedFile>,
+    /// Columns on which to project the data. Indexes that are higher than the
+    /// number of columns of `file_schema` refer to `table_partition_cols`.
+    projection: Option<Vec<usize>>,
+    /// The maximum number of records to read from this plan. If `None`,
+    /// all records after filtering are returned.
+    limit: Option<usize>,
+    /// Optional predicate for row filtering during parquet scan
+    predicate: Option<Arc<dyn PhysicalExpr>>,
+    /// user defined parquet file reader factory
+    parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>,
 }

 impl ParquetScanBuilder {
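The projection field's doc comment encodes a subtle convention: indexes past the end of the file schema address table partition columns. A hedged illustration of that lookup rule follows; resolve_projection is a hypothetical helper written for this note, not part of the PR:

// Hypothetical helper illustrating the documented convention: projection
// indexes < file_cols.len() name file columns; anything higher selects a
// table partition column.
fn resolve_projection(
    projection: &[usize],
    file_cols: &[&str],
    partition_cols: &[&str],
) -> Vec<String> {
    projection
        .iter()
        .map(|&i| {
            if i < file_cols.len() {
                file_cols[i].to_string()
            } else {
                partition_cols[i - file_cols.len()].to_string()
            }
        })
        .collect()
}

fn main() {
    let file_cols = ["a", "b", "c"];
    let partition_cols = ["date"];
    // Index 1 is file column "b"; index 3 is past the file schema, so it
    // resolves to the first partition column, "date".
    assert_eq!(
        resolve_projection(&[1, 3], &file_cols, &partition_cols),
        vec!["b".to_string(), "date".to_string()]
    );
}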
@@ -319,18 +317,56 @@ impl ParquetScanBuilder {
         }
     }

+    /// Set the projection of the scan being built
+    pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
+        self.projection = projection;
+        self
+    }
+    /// Set the limit of the scan being built
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+        self.limit = limit;
+        self
+    }
+
+    /// Set the predicate of the scan being built
+    pub fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
+        self.predicate = Some(predicate);
+        self
+    }
+
+    pub fn with_parquet_file_reader_factory(
+        mut self,
+        parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
+    ) -> Self {
+        self.parquet_file_reader_factory = Some(parquet_file_reader_factory);
+        self
+    }
+
     /// Creates a ParquetExec that only scans the files and row groups specified in this builder
     ///
     /// # Parameters
     /// * `schema` - the schema of the files being scanned
     /// * `dir` - the directory containing the files
-    fn build(self, schema: SchemaRef, dir: &Path) -> Result<FileScanConfig> {
+    ///
+    /// # Returns
+    /// * a ParquetExec that scans only the files and row groups specified in this builder
+    fn build(self, schema: SchemaRef, dir: &Path) -> Result<ParquetExec> {
+        let Self {
+            files,
+            projection,
+            limit,
+            predicate,
+            parquet_file_reader_factory,
+        } = self;
+
         let object_store_url = ObjectStoreUrl::parse("file://")?;
-        let mut file_scan_config = FileScanConfig::new(object_store_url, schema);
+        let mut file_scan_config = FileScanConfig::new(object_store_url, schema)
+            .with_limit(limit)
+            .with_projection(projection);

         // Transform to the format needed to pass to ParquetExec
         // Create one file group per file (default to scanning them all in parallel)
-        for (file_name, scanned_file) in self.files {
+        for (file_name, scanned_file) in files {
             let ScannedFile {
                 file_size,
                 row_groups,
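A side benefit of the `let Self { ... } = self` destructuring at the top of the new build: the pattern is exhaustive, so a field later added to ParquetScanBuilder will not compile until build names it, and the method cannot silently ignore new configuration. A small standalone sketch of that effect, using illustrative types rather than the PR's:

struct Builder {
    limit: Option<usize>,
    predicate: Option<String>,
}

impl Builder {
    fn build(self) -> String {
        // Exhaustive destructuring: adding a field to `Builder` without
        // listing it here is a compile error.
        let Self { limit, predicate } = self;
        format!("limit={limit:?} predicate={predicate:?}")
    }
}

fn main() {
    let b = Builder {
        limit: Some(5),
        predicate: Some("a > 1".to_string()),
    };
    println!("{}", b.build());
}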
@@ -345,7 +381,21 @@ impl ParquetScanBuilder {
             ));
         }

-        Ok(file_scan_config)
+        let Some(parquet_file_reader_factory) = parquet_file_reader_factory else {
+            return internal_err!("Parquet file reader factory not set");
+        };
+
+        // build the actual parquet exec
+        let metadata_size_hint = None;
+        let table_parquet_options = TableParquetOptions::default();
+        let exec = ParquetExec::new(
+            file_scan_config,
+            predicate,
+            metadata_size_hint,
+            table_parquet_options,
+        )
+        .with_parquet_file_reader_factory(parquet_file_reader_factory);
+        Ok(exec)
     }
 }
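The let-else guard above turns a missing factory into a regular planning error rather than a panic: internal_err! (newly imported in the first hunk) evaluates to an Err(DataFusionError::Internal(...)) that build returns early. A dependency-free sketch of the same control flow, with a plain Err(String) standing in for the macro and the option field:

// `factory` stands in for the Option<Arc<dyn ParquetFileReaderFactory>>
// field; the string error stands in for internal_err!.
fn build_exec(factory: Option<&str>) -> Result<String, String> {
    // Diverge early when required configuration was never supplied,
    // exactly the shape of the `let ... else` guard in `build`.
    let Some(factory) = factory else {
        return Err("Parquet file reader factory not set".to_string());
    };
    Ok(format!("ParquetExec using factory `{factory}`"))
}

fn main() {
    assert!(build_exec(None).is_err());
    assert_eq!(
        build_exec(Some("cached-metadata")).unwrap(),
        "ParquetExec using factory `cached-metadata`"
    );
}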