Commit 2f1b23f

reorganize
1 parent 7195b37 commit 2f1b23f

1 file changed: +74 -24 lines

datafusion-examples/examples/parquet_index_advanced.rs

Lines changed: 74 additions & 24 deletions
@@ -46,7 +46,7 @@ use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::*;
 use datafusion_common::config::TableParquetOptions;
 use datafusion_common::{
-    internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
+    internal_datafusion_err, internal_err, DFSchema, DataFusionError, Result, ScalarValue,
 };
 use datafusion_expr::utils::conjunction;
 use datafusion_expr::{TableProviderFilterPushDown, TableType};
@@ -259,27 +259,15 @@ impl TableProvider for IndexTableProvider {
         // Use the index to find the row groups that might have data that matches the
         // predicate. Any file that can not have data that matches the predicate
         // will not be returned.
-        let scan_builder = self.index.get_row_groups(predicate.clone())?;
-        let file_scan_config = scan_builder
-            .build(self.schema(), &self.dir)?
+        let exec = self
+            .index
+            .get_row_groups(predicate.clone())?
             .with_limit(limit)
-            .with_projection(projection.cloned());
+            .with_projection(projection.cloned())
+            .with_predicate(predicate)
+            .with_parquet_file_reader_factory(Arc::clone(&self.index.parquet_factory))
+            .build(self.schema(), &self.dir)?;

-        // build the actual parquet exec
-        let metadata_size_hint = None;
-        let table_parquet_options = TableParquetOptions::default();
-
-        // configure a Parquet opener that can provide the metadata for the
-        // files that are being scanned
-
-        // TODO make a builder for parquet exec
-        let exec = ParquetExec::new(
-            file_scan_config,
-            Some(predicate),
-            metadata_size_hint,
-            table_parquet_options,
-        )
-        .with_parquet_file_reader_factory(Arc::clone(&self.index.parquet_factory));
         Ok(Arc::new(exec))
     }

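The call site in `scan` now reads as a single fluent chain that ends in `build`. For readers unfamiliar with the idiom, here is a minimal, dependency-free sketch of the consuming-builder pattern used above; `ScanBuilder`, `Scan`, and their fields are illustrative stand-ins, not DataFusion types:

// Minimal sketch of a consuming builder. All names here are
// illustrative stand-ins, not DataFusion types.
#[derive(Debug, PartialEq)]
struct Scan {
    limit: Option<usize>,
    projection: Option<Vec<usize>>,
}

#[derive(Default)]
struct ScanBuilder {
    limit: Option<usize>,
    projection: Option<Vec<usize>>,
}

impl ScanBuilder {
    // Each setter takes `mut self` by value and returns `Self`, so the
    // calls chain into one expression that ends in `build`.
    fn with_limit(mut self, limit: Option<usize>) -> Self {
        self.limit = limit;
        self
    }

    fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
        self.projection = projection;
        self
    }

    // `build` consumes the builder and produces the final value.
    fn build(self) -> Scan {
        Scan { limit: self.limit, projection: self.projection }
    }
}

fn main() {
    let scan = ScanBuilder::default()
        .with_limit(Some(10))
        .with_projection(Some(vec![0, 2]))
        .build();
    assert_eq!(scan, Scan { limit: Some(10), projection: Some(vec![0, 2]) });
}

Because ownership moves through the chain, `scan` above shrinks to one expression with no intermediate `file_scan_config` binding.
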
@@ -300,6 +288,16 @@ impl TableProvider for IndexTableProvider {
 struct ParquetScanBuilder {
     /// Files to scan. Use btree map for deterministic order
     files: BTreeMap<String, ScannedFile>,
+    /// Columns on which to project the data. Indexes that are higher than the
+    /// number of columns of `file_schema` refer to `table_partition_cols`.
+    projection: Option<Vec<usize>>,
+    /// The maximum number of records to read from this plan. If `None`,
+    /// all records after filtering are returned.
+    limit: Option<usize>,
+    /// Optional predicate for row filtering during parquet scan
+    predicate: Option<Arc<dyn PhysicalExpr>>,
+    /// user defined parquet file reader factory
+    parquet_file_reader_factory: Option<Arc<dyn ParquetFileReaderFactory>>,
 }

 impl ParquetScanBuilder {
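
The doc comment on `projection` encodes a convention worth noting: an index past the end of the file schema selects a table partition column. A hypothetical helper (not part of this commit or of DataFusion) that splits projection indexes according to that convention:

// Hypothetical helper, not part of this commit: split projection indexes
// into file-schema columns and partition columns, following the convention
// documented on `projection` above.
fn split_projection(projection: &[usize], num_file_cols: usize) -> (Vec<usize>, Vec<usize>) {
    let mut file_cols = Vec::new();
    let mut partition_cols = Vec::new();
    for &i in projection {
        if i < num_file_cols {
            // Index within the file schema: a normal column.
            file_cols.push(i);
        } else {
            // Index past the file schema: the (i - num_file_cols)-th
            // table partition column.
            partition_cols.push(i - num_file_cols);
        }
    }
    (file_cols, partition_cols)
}

fn main() {
    // A file with 3 columns: index 3 refers to partition column 0.
    assert_eq!(split_projection(&[0, 2, 3], 3), (vec![0, 2], vec![0]));
}
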
@@ -319,18 +317,56 @@ impl ParquetScanBuilder {
         }
     }

+    /// Set the projection of the scan being built
+    pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
+        self.projection = projection;
+        self
+    }
+    /// Set the limit of the scan being built
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+        self.limit = limit;
+        self
+    }
+
+    /// Set the predicate of the scan being built
+    pub fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
+        self.predicate = Some(predicate);
+        self
+    }
+
+    pub fn with_parquet_file_reader_factory(
+        mut self,
+        parquet_file_reader_factory: Arc<dyn ParquetFileReaderFactory>,
+    ) -> Self {
+        self.parquet_file_reader_factory = Some(parquet_file_reader_factory);
+        self
+    }
+
     /// Creates a ParquetExec that only scans the files and row groups specified in this builder
     ///
     /// # Parameters
     /// * `schema` - the schema of the files to be scanned
     /// * `dir` - the directory containing the files
-    fn build(self, schema: SchemaRef, dir: &Path) -> Result<FileScanConfig> {
+    ///
+    /// # Returns
+    /// * a ParquetExec that scans only the files and row groups specified in this builder
+    fn build(self, schema: SchemaRef, dir: &Path) -> Result<ParquetExec> {
+        let Self {
+            files,
+            projection,
+            limit,
+            predicate,
+            parquet_file_reader_factory,
+        } = self;
+
         let object_store_url = ObjectStoreUrl::parse("file://")?;
-        let mut file_scan_config = FileScanConfig::new(object_store_url, schema);
+        let mut file_scan_config = FileScanConfig::new(object_store_url, schema)
+            .with_limit(limit)
+            .with_projection(projection);

         // Transform to the format needed to pass to ParquetExec
         // Create one file group per file (default to scanning them all in parallel)
-        for (file_name, scanned_file) in self.files {
+        for (file_name, scanned_file) in files {
             let ScannedFile {
                 file_size,
                 row_groups,
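
Destructuring `self` at the top of `build` is a deliberate touch: the pattern names every field with no `..` catch-all, so adding a field to `ParquetScanBuilder` later will not compile until `build` handles it. A dependency-free sketch of the same pattern, with hypothetical names:

// Hypothetical builder, for illustration only.
struct QueryBuilder {
    table: String,
    limit: Option<usize>,
}

impl QueryBuilder {
    fn build(self) -> String {
        // Exhaustive destructuring: no `..` rest pattern, so if a new field
        // is added to QueryBuilder and not listed here, this line stops
        // compiling and build() cannot silently ignore the new setting.
        let Self { table, limit } = self;
        match limit {
            Some(n) => format!("SELECT * FROM {table} LIMIT {n}"),
            None => format!("SELECT * FROM {table}"),
        }
    }
}

fn main() {
    let builder = QueryBuilder { table: "t".to_string(), limit: Some(5) };
    assert_eq!(builder.build(), "SELECT * FROM t LIMIT 5");
}
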
@@ -345,7 +381,21 @@ impl ParquetScanBuilder {
             ));
         }

-        Ok(file_scan_config)
+        let Some(parquet_file_reader_factory) = parquet_file_reader_factory else {
+            return internal_err!("Parquet file reader factory not set");
+        };
+
+        // build the actual parquet exec
+        let metadata_size_hint = None;
+        let table_parquet_options = TableParquetOptions::default();
+        let exec = ParquetExec::new(
+            file_scan_config,
+            predicate,
+            metadata_size_hint,
+            table_parquet_options,
+        )
+        .with_parquet_file_reader_factory(parquet_file_reader_factory);
+        Ok(exec)
     }
 }

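The `let ... else` guard above turns a missing required field into a `Result::Err` via `internal_err!` rather than panicking. A standalone sketch of the same guard, assuming a plain `String` error in place of DataFusion's error macro:

// Hypothetical stand-ins: a `String` error replaces DataFusion's
// `internal_err!` macro, and `Exec` replaces `ParquetExec`.
struct Exec {
    factory_name: String,
}

fn build(factory: Option<String>) -> Result<Exec, String> {
    // let-else: bind the inner value if the field was set, otherwise
    // return early from the enclosing function with an error.
    let Some(factory_name) = factory else {
        return Err("Parquet file reader factory not set".to_string());
    };
    Ok(Exec { factory_name })
}

fn main() {
    assert!(build(None).is_err());
    let exec = build(Some("cached_metadata".to_string())).unwrap();
    assert_eq!(exec.factory_name, "cached_metadata");
}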