Skip to content

Commit ea46e82

Browse files
alamb and Weijun-H authored
Add advanced_parquet_index.rs example of indexing into parquet files (#10701)
* Add `advanced_parquet_index.rs` example of indexing into parquet files * pre-load page index * fix comment * Apply suggestions from code review Thank you @Weijun-H Co-authored-by: Alex Huang <[email protected]> * Add ASCII ART * Update datafusion-examples/README.md Co-authored-by: Alex Huang <[email protected]> * Update datafusion-examples/examples/advanced_parquet_index.rs Co-authored-by: Alex Huang <[email protected]> * Improve / clarify comments based on review * Add page index caveat --------- Co-authored-by: Alex Huang <[email protected]>
1 parent 6c0e4fb commit ea46e82

File tree

8 files changed

+695
-5
lines changed

8 files changed

+695
-5
lines changed

datafusion-examples/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ cargo run --example csv_sql
4545
- [`advanced_udaf.rs`](examples/advanced_udaf.rs): Define and invoke a more complicated User Defined Aggregate Function (UDAF)
4646
- [`advanced_udf.rs`](examples/advanced_udf.rs): Define and invoke a more complicated User Defined Scalar Function (UDF)
4747
- [`advanced_udwf.rs`](examples/advanced_udwf.rs): Define and invoke a more complicated User Defined Window Function (UDWF)
48+
- [`advanced_parquet_index.rs`](examples/advanced_parquet_index.rs): Creates a detailed secondary index that covers the contents of several parquet files
4849
- [`avro_sql.rs`](examples/avro_sql.rs): Build and run a query plan from a SQL statement against a local AVRO file
4950
- [`catalog.rs`](examples/catalog.rs): Register the table into a custom catalog
5051
- [`csv_sql.rs`](examples/csv_sql.rs): Build and run a query plan from a SQL statement against a local CSV file

datafusion-examples/examples/advanced_parquet_index.rs

Lines changed: 664 additions & 0 deletions
Large diffs are not rendered by default.

datafusion/common/src/column.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,13 @@ impl Column {
127127
})
128128
}
129129

130+
/// Return the column's name.
131+
///
132+
/// Note: This ignores the relation and returns the column name only.
133+
pub fn name(&self) -> &str {
134+
&self.name
135+
}
136+
130137
/// Serialize column into a flat name string
131138
pub fn flat_name(&self) -> String {
132139
match &self.relation {

datafusion/common/src/config.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1393,6 +1393,13 @@ pub struct TableParquetOptions {
13931393
pub key_value_metadata: HashMap<String, Option<String>>,
13941394
}
13951395

1396+
impl TableParquetOptions {
1397+
/// Return new default TableParquetOptions
1398+
pub fn new() -> Self {
1399+
Self::default()
1400+
}
1401+
}
1402+
13961403
impl ConfigField for TableParquetOptions {
13971404
fn visit<V: Visit>(&self, v: &mut V, key_prefix: &str, description: &'static str) {
13981405
self.global.visit(v, key_prefix, description);

datafusion/core/src/datasource/physical_plan/parquet/access_plan.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,11 @@ impl ParquetAccessPlan {
139139
self.set(idx, RowGroupAccess::Skip);
140140
}
141141

142+
/// Mark the i-th row group to be scanned
143+
pub fn scan(&mut self, idx: usize) {
144+
self.set(idx, RowGroupAccess::Scan);
145+
}
146+
142147
/// Return true if the i-th row group should be scanned
143148
pub fn should_scan(&self, idx: usize) -> bool {
144149
self.row_groups[idx].should_scan()

datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,9 @@ pub use writer::plan_to_parquet;
186186
/// let exec = ParquetExec::builder(file_scan_config).build();
187187
/// ```
188188
///
189-
/// For a complete example, see the [`parquet_index_advanced` example]).
189+
/// For a complete example, see the [`advanced_parquet_index` example].
190190
///
191-
/// [`parquet_index_advanced` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/parquet_index_advanced.rs
191+
/// [`advanced_parquet_index` example]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/advanced_parquet_index.rs
192192
///
193193
/// # Execution Overview
194194
///

datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ impl RowGroupAccessPlanFilter {
5454
Self { access_plan }
5555
}
5656

57-
/// Return true if there are no row groups to scan
57+
/// Return true if there are no row groups
5858
pub fn is_empty(&self) -> bool {
5959
self.access_plan.is_empty()
6060
}

datafusion/core/src/physical_optimizer/pruning.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -471,8 +471,10 @@ pub struct PruningPredicate {
471471
/// Original physical predicate from which this predicate expr is derived
472472
/// (required for serialization)
473473
orig_expr: Arc<dyn PhysicalExpr>,
474-
/// [`LiteralGuarantee`]s that are used to try and prove a predicate can not
475-
/// possibly evaluate to `true`.
474+
/// [`LiteralGuarantee`]s used to try and prove a predicate can not possibly
475+
/// evaluate to `true`.
476+
///
477+
/// See [`PruningPredicate::literal_guarantees`] for more details.
476478
literal_guarantees: Vec<LiteralGuarantee>,
477479
}
478480

@@ -595,6 +597,10 @@ impl PruningPredicate {
595597
}
596598

597599
/// Returns a reference to the literal guarantees
600+
///
601+
/// Note that **all** `LiteralGuarantee`s must be satisfied for the
602+
/// expression to possibly be `true`. If any is not satisfied, the
603+
/// expression is guaranteed to be `null` or `false`.
598604
pub fn literal_guarantees(&self) -> &[LiteralGuarantee] {
599605
&self.literal_guarantees
600606
}

0 commit comments

Comments
 (0)