@@ -43,14 +43,51 @@ mod filter;
mod selection;
pub mod statistics;
- /// Builder for constructing parquet readers into arrow.
+ /// Builder for constructing Parquet readers that decode into [Apache Arrow]
+ /// arrays.
///
/// Most users should use one of the following specializations:
///
/// * synchronous API: [`ParquetRecordBatchReaderBuilder::try_new`]
/// * `async` API: [`ParquetRecordBatchStreamBuilder::new`]
///
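+ /// For example, a minimal synchronous read might look like the following
+ /// sketch (the file name `data.parquet` is a placeholder):
+ ///
+ /// ```no_run
+ /// # use std::fs::File;
+ /// # use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+ /// // Open a local Parquet file (hypothetical path)
+ /// let file = File::open("data.parquet").unwrap();
+ /// // Build a reader with default options and iterate the record batches
+ /// let reader = ParquetRecordBatchReaderBuilder::try_new(file)
+ ///     .unwrap()
+ ///     .build()
+ ///     .unwrap();
+ /// for batch in reader {
+ ///     println!("read {} rows", batch.unwrap().num_rows());
+ /// }
+ /// ```
+ ///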
+ /// # Features
+ /// * Projection pushdown: [`Self::with_projection`]
+ /// * Cached metadata: [`ArrowReaderMetadata::load`]
+ /// * Offset skipping: [`Self::with_offset`] and [`Self::with_limit`]
+ /// * Row group filtering: [`Self::with_row_groups`]
+ /// * Range filtering: [`Self::with_row_selection`]
+ /// * Row level filtering: [`Self::with_row_filter`]
+ ///
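+ /// These options can be combined on the same builder. The following sketch
+ /// applies a projection together with an offset and a limit (the file name
+ /// and column indices are placeholders):
+ ///
+ /// ```no_run
+ /// # use std::fs::File;
+ /// # use parquet::arrow::ProjectionMask;
+ /// # use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+ /// let file = File::open("data.parquet").unwrap();
+ /// let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+ /// // Decode only the first two leaf columns (projection pushdown)
+ /// let mask = ProjectionMask::leaves(builder.parquet_schema(), [0, 1]);
+ /// let reader = builder
+ ///     .with_projection(mask)
+ ///     // Skip the first 10 rows, then decode at most 100 rows
+ ///     .with_offset(10)
+ ///     .with_limit(100)
+ ///     .build()
+ ///     .unwrap();
+ /// ```
+ ///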
+ /// # Implementing Predicate Pushdown
+ ///
+ /// [`Self::with_row_filter`] permits filter evaluation *during* the decoding
+ /// process, which is efficient and allows the most low-level optimizations.
+ ///
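+ /// A row filter is built from one or more `ArrowPredicate`s, each evaluated
+ /// against a projection of the decoded rows. The sketch below uses a trivial
+ /// placeholder predicate that selects every row; a real predicate would
+ /// compute a `BooleanArray` from the projected columns:
+ ///
+ /// ```no_run
+ /// # use std::fs::File;
+ /// # use arrow_array::BooleanArray;
+ /// # use parquet::arrow::ProjectionMask;
+ /// # use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+ /// # use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter};
+ /// let file = File::open("data.parquet").unwrap();
+ /// let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+ /// // Evaluate the predicate against the first leaf column only
+ /// let predicate_mask = ProjectionMask::leaves(builder.parquet_schema(), [0]);
+ /// let predicate = ArrowPredicateFn::new(predicate_mask, |batch| {
+ ///     // Placeholder: keep every row in the batch
+ ///     Ok(BooleanArray::from(vec![true; batch.num_rows()]))
+ /// });
+ /// let reader = builder
+ ///     .with_row_filter(RowFilter::new(vec![Box::new(predicate)]))
+ ///     .build()
+ ///     .unwrap();
+ /// ```
+ ///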
+ /// However, most Parquet-based systems will apply filters at many steps prior
+ /// to decoding, such as pruning files, row groups, and data pages. This crate
+ /// provides the low-level APIs needed to implement such filtering, but does not
+ /// include any logic to actually evaluate predicates. For example:
+ ///
+ /// * [`Self::with_row_groups`] for Row Group pruning
+ /// * [`Self::with_row_selection`] for data page pruning
+ /// * [`StatisticsConverter`] to convert Parquet statistics to Arrow arrays
+ ///
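+ /// For instance, if an external index has already determined that only part
+ /// of the file can match, that information might be applied as in the sketch
+ /// below (the row group index and row ranges are placeholders):
+ ///
+ /// ```no_run
+ /// # use std::fs::File;
+ /// # use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+ /// # use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
+ /// let file = File::open("data.parquet").unwrap();
+ /// let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+ /// // Only decode row group 0, skipping its first 100 rows and then
+ /// // decoding the next 100
+ /// let selection = RowSelection::from(vec![
+ ///     RowSelector::skip(100),
+ ///     RowSelector::select(100),
+ /// ]);
+ /// let reader = builder
+ ///     .with_row_groups(vec![0])
+ ///     .with_row_selection(selection)
+ ///     .build()
+ ///     .unwrap();
+ /// ```
+ ///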
+ /// The rationale for this design is that implementing predicate pushdown is a
+ /// complex topic and varies significantly from system to system. For example:
+ ///
+ /// 1. Predicates supported (do you support predicates like prefix matching, user-defined functions, etc.)
+ /// 2. Evaluating predicates on multiple files (with potentially different but compatible schemas)
+ /// 3. Evaluating predicates using information from an external metadata catalog (e.g. Apache Iceberg or similar)
+ /// 4. Interleaving fetching metadata, evaluating predicates, and decoding files
+ ///
+ /// You can read more about this design in the [Querying Parquet with
+ /// Millisecond Latency] Arrow blog post.
+ ///
/// [`ParquetRecordBatchStreamBuilder::new`]: crate::arrow::async_reader::ParquetRecordBatchStreamBuilder::new
+ /// [Apache Arrow]: https://arrow.apache.org/
+ /// [`StatisticsConverter`]: statistics::StatisticsConverter
+ /// [Querying Parquet with Millisecond Latency]: https://arrow.apache.org/blog/2022/12/26/querying-parquet-with-millisecond-latency/
pub struct ArrowReaderBuilder<T> {
    pub(crate) input: T,