Skip to content

Commit 0d1abb7

Browse files
committed
Update docs
1 parent 26e234a commit 0d1abb7

File tree

1 file changed

+57
-50
lines changed

1 file changed

+57
-50
lines changed

datafusion/core/src/physical_plan/file_format/parquet/page_filter.rs

Lines changed: 57 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,59 @@ use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
4040
use super::metrics::ParquetFileMetrics;
4141

4242
/// Create a RowSelection that may rule out ranges of rows based on
43-
/// parquet page level statistics, if any
43+
/// parquet page level statistics, if any.
4444
///
45-
/// TOOD: document parameters
45+
/// For example, given a row group with two columns (chunks) for `A`
46+
/// and `B` with the following page level statistics:
4647
///
47-
/// TODO add example picture here
48-
48+
/// ```text
49+
/// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━
50+
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃
51+
/// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃
52+
/// ┃ │ │ │ │ │ │ ┃
53+
/// ┃ │ │ │ │ Page │ │
54+
/// │ │ │ │ │ 3 │ ┃
55+
/// ┃ │ │ │ │ min: "A" │ │ ┃
56+
/// ┃ │ │ │ │ │ max: "C" │ ┃
57+
/// ┃ │ Page │ │ │ first_row: 0 │ │
58+
/// │ │ 1 │ │ │ │ ┃
59+
/// ┃ │ min: 10 │ │ └──────────────┘ │ ┃
60+
/// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃
61+
/// ┃ │ first_row: 0 │ │ │ │ │
62+
/// │ │ │ │ │ Page │ ┃
63+
/// ┃ │ │ │ │ 4 │ │ ┃
64+
/// ┃ │ │ │ │ │ min: "D" │ ┃
65+
/// ┃ │ │ │ │ max: "G" │ │
66+
/// │ │ │ │ │first_row: 100│ ┃
67+
/// ┃ └──────────────┘ │ │ │ │ ┃
68+
/// ┃ │ ┌──────────────┐ │ │ │ ┃
69+
/// ┃ │ │ │ └──────────────┘ │
70+
/// │ │ Page │ │ ┌──────────────┐ ┃
71+
/// ┃ │ 2 │ │ │ │ │ ┃
72+
/// ┃ │ │ min: 30 │ │ │ Page │ ┃
73+
/// ┃ │ max: 40 │ │ │ 5 │ │
74+
/// │ │first_row: 200│ │ │ min: "H" │ ┃
75+
/// ┃ │ │ │ │ max: "Z" │ │ ┃
76+
/// ┃ │ │ │ │ │first_row: 250│ ┃
77+
/// ┃ └──────────────┘ │ │ │ │
78+
/// │ │ └──────────────┘ ┃
79+
/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃
80+
/// ┃ ColumnChunk ColumnChunk ┃
81+
/// ┃ A B
82+
/// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛
83+
///
84+
/// Total rows: 300
85+
///
86+
/// ```
87+
///
88+
/// Given the predicate `A > 35 AND B = 'F'`:
89+
///
90+
/// Using `A > 35`: can rule out all of the values in Page 1 (rows 0 -> 199)
91+
///
92+
/// Using `B = 'F'`: can rule out all values in Page 3 and Page 5 (rows 0 -> 99, and 250 -> 299)
93+
///
94+
/// So we can entirely skip rows 0->199 and 250->299 as we know they
95+
/// can not contain rows that match the predicate.
4996
pub(crate) fn build_page_filter(
5097
pruning_predicate: Option<&PruningPredicate>,
5198
schema: SchemaRef,
@@ -112,54 +159,14 @@ pub(crate) fn build_page_filter(
112159
Ok(None)
113160
}
114161
}
115-
116-
/// For example:
117-
/// ```text
118-
/// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━
119-
/// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃
120-
/// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃
121-
/// ┃ │ │ │ │ │ │ ┃
122-
/// ┃ │ │ │ │ Page │ │
123-
/// │ │ │ │ │ 3 │ ┃
124-
/// ┃ │ │ │ │ min: "A" │ │ ┃
125-
/// ┃ │ │ │ │ │ max: "C" │ ┃
126-
/// ┃ │ Page │ │ │ first_row: 0 │ │
127-
/// │ │ 1 │ │ │ │ ┃
128-
/// ┃ │ min: 10 │ │ └──────────────┘ │ ┃
129-
/// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃
130-
/// ┃ │ first_row: 0 │ │ │ │ │
131-
/// │ │ │ │ │ Page │ ┃
132-
/// ┃ │ │ │ │ 4 │ │ ┃
133-
/// ┃ │ │ │ │ │ min: "D" │ ┃
134-
/// ┃ │ │ │ │ max: "G" │ │
135-
/// │ │ │ │ │first_row: 100│ ┃
136-
/// ┃ └──────────────┘ │ │ │ │ ┃
137-
/// ┃ │ ┌──────────────┐ │ │ │ ┃
138-
/// ┃ │ │ │ └──────────────┘ │
139-
/// │ │ Page │ │ ┌──────────────┐ ┃
140-
/// ┃ │ 2 │ │ │ │ │ ┃
141-
/// ┃ │ │ min: 30 │ │ │ Page │ ┃
142-
/// ┃ │ max: 40 │ │ │ 5 │ │
143-
/// │ │first_row: 200│ │ │ min: "H" │ ┃
144-
/// ┃ │ │ │ │ max: "Z" │ │ ┃
145-
/// ┃ │ │ │ │ │first_row: 250│ ┃
146-
/// ┃ └──────────────┘ │ │ │ │
147-
/// │ │ └──────────────┘ ┃
148-
/// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃
149-
/// ┃ ColumnChunk ColumnChunk ┃
150-
/// ┃ A B
151-
/// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛
152-
///
153-
/// Total rows: 300
154-
/// ```
162+
/// Intersects the [`RowSelector`]s
155163
///
156-
/// Given the predicate 'A > 35 AND B = "F"':
157-
/// using `extract_page_index_push_down_predicates` get two single column predicate:
158-
/// Using 'A > 35': could get `RowSelector1: [ Skip(0~199), Read(200~299)]`
159-
/// Using B = "F": could get `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]`
164+
/// For example, given:
165+
/// * `RowSelector1: [ Skip(0~199), Read(200~299)]`
166+
/// * `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]`
160167
///
161-
/// As the Final selection is the intersection of each columns `RowSelectors:
162-
/// final_selection:[ Skip(0~199), Read(200~249), Skip(250~299)]`
168+
/// The final selection is the intersection of these `RowSelector`s:
169+
/// * `final_selection:[ Skip(0~199), Read(200~249), Skip(250~299)]`
163170
fn combine_multi_col_selection(
164171
row_selections: VecDeque<Vec<RowSelector>>,
165172
) -> Vec<RowSelector> {

0 commit comments

Comments
 (0)