@@ -40,12 +40,59 @@ use crate::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
40
40
use super :: metrics:: ParquetFileMetrics ;
41
41
42
42
/// Create a RowSelection that may rule out ranges of rows based on
43
- /// parquet page level statistics, if any
43
+ /// parquet page level statistics, if any.
44
44
///
45
- /// TOOD: document parameters
45
+ /// For example, given a row group with two columns (chunks) for `A`
46
+ /// and `B` with the following page level statistics:
46
47
///
47
- /// TODO add example picture here
48
-
48
+ /// ```text
49
+ /// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━
50
+ /// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃
51
+ /// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃
52
+ /// ┃ │ │ │ │ │ │ ┃
53
+ /// ┃ │ │ │ │ Page │ │
54
+ /// │ │ │ │ │ 3 │ ┃
55
+ /// ┃ │ │ │ │ min: "A" │ │ ┃
56
+ /// ┃ │ │ │ │ │ max: "C" │ ┃
57
+ /// ┃ │ Page │ │ │ first_row: 0 │ │
58
+ /// │ │ 1 │ │ │ │ ┃
59
+ /// ┃ │ min: 10 │ │ └──────────────┘ │ ┃
60
+ /// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃
61
+ /// ┃ │ first_row: 0 │ │ │ │ │
62
+ /// │ │ │ │ │ Page │ ┃
63
+ /// ┃ │ │ │ │ 4 │ │ ┃
64
+ /// ┃ │ │ │ │ │ min: "D" │ ┃
65
+ /// ┃ │ │ │ │ max: "G" │ │
66
+ /// │ │ │ │ │first_row: 100│ ┃
67
+ /// ┃ └──────────────┘ │ │ │ │ ┃
68
+ /// ┃ │ ┌──────────────┐ │ │ │ ┃
69
+ /// ┃ │ │ │ └──────────────┘ │
70
+ /// │ │ Page │ │ ┌──────────────┐ ┃
71
+ /// ┃ │ 2 │ │ │ │ │ ┃
72
+ /// ┃ │ │ min: 30 │ │ │ Page │ ┃
73
+ /// ┃ │ max: 40 │ │ │ 5 │ │
74
+ /// │ │first_row: 200│ │ │ min: "H" │ ┃
75
+ /// ┃ │ │ │ │ max: "Z" │ │ ┃
76
+ /// ┃ │ │ │ │ │first_row: 250│ ┃
77
+ /// ┃ └──────────────┘ │ │ │ │
78
+ /// │ │ └──────────────┘ ┃
79
+ /// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃
80
+ /// ┃ ColumnChunk ColumnChunk ┃
81
+ /// ┃ A B
82
+ /// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛
83
+ ///
84
+ /// Total rows: 300
85
+ ///
86
+ /// ```
87
+ ///
88
+ /// Given the predicate `A > 35 AND B = 'F'`:
89
+ ///
90
+ /// Using `A > 35`: can rule out all of the values in Page 1 (rows 0 -> 199)
91
+ ///
92
+ /// Using `B = 'F'`: can rule out all values in Page 3 and Page 5 (rows 0 -> 99, and 250 -> 299)
93
+ ///
94
+ /// So we can entirely skip rows 0->199 and 250->299 as we know they
95
+ /// can not contain rows that match the predicate.
49
96
pub ( crate ) fn build_page_filter (
50
97
pruning_predicate : Option < & PruningPredicate > ,
51
98
schema : SchemaRef ,
@@ -112,54 +159,14 @@ pub(crate) fn build_page_filter(
112
159
Ok ( None )
113
160
}
114
161
}
115
-
116
- /// For example:
117
- /// ```text
118
- /// ┏━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━
119
- /// ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┃
120
- /// ┃ ┌──────────────┐ │ ┌──────────────┐ │ ┃
121
- /// ┃ │ │ │ │ │ │ ┃
122
- /// ┃ │ │ │ │ Page │ │
123
- /// │ │ │ │ │ 3 │ ┃
124
- /// ┃ │ │ │ │ min: "A" │ │ ┃
125
- /// ┃ │ │ │ │ │ max: "C" │ ┃
126
- /// ┃ │ Page │ │ │ first_row: 0 │ │
127
- /// │ │ 1 │ │ │ │ ┃
128
- /// ┃ │ min: 10 │ │ └──────────────┘ │ ┃
129
- /// ┃ │ │ max: 20 │ │ ┌──────────────┐ ┃
130
- /// ┃ │ first_row: 0 │ │ │ │ │
131
- /// │ │ │ │ │ Page │ ┃
132
- /// ┃ │ │ │ │ 4 │ │ ┃
133
- /// ┃ │ │ │ │ │ min: "D" │ ┃
134
- /// ┃ │ │ │ │ max: "G" │ │
135
- /// │ │ │ │ │first_row: 100│ ┃
136
- /// ┃ └──────────────┘ │ │ │ │ ┃
137
- /// ┃ │ ┌──────────────┐ │ │ │ ┃
138
- /// ┃ │ │ │ └──────────────┘ │
139
- /// │ │ Page │ │ ┌──────────────┐ ┃
140
- /// ┃ │ 2 │ │ │ │ │ ┃
141
- /// ┃ │ │ min: 30 │ │ │ Page │ ┃
142
- /// ┃ │ max: 40 │ │ │ 5 │ │
143
- /// │ │first_row: 200│ │ │ min: "H" │ ┃
144
- /// ┃ │ │ │ │ max: "Z" │ │ ┃
145
- /// ┃ │ │ │ │ │first_row: 250│ ┃
146
- /// ┃ └──────────────┘ │ │ │ │
147
- /// │ │ └──────────────┘ ┃
148
- /// ┃ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘ ┃
149
- /// ┃ ColumnChunk ColumnChunk ┃
150
- /// ┃ A B
151
- /// ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━━ ━━┛
152
- ///
153
- /// Total rows: 300
154
- /// ```
162
+ /// Intersects the [`RowSelector`]s
155
163
///
156
- /// Given the predicate 'A > 35 AND B = "F"':
157
- /// using `extract_page_index_push_down_predicates` get two single column predicate:
158
- /// Using 'A > 35': could get `RowSelector1: [ Skip(0~199), Read(200~299)]`
159
- /// Using B = "F": could get `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]`
164
+ /// For example, given:
165
+ /// * `RowSelector1: [ Skip(0~199), Read(200~299)]`
166
+ /// * `RowSelector2: [ Skip(0~99), Read(100~249), Skip(250~299)]`
160
167
///
161
- /// As the Final selection is the intersection of each columns `RowSelectors :
162
- /// final_selection:[ Skip(0~199), Read(200~249), Skip(250~299)]`
168
+ /// The final selection is the intersection of these `RowSelector`s:
169
+ /// * `final_selection: [ Skip(0~199), Read(200~249), Skip(250~299)]`
163
170
fn combine_multi_col_selection (
164
171
row_selections : VecDeque < Vec < RowSelector > > ,
165
172
) -> Vec < RowSelector > {
0 commit comments