@@ -36,9 +36,11 @@ use futures::{try_join, FutureExt, StreamExt, TryFutureExt, TryStreamExt};
 use parquet::arrow::arrow_reader::{ArrowPredicateFn, ArrowReaderOptions, RowFilter, RowSelection};
 use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask, PARQUET_FIELD_ID_META_KEY};
-use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
+use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData};
 use parquet::schema::types::{SchemaDescriptor, Type as ParquetType};
+use roaring::RoaringTreemap;
 
+use crate::arrow::delete_file_manager::DeleteFileManager;
 use crate::arrow::record_batch_transformer::RecordBatchTransformer;
 use crate::arrow::{arrow_schema_to_schema, get_arrow_datum};
 use crate::error::Result;
@@ -145,6 +147,7 @@ impl ArrowReader {
                     file_io,
                     row_group_filtering_enabled,
                     row_selection_enabled,
+                    concurrency_limit_data_files,
                 )
             })
             .map_err(|err| {
@@ -162,30 +165,24 @@ impl ArrowReader {
         file_io: FileIO,
         row_group_filtering_enabled: bool,
         row_selection_enabled: bool,
+        concurrency_limit_data_files: usize,
     ) -> Result<ArrowRecordBatchStream> {
-        // TODO: add support for delete files
-        if !task.deletes.is_empty() {
-            return Err(Error::new(
-                ErrorKind::FeatureUnsupported,
-                "Delete files are not yet supported",
-            ));
-        }
-
-        // Get the metadata for the Parquet file we need to read and build
-        // a reader for the data within
-        let parquet_file = file_io.new_input(&task.data_file_path)?;
-        let (parquet_metadata, parquet_reader) =
-            try_join!(parquet_file.metadata(), parquet_file.reader())?;
-        let parquet_file_reader = ArrowFileReader::new(parquet_metadata, parquet_reader);
-
-        let should_load_page_index = row_selection_enabled && task.predicate.is_some();
-
-        // Start creating the record batch stream, which wraps the parquet file reader
-        let mut record_batch_stream_builder = ParquetRecordBatchStreamBuilder::new_with_options(
-            parquet_file_reader,
-            ArrowReaderOptions::new().with_page_index(should_load_page_index),
-        )
-        .await?;
+        let should_load_page_index =
+            (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty();
+
+        // concurrently retrieve delete files and create RecordBatchStreamBuilder
+        let (delete_file_manager, mut record_batch_stream_builder) = try_join!(
+            DeleteFileManager::load_deletes(
+                task.deletes.clone(),
+                file_io.clone(),
+                concurrency_limit_data_files
+            ),
+            Self::create_parquet_record_batch_stream_builder(
+                &task.data_file_path,
+                file_io.clone(),
+                should_load_page_index,
+            )
+        )?;
 
         // Create a projection mask for the batch stream to select which columns in the
         // Parquet file that we want in the response
@@ -197,7 +194,7 @@ impl ArrowReader {
         )?;
         record_batch_stream_builder = record_batch_stream_builder.with_projection(projection_mask);
 
-        // RecordBatchTransformer performs any required transformations on the RecordBatches
+        // RecordBatchTransformer performs any transformations required on the RecordBatches
         // that come back from the file, such as type promotion, default column insertion
         // and column re-ordering
         let mut record_batch_transformer =
@@ -207,49 +204,102 @@ impl ArrowReader {
             record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size);
         }
 
-        if let Some(predicate) = &task.predicate {
+        let delete_predicate = delete_file_manager.build_delete_predicate(task.schema.clone())?;
+
+        // In addition to the optional predicate supplied in the `FileScanTask`,
+        // we also have an optional predicate resulting from equality delete files.
+        // If both are present, we logical-AND them together to form a single filter
+        // predicate that we can pass to the `RecordBatchStreamBuilder`.
+        let final_predicate = match (&task.predicate, delete_predicate) {
+            (None, None) => None,
+            (Some(predicate), None) => Some(predicate.clone()),
+            (None, Some(ref predicate)) => Some(predicate.clone()),
+            (Some(filter_predicate), Some(delete_predicate)) => {
+                Some(filter_predicate.clone().and(delete_predicate))
+            }
+        };
+
+        // There are two possible sources both for potential lists of selected RowGroup indices,
+        // and for `RowSelection`s.
+        // Selected RowGroup index lists can come from two sources:
+        // * When there are equality delete files that are applicable;
+        // * When there is a scan predicate and row_group_filtering_enabled = true.
+        // `RowSelection`s can be created in either or both of the following cases:
+        // * When there are positional delete files that are applicable;
+        // * When there is a scan predicate and row_selection_enabled = true
+        // Note that, in the former case we only perform row group filtering when
+        // there is a scan predicate AND row_group_filtering_enabled = true,
+        // but we perform row selection filtering if there are applicable
+        // equality delete files OR (there is a scan predicate AND row_selection_enabled),
+        // since the only implemented method of applying positional deletes is
+        // by using a `RowSelection`.
+        let mut selected_row_group_indices = None;
+        let mut row_selection = None;
+
+        if let Some(predicate) = final_predicate {
             let (iceberg_field_ids, field_id_map) = Self::build_field_id_set_and_map(
                 record_batch_stream_builder.parquet_schema(),
-                predicate,
+                &predicate,
             )?;
 
             let row_filter = Self::get_row_filter(
-                predicate,
+                &predicate,
                 record_batch_stream_builder.parquet_schema(),
                 &iceberg_field_ids,
                 &field_id_map,
             )?;
             record_batch_stream_builder = record_batch_stream_builder.with_row_filter(row_filter);
 
-            let mut selected_row_groups = None;
             if row_group_filtering_enabled {
                 let result = Self::get_selected_row_group_indices(
-                    predicate,
+                    &predicate,
                     record_batch_stream_builder.metadata(),
                     &field_id_map,
                     &task.schema,
                 )?;
 
-                selected_row_groups = Some(result);
+                selected_row_group_indices = Some(result);
             }
 
             if row_selection_enabled {
-                let row_selection = Self::get_row_selection(
-                    predicate,
+                row_selection = Some(Self::get_row_selection_for_filter_predicate(
+                    &predicate,
                     record_batch_stream_builder.metadata(),
-                    &selected_row_groups,
+                    &selected_row_group_indices,
                     &field_id_map,
                     &task.schema,
-                )?;
-
-                record_batch_stream_builder =
-                    record_batch_stream_builder.with_row_selection(row_selection);
+                )?);
             }
+        }
 
-            if let Some(selected_row_groups) = selected_row_groups {
-                record_batch_stream_builder =
-                    record_batch_stream_builder.with_row_groups(selected_row_groups);
-            }
+        let positional_delete_indexes =
+            delete_file_manager.get_positional_delete_indexes_for_data_file(&task.data_file_path);
+
+        if let Some(positional_delete_indexes) = positional_delete_indexes {
+            let delete_row_selection = Self::build_deletes_row_selection(
+                record_batch_stream_builder.metadata().row_groups(),
+                &selected_row_group_indices,
+                positional_delete_indexes,
+            )?;
+
+            // merge the row selection from the delete files with the row selection
+            // from the filter predicate, if there is one from the filter predicate
+            row_selection = match row_selection {
+                None => Some(delete_row_selection),
+                Some(filter_row_selection) => {
+                    Some(filter_row_selection.intersection(&delete_row_selection))
+                }
+            };
+        }
+
+        if let Some(row_selection) = row_selection {
+            record_batch_stream_builder =
+                record_batch_stream_builder.with_row_selection(row_selection);
+        }
+
+        if let Some(selected_row_group_indices) = selected_row_group_indices {
+            record_batch_stream_builder =
+                record_batch_stream_builder.with_row_groups(selected_row_group_indices);
         }
 
         // Build the batch stream and send all the RecordBatches that it generates
@@ -265,6 +315,43 @@ impl ArrowReader {
         Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream)
     }
 
+    async fn create_parquet_record_batch_stream_builder(
+        data_file_path: &str,
+        file_io: FileIO,
+        should_load_page_index: bool,
+    ) -> Result<ParquetRecordBatchStreamBuilder<ArrowFileReader<impl FileRead + Sized>>> {
+        // Get the metadata for the Parquet file we need to read and build
+        // a reader for the data within
+        let parquet_file = file_io.new_input(data_file_path)?;
+        let (parquet_metadata, parquet_reader) =
+            try_join!(parquet_file.metadata(), parquet_file.reader())?;
+        let parquet_file_reader = ArrowFileReader::new(parquet_metadata, parquet_reader);
+
+        // Create the record batch stream builder, which wraps the parquet file reader
+        let record_batch_stream_builder = ParquetRecordBatchStreamBuilder::new_with_options(
+            parquet_file_reader,
+            ArrowReaderOptions::new().with_page_index(should_load_page_index),
+        )
+        .await?;
+        Ok(record_batch_stream_builder)
+    }
+
+    /// Computes a `RowSelection` from positional delete indices.
+    ///
+    /// Using the Parquet page index, we build a `RowSelection` that rejects rows that are indicated
+    /// as having been deleted by a positional delete, taking into account any row groups that have
+    /// been skipped entirely by the filter predicate.
+    #[allow(unused)]
+    fn build_deletes_row_selection(
+        row_group_metadata: &[RowGroupMetaData],
+        selected_row_groups: &Option<Vec<usize>>,
+        mut positional_deletes: RoaringTreemap,
+    ) -> Result<RowSelection> {
+        // TODO
+
+        Ok(RowSelection::default())
+    }
+
     fn build_field_id_set_and_map(
         parquet_schema: &SchemaDescriptor,
         predicate: &BoundPredicate,
@@ -475,7 +562,7 @@ impl ArrowReader {
         Ok(results)
     }
 
-    fn get_row_selection(
+    fn get_row_selection_for_filter_predicate(
         predicate: &BoundPredicate,
         parquet_metadata: &Arc<ParquetMetaData>,
         selected_row_groups: &Option<Vec<usize>>,
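Note for reviewers: `build_deletes_row_selection` is left as a TODO stub in this diff. Below is a minimal, unoptimized sketch of the intended behaviour, assuming the `parquet` crate's `RowSelector`/`RowSelection` API and that the positional delete index holds absolute row positions within the data file; the free-function name and by-reference bitmap parameter are hypothetical and not part of this PR, and a real implementation would walk the roaring bitmap in runs rather than testing rows one at a time.

use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
use parquet::file::metadata::RowGroupMetaData;
use roaring::RoaringTreemap;

// Hypothetical sketch, not part of the PR: derive a RowSelection that skips
// rows whose absolute position in the data file appears in `positional_deletes`.
// Row groups excluded by the filter predicate are never read, so they only
// advance the running row index and emit no selectors.
fn build_deletes_row_selection_sketch(
    row_group_metadata: &[RowGroupMetaData],
    selected_row_groups: &Option<Vec<usize>>,
    positional_deletes: &RoaringTreemap,
) -> RowSelection {
    let mut selectors: Vec<RowSelector> = Vec::new();
    let mut current_row: u64 = 0; // absolute row index within the data file

    for (idx, row_group) in row_group_metadata.iter().enumerate() {
        let num_rows = row_group.num_rows() as u64;
        if num_rows == 0 {
            continue;
        }

        // Skipped row groups contribute nothing to the selection.
        if let Some(selected) = selected_row_groups {
            if !selected.contains(&idx) {
                current_row += num_rows;
                continue;
            }
        }

        // Walk this row group, emitting alternating select/skip runs.
        let mut run_start = current_row;
        let mut run_deleted = positional_deletes.contains(current_row);
        for row in current_row..current_row + num_rows {
            let deleted = positional_deletes.contains(row);
            if deleted != run_deleted {
                let len = (row - run_start) as usize;
                selectors.push(if run_deleted {
                    RowSelector::skip(len)
                } else {
                    RowSelector::select(len)
                });
                run_start = row;
                run_deleted = deleted;
            }
        }
        let len = (current_row + num_rows - run_start) as usize;
        selectors.push(if run_deleted {
            RowSelector::skip(len)
        } else {
            RowSelector::select(len)
        });

        current_row += num_rows;
    }

    RowSelection::from(selectors)
}

Intersecting the result with the predicate-derived selection, as the diff does via `filter_row_selection.intersection(&delete_row_selection)`, then yields the final selection passed to `with_row_selection`.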