@@ -17,8 +17,6 @@
 
 //! [`ParquetOpener`] for opening Parquet files
 
-use std::sync::Arc;
-
 use crate::datasource::file_format::parquet::{
     coerce_file_schema_to_string_type, coerce_file_schema_to_view_type,
 };
@@ -31,6 +29,9 @@ use crate::datasource::physical_plan::{
     FileMeta, FileOpenFuture, FileOpener, ParquetFileMetrics, ParquetFileReaderFactory,
 };
 use crate::datasource::schema_adapter::SchemaAdapterFactory;
+use std::cmp::min;
+use std::collections::HashMap;
+use std::sync::Arc;
 
 use arrow::datatypes::SchemaRef;
 use arrow::error::ArrowError;
@@ -40,18 +41,21 @@ use datafusion_physical_optimizer::pruning::PruningPredicate;
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
 
 use futures::{StreamExt, TryStreamExt};
-use log::debug;
+use log::{debug, info, trace};
 use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
 use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
+use parquet::schema::types::SchemaDescriptor;
+// use datafusion_common::DataFusionError;
+use datafusion_common::deep::{has_deep_projection, rewrite_schema, splat_columns};
 
 /// Implements [`FileOpener`] for a parquet file
 pub(super) struct ParquetOpener {
     /// Execution partition index
     pub partition_index: usize,
     /// Column indexes in `table_schema` needed by the query
     pub projection: Arc<[usize]>,
-    /// Target number of rows in each output RecordBatch
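+    /// Nested ("deep") projection: per column index (assumption: indexes into
+    /// `table_schema`, mirroring `projection`), the dotted nested-field paths
+    /// (with `*` as a one-segment wildcard) that the query needs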
+    pub projection_deep: Arc<HashMap<usize, Vec<String>>>,
     pub batch_size: usize,
     /// Optional limit on the number of rows to read
     pub limit: Option<usize>,
@@ -105,11 +109,31 @@ impl FileOpener for ParquetOpener {
 
         let batch_size = self.batch_size;
 
-        let projected_schema =
-            SchemaRef::from(self.table_schema.project(&self.projection)?);
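+        // Copy the shared projection into an owned Vec<usize> for the
+        // deep-projection helpers used below.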
+        let projection = self.projection.clone();
+        let projection_vec = projection
+            .as_ref()
+            .iter()
+            .map(|i| *i)
+            .collect::<Vec<usize>>();
+        info!(
+            "ParquetOpener::open projection={:?}, deep_projection: {:?}",
+            projection, &self.projection_deep
+        );
+        // FIXME @HStack: ADR: why do we need to do this? Maybe our function needs another param?
+        // When the requested projection is empty, we should return an empty schema
+        let projected_schema = if projection_vec.len() == 0 {
+            SchemaRef::from(self.table_schema.project(&projection)?)
+        } else {
+            rewrite_schema(
+                self.table_schema.clone(),
+                &projection_vec,
+                self.projection_deep.as_ref(),
+            )
+        };
         let schema_adapter = self
             .schema_adapter_factory
             .create(projected_schema, Arc::clone(&self.table_schema));
+        let projection_deep = self.projection_deep.clone();
         let predicate = self.predicate.clone();
         let pruning_predicate = self.pruning_predicate.clone();
         let page_pruning_predicate = self.page_pruning_predicate.clone();
@@ -159,11 +183,37 @@ impl FileOpener for ParquetOpener {
             let (schema_mapping, adapted_projections) =
                 schema_adapter.map_schema(&file_schema)?;
 
-            let mask = ProjectionMask::roots(
-                builder.parquet_schema(),
-                adapted_projections.iter().cloned(),
-            );
+            // let mask = ProjectionMask::roots(
+            //     builder.parquet_schema(),
+            //     adapted_projections.iter().cloned(),
+            // );
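+            // With a deep projection, project individual parquet leaves so only
+            // the requested nested fields are decoded; otherwise fall back to the
+            // root-level (whole top-level column) mask.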
+            let mask = if has_deep_projection(Some(projection_deep.clone().as_ref())) {
+                let leaves = generate_leaf_paths(
+                    table_schema.clone(),
+                    builder.parquet_schema(),
+                    &projection_vec,
+                    projection_deep.clone().as_ref(),
+                );
+                info!(
+                    "ParquetOpener::open, using deep projection parquet leaves: {:?}",
+                    leaves.clone()
+                );
+                // let tmp = builder.parquet_schema();
+                // for (i, col) in tmp.columns().iter().enumerate() {
+                //     info!("  {} {}= {:?}", i, col.path(), col);
+                // }
+                ProjectionMask::leaves(builder.parquet_schema(), leaves)
+            } else {
+                info!(
+                    "ParquetOpener::open, using root projections: {:?}",
+                    &adapted_projections
+                );
 
+                ProjectionMask::roots(
+                    builder.parquet_schema(),
+                    adapted_projections.iter().cloned(),
+                )
+            };
             // Filter pushdown: evaluate predicates during scan
             if let Some(predicate) = pushdown_filters.then_some(predicate).flatten() {
                 let row_filter = row_filter::build_row_filter(
@@ -303,3 +353,103 @@ fn create_initial_plan(
     // default to scanning all row groups
     Ok(ParquetAccessPlan::new_all(row_group_count))
 }
+
+// FIXME: @HStack ACTUALLY look at the arrow schema and handle map types correctly.
+// Right now, we are matching "map-like" parquet leaves like "key_value.key" etc.
+// But we need to walk through both the arrow schema (which KNOWS about the map type)
+// and the parquet leaves to do this correctly.
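+/// For every parquet leaf column, returns `(leaf index, (original dotted path,
+/// path with map/list internals collapsed to `*`))`.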
+fn equivalent_projection_paths_from_parquet_schema(
+    _arrow_schema: SchemaRef,
+    parquet_schema: &SchemaDescriptor,
+) -> Vec<(usize, (String, String))> {
+    let mut output: Vec<(usize, (String, String))> = vec![];
+    for (i, col) in parquet_schema.columns().iter().enumerate() {
+        let original_path = col.path().string();
+        let converted_path =
+            convert_parquet_path_to_deep_projection_path(&original_path.as_str());
+        output.push((i, (original_path.clone(), converted_path)));
+    }
+    output
+}
+
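+/// Collapses parquet's synthetic map/list path segments (`key_value.key`,
+/// `entries.values`, `list.element`, ...) into `*` so the path can be matched
+/// against deep-projection filters, e.g. `a.b.key_value.key` -> `a.b.*`.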
+fn convert_parquet_path_to_deep_projection_path(parquet_path: &str) -> String {
+    if parquet_path.contains(".key_value.key")
+        || parquet_path.contains(".key_value.value")
+        || parquet_path.contains(".entries.keys")
+        || parquet_path.contains(".entries.values")
+        || parquet_path.contains(".list.element")
+    {
+        let tmp = parquet_path
+            .replace("key_value.key", "*")
+            .replace("key_value.value", "*")
+            .replace("entries.keys", "*")
+            .replace("entries.values", "*")
+            .replace("list.element", "*");
+        tmp
+    } else {
+        parquet_path.to_string()
+    }
+}
+
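+/// Computes the parquet leaf indexes selected by a (possibly deep) projection:
+/// a leaf survives when its collapsed path matches any splatted filter, with
+/// `*` matching exactly one path segment.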
+fn generate_leaf_paths(
+    arrow_schema: SchemaRef,
+    parquet_schema: &SchemaDescriptor,
+    projection: &Vec<usize>,
+    projection_deep: &HashMap<usize, Vec<String>>,
+) -> Vec<usize> {
+    let actual_projection = if projection.len() == 0 {
+        (0..arrow_schema.fields().len()).collect()
+    } else {
+        projection.clone()
+    };
+    let splatted =
+        splat_columns(arrow_schema.clone(), &actual_projection, &projection_deep);
+    trace!(target: "deep", "generate_leaf_paths: splatted: {:?}", &splatted);
+
+    let mut out: Vec<usize> = vec![];
+    for (i, (original, converted)) in
+        equivalent_projection_paths_from_parquet_schema(arrow_schema, parquet_schema)
+    {
+        // FIXME: @HStack
+        // For map fields, the actual parquet paths look like x.y.z.key_value.key, x.y.z.key_value.value.
+        // Since we are ignoring these names in the paths, we need to collapse this access to a `*`
+        // so we can filter for them.
+        // Also, we need BOTH the key and the value for maps, otherwise we run into an arrow-rs error:
+        // "partial projection of MapArray is not supported"
+
+        trace!(target: "deep", "  generate_leaf_paths looking at index {} {} = {}", i, &original, &converted);
+
+        let mut found = false;
+        for filter in splatted.iter() {
+            // check if this filter matches this leaf path
+            let filter_pieces = filter.split(".").collect::<Vec<&str>>();
+            // let col_pieces = col_path.parts();
+            let col_pieces = converted.split(".").collect::<Vec<_>>();
+            // let's check
+            let mut filter_found = true;
+            for i in 0..min(filter_pieces.len(), col_pieces.len()) {
+                if i >= filter_pieces.len() {
+                    // we are at the end of the filter and matched until now: we match!
+                    break;
+                }
+                if i >= col_pieces.len() {
+                    // the filter is longer and we matched until now: we match!
+                    break;
+                }
+                // we can actually check
+                if !(col_pieces[i] == filter_pieces[i] || filter_pieces[i] == "*") {
+                    filter_found = false;
+                    break;
+                }
+            }
+            if filter_found {
+                found = true;
+                break;
+            }
+        }
+        if found {
+            out.push(i);
+        }
+    }
+    out
+}