@@ -237,6 +237,10 @@ pub struct ParquetFileMetrics {
     pub row_groups_pruned: metrics::Count,
     /// Total number of bytes scanned
     pub bytes_scanned: metrics::Count,
+    /// Total rows filtered out by predicates pushed into parquet scan
+    pub pushdown_rows_filtered: metrics::Count,
+    /// Total time spent evaluating pushdown filters
+    pub pushdown_eval_time: metrics::Time,
 }

 impl ParquetFileMetrics {
@@ -258,10 +262,20 @@ impl ParquetFileMetrics {
             .with_new_label("filename", filename.to_string())
             .counter("bytes_scanned", partition);

+        let pushdown_rows_filtered = MetricBuilder::new(metrics)
+            .with_new_label("filename", filename.to_string())
+            .counter("pushdown_rows_filtered", partition);
+
+        let pushdown_eval_time = MetricBuilder::new(metrics)
+            .with_new_label("filename", filename.to_string())
+            .subset_time("pushdown_eval_time", partition);
+
         Self {
             predicate_evaluation_errors,
             row_groups_pruned,
             bytes_scanned,
+            pushdown_rows_filtered,
+            pushdown_eval_time,
         }
     }
 }
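The two new fields follow the existing `MetricBuilder` pattern: a `Count` registered via `.counter(...)` and a `Time` registered via `.subset_time(...)`. As a rough illustration of how such handles are typically driven (this sketch is not part of the diff; the function name and row counts are made up, and it assumes the public `datafusion::physical_plan::metrics` paths):

```rust
use datafusion::physical_plan::metrics::{ExecutionPlanMetricsSet, MetricBuilder};

fn record_pushdown_metrics_sketch() {
    let metrics = ExecutionPlanMetricsSet::new();
    let rows_filtered = MetricBuilder::new(&metrics)
        .counter("pushdown_rows_filtered", /* partition = */ 0);
    let eval_time = MetricBuilder::new(&metrics)
        .subset_time("pushdown_eval_time", /* partition = */ 0);

    {
        // the guard adds the elapsed time to `pushdown_eval_time` when dropped
        let _timer = eval_time.timer();
        // ... evaluate the pushed-down predicate on a batch here ...
    }

    // illustrative numbers only: 8 rows scanned, 3 rows survived the predicate
    let (rows_before, rows_after) = (8usize, 3usize);
    rows_filtered.add(rows_before - rows_after);
}
```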
@@ -410,7 +424,7 @@ impl FileOpener for ParquetOpener {
     ) -> Result<FileOpenFuture> {
         let file_range = file_meta.range.clone();

-        let metrics = ParquetFileMetrics::new(
+        let file_metrics = ParquetFileMetrics::new(
             self.partition_index,
             file_meta.location().as_ref(),
             &self.metrics,
@@ -456,6 +470,8 @@ impl FileOpener for ParquetOpener {
                 table_schema.as_ref(),
                 builder.metadata(),
                 reorder_predicates,
+                &file_metrics.pushdown_rows_filtered,
+                &file_metrics.pushdown_eval_time,
             );

             match row_filter {
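The hunk above only passes the two metric handles into `build_row_filter`; the row-filter implementation itself is outside this diff. A hedged sketch of the kind of bookkeeping a pushed-down predicate can do with those handles (the `evaluate_and_record` helper and closure-based predicate below are illustrative, not the crate's actual row-filter type):

```rust
use arrow::array::BooleanArray;
use arrow::record_batch::RecordBatch;
use datafusion::physical_plan::metrics::{Count, Time};

/// Illustrative helper: run `predicate` over `batch`, timing the evaluation
/// and counting how many rows the resulting mask filters out.
fn evaluate_and_record(
    batch: &RecordBatch,
    predicate: impl Fn(&RecordBatch) -> BooleanArray,
    rows_filtered: &Count,
    eval_time: &Time,
) -> BooleanArray {
    // time only the predicate evaluation itself
    let mask = {
        let _timer = eval_time.timer();
        predicate(batch)
    };

    // rows whose mask entry is not `true` are dropped by the scan
    let kept = mask.iter().filter(|v| *v == Some(true)).count();
    rows_filtered.add(batch.num_rows() - kept);

    mask
}
```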
@@ -474,8 +490,12 @@ impl FileOpener for ParquetOpener {

             let file_metadata = builder.metadata();
             let groups = file_metadata.row_groups();
-            let row_groups =
-                prune_row_groups(groups, file_range, pruning_predicate.clone(), &metrics);
+            let row_groups = prune_row_groups(
+                groups,
+                file_range,
+                pruning_predicate.clone(),
+                &file_metrics,
+            );

             if enable_page_index && check_page_index_push_down_valid(&pruning_predicate) {
                 let file_offset_indexes = file_metadata.offset_indexes();
@@ -491,7 +511,7 @@ impl FileOpener for ParquetOpener {
                         pruning_predicate.clone(),
                         file_offset_indexes.get(*r),
                         file_page_indexes.get(*r),
-                        &metrics,
+                        &file_metrics,
                     )
                     .map_err(|e| {
                         ArrowError::ParquetError(format!(
@@ -575,7 +595,7 @@ impl DefaultParquetFileReaderFactory {
 struct ParquetFileReader {
     store: Arc<dyn ObjectStore>,
     meta: ObjectMeta,
-    metrics: ParquetFileMetrics,
+    file_metrics: ParquetFileMetrics,
     metadata_size_hint: Option<usize>,
 }

@@ -584,7 +604,7 @@ impl AsyncFileReader for ParquetFileReader {
         &mut self,
         range: Range<usize>,
     ) -> BoxFuture<'_, parquet::errors::Result<Bytes>> {
-        self.metrics.bytes_scanned.add(range.end - range.start);
+        self.file_metrics.bytes_scanned.add(range.end - range.start);

         self.store
             .get_range(&self.meta.location, range)
@@ -602,7 +622,7 @@ impl AsyncFileReader for ParquetFileReader {
         Self: Send,
     {
         let total = ranges.iter().map(|r| r.end - r.start).sum();
-        self.metrics.bytes_scanned.add(total);
+        self.file_metrics.bytes_scanned.add(total);

         async move {
             self.store
@@ -647,7 +667,7 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory {
         metadata_size_hint: Option<usize>,
         metrics: &ExecutionPlanMetricsSet,
     ) -> Result<Box<dyn AsyncFileReader + Send>> {
-        let parquet_file_metrics = ParquetFileMetrics::new(
+        let file_metrics = ParquetFileMetrics::new(
             partition_index,
             file_meta.location().as_ref(),
             metrics,
@@ -657,7 +677,7 @@ impl ParquetFileReaderFactory for DefaultParquetFileReaderFactory {
             meta: file_meta.object_meta,
             store: Arc::clone(&self.store),
             metadata_size_hint,
-            metrics: parquet_file_metrics,
+            file_metrics,
         }))
     }
 }
@@ -1178,6 +1198,7 @@ mod tests {
     use crate::datasource::listing::{FileRange, PartitionedFile};
     use crate::datasource::object_store::ObjectStoreUrl;
     use crate::execution::options::CsvReadOptions;
+    use crate::physical_plan::metrics::MetricValue;
     use crate::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
     use crate::test::object_store::local_unpartitioned_file;
     use crate::{
@@ -1210,23 +1231,46 @@ mod tests {
     use std::io::Write;
     use tempfile::TempDir;

-    /// writes each RecordBatch as an individual parquet file and then
-    /// reads it back in to the named location.
+    struct RoundTripResult {
+        /// Data that was read back from ParquetFiles
+        batches: Result<Vec<RecordBatch>>,
+        /// The physical plan that was created (that has statistics, etc)
+        parquet_exec: Arc<ParquetExec>,
+    }
+
+    /// writes each RecordBatch as an individual parquet file and re-reads
+    /// the data back. Returns the data as [RecordBatch]es
     async fn round_trip_to_parquet(
         batches: Vec<RecordBatch>,
         projection: Option<Vec<usize>>,
         schema: Option<SchemaRef>,
         predicate: Option<Expr>,
         pushdown_predicate: bool,
     ) -> Result<Vec<RecordBatch>> {
+        round_trip(batches, projection, schema, predicate, pushdown_predicate)
+            .await
+            .batches
+    }
+
+    /// Writes each RecordBatch as an individual parquet file and then
+    /// reads them back. Returns the parquet exec as well as the data
+    /// as [RecordBatch]es
+    async fn round_trip(
+        batches: Vec<RecordBatch>,
+        projection: Option<Vec<usize>>,
+        schema: Option<SchemaRef>,
+        predicate: Option<Expr>,
+        pushdown_predicate: bool,
+    ) -> RoundTripResult {
         let file_schema = match schema {
             Some(schema) => schema,
-            None => Arc::new(Schema::try_merge(
-                batches.iter().map(|b| b.schema().as_ref().clone()),
-            )?),
+            None => Arc::new(
+                Schema::try_merge(batches.iter().map(|b| b.schema().as_ref().clone()))
+                    .unwrap(),
+            ),
         };

-        let (meta, _files) = store_parquet(batches).await?;
+        let (meta, _files) = store_parquet(batches).await.unwrap();
         let file_groups = meta.into_iter().map(Into::into).collect();

         // prepare the scan
@@ -1253,7 +1297,11 @@ mod tests {

         let session_ctx = SessionContext::new();
         let task_ctx = session_ctx.task_ctx();
-        collect(Arc::new(parquet_exec), task_ctx).await
+        let parquet_exec = Arc::new(parquet_exec);
+        RoundTripResult {
+            batches: collect(parquet_exec.clone(), task_ctx).await,
+            parquet_exec,
+        }
     }

     // Add a new column with the specified field name to the RecordBatch
@@ -1464,18 +1512,18 @@ mod tests {
         let filter = col("c2").eq(lit(2_i64));

         // read/write them files:
-        let read =
-            round_trip_to_parquet(vec![batch1, batch2], None, None, Some(filter), true)
-                .await
-                .unwrap();
+        let rt = round_trip(vec![batch1, batch2], None, None, Some(filter), true).await;
         let expected = vec![
             "+----+----+----+",
             "| c1 | c3 | c2 |",
             "+----+----+----+",
             "|    | 20 | 2  |",
             "+----+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &read);
+        assert_batches_sorted_eq!(expected, &rt.batches.unwrap());
+        let metrics = rt.parquet_exec.metrics().unwrap();
+        // Note there were 6 rows in total (across three batches)
+        assert_eq!(get_value(&metrics, "pushdown_rows_filtered"), 5);
     }

     #[tokio::test]
@@ -1598,7 +1646,7 @@ mod tests {
     }

     #[tokio::test]
-    async fn evolved_schema_disjoint_schema_filter_with_pushdown() {
+    async fn evolved_schema_disjoint_schema_with_filter_pushdown() {
         let c1: ArrayRef =
             Arc::new(StringArray::from(vec![Some("Foo"), None, Some("bar")]));

@@ -1613,10 +1661,7 @@ mod tests {
         let filter = col("c2").eq(lit(1_i64));

         // read/write them files:
-        let read =
-            round_trip_to_parquet(vec![batch1, batch2], None, None, Some(filter), true)
-                .await
-                .unwrap();
+        let rt = round_trip(vec![batch1, batch2], None, None, Some(filter), true).await;

         let expected = vec![
             "+----+----+",
@@ -1625,7 +1670,10 @@ mod tests {
             "|    | 1  |",
             "+----+----+",
         ];
-        assert_batches_sorted_eq!(expected, &read);
+        assert_batches_sorted_eq!(expected, &rt.batches.unwrap());
+        let metrics = rt.parquet_exec.metrics().unwrap();
+        // Note there were 6 rows in total (across three batches)
+        assert_eq!(get_value(&metrics, "pushdown_rows_filtered"), 5);
     }

     #[tokio::test]
@@ -1906,6 +1954,71 @@ mod tests {
         Ok(())
     }

+    #[tokio::test]
+    async fn parquet_exec_metrics() {
+        let c1: ArrayRef = Arc::new(StringArray::from(vec![
+            Some("Foo"),
+            None,
+            Some("bar"),
+            Some("bar"),
+            Some("bar"),
+            Some("bar"),
+            Some("zzz"),
+        ]));
+
+        // batch1: c1(string)
+        let batch1 = create_batch(vec![("c1", c1.clone())]);
+
+        // on
+        let filter = col("c1").not_eq(lit("bar"));
+
+        // read/write them files:
+        let rt = round_trip(vec![batch1], None, None, Some(filter), true).await;
+
+        let metrics = rt.parquet_exec.metrics().unwrap();
+
+        // assert the batches and some metrics
+        let expected = vec![
+            "+-----+", "| c1  |", "+-----+", "| Foo |", "| zzz |", "+-----+",
+        ];
+        assert_batches_sorted_eq!(expected, &rt.batches.unwrap());
+
+        // pushdown predicates have eliminated all 4 bar rows and the
+        // null row for 5 rows total
+        assert_eq!(get_value(&metrics, "pushdown_rows_filtered"), 5);
+        assert!(
+            get_value(&metrics, "pushdown_eval_time") > 0,
+            "no eval time in metrics: {:#?}",
+            metrics
+        );
+    }
+
+    /// returns the sum of all the metrics with the specified name in
+    /// the returned set.
+    ///
+    /// Count: returns value
+    /// Time: returns elapsed nanoseconds
+    ///
+    /// Panics if no such metric.
+    fn get_value(metrics: &MetricsSet, metric_name: &str) -> usize {
+        let sum = metrics.sum(|m| match m.value() {
+            MetricValue::Count { name, .. } if name == metric_name => true,
+            MetricValue::Time { name, .. } if name == metric_name => true,
+            _ => false,
+        });
+
+        match sum {
+            Some(MetricValue::Count { count, .. }) => count.value(),
+            Some(MetricValue::Time { time, .. }) => time.value(),
+            _ => {
+                panic!(
+                    "Expected metric not found. Looking for '{}' in\n\n{:#?}",
+                    metric_name, metrics
+                );
+            }
+        }
+    }
+
     fn parquet_file_metrics() -> ParquetFileMetrics {
         let metrics = Arc::new(ExecutionPlanMetricsSet::new());
         ParquetFileMetrics::new(0, "file.parquet", &metrics)