Skip to content

Commit 087ac09

Browse files
liukun4515 and alamb authored
Add test cases: row group filter with missing statistics for decimal data type (#4810)
* Add null case for row group filter
* Apply suggestions from code review

Co-authored-by: Andrew Lamb <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
1 parent 80abc94 commit 087ac09

File tree

1 file changed

+58
-11
lines changed
  • datafusion/core/src/physical_plan/file_format/parquet

1 file changed

+58
-11
lines changed

datafusion/core/src/physical_plan/file_format/parquet/row_groups.rs

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -475,10 +475,21 @@ mod tests {
475475
// c1 > 5, this row group will not be included in the results.
476476
vec![ParquetStatistics::int32(Some(10), Some(20), None, 0, false)],
477477
);
478+
let rgm3 = get_row_group_meta_data(
479+
&schema_descr,
480+
// [1, None]
481+
// c1 > 5, the max statistic is missing, so this row group cannot be filtered out and will be included in the results.
482+
vec![ParquetStatistics::int32(Some(100), None, None, 0, false)],
483+
);
478484
let metrics = parquet_file_metrics();
479485
assert_eq!(
480-
prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
481-
vec![0]
486+
prune_row_groups(
487+
&[rgm1, rgm2, rgm3],
488+
None,
489+
Some(&pruning_predicate),
490+
&metrics
491+
),
492+
vec![0, 2]
482493
);
483494

484495
// INT32: c1 > 5, but parquet decimal type has different precision or scale to arrow decimal
@@ -528,15 +539,21 @@ mod tests {
528539
// c1 > 5, this row group will not be included in the results.
529540
vec![ParquetStatistics::int32(Some(0), Some(2), None, 0, false)],
530541
);
542+
let rgm4 = get_row_group_meta_data(
543+
&schema_descr,
544+
// [None, 2]
545+
// c1 > 5, the min statistic is missing, so this row group cannot be filtered out and will be included in the results.
546+
vec![ParquetStatistics::int32(None, Some(2), None, 0, false)],
547+
);
531548
let metrics = parquet_file_metrics();
532549
assert_eq!(
533550
prune_row_groups(
534-
&[rgm1, rgm2, rgm3],
551+
&[rgm1, rgm2, rgm3, rgm4],
535552
None,
536553
Some(&pruning_predicate),
537554
&metrics
538555
),
539-
vec![0, 1]
556+
vec![0, 1, 3]
540557
);
541558

542559
// INT64: c1 < 5, the c1 is decimal(18,2)
@@ -572,10 +589,20 @@ mod tests {
572589
// [0.1, 0.2]
573590
vec![ParquetStatistics::int64(Some(10), Some(20), None, 0, false)],
574591
);
592+
let rgm3 = get_row_group_meta_data(
593+
&schema_descr,
594+
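// [None, None]: both min and max statistics are missing, so this row group cannot be filtered out and will be included in the results.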
595+
vec![ParquetStatistics::int64(None, None, None, 0, false)],
596+
);
575597
let metrics = parquet_file_metrics();
576598
assert_eq!(
577-
prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
578-
vec![1]
599+
prune_row_groups(
600+
&[rgm1, rgm2, rgm3],
601+
None,
602+
Some(&pruning_predicate),
603+
&metrics
604+
),
605+
vec![1, 2]
579606
);
580607

581608
// FIXED_LENGTH_BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
@@ -631,13 +658,24 @@ mod tests {
631658
false,
632659
)],
633660
);
661+
662+
let rgm3 = get_row_group_meta_data(
663+
&schema_descr,
664+
vec![ParquetStatistics::fixed_len_byte_array(
665+
None, None, None, 0, false,
666+
)],
667+
);
634668
let metrics = parquet_file_metrics();
635669
assert_eq!(
636-
prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
637-
vec![1]
670+
prune_row_groups(
671+
&[rgm1, rgm2, rgm3],
672+
None,
673+
Some(&pruning_predicate),
674+
&metrics
675+
),
676+
vec![1, 2]
638677
);
639678

640-
// TODO: BYTE_ARRAY support read decimal from parquet, after the 20.0.0 arrow-rs release
641679
// BYTE_ARRAY: c1 = decimal128(100000, 28, 3), the c1 is decimal(18,2)
642680
// the type of parquet is decimal(18,2)
643681
let schema =
@@ -683,10 +721,19 @@ mod tests {
683721
false,
684722
)],
685723
);
724+
let rgm3 = get_row_group_meta_data(
725+
&schema_descr,
726+
vec![ParquetStatistics::byte_array(None, None, None, 0, false)],
727+
);
686728
let metrics = parquet_file_metrics();
687729
assert_eq!(
688-
prune_row_groups(&[rgm1, rgm2], None, Some(&pruning_predicate), &metrics),
689-
vec![1]
730+
prune_row_groups(
731+
&[rgm1, rgm2, rgm3],
732+
None,
733+
Some(&pruning_predicate),
734+
&metrics
735+
),
736+
vec![1, 2]
690737
);
691738
}
692739

0 commit comments

Comments
 (0)