@@ -369,7 +369,13 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result<EnabledStatis
369
369
#[ cfg( feature = "parquet" ) ]
370
370
#[ cfg( test) ]
371
371
mod tests {
372
- use parquet:: { basic:: Compression , file:: properties:: EnabledStatistics } ;
372
+ use parquet:: {
373
+ basic:: Compression ,
374
+ file:: properties:: {
375
+ BloomFilterProperties , EnabledStatistics , DEFAULT_BLOOM_FILTER_FPP ,
376
+ DEFAULT_BLOOM_FILTER_NDV ,
377
+ } ,
378
+ } ;
373
379
use std:: collections:: HashMap ;
374
380
375
381
use crate :: config:: { ColumnOptions , ParquetOptions } ;
@@ -566,4 +572,291 @@ mod tests {
566
572
"the writer_props should have the same configuration as the session's TableParquetOptions" ,
567
573
) ;
568
574
}
575
+
576
/// Ensure that the configuration defaults for writing parquet files are
/// consistent with the options in arrow-rs.
///
/// The test is split into three parts:
/// 1. settings that are *expected* to differ (`created_by`),
/// 2. settings that currently differ but should not (tracked in
///    https://github.com/apache/datafusion/issues/11367),
/// 3. a final equality check once the known discrepancies are patched over.
#[test]
fn test_defaults_match() {
    // ensure the global settings are the same
    let default_table_writer_opts = TableParquetOptions::default();
    let default_parquet_opts = ParquetOptions::default();
    assert_eq!(
        default_table_writer_opts.global,
        default_parquet_opts,
        "should have matching defaults for TableParquetOptions.global and ParquetOptions",
    );

    // WriterProperties::default, a.k.a. using extern parquet's defaults
    let default_writer_props = WriterProperties::new();

    // WriterProperties::try_from(TableParquetOptions::default), a.k.a. using datafusion's defaults
    let from_datafusion_defaults =
        WriterPropertiesBuilder::try_from(&default_table_writer_opts)
            .unwrap()
            .build();

    // Expected: how the defaults should not match
    assert_ne!(
        default_writer_props.created_by(),
        from_datafusion_defaults.created_by(),
        "should have different created_by sources",
    );
    assert!(
        default_writer_props.created_by().starts_with("parquet-rs version"),
        "should indicate that writer_props defaults came from the extern parquet crate",
    );
    assert!(
        default_table_writer_opts
            .global
            .created_by
            .starts_with("datafusion version"),
        "should indicate that table_parquet_opts defaults came from datafusion",
    );

    // Expected: the remaining should match
    let same_created_by = default_table_writer_opts.global.created_by.clone();
    let mut from_extern_parquet =
        session_config_from_writer_props(&default_writer_props);
    from_extern_parquet.global.created_by = same_created_by;
    // TODO: the remaining defaults do not match!
    // refer to https://github.com/apache/datafusion/issues/11367
    //
    // This assertion pins the *current* mismatch; when it starts failing the
    // defaults have been aligned and the workarounds below can be removed.
    assert_ne!(
        default_table_writer_opts,
        from_extern_parquet,
        "datafusion's default TableParquetOptions are known to differ from extern parquet's defaults (see apache/datafusion#11367); update this test once they match",
    );

    // Below here itemizes how the defaults **should** match, but do not.

    // TODO: compression defaults do not match
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.compression(&"default".into()),
        Compression::UNCOMPRESSED,
        "extern parquet's default is None"
    );
    assert!(
        matches!(
            from_datafusion_defaults.compression(&"default".into()),
            Compression::ZSTD(_)
        ),
        "datafusion's default is zstd"
    );

    // TODO: data_page_row_count_limit defaults do not match
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.data_page_row_count_limit(),
        20_000,
        "extern parquet's default data_page_row_count_limit is 20_000"
    );
    assert_eq!(
        from_datafusion_defaults.data_page_row_count_limit(),
        usize::MAX,
        "datafusion's default is usize::MAX"
    );

    // TODO: column_index_truncate_length do not match
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.column_index_truncate_length(),
        Some(64),
        "extern parquet's default is 64"
    );
    assert_eq!(
        from_datafusion_defaults.column_index_truncate_length(),
        None,
        "datafusion's default is None"
    );

    // The next few examples are where datafusion's default is None.
    // But once datafusion's TableParquetOptions are converted to a WriterProperties,
    // then we get the extern parquet's defaults.
    //
    // In other words, we do not get indeterminate behavior in the output writer props.
    // But this is only because we use the extern parquet's defaults when we leave
    // the datafusion setting as None.

    // datafusion's `None` for Option<bool> => becomes parquet's true
    // TODO: should this be changed?
    // refer to https://github.com/apache/datafusion/issues/11367
    assert!(
        default_writer_props.dictionary_enabled(&"default".into()),
        "extern parquet's default is true"
    );
    assert_eq!(
        default_table_writer_opts.global.dictionary_enabled, None,
        "datafusion's has no default"
    );
    assert!(
        from_datafusion_defaults.dictionary_enabled(&"default".into()),
        "should see the extern parquet's default over-riding datafusion's None",
    );

    // datafusion's `None` for Option<String> => becomes parquet's EnabledStatistics::Page
    // TODO: should this be changed?
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.statistics_enabled(&"default".into()),
        EnabledStatistics::Page,
        "extern parquet's default is page"
    );
    assert_eq!(
        default_table_writer_opts.global.statistics_enabled, None,
        "datafusion's has no default"
    );
    assert_eq!(
        from_datafusion_defaults.statistics_enabled(&"default".into()),
        EnabledStatistics::Page,
        "should see the extern parquet's default over-riding datafusion's None",
    );

    // datafusion's `None` for Option<usize> => becomes parquet's 4096
    // TODO: should this be changed?
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.max_statistics_size(&"default".into()),
        4096,
        "extern parquet's default is 4096"
    );
    assert_eq!(
        default_table_writer_opts.global.max_statistics_size, None,
        "datafusion's has no default"
    );
    // Check the props *converted from datafusion's options*, mirroring the
    // dictionary_enabled / statistics_enabled groups above.
    assert_eq!(
        from_datafusion_defaults.max_statistics_size(&"default".into()),
        4096,
        "should see the extern parquet's default over-riding datafusion's None",
    );

    // Confirm all other settings are equal.
    // First resolve the known discrepancies, (set as the same).
    // TODO: once we fix the above mis-matches, we should be able to remove this.
    let mut from_extern_parquet =
        session_config_from_writer_props(&default_writer_props);
    from_extern_parquet.global.compression = Some("zstd(3)".into());
    from_extern_parquet.global.data_page_row_count_limit = usize::MAX;
    from_extern_parquet.global.column_index_truncate_length = None;
    from_extern_parquet.global.dictionary_enabled = None;
    from_extern_parquet.global.statistics_enabled = None;
    from_extern_parquet.global.max_statistics_size = None;

    // Expected: the remaining should match.
    // `created_by` is expected to differ between the two sources, so align it
    // before comparing everything else.
    let same_created_by = default_table_writer_opts.global.created_by.clone();
    from_extern_parquet.global.created_by = same_created_by;
    assert_eq!(
        default_table_writer_opts,
        from_extern_parquet,
        "the default writer_props should have the same configuration as the session's default TableParquetOptions",
    );
}
753
+
754
/// Pins the current bloom filter behaviour when only `bloom_filter_on_write`
/// is enabled on datafusion's default options: plain extern parquet writer
/// props report no bloom filter properties, while the datafusion-derived
/// props report `BloomFilterProperties::default()`.
#[test]
fn test_bloom_filter_defaults() {
    // Datafusion side: default table options with the bloom filter switched on.
    let mut table_opts = TableParquetOptions::default();
    table_opts.global.bloom_filter_on_write = true;

    // Extern parquet side: untouched `WriterProperties::new()`.
    let parquet_props = WriterProperties::new();

    // Props built from the datafusion options; the bloom filter is also
    // enabled directly on the builder before building.
    let builder = WriterPropertiesBuilder::try_from(&table_opts).unwrap();
    let datafusion_props = builder.set_bloom_filter_enabled(true).build();

    let parquet_bloom = parquet_props.bloom_filter_properties(&"default".into());
    let datafusion_bloom = datafusion_props.bloom_filter_properties(&"default".into());

    // TODO: should have same behavior in either.
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_ne!(
        parquet_bloom, datafusion_bloom,
        "parquet and datafusion props, will not have the same bloom filter props",
    );
    assert_eq!(parquet_bloom, None, "extern parquet's default remains None");
    assert_eq!(
        datafusion_bloom,
        Some(&BloomFilterProperties::default()),
        "datafusion's has BloomFilterProperties::default",
    );
}
786
+
787
/// Pins the current bloom filter behaviour when only the false-positive
/// probability (fpp) is customised: plain extern parquet writer props report
/// no bloom filter properties, while the datafusion-derived props carry the
/// custom fpp together with `DEFAULT_BLOOM_FILTER_NDV`.
#[test]
fn test_bloom_filter_set_fpp_only() {
    // Datafusion side: default table options with the bloom filter on and fpp set.
    let mut table_opts = TableParquetOptions::default();
    table_opts.global.bloom_filter_on_write = true;
    table_opts.global.bloom_filter_fpp = Some(0.42);

    // Extern parquet side: untouched `WriterProperties::new()`.
    let parquet_props = WriterProperties::new();

    // Props built from the datafusion options; the bloom filter and the same
    // fpp are also set directly on the builder before building.
    let builder = WriterPropertiesBuilder::try_from(&table_opts).unwrap();
    let datafusion_props = builder
        .set_bloom_filter_enabled(true)
        .set_bloom_filter_fpp(0.42)
        .build();

    let parquet_bloom = parquet_props.bloom_filter_properties(&"default".into());
    let datafusion_bloom = datafusion_props.bloom_filter_properties(&"default".into());

    // TODO: should have same behavior in either.
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_ne!(
        parquet_bloom, datafusion_bloom,
        "parquet and datafusion props, will not have the same bloom filter props",
    );
    assert_eq!(parquet_bloom, None, "extern parquet's default remains None");

    let expected_bloom = BloomFilterProperties {
        fpp: 0.42,
        ndv: DEFAULT_BLOOM_FILTER_NDV,
    };
    assert_eq!(
        datafusion_bloom,
        Some(&expected_bloom),
        "datafusion's has BloomFilterProperties",
    );
}
824
+
825
/// Pins the current bloom filter behaviour when only the number of distinct
/// values (ndv) is customised: plain extern parquet writer props report no
/// bloom filter properties, while the datafusion-derived props carry the
/// custom ndv together with `DEFAULT_BLOOM_FILTER_FPP`.
#[test]
fn test_bloom_filter_set_ndv_only() {
    // Datafusion side: default table options with the bloom filter on and ndv set.
    let mut table_opts = TableParquetOptions::default();
    table_opts.global.bloom_filter_on_write = true;
    table_opts.global.bloom_filter_ndv = Some(42);

    // Extern parquet side: untouched `WriterProperties::new()`.
    let parquet_props = WriterProperties::new();

    // Props built from the datafusion options; the bloom filter and the same
    // ndv are also set directly on the builder before building.
    let builder = WriterPropertiesBuilder::try_from(&table_opts).unwrap();
    let datafusion_props = builder
        .set_bloom_filter_enabled(true)
        .set_bloom_filter_ndv(42)
        .build();

    let parquet_bloom = parquet_props.bloom_filter_properties(&"default".into());
    let datafusion_bloom = datafusion_props.bloom_filter_properties(&"default".into());

    // TODO: should have same behavior in either.
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_ne!(
        parquet_bloom, datafusion_bloom,
        "parquet and datafusion props, will not have the same bloom filter props",
    );
    assert_eq!(parquet_bloom, None, "extern parquet's default remains None");

    let expected_bloom = BloomFilterProperties {
        fpp: DEFAULT_BLOOM_FILTER_FPP,
        ndv: 42,
    };
    assert_eq!(
        datafusion_bloom,
        Some(&expected_bloom),
        "datafusion's has BloomFilterProperties",
    );
}
569
862
}
0 commit comments