Skip to content

Commit 4b840c0

Browse files
authored
Add tests that show the different defaults for ArrowWriter and TableParquetOptions (#11524)
* test(11367): define current behavior of parquet writer configuration defaults * chore(11367): update code comments to make it more explicit on the mismatches
1 parent adcfd85 commit 4b840c0

File tree

1 file changed

+294
-1
lines changed

1 file changed

+294
-1
lines changed

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 294 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -369,7 +369,13 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result<EnabledStatis
369369
#[cfg(feature = "parquet")]
370370
#[cfg(test)]
371371
mod tests {
372-
use parquet::{basic::Compression, file::properties::EnabledStatistics};
372+
use parquet::{
373+
basic::Compression,
374+
file::properties::{
375+
BloomFilterProperties, EnabledStatistics, DEFAULT_BLOOM_FILTER_FPP,
376+
DEFAULT_BLOOM_FILTER_NDV,
377+
},
378+
};
373379
use std::collections::HashMap;
374380

375381
use crate::config::{ColumnOptions, ParquetOptions};
@@ -566,4 +572,291 @@ mod tests {
566572
"the writer_props should have the same configuration as the session's TableParquetOptions",
567573
);
568574
}
575+
576+
/// Pin down how datafusion's default parquet writer configuration relates to
/// the defaults of the extern parquet crate (arrow-rs).
///
/// The two sets of defaults are known to disagree in several places; refer to
/// https://github.com/apache/datafusion/issues/11367. This test itemizes each
/// known mismatch, and then confirms that every other setting is equal once
/// those mismatches have been masked out.
#[test]
fn test_defaults_match() {
    // ensure the global settings are the same
    let default_table_writer_opts = TableParquetOptions::default();
    let default_parquet_opts = ParquetOptions::default();
    assert_eq!(
        default_table_writer_opts.global,
        default_parquet_opts,
        "should have matching defaults for TableParquetOptions.global and ParquetOptions",
    );

    // WriterProperties::default, a.k.a. using extern parquet's defaults
    let default_writer_props = WriterProperties::new();

    // WriterProperties::try_from(TableParquetOptions::default), a.k.a. using datafusion's defaults
    let from_datafusion_defaults =
        WriterPropertiesBuilder::try_from(&default_table_writer_opts)
            .unwrap()
            .build();

    // Expected: how the defaults should not match
    assert_ne!(
        default_writer_props.created_by(),
        from_datafusion_defaults.created_by(),
        "should have different created_by sources",
    );
    assert!(
        default_writer_props.created_by().starts_with("parquet-rs version"),
        "should indicate that writer_props defaults came from the extern parquet crate",
    );
    assert!(
        default_table_writer_opts
            .global
            .created_by
            .starts_with("datafusion version"),
        "should indicate that table_parquet_opts defaults came from datafusion",
    );

    // Even after aligning `created_by`, the two sets of defaults still differ.
    let same_created_by = default_table_writer_opts.global.created_by.clone();
    let mut from_extern_parquet =
        session_config_from_writer_props(&default_writer_props);
    from_extern_parquet.global.created_by = same_created_by.clone();
    // TODO: the remaining defaults do not match!
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_ne!(
        default_table_writer_opts,
        from_extern_parquet,
        // This message is shown when the two are unexpectedly EQUAL, so it
        // must describe the mismatch this assertion is pinning.
        "the default writer_props are expected to differ from the session's default TableParquetOptions (see https://github.com/apache/datafusion/issues/11367)",
    );

    // Below here itemizes how the defaults **should** match, but do not.

    // TODO: compression defaults do not match
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.compression(&"default".into()),
        Compression::UNCOMPRESSED,
        "extern parquet's default is None"
    );
    assert!(
        matches!(
            from_datafusion_defaults.compression(&"default".into()),
            Compression::ZSTD(_)
        ),
        "datafusion's default is zstd"
    );

    // TODO: data_page_row_count_limit defaults do not match
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.data_page_row_count_limit(),
        20_000,
        "extern parquet's default data_page_row_count_limit is 20_000"
    );
    assert_eq!(
        from_datafusion_defaults.data_page_row_count_limit(),
        usize::MAX,
        "datafusion's default is usize::MAX"
    );

    // TODO: column_index_truncate_length do not match
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.column_index_truncate_length(),
        Some(64),
        "extern parquet's default is 64"
    );
    assert_eq!(
        from_datafusion_defaults.column_index_truncate_length(),
        None,
        "datafusion's default is None"
    );

    // The next few examples are where datafusion's default is None.
    // But once datafusion's TableParquetOptions are converted to a WriterProperties,
    // then we get the extern parquet's defaults.
    //
    // In other words, we do not get indeterminate behavior in the output writer props.
    // But this is only because we use the extern parquet's defaults when we leave
    // the datafusion setting as None.

    // datafusion's `None` for Option<bool> => becomes parquet's true
    // TODO: should this be changed?
    // refer to https://github.com/apache/datafusion/issues/11367
    assert!(
        default_writer_props.dictionary_enabled(&"default".into()),
        "extern parquet's default is true"
    );
    assert_eq!(
        default_table_writer_opts.global.dictionary_enabled, None,
        "datafusion's has no default"
    );
    assert!(
        from_datafusion_defaults.dictionary_enabled(&"default".into()),
        "should see the extern parquet's default over-riding datafusion's None",
    );

    // datafusion's `None` for Option<String> => becomes parquet's EnabledStatistics::Page
    // TODO: should this be changed?
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.statistics_enabled(&"default".into()),
        EnabledStatistics::Page,
        "extern parquet's default is page"
    );
    assert_eq!(
        default_table_writer_opts.global.statistics_enabled, None,
        "datafusion's has no default"
    );
    assert_eq!(
        from_datafusion_defaults.statistics_enabled(&"default".into()),
        EnabledStatistics::Page,
        "should see the extern parquet's default over-riding datafusion's None",
    );

    // datafusion's `None` for Option<usize> => becomes parquet's 4096
    // TODO: should this be changed?
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_eq!(
        default_writer_props.max_statistics_size(&"default".into()),
        4096,
        "extern parquet's default is 4096"
    );
    assert_eq!(
        default_table_writer_opts.global.max_statistics_size, None,
        "datafusion's has no default"
    );
    assert_eq!(
        // Inspect the props converted from datafusion's defaults (rather than
        // the extern defaults a second time), mirroring the dictionary_enabled
        // and statistics_enabled checks above.
        from_datafusion_defaults.max_statistics_size(&"default".into()),
        4096,
        "should see the extern parquet's default over-riding datafusion's None",
    );

    // Confirm all other settings are equal.
    // First resolve the known discrepancies, (set as the same).
    // TODO: once we fix the above mis-matches, we should be able to remove this.
    let mut from_extern_parquet =
        session_config_from_writer_props(&default_writer_props);
    from_extern_parquet.global.compression = Some("zstd(3)".into());
    from_extern_parquet.global.data_page_row_count_limit = usize::MAX;
    from_extern_parquet.global.column_index_truncate_length = None;
    from_extern_parquet.global.dictionary_enabled = None;
    from_extern_parquet.global.statistics_enabled = None;
    from_extern_parquet.global.max_statistics_size = None;

    // Expected: the remaining should match.
    // `created_by` is expected to differ, so align it before comparing.
    from_extern_parquet.global.created_by = same_created_by;
    assert_eq!(
        default_table_writer_opts,
        from_extern_parquet,
        "the default writer_props should have the same configuration as the session's default TableParquetOptions",
    );
}
753+
754+
/// Documents the bloom filter properties seen when only the bloom filter is
/// switched on, comparing plain extern parquet defaults against writer props
/// derived from datafusion's `TableParquetOptions`.
#[test]
fn test_bloom_filter_defaults() {
    // Plain extern parquet defaults.
    let parquet_props = WriterProperties::new();

    // datafusion's default options with the bloom filter enabled on write,
    // converted to a builder on which the bloom filter is enabled as well.
    let mut table_opts = TableParquetOptions::default();
    table_opts.global.bloom_filter_on_write = true;
    let datafusion_props = WriterPropertiesBuilder::try_from(&table_opts)
        .unwrap()
        .set_bloom_filter_enabled(true)
        .build();

    let parquet_bloom = parquet_props.bloom_filter_properties(&"default".into());
    let datafusion_bloom = datafusion_props.bloom_filter_properties(&"default".into());

    // TODO: should have same behavior in either.
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_ne!(
        parquet_bloom, datafusion_bloom,
        "parquet and datafusion props, will not have the same bloom filter props",
    );
    assert_eq!(parquet_bloom, None, "extern parquet's default remains None");
    assert_eq!(
        datafusion_bloom,
        Some(&BloomFilterProperties::default()),
        "datafusion's has BloomFilterProperties::default",
    );
}
786+
787+
/// Documents the bloom filter properties seen when the bloom filter is on and
/// only the false-positive probability (fpp) is overridden; the expected ndv
/// is `DEFAULT_BLOOM_FILTER_NDV`.
#[test]
fn test_bloom_filter_set_fpp_only() {
    // Plain extern parquet defaults.
    let parquet_props = WriterProperties::new();

    // datafusion's default options with the bloom filter on and fpp set,
    // converted to a builder on which the same two settings are applied.
    let mut table_opts = TableParquetOptions::default();
    table_opts.global.bloom_filter_on_write = true;
    table_opts.global.bloom_filter_fpp = Some(0.42);
    let datafusion_props = WriterPropertiesBuilder::try_from(&table_opts)
        .unwrap()
        .set_bloom_filter_enabled(true)
        .set_bloom_filter_fpp(0.42)
        .build();

    let parquet_bloom = parquet_props.bloom_filter_properties(&"default".into());
    let datafusion_bloom = datafusion_props.bloom_filter_properties(&"default".into());

    // TODO: should have same behavior in either.
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_ne!(
        parquet_bloom, datafusion_bloom,
        "parquet and datafusion props, will not have the same bloom filter props",
    );
    assert_eq!(parquet_bloom, None, "extern parquet's default remains None");

    let expected = BloomFilterProperties {
        fpp: 0.42,
        ndv: DEFAULT_BLOOM_FILTER_NDV,
    };
    assert_eq!(
        datafusion_bloom,
        Some(&expected),
        "datafusion's has BloomFilterProperties",
    );
}
824+
825+
/// Documents the bloom filter properties seen when the bloom filter is on and
/// only the number of distinct values (ndv) is overridden; the expected fpp
/// is `DEFAULT_BLOOM_FILTER_FPP`.
#[test]
fn test_bloom_filter_set_ndv_only() {
    // Plain extern parquet defaults.
    let parquet_props = WriterProperties::new();

    // datafusion's default options with the bloom filter on and ndv set,
    // converted to a builder on which the same two settings are applied.
    let mut table_opts = TableParquetOptions::default();
    table_opts.global.bloom_filter_on_write = true;
    table_opts.global.bloom_filter_ndv = Some(42);
    let datafusion_props = WriterPropertiesBuilder::try_from(&table_opts)
        .unwrap()
        .set_bloom_filter_enabled(true)
        .set_bloom_filter_ndv(42)
        .build();

    let parquet_bloom = parquet_props.bloom_filter_properties(&"default".into());
    let datafusion_bloom = datafusion_props.bloom_filter_properties(&"default".into());

    // TODO: should have same behavior in either.
    // refer to https://github.com/apache/datafusion/issues/11367
    assert_ne!(
        parquet_bloom, datafusion_bloom,
        "parquet and datafusion props, will not have the same bloom filter props",
    );
    assert_eq!(parquet_bloom, None, "extern parquet's default remains None");

    let expected = BloomFilterProperties {
        fpp: DEFAULT_BLOOM_FILTER_FPP,
        ndv: 42,
    };
    assert_eq!(
        datafusion_bloom,
        Some(&expected),
        "datafusion's has BloomFilterProperties",
    );
}
569862
}

0 commit comments

Comments
 (0)