Skip to content

Commit 6d77748

Browse files
lewiszlw and alamb authored
Split parquet bloom filter config and enable bloom filter on read by default (#10306)
* Split bloom filter config
* Fix proto
* Set bloom_filter on write as false
* Fix tests
* fmt md
* Fix test
* Fix slt tests
* clippy
* Update datafusion/common/src/config.rs (Co-authored-by: Andrew Lamb <[email protected]>)
* Update datafusion/common/src/config.rs (Co-authored-by: Andrew Lamb <[email protected]>)
* Remove enabled suffix
* Regen proto and fix tests
* Update configs.md
* Improve bloom filter test

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 2c0afce commit 6d77748

File tree

16 files changed

+124
-46
lines changed

16 files changed

+124
-46
lines changed

datafusion/common/src/config.rs

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -400,8 +400,11 @@ config_namespace! {
400400
/// default parquet writer setting
401401
pub encoding: Option<String>, default = None
402402

403-
/// Sets if bloom filter is enabled for any column
404-
pub bloom_filter_enabled: bool, default = false
403+
/// Use any available bloom filters when reading parquet files
404+
pub bloom_filter_on_read: bool, default = true
405+
406+
/// Write bloom filters for all columns when creating parquet files
407+
pub bloom_filter_on_write: bool, default = false
405408

406409
/// Sets bloom filter false positive probability. If NULL, uses
407410
/// default parquet writer setting
@@ -1662,6 +1665,7 @@ config_namespace! {
16621665
}
16631666

16641667
#[derive(Debug, Clone, PartialEq)]
1668+
#[allow(clippy::large_enum_variant)]
16651669
pub enum FormatOptions {
16661670
CSV(CsvOptions),
16671671
JSON(JsonOptions),

datafusion/common/src/file_options/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ mod tests {
6767
"format.data_page_row_count_limit".to_owned(),
6868
"123".to_owned(),
6969
);
70-
option_map.insert("format.bloom_filter_enabled".to_owned(), "true".to_owned());
70+
option_map.insert("format.bloom_filter_on_write".to_owned(), "true".to_owned());
7171
option_map.insert("format.encoding".to_owned(), "plain".to_owned());
7272
option_map.insert("format.dictionary_enabled".to_owned(), "true".to_owned());
7373
option_map.insert("format.compression".to_owned(), "zstd(4)".to_owned());

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
6262
created_by,
6363
column_index_truncate_length,
6464
data_page_row_count_limit,
65-
bloom_filter_enabled,
65+
bloom_filter_on_write,
6666
encoding,
6767
dictionary_enabled,
6868
compression,
@@ -80,6 +80,7 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
8080
allow_single_file_parallelism: _,
8181
maximum_parallel_row_group_writers: _,
8282
maximum_buffered_record_batches_per_stream: _,
83+
bloom_filter_on_read: _,
8384
} = &parquet_options.global;
8485

8586
let key_value_metadata = if !parquet_options.key_value_metadata.is_empty() {
@@ -104,7 +105,7 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
104105
.set_created_by(created_by.clone())
105106
.set_column_index_truncate_length(*column_index_truncate_length)
106107
.set_data_page_row_count_limit(*data_page_row_count_limit)
107-
.set_bloom_filter_enabled(*bloom_filter_enabled)
108+
.set_bloom_filter_enabled(*bloom_filter_on_write)
108109
.set_key_value_metadata(key_value_metadata);
109110

110111
if let Some(encoding) = &encoding {

datafusion/core/src/datasource/listing/table.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1632,7 +1632,7 @@ mod tests {
16321632
"50".into(),
16331633
);
16341634
config_map.insert(
1635-
"datafusion.execution.parquet.bloom_filter_enabled".into(),
1635+
"datafusion.execution.parquet.bloom_filter_on_write".into(),
16361636
"true".into(),
16371637
);
16381638
config_map.insert(
@@ -1710,7 +1710,7 @@ mod tests {
17101710
"delta_binary_packed".into(),
17111711
);
17121712
config_map.insert(
1713-
"datafusion.execution.parquet.bloom_filter_enabled".into(),
1713+
"datafusion.execution.parquet.bloom_filter_on_write".into(),
17141714
"true".into(),
17151715
);
17161716
config_map.insert(

datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -243,14 +243,24 @@ impl ParquetExec {
243243
}
244244

245245
/// If enabled, the reader will use any available bloom filters when reading parquet files
246-
pub fn with_enable_bloom_filter(mut self, enable_bloom_filter: bool) -> Self {
247-
self.table_parquet_options.global.bloom_filter_enabled = enable_bloom_filter;
246+
pub fn with_bloom_filter_on_read(mut self, bloom_filter_on_read: bool) -> Self {
247+
self.table_parquet_options.global.bloom_filter_on_read = bloom_filter_on_read;
248248
self
249249
}
250250

251-
/// Return the value described in [`Self::with_enable_bloom_filter`]
252-
fn enable_bloom_filter(&self) -> bool {
253-
self.table_parquet_options.global.bloom_filter_enabled
251+
/// If enabled, the writer will write bloom filters for all columns when creating parquet files
252+
pub fn with_bloom_filter_on_write(
253+
mut self,
254+
enable_bloom_filter_on_write: bool,
255+
) -> Self {
256+
self.table_parquet_options.global.bloom_filter_on_write =
257+
enable_bloom_filter_on_write;
258+
self
259+
}
260+
261+
/// Return the value described in [`Self::with_bloom_filter_on_read`]
262+
fn bloom_filter_on_read(&self) -> bool {
263+
self.table_parquet_options.global.bloom_filter_on_read
254264
}
255265

256266
fn output_partitioning_helper(file_config: &FileScanConfig) -> Partitioning {
@@ -407,7 +417,7 @@ impl ExecutionPlan for ParquetExec {
407417
pushdown_filters: self.pushdown_filters(),
408418
reorder_filters: self.reorder_filters(),
409419
enable_page_index: self.enable_page_index(),
410-
enable_bloom_filter: self.enable_bloom_filter(),
420+
enable_bloom_filter: self.bloom_filter_on_read(),
411421
};
412422

413423
let stream =

datafusion/execution/src/config.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -352,12 +352,12 @@ impl SessionConfig {
352352

353353
/// Returns true if bloom filter should be used to skip parquet row groups
354354
pub fn parquet_bloom_filter_pruning(&self) -> bool {
355-
self.options.execution.parquet.bloom_filter_enabled
355+
self.options.execution.parquet.bloom_filter_on_read
356356
}
357357

358358
/// Enables or disables the use of bloom filter for parquet readers to skip row groups
359359
pub fn with_parquet_bloom_filter_pruning(mut self, enabled: bool) -> Self {
360-
self.options.execution.parquet.bloom_filter_enabled = enabled;
360+
self.options.execution.parquet.bloom_filter_on_read = enabled;
361361
self
362362
}
363363

datafusion/proto/proto/datafusion.proto

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1215,10 +1215,12 @@ message ParquetOptions {
12151215
uint64 data_pagesize_limit = 7; // default = 1024 * 1024
12161216
uint64 write_batch_size = 8; // default = 1024
12171217
string writer_version = 9; // default = "1.0"
1218-
bool bloom_filter_enabled = 20; // default = false
1218+
// bool bloom_filter_enabled = 20; // default = false
12191219
bool allow_single_file_parallelism = 23; // default = true
12201220
uint64 maximum_parallel_row_group_writers = 24; // default = 1
12211221
uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2
1222+
bool bloom_filter_on_read = 26; // default = true
1223+
bool bloom_filter_on_write = 27; // default = false
12221224

12231225
oneof metadata_size_hint_opt {
12241226
uint64 metadata_size_hint = 4;

datafusion/proto/src/generated/pbjson.rs

Lines changed: 36 additions & 18 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/proto/src/generated/prost.rs

Lines changed: 8 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/proto/src/physical_plan/from_proto.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -880,7 +880,8 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
880880
protobuf::parquet_options::EncodingOpt::Encoding(v) => Some(v),
881881
})
882882
.unwrap_or(None),
883-
bloom_filter_enabled: value.bloom_filter_enabled,
883+
bloom_filter_on_read: value.bloom_filter_on_read,
884+
bloom_filter_on_write: value.bloom_filter_on_write,
884885
bloom_filter_fpp: value.clone()
885886
.bloom_filter_fpp_opt
886887
.map(|opt| match opt {

datafusion/proto/src/physical_plan/to_proto.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -907,7 +907,8 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
907907
column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)),
908908
data_page_row_count_limit: value.data_page_row_count_limit as u64,
909909
encoding_opt: value.encoding.clone().map(protobuf::parquet_options::EncodingOpt::Encoding),
910-
bloom_filter_enabled: value.bloom_filter_enabled,
910+
bloom_filter_on_read: value.bloom_filter_on_read,
911+
bloom_filter_on_write: value.bloom_filter_on_write,
911912
bloom_filter_fpp_opt: value.bloom_filter_fpp.map(protobuf::parquet_options::BloomFilterFppOpt::BloomFilterFpp),
912913
bloom_filter_ndv_opt: value.bloom_filter_ndv.map(protobuf::parquet_options::BloomFilterNdvOpt::BloomFilterNdv),
913914
allow_single_file_parallelism: value.allow_single_file_parallelism,

datafusion/proto/tests/cases/roundtrip_logical_plan.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ async fn roundtrip_logical_plan_copy_to_writer_options() -> Result<()> {
344344
TableOptions::default_from_session_config(ctx.state().config_options());
345345
let mut parquet_format = table_options.parquet;
346346

347-
parquet_format.global.bloom_filter_enabled = true;
347+
parquet_format.global.bloom_filter_on_read = true;
348348
parquet_format.global.created_by = "DataFusion Test".to_string();
349349
parquet_format.global.writer_version = "PARQUET_2_0".to_string();
350350
parquet_format.global.write_batch_size = 111;

datafusion/sqllogictest/test_files/copy.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ OPTIONS (
271271
'format.created_by' 'DF copy.slt',
272272
'format.column_index_truncate_length' 123,
273273
'format.data_page_row_count_limit' 1234,
274-
'format.bloom_filter_enabled' true,
274+
'format.bloom_filter_on_read' true,
275275
'format.bloom_filter_enabled::col1' false,
276276
'format.bloom_filter_fpp::col2' 0.456,
277277
'format.bloom_filter_ndv::col2' 456,

0 commit comments

Comments
 (0)