Commit cb24fc0

move config to parquet-only
1 parent 5e434bc commit cb24fc0

17 files changed: +104 −42 lines changed

benchmarks/src/clickbench.rs

Lines changed: 5 additions & 1 deletion
@@ -117,7 +117,11 @@ impl RunOpt {
         };
 
         let mut config = self.common.config();
-        config.options_mut().execution.schema_force_string_view = self.common.string_view;
+        config
+            .options_mut()
+            .execution
+            .parquet
+            .schema_force_string_view = self.common.string_view;
 
         let ctx = SessionContext::new_with_config(config);
         self.register_hits(&ctx).await?;

benchmarks/src/tpch/run.rs

Lines changed: 5 additions & 1 deletion
@@ -120,7 +120,11 @@ impl RunOpt {
             .config()
             .with_collect_statistics(!self.disable_statistics);
         config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
-        config.options_mut().execution.schema_force_string_view = self.common.string_view;
+        config
+            .options_mut()
+            .execution
+            .parquet
+            .schema_force_string_view = self.common.string_view;
         let ctx = SessionContext::new_with_config(config);
 
         // register tables

datafusion/common/src/config.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -311,10 +311,6 @@ config_namespace! {
311311

312312
/// Should DataFusion keep the columns used for partition_by in the output RecordBatches
313313
pub keep_partition_by_columns: bool, default = false
314-
315-
/// If true, listing tables will read columns of `Utf8/Utf8Large` with `Utf8View`,
316-
/// and `Binary/BinaryLarge` with `BinaryView`.
317-
pub schema_force_string_view: bool, default = false
318314
}
319315
}
320316

@@ -455,6 +451,10 @@ config_namespace! {
455451
/// data frame.
456452
pub maximum_buffered_record_batches_per_stream: usize, default = 2
457453

454+
455+
/// If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
456+
/// and `Binary/BinaryLarge` with `BinaryView`.
457+
pub schema_force_string_view: bool, default = false
458458
}
459459
}
460460

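For orientation, the benchmark diffs above show the new access path: the option has moved from the `execution` namespace into `execution.parquet`. A minimal sketch of enabling it programmatically (option path as in this commit; the surrounding setup is illustrative, not part of the diff):

    use datafusion::prelude::{SessionConfig, SessionContext};

    fn context_with_view_types() -> SessionContext {
        let mut config = SessionConfig::new();
        // The option now lives under execution.parquet rather than execution.
        config
            .options_mut()
            .execution
            .parquet
            .schema_force_string_view = true;
        SessionContext::new_with_config(config)
    }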
datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ impl TryFrom<&TableParquetOptions> for ParquetWriterOptions {
             maximum_parallel_row_group_writers: _,
             maximum_buffered_record_batches_per_stream: _,
             bloom_filter_on_read: _,
+            schema_force_string_view: _,
         } = &parquet_options.global;
 
         let key_value_metadata = if !parquet_options.key_value_metadata.is_empty() {

datafusion/core/example.parquet

976 Bytes
Binary file not shown.

datafusion/core/src/datasource/file_format/mod.rs

Lines changed: 23 additions & 0 deletions
@@ -42,6 +42,7 @@ use crate::error::Result;
 use crate::execution::context::SessionState;
 use crate::physical_plan::{ExecutionPlan, Statistics};
 
+use arrow_schema::{DataType, Field, Schema};
 use datafusion_common::file_options::file_type::FileType;
 use datafusion_common::{internal_err, not_impl_err, GetExt};
 use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement};
@@ -194,6 +195,28 @@ pub fn file_type_to_format(
     }
 }
 
+/// Transform a schema to use view types for Utf8 and Binary
+pub fn transform_schema_to_view(schema: &Schema) -> Schema {
+    let transformed_fields: Vec<Arc<Field>> = schema
+        .fields
+        .iter()
+        .map(|field| match field.data_type() {
+            DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new(
+                field.name(),
+                DataType::Utf8View,
+                field.is_nullable(),
+            )),
+            DataType::Binary | DataType::LargeBinary => Arc::new(Field::new(
+                field.name(),
+                DataType::BinaryView,
+                field.is_nullable(),
+            )),
+            _ => field.clone(),
+        })
+        .collect();
+    Schema::new_with_metadata(transformed_fields, schema.metadata.clone())
+}
+
 #[cfg(test)]
 pub(crate) mod test_util {
     use std::ops::Range;
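To illustrate what the new `transform_schema_to_view` helper does, here is a usage sketch (the schema and field names are invented for the example):

    use arrow_schema::{DataType, Field, Schema};

    let schema = Schema::new(vec![
        Field::new("name", DataType::Utf8, true),
        Field::new("payload", DataType::LargeBinary, false),
        Field::new("id", DataType::Int64, false),
    ]);

    let viewed = transform_schema_to_view(&schema);
    assert_eq!(viewed.field(0).data_type(), &DataType::Utf8View);   // Utf8 -> Utf8View
    assert_eq!(viewed.field(1).data_type(), &DataType::BinaryView); // LargeBinary -> BinaryView
    assert_eq!(viewed.field(2).data_type(), &DataType::Int64);      // other types untouched

Note that field names, nullability, and schema metadata are preserved; only the string/binary data types change.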

datafusion/core/src/datasource/file_format/parquet.rs

Lines changed: 12 additions & 1 deletion
@@ -24,7 +24,7 @@ use std::sync::Arc;
 
 use super::write::demux::start_demuxer_task;
 use super::write::{create_writer, SharedBuffer};
-use super::{FileFormat, FileFormatFactory, FileScanConfig};
+use super::{transform_schema_to_view, FileFormat, FileFormatFactory, FileScanConfig};
 use crate::arrow::array::RecordBatch;
 use crate::arrow::datatypes::{Fields, Schema, SchemaRef};
 use crate::datasource::file_format::file_compression_type::FileCompressionType;
@@ -305,6 +305,17 @@ impl FileFormat for ParquetFormat {
             Schema::try_merge(schemas)
         }?;
 
+        let schema = if state
+            .config_options()
+            .execution
+            .parquet
+            .schema_force_string_view
+        {
+            transform_schema_to_view(&schema)
+        } else {
+            schema
+        };
+
         Ok(Arc::new(schema))
     }
datafusion/core/src/datasource/listing/table.rs

Lines changed: 2 additions & 25 deletions
@@ -410,31 +410,8 @@ impl ListingOptions {
             .try_collect()
             .await?;
 
-        let mut schema = self.format.infer_schema(state, &store, &files).await?;
-
-        if state.config_options().execution.schema_force_string_view {
-            let transformed_fields: Vec<Arc<Field>> = schema
-                .fields
-                .iter()
-                .map(|field| match field.data_type() {
-                    DataType::Utf8 | DataType::LargeUtf8 => Arc::new(Field::new(
-                        field.name(),
-                        DataType::Utf8View,
-                        field.is_nullable(),
-                    )),
-                    DataType::Binary | DataType::LargeBinary => Arc::new(Field::new(
-                        field.name(),
-                        DataType::BinaryView,
-                        field.is_nullable(),
-                    )),
-                    _ => field.clone(),
-                })
-                .collect();
-            schema = Arc::new(Schema::new_with_metadata(
-                transformed_fields,
-                schema.metadata.clone(),
-            ));
-        }
+        let schema = self.format.infer_schema(state, &store, &files).await?;
 
         Ok(schema)
     }
datafusion/core/src/datasource/physical_plan/parquet/mod.rs

Lines changed: 4 additions & 0 deletions
@@ -720,6 +720,10 @@ impl ExecutionPlan for ParquetExec {
             enable_page_index: self.enable_page_index(),
             enable_bloom_filter: self.bloom_filter_on_read(),
             schema_adapter_factory,
+            schema_force_string_view: self
+                .table_parquet_options
+                .global
+                .schema_force_string_view,
         };
 
         let stream =

datafusion/core/src/datasource/physical_plan/parquet/opener.rs

Lines changed: 20 additions & 5 deletions
@@ -17,6 +17,7 @@
 
 //! [`ParquetOpener`] for opening Parquet files
 
+use crate::datasource::file_format::transform_schema_to_view;
 use crate::datasource::physical_plan::parquet::page_filter::PagePruningPredicate;
 use crate::datasource::physical_plan::parquet::row_group_filter::RowGroupAccessPlanFilter;
 use crate::datasource::physical_plan::parquet::{
@@ -33,7 +34,7 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr;
 use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet;
 use futures::{StreamExt, TryStreamExt};
 use log::debug;
-use parquet::arrow::arrow_reader::ArrowReaderOptions;
+use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
 use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
 use std::sync::Arc;
@@ -56,6 +57,7 @@ pub(super) struct ParquetOpener {
     pub enable_page_index: bool,
     pub enable_bloom_filter: bool,
     pub schema_adapter_factory: Arc<dyn SchemaAdapterFactory>,
+    pub schema_force_string_view: bool,
 }
 
 impl FileOpener for ParquetOpener {
@@ -66,7 +68,7 @@ impl FileOpener for ParquetOpener {
         let file_metrics =
             ParquetFileMetrics::new(self.partition_index, &file_name, &self.metrics);
 
-        let reader: Box<dyn AsyncFileReader> =
+        let mut reader: Box<dyn AsyncFileReader> =
             self.parquet_file_reader_factory.create_reader(
                 self.partition_index,
                 file_meta,
@@ -90,14 +92,27 @@ impl FileOpener for ParquetOpener {
         );
         let enable_bloom_filter = self.enable_bloom_filter;
         let limit = self.limit;
+        let schema_force_string_view = self.schema_force_string_view;
 
         Ok(Box::pin(async move {
+            let options = ArrowReaderOptions::new().with_page_index(enable_page_index);
+
+            let metadata =
+                ArrowReaderMetadata::load_async(&mut reader, options.clone()).await?;
+            let mut schema = metadata.schema().clone();
+
+            if schema_force_string_view {
+                schema = Arc::new(transform_schema_to_view(&schema));
+            }
+
             let options = ArrowReaderOptions::new()
                 .with_page_index(enable_page_index)
-                .with_schema(table_schema.clone());
+                .with_schema(schema.clone());
+            let metadata =
+                ArrowReaderMetadata::try_new(metadata.metadata().clone(), options)?;
+
             let mut builder =
-                ParquetRecordBatchStreamBuilder::new_with_options(reader, options)
-                    .await?;
+                ParquetRecordBatchStreamBuilder::new_with_metadata(reader, metadata);
 
             let file_schema = builder.schema().clone();
datafusion/proto-common/proto/datafusion_common.proto

Lines changed: 1 addition & 0 deletions
@@ -478,6 +478,7 @@ message ParquetOptions {
     uint64 maximum_buffered_record_batches_per_stream = 25; // default = 2
     bool bloom_filter_on_read = 26; // default = true
     bool bloom_filter_on_write = 27; // default = false
+    bool schema_force_string_view = 28; // default = false
 
     oneof metadata_size_hint_opt {
         uint64 metadata_size_hint = 4;

datafusion/proto-common/src/from_proto/mod.rs

Lines changed: 1 addition & 1 deletion
@@ -950,7 +950,7 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
             allow_single_file_parallelism: value.allow_single_file_parallelism,
             maximum_parallel_row_group_writers: value.maximum_parallel_row_group_writers as usize,
             maximum_buffered_record_batches_per_stream: value.maximum_buffered_record_batches_per_stream as usize,
-
+            schema_force_string_view: value.schema_force_string_view,
         })
     }
 }

datafusion/proto-common/src/generated/pbjson.rs

Lines changed: 18 additions & 0 deletions
@@ -4634,6 +4634,9 @@ impl serde::Serialize for ParquetOptions {
         if self.bloom_filter_on_write {
             len += 1;
         }
+        if self.schema_force_string_view {
+            len += 1;
+        }
         if self.dictionary_page_size_limit != 0 {
             len += 1;
         }
@@ -4717,6 +4720,9 @@ impl serde::Serialize for ParquetOptions {
         if self.bloom_filter_on_write {
             struct_ser.serialize_field("bloomFilterOnWrite", &self.bloom_filter_on_write)?;
         }
+        if self.schema_force_string_view {
+            struct_ser.serialize_field("schemaForceStringView", &self.schema_force_string_view)?;
+        }
         if self.dictionary_page_size_limit != 0 {
             #[allow(clippy::needless_borrow)]
             struct_ser.serialize_field("dictionaryPageSizeLimit", ToString::to_string(&self.dictionary_page_size_limit).as_str())?;
@@ -4834,6 +4840,8 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             "bloomFilterOnRead",
             "bloom_filter_on_write",
             "bloomFilterOnWrite",
+            "schema_force_string_view",
+            "schemaForceStringView",
             "dictionary_page_size_limit",
             "dictionaryPageSizeLimit",
             "data_page_row_count_limit",
@@ -4875,6 +4883,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
             MaximumBufferedRecordBatchesPerStream,
             BloomFilterOnRead,
             BloomFilterOnWrite,
+            SchemaForceStringView,
             DictionaryPageSizeLimit,
             DataPageRowCountLimit,
             MaxRowGroupSize,
@@ -4922,6 +4931,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                             "maximumBufferedRecordBatchesPerStream" | "maximum_buffered_record_batches_per_stream" => Ok(GeneratedField::MaximumBufferedRecordBatchesPerStream),
                             "bloomFilterOnRead" | "bloom_filter_on_read" => Ok(GeneratedField::BloomFilterOnRead),
                             "bloomFilterOnWrite" | "bloom_filter_on_write" => Ok(GeneratedField::BloomFilterOnWrite),
+                            "schemaForceStringView" | "schema_force_string_view" => Ok(GeneratedField::SchemaForceStringView),
                             "dictionaryPageSizeLimit" | "dictionary_page_size_limit" => Ok(GeneratedField::DictionaryPageSizeLimit),
                             "dataPageRowCountLimit" | "data_page_row_count_limit" => Ok(GeneratedField::DataPageRowCountLimit),
                             "maxRowGroupSize" | "max_row_group_size" => Ok(GeneratedField::MaxRowGroupSize),
@@ -4967,6 +4977,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                 let mut maximum_buffered_record_batches_per_stream__ = None;
                 let mut bloom_filter_on_read__ = None;
                 let mut bloom_filter_on_write__ = None;
+                let mut schema_force_string_view__ = None;
                 let mut dictionary_page_size_limit__ = None;
                 let mut data_page_row_count_limit__ = None;
                 let mut max_row_group_size__ = None;
@@ -5068,6 +5079,12 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                             }
                             bloom_filter_on_write__ = Some(map_.next_value()?);
                         }
+                        GeneratedField::SchemaForceStringView => {
+                            if schema_force_string_view__.is_some() {
+                                return Err(serde::de::Error::duplicate_field("schemaForceStringView"));
+                            }
+                            schema_force_string_view__ = Some(map_.next_value()?);
+                        }
                         GeneratedField::DictionaryPageSizeLimit => {
                             if dictionary_page_size_limit__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("dictionaryPageSizeLimit"));
@@ -5168,6 +5185,7 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions {
                     maximum_buffered_record_batches_per_stream: maximum_buffered_record_batches_per_stream__.unwrap_or_default(),
                     bloom_filter_on_read: bloom_filter_on_read__.unwrap_or_default(),
                     bloom_filter_on_write: bloom_filter_on_write__.unwrap_or_default(),
+                    schema_force_string_view: schema_force_string_view__.unwrap_or_default(),
                     dictionary_page_size_limit: dictionary_page_size_limit__.unwrap_or_default(),
                     data_page_row_count_limit: data_page_row_count_limit__.unwrap_or_default(),
                     max_row_group_size: max_row_group_size__.unwrap_or_default(),

datafusion/proto-common/src/generated/prost.rs

Lines changed: 3 additions & 0 deletions
@@ -786,6 +786,9 @@ pub struct ParquetOptions {
     /// default = false
     #[prost(bool, tag = "27")]
     pub bloom_filter_on_write: bool,
+    /// default = false
+    #[prost(bool, tag = "28")]
+    pub schema_force_string_view: bool,
     #[prost(uint64, tag = "12")]
     pub dictionary_page_size_limit: u64,
     #[prost(uint64, tag = "18")]

datafusion/proto-common/src/to_proto/mod.rs

Lines changed: 1 addition & 0 deletions
@@ -825,6 +825,7 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
             allow_single_file_parallelism: value.allow_single_file_parallelism,
             maximum_parallel_row_group_writers: value.maximum_parallel_row_group_writers as u64,
             maximum_buffered_record_batches_per_stream: value.maximum_buffered_record_batches_per_stream as u64,
+            schema_force_string_view: value.schema_force_string_view,
         })
     }
 }

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 2 deletions
@@ -200,12 +200,12 @@ datafusion.execution.parquet.metadata_size_hint NULL
 datafusion.execution.parquet.pruning true
 datafusion.execution.parquet.pushdown_filters false
 datafusion.execution.parquet.reorder_filters false
+datafusion.execution.parquet.schema_force_string_view false
 datafusion.execution.parquet.skip_metadata true
 datafusion.execution.parquet.statistics_enabled NULL
 datafusion.execution.parquet.write_batch_size 1024
 datafusion.execution.parquet.writer_version 1.0
 datafusion.execution.planning_concurrency 13
-datafusion.execution.schema_force_string_view false
 datafusion.execution.soft_max_rows_per_output_file 50000000
 datafusion.execution.sort_in_place_threshold_bytes 1048576
 datafusion.execution.sort_spill_reservation_bytes 10485760
@@ -285,12 +285,12 @@ datafusion.execution.parquet.metadata_size_hint NULL If specified, the parquet r
 datafusion.execution.parquet.pruning true If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
 datafusion.execution.parquet.pushdown_filters false If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
 datafusion.execution.parquet.reorder_filters false If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
+datafusion.execution.parquet.schema_force_string_view false If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
 datafusion.execution.parquet.skip_metadata true If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
 datafusion.execution.parquet.statistics_enabled NULL Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
 datafusion.execution.parquet.write_batch_size 1024 Sets write_batch_size in bytes
 datafusion.execution.parquet.writer_version 1.0 Sets parquet writer version valid values are "1.0" and "2.0"
 datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system
-datafusion.execution.schema_force_string_view false If true, listing tables will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
 datafusion.execution.soft_max_rows_per_output_file 50000000 Target number of rows in output files when writing multiple. This is a soft max, so it can be exceeded slightly. There also will be one file smaller than the limit if the total number of rows written is not roughly divisible by the soft max
 datafusion.execution.sort_in_place_threshold_bytes 1048576 When sorting, below what size should data be concatenated and sorted in a single RecordBatch rather than sorted in batches and merged.
 datafusion.execution.sort_spill_reservation_bytes 10485760 Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManager` configured).
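Since the setting now surfaces under the parquet namespace in `information_schema`, it can also be flipped and inspected through SQL at runtime; a small sketch (assuming the information schema is enabled, as in this test file):

    use datafusion::prelude::{SessionConfig, SessionContext};

    #[tokio::main]
    async fn main() -> datafusion::error::Result<()> {
        let config = SessionConfig::new().with_information_schema(true);
        let ctx = SessionContext::new_with_config(config);

        // Set and then read back the relocated option.
        ctx.sql("SET datafusion.execution.parquet.schema_force_string_view = true")
            .await?;
        ctx.sql("SHOW datafusion.execution.parquet.schema_force_string_view")
            .await?
            .show()
            .await?;
        Ok(())
    }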
