Skip to content

Commit 922b399

Browse files
committed
Revert "Enable reading StringViewArray by default from Parquet (8% improvement for entire ClickBench suite) (apache#13101)"
This reverts commit 2d7892b.
1 parent 50e1209 commit 922b399

File tree

14 files changed

+65
-47
lines changed

14 files changed

+65
-47
lines changed

benchmarks/src/bin/external_aggr.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,7 +193,12 @@ impl ExternalAggrConfig {
193193
) -> Result<Vec<QueryResult>> {
194194
let query_name =
195195
format!("Q{query_id}({})", human_readable_size(mem_limit as usize));
196-
let config = self.common.config();
196+
let mut config = self.common.config();
197+
config
198+
.options_mut()
199+
.execution
200+
.parquet
201+
.schema_force_view_types = self.common.force_view_types;
197202
let runtime_config = RuntimeConfig::new()
198203
.with_memory_pool(Arc::new(FairSpillPool::new(mem_limit as usize)))
199204
.build_arc()?;

benchmarks/src/clickbench.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ impl RunOpt {
119119
let mut config = self.common.config();
120120
{
121121
let parquet_options = &mut config.options_mut().execution.parquet;
122+
parquet_options.schema_force_view_types = self.common.force_view_types;
122123
// The hits_partitioned dataset specifies string columns
123124
// as binary due to how it was written. Force it to strings
124125
parquet_options.binary_as_string = true;

benchmarks/src/imdb/run.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -305,7 +305,11 @@ impl RunOpt {
305305
.config()
306306
.with_collect_statistics(!self.disable_statistics);
307307
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
308-
308+
config
309+
.options_mut()
310+
.execution
311+
.parquet
312+
.schema_force_view_types = self.common.force_view_types;
309313
let ctx = SessionContext::new_with_config(config);
310314

311315
// register tables
@@ -513,6 +517,7 @@ mod tests {
513517
partitions: Some(2),
514518
batch_size: 8192,
515519
debug: false,
520+
force_view_types: false,
516521
};
517522
let opt = RunOpt {
518523
query: Some(query),
@@ -546,6 +551,7 @@ mod tests {
546551
partitions: Some(2),
547552
batch_size: 8192,
548553
debug: false,
554+
force_view_types: false,
549555
};
550556
let opt = RunOpt {
551557
query: Some(query),

benchmarks/src/tpch/run.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,11 @@ impl RunOpt {
120120
.config()
121121
.with_collect_statistics(!self.disable_statistics);
122122
config.options_mut().optimizer.prefer_hash_join = self.prefer_hash_join;
123+
config
124+
.options_mut()
125+
.execution
126+
.parquet
127+
.schema_force_view_types = self.common.force_view_types;
123128
let ctx = SessionContext::new_with_config(config);
124129

125130
// register tables
@@ -340,6 +345,7 @@ mod tests {
340345
partitions: Some(2),
341346
batch_size: 8192,
342347
debug: false,
348+
force_view_types: false,
343349
};
344350
let opt = RunOpt {
345351
query: Some(query),
@@ -373,6 +379,7 @@ mod tests {
373379
partitions: Some(2),
374380
batch_size: 8192,
375381
debug: false,
382+
force_view_types: false,
376383
};
377384
let opt = RunOpt {
378385
query: Some(query),

benchmarks/src/util/options.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ pub struct CommonOpt {
3737
/// Activate debug mode to see more details
3838
#[structopt(short, long)]
3939
pub debug: bool,
40+
41+
/// If true, will use StringView/BinaryViewArray instead of String/BinaryArray
42+
/// when reading ParquetFiles
43+
#[structopt(long)]
44+
pub force_view_types: bool,
4045
}
4146

4247
impl CommonOpt {

datafusion/common/src/config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ config_namespace! {
399399

400400
/// (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`,
401401
/// and `Binary/BinaryLarge` with `BinaryView`.
402-
pub schema_force_view_types: bool, default = true
402+
pub schema_force_view_types: bool, default = false
403403

404404
/// (reading) If true, parquet reader will read columns of
405405
/// `Binary/LargeBinary` with `Utf8`, and `BinaryView` with `Utf8View`.

datafusion/common/src/scalar/mod.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -980,11 +980,6 @@ impl ScalarValue {
980980
ScalarValue::from(val.into())
981981
}
982982

983-
/// Returns a [`ScalarValue::Utf8View`] representing `val`
984-
pub fn new_utf8view(val: impl Into<String>) -> Self {
985-
ScalarValue::Utf8View(Some(val.into()))
986-
}
987-
988983
/// Returns a [`ScalarValue::IntervalYearMonth`] representing
989984
/// `years` years and `months` months
990985
pub fn new_interval_ym(years: i32, months: i32) -> Self {

datafusion/core/tests/parquet/page_pruning.rs

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -149,9 +149,8 @@ async fn page_index_filter_one_col() {
149149
let session_ctx = SessionContext::new();
150150
let task_ctx = session_ctx.task_ctx();
151151

152-
// 5.create filter date_string_col == "01/01/09"`;
153-
// Note this test doesn't apply type coercion so the literal must match the actual view type
154-
let filter = col("date_string_col").eq(lit(ScalarValue::new_utf8view("01/01/09")));
152+
// 5. create filter date_string_col == "01/01/09";
153
let filter = col("date_string_col").eq(lit("01/01/09"));
155154
let parquet_exec = get_parquet_exec(&state, filter).await;
156155
let mut results = parquet_exec.execute(0, task_ctx.clone()).unwrap();
157156
let batch = results.next().await.unwrap().unwrap();

datafusion/sqllogictest/test_files/describe.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,8 +81,8 @@ int_col Int32 YES
8181
bigint_col Int64 YES
8282
float_col Float32 YES
8383
double_col Float64 YES
84-
date_string_col Utf8View YES
85-
string_col Utf8View YES
84+
date_string_col Utf8 YES
85+
string_col Utf8 YES
8686
timestamp_col Timestamp(Nanosecond, None) YES
8787
year Int32 YES
8888
month Int32 YES

datafusion/sqllogictest/test_files/explain.slt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,8 @@ initial_physical_plan
305305
01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
306306
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
307307
initial_physical_plan_with_schema
308-
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
309-
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
308+
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
309+
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
310310
physical_plan after OutputRequirements
311311
01)OutputRequirementExec, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
312312
02)--GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
@@ -328,7 +328,7 @@ physical_plan after ProjectionPushdown SAME TEXT AS ABOVE
328328
physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
329329
physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
330330
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
331-
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
331+
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
332332

333333

334334
statement ok
@@ -345,8 +345,8 @@ initial_physical_plan_with_stats
345345
01)GlobalLimitExec: skip=0, fetch=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
346346
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
347347
initial_physical_plan_with_schema
348-
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
349-
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
348+
01)GlobalLimitExec: skip=0, fetch=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
349+
02)--ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
350350
physical_plan after OutputRequirements
351351
01)OutputRequirementExec
352352
02)--GlobalLimitExec: skip=0, fetch=10
@@ -369,7 +369,7 @@ physical_plan after LimitPushdown ParquetExec: file_groups={1 group: [[WORKSPACE
369369
physical_plan after SanityCheckPlan SAME TEXT AS ABOVE
370370
physical_plan ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10
371371
physical_plan_with_stats ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, statistics=[Rows=Exact(8), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:),(Col[3]:),(Col[4]:),(Col[5]:),(Col[6]:),(Col[7]:),(Col[8]:),(Col[9]:),(Col[10]:)]]
372-
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:BinaryView;N, string_col:BinaryView;N, timestamp_col:Timestamp(Nanosecond, None);N]
372+
physical_plan_with_schema ParquetExec: file_groups={1 group: [[WORKSPACE_ROOT/parquet-testing/data/alltypes_plain.parquet]]}, projection=[id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col], limit=10, schema=[id:Int32;N, bool_col:Boolean;N, tinyint_col:Int32;N, smallint_col:Int32;N, int_col:Int32;N, bigint_col:Int64;N, float_col:Float32;N, double_col:Float64;N, date_string_col:Binary;N, string_col:Binary;N, timestamp_col:Timestamp(Nanosecond, None);N]
373373

374374

375375
statement ok

datafusion/sqllogictest/test_files/information_schema.slt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ datafusion.execution.parquet.metadata_size_hint NULL
202202
datafusion.execution.parquet.pruning true
203203
datafusion.execution.parquet.pushdown_filters false
204204
datafusion.execution.parquet.reorder_filters false
205-
datafusion.execution.parquet.schema_force_view_types true
205+
datafusion.execution.parquet.schema_force_view_types false
206206
datafusion.execution.parquet.skip_metadata true
207207
datafusion.execution.parquet.statistics_enabled page
208208
datafusion.execution.parquet.write_batch_size 1024
@@ -295,7 +295,7 @@ datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the
295295
datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
296296
datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are be applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
297297
datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
298-
datafusion.execution.parquet.schema_force_view_types true (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
298+
datafusion.execution.parquet.schema_force_view_types false (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`.
299299
datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata
300300
datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting
301301
datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes

datafusion/sqllogictest/test_files/map.slt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ describe data;
4242
----
4343
ints Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
4444
strings Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) NO
45-
timestamp Utf8View NO
45+
timestamp Utf8 NO
4646

4747
query ??T
4848
SELECT * FROM data ORDER by ints['bytes'] DESC LIMIT 10;

0 commit comments

Comments (0)