Skip to content

Commit daf182d

Browse files
authored
Add TPCH-DS planning benchmark (#9907)
* Move TPCH schema definition
* Move schema definitions into test_util
* Add tpcds planning benchmark
1 parent dfd4442 commit daf182d

File tree

5 files changed

+774
-686
lines changed

5 files changed

+774
-686
lines changed

datafusion/core/benches/sql_planner.rs

Lines changed: 57 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ use arrow::datatypes::{DataType, Field, Fields, Schema};
2626
use datafusion::datasource::MemTable;
2727
use datafusion::execution::context::SessionContext;
2828
use std::sync::Arc;
29+
use test_utils::tpcds::tpcds_schemas;
30+
use test_utils::tpch::tpch_schemas;
31+
use test_utils::TableDef;
2932
use tokio::runtime::Runtime;
3033

3134
/// Create a logical plan from the specified sql
@@ -48,116 +51,18 @@ fn physical_plan(ctx: &SessionContext, sql: &str) {
4851
}
4952

5053
/// Create schema with the specified number of columns
51-
pub fn create_schema(column_prefix: &str, num_columns: usize) -> Schema {
54+
fn create_schema(column_prefix: &str, num_columns: usize) -> Schema {
5255
let fields: Fields = (0..num_columns)
5356
.map(|i| Field::new(format!("{column_prefix}{i}"), DataType::Int32, true))
5457
.collect();
5558
Schema::new(fields)
5659
}
5760

58-
pub fn create_table_provider(column_prefix: &str, num_columns: usize) -> Arc<MemTable> {
61+
fn create_table_provider(column_prefix: &str, num_columns: usize) -> Arc<MemTable> {
5962
let schema = Arc::new(create_schema(column_prefix, num_columns));
6063
MemTable::try_new(schema, vec![]).map(Arc::new).unwrap()
6164
}
6265

63-
pub fn create_tpch_schemas() -> [(String, Schema); 8] {
64-
let lineitem_schema = Schema::new(vec![
65-
Field::new("l_orderkey", DataType::Int64, false),
66-
Field::new("l_partkey", DataType::Int64, false),
67-
Field::new("l_suppkey", DataType::Int64, false),
68-
Field::new("l_linenumber", DataType::Int32, false),
69-
Field::new("l_quantity", DataType::Decimal128(15, 2), false),
70-
Field::new("l_extendedprice", DataType::Decimal128(15, 2), false),
71-
Field::new("l_discount", DataType::Decimal128(15, 2), false),
72-
Field::new("l_tax", DataType::Decimal128(15, 2), false),
73-
Field::new("l_returnflag", DataType::Utf8, false),
74-
Field::new("l_linestatus", DataType::Utf8, false),
75-
Field::new("l_shipdate", DataType::Date32, false),
76-
Field::new("l_commitdate", DataType::Date32, false),
77-
Field::new("l_receiptdate", DataType::Date32, false),
78-
Field::new("l_shipinstruct", DataType::Utf8, false),
79-
Field::new("l_shipmode", DataType::Utf8, false),
80-
Field::new("l_comment", DataType::Utf8, false),
81-
]);
82-
83-
let orders_schema = Schema::new(vec![
84-
Field::new("o_orderkey", DataType::Int64, false),
85-
Field::new("o_custkey", DataType::Int64, false),
86-
Field::new("o_orderstatus", DataType::Utf8, false),
87-
Field::new("o_totalprice", DataType::Decimal128(15, 2), false),
88-
Field::new("o_orderdate", DataType::Date32, false),
89-
Field::new("o_orderpriority", DataType::Utf8, false),
90-
Field::new("o_clerk", DataType::Utf8, false),
91-
Field::new("o_shippriority", DataType::Int32, false),
92-
Field::new("o_comment", DataType::Utf8, false),
93-
]);
94-
95-
let part_schema = Schema::new(vec![
96-
Field::new("p_partkey", DataType::Int64, false),
97-
Field::new("p_name", DataType::Utf8, false),
98-
Field::new("p_mfgr", DataType::Utf8, false),
99-
Field::new("p_brand", DataType::Utf8, false),
100-
Field::new("p_type", DataType::Utf8, false),
101-
Field::new("p_size", DataType::Int32, false),
102-
Field::new("p_container", DataType::Utf8, false),
103-
Field::new("p_retailprice", DataType::Decimal128(15, 2), false),
104-
Field::new("p_comment", DataType::Utf8, false),
105-
]);
106-
107-
let supplier_schema = Schema::new(vec![
108-
Field::new("s_suppkey", DataType::Int64, false),
109-
Field::new("s_name", DataType::Utf8, false),
110-
Field::new("s_address", DataType::Utf8, false),
111-
Field::new("s_nationkey", DataType::Int64, false),
112-
Field::new("s_phone", DataType::Utf8, false),
113-
Field::new("s_acctbal", DataType::Decimal128(15, 2), false),
114-
Field::new("s_comment", DataType::Utf8, false),
115-
]);
116-
117-
let partsupp_schema = Schema::new(vec![
118-
Field::new("ps_partkey", DataType::Int64, false),
119-
Field::new("ps_suppkey", DataType::Int64, false),
120-
Field::new("ps_availqty", DataType::Int32, false),
121-
Field::new("ps_supplycost", DataType::Decimal128(15, 2), false),
122-
Field::new("ps_comment", DataType::Utf8, false),
123-
]);
124-
125-
let customer_schema = Schema::new(vec![
126-
Field::new("c_custkey", DataType::Int64, false),
127-
Field::new("c_name", DataType::Utf8, false),
128-
Field::new("c_address", DataType::Utf8, false),
129-
Field::new("c_nationkey", DataType::Int64, false),
130-
Field::new("c_phone", DataType::Utf8, false),
131-
Field::new("c_acctbal", DataType::Decimal128(15, 2), false),
132-
Field::new("c_mktsegment", DataType::Utf8, false),
133-
Field::new("c_comment", DataType::Utf8, false),
134-
]);
135-
136-
let nation_schema = Schema::new(vec![
137-
Field::new("n_nationkey", DataType::Int64, false),
138-
Field::new("n_name", DataType::Utf8, false),
139-
Field::new("n_regionkey", DataType::Int64, false),
140-
Field::new("n_comment", DataType::Utf8, false),
141-
]);
142-
143-
let region_schema = Schema::new(vec![
144-
Field::new("r_regionkey", DataType::Int64, false),
145-
Field::new("r_name", DataType::Utf8, false),
146-
Field::new("r_comment", DataType::Utf8, false),
147-
]);
148-
149-
[
150-
("lineitem".to_string(), lineitem_schema),
151-
("orders".to_string(), orders_schema),
152-
("part".to_string(), part_schema),
153-
("supplier".to_string(), supplier_schema),
154-
("partsupp".to_string(), partsupp_schema),
155-
("customer".to_string(), customer_schema),
156-
("nation".to_string(), nation_schema),
157-
("region".to_string(), region_schema),
158-
]
159-
}
160-
16166
fn create_context() -> SessionContext {
16267
let ctx = SessionContext::new();
16368
ctx.register_table("t1", create_table_provider("a", 200))
@@ -168,16 +73,19 @@ fn create_context() -> SessionContext {
16873
.unwrap();
16974
ctx.register_table("t1000", create_table_provider("d", 1000))
17075
.unwrap();
76+
ctx
77+
}
17178

172-
let tpch_schemas = create_tpch_schemas();
173-
tpch_schemas.iter().for_each(|(name, schema)| {
79+
/// Register the table definitions as a MemTable with the context and return the
80+
/// context
81+
fn register_defs(ctx: SessionContext, defs: Vec<TableDef>) -> SessionContext {
82+
defs.iter().for_each(|TableDef { name, schema }| {
17483
ctx.register_table(
17584
name,
17685
Arc::new(MemTable::try_new(Arc::new(schema.clone()), vec![]).unwrap()),
17786
)
17887
.unwrap();
17988
});
180-
18189
ctx
18290
}
18391

@@ -236,40 +144,79 @@ fn criterion_benchmark(c: &mut Criterion) {
236144
})
237145
});
238146

147+
// --- TPC-H ---
148+
149+
let tpch_ctx = register_defs(SessionContext::new(), tpch_schemas());
150+
239151
let tpch_queries = [
240152
"q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13",
241153
"q14", // "q15", q15 has multiple SQL statements which is not supported
242154
"q16", "q17", "q18", "q19", "q20", "q21", "q22",
243155
];
244156

245157
for q in tpch_queries {
246-
let sql = std::fs::read_to_string(format!("../../benchmarks/queries/{}.sql", q))
247-
.unwrap();
158+
let sql =
159+
std::fs::read_to_string(format!("../../benchmarks/queries/{q}.sql")).unwrap();
248160
c.bench_function(&format!("physical_plan_tpch_{}", q), |b| {
249-
b.iter(|| physical_plan(&ctx, &sql))
161+
b.iter(|| physical_plan(&tpch_ctx, &sql))
250162
});
251163
}
252164

253165
let all_tpch_sql_queries = tpch_queries
254166
.iter()
255167
.map(|q| {
256-
std::fs::read_to_string(format!("../../benchmarks/queries/{}.sql", q))
257-
.unwrap()
168+
std::fs::read_to_string(format!("../../benchmarks/queries/{q}.sql")).unwrap()
258169
})
259170
.collect::<Vec<_>>();
260171

261172
c.bench_function("physical_plan_tpch_all", |b| {
262173
b.iter(|| {
263174
for sql in &all_tpch_sql_queries {
264-
physical_plan(&ctx, sql)
175+
physical_plan(&tpch_ctx, sql)
265176
}
266177
})
267178
});
268179

269180
c.bench_function("logical_plan_tpch_all", |b| {
270181
b.iter(|| {
271182
for sql in &all_tpch_sql_queries {
272-
logical_plan(&ctx, sql)
183+
logical_plan(&tpch_ctx, sql)
184+
}
185+
})
186+
});
187+
188+
// --- TPC-DS ---
189+
190+
let tpcds_ctx = register_defs(SessionContext::new(), tpcds_schemas());
191+
192+
// 10, 35: Physical plan does not support logical expression Exists(<subquery>)
193+
// 45: Physical plan does not support logical expression (<subquery>)
194+
// 41: Optimizing disjunctions not supported
195+
let ignored = [10, 35, 41, 45];
196+
197+
let raw_tpcds_sql_queries = (1..100)
198+
.filter(|q| !ignored.contains(q))
199+
.map(|q| std::fs::read_to_string(format!("./tests/tpc-ds/{q}.sql")).unwrap())
200+
.collect::<Vec<_>>();
201+
202+
// some queries have multiple statements
203+
let all_tpcds_sql_queries = raw_tpcds_sql_queries
204+
.iter()
205+
.flat_map(|sql| sql.split(';').filter(|s| !s.trim().is_empty()))
206+
.collect::<Vec<_>>();
207+
208+
c.bench_function("physical_plan_tpcds_all", |b| {
209+
b.iter(|| {
210+
for sql in &all_tpcds_sql_queries {
211+
physical_plan(&tpcds_ctx, sql)
212+
}
213+
})
214+
});
215+
216+
c.bench_function("logical_plan_tpcds_all", |b| {
217+
b.iter(|| {
218+
for sql in &all_tpcds_sql_queries {
219+
logical_plan(&tpcds_ctx, sql)
273220
}
274221
})
275222
});

0 commit comments

Comments
 (0)