Skip to content

Commit 460c92d

Browse files
committed
merge main
2 parents eb8ac39 + 2f28327 commit 460c92d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+1738
-483
lines changed

.github/workflows/extended.yml

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,42 @@ on:
3333
- main
3434

3535
jobs:
36+
# Check crate compiles and base cargo check passes.
# The extended test job lists this job in `needs`, so it only starts
# once this check has succeeded.
linux-build-lib:
  name: linux build test
  runs-on: ubuntu-latest
  container:
    image: amd64/rust
  steps:
    - uses: actions/checkout@v4
    - name: Setup Rust toolchain
      uses: ./.github/actions/setup-builder
      with:
        rust-version: stable
    # `cargo check` type-checks every target without producing binaries
    - name: Prepare cargo build
      run: cargo check --profile ci --all-targets
50+
51+
# Run extended tests (with feature 'extended_tests')
linux-test-extended:
  name: cargo test 'extended_tests' (amd64)
  # only run once the plain `cargo check` job has succeeded
  needs: linux-build-lib
  runs-on: ubuntu-latest
  container:
    image: amd64/rust
  steps:
    - uses: actions/checkout@v4
      with:
        # also check out git submodules, shallowly
        submodules: true
        fetch-depth: 1
    - name: Setup Rust toolchain
      uses: ./.github/actions/setup-builder
      with:
        rust-version: stable
    - name: Run tests (excluding doctests)
      run: cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --workspace --lib --tests --bins --features avro,json,backtrace,extended_tests
    # fail the job if the test run modified any tracked file
    - name: Verify Working Directory Clean
      run: git diff --exit-code
71+
3672
# Check answers are correct when hash values collide
3773
hash-collisions:
3874
name: cargo test hash collisions (amd64)
@@ -51,7 +87,8 @@ jobs:
5187
- name: Run tests
5288
run: |
5389
cd datafusion
54-
cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro
90+
cargo test --profile ci --exclude datafusion-examples --exclude datafusion-benchmarks --exclude datafusion-sqllogictest --workspace --lib --tests --features=force_hash_collisions,avro,extended_tests
91+
5592
sqllogictest-sqlite:
5693
name: "Run sqllogictests with the sqlite test suite"
5794
runs-on: ubuntu-latest

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion-testing

Submodule datafusion-testing updated 44 files

datafusion/core/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ unicode_expressions = [
8080
"datafusion-sql/unicode_expressions",
8181
"datafusion-functions/unicode_expressions",
8282
]
83+
extended_tests = []
8384

8485
[dependencies]
8586
apache-avro = { version = "0.17", optional = true }
@@ -150,6 +151,7 @@ rand_distr = "0.4.3"
150151
regex = { workspace = true }
151152
rstest = { workspace = true }
152153
serde_json = { workspace = true }
154+
sysinfo = "0.33.1"
153155
test-utils = { path = "../../test-utils" }
154156
tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot", "fs"] }
155157

datafusion/core/benches/sql_planner.rs

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,12 @@ mod data_utils;
2424

2525
use crate::criterion::Criterion;
2626
use arrow::datatypes::{DataType, Field, Fields, Schema};
27+
use arrow_array::{ArrayRef, RecordBatch};
2728
use criterion::Bencher;
2829
use datafusion::datasource::MemTable;
2930
use datafusion::execution::context::SessionContext;
3031
use datafusion_common::ScalarValue;
32+
use datafusion_expr::col;
3133
use itertools::Itertools;
3234
use std::fs::File;
3335
use std::io::{BufRead, BufReader};
@@ -147,6 +149,77 @@ fn benchmark_with_param_values_many_columns(ctx: &SessionContext, b: &mut Benche
147149
});
148150
}
149151

152+
/// Registers a table like this:
153+
/// c0,c1,c2...,c99
154+
/// 0,100...9900
155+
/// 0,200...19800
156+
/// 0,300...29700
157+
fn register_union_order_table(ctx: &SessionContext, num_columns: usize, num_rows: usize) {
158+
// ("c0", [0, 0, ...])
159+
// ("c1": [100, 200, ...])
160+
// etc
161+
let iter = (0..num_columns).map(|i| i as u64).map(|i| {
162+
let array: ArrayRef = Arc::new(arrow::array::UInt64Array::from_iter_values(
163+
(0..num_rows)
164+
.map(|j| j as u64 * 100 + i)
165+
.collect::<Vec<_>>(),
166+
));
167+
(format!("c{}", i), array)
168+
});
169+
let batch = RecordBatch::try_from_iter(iter).unwrap();
170+
let schema = batch.schema();
171+
let partitions = vec![vec![batch]];
172+
173+
// tell DataFusion that the table is sorted by all columns
174+
let sort_order = (0..num_columns)
175+
.map(|i| col(format!("c{}", i)).sort(true, true))
176+
.collect::<Vec<_>>();
177+
178+
// create the table
179+
let table = MemTable::try_new(schema, partitions)
180+
.unwrap()
181+
.with_sort_order(vec![sort_order]);
182+
183+
ctx.register_table("t", Arc::new(table)).unwrap();
184+
}
185+
186+
/// Builds a SQL query that unions `n` individually sorted projections of
/// table `t` and then sorts the combined result by every column.
///
/// Branch `i` selects column `c{i}` as-is, replaces every other column with
/// `null`, and is ordered by `c{i}`. For `n = 3` the returned text is:
///
/// ```sql
/// (SELECT c0, null as c1, null as c2 FROM t ORDER BY c0)
///  UNION ALL
/// (SELECT null as c0, c1, null as c2 FROM t ORDER BY c1)
///  UNION ALL
/// (SELECT null as c0, null as c1, c2 FROM t ORDER BY c2)
/// ORDER BY c0, c1, c2
/// ```
///
/// Columns are numbered from `c0` to `c{n-1}`. With `n = 0` there are no
/// branches and the result is just `"\nORDER BY "`.
fn union_orderby_query(n: usize) -> String {
    // one parenthesised SELECT per column, each keeping only its own column
    let branches = (0..n)
        .map(|i| {
            let select_list = (0..n)
                .map(|j| {
                    if i == j {
                        format!("c{j}")
                    } else {
                        format!("null as c{j}")
                    }
                })
                .collect::<Vec<_>>()
                .join(", ");
            format!("(SELECT {select_list} FROM t ORDER BY c{i})")
        })
        .collect::<Vec<_>>()
        .join("\n UNION ALL \n");

    // final ORDER BY over all columns, in column order
    let order_by = (0..n)
        .map(|i| format!("c{i}"))
        .collect::<Vec<_>>()
        .join(", ");

    format!("{branches}\nORDER BY {order_by}")
}
}
222+
150223
fn criterion_benchmark(c: &mut Criterion) {
151224
// verify that we can load the clickbench data prior to running the benchmark
152225
if !PathBuf::from(format!("{BENCHMARKS_PATH_1}{CLICKBENCH_DATA_PATH}")).exists()
@@ -289,6 +362,17 @@ fn criterion_benchmark(c: &mut Criterion) {
289362
});
290363
});
291364

365+
// -- Sorted Queries --
366+
register_union_order_table(&ctx, 100, 1000);
367+
368+
// this query has many expressions in its sort order so stresses
369+
// order equivalence validation
370+
c.bench_function("physical_sorted_union_orderby", |b| {
371+
// SELECT ... UNION ALL ...
372+
let query = union_orderby_query(20);
373+
b.iter(|| physical_plan(&ctx, &query))
374+
});
375+
292376
// --- TPC-H ---
293377

294378
let tpch_ctx = register_defs(SessionContext::new(), tpch_schemas());

datafusion/core/src/physical_optimizer/projection_pushdown.rs

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1255,7 +1255,7 @@ fn update_join_filter(
12551255
side: col_idx.side,
12561256
})
12571257
.collect(),
1258-
join_filter.schema().clone(),
1258+
Arc::clone(join_filter.schema()),
12591259
)
12601260
})
12611261
}
@@ -2246,11 +2246,11 @@ mod tests {
22462246
side: JoinSide::Left,
22472247
},
22482248
],
2249-
Schema::new(vec![
2249+
Arc::new(Schema::new(vec![
22502250
Field::new("b_left_inter", DataType::Int32, true),
22512251
Field::new("a_right_inter", DataType::Int32, true),
22522252
Field::new("c_left_inter", DataType::Int32, true),
2253-
]),
2253+
])),
22542254
)),
22552255
&JoinType::Inner,
22562256
true,
@@ -2360,11 +2360,11 @@ mod tests {
23602360
side: JoinSide::Left,
23612361
},
23622362
],
2363-
Schema::new(vec![
2363+
Arc::new(Schema::new(vec![
23642364
Field::new("b_left_inter", DataType::Int32, true),
23652365
Field::new("a_right_inter", DataType::Int32, true),
23662366
Field::new("c_left_inter", DataType::Int32, true),
2367-
]),
2367+
])),
23682368
)),
23692369
&JoinType::Inner,
23702370
true,
@@ -2462,7 +2462,7 @@ mod tests {
24622462
Some(JoinFilter::new(
24632463
filter_expr,
24642464
filter_column_indices,
2465-
filter_schema,
2465+
Arc::new(filter_schema),
24662466
)),
24672467
&JoinType::Inner,
24682468
None,
@@ -2536,11 +2536,11 @@ mod tests {
25362536
side: JoinSide::Left,
25372537
},
25382538
],
2539-
Schema::new(vec![
2539+
Arc::new(Schema::new(vec![
25402540
Field::new("b_left_inter", DataType::Int32, true),
25412541
Field::new("a_right_inter", DataType::Int32, true),
25422542
Field::new("c_left_inter", DataType::Int32, true),
2543-
]),
2543+
])),
25442544
)),
25452545
&JoinType::Inner,
25462546
None,

0 commit comments

Comments
 (0)