Skip to content

Commit 85adb6c

Browse files
authored
Remove Sort expression (Expr::Sort) (#12177)
* Take Sort (SortExpr) in file options Part of the effort to remove `Expr::Sort`. * Return Sort from Expr.Sort Part of the effort to remove `Expr::Sort`. * Accept Sort (SortExpr) in `LogicalPlanBuilder.sort` Take `expr::Sort` in `LogicalPlanBuilder.sort`. Accept any `Expr` in a new function, `LogicalPlanBuilder.sort_by`, which applies the default sort ordering. Part of the effort to remove `Expr::Sort`. * Operate on `Sort` in to_substrait_sort_field / from_substrait_sorts Part of the effort to remove `Expr::Sort`. * Take Sort (SortExpr) in tests' TopKPlanNode Part of the effort to remove `Expr::Sort`. * Remove Sort expression (`Expr::Sort`) Remove sort as an expression, i.e. remove `Expr::Sort` from the `Expr` enum. Use `expr::Sort` directly when sorting. The sort expression was used in the context of ordering (sort, topk, create table, file sorting). Those places require their sort expression to be of type Sort anyway and no other expression was allowed, so this change improves static typing. Sort as an expression was illegal in other contexts. * use assert_eq just like in LogicalPlan.with_new_exprs * avoid clone in replace_sort_expressions * reduce cloning in EliminateDuplicatedExpr * restore SortExprWrapper this commit is longer than advised in the review comment, but after squashing the diff will be smaller * shorthand SortExprWrapper struct definition
1 parent f5dcdf0 commit 85adb6c

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+704
-772
lines changed

datafusion-examples/examples/file_stream_provider.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ mod non_windows {
3939
use datafusion::datasource::TableProvider;
4040
use datafusion::prelude::{SessionConfig, SessionContext};
4141
use datafusion_common::{exec_err, Result};
42-
use datafusion_expr::Expr;
42+
use datafusion_expr::SortExpr;
4343

4444
// Number of lines written to FIFO
4545
const TEST_BATCH_SIZE: usize = 5;
@@ -49,7 +49,7 @@ mod non_windows {
4949
fn fifo_table(
5050
schema: SchemaRef,
5151
path: impl Into<PathBuf>,
52-
sort: Vec<Vec<Expr>>,
52+
sort: Vec<Vec<SortExpr>>,
5353
) -> Arc<dyn TableProvider> {
5454
let source = FileStreamProvider::new_file(schema, path.into())
5555
.with_batch_size(TEST_BATCH_SIZE)

datafusion/core/src/dataframe/mod.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ use datafusion_common::config::{CsvOptions, JsonOptions};
5252
use datafusion_common::{
5353
plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions,
5454
};
55-
use datafusion_expr::{case, is_null, lit};
55+
use datafusion_expr::{case, is_null, lit, SortExpr};
5656
use datafusion_expr::{
5757
utils::COUNT_STAR_EXPANSION, TableProviderFilterPushDown, UNNAMED_TABLE,
5858
};
@@ -577,7 +577,7 @@ impl DataFrame {
577577
self,
578578
on_expr: Vec<Expr>,
579579
select_expr: Vec<Expr>,
580-
sort_expr: Option<Vec<Expr>>,
580+
sort_expr: Option<Vec<SortExpr>>,
581581
) -> Result<DataFrame> {
582582
let plan = LogicalPlanBuilder::from(self.plan)
583583
.distinct_on(on_expr, select_expr, sort_expr)?
@@ -776,6 +776,15 @@ impl DataFrame {
776776
})
777777
}
778778

779+
/// Sort the DataFrame by the provided expressions, using the default ordering (ascending, nulls last) for each expression.
780+
pub fn sort_by(self, expr: Vec<Expr>) -> Result<DataFrame> {
781+
self.sort(
782+
expr.into_iter()
783+
.map(|e| e.sort(true, false))
784+
.collect::<Vec<SortExpr>>(),
785+
)
786+
}
787+
779788
/// Sort the DataFrame by the specified sorting expressions.
780789
///
781790
/// Note that any expression can be turned into
@@ -797,7 +806,7 @@ impl DataFrame {
797806
/// # Ok(())
798807
/// # }
799808
/// ```
800-
pub fn sort(self, expr: Vec<Expr>) -> Result<DataFrame> {
809+
pub fn sort(self, expr: Vec<SortExpr>) -> Result<DataFrame> {
801810
let plan = LogicalPlanBuilder::from(self.plan).sort(expr)?.build()?;
802811
Ok(DataFrame {
803812
session_state: self.session_state,

datafusion/core/src/datasource/file_format/options.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ use crate::datasource::{
3131
};
3232
use crate::error::Result;
3333
use crate::execution::context::{SessionConfig, SessionState};
34-
use crate::logical_expr::Expr;
3534

3635
use arrow::datatypes::{DataType, Schema, SchemaRef};
3736
use datafusion_common::config::TableOptions;
@@ -41,6 +40,7 @@ use datafusion_common::{
4140
};
4241

4342
use async_trait::async_trait;
43+
use datafusion_expr::SortExpr;
4444

4545
/// Options that control the reading of CSV files.
4646
///
@@ -84,7 +84,7 @@ pub struct CsvReadOptions<'a> {
8484
/// File compression type
8585
pub file_compression_type: FileCompressionType,
8686
/// Indicates how the file is sorted
87-
pub file_sort_order: Vec<Vec<Expr>>,
87+
pub file_sort_order: Vec<Vec<SortExpr>>,
8888
}
8989

9090
impl<'a> Default for CsvReadOptions<'a> {
@@ -199,7 +199,7 @@ impl<'a> CsvReadOptions<'a> {
199199
}
200200

201201
/// Configure if file has known sort order
202-
pub fn file_sort_order(mut self, file_sort_order: Vec<Vec<Expr>>) -> Self {
202+
pub fn file_sort_order(mut self, file_sort_order: Vec<Vec<SortExpr>>) -> Self {
203203
self.file_sort_order = file_sort_order;
204204
self
205205
}
@@ -231,7 +231,7 @@ pub struct ParquetReadOptions<'a> {
231231
/// based on data in file.
232232
pub schema: Option<&'a Schema>,
233233
/// Indicates how the file is sorted
234-
pub file_sort_order: Vec<Vec<Expr>>,
234+
pub file_sort_order: Vec<Vec<SortExpr>>,
235235
}
236236

237237
impl<'a> Default for ParquetReadOptions<'a> {
@@ -278,7 +278,7 @@ impl<'a> ParquetReadOptions<'a> {
278278
}
279279

280280
/// Configure if file has known sort order
281-
pub fn file_sort_order(mut self, file_sort_order: Vec<Vec<Expr>>) -> Self {
281+
pub fn file_sort_order(mut self, file_sort_order: Vec<Vec<SortExpr>>) -> Self {
282282
self.file_sort_order = file_sort_order;
283283
self
284284
}
@@ -397,7 +397,7 @@ pub struct NdJsonReadOptions<'a> {
397397
/// Flag indicating whether this file may be unbounded (as in a FIFO file).
398398
pub infinite: bool,
399399
/// Indicates how the file is sorted
400-
pub file_sort_order: Vec<Vec<Expr>>,
400+
pub file_sort_order: Vec<Vec<SortExpr>>,
401401
}
402402

403403
impl<'a> Default for NdJsonReadOptions<'a> {
@@ -452,7 +452,7 @@ impl<'a> NdJsonReadOptions<'a> {
452452
}
453453

454454
/// Configure if file has known sort order
455-
pub fn file_sort_order(mut self, file_sort_order: Vec<Vec<Expr>>) -> Self {
455+
pub fn file_sort_order(mut self, file_sort_order: Vec<Vec<SortExpr>>) -> Self {
456456
self.file_sort_order = file_sort_order;
457457
self
458458
}

datafusion/core/src/datasource/listing/helpers.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,10 @@ pub fn expr_applicable_for_cols(col_names: &[String], expr: &Expr) -> bool {
102102
}
103103

104104
// TODO other expressions are not handled yet:
105-
// - AGGREGATE, WINDOW and SORT should not end up in filter conditions, except maybe in some edge cases
105+
// - AGGREGATE and WINDOW should not end up in filter conditions, except maybe in some edge cases
106106
// - Can `Wildcard` be considered as a `Literal`?
107107
// - ScalarVariable could be `applicable`, but that would require access to the context
108108
Expr::AggregateFunction { .. }
109-
| Expr::Sort { .. }
110109
| Expr::WindowFunction { .. }
111110
| Expr::Wildcard { .. }
112111
| Expr::Unnest { .. }

datafusion/core/src/datasource/listing/table.rs

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ use crate::datasource::{
3333
use crate::execution::context::SessionState;
3434
use datafusion_catalog::TableProvider;
3535
use datafusion_common::{DataFusionError, Result};
36-
use datafusion_expr::TableType;
3736
use datafusion_expr::{utils::conjunction, Expr, TableProviderFilterPushDown};
37+
use datafusion_expr::{SortExpr, TableType};
3838
use datafusion_physical_plan::{empty::EmptyExec, ExecutionPlan, Statistics};
3939

4040
use arrow::datatypes::{DataType, Field, SchemaBuilder, SchemaRef};
@@ -222,7 +222,7 @@ pub struct ListingOptions {
222222
/// ordering (encapsulated by a `Vec<SortExpr>`). If there aren't
223223
/// multiple equivalent orderings, the outer `Vec` will have a
224224
/// single element.
225-
pub file_sort_order: Vec<Vec<Expr>>,
225+
pub file_sort_order: Vec<Vec<SortExpr>>,
226226
}
227227

228228
impl ListingOptions {
@@ -385,7 +385,7 @@ impl ListingOptions {
385385
///
386386
/// assert_eq!(listing_options.file_sort_order, file_sort_order);
387387
/// ```
388-
pub fn with_file_sort_order(mut self, file_sort_order: Vec<Vec<Expr>>) -> Self {
388+
pub fn with_file_sort_order(mut self, file_sort_order: Vec<Vec<SortExpr>>) -> Self {
389389
self.file_sort_order = file_sort_order;
390390
self
391391
}
@@ -909,8 +909,7 @@ impl TableProvider for ListingTable {
909909
keep_partition_by_columns,
910910
};
911911

912-
let unsorted: Vec<Vec<Expr>> = vec![];
913-
let order_requirements = if self.options().file_sort_order != unsorted {
912+
let order_requirements = if !self.options().file_sort_order.is_empty() {
914913
// Multiple sort orders in outer vec are equivalent, so we pass only the first one
915914
let ordering = self
916915
.try_create_output_ordering()?
@@ -1160,11 +1159,6 @@ mod tests {
11601159
// (file_sort_order, expected_result)
11611160
let cases = vec![
11621161
(vec![], Ok(vec![])),
1163-
// not a sort expr
1164-
(
1165-
vec![vec![col("string_col")]],
1166-
Err("Expected Expr::Sort in output_ordering, but got string_col"),
1167-
),
11681162
// sort expr, but non column
11691163
(
11701164
vec![vec![

datafusion/core/src/datasource/memory.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ use datafusion_physical_plan::metrics::MetricsSet;
4343

4444
use async_trait::async_trait;
4545
use datafusion_catalog::Session;
46+
use datafusion_expr::SortExpr;
4647
use futures::StreamExt;
4748
use log::debug;
4849
use parking_lot::Mutex;
@@ -64,7 +65,7 @@ pub struct MemTable {
6465
column_defaults: HashMap<String, Expr>,
6566
/// Optional pre-known sort order(s). Must be `SortExpr`s.
6667
/// inserting data into this table removes the order
67-
pub sort_order: Arc<Mutex<Vec<Vec<Expr>>>>,
68+
pub sort_order: Arc<Mutex<Vec<Vec<SortExpr>>>>,
6869
}
6970

7071
impl MemTable {
@@ -118,7 +119,7 @@ impl MemTable {
118119
///
119120
/// Note that multiple sort orders are supported, if some are known to be
120121
/// equivalent,
121-
pub fn with_sort_order(self, mut sort_order: Vec<Vec<Expr>>) -> Self {
122+
pub fn with_sort_order(self, mut sort_order: Vec<Vec<SortExpr>>) -> Self {
122123
std::mem::swap(self.sort_order.lock().as_mut(), &mut sort_order);
123124
self
124125
}

datafusion/core/src/datasource/mod.rs

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,38 +50,39 @@ pub use statistics::get_statistics_with_limit;
5050

5151
use arrow_schema::{Schema, SortOptions};
5252
use datafusion_common::{plan_err, Result};
53-
use datafusion_expr::Expr;
53+
use datafusion_expr::{Expr, SortExpr};
5454
use datafusion_physical_expr::{expressions, LexOrdering, PhysicalSortExpr};
5555

5656
fn create_ordering(
5757
schema: &Schema,
58-
sort_order: &[Vec<Expr>],
58+
sort_order: &[Vec<SortExpr>],
5959
) -> Result<Vec<LexOrdering>> {
6060
let mut all_sort_orders = vec![];
6161

6262
for exprs in sort_order {
6363
// Construct PhysicalSortExpr objects from SortExpr objects:
6464
let mut sort_exprs = vec![];
65-
for expr in exprs {
66-
match expr {
67-
Expr::Sort(sort) => match sort.expr.as_ref() {
68-
Expr::Column(col) => match expressions::col(&col.name, schema) {
69-
Ok(expr) => {
70-
sort_exprs.push(PhysicalSortExpr {
71-
expr,
72-
options: SortOptions {
73-
descending: !sort.asc,
74-
nulls_first: sort.nulls_first,
75-
},
76-
});
77-
}
78-
// Cannot find expression in the projected_schema, stop iterating
79-
// since rest of the orderings are violated
80-
Err(_) => break,
65+
for sort in exprs {
66+
match sort.expr.as_ref() {
67+
Expr::Column(col) => match expressions::col(&col.name, schema) {
68+
Ok(expr) => {
69+
sort_exprs.push(PhysicalSortExpr {
70+
expr,
71+
options: SortOptions {
72+
descending: !sort.asc,
73+
nulls_first: sort.nulls_first,
74+
},
75+
});
8176
}
82-
expr => return plan_err!("Expected single column references in output_ordering, got {expr}"),
77+
// Cannot find expression in the projected_schema, stop iterating
78+
// since rest of the orderings are violated
79+
Err(_) => break,
80+
},
81+
expr => {
82+
return plan_err!(
83+
"Expected single column references in output_ordering, got {expr}"
84+
)
8385
}
84-
expr => return plan_err!("Expected Expr::Sort in output_ordering, but got {expr}"),
8586
}
8687
}
8788
if !sort_exprs.is_empty() {

datafusion/core/src/datasource/physical_plan/file_scan_config.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -979,7 +979,7 @@ mod tests {
979979
name: &'static str,
980980
file_schema: Schema,
981981
files: Vec<File>,
982-
sort: Vec<datafusion_expr::Expr>,
982+
sort: Vec<datafusion_expr::SortExpr>,
983983
expected_result: Result<Vec<Vec<&'static str>>, &'static str>,
984984
}
985985

datafusion/core/src/datasource/stream.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ use arrow_schema::SchemaRef;
3333
use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result};
3434
use datafusion_common_runtime::SpawnedTask;
3535
use datafusion_execution::{SendableRecordBatchStream, TaskContext};
36-
use datafusion_expr::{CreateExternalTable, Expr, TableType};
36+
use datafusion_expr::{CreateExternalTable, Expr, SortExpr, TableType};
3737
use datafusion_physical_plan::insert::{DataSink, DataSinkExec};
3838
use datafusion_physical_plan::metrics::MetricsSet;
3939
use datafusion_physical_plan::stream::RecordBatchReceiverStreamBuilder;
@@ -248,7 +248,7 @@ impl StreamProvider for FileStreamProvider {
248248
#[derive(Debug)]
249249
pub struct StreamConfig {
250250
source: Arc<dyn StreamProvider>,
251-
order: Vec<Vec<Expr>>,
251+
order: Vec<Vec<SortExpr>>,
252252
constraints: Constraints,
253253
}
254254

@@ -263,7 +263,7 @@ impl StreamConfig {
263263
}
264264

265265
/// Specify a sort order for the stream
266-
pub fn with_order(mut self, order: Vec<Vec<Expr>>) -> Self {
266+
pub fn with_order(mut self, order: Vec<Vec<SortExpr>>) -> Self {
267267
self.order = order;
268268
self
269269
}

datafusion/core/src/physical_planner.rs

Lines changed: 14 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -73,13 +73,13 @@ use datafusion_common::{
7373
};
7474
use datafusion_expr::dml::CopyTo;
7575
use datafusion_expr::expr::{
76-
self, physical_name, AggregateFunction, Alias, GroupingSet, WindowFunction,
76+
physical_name, AggregateFunction, Alias, GroupingSet, WindowFunction,
7777
};
7878
use datafusion_expr::expr_rewriter::unnormalize_cols;
7979
use datafusion_expr::logical_plan::builder::wrap_projection_for_join_if_necessary;
8080
use datafusion_expr::{
81-
DescribeTable, DmlStatement, Extension, Filter, RecursiveQuery, StringifiedPlan,
82-
WindowFrame, WindowFrameBound, WriteOp,
81+
DescribeTable, DmlStatement, Extension, Filter, RecursiveQuery, SortExpr,
82+
StringifiedPlan, WindowFrame, WindowFrameBound, WriteOp,
8383
};
8484
use datafusion_physical_expr::aggregate::{AggregateExprBuilder, AggregateFunctionExpr};
8585
use datafusion_physical_expr::expressions::Literal;
@@ -1641,31 +1641,27 @@ pub fn create_aggregate_expr_and_maybe_filter(
16411641

16421642
/// Create a physical sort expression from a logical sort expression
16431643
pub fn create_physical_sort_expr(
1644-
e: &Expr,
1644+
e: &SortExpr,
16451645
input_dfschema: &DFSchema,
16461646
execution_props: &ExecutionProps,
16471647
) -> Result<PhysicalSortExpr> {
1648-
if let Expr::Sort(expr::Sort {
1648+
let SortExpr {
16491649
expr,
16501650
asc,
16511651
nulls_first,
1652-
}) = e
1653-
{
1654-
Ok(PhysicalSortExpr {
1655-
expr: create_physical_expr(expr, input_dfschema, execution_props)?,
1656-
options: SortOptions {
1657-
descending: !asc,
1658-
nulls_first: *nulls_first,
1659-
},
1660-
})
1661-
} else {
1662-
internal_err!("Expects a sort expression")
1663-
}
1652+
} = e;
1653+
Ok(PhysicalSortExpr {
1654+
expr: create_physical_expr(expr, input_dfschema, execution_props)?,
1655+
options: SortOptions {
1656+
descending: !asc,
1657+
nulls_first: *nulls_first,
1658+
},
1659+
})
16641660
}
16651661

16661662
/// Create a vector of physical sort expressions from a vector of logical sort expressions
16671663
pub fn create_physical_sort_exprs(
1668-
exprs: &[Expr],
1664+
exprs: &[SortExpr],
16691665
input_dfschema: &DFSchema,
16701666
execution_props: &ExecutionProps,
16711667
) -> Result<LexOrdering> {

datafusion/core/src/test_util/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
4646
use arrow::record_batch::RecordBatch;
4747
use datafusion_common::TableReference;
4848
use datafusion_expr::utils::COUNT_STAR_EXPANSION;
49-
use datafusion_expr::{CreateExternalTable, Expr, TableType};
49+
use datafusion_expr::{CreateExternalTable, Expr, SortExpr, TableType};
5050
use datafusion_functions_aggregate::count::count_udaf;
5151
use datafusion_physical_expr::{expressions, EquivalenceProperties, PhysicalExpr};
5252

@@ -360,7 +360,7 @@ pub fn register_unbounded_file_with_ordering(
360360
schema: SchemaRef,
361361
file_path: &Path,
362362
table_name: &str,
363-
file_sort_order: Vec<Vec<Expr>>,
363+
file_sort_order: Vec<Vec<SortExpr>>,
364364
) -> Result<()> {
365365
let source = FileStreamProvider::new_file(schema, file_path.into());
366366
let config = StreamConfig::new(Arc::new(source)).with_order(file_sort_order);

0 commit comments

Comments
 (0)