Skip to content

Commit 58f79e1

Browse files
Recursive unnest (#11062)
* chore: fix map children of unnest * adjust test * remove debug * chore: move test to unnest.slt * chore: rename * add some comment * compile err * more comment * chore: address comment * more coverage * one more scenario
1 parent 43ea682 commit 58f79e1

File tree

6 files changed

+271
-76
lines changed

6 files changed

+271
-76
lines changed

datafusion/expr/src/expr.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -358,6 +358,11 @@ impl Unnest {
358358
expr: Box::new(expr),
359359
}
360360
}
361+
362+
/// Create a new Unnest expression from an already boxed expression.
363+
pub fn new_boxed(boxed: Box<Expr>) -> Self {
364+
Self { expr: boxed }
365+
}
361366
}
362367

363368
/// Alias expression

datafusion/expr/src/logical_plan/builder.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1455,7 +1455,6 @@ pub fn project(
14551455
_ => projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?),
14561456
}
14571457
}
1458-
14591458
validate_unique_names("Projections", projected_expr.iter())?;
14601459

14611460
Projection::try_new(projected_expr, Arc::new(plan)).map(LogicalPlan::Projection)

datafusion/expr/src/tree_node.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,9 @@ impl TreeNode for Expr {
136136
| Expr::Exists { .. }
137137
| Expr::ScalarSubquery(_)
138138
| Expr::ScalarVariable(_, _)
139-
| Expr::Unnest(_)
140139
| Expr::Literal(_) => Transformed::no(self),
140+
Expr::Unnest(Unnest { expr, .. }) => transform_box(expr, &mut f)?
141+
.update_data(|be| Expr::Unnest(Unnest::new_boxed(be))),
141142
Expr::Alias(Alias {
142143
expr,
143144
relation,

datafusion/sql/src/select.rs

Lines changed: 56 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,8 @@ use crate::planner::{
2222
idents_to_table_reference, ContextProvider, PlannerContext, SqlToRel,
2323
};
2424
use crate::utils::{
25-
check_columns_satisfy_exprs, extract_aliases, rebase_expr,
26-
recursive_transform_unnest, resolve_aliases_to_exprs, resolve_columns,
27-
resolve_positions_to_exprs,
25+
check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs,
26+
resolve_columns, resolve_positions_to_exprs, transform_bottom_unnest,
2827
};
2928

3029
use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result};
@@ -298,46 +297,61 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
298297
input: LogicalPlan,
299298
select_exprs: Vec<Expr>,
300299
) -> Result<LogicalPlan> {
301-
let mut unnest_columns = vec![];
302-
// from which column used for projection, before the unnest happen
303-
// including non unnest column and unnest column
304-
let mut inner_projection_exprs = vec![];
305-
306-
// expr returned here maybe different from the originals in inner_projection_exprs
307-
// for example:
308-
// - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2
309-
// - unnest(array_col) will be transformed into unnest(array_col).element
310-
// - unnest(array_col) + 1 will be transformed into unnest(array_col).element +1
311-
let outer_projection_exprs: Vec<Expr> = select_exprs
312-
.into_iter()
313-
.map(|expr| {
314-
recursive_transform_unnest(
315-
&input,
316-
&mut unnest_columns,
317-
&mut inner_projection_exprs,
318-
expr,
319-
)
320-
})
321-
.collect::<Result<Vec<_>>>()?
322-
.into_iter()
323-
.flatten()
324-
.collect();
325-
326-
// Do the final projection
327-
if unnest_columns.is_empty() {
328-
LogicalPlanBuilder::from(input)
329-
.project(inner_projection_exprs)?
330-
.build()
331-
} else {
332-
let columns = unnest_columns.into_iter().map(|col| col.into()).collect();
333-
// Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL
334-
let unnest_options = UnnestOptions::new().with_preserve_nulls(false);
335-
LogicalPlanBuilder::from(input)
336-
.project(inner_projection_exprs)?
337-
.unnest_columns_with_options(columns, unnest_options)?
338-
.project(outer_projection_exprs)?
339-
.build()
300+
let mut intermediate_plan = input;
301+
let mut intermediate_select_exprs = select_exprs;
302+
// Each expr in select_exprs can contain multiple unnest stages
303+
// The transformation happens bottom up, one stage at a time for each iteration
304+
// Only exit the loop once no more unnest transformation is found
305+
for i in 0.. {
306+
let mut unnest_columns = vec![];
307+
// exprs used for the projection before the unnest happens,
308+
// including non unnest column and unnest column
309+
let mut inner_projection_exprs = vec![];
310+
311+
// exprs returned here may be different from the originals in inner_projection_exprs
312+
// for example:
313+
// - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2
314+
// - unnest(array_col) will be transformed into unnest(array_col).element
315+
// - unnest(array_col) + 1 will be transformed into unnest(array_col).element +1
316+
let outer_projection_exprs: Vec<Expr> = intermediate_select_exprs
317+
.iter()
318+
.map(|expr| {
319+
transform_bottom_unnest(
320+
&intermediate_plan,
321+
&mut unnest_columns,
322+
&mut inner_projection_exprs,
323+
expr,
324+
)
325+
})
326+
.collect::<Result<Vec<_>>>()?
327+
.into_iter()
328+
.flatten()
329+
.collect();
330+
331+
// No more unnest is possible
332+
if unnest_columns.is_empty() {
333+
// The original expr does not contain any unnest
334+
if i == 0 {
335+
return LogicalPlanBuilder::from(intermediate_plan)
336+
.project(inner_projection_exprs)?
337+
.build();
338+
}
339+
break;
340+
} else {
341+
let columns = unnest_columns.into_iter().map(|col| col.into()).collect();
342+
// Set preserve_nulls to false to ensure compatibility with DuckDB and PostgreSQL
343+
let unnest_options = UnnestOptions::new().with_preserve_nulls(false);
344+
let plan = LogicalPlanBuilder::from(intermediate_plan)
345+
.project(inner_projection_exprs)?
346+
.unnest_columns_with_options(columns, unnest_options)?
347+
.build()?;
348+
intermediate_plan = plan;
349+
intermediate_select_exprs = outer_projection_exprs;
350+
}
340351
}
352+
LogicalPlanBuilder::from(intermediate_plan)
353+
.project(intermediate_select_exprs)?
354+
.build()
341355
}
342356

343357
fn plan_selection(

datafusion/sql/src/utils.rs

Lines changed: 111 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@ use std::collections::HashMap;
2222
use arrow_schema::{
2323
DataType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE,
2424
};
25-
use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
25+
use datafusion_common::tree_node::{
26+
Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
27+
};
2628
use datafusion_common::{
2729
exec_err, internal_err, plan_err, Column, DataFusionError, Result, ScalarValue,
2830
};
@@ -267,11 +269,13 @@ pub(crate) fn normalize_ident(id: Ident) -> String {
267269
/// - For list column: unnest(col) with type list -> unnest(col) with type list::item
268270
/// - For struct column: unnest(struct(field1, field2)) -> unnest(struct).field1, unnest(struct).field2
269271
/// The transformed exprs will be used in the outer projection
270-
pub(crate) fn recursive_transform_unnest(
272+
/// If along the path from root to bottom, there are multiple unnest expressions, the transformation
273+
/// is done only for the bottom expression
274+
pub(crate) fn transform_bottom_unnest(
271275
input: &LogicalPlan,
272276
unnest_placeholder_columns: &mut Vec<String>,
273277
inner_projection_exprs: &mut Vec<Expr>,
274-
original_expr: Expr,
278+
original_expr: &Expr,
275279
) -> Result<Vec<Expr>> {
276280
let mut transform =
277281
|unnest_expr: &Expr, expr_in_unnest: &Expr| -> Result<Vec<Expr>> {
@@ -298,35 +302,53 @@ pub(crate) fn recursive_transform_unnest(
298302
.collect::<Vec<_>>();
299303
Ok(expr)
300304
};
301-
// expr transformed maybe either the same, or different from the originals exprs
302-
// for example:
303-
// - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2
305+
// This transformation is only done for list unnest
306+
// struct unnest is done at the root level, and at the later stage
307+
// because the TreeNode API only supports transforming a node into 1 Expr, while
308+
// Unnest struct will be transformed into multiple Exprs
309+
// TODO: This can be resolved after this issue is resolved: https://github.com/apache/datafusion/issues/10102
310+
//
311+
// The transformation looks like:
304312
// - unnest(array_col) will be transformed into unnest(array_col)
305313
// - unnest(array_col) + 1 will be transformed into unnest(array_col) + 1
306-
307-
// Specifically handle root level unnest expr, this is the only place
308-
// unnest on struct can be handled
309-
if let Expr::Unnest(Unnest { expr: ref arg }) = original_expr {
310-
return transform(&original_expr, arg);
311-
}
312314
let Transformed {
313-
data: transformed_expr,
314-
transformed,
315-
tnr: _,
316-
} = original_expr.transform_up(|expr: Expr| {
317-
if let Expr::Unnest(Unnest { expr: ref arg }) = expr {
318-
let (data_type, _) = arg.data_type_and_nullable(input.schema())?;
319-
if let DataType::Struct(_) = data_type {
320-
return internal_err!("unnest on struct can ony be applied at the root level of select expression");
321-
}
322-
let transformed_exprs = transform(&expr, arg)?;
323-
Ok(Transformed::yes(transformed_exprs[0].clone()))
324-
} else {
325-
Ok(Transformed::no(expr))
315+
data: transformed_expr,
316+
transformed,
317+
tnr: _,
318+
} = original_expr.clone().transform_up(|expr: Expr| {
319+
let is_root_expr = &expr == original_expr;
320+
// Root expr is transformed separately
321+
if is_root_expr {
322+
return Ok(Transformed::no(expr));
323+
}
324+
if let Expr::Unnest(Unnest { expr: ref arg }) = expr {
325+
let (data_type, _) = arg.data_type_and_nullable(input.schema())?;
326+
327+
if let DataType::Struct(_) = data_type {
328+
return internal_err!("unnest on struct can ony be applied at the root level of select expression");
326329
}
327-
})?;
330+
331+
let mut transformed_exprs = transform(&expr, arg)?;
332+
// root_expr.push(transformed_exprs[0].clone());
333+
Ok(Transformed::new(
334+
transformed_exprs.swap_remove(0),
335+
true,
336+
TreeNodeRecursion::Stop,
337+
))
338+
} else {
339+
Ok(Transformed::no(expr))
340+
}
341+
})?;
328342

329343
if !transformed {
344+
// Because the root expr needs to be transformed separately,
345+
// unnest struct is only possible here
346+
// The transformation looks like
347+
// - unnest(struct_col) will be transformed into unnest(struct_col).field1, unnest(struct_col).field2
348+
if let Expr::Unnest(Unnest { expr: ref arg }) = transformed_expr {
349+
return transform(&transformed_expr, arg);
350+
}
351+
330352
if matches!(&transformed_expr, Expr::Column(_)) {
331353
inner_projection_exprs.push(transformed_expr.clone());
332354
Ok(vec![transformed_expr])
@@ -351,12 +373,13 @@ mod tests {
351373
use arrow_schema::Fields;
352374
use datafusion_common::{DFSchema, Result};
353375
use datafusion_expr::{col, lit, unnest, EmptyRelation, LogicalPlan};
376+
use datafusion_functions::core::expr_ext::FieldAccessor;
354377
use datafusion_functions_aggregate::expr_fn::count;
355378

356-
use crate::utils::{recursive_transform_unnest, resolve_positions_to_exprs};
379+
use crate::utils::{resolve_positions_to_exprs, transform_bottom_unnest};
357380

358381
#[test]
359-
fn test_recursive_transform_unnest() -> Result<()> {
382+
fn test_transform_bottom_unnest() -> Result<()> {
360383
let schema = Schema::new(vec![
361384
Field::new(
362385
"struct_col",
@@ -390,11 +413,11 @@ mod tests {
390413

391414
// unnest(struct_col)
392415
let original_expr = unnest(col("struct_col"));
393-
let transformed_exprs = recursive_transform_unnest(
416+
let transformed_exprs = transform_bottom_unnest(
394417
&input,
395418
&mut unnest_placeholder_columns,
396419
&mut inner_projection_exprs,
397-
original_expr,
420+
&original_expr,
398421
)?;
399422
assert_eq!(
400423
transformed_exprs,
@@ -413,11 +436,11 @@ mod tests {
413436

414437
// unnest(array_col) + 1
415438
let original_expr = unnest(col("array_col")).add(lit(1i64));
416-
let transformed_exprs = recursive_transform_unnest(
439+
let transformed_exprs = transform_bottom_unnest(
417440
&input,
418441
&mut unnest_placeholder_columns,
419442
&mut inner_projection_exprs,
420-
original_expr,
443+
&original_expr,
421444
)?;
422445
assert_eq!(
423446
unnest_placeholder_columns,
@@ -440,6 +463,62 @@ mod tests {
440463
]
441464
);
442465

466+
// a nested structure struct[[]]
467+
let schema = Schema::new(vec![
468+
Field::new(
469+
"struct_col", // {array_col: [1,2,3]}
470+
ArrowDataType::Struct(Fields::from(vec![Field::new(
471+
"matrix",
472+
ArrowDataType::List(Arc::new(Field::new(
473+
"matrix_row",
474+
ArrowDataType::List(Arc::new(Field::new(
475+
"item",
476+
ArrowDataType::Int64,
477+
true,
478+
))),
479+
true,
480+
))),
481+
true,
482+
)])),
483+
false,
484+
),
485+
Field::new("int_col", ArrowDataType::Int32, false),
486+
]);
487+
488+
let dfschema = DFSchema::try_from(schema)?;
489+
490+
let input = LogicalPlan::EmptyRelation(EmptyRelation {
491+
produce_one_row: false,
492+
schema: Arc::new(dfschema),
493+
});
494+
495+
let mut unnest_placeholder_columns = vec![];
496+
let mut inner_projection_exprs = vec![];
497+
498+
// An expr with multiple unnest
499+
let original_expr = unnest(unnest(col("struct_col").field("matrix")));
500+
let transformed_exprs = transform_bottom_unnest(
501+
&input,
502+
&mut unnest_placeholder_columns,
503+
&mut inner_projection_exprs,
504+
&original_expr,
505+
)?;
506+
// Only the innermost (bottom-most) unnest is transformed
507+
assert_eq!(
508+
transformed_exprs,
509+
vec![unnest(col("unnest(struct_col[matrix])"))]
510+
);
511+
assert_eq!(
512+
unnest_placeholder_columns,
513+
vec!["unnest(struct_col[matrix])"]
514+
);
515+
assert_eq!(
516+
inner_projection_exprs,
517+
vec![col("struct_col")
518+
.field("matrix")
519+
.alias("unnest(struct_col[matrix])"),]
520+
);
521+
443522
Ok(())
444523
}
445524

0 commit comments

Comments
 (0)