Skip to content

Commit c7e5d8d

Browse files
duongcongtoai and alamb authored
Improve recursive unnest options API (#12836)
* refactor * refactor unnest options * more test * resolve comments * add back doc * fix proto * flaky test * clippy * use indexmap * chore: compile err * chore: update cargo * chore: fmt cargotoml --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 7a34147 commit c7e5d8d

File tree

17 files changed

+504
-529
lines changed

17 files changed

+504
-529
lines changed

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/common/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ pub use scalar::{ScalarType, ScalarValue};
7070
pub use schema_reference::SchemaReference;
7171
pub use stats::{ColumnStatistics, Statistics};
7272
pub use table_reference::{ResolvedTableReference, TableReference};
73-
pub use unnest::UnnestOptions;
73+
pub use unnest::{RecursionUnnestOption, UnnestOptions};
7474
pub use utils::project_schema;
7575

7676
// These are hidden from docs purely to avoid polluting the public view of what this crate exports.

datafusion/common/src/unnest.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
//! [`UnnestOptions`] for unnesting structured types
1919
20+
use crate::Column;
21+
2022
/// Options for unnesting a column that contains a list type,
2123
/// replicating values in the other, non nested rows.
2224
///
@@ -60,17 +62,35 @@
6062
/// └─────────┘ └─────┘ └─────────┘ └─────┘
6163
/// c1 c2 c1 c2
6264
/// ```
65+
///
66+
/// `recursions` instructs how a column should be unnested (e.g. unnesting a column multiple
67+
/// times, with depth = 1 and depth = 2). Any unnested column not mentioned inside this
68+
/// option is inferred to be unnested with depth = 1
6369
#[derive(Debug, Clone, PartialEq, PartialOrd, Hash, Eq)]
6470
pub struct UnnestOptions {
6571
/// Should nulls in the input be preserved? Defaults to true
6672
pub preserve_nulls: bool,
73+
/// If specific columns need to be unnested multiple times (e.g. at different depths),
74+
/// declare them here. Any unnested columns not mentioned inside this option
75+
/// will be unnested with depth = 1
76+
pub recursions: Vec<RecursionUnnestOption>,
77+
}
78+
79+
/// Instruction on how to unnest a column (mostly with a list type)
80+
/// such as how to name the output, and how many levels it should be unnested
81+
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)]
82+
pub struct RecursionUnnestOption {
83+
pub input_column: Column,
84+
pub output_column: Column,
85+
pub depth: usize,
6786
}
6887

6988
impl Default for UnnestOptions {
7089
fn default() -> Self {
7190
Self {
7291
// default to true to maintain backwards compatible behavior
7392
preserve_nulls: true,
93+
recursions: vec![],
7494
}
7595
}
7696
}
@@ -87,4 +107,10 @@ impl UnnestOptions {
87107
self.preserve_nulls = preserve_nulls;
88108
self
89109
}
110+
111+
/// Add a recursion instruction for the unnest operation (appended to any already present)
112+
pub fn with_recursions(mut self, recursion: RecursionUnnestOption) -> Self {
113+
self.recursions.push(recursion);
114+
self
115+
}
90116
}

datafusion/expr/src/logical_plan/builder.rs

Lines changed: 70 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ use crate::{
4444
TableProviderFilterPushDown, TableSource, WriteOp,
4545
};
4646

47+
use super::dml::InsertOp;
48+
use super::plan::ColumnUnnestList;
4749
use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
4850
use datafusion_common::display::ToStringifiedPlan;
4951
use datafusion_common::file_options::file_type::FileType;
@@ -54,9 +56,6 @@ use datafusion_common::{
5456
};
5557
use datafusion_expr_common::type_coercion::binary::type_union_resolution;
5658

57-
use super::dml::InsertOp;
58-
use super::plan::{ColumnUnnestList, ColumnUnnestType};
59-
6059
/// Default table name for unnamed table
6160
pub const UNNAMED_TABLE: &str = "?table?";
6261

@@ -1186,7 +1185,7 @@ impl LogicalPlanBuilder {
11861185
) -> Result<Self> {
11871186
unnest_with_options(
11881187
Arc::unwrap_or_clone(self.plan),
1189-
vec![(column.into(), ColumnUnnestType::Inferred)],
1188+
vec![column.into()],
11901189
options,
11911190
)
11921191
.map(Self::new)
@@ -1197,26 +1196,6 @@ impl LogicalPlanBuilder {
11971196
self,
11981197
columns: Vec<Column>,
11991198
options: UnnestOptions,
1200-
) -> Result<Self> {
1201-
unnest_with_options(
1202-
Arc::unwrap_or_clone(self.plan),
1203-
columns
1204-
.into_iter()
1205-
.map(|c| (c, ColumnUnnestType::Inferred))
1206-
.collect(),
1207-
options,
1208-
)
1209-
.map(Self::new)
1210-
}
1211-
1212-
/// Unnest the given columns with the given [`UnnestOptions`]
1213-
/// if one column is a list type, it can be recursively and simultaneously
1214-
/// unnested into the desired recursion levels
1215-
/// e.g select unnest(list_col,depth=1), unnest(list_col,depth=2)
1216-
pub fn unnest_columns_recursive_with_options(
1217-
self,
1218-
columns: Vec<(Column, ColumnUnnestType)>,
1219-
options: UnnestOptions,
12201199
) -> Result<Self> {
12211200
unnest_with_options(Arc::unwrap_or_clone(self.plan), columns, options)
12221201
.map(Self::new)
@@ -1594,14 +1573,12 @@ impl TableSource for LogicalTableSource {
15941573

15951574
/// Create a [`LogicalPlan::Unnest`] plan
15961575
pub fn unnest(input: LogicalPlan, columns: Vec<Column>) -> Result<LogicalPlan> {
1597-
let unnestings = columns
1598-
.into_iter()
1599-
.map(|c| (c, ColumnUnnestType::Inferred))
1600-
.collect();
1601-
unnest_with_options(input, unnestings, UnnestOptions::default())
1576+
unnest_with_options(input, columns, UnnestOptions::default())
16021577
}
16031578

1604-
pub fn get_unnested_list_datatype_recursive(
1579+
// Get the data type of a multi-dimensional type after unnesting it
1580+
// with a given depth
1581+
fn get_unnested_list_datatype_recursive(
16051582
data_type: &DataType,
16061583
depth: usize,
16071584
) -> Result<DataType> {
@@ -1620,27 +1597,6 @@ pub fn get_unnested_list_datatype_recursive(
16201597
internal_err!("trying to unnest on invalid data type {:?}", data_type)
16211598
}
16221599

1623-
/// Infer the unnest type based on the data type:
1624-
/// - list type: infer to unnest(list(col, depth=1))
1625-
/// - struct type: infer to unnest(struct)
1626-
fn infer_unnest_type(
1627-
col_name: &String,
1628-
data_type: &DataType,
1629-
) -> Result<ColumnUnnestType> {
1630-
match data_type {
1631-
DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) => {
1632-
Ok(ColumnUnnestType::List(vec![ColumnUnnestList {
1633-
output_column: Column::from_name(col_name),
1634-
depth: 1,
1635-
}]))
1636-
}
1637-
DataType::Struct(_) => Ok(ColumnUnnestType::Struct),
1638-
_ => {
1639-
internal_err!("trying to unnest on invalid data type {:?}", data_type)
1640-
}
1641-
}
1642-
}
1643-
16441600
pub fn get_struct_unnested_columns(
16451601
col_name: &String,
16461602
inner_fields: &Fields,
@@ -1729,20 +1685,15 @@ pub fn get_unnested_columns(
17291685
/// ```
17301686
pub fn unnest_with_options(
17311687
input: LogicalPlan,
1732-
columns_to_unnest: Vec<(Column, ColumnUnnestType)>,
1688+
columns_to_unnest: Vec<Column>,
17331689
options: UnnestOptions,
17341690
) -> Result<LogicalPlan> {
17351691
let mut list_columns: Vec<(usize, ColumnUnnestList)> = vec![];
17361692
let mut struct_columns = vec![];
17371693
let indices_to_unnest = columns_to_unnest
17381694
.iter()
1739-
.map(|col_unnesting| {
1740-
Ok((
1741-
input.schema().index_of_column(&col_unnesting.0)?,
1742-
col_unnesting,
1743-
))
1744-
})
1745-
.collect::<Result<HashMap<usize, &(Column, ColumnUnnestType)>>>()?;
1695+
.map(|c| Ok((input.schema().index_of_column(c)?, c)))
1696+
.collect::<Result<HashMap<usize, &Column>>>()?;
17461697

17471698
let input_schema = input.schema();
17481699

@@ -1767,51 +1718,59 @@ pub fn unnest_with_options(
17671718
.enumerate()
17681719
.map(|(index, (original_qualifier, original_field))| {
17691720
match indices_to_unnest.get(&index) {
1770-
Some((column_to_unnest, unnest_type)) => {
1771-
let mut inferred_unnest_type = unnest_type.clone();
1772-
if let ColumnUnnestType::Inferred = unnest_type {
1773-
inferred_unnest_type = infer_unnest_type(
1721+
Some(column_to_unnest) => {
1722+
let recursions_on_column = options
1723+
.recursions
1724+
.iter()
1725+
.filter(|p| -> bool { &p.input_column == *column_to_unnest })
1726+
.collect::<Vec<_>>();
1727+
let mut transformed_columns = recursions_on_column
1728+
.iter()
1729+
.map(|r| {
1730+
list_columns.push((
1731+
index,
1732+
ColumnUnnestList {
1733+
output_column: r.output_column.clone(),
1734+
depth: r.depth,
1735+
},
1736+
));
1737+
Ok(get_unnested_columns(
1738+
&r.output_column.name,
1739+
original_field.data_type(),
1740+
r.depth,
1741+
)?
1742+
.into_iter()
1743+
.next()
1744+
.unwrap()) // because unnesting a list column always result into one result
1745+
})
1746+
.collect::<Result<Vec<(Column, Arc<Field>)>>>()?;
1747+
if transformed_columns.is_empty() {
1748+
transformed_columns = get_unnested_columns(
17741749
&column_to_unnest.name,
17751750
original_field.data_type(),
1751+
1,
17761752
)?;
1777-
}
1778-
let transformed_columns: Vec<(Column, Arc<Field>)> =
1779-
match inferred_unnest_type {
1780-
ColumnUnnestType::Struct => {
1753+
match original_field.data_type() {
1754+
DataType::Struct(_) => {
17811755
struct_columns.push(index);
1782-
get_unnested_columns(
1783-
&column_to_unnest.name,
1784-
original_field.data_type(),
1785-
1,
1786-
)?
17871756
}
1788-
ColumnUnnestType::List(unnest_lists) => {
1789-
list_columns.extend(
1790-
unnest_lists
1791-
.iter()
1792-
.map(|ul| (index, ul.to_owned().clone())),
1793-
);
1794-
unnest_lists
1795-
.iter()
1796-
.map(
1797-
|ColumnUnnestList {
1798-
output_column,
1799-
depth,
1800-
}| {
1801-
get_unnested_columns(
1802-
&output_column.name,
1803-
original_field.data_type(),
1804-
*depth,
1805-
)
1806-
},
1807-
)
1808-
.collect::<Result<Vec<Vec<(Column, Arc<Field>)>>>>()?
1809-
.into_iter()
1810-
.flatten()
1811-
.collect::<Vec<_>>()
1757+
DataType::List(_)
1758+
| DataType::FixedSizeList(_, _)
1759+
| DataType::LargeList(_) => {
1760+
list_columns.push((
1761+
index,
1762+
ColumnUnnestList {
1763+
output_column: Column::from_name(
1764+
&column_to_unnest.name,
1765+
),
1766+
depth: 1,
1767+
},
1768+
));
18121769
}
1813-
_ => return internal_err!("Invalid unnest type"),
1770+
_ => {}
18141771
};
1772+
}
1773+
18151774
// new columns dependent on the same original index
18161775
dependency_indices
18171776
.extend(std::iter::repeat(index).take(transformed_columns.len()));
@@ -1860,7 +1819,7 @@ mod tests {
18601819
use crate::logical_plan::StringifiedPlan;
18611820
use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery};
18621821

1863-
use datafusion_common::SchemaError;
1822+
use datafusion_common::{RecursionUnnestOption, SchemaError};
18641823

18651824
#[test]
18661825
fn plan_builder_simple() -> Result<()> {
@@ -2268,24 +2227,19 @@ mod tests {
22682227

22692228
// Simultaneously unnesting a list (with different depth) and a struct column
22702229
let plan = nested_table_scan("test_table")?
2271-
.unnest_columns_recursive_with_options(
2272-
vec![
2273-
(
2274-
"stringss".into(),
2275-
ColumnUnnestType::List(vec![
2276-
ColumnUnnestList {
2277-
output_column: Column::from_name("stringss_depth_1"),
2278-
depth: 1,
2279-
},
2280-
ColumnUnnestList {
2281-
output_column: Column::from_name("stringss_depth_2"),
2282-
depth: 2,
2283-
},
2284-
]),
2285-
),
2286-
("struct_singular".into(), ColumnUnnestType::Inferred),
2287-
],
2288-
UnnestOptions::default(),
2230+
.unnest_columns_with_options(
2231+
vec!["stringss".into(), "struct_singular".into()],
2232+
UnnestOptions::default()
2233+
.with_recursions(RecursionUnnestOption {
2234+
input_column: "stringss".into(),
2235+
output_column: "stringss_depth_1".into(),
2236+
depth: 1,
2237+
})
2238+
.with_recursions(RecursionUnnestOption {
2239+
input_column: "stringss".into(),
2240+
output_column: "stringss_depth_2".into(),
2241+
depth: 2,
2242+
}),
22892243
)?
22902244
.build()?;
22912245

datafusion/expr/src/logical_plan/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ pub use ddl::{
3535
};
3636
pub use dml::{DmlStatement, WriteOp};
3737
pub use plan::{
38-
projection_schema, Aggregate, Analyze, ColumnUnnestList, ColumnUnnestType, CrossJoin,
39-
DescribeTable, Distinct, DistinctOn, EmptyRelation, Explain, Extension, Filter, Join,
38+
projection_schema, Aggregate, Analyze, ColumnUnnestList, CrossJoin, DescribeTable,
39+
Distinct, DistinctOn, EmptyRelation, Explain, Extension, Filter, Join,
4040
JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, Prepare,
4141
Projection, RecursiveQuery, Repartition, Sort, StringifiedPlan, Subquery,
4242
SubqueryAlias, TableScan, ToStringifiedPlan, Union, Unnest, Values, Window,

0 commit comments

Comments
 (0)