Skip to content

Commit aed84c2

Browse files
wiedld and alamb authored
Support Utf8View and BinaryView in substrait serialization. (#12199)
* feat(12118): logical plan support for Utf8View
* feat(12118): physical plan support for Utf8View
* feat(12118): logical plan support for BinaryView
* feat(12118): physical plan support for BinaryView
* refactor(12118): remove BinaryView work-arounds, now that upstream arrow changes are in

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent dd04929 commit aed84c2

File tree

6 files changed: +97 additions, -6 deletions

datafusion/substrait/src/logical_plan/consumer.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ use crate::variation_const::{
4242
DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF,
4343
DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
4444
INTERVAL_MONTH_DAY_NANO_TYPE_NAME, LARGE_CONTAINER_TYPE_VARIATION_REF,
45-
UNSIGNED_INTEGER_TYPE_VARIATION_REF,
45+
UNSIGNED_INTEGER_TYPE_VARIATION_REF, VIEW_CONTAINER_TYPE_VARIATION_REF,
4646
};
4747
#[allow(deprecated)]
4848
use crate::variation_const::{
@@ -1432,6 +1432,7 @@ fn from_substrait_type(
14321432
r#type::Kind::Binary(binary) => match binary.type_variation_reference {
14331433
DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Binary),
14341434
LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeBinary),
1435+
VIEW_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::BinaryView),
14351436
v => not_impl_err!(
14361437
"Unsupported Substrait type variation {v} of type {s_kind:?}"
14371438
),
@@ -1442,6 +1443,7 @@ fn from_substrait_type(
14421443
r#type::Kind::String(string) => match string.type_variation_reference {
14431444
DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Utf8),
14441445
LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeUtf8),
1446+
VIEW_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Utf8View),
14451447
v => not_impl_err!(
14461448
"Unsupported Substrait type variation {v} of type {s_kind:?}"
14471449
),
@@ -1759,6 +1761,7 @@ fn from_substrait_literal(
17591761
Some(LiteralType::String(s)) => match lit.type_variation_reference {
17601762
DEFAULT_CONTAINER_TYPE_VARIATION_REF => ScalarValue::Utf8(Some(s.clone())),
17611763
LARGE_CONTAINER_TYPE_VARIATION_REF => ScalarValue::LargeUtf8(Some(s.clone())),
1764+
VIEW_CONTAINER_TYPE_VARIATION_REF => ScalarValue::Utf8View(Some(s.clone())),
17621765
others => {
17631766
return substrait_err!("Unknown type variation reference {others}");
17641767
}
@@ -1768,6 +1771,7 @@ fn from_substrait_literal(
17681771
LARGE_CONTAINER_TYPE_VARIATION_REF => {
17691772
ScalarValue::LargeBinary(Some(b.clone()))
17701773
}
1774+
VIEW_CONTAINER_TYPE_VARIATION_REF => ScalarValue::BinaryView(Some(b.clone())),
17711775
others => {
17721776
return substrait_err!("Unknown type variation reference {others}");
17731777
}

datafusion/substrait/src/logical_plan/producer.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ use crate::variation_const::{
3737
DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF,
3838
DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
3939
INTERVAL_MONTH_DAY_NANO_TYPE_NAME, LARGE_CONTAINER_TYPE_VARIATION_REF,
40-
UNSIGNED_INTEGER_TYPE_VARIATION_REF,
40+
UNSIGNED_INTEGER_TYPE_VARIATION_REF, VIEW_CONTAINER_TYPE_VARIATION_REF,
4141
};
4242
use datafusion::arrow::array::{Array, GenericListArray, OffsetSizeTrait};
4343
use datafusion::common::{
@@ -1450,6 +1450,12 @@ fn to_substrait_type(
14501450
nullability,
14511451
})),
14521452
}),
1453+
DataType::BinaryView => Ok(substrait::proto::Type {
1454+
kind: Some(r#type::Kind::Binary(r#type::Binary {
1455+
type_variation_reference: VIEW_CONTAINER_TYPE_VARIATION_REF,
1456+
nullability,
1457+
})),
1458+
}),
14531459
DataType::Utf8 => Ok(substrait::proto::Type {
14541460
kind: Some(r#type::Kind::String(r#type::String {
14551461
type_variation_reference: DEFAULT_CONTAINER_TYPE_VARIATION_REF,
@@ -1462,6 +1468,12 @@ fn to_substrait_type(
14621468
nullability,
14631469
})),
14641470
}),
1471+
DataType::Utf8View => Ok(substrait::proto::Type {
1472+
kind: Some(r#type::Kind::String(r#type::String {
1473+
type_variation_reference: VIEW_CONTAINER_TYPE_VARIATION_REF,
1474+
nullability,
1475+
})),
1476+
}),
14651477
DataType::List(inner) => {
14661478
let inner_type =
14671479
to_substrait_type(inner.data_type(), inner.is_nullable(), extensions)?;
@@ -1902,6 +1914,10 @@ fn to_substrait_literal(
19021914
LiteralType::Binary(b.clone()),
19031915
LARGE_CONTAINER_TYPE_VARIATION_REF,
19041916
),
1917+
ScalarValue::BinaryView(Some(b)) => (
1918+
LiteralType::Binary(b.clone()),
1919+
VIEW_CONTAINER_TYPE_VARIATION_REF,
1920+
),
19051921
ScalarValue::FixedSizeBinary(_, Some(b)) => (
19061922
LiteralType::FixedBinary(b.clone()),
19071923
DEFAULT_TYPE_VARIATION_REF,
@@ -1914,6 +1930,10 @@ fn to_substrait_literal(
19141930
LiteralType::String(s.clone()),
19151931
LARGE_CONTAINER_TYPE_VARIATION_REF,
19161932
),
1933+
ScalarValue::Utf8View(Some(s)) => (
1934+
LiteralType::String(s.clone()),
1935+
VIEW_CONTAINER_TYPE_VARIATION_REF,
1936+
),
19171937
ScalarValue::Decimal128(v, p, s) if v.is_some() => (
19181938
LiteralType::Decimal(Decimal {
19191939
value: v.unwrap().to_le_bytes().to_vec(),
@@ -2335,8 +2355,10 @@ mod test {
23352355
round_trip_type(DataType::Binary)?;
23362356
round_trip_type(DataType::FixedSizeBinary(10))?;
23372357
round_trip_type(DataType::LargeBinary)?;
2358+
round_trip_type(DataType::BinaryView)?;
23382359
round_trip_type(DataType::Utf8)?;
23392360
round_trip_type(DataType::LargeUtf8)?;
2361+
round_trip_type(DataType::Utf8View)?;
23402362
round_trip_type(DataType::Decimal128(10, 2))?;
23412363
round_trip_type(DataType::Decimal256(30, 2))?;
23422364

datafusion/substrait/src/physical_plan/consumer.rs

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,11 @@ use substrait::proto::{
3737
expression::MaskExpression, read_rel::ReadType, rel::RelType, Rel,
3838
};
3939

40+
use crate::variation_const::{
41+
DEFAULT_CONTAINER_TYPE_VARIATION_REF, LARGE_CONTAINER_TYPE_VARIATION_REF,
42+
VIEW_CONTAINER_TYPE_VARIATION_REF,
43+
};
44+
4045
/// Convert Substrait Rel to DataFusion ExecutionPlan
4146
#[async_recursion]
4247
pub async fn from_substrait_rel(
@@ -177,7 +182,27 @@ fn to_field(name: &String, r#type: &Type) -> Result<Field> {
177182
}
178183
Kind::String(string) => {
179184
nullable = is_nullable(string.nullability);
180-
Ok(DataType::Utf8)
185+
match string.type_variation_reference {
186+
DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Utf8),
187+
LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeUtf8),
188+
VIEW_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Utf8View),
189+
_ => substrait_err!(
190+
"Invalid type variation found for substrait string type class: {}",
191+
string.type_variation_reference
192+
),
193+
}
194+
}
195+
Kind::Binary(binary) => {
196+
nullable = is_nullable(binary.nullability);
197+
match binary.type_variation_reference {
198+
DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Binary),
199+
LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeBinary),
200+
VIEW_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::BinaryView),
201+
_ => substrait_err!(
202+
"Invalid type variation found for substrait binary type class: {}",
203+
binary.type_variation_reference
204+
),
205+
}
181206
}
182207
_ => substrait_err!(
183208
"Unsupported kind: {:?} in the type with name {}",

datafusion/substrait/src/physical_plan/producer.rs

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use std::collections::HashMap;
2323
use substrait::proto::expression::mask_expression::{StructItem, StructSelect};
2424
use substrait::proto::expression::MaskExpression;
2525
use substrait::proto::r#type::{
26-
Boolean, Fp64, Kind, Nullability, String as SubstraitString, Struct, I64,
26+
Binary, Boolean, Fp64, Kind, Nullability, String as SubstraitString, Struct, I64,
2727
};
2828
use substrait::proto::read_rel::local_files::file_or_files::ParquetReadOptions;
2929
use substrait::proto::read_rel::local_files::file_or_files::{FileFormat, PathType};
@@ -35,6 +35,11 @@ use substrait::proto::ReadRel;
3535
use substrait::proto::Rel;
3636
use substrait::proto::{extensions, NamedStruct, Type};
3737

38+
use crate::variation_const::{
39+
DEFAULT_CONTAINER_TYPE_VARIATION_REF, LARGE_CONTAINER_TYPE_VARIATION_REF,
40+
VIEW_CONTAINER_TYPE_VARIATION_REF,
41+
};
42+
3843
/// Convert DataFusion ExecutionPlan to Substrait Rel
3944
pub fn to_substrait_rel(
4045
plan: &dyn ExecutionPlan,
@@ -155,7 +160,37 @@ fn to_substrait_type(data_type: &DataType, nullable: bool) -> Result<Type> {
155160
}),
156161
DataType::Utf8 => Ok(Type {
157162
kind: Some(Kind::String(SubstraitString {
158-
type_variation_reference: 0,
163+
type_variation_reference: DEFAULT_CONTAINER_TYPE_VARIATION_REF,
164+
nullability,
165+
})),
166+
}),
167+
DataType::LargeUtf8 => Ok(Type {
168+
kind: Some(Kind::String(SubstraitString {
169+
type_variation_reference: LARGE_CONTAINER_TYPE_VARIATION_REF,
170+
nullability,
171+
})),
172+
}),
173+
DataType::Utf8View => Ok(Type {
174+
kind: Some(Kind::String(SubstraitString {
175+
type_variation_reference: VIEW_CONTAINER_TYPE_VARIATION_REF,
176+
nullability,
177+
})),
178+
}),
179+
DataType::Binary => Ok(Type {
180+
kind: Some(Kind::Binary(Binary {
181+
type_variation_reference: DEFAULT_CONTAINER_TYPE_VARIATION_REF,
182+
nullability,
183+
})),
184+
}),
185+
DataType::LargeBinary => Ok(Type {
186+
kind: Some(Kind::Binary(Binary {
187+
type_variation_reference: LARGE_CONTAINER_TYPE_VARIATION_REF,
188+
nullability,
189+
})),
190+
}),
191+
DataType::BinaryView => Ok(Type {
192+
kind: Some(Kind::Binary(Binary {
193+
type_variation_reference: VIEW_CONTAINER_TYPE_VARIATION_REF,
159194
nullability,
160195
})),
161196
}),

datafusion/substrait/src/variation_const.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ pub const DATE_32_TYPE_VARIATION_REF: u32 = 0;
5252
pub const DATE_64_TYPE_VARIATION_REF: u32 = 1;
5353
pub const DEFAULT_CONTAINER_TYPE_VARIATION_REF: u32 = 0;
5454
pub const LARGE_CONTAINER_TYPE_VARIATION_REF: u32 = 1;
55+
pub const VIEW_CONTAINER_TYPE_VARIATION_REF: u32 = 2;
5556
pub const DECIMAL_128_TYPE_VARIATION_REF: u32 = 0;
5657
pub const DECIMAL_256_TYPE_VARIATION_REF: u32 = 1;
5758

datafusion/substrait/tests/cases/roundtrip_logical_plan.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -716,8 +716,10 @@ async fn all_type_literal() -> Result<()> {
716716
date32_col = arrow_cast('2020-01-01', 'Date32') AND
717717
binary_col = arrow_cast('binary', 'Binary') AND
718718
large_binary_col = arrow_cast('large_binary', 'LargeBinary') AND
719+
view_binary_col = arrow_cast('binary_view', 'BinaryView') AND
719720
utf8_col = arrow_cast('utf8', 'Utf8') AND
720-
large_utf8_col = arrow_cast('large_utf8', 'LargeUtf8');",
721+
large_utf8_col = arrow_cast('large_utf8', 'LargeUtf8') AND
722+
view_utf8_col = arrow_cast('utf8_view', 'Utf8View');",
721723
)
722724
.await
723725
}
@@ -1231,9 +1233,11 @@ async fn create_all_type_context() -> Result<SessionContext> {
12311233
Field::new("date64_col", DataType::Date64, true),
12321234
Field::new("binary_col", DataType::Binary, true),
12331235
Field::new("large_binary_col", DataType::LargeBinary, true),
1236+
Field::new("view_binary_col", DataType::BinaryView, true),
12341237
Field::new("fixed_size_binary_col", DataType::FixedSizeBinary(42), true),
12351238
Field::new("utf8_col", DataType::Utf8, true),
12361239
Field::new("large_utf8_col", DataType::LargeUtf8, true),
1240+
Field::new("view_utf8_col", DataType::Utf8View, true),
12371241
Field::new_list("list_col", Field::new("item", DataType::Int64, true), true),
12381242
Field::new_list(
12391243
"large_list_col",

0 commit comments

Comments
 (0)