Skip to content

Commit 34ec9d4

Browse files
authored
Implement native stringview support for BTRIM (#11920)
* add utf8view support for general_trim
* add utf8view support for BTRIM
* stop LTRIM and RTRIM from complaining about general_trim missing args
* add tests to cover utf8view support of BTRIM
* fix typo and tiny error
* remove useless imports
1 parent b60cdc7 commit 34ec9d4

File tree

5 files changed

+131
-12
lines changed

5 files changed

+131
-12
lines changed

datafusion/functions/src/string/btrim.rs

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,8 @@
1616
// under the License.
1717

1818
use arrow::array::{ArrayRef, OffsetSizeTrait};
19-
use std::any::Any;
20-
2119
use arrow::datatypes::DataType;
20+
use std::any::Any;
2221

2322
use datafusion_common::{exec_err, Result};
2423
use datafusion_expr::function::Hint;
@@ -32,7 +31,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
3231
/// Returns the longest string with leading and trailing characters removed. If the characters are not specified, whitespace is removed.
3332
/// btrim('xyxtrimyyx', 'xyz') = 'trim'
3433
fn btrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
35-
general_trim::<T>(args, TrimType::Both)
34+
let use_string_view = args[0].data_type() == &DataType::Utf8View;
35+
general_trim::<T>(args, TrimType::Both, use_string_view)
3636
}
3737

3838
#[derive(Debug)]
@@ -52,7 +52,16 @@ impl BTrimFunc {
5252
use DataType::*;
5353
Self {
5454
signature: Signature::one_of(
55-
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
55+
vec![
56+
// Planner attempts coercion to the target type starting with the most preferred candidate.
57+
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
58+
// If that fails, it proceeds to `(Utf8, Utf8)`.
59+
Exact(vec![Utf8View, Utf8View]),
60+
// Exact(vec![Utf8, Utf8View]),
61+
Exact(vec![Utf8, Utf8]),
62+
Exact(vec![Utf8View]),
63+
Exact(vec![Utf8]),
64+
],
5665
Volatility::Immutable,
5766
),
5867
aliases: vec![String::from("trim")],
@@ -79,15 +88,18 @@ impl ScalarUDFImpl for BTrimFunc {
7988

8089
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
8190
match args[0].data_type() {
82-
DataType::Utf8 => make_scalar_function(
91+
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
8392
btrim::<i32>,
8493
vec![Hint::Pad, Hint::AcceptsSingular],
8594
)(args),
8695
DataType::LargeUtf8 => make_scalar_function(
8796
btrim::<i64>,
8897
vec![Hint::Pad, Hint::AcceptsSingular],
8998
)(args),
90-
other => exec_err!("Unsupported data type {other:?} for function btrim"),
99+
other => exec_err!(
100+
"Unsupported data type {other:?} for function btrim,\
101+
expected for Utf8, LargeUtf8 or Utf8View."
102+
),
91103
}
92104
}
93105

datafusion/functions/src/string/common.rs

Lines changed: 75 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use arrow::array::{
2525
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer};
2626
use arrow::datatypes::DataType;
2727

28-
use datafusion_common::cast::as_generic_string_array;
28+
use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
2929
use datafusion_common::Result;
3030
use datafusion_common::{exec_err, ScalarValue};
3131
use datafusion_expr::ColumnarValue;
@@ -49,6 +49,7 @@ impl Display for TrimType {
4949
pub(crate) fn general_trim<T: OffsetSizeTrait>(
5050
args: &[ArrayRef],
5151
trim_type: TrimType,
52+
use_string_view: bool,
5253
) -> Result<ArrayRef> {
5354
let func = match trim_type {
5455
TrimType::Left => |input, pattern: &str| {
@@ -68,6 +69,74 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
6869
},
6970
};
7071

72+
if use_string_view {
73+
string_view_trim::<T>(trim_type, func, args)
74+
} else {
75+
string_trim::<T>(trim_type, func, args)
76+
}
77+
}
78+
79+
// The explicit lifetime 'a is required: removing it makes the compiler complain about the lifetime of `func`.
80+
fn string_view_trim<'a, T: OffsetSizeTrait>(
81+
trim_type: TrimType,
82+
func: fn(&'a str, &'a str) -> &'a str,
83+
args: &'a [ArrayRef],
84+
) -> Result<ArrayRef> {
85+
let string_array = as_string_view_array(&args[0])?;
86+
87+
match args.len() {
88+
1 => {
89+
let result = string_array
90+
.iter()
91+
.map(|string| string.map(|string: &str| func(string, " ")))
92+
.collect::<GenericStringArray<T>>();
93+
94+
Ok(Arc::new(result) as ArrayRef)
95+
}
96+
2 => {
97+
let characters_array = as_string_view_array(&args[1])?;
98+
99+
if characters_array.len() == 1 {
100+
if characters_array.is_null(0) {
101+
return Ok(new_null_array(
102+
// The schema expects a Utf8 array here, so the all-null result is built as Utf8
103+
&DataType::Utf8,
104+
string_array.len(),
105+
));
106+
}
107+
108+
let characters = characters_array.value(0);
109+
let result = string_array
110+
.iter()
111+
.map(|item| item.map(|string| func(string, characters)))
112+
.collect::<GenericStringArray<T>>();
113+
return Ok(Arc::new(result) as ArrayRef);
114+
}
115+
116+
let result = string_array
117+
.iter()
118+
.zip(characters_array.iter())
119+
.map(|(string, characters)| match (string, characters) {
120+
(Some(string), Some(characters)) => Some(func(string, characters)),
121+
_ => None,
122+
})
123+
.collect::<GenericStringArray<T>>();
124+
125+
Ok(Arc::new(result) as ArrayRef)
126+
}
127+
other => {
128+
exec_err!(
129+
"{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
130+
)
131+
}
132+
}
133+
}
134+
135+
fn string_trim<'a, T: OffsetSizeTrait>(
136+
trim_type: TrimType,
137+
func: fn(&'a str, &'a str) -> &'a str,
138+
args: &'a [ArrayRef],
139+
) -> Result<ArrayRef> {
71140
let string_array = as_generic_string_array::<T>(&args[0])?;
72141

73142
match args.len() {
@@ -84,7 +153,10 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
84153

85154
if characters_array.len() == 1 {
86155
if characters_array.is_null(0) {
87-
return Ok(new_null_array(args[0].data_type(), args[0].len()));
156+
return Ok(new_null_array(
157+
string_array.data_type(),
158+
string_array.len(),
159+
));
88160
}
89161

90162
let characters = characters_array.value(0);
@@ -109,7 +181,7 @@ pub(crate) fn general_trim<T: OffsetSizeTrait>(
109181
other => {
110182
exec_err!(
111183
"{trim_type} was called with {other} arguments. It requires at least 1 and at most 2."
112-
)
184+
)
113185
}
114186
}
115187
}

datafusion/functions/src/string/ltrim.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
3232
/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed.
3333
/// ltrim('zzzytest', 'xyz') = 'test'
3434
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
35-
general_trim::<T>(args, TrimType::Left)
35+
general_trim::<T>(args, TrimType::Left, false)
3636
}
3737

3838
#[derive(Debug)]

datafusion/functions/src/string/rtrim.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
3232
/// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed.
3333
/// rtrim('testxxzx', 'xyz') = 'test'
3434
fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
35-
general_trim::<T>(args, TrimType::Right)
35+
general_trim::<T>(args, TrimType::Right, false)
3636
}
3737

3838
#[derive(Debug)]

datafusion/sqllogictest/test_files/string_view.slt

Lines changed: 36 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -563,15 +563,50 @@ SELECT
563563
228 0 NULL
564564

565565
## Ensure no casts for BTRIM
566+
# Test BTRIM with Utf8View input
567+
query TT
568+
EXPLAIN SELECT
569+
BTRIM(column1_utf8view) AS l
570+
FROM test;
571+
----
572+
logical_plan
573+
01)Projection: btrim(test.column1_utf8view) AS l
574+
02)--TableScan: test projection=[column1_utf8view]
575+
576+
# Test BTRIM with Utf8View input and Utf8View pattern
566577
query TT
567578
EXPLAIN SELECT
568579
BTRIM(column1_utf8view, 'foo') AS l
569580
FROM test;
570581
----
571582
logical_plan
572-
01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l
583+
01)Projection: btrim(test.column1_utf8view, Utf8View("foo")) AS l
584+
02)--TableScan: test projection=[column1_utf8view]
585+
586+
# Test BTRIM with Utf8View input and a pattern longer than 12 bytes
587+
query TT
588+
EXPLAIN SELECT
589+
BTRIM(column1_utf8view, 'this is longer than 12') AS l
590+
FROM test;
591+
----
592+
logical_plan
593+
01)Projection: btrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
573594
02)--TableScan: test projection=[column1_utf8view]
574595

596+
# Test BTRIM outputs
597+
query TTTT
598+
SELECT
599+
BTRIM(column1_utf8view, 'foo') AS l1,
600+
BTRIM(column1_utf8view, 'A') AS l2,
601+
BTRIM(column1_utf8view) AS l3,
602+
BTRIM(column1_utf8view, NULL) AS l4
603+
FROM test;
604+
----
605+
Andrew ndrew Andrew NULL
606+
Xiangpeng Xiangpeng Xiangpeng NULL
607+
Raphael Raphael Raphael NULL
608+
NULL NULL NULL NULL
609+
575610
## Ensure no casts for CHARACTER_LENGTH
576611
query TT
577612
EXPLAIN SELECT

0 commit comments

Comments
 (0)