Skip to content

Commit 6b73c4f

Browse files
authored
feat/11953: Support StringView for TRANSLATE() fn (#11967)
* feat/11953: Support StringView for TRANSLATE() fn
  Signed-off-by: Devan <[email protected]>
* formatting
  Signed-off-by: Devan <[email protected]>
* fixes internal error for GenericByteArray cast
  Signed-off-by: Devan <[email protected]>
* adds additional TRANSLATE test
  Signed-off-by: Devan <[email protected]>
* adds additional TRANSLATE test
  Signed-off-by: Devan <[email protected]>
* rm unnecessary generic
  Signed-off-by: Devan <[email protected]>
* cleanup + fix typo
  Signed-off-by: Devan <[email protected]>
* cleanup + fix typo
  Signed-off-by: Devan <[email protected]>
* adds some additional testing to sqllogictests for TRANSLATE string_view
  Signed-off-by: Devan <[email protected]>
---------
Signed-off-by: Devan <[email protected]>
1 parent 41f6dd9 commit 6b73c4f

File tree

2 files changed

+88
-22
lines changed

2 files changed

+88
-22
lines changed

datafusion/functions/src/unicode/translate.rs

Lines changed: 50 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,18 @@
1818
use std::any::Any;
1919
use std::sync::Arc;
2020

21-
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait};
21+
use arrow::array::{
22+
ArrayAccessor, ArrayIter, ArrayRef, AsArray, GenericStringArray, OffsetSizeTrait,
23+
};
2224
use arrow::datatypes::DataType;
2325
use hashbrown::HashMap;
2426
use unicode_segmentation::UnicodeSegmentation;
2527

26-
use datafusion_common::cast::as_generic_string_array;
28+
use crate::utils::{make_scalar_function, utf8_to_str_type};
2729
use datafusion_common::{exec_err, Result};
2830
use datafusion_expr::TypeSignature::Exact;
2931
use datafusion_expr::{ColumnarValue, ScalarUDFImpl, Signature, Volatility};
3032

31-
use crate::utils::{make_scalar_function, utf8_to_str_type};
32-
3333
#[derive(Debug)]
3434
pub struct TranslateFunc {
3535
signature: Signature,
@@ -46,7 +46,10 @@ impl TranslateFunc {
4646
use DataType::*;
4747
Self {
4848
signature: Signature::one_of(
49-
vec![Exact(vec![Utf8, Utf8, Utf8])],
49+
vec![
50+
Exact(vec![Utf8View, Utf8, Utf8]),
51+
Exact(vec![Utf8, Utf8, Utf8]),
52+
],
5053
Volatility::Immutable,
5154
),
5255
}
@@ -71,27 +74,54 @@ impl ScalarUDFImpl for TranslateFunc {
7174
}
7275

7376
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
74-
match args[0].data_type() {
75-
DataType::Utf8 => make_scalar_function(translate::<i32>, vec![])(args),
76-
DataType::LargeUtf8 => make_scalar_function(translate::<i64>, vec![])(args),
77-
other => {
78-
exec_err!("Unsupported data type {other:?} for function translate")
79-
}
77+
make_scalar_function(invoke_translate, vec![])(args)
78+
}
79+
}
80+
81+
fn invoke_translate(args: &[ArrayRef]) -> Result<ArrayRef> {
82+
match args[0].data_type() {
83+
DataType::Utf8View => {
84+
let string_array = args[0].as_string_view();
85+
let from_array = args[1].as_string::<i32>();
86+
let to_array = args[2].as_string::<i32>();
87+
translate::<i32, _, _>(string_array, from_array, to_array)
88+
}
89+
DataType::Utf8 => {
90+
let string_array = args[0].as_string::<i32>();
91+
let from_array = args[1].as_string::<i32>();
92+
let to_array = args[2].as_string::<i32>();
93+
translate::<i32, _, _>(string_array, from_array, to_array)
94+
}
95+
DataType::LargeUtf8 => {
96+
let string_array = args[0].as_string::<i64>();
97+
let from_array = args[1].as_string::<i64>();
98+
let to_array = args[2].as_string::<i64>();
99+
translate::<i64, _, _>(string_array, from_array, to_array)
100+
}
101+
other => {
102+
exec_err!("Unsupported data type {other:?} for function translate")
80103
}
81104
}
82105
}
83106

84107
/// Replaces each character in string that matches a character in the from set with the corresponding character in the to set. If from is longer than to, occurrences of the extra characters in from are deleted.
85108
/// translate('12345', '143', 'ax') = 'a2x5'
86-
fn translate<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
87-
let string_array = as_generic_string_array::<T>(&args[0])?;
88-
let from_array = as_generic_string_array::<T>(&args[1])?;
89-
let to_array = as_generic_string_array::<T>(&args[2])?;
90-
91-
let result = string_array
92-
.iter()
93-
.zip(from_array.iter())
94-
.zip(to_array.iter())
109+
fn translate<'a, T: OffsetSizeTrait, V, B>(
110+
string_array: V,
111+
from_array: B,
112+
to_array: B,
113+
) -> Result<ArrayRef>
114+
where
115+
V: ArrayAccessor<Item = &'a str>,
116+
B: ArrayAccessor<Item = &'a str>,
117+
{
118+
let string_array_iter = ArrayIter::new(string_array);
119+
let from_array_iter = ArrayIter::new(from_array);
120+
let to_array_iter = ArrayIter::new(to_array);
121+
122+
let result = string_array_iter
123+
.zip(from_array_iter)
124+
.zip(to_array_iter)
95125
.map(|((string, from), to)| match (string, from, to) {
96126
(Some(string), Some(from), Some(to)) => {
97127
// create a hashmap of [char, index] to change from O(n) to O(1) for from list

datafusion/sqllogictest/test_files/string_view.slt

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,43 @@ logical_plan
425425
01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4
426426
02)--TableScan: test projection=[column1_utf8view]
427427

428+
### Test TRANSLATE
429+
430+
# Should run TRANSLATE using utf8view column successfully
431+
query T
432+
SELECT
433+
TRANSLATE(column1_utf8view, 'foo', 'bar') as c
434+
FROM test;
435+
----
436+
Andrew
437+
Xiangpeng
438+
Raphael
439+
NULL
440+
441+
# Should run TRANSLATE using utf8 column successfully
442+
query T
443+
SELECT
444+
TRANSLATE(column1_utf8, 'foo', 'bar') as c
445+
FROM test;
446+
----
447+
Andrew
448+
Xiangpeng
449+
Raphael
450+
NULL
451+
452+
# Should run TRANSLATE using large_utf8 column successfully
453+
query T
454+
SELECT
455+
TRANSLATE(column1_large_utf8, 'foo', 'bar') as c
456+
FROM test;
457+
----
458+
Andrew
459+
Xiangpeng
460+
Raphael
461+
NULL
462+
463+
464+
428465
### Initcap
429466

430467
query TT
@@ -1047,14 +1084,13 @@ logical_plan
10471084
02)--TableScan: test projection=[column1_utf8view, column2_utf8view]
10481085

10491086
## Ensure no casts for TRANSLATE
1050-
## TODO file ticket
10511087
query TT
10521088
EXPLAIN SELECT
10531089
TRANSLATE(column1_utf8view, 'foo', 'bar') as c
10541090
FROM test;
10551091
----
10561092
logical_plan
1057-
01)Projection: translate(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Utf8("bar")) AS c
1093+
01)Projection: translate(test.column1_utf8view, Utf8("foo"), Utf8("bar")) AS c
10581094
02)--TableScan: test projection=[column1_utf8view]
10591095

10601096
## Ensure no casts for FIND_IN_SET

0 commit comments

Comments
 (0)