Skip to content

Commit 8752e01

Browse files
XiangpengHao and alamb authored
Improve performance of casting StringView/BinaryView to DictionaryArray (#5872)
* zero-copy dict to view
* view to dict
* refactor to use try_append_view
* unchecked view
* make fmt happy
* update test
* add comments

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent c6359bf commit 8752e01

File tree

2 files changed

+104
-16
lines changed

2 files changed

+104
-16
lines changed

arrow-cast/src/cast/dictionary.rs

Lines changed: 80 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -188,10 +188,34 @@ pub(crate) fn cast_to_dictionary<K: ArrowDictionaryKeyType>(
188188
Decimal256(_, _) => {
189189
pack_numeric_to_dictionary::<K, Decimal256Type>(array, dict_value_type, cast_options)
190190
}
191-
Utf8 => pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options),
192-
LargeUtf8 => pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options),
193-
Binary => pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options),
194-
LargeBinary => pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options),
191+
Utf8 => {
192+
// If the input is a view type, we can avoid casting (thus copying) the data
193+
if array.data_type() == &DataType::Utf8View {
194+
return string_view_to_dictionary::<K, i32>(array);
195+
}
196+
pack_byte_to_dictionary::<K, GenericStringType<i32>>(array, cast_options)
197+
}
198+
LargeUtf8 => {
199+
// If the input is a view type, we can avoid casting (thus copying) the data
200+
if array.data_type() == &DataType::Utf8View {
201+
return string_view_to_dictionary::<K, i64>(array);
202+
}
203+
pack_byte_to_dictionary::<K, GenericStringType<i64>>(array, cast_options)
204+
}
205+
Binary => {
206+
// If the input is a view type, we can avoid casting (thus copying) the data
207+
if array.data_type() == &DataType::BinaryView {
208+
return binary_view_to_dictionary::<K, i32>(array);
209+
}
210+
pack_byte_to_dictionary::<K, GenericBinaryType<i32>>(array, cast_options)
211+
}
212+
LargeBinary => {
213+
// If the input is a view type, we can avoid casting (thus copying) the data
214+
if array.data_type() == &DataType::BinaryView {
215+
return binary_view_to_dictionary::<K, i64>(array);
216+
}
217+
pack_byte_to_dictionary::<K, GenericBinaryType<i64>>(array, cast_options)
218+
}
195219
_ => Err(ArrowError::CastError(format!(
196220
"Unsupported output type for dictionary packing: {dict_value_type:?}"
197221
))),
@@ -226,6 +250,58 @@ where
226250
Ok(Arc::new(b.finish()))
227251
}
228252

253+
pub(crate) fn string_view_to_dictionary<K, O: OffsetSizeTrait>(
254+
array: &dyn Array,
255+
) -> Result<ArrayRef, ArrowError>
256+
where
257+
K: ArrowDictionaryKeyType,
258+
{
259+
let mut b = GenericByteDictionaryBuilder::<K, GenericStringType<O>>::with_capacity(
260+
array.len(),
261+
1024,
262+
1024,
263+
);
264+
let string_view = array.as_any().downcast_ref::<StringViewArray>().unwrap();
265+
for v in string_view.iter() {
266+
match v {
267+
Some(v) => {
268+
b.append(v)?;
269+
}
270+
None => {
271+
b.append_null();
272+
}
273+
}
274+
}
275+
276+
Ok(Arc::new(b.finish()))
277+
}
278+
279+
pub(crate) fn binary_view_to_dictionary<K, O: OffsetSizeTrait>(
280+
array: &dyn Array,
281+
) -> Result<ArrayRef, ArrowError>
282+
where
283+
K: ArrowDictionaryKeyType,
284+
{
285+
let mut b = GenericByteDictionaryBuilder::<K, GenericBinaryType<O>>::with_capacity(
286+
array.len(),
287+
1024,
288+
1024,
289+
);
290+
let binary_view = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
291+
for v in binary_view.iter() {
292+
match v {
293+
Some(v) => {
294+
b.append(v)?;
295+
}
296+
None => {
297+
b.append_null();
298+
}
299+
}
300+
}
301+
302+
Ok(Arc::new(b.finish()))
303+
}
304+
229305
// Packs the data as a GenericByteDictionaryBuilder, if possible, with the
230306
// key types of K
231307
pub(crate) fn pack_byte_to_dictionary<K, T>(

arrow-cast/src/cast/mod.rs

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5205,10 +5205,10 @@ mod tests {
52055205

52065206
const VIEW_TEST_DATA: [Option<&str>; 5] = [
52075207
Some("hello"),
5208-
Some("world"),
5208+
Some("repeated"),
52095209
None,
52105210
Some("large payload over 12 bytes"),
5211-
Some("lulu"),
5211+
Some("repeated"),
52125212
];
52135213

52145214
fn _test_string_to_view<O>()
@@ -5291,6 +5291,26 @@ mod tests {
52915291
assert_eq!(casted_binary_array.as_ref(), &binary_view_array);
52925292
}
52935293

5294+
#[test]
fn test_view_to_dict() {
    // --- Utf8View -> Dictionary<Int8, Utf8> ---
    // The expected dictionary is collected straight from the shared test data.
    let expected_string_dict: DictionaryArray<Int8Type> = VIEW_TEST_DATA.into_iter().collect();
    let string_dict_type = expected_string_dict.data_type();

    let string_view_input = StringViewArray::from_iter(VIEW_TEST_DATA);
    let actual_string_dict = cast(&string_view_input, string_dict_type).unwrap();
    assert_eq!(actual_string_dict.data_type(), string_dict_type);
    assert_eq!(actual_string_dict.as_ref(), &expected_string_dict);

    // --- BinaryView -> Dictionary<Int8, Binary> ---
    // The expected dictionary reuses the string dictionary's keys and casts
    // its values to Binary.
    let typed_string_dict = expected_string_dict.downcast_dict::<StringArray>().unwrap();
    let binary_values = cast(&typed_string_dict.values(), &DataType::Binary).unwrap();
    let expected_binary_dict =
        DictionaryArray::<Int8Type>::new(typed_string_dict.keys().clone(), binary_values);
    let binary_dict_type = expected_binary_dict.data_type();

    let binary_view_input = BinaryViewArray::from_iter(VIEW_TEST_DATA);
    let actual_binary_dict = cast(&binary_view_input, binary_dict_type).unwrap();
    assert_eq!(actual_binary_dict.data_type(), binary_dict_type);
    assert_eq!(actual_binary_dict.as_ref(), &expected_binary_dict);
}
5313+
52945314
#[test]
52955315
fn test_view_to_string() {
52965316
_test_view_to_string::<i32>();
@@ -5330,23 +5350,15 @@ mod tests {
53305350
where
53315351
O: OffsetSizeTrait,
53325352
{
5333-
let data: Vec<Option<&[u8]>> = vec![
5334-
Some(b"hello"),
5335-
Some(b"world"),
5336-
None,
5337-
Some(b"large payload over 12 bytes"),
5338-
Some(b"lulu"),
5339-
];
5340-
53415353
let view_array = {
53425354
let mut builder = BinaryViewBuilder::new().with_block_size(8); // multiple buffers.
5343-
for s in data.iter() {
5355+
for s in VIEW_TEST_DATA.iter() {
53445356
builder.append_option(*s);
53455357
}
53465358
builder.finish()
53475359
};
53485360

5349-
let expected_binary_array = GenericBinaryArray::<O>::from(data);
5361+
let expected_binary_array = GenericBinaryArray::<O>::from_iter(VIEW_TEST_DATA);
53505362
let expected_type = expected_binary_array.data_type();
53515363

53525364
assert!(can_cast_types(view_array.data_type(), expected_type));

0 commit comments

Comments
 (0)