-
Notifications
You must be signed in to change notification settings - Fork 929
Improve concat
performance, and add append_array
for some array builder implementations
#7309
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
7245e26
5f4f91a
3c42e72
5e5be05
86e9ba1
060345d
55a1001
381ee83
2cab672
55df249
f9dd148
6b24c61
c64520e
0757bbb
1978d98
85e1416
aa1fdd4
70642d6
6ff3947
597112d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,7 +17,7 @@ | |
|
||
use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder}; | ||
use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType}; | ||
use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait}; | ||
use crate::{Array, ArrayRef, GenericByteArray, OffsetSizeTrait}; | ||
use arrow_buffer::NullBufferBuilder; | ||
use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer}; | ||
use arrow_data::ArrayDataBuilder; | ||
|
@@ -129,6 +129,48 @@ impl<T: ByteArrayType> GenericByteBuilder<T> { | |
self.offsets_builder.append(self.next_offset()); | ||
} | ||
|
||
/// Appends array values and null to this builder as is | ||
/// (this means that underlying null values are copied as is). | ||
#[inline] | ||
pub fn append_array(&mut self, array: &GenericByteArray<T>) { | ||
if array.len() == 0 { | ||
return; | ||
} | ||
|
||
let offsets = array.offsets(); | ||
|
||
// If the offsets are contiguous, we can append them directly avoiding the need to align | ||
// for example, when the first appended array is not sliced (starts at offset 0) | ||
if self.next_offset() == offsets[0] { | ||
self.offsets_builder.append_slice(&offsets[1..]); | ||
} else { | ||
// Shifting all the offsets | ||
let shift: T::Offset = self.next_offset() - offsets[0]; | ||
|
||
// Creating intermediate offsets instead of pushing each offset is faster | ||
// (even if we make MutableBuffer to avoid updating length on each push | ||
// and reserve the necessary capacity, it's still slower) | ||
let mut intermediate = Vec::with_capacity(offsets.len() - 1); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🤔 maybe we need something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think many of these instances could be changed to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Filed follow on |
||
|
||
for &offset in &offsets[1..] { | ||
intermediate.push(offset + shift) | ||
} | ||
|
||
self.offsets_builder.append_slice(&intermediate); | ||
} | ||
|
||
// Append underlying values, starting from the first offset and ending at the last offset | ||
self.value_builder.append_slice( | ||
&array.values().as_slice()[offsets[0].as_usize()..offsets[array.len()].as_usize()], | ||
); | ||
|
||
if let Some(null_buffer) = array.nulls() { | ||
self.null_buffer_builder.append_buffer(null_buffer); | ||
} else { | ||
self.null_buffer_builder.append_n_non_nulls(array.len()); | ||
} | ||
} | ||
|
||
/// Builds the [`GenericByteArray`] and reset this builder. | ||
pub fn finish(&mut self) -> GenericByteArray<T> { | ||
let array_type = T::DATA_TYPE; | ||
|
@@ -358,6 +400,7 @@ mod tests { | |
use super::*; | ||
use crate::array::Array; | ||
use crate::GenericStringArray; | ||
use arrow_buffer::NullBuffer; | ||
use std::fmt::Write as _; | ||
use std::io::Write as _; | ||
|
||
|
@@ -593,4 +636,178 @@ mod tests { | |
&["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()] | ||
) | ||
} | ||
|
||
#[test] | ||
fn test_append_array_without_nulls() { | ||
let input = vec![ | ||
"hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", | ||
"thank", "you", "for", "asking", | ||
]; | ||
let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); | ||
let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); | ||
let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); | ||
|
||
let mut builder = GenericStringBuilder::<i32>::new(); | ||
builder.append_array(&arr1); | ||
builder.append_array(&arr2); | ||
builder.append_array(&arr3); | ||
|
||
let actual = builder.finish(); | ||
let expected = GenericStringArray::<i32>::from(input); | ||
|
||
assert_eq!(actual, expected); | ||
} | ||
|
||
#[test] | ||
fn test_append_array_with_nulls() { | ||
let input = vec![ | ||
Some("hello"), | ||
None, | ||
Some("how"), | ||
None, | ||
None, | ||
None, | ||
None, | ||
Some("I"), | ||
Some("am"), | ||
Some("doing"), | ||
Some("well"), | ||
]; | ||
let arr1 = GenericStringArray::<i32>::from(input[..3].to_vec()); | ||
let arr2 = GenericStringArray::<i32>::from(input[3..7].to_vec()); | ||
let arr3 = GenericStringArray::<i32>::from(input[7..].to_vec()); | ||
|
||
let mut builder = GenericStringBuilder::<i32>::new(); | ||
builder.append_array(&arr1); | ||
builder.append_array(&arr2); | ||
builder.append_array(&arr3); | ||
|
||
let actual = builder.finish(); | ||
let expected = GenericStringArray::<i32>::from(input); | ||
|
||
assert_eq!(actual, expected); | ||
} | ||
|
||
#[test] | ||
fn test_append_empty_array() { | ||
rluvaton marked this conversation as resolved.
Show resolved
Hide resolved
|
||
let arr = GenericStringArray::<i32>::from(Vec::<&str>::new()); | ||
let mut builder = GenericStringBuilder::<i32>::new(); | ||
builder.append_array(&arr); | ||
let result = builder.finish(); | ||
assert_eq!(result.len(), 0); | ||
} | ||
|
||
#[test] | ||
fn test_append_array_with_offset_not_starting_at_0() { | ||
let input = vec![ | ||
Some("hello"), | ||
None, | ||
Some("how"), | ||
None, | ||
None, | ||
None, | ||
None, | ||
Some("I"), | ||
Some("am"), | ||
Some("doing"), | ||
Some("well"), | ||
]; | ||
let full_array = GenericStringArray::<i32>::from(input); | ||
let sliced = full_array.slice(1, 4); | ||
|
||
assert_ne!(sliced.offsets()[0].as_usize(), 0); | ||
assert_ne!(sliced.offsets().last(), full_array.offsets().last()); | ||
|
||
let mut builder = GenericStringBuilder::<i32>::new(); | ||
builder.append_array(&sliced); | ||
let actual = builder.finish(); | ||
|
||
let expected = GenericStringArray::<i32>::from(vec![None, Some("how"), None, None]); | ||
|
||
assert_eq!(actual, expected); | ||
} | ||
|
||
#[test] | ||
fn test_append_underlying_null_values_added_as_is() { | ||
let input_1_array_with_nulls = { | ||
let input = vec![ | ||
"hello", "world", "how", "are", "you", "doing", "today", "I", "am", | ||
]; | ||
let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts(); | ||
|
||
GenericStringArray::<i32>::new( | ||
offsets, | ||
buffer, | ||
Some(NullBuffer::from(&[ | ||
true, false, true, false, false, true, true, true, false, | ||
])), | ||
) | ||
}; | ||
let input_2_array_with_nulls = { | ||
let input = vec!["doing", "well", "thank", "you", "for", "asking"]; | ||
let (offsets, buffer, _) = GenericStringArray::<i32>::from(input).into_parts(); | ||
|
||
GenericStringArray::<i32>::new( | ||
offsets, | ||
buffer, | ||
Some(NullBuffer::from(&[false, false, true, false, true, true])), | ||
) | ||
}; | ||
|
||
let mut builder = GenericStringBuilder::<i32>::new(); | ||
builder.append_array(&input_1_array_with_nulls); | ||
builder.append_array(&input_2_array_with_nulls); | ||
|
||
let actual = builder.finish(); | ||
let expected = GenericStringArray::<i32>::from(vec![ | ||
Some("hello"), | ||
None, // world | ||
Some("how"), | ||
None, // are | ||
None, // you | ||
Some("doing"), | ||
Some("today"), | ||
Some("I"), | ||
None, // am | ||
None, // doing | ||
None, // well | ||
Some("thank"), | ||
None, // "you", | ||
Some("for"), | ||
Some("asking"), | ||
]); | ||
|
||
assert_eq!(actual, expected); | ||
|
||
let expected_underlying_buffer = Buffer::from( | ||
[ | ||
"hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", | ||
"well", "thank", "you", "for", "asking", | ||
] | ||
.join("") | ||
.as_bytes(), | ||
); | ||
assert_eq!(actual.values(), &expected_underlying_buffer); | ||
} | ||
|
||
#[test] | ||
fn append_array_with_continues_indices() { | ||
let input = vec![ | ||
"hello", "world", "how", "are", "you", "doing", "today", "I", "am", "doing", "well", | ||
"thank", "you", "for", "asking", | ||
]; | ||
let full_array = GenericStringArray::<i32>::from(input); | ||
let slice1 = full_array.slice(0, 3); | ||
let slice2 = full_array.slice(3, 4); | ||
let slice3 = full_array.slice(7, full_array.len() - 7); | ||
|
||
let mut builder = GenericStringBuilder::<i32>::new(); | ||
builder.append_array(&slice1); | ||
builder.append_array(&slice2); | ||
builder.append_array(&slice3); | ||
|
||
let actual = builder.finish(); | ||
|
||
assert_eq!(actual, full_array); | ||
} | ||
} |
Uh oh!
There was an error while loading. Please reload this page.