Skip to content

Commit ffbf53b

Browse files
committed
Encapsulate View manipulation
1 parent 520ad68 commit ffbf53b

File tree

5 files changed

+790
-88
lines changed

5 files changed

+790
-88
lines changed

arrow-array/src/array/byte_view_array.rs

Lines changed: 163 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -22,52 +22,36 @@ use crate::types::bytes::ByteArrayNativeType;
2222
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
2323
use crate::{Array, ArrayAccessor, ArrayRef};
2424
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
25-
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
25+
use arrow_data::{ArrayData, ArrayDataBuilder, OffsetView, View};
2626
use arrow_schema::{ArrowError, DataType};
2727
use std::any::Any;
2828
use std::fmt::Debug;
2929
use std::marker::PhantomData;
3030
use std::sync::Arc;
3131

32-
/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
33-
///
34-
/// Different than [`crate::GenericByteArray`] as it stores both an offset and length
35-
/// meaning that take / filter operations can be implemented without copying the underlying data.
36-
///
37-
/// See [`StringViewArray`] for storing utf8 encoded string data and
38-
/// [`BinaryViewArray`] for storing bytes.
39-
///
40-
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
32+
/// [Variable-size Binary View Layout]: An array of variable length byte strings.
4133
///
4234
/// A `GenericByteViewArray` stores variable length byte strings. An array of
43-
/// `N` elements is stored as `N` fixed length "views" and a variable number
35+
/// `N` elements is stored as `N` fixed length [`View`]s and some number
4436
/// of variable length "buffers".
4537
///
46-
/// Each view is a `u128` value layout is different depending on the
47-
/// length of the string stored at that location:
38+
/// There are no constraints on offsets other than they must point into a valid
39+
/// buffer. The offsets can be out of order, non-contiguous and overlapping.
4840
///
49-
/// ```text
50-
/// ┌──────┬────────────────────────┐
51-
/// │length│ string value │
52-
/// Strings (len <= 12) │ │ (padded with 0) │
53-
/// └──────┴────────────────────────┘
54-
/// 0 31 127
55-
///
56-
/// ┌───────┬───────┬───────┬───────┐
57-
/// │length │prefix │ buf │offset │
58-
/// Strings (len > 12) │ │ │ index │ │
59-
/// └───────┴───────┴───────┴───────┘
60-
/// 0 31 63 95 127
61-
/// ```
41+
/// Because `GenericByteViewArray` stores both an offset and length for each
42+
/// byte string, certain operations such as `take` and `filter` can be
43+
/// implemented without copying the underlying data, unlike
44+
/// [`GenericByteArray`], which requires the variable length data to be
45+
/// contiguous.
6246
///
63-
/// * Strings with length <= 12 are stored directly in the view.
47+
/// # See Also:
48+
/// * [`StringViewArray`] for storing UTF-8 string data
49+
/// * [`BinaryViewArray`] for storing bytes
50+
/// * [`View`] for the format of the views and interpreting the `u128` views
6451
///
65-
/// * Strings with length > 12: The first four bytes are stored inline in the
66-
/// view and the entire string is stored in one of the buffers.
52+
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
6753
///
68-
/// Unlike [`GenericByteArray`], there are no constraints on the offsets other
69-
/// than they must point into a valid buffer. However, they can be out of order,
70-
/// non continuous and overlapping.
54+
/// # Example
7155
///
7256
/// For example, in the following diagram, the strings "FishWasInTownToday" and
7357
/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
@@ -93,6 +77,7 @@ use std::sync::Arc;
9377
/// └───┘
9478
/// ```
9579
/// [`GenericByteArray`]: crate::array::GenericByteArray
80+
/// [`View`]: arrow_data::View
9681
pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
9782
data_type: DataType,
9883
views: ScalarBuffer<u128>,
@@ -114,16 +99,26 @@ impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
11499
}
115100

116101
impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
117-
/// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
102+
/// Create a new [`GenericByteViewArray`] from the provided parts, panicking
103+
/// on failure.
118104
///
119-
/// # Panics
105+
/// See [Self::try_new] for parameters
120106
///
107+
/// # Panics
121108
/// Panics if [`GenericByteViewArray::try_new`] returns an error
109+
///
110+
/// [`View`]: arrow_data::View
122111
pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
123112
Self::try_new(views, buffers, nulls).unwrap()
124113
}
125114

126-
/// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
115+
/// Create a new [`GenericByteViewArray`] from the provided parts, returning
116+
/// an error on failure
117+
///
118+
/// # Parameters
119+
/// * `views`: a [`ScalarBuffer`] of u128 views (see [`View`] for format)
120+
/// * `buffers`: a vector of [`Buffer`]s storing the string data
121+
/// * `nulls`: an optional [`NullBuffer`] for null values
127122
///
128123
/// # Errors
129124
///
@@ -156,7 +151,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
156151
})
157152
}
158153

159-
/// Create a new [`GenericByteViewArray`] from the provided parts, without validation
154+
/// Create a new [`GenericByteViewArray`] from the provided parts, without
155+
/// validation
156+
///
157+
/// See [Self::try_new] for parameters
160158
///
161159
/// # Safety
162160
///
@@ -233,20 +231,68 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
233231
}
234232

235233
/// Returns the element at index `i`
234+
///
236235
/// # Safety
237236
/// Caller is responsible for ensuring that the index is within the bounds of the array
238237
pub unsafe fn value_unchecked(&self, idx: usize) -> &T::Native {
239238
let v = self.views.get_unchecked(idx);
240-
let len = *v as u32;
241-
let b = if len <= 12 {
242-
let ptr = self.views.as_ptr() as *const u8;
243-
std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize)
244-
} else {
245-
let view = ByteView::from(*v);
246-
let data = self.buffers.get_unchecked(view.buffer_index as usize);
247-
let offset = view.offset as usize;
248-
data.get_unchecked(offset..offset + len as usize)
249-
};
239+
match View::from(v) {
240+
View::Inline(inline_view) => {
241+
let bytes = inline_view.get_bytes_unchecked(v);
242+
T::Native::from_bytes_unchecked(bytes)
243+
}
244+
View::Offset(offset_view) => self.value_from_offset_view_unchecked(offset_view),
245+
}
246+
}
247+
248+
/// Return the value of the element referenced by this [`OffsetView`]
249+
///
250+
/// # Errors
251+
/// * the buffer index is out of bounds
252+
/// * offset / length is out of bounds of the buffer
253+
/// * The data is not valid for `T::Native` (e.g. not Utf8)
254+
pub fn value_from_offset_view<'a>(
255+
&'a self,
256+
offset_view: OffsetView<'_>,
257+
) -> Result<&'a T::Native, ArrowError> {
258+
let data = self
259+
.buffers
260+
.get(offset_view.buffer_index() as usize)
261+
.ok_or_else(|| {
262+
ArrowError::InvalidArgumentError(format!(
263+
"Invalid ByteView. Requested buffer {} but only has {} buffers",
264+
offset_view.buffer_index(),
265+
self.buffers.len()
266+
))
267+
})?;
268+
269+
let b = data.get(offset_view.range()).ok_or_else(|| {
270+
ArrowError::InvalidArgumentError(format!(
271+
"Invalid ByteView. Requested range {:?} but buffer {} valid range is {:?}",
272+
offset_view.range(),
273+
offset_view.buffer_index(),
274+
0..data.len()
275+
))
276+
})?;
277+
278+
T::Native::try_from_bytes(b)
279+
}
280+
281+
/// Return the value from the [`OffsetView`]
282+
///
283+
/// # Safety
284+
/// The caller is responsible for ensuring:
285+
/// * the buffer index is within bounds
286+
/// * offset / length is within the bounds of the buffer
287+
/// * The data is valid for `T::Native` (e.g. Utf8 for Strings)
288+
pub unsafe fn value_from_offset_view_unchecked<'a>(
289+
&'a self,
290+
offset_view: OffsetView<'_>,
291+
) -> &'a T::Native {
292+
let data = self
293+
.buffers
294+
.get_unchecked(offset_view.buffer_index() as usize);
295+
let b = data.get_unchecked(offset_view.range());
250296
T::Native::from_bytes_unchecked(b)
251297
}
252298

@@ -487,7 +533,7 @@ mod tests {
487533
use crate::builder::{BinaryViewBuilder, StringViewBuilder};
488534
use crate::{Array, BinaryViewArray, StringViewArray};
489535
use arrow_buffer::{Buffer, ScalarBuffer};
490-
use arrow_data::ByteView;
536+
use arrow_data::{ByteView, OffsetView, View};
491537

492538
#[test]
493539
fn try_new_string() {
@@ -533,6 +579,72 @@ mod tests {
533579
assert!(array.is_empty());
534580
}
535581

582+
#[test]
583+
fn test_value_from_offset_view() {
584+
let array = test_array();
585+
let View::Offset(offset_view) = View::new(array.views().get(2).unwrap()) else {
586+
panic!("Expected offset view");
587+
};
588+
assert_eq!(
589+
array.value_from_offset_view(offset_view).unwrap(),
590+
"large payload over 12 bytes"
591+
);
592+
}
593+
594+
#[test]
595+
fn test_value_from_offset_view2() {
596+
let array = test_array();
597+
// Get the last 60 bytes from the buffer (60 is 0x3c in hex)
598+
// the buffer is 65 bytes long
599+
// offset 5, index 0, prefix=????, length 60
600+
let v = 0x00000005_00000000_00000000_0000003cu128;
601+
602+
assert_eq!(
603+
array.value_from_offset_view(OffsetView::from(&v)).unwrap(),
604+
" payload over 12 bytessome other large payload over 12 bytes"
605+
);
606+
}
607+
608+
#[test]
609+
#[should_panic(expected = "Invalid ByteView. Requested buffer 2 but only has 1 buffers")]
610+
fn test_value_from_offset_view_invalid_buffer() {
611+
let array = test_array();
612+
// offset 0, buffer = 2, prefix = ????, length = 256
613+
let v = 0x00000000_00000002_00000000_00000100u128;
614+
array.value_from_offset_view(OffsetView::from(&v)).unwrap();
615+
}
616+
617+
#[test]
618+
#[should_panic(
619+
expected = "Invalid ByteView. Requested range 256..271 but buffer 0 valid range is 0..65"
620+
)]
621+
fn test_value_from_offset_view_invalid_offset() {
622+
let array = test_array();
623+
// offset 256, buffer = 0, prefix = ????, length = 15
624+
let v = 0x00000100_00000000_00000000_0000000fu128;
625+
array.value_from_offset_view(OffsetView::from(&v)).unwrap();
626+
}
627+
628+
#[test]
629+
#[should_panic(
630+
expected = "Invalid ByteView. Requested range 0..256 but buffer 0 valid range is 0..65"
631+
)]
632+
fn test_value_from_offset_view_invalid_too_long() {
633+
let array = test_array();
634+
// offset 0, buffer = 0, prefix = ????, length = 256
635+
let v = 0x00000000_00000000_00000000_00000100u128;
636+
array.value_from_offset_view(OffsetView::from(&v)).unwrap();
637+
}
638+
639+
fn test_array() -> StringViewArray {
640+
let mut builder = StringViewBuilder::new();
641+
builder.append_value("hello");
642+
builder.append_null();
643+
builder.append_option(Some("large payload over 12 bytes"));
644+
builder.append_option(Some("some other large payload over 12 bytes"));
645+
builder.finish()
646+
}
647+
536648
#[test]
537649
fn test_append_string() {
538650
// test builder append
@@ -620,8 +732,8 @@ mod tests {
620732
view_buffer[0..4].copy_from_slice(&1u32.to_le_bytes());
621733
view_buffer[4..].copy_from_slice(&data);
622734

623-
let view = ByteView::from(u128::from_le_bytes(view_buffer));
624-
let views = ScalarBuffer::from(vec![view.into()]);
735+
let view = u128::from_le_bytes(view_buffer);
736+
let views = ScalarBuffer::from(vec![view]);
625737
let buffers = vec![];
626738
StringViewArray::new(views, buffers, None);
627739
}
@@ -639,8 +751,8 @@ mod tests {
639751
view_buffer[4..8].copy_from_slice(&input_str_1.as_bytes()[0..4]);
640752
view_buffer[8..12].copy_from_slice(&0u32.to_le_bytes());
641753
view_buffer[12..].copy_from_slice(&0u32.to_le_bytes());
642-
let view = ByteView::from(u128::from_le_bytes(view_buffer));
643-
let views = ScalarBuffer::from(vec![view.into()]);
754+
let view = u128::from_le_bytes(view_buffer);
755+
let views = ScalarBuffer::from(vec![view]);
644756
let buffers = vec![Buffer::from_slice_ref(input_str_2.as_bytes())];
645757

646758
StringViewArray::new(views, buffers, None);

arrow-array/src/builder/generic_bytes_view_builder.rs

Lines changed: 20 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use crate::builder::ArrayBuilder;
1919
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
2020
use crate::{ArrayRef, GenericByteViewArray};
2121
use arrow_buffer::{Buffer, BufferBuilder, NullBufferBuilder, ScalarBuffer};
22-
use arrow_data::ByteView;
22+
use arrow_data::{OffsetViewBuilder, OwnedView};
2323

2424
use std::any::Any;
2525
use std::marker::PhantomData;
@@ -72,35 +72,28 @@ impl<T: ByteViewType + ?Sized> GenericByteViewBuilder<T> {
7272
#[inline]
7373
pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
7474
let v: &[u8] = value.as_ref().as_ref();
75-
let length: u32 = v.len().try_into().unwrap();
76-
if length <= 12 {
77-
let mut view_buffer = [0; 16];
78-
view_buffer[0..4].copy_from_slice(&length.to_le_bytes());
79-
view_buffer[4..4 + v.len()].copy_from_slice(v);
80-
self.views_builder.append(u128::from_le_bytes(view_buffer));
81-
self.null_buffer_builder.append_non_null();
82-
return;
83-
}
8475

85-
let required_cap = self.in_progress.len() + v.len();
86-
if self.in_progress.capacity() < required_cap {
87-
let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize));
88-
let flushed = std::mem::replace(&mut self.in_progress, in_progress);
89-
if !flushed.is_empty() {
90-
assert!(self.completed.len() < u32::MAX as usize);
91-
self.completed.push(flushed.into());
76+
let view: u128 = match OwnedView::from(v) {
77+
OwnedView::Inline(view) => view,
78+
OwnedView::Offset(view) => {
79+
let required_cap = self.in_progress.len() + v.len();
80+
if self.in_progress.capacity() < required_cap {
81+
let in_progress = Vec::with_capacity(v.len().max(self.block_size as usize));
82+
let flushed = std::mem::replace(&mut self.in_progress, in_progress);
83+
if !flushed.is_empty() {
84+
assert!(self.completed.len() < u32::MAX as usize);
85+
self.completed.push(flushed.into());
86+
}
87+
};
88+
let builder = OffsetViewBuilder::from(view)
89+
.with_offset(self.in_progress.len() as u32)
90+
.with_buffer_index(self.completed.len() as u32);
91+
// copy the actual data into the in_progress buffer
92+
self.in_progress.extend_from_slice(v);
93+
builder.into()
9294
}
9395
};
94-
let offset = self.in_progress.len() as u32;
95-
self.in_progress.extend_from_slice(v);
96-
97-
let view = ByteView {
98-
length,
99-
prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
100-
buffer_index: self.completed.len() as u32,
101-
offset,
102-
};
103-
self.views_builder.append(view.into());
96+
self.views_builder.append(view);
10497
self.null_buffer_builder.append_non_null();
10598
}
10699

0 commit comments

Comments
 (0)