Skip to content

Commit 5bda2ae

Browse files
committed
fix reviewer comments
1 parent a6410ce commit 5bda2ae

File tree

12 files changed

+194
-126
lines changed

12 files changed

+194
-126
lines changed

arrow-array/src/array/bytes_view_array.rs renamed to arrow-array/src/array/byte_view_array.rs

Lines changed: 87 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,34 @@
1616
// under the License.
1717

1818
use crate::array::print_long_array;
19-
use crate::builder::GenericBytesViewBuilder;
19+
use crate::builder::GenericByteViewBuilder;
2020
use crate::iterator::ArrayIter;
2121
use crate::types::bytes::ByteArrayNativeType;
22-
use crate::types::BytesViewType;
22+
use crate::types::{BinaryViewType, ByteViewType, StringViewType};
2323
use crate::{Array, ArrayAccessor, ArrayRef};
2424
use arrow_buffer::{Buffer, NullBuffer, ScalarBuffer};
25-
use arrow_data::{ArrayData, ArrayDataBuilder, BytesView};
25+
use arrow_data::{ArrayData, ArrayDataBuilder, ByteView};
2626
use arrow_schema::{ArrowError, DataType};
2727
use std::any::Any;
2828
use std::fmt::Debug;
2929
use std::marker::PhantomData;
3030
use std::sync::Arc;
3131

32-
/// An array of variable length bytes view arrays
33-
pub struct GenericBytesViewArray<T: BytesViewType + ?Sized> {
32+
/// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
33+
///
34+
/// Different than [`GenericByteArray`] as it stores both an offset and length
35+
/// meaning that take / filter operations can be implemented without copying the underlying data.
36+
///
37+
/// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
38+
pub struct GenericByteViewArray<T: ByteViewType + ?Sized> {
3439
data_type: DataType,
3540
views: ScalarBuffer<u128>,
3641
buffers: Vec<Buffer>,
3742
phantom: PhantomData<T>,
3843
nulls: Option<NullBuffer>,
3944
}
4045

41-
impl<T: BytesViewType + ?Sized> Clone for GenericBytesViewArray<T> {
46+
impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
4247
fn clone(&self) -> Self {
4348
Self {
4449
data_type: T::DATA_TYPE,
@@ -50,22 +55,22 @@ impl<T: BytesViewType + ?Sized> Clone for GenericBytesViewArray<T> {
5055
}
5156
}
5257

53-
impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
54-
/// Create a new [`GenericBytesViewArray`] from the provided parts, panicking on failure
58+
impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
59+
/// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
5560
///
5661
/// # Panics
5762
///
58-
/// Panics if [`GenericBytesViewArray::try_new`] returns an error
63+
/// Panics if [`GenericByteViewArray::try_new`] returns an error
5964
pub fn new(views: ScalarBuffer<u128>, buffers: Vec<Buffer>, nulls: Option<NullBuffer>) -> Self {
6065
Self::try_new(views, buffers, nulls).unwrap()
6166
}
6267

63-
/// Create a new [`GenericBytesViewArray`] from the provided parts, returning an error on failure
68+
/// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
6469
///
6570
/// # Errors
6671
///
6772
/// * `views.len() != nulls.len()`
68-
/// * [BytesViewType::validate] fails
73+
/// * [ByteViewType::validate] fails
6974
pub fn try_new(
7075
views: ScalarBuffer<u128>,
7176
buffers: Vec<Buffer>,
@@ -93,7 +98,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
9398
})
9499
}
95100

96-
/// Create a new [`GenericBytesViewArray`] from the provided parts, without validation
101+
/// Create a new [`GenericByteViewArray`] from the provided parts, without validation
97102
///
98103
/// # Safety
99104
///
@@ -112,7 +117,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
112117
}
113118
}
114119

115-
/// Create a new [`GenericBytesViewArray`] of length `len` where all values are null
120+
/// Create a new [`GenericByteViewArray`] of length `len` where all values are null
116121
pub fn new_null(len: usize) -> Self {
117122
Self {
118123
data_type: T::DATA_TYPE,
@@ -123,14 +128,14 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
123128
}
124129
}
125130

126-
/// Creates a [`GenericBytesViewArray`] based on an iterator of values without nulls
131+
/// Creates a [`GenericByteViewArray`] based on an iterator of values without nulls
127132
pub fn from_iter_values<Ptr, I>(iter: I) -> Self
128133
where
129134
Ptr: AsRef<T::Native>,
130135
I: IntoIterator<Item = Ptr>,
131136
{
132137
let iter = iter.into_iter();
133-
let mut builder = GenericBytesViewBuilder::<T>::with_capacity(iter.size_hint().0);
138+
let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
134139
for v in iter {
135140
builder.append_value(v);
136141
}
@@ -179,7 +184,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
179184
let ptr = self.views.as_ptr() as *const u8;
180185
std::slice::from_raw_parts(ptr.add(idx * 16 + 4), len as usize)
181186
} else {
182-
let view = BytesView::from(*v);
187+
let view = ByteView::from(*v);
183188
let data = self.buffers.get_unchecked(view.buffer_index as usize);
184189
let offset = view.offset as usize;
185190
data.get_unchecked(offset..offset + len as usize)
@@ -204,7 +209,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
204209
}
205210
}
206211

207-
impl<T: BytesViewType + ?Sized> Debug for GenericBytesViewArray<T> {
212+
impl<T: ByteViewType + ?Sized> Debug for GenericByteViewArray<T> {
208213
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
209214
write!(f, "{}ViewArray\n[\n", T::PREFIX)?;
210215
print_long_array(self, f, |array, index, f| {
@@ -214,7 +219,7 @@ impl<T: BytesViewType + ?Sized> Debug for GenericBytesViewArray<T> {
214219
}
215220
}
216221

217-
impl<T: BytesViewType + ?Sized> Array for GenericBytesViewArray<T> {
222+
impl<T: ByteViewType + ?Sized> Array for GenericByteViewArray<T> {
218223
fn as_any(&self) -> &dyn Any {
219224
self
220225
}
@@ -265,19 +270,19 @@ impl<T: BytesViewType + ?Sized> Array for GenericBytesViewArray<T> {
265270
}
266271
}
267272

268-
impl<'a, T: BytesViewType + ?Sized> ArrayAccessor for &'a GenericBytesViewArray<T> {
273+
impl<'a, T: ByteViewType + ?Sized> ArrayAccessor for &'a GenericByteViewArray<T> {
269274
type Item = &'a T::Native;
270275

271276
fn value(&self, index: usize) -> Self::Item {
272-
GenericBytesViewArray::value(self, index)
277+
GenericByteViewArray::value(self, index)
273278
}
274279

275280
unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
276-
GenericBytesViewArray::value_unchecked(self, index)
281+
GenericByteViewArray::value_unchecked(self, index)
277282
}
278283
}
279284

280-
impl<'a, T: BytesViewType + ?Sized> IntoIterator for &'a GenericBytesViewArray<T> {
285+
impl<'a, T: ByteViewType + ?Sized> IntoIterator for &'a GenericByteViewArray<T> {
281286
type Item = Option<&'a T::Native>;
282287
type IntoIter = ArrayIter<Self>;
283288

@@ -286,7 +291,7 @@ impl<'a, T: BytesViewType + ?Sized> IntoIterator for &'a GenericBytesViewArray<T
286291
}
287292
}
288293

289-
impl<T: BytesViewType + ?Sized> From<ArrayData> for GenericBytesViewArray<T> {
294+
impl<T: ByteViewType + ?Sized> From<ArrayData> for GenericByteViewArray<T> {
290295
fn from(value: ArrayData) -> Self {
291296
let views = value.buffers()[0].clone();
292297
let views = ScalarBuffer::new(views, value.offset(), value.len());
@@ -301,8 +306,8 @@ impl<T: BytesViewType + ?Sized> From<ArrayData> for GenericBytesViewArray<T> {
301306
}
302307
}
303308

304-
impl<T: BytesViewType + ?Sized> From<GenericBytesViewArray<T>> for ArrayData {
305-
fn from(mut array: GenericBytesViewArray<T>) -> Self {
309+
impl<T: ByteViewType + ?Sized> From<GenericByteViewArray<T>> for ArrayData {
310+
fn from(mut array: GenericByteViewArray<T>) -> Self {
306311
let len = array.len();
307312
array.buffers.insert(0, array.views.into_inner());
308313
let builder = ArrayDataBuilder::new(T::DATA_TYPE)
@@ -314,30 +319,30 @@ impl<T: BytesViewType + ?Sized> From<GenericBytesViewArray<T>> for ArrayData {
314319
}
315320
}
316321

317-
impl<Ptr, T: BytesViewType + ?Sized> FromIterator<Option<Ptr>> for GenericBytesViewArray<T>
322+
impl<Ptr, T: ByteViewType + ?Sized> FromIterator<Option<Ptr>> for GenericByteViewArray<T>
318323
where
319324
Ptr: AsRef<T::Native>,
320325
{
321326
fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
322327
let iter = iter.into_iter();
323-
let mut builder = GenericBytesViewBuilder::<T>::with_capacity(iter.size_hint().0);
328+
let mut builder = GenericByteViewBuilder::<T>::with_capacity(iter.size_hint().0);
324329
builder.extend(iter);
325330
builder.finish()
326331
}
327332
}
328333

329-
/// A [`GenericBytesViewArray`] of `[u8]`
330-
pub type BinaryViewArray = GenericBytesViewArray<[u8]>;
334+
/// A [`GenericByteViewArray`] of `[u8]`
335+
pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;
331336

332-
/// A [`GenericBytesViewArray`] of `str`
337+
/// A [`GenericByteViewArray`] of `str`
333338
///
334339
/// ```
335340
/// use arrow_array::StringViewArray;
336341
/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
337342
/// assert_eq!(array.value(0), "hello");
338343
/// assert_eq!(array.value(3), "large payload over 12 bytes");
339344
/// ```
340-
pub type StringViewArray = GenericBytesViewArray<str>;
345+
pub type StringViewArray = GenericByteViewArray<StringViewType>;
341346

342347
impl From<Vec<&str>> for StringViewArray {
343348
fn from(v: Vec<&str>) -> Self {
@@ -348,8 +353,9 @@ impl From<Vec<&str>> for StringViewArray {
348353
#[cfg(test)]
349354
mod tests {
350355
use crate::builder::StringViewBuilder;
351-
use crate::types::BytesViewType;
352356
use crate::{Array, BinaryViewArray, StringViewArray};
357+
use arrow_buffer::{Buffer, ScalarBuffer};
358+
use arrow_data::ByteView;
353359

354360
#[test]
355361
fn try_new() {
@@ -363,20 +369,22 @@ mod tests {
363369
assert_eq!(array.value(3), "large payload over 12 bytes");
364370

365371
let array = BinaryViewArray::from_iter_values(vec![
366-
b"hello".to_bytes(),
367-
b"world".to_bytes(),
368-
b"lulu".to_bytes(),
369-
b"large payload over 12 bytes".to_bytes(),
372+
b"hello".as_slice(),
373+
b"world".as_slice(),
374+
b"lulu".as_slice(),
375+
b"large payload over 12 bytes".as_slice(),
370376
]);
371377
assert_eq!(array.value(0), b"hello");
372378
assert_eq!(array.value(3), b"large payload over 12 bytes");
373379

380+
// test empty array
374381
let array = {
375382
let mut builder = StringViewBuilder::new();
376383
builder.finish()
377384
};
378385
assert!(array.is_empty());
379386

387+
// test builder append
380388
let array = {
381389
let mut builder = StringViewBuilder::new();
382390
builder.append_value("hello");
@@ -387,5 +395,48 @@ mod tests {
387395
assert_eq!(array.value(0), "hello");
388396
assert!(array.is_null(1));
389397
assert_eq!(array.value(2), "large payload over 12 bytes");
398+
399+
// test builder's in_progress re-created
400+
let array = {
401+
// make a builder with small block size.
402+
let mut builder = StringViewBuilder::new().with_block_size(14);
403+
builder.append_value("large payload over 12 bytes");
404+
builder.append_option(Some("another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created"));
405+
builder.finish()
406+
};
407+
assert_eq!(array.value(0), "large payload over 12 bytes");
408+
assert_eq!(array.value(1), "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created");
409+
}
410+
411+
#[test]
412+
#[should_panic(expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers")]
413+
fn new_with_invalid_view_data() {
414+
let v = "large payload over 12 bytes";
415+
let view = ByteView {
416+
length: 13,
417+
prefix: u32::from_le_bytes(v.as_bytes()[0..4].try_into().unwrap()),
418+
buffer_index: 3,
419+
offset: 1,
420+
};
421+
let views = ScalarBuffer::from(vec![view.into()]);
422+
let buffers = vec![Buffer::from_slice_ref(v)];
423+
StringViewArray::new(views, buffers, None);
424+
}
425+
426+
#[test]
427+
#[should_panic(
428+
expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
429+
)]
430+
fn new_with_invalid_utf8_data() {
431+
let v: Vec<u8> = vec![0xf0, 0x80, 0x80, 0x80];
432+
let view = ByteView {
433+
length: v.len() as u32,
434+
prefix: u32::from_le_bytes(v[0..4].try_into().unwrap()),
435+
buffer_index: 0,
436+
offset: 0,
437+
};
438+
let views = ScalarBuffer::from(vec![view.into()]);
439+
let buffers = vec![Buffer::from_slice_ref(v)];
440+
StringViewArray::new(views, buffers, None);
390441
}
391442
}

arrow-array/src/array/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,9 @@ mod run_array;
6868

6969
pub use run_array::*;
7070

71-
mod bytes_view_array;
71+
mod byte_view_array;
7272

73-
pub use bytes_view_array::*;
73+
pub use byte_view_array::*;
7474

7575
/// An array in the [arrow columnar format](https://arrow.apache.org/docs/format/Columnar.html)
7676
pub trait Array: std::fmt::Debug + Send + Sync {

0 commit comments

Comments
 (0)