@@ -22,52 +22,36 @@ use crate::types::bytes::ByteArrayNativeType;
22
22
use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
23
23
use crate :: { Array , ArrayAccessor , ArrayRef } ;
24
24
use arrow_buffer:: { Buffer , NullBuffer , ScalarBuffer } ;
25
- use arrow_data:: { ArrayData , ArrayDataBuilder , ByteView } ;
25
+ use arrow_data:: { ArrayData , ArrayDataBuilder , OffsetView , View } ;
26
26
use arrow_schema:: { ArrowError , DataType } ;
27
27
use std:: any:: Any ;
28
28
use std:: fmt:: Debug ;
29
29
use std:: marker:: PhantomData ;
30
30
use std:: sync:: Arc ;
31
31
32
- /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
33
- ///
34
- /// Different than [`crate::GenericByteArray`] as it stores both an offset and length
35
- /// meaning that take / filter operations can be implemented without copying the underlying data.
36
- ///
37
- /// See [`StringViewArray`] for storing utf8 encoded string data and
38
- /// [`BinaryViewArray`] for storing bytes.
39
- ///
40
- /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
32
+ /// [Variable-size Binary View Layout]: An array of variable length byte strings.
41
33
///
42
34
/// A `GenericByteViewArray` stores variable length byte strings. An array of
43
- /// `N` elements is stored as `N` fixed length "views" and a variable number
35
+ /// `N` elements is stored as `N` fixed length [`View`]s and some number
44
36
/// of variable length "buffers".
45
37
///
46
- /// Each view is a `u128` value layout is different depending on the
47
- /// length of the string stored at that location:
38
+ /// There are no constraints on offsets other than they must point into a valid
39
+ /// buffer. The offsets can be out of order, non-contiguous and overlapping.
48
40
///
49
- /// ```text
50
- /// ┌──────┬────────────────────────┐
51
- /// │length│ string value │
52
- /// Strings (len <= 12) │ │ (padded with 0) │
53
- /// └──────┴────────────────────────┘
54
- /// 0 31 127
55
- ///
56
- /// ┌───────┬───────┬───────┬───────┐
57
- /// │length │prefix │ buf │offset │
58
- /// Strings (len > 12) │ │ │ index │ │
59
- /// └───────┴───────┴───────┴───────┘
60
- /// 0 31 63 95 127
61
- /// ```
41
+ /// Because `GenericByteViewArray` stores both an offset and length for each
42
+ /// byte string, certain operations such as `take` and `filter` can be
43
+ /// implemented without copying the underlying data, unlike
44
+ /// [`GenericByteArray`], which requires the variable length data to be
45
+ /// contiguous.
62
46
///
63
- /// * Strings with length <= 12 are stored directly in the view.
47
+ /// # See Also:
48
+ /// * [`StringViewArray`] for storing UTF-8 string data
49
+ /// * [`BinaryViewArray`] for storing bytes
50
+ /// * [`View`] for the format of the views and interpreting the `u128` views
64
51
///
65
- /// * Strings with length > 12: The first four bytes are stored inline in the
66
- /// view and the entire string is stored in one of the buffers.
52
+ /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
67
53
///
68
- /// Unlike [`GenericByteArray`], there are no constraints on the offsets other
69
- /// than they must point into a valid buffer. However, they can be out of order,
70
- /// non continuous and overlapping.
54
+ /// # Example
71
55
///
72
56
/// For example, in the following diagram, the strings "FishWasInTownToday" and
73
57
/// "CrumpleFacedFish" are both longer than 12 bytes and thus are stored in a
@@ -93,6 +77,7 @@ use std::sync::Arc;
93
77
/// └───┘
94
78
/// ```
95
79
/// [`GenericByteArray`]: crate::array::GenericByteArray
80
+ /// [`View`]: arrow_data::View
96
81
pub struct GenericByteViewArray < T : ByteViewType + ?Sized > {
97
82
data_type : DataType ,
98
83
views : ScalarBuffer < u128 > ,
@@ -114,16 +99,26 @@ impl<T: ByteViewType + ?Sized> Clone for GenericByteViewArray<T> {
114
99
}
115
100
116
101
impl < T : ByteViewType + ?Sized > GenericByteViewArray < T > {
117
- /// Create a new [`GenericByteViewArray`] from the provided parts, panicking on failure
102
+ /// Create a new [`GenericByteViewArray`] from the provided parts, panicking
103
+ /// on failure.
118
104
///
119
- /// # Panics
105
+ /// See [Self::try_new] for parameters
120
106
///
107
+ /// # Panics
121
108
/// Panics if [`GenericByteViewArray::try_new`] returns an error
109
+ ///
110
+ /// [`View`]: arrow_data::View
122
111
pub fn new ( views : ScalarBuffer < u128 > , buffers : Vec < Buffer > , nulls : Option < NullBuffer > ) -> Self {
123
112
Self :: try_new ( views, buffers, nulls) . unwrap ( )
124
113
}
125
114
126
- /// Create a new [`GenericByteViewArray`] from the provided parts, returning an error on failure
115
+ /// Create a new [`GenericByteViewArray`] from the provided parts, returning
116
+ /// an error on failure
117
+ ///
118
+ /// # Parameters
119
+ /// * `views`: a [`ScalarBuffer`] of u128 views (see [`View`] for format)
120
+ /// * `buffers`: a vector of [`Buffer`]s storing the string data
121
+ /// * `nulls`: an optional [`NullBuffer`] for null values
127
122
///
128
123
/// # Errors
129
124
///
@@ -156,7 +151,10 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
156
151
} )
157
152
}
158
153
159
- /// Create a new [`GenericByteViewArray`] from the provided parts, without validation
154
+ /// Create a new [`GenericByteViewArray`] from the provided parts, without
155
+ /// validation
156
+ ///
157
+ /// See [Self::try_new] for parameters
160
158
///
161
159
/// # Safety
162
160
///
@@ -233,20 +231,68 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> {
233
231
}
234
232
235
233
/// Returns the element at index `i`
234
+ ///
236
235
/// # Safety
237
236
/// Caller is responsible for ensuring that the index is within the bounds of the array
238
237
pub unsafe fn value_unchecked ( & self , idx : usize ) -> & T :: Native {
239
238
let v = self . views . get_unchecked ( idx) ;
240
- let len = * v as u32 ;
241
- let b = if len <= 12 {
242
- let ptr = self . views . as_ptr ( ) as * const u8 ;
243
- std:: slice:: from_raw_parts ( ptr. add ( idx * 16 + 4 ) , len as usize )
244
- } else {
245
- let view = ByteView :: from ( * v) ;
246
- let data = self . buffers . get_unchecked ( view. buffer_index as usize ) ;
247
- let offset = view. offset as usize ;
248
- data. get_unchecked ( offset..offset + len as usize )
249
- } ;
239
+ match View :: from ( v) {
240
+ View :: Inline ( inline_view) => {
241
+ let bytes = inline_view. get_bytes_unchecked ( v) ;
242
+ T :: Native :: from_bytes_unchecked ( bytes)
243
+ }
244
+ View :: Offset ( offset_view) => self . value_from_offset_view_unchecked ( offset_view) ,
245
+ }
246
+ }
247
+
248
+ /// Return the value of the element referenced by this [`OffsetView`]
249
+ ///
250
+ /// # Errors
251
+ /// * the buffer index is out of bounds
252
+ /// * offset / length is out of bounds of the buffer
253
+ /// * The data is not valid for `T::Native` (e.g. not Utf8)
254
+ pub fn value_from_offset_view < ' a > (
255
+ & ' a self ,
256
+ offset_view : OffsetView < ' _ > ,
257
+ ) -> Result < & ' a T :: Native , ArrowError > {
258
+ let data = self
259
+ . buffers
260
+ . get ( offset_view. buffer_index ( ) as usize )
261
+ . ok_or_else ( || {
262
+ ArrowError :: InvalidArgumentError ( format ! (
263
+ "Invalid ByteView. Requested buffer {} but only has {} buffers" ,
264
+ offset_view. buffer_index( ) ,
265
+ self . buffers. len( )
266
+ ) )
267
+ } ) ?;
268
+
269
+ let b = data. get ( offset_view. range ( ) ) . ok_or_else ( || {
270
+ ArrowError :: InvalidArgumentError ( format ! (
271
+ "Invalid ByteView. Requested range {:?} but buffer {} valid range is {:?}" ,
272
+ offset_view. range( ) ,
273
+ offset_view. buffer_index( ) ,
274
+ 0 ..data. len( )
275
+ ) )
276
+ } ) ?;
277
+
278
+ T :: Native :: try_from_bytes ( b)
279
+ }
280
+
281
+ /// Return the value from the [`OffsetView`]
282
+ ///
283
+ /// # Safety
284
+ /// The caller is responsible for ensuring:
285
+ /// * the buffer index is within bounds
286
+ /// * offset / length is within bounds of the buffer
287
+ /// * The data is valid for `T::Native` (e.g. Utf8 for Strings)
288
+ pub unsafe fn value_from_offset_view_unchecked < ' a > (
289
+ & ' a self ,
290
+ offset_view : OffsetView < ' _ > ,
291
+ ) -> & ' a T :: Native {
292
+ let data = self
293
+ . buffers
294
+ . get_unchecked ( offset_view. buffer_index ( ) as usize ) ;
295
+ let b = data. get_unchecked ( offset_view. range ( ) ) ;
250
296
T :: Native :: from_bytes_unchecked ( b)
251
297
}
252
298
@@ -487,7 +533,7 @@ mod tests {
487
533
use crate :: builder:: { BinaryViewBuilder , StringViewBuilder } ;
488
534
use crate :: { Array , BinaryViewArray , StringViewArray } ;
489
535
use arrow_buffer:: { Buffer , ScalarBuffer } ;
490
- use arrow_data:: ByteView ;
536
+ use arrow_data:: { ByteView , OffsetView , View } ;
491
537
492
538
#[ test]
493
539
fn try_new_string ( ) {
@@ -533,6 +579,72 @@ mod tests {
533
579
assert ! ( array. is_empty( ) ) ;
534
580
}
535
581
582
+ #[ test]
583
+ fn test_value_from_offset_view ( ) {
584
+ let array = test_array ( ) ;
585
+ let View :: Offset ( offset_view) = View :: new ( array. views ( ) . get ( 2 ) . unwrap ( ) ) else {
586
+ panic ! ( "Expected offset view" ) ;
587
+ } ;
588
+ assert_eq ! (
589
+ array. value_from_offset_view( offset_view) . unwrap( ) ,
590
+ "large payload over 12 bytes"
591
+ ) ;
592
+ }
593
+
594
+ #[ test]
595
+ fn test_value_from_offset_view2 ( ) {
596
+ let array = test_array ( ) ;
597
+ // Get last 60 bytes from buffer (60 is in hex 0x3c)
598
+ // buffer is 65 bytes long
599
+ // offset 5, index 0, prefix=????, length 60
600
+ let v = 0x00000005_00000000_00000000_0000003cu128 ;
601
+
602
+ assert_eq ! (
603
+ array. value_from_offset_view( OffsetView :: from( & v) ) . unwrap( ) ,
604
+ " payload over 12 bytessome other large payload over 12 bytes"
605
+ ) ;
606
+ }
607
+
608
+ #[ test]
609
+ #[ should_panic( expected = "Invalid ByteView. Requested buffer 2 but only has 1 buffers" ) ]
610
+ fn test_value_from_offset_view_invalid_buffer ( ) {
611
+ let array = test_array ( ) ;
612
+ // offset 0, buffer = 2, prefix = ????, length = 256
613
+ let v = 0x00000000_00000002_00000000_00000100u128 ;
614
+ array. value_from_offset_view ( OffsetView :: from ( & v) ) . unwrap ( ) ;
615
+ }
616
+
617
+ #[ test]
618
+ #[ should_panic(
619
+ expected = "Invalid ByteView. Requested range 256..271 but buffer 0 valid range is 0..65"
620
+ ) ]
621
+ fn test_value_from_offset_view_invalid_offset ( ) {
622
+ let array = test_array ( ) ;
623
+ // offset 256, buffer = 0, prefix = ????, length = 15
624
+ let v = 0x00000100_00000000_00000000_0000000fu128 ;
625
+ array. value_from_offset_view ( OffsetView :: from ( & v) ) . unwrap ( ) ;
626
+ }
627
+
628
+ #[ test]
629
+ #[ should_panic(
630
+ expected = "Invalid ByteView. Requested range 0..256 but buffer 0 valid range is 0..65"
631
+ ) ]
632
+ fn test_value_from_offset_view_invalid_too_long ( ) {
633
+ let array = test_array ( ) ;
634
+ // offset 0, buffer = 0, prefix = ????, length = 256
635
+ let v = 0x00000000_00000000_00000000_00000100u128 ;
636
+ array. value_from_offset_view ( OffsetView :: from ( & v) ) . unwrap ( ) ;
637
+ }
638
+
639
+ fn test_array ( ) -> StringViewArray {
640
+ let mut builder = StringViewBuilder :: new ( ) ;
641
+ builder. append_value ( "hello" ) ;
642
+ builder. append_null ( ) ;
643
+ builder. append_option ( Some ( "large payload over 12 bytes" ) ) ;
644
+ builder. append_option ( Some ( "some other large payload over 12 bytes" ) ) ;
645
+ builder. finish ( )
646
+ }
647
+
536
648
#[ test]
537
649
fn test_append_string ( ) {
538
650
// test builder append
@@ -620,8 +732,8 @@ mod tests {
620
732
view_buffer[ 0 ..4 ] . copy_from_slice ( & 1u32 . to_le_bytes ( ) ) ;
621
733
view_buffer[ 4 ..] . copy_from_slice ( & data) ;
622
734
623
- let view = ByteView :: from ( u128:: from_le_bytes ( view_buffer) ) ;
624
- let views = ScalarBuffer :: from ( vec ! [ view. into ( ) ] ) ;
735
+ let view = u128:: from_le_bytes ( view_buffer) ;
736
+ let views = ScalarBuffer :: from ( vec ! [ view] ) ;
625
737
let buffers = vec ! [ ] ;
626
738
StringViewArray :: new ( views, buffers, None ) ;
627
739
}
@@ -639,8 +751,8 @@ mod tests {
639
751
view_buffer[ 4 ..8 ] . copy_from_slice ( & input_str_1. as_bytes ( ) [ 0 ..4 ] ) ;
640
752
view_buffer[ 8 ..12 ] . copy_from_slice ( & 0u32 . to_le_bytes ( ) ) ;
641
753
view_buffer[ 12 ..] . copy_from_slice ( & 0u32 . to_le_bytes ( ) ) ;
642
- let view = ByteView :: from ( u128:: from_le_bytes ( view_buffer) ) ;
643
- let views = ScalarBuffer :: from ( vec ! [ view. into ( ) ] ) ;
754
+ let view = u128:: from_le_bytes ( view_buffer) ;
755
+ let views = ScalarBuffer :: from ( vec ! [ view] ) ;
644
756
let buffers = vec ! [ Buffer :: from_slice_ref( input_str_2. as_bytes( ) ) ] ;
645
757
646
758
StringViewArray :: new ( views, buffers, None ) ;
0 commit comments