16
16
// under the License.
17
17
18
18
use crate :: array:: print_long_array;
19
- use crate :: builder:: GenericBytesViewBuilder ;
19
+ use crate :: builder:: GenericByteViewBuilder ;
20
20
use crate :: iterator:: ArrayIter ;
21
21
use crate :: types:: bytes:: ByteArrayNativeType ;
22
- use crate :: types:: BytesViewType ;
22
+ use crate :: types:: { BinaryViewType , ByteViewType , StringViewType } ;
23
23
use crate :: { Array , ArrayAccessor , ArrayRef } ;
24
24
use arrow_buffer:: { Buffer , NullBuffer , ScalarBuffer } ;
25
- use arrow_data:: { ArrayData , ArrayDataBuilder , BytesView } ;
25
+ use arrow_data:: { ArrayData , ArrayDataBuilder , ByteView } ;
26
26
use arrow_schema:: { ArrowError , DataType } ;
27
27
use std:: any:: Any ;
28
28
use std:: fmt:: Debug ;
29
29
use std:: marker:: PhantomData ;
30
30
use std:: sync:: Arc ;
31
31
32
- /// An array of variable length bytes view arrays
33
- pub struct GenericBytesViewArray < T : BytesViewType + ?Sized > {
32
+ /// [Variable-size Binary View Layout]: An array of variable length bytes view arrays.
33
+ ///
34
+ /// Different than [`GenericByteArray`] as it stores both an offset and length
35
+ /// meaning that take / filter operations can be implemented without copying the underlying data.
36
+ ///
37
+ /// [Variable-size Binary View Layout]: https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-view-layout
38
+ pub struct GenericByteViewArray < T : ByteViewType + ?Sized > {
34
39
data_type : DataType ,
35
40
views : ScalarBuffer < u128 > ,
36
41
buffers : Vec < Buffer > ,
37
42
phantom : PhantomData < T > ,
38
43
nulls : Option < NullBuffer > ,
39
44
}
40
45
41
- impl < T : BytesViewType + ?Sized > Clone for GenericBytesViewArray < T > {
46
+ impl < T : ByteViewType + ?Sized > Clone for GenericByteViewArray < T > {
42
47
fn clone ( & self ) -> Self {
43
48
Self {
44
49
data_type : T :: DATA_TYPE ,
@@ -50,22 +55,22 @@ impl<T: BytesViewType + ?Sized> Clone for GenericBytesViewArray<T> {
50
55
}
51
56
}
52
57
53
- impl < T : BytesViewType + ?Sized > GenericBytesViewArray < T > {
54
- /// Create a new [`GenericBytesViewArray `] from the provided parts, panicking on failure
58
+ impl < T : ByteViewType + ?Sized > GenericByteViewArray < T > {
59
+ /// Create a new [`GenericByteViewArray `] from the provided parts, panicking on failure
55
60
///
56
61
/// # Panics
57
62
///
58
- /// Panics if [`GenericBytesViewArray ::try_new`] returns an error
63
+ /// Panics if [`GenericByteViewArray ::try_new`] returns an error
59
64
pub fn new ( views : ScalarBuffer < u128 > , buffers : Vec < Buffer > , nulls : Option < NullBuffer > ) -> Self {
60
65
Self :: try_new ( views, buffers, nulls) . unwrap ( )
61
66
}
62
67
63
- /// Create a new [`GenericBytesViewArray `] from the provided parts, returning an error on failure
68
+ /// Create a new [`GenericByteViewArray `] from the provided parts, returning an error on failure
64
69
///
65
70
/// # Errors
66
71
///
67
72
/// * `views.len() != nulls.len()`
68
- /// * [BytesViewType ::validate] fails
73
+ /// * [ByteViewType ::validate] fails
69
74
pub fn try_new (
70
75
views : ScalarBuffer < u128 > ,
71
76
buffers : Vec < Buffer > ,
@@ -93,7 +98,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
93
98
} )
94
99
}
95
100
96
- /// Create a new [`GenericBytesViewArray `] from the provided parts, without validation
101
+ /// Create a new [`GenericByteViewArray `] from the provided parts, without validation
97
102
///
98
103
/// # Safety
99
104
///
@@ -112,7 +117,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
112
117
}
113
118
}
114
119
115
- /// Create a new [`GenericBytesViewArray `] of length `len` where all values are null
120
+ /// Create a new [`GenericByteViewArray `] of length `len` where all values are null
116
121
pub fn new_null ( len : usize ) -> Self {
117
122
Self {
118
123
data_type : T :: DATA_TYPE ,
@@ -123,14 +128,14 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
123
128
}
124
129
}
125
130
126
- /// Creates a [`GenericBytesViewArray `] based on an iterator of values without nulls
131
+ /// Creates a [`GenericByteViewArray `] based on an iterator of values without nulls
127
132
pub fn from_iter_values < Ptr , I > ( iter : I ) -> Self
128
133
where
129
134
Ptr : AsRef < T :: Native > ,
130
135
I : IntoIterator < Item = Ptr > ,
131
136
{
132
137
let iter = iter. into_iter ( ) ;
133
- let mut builder = GenericBytesViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
138
+ let mut builder = GenericByteViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
134
139
for v in iter {
135
140
builder. append_value ( v) ;
136
141
}
@@ -179,7 +184,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
179
184
let ptr = self . views . as_ptr ( ) as * const u8 ;
180
185
std:: slice:: from_raw_parts ( ptr. add ( idx * 16 + 4 ) , len as usize )
181
186
} else {
182
- let view = BytesView :: from ( * v) ;
187
+ let view = ByteView :: from ( * v) ;
183
188
let data = self . buffers . get_unchecked ( view. buffer_index as usize ) ;
184
189
let offset = view. offset as usize ;
185
190
data. get_unchecked ( offset..offset + len as usize )
@@ -204,7 +209,7 @@ impl<T: BytesViewType + ?Sized> GenericBytesViewArray<T> {
204
209
}
205
210
}
206
211
207
- impl < T : BytesViewType + ?Sized > Debug for GenericBytesViewArray < T > {
212
+ impl < T : ByteViewType + ?Sized > Debug for GenericByteViewArray < T > {
208
213
fn fmt ( & self , f : & mut std:: fmt:: Formatter ) -> std:: fmt:: Result {
209
214
write ! ( f, "{}ViewArray\n [\n " , T :: PREFIX ) ?;
210
215
print_long_array ( self , f, |array, index, f| {
@@ -214,7 +219,7 @@ impl<T: BytesViewType + ?Sized> Debug for GenericBytesViewArray<T> {
214
219
}
215
220
}
216
221
217
- impl < T : BytesViewType + ?Sized > Array for GenericBytesViewArray < T > {
222
+ impl < T : ByteViewType + ?Sized > Array for GenericByteViewArray < T > {
218
223
fn as_any ( & self ) -> & dyn Any {
219
224
self
220
225
}
@@ -265,19 +270,19 @@ impl<T: BytesViewType + ?Sized> Array for GenericBytesViewArray<T> {
265
270
}
266
271
}
267
272
268
- impl < ' a , T : BytesViewType + ?Sized > ArrayAccessor for & ' a GenericBytesViewArray < T > {
273
+ impl < ' a , T : ByteViewType + ?Sized > ArrayAccessor for & ' a GenericByteViewArray < T > {
269
274
type Item = & ' a T :: Native ;
270
275
271
276
fn value ( & self , index : usize ) -> Self :: Item {
272
- GenericBytesViewArray :: value ( self , index)
277
+ GenericByteViewArray :: value ( self , index)
273
278
}
274
279
275
280
unsafe fn value_unchecked ( & self , index : usize ) -> Self :: Item {
276
- GenericBytesViewArray :: value_unchecked ( self , index)
281
+ GenericByteViewArray :: value_unchecked ( self , index)
277
282
}
278
283
}
279
284
280
- impl < ' a , T : BytesViewType + ?Sized > IntoIterator for & ' a GenericBytesViewArray < T > {
285
+ impl < ' a , T : ByteViewType + ?Sized > IntoIterator for & ' a GenericByteViewArray < T > {
281
286
type Item = Option < & ' a T :: Native > ;
282
287
type IntoIter = ArrayIter < Self > ;
283
288
@@ -286,7 +291,7 @@ impl<'a, T: BytesViewType + ?Sized> IntoIterator for &'a GenericBytesViewArray<T
286
291
}
287
292
}
288
293
289
- impl < T : BytesViewType + ?Sized > From < ArrayData > for GenericBytesViewArray < T > {
294
+ impl < T : ByteViewType + ?Sized > From < ArrayData > for GenericByteViewArray < T > {
290
295
fn from ( value : ArrayData ) -> Self {
291
296
let views = value. buffers ( ) [ 0 ] . clone ( ) ;
292
297
let views = ScalarBuffer :: new ( views, value. offset ( ) , value. len ( ) ) ;
@@ -301,8 +306,8 @@ impl<T: BytesViewType + ?Sized> From<ArrayData> for GenericBytesViewArray<T> {
301
306
}
302
307
}
303
308
304
- impl < T : BytesViewType + ?Sized > From < GenericBytesViewArray < T > > for ArrayData {
305
- fn from ( mut array : GenericBytesViewArray < T > ) -> Self {
309
+ impl < T : ByteViewType + ?Sized > From < GenericByteViewArray < T > > for ArrayData {
310
+ fn from ( mut array : GenericByteViewArray < T > ) -> Self {
306
311
let len = array. len ( ) ;
307
312
array. buffers . insert ( 0 , array. views . into_inner ( ) ) ;
308
313
let builder = ArrayDataBuilder :: new ( T :: DATA_TYPE )
@@ -314,30 +319,30 @@ impl<T: BytesViewType + ?Sized> From<GenericBytesViewArray<T>> for ArrayData {
314
319
}
315
320
}
316
321
317
- impl < Ptr , T : BytesViewType + ?Sized > FromIterator < Option < Ptr > > for GenericBytesViewArray < T >
322
+ impl < Ptr , T : ByteViewType + ?Sized > FromIterator < Option < Ptr > > for GenericByteViewArray < T >
318
323
where
319
324
Ptr : AsRef < T :: Native > ,
320
325
{
321
326
fn from_iter < I : IntoIterator < Item = Option < Ptr > > > ( iter : I ) -> Self {
322
327
let iter = iter. into_iter ( ) ;
323
- let mut builder = GenericBytesViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
328
+ let mut builder = GenericByteViewBuilder :: < T > :: with_capacity ( iter. size_hint ( ) . 0 ) ;
324
329
builder. extend ( iter) ;
325
330
builder. finish ( )
326
331
}
327
332
}
328
333
329
- /// A [`GenericBytesViewArray `] of `[u8]`
330
- pub type BinaryViewArray = GenericBytesViewArray < [ u8 ] > ;
334
+ /// A [`GenericByteViewArray `] of `[u8]`
335
+ pub type BinaryViewArray = GenericByteViewArray < BinaryViewType > ;
331
336
332
- /// A [`GenericBytesViewArray `] of `str`
337
+ /// A [`GenericByteViewArray `] of `str`
333
338
///
334
339
/// ```
335
340
/// use arrow_array::StringViewArray;
336
341
/// let array = StringViewArray::from_iter_values(vec!["hello", "world", "lulu", "large payload over 12 bytes"]);
337
342
/// assert_eq!(array.value(0), "hello");
338
343
/// assert_eq!(array.value(3), "large payload over 12 bytes");
339
344
/// ```
340
- pub type StringViewArray = GenericBytesViewArray < str > ;
345
+ pub type StringViewArray = GenericByteViewArray < StringViewType > ;
341
346
342
347
impl From < Vec < & str > > for StringViewArray {
343
348
fn from ( v : Vec < & str > ) -> Self {
@@ -348,8 +353,9 @@ impl From<Vec<&str>> for StringViewArray {
348
353
#[ cfg( test) ]
349
354
mod tests {
350
355
use crate :: builder:: StringViewBuilder ;
351
- use crate :: types:: BytesViewType ;
352
356
use crate :: { Array , BinaryViewArray , StringViewArray } ;
357
+ use arrow_buffer:: { Buffer , ScalarBuffer } ;
358
+ use arrow_data:: ByteView ;
353
359
354
360
#[ test]
355
361
fn try_new ( ) {
@@ -363,20 +369,22 @@ mod tests {
363
369
assert_eq ! ( array. value( 3 ) , "large payload over 12 bytes" ) ;
364
370
365
371
let array = BinaryViewArray :: from_iter_values ( vec ! [
366
- b"hello" . to_bytes ( ) ,
367
- b"world" . to_bytes ( ) ,
368
- b"lulu" . to_bytes ( ) ,
369
- b"large payload over 12 bytes" . to_bytes ( ) ,
372
+ b"hello" . as_slice ( ) ,
373
+ b"world" . as_slice ( ) ,
374
+ b"lulu" . as_slice ( ) ,
375
+ b"large payload over 12 bytes" . as_slice ( ) ,
370
376
] ) ;
371
377
assert_eq ! ( array. value( 0 ) , b"hello" ) ;
372
378
assert_eq ! ( array. value( 3 ) , b"large payload over 12 bytes" ) ;
373
379
380
+ // test empty array
374
381
let array = {
375
382
let mut builder = StringViewBuilder :: new ( ) ;
376
383
builder. finish ( )
377
384
} ;
378
385
assert ! ( array. is_empty( ) ) ;
379
386
387
+ // test builder append
380
388
let array = {
381
389
let mut builder = StringViewBuilder :: new ( ) ;
382
390
builder. append_value ( "hello" ) ;
@@ -387,5 +395,48 @@ mod tests {
387
395
assert_eq ! ( array. value( 0 ) , "hello" ) ;
388
396
assert ! ( array. is_null( 1 ) ) ;
389
397
assert_eq ! ( array. value( 2 ) , "large payload over 12 bytes" ) ;
398
+
399
+ // test builder's in_progress re-created
400
+ let array = {
401
+ // make a builder with small block size.
402
+ let mut builder = StringViewBuilder :: new ( ) . with_block_size ( 14 ) ;
403
+ builder. append_value ( "large payload over 12 bytes" ) ;
404
+ builder. append_option ( Some ( "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created" ) ) ;
405
+ builder. finish ( )
406
+ } ;
407
+ assert_eq ! ( array. value( 0 ) , "large payload over 12 bytes" ) ;
408
+ assert_eq ! ( array. value( 1 ) , "another large payload over 12 bytes that double than the first one, so that we can trigger the in_progress in builder re-created" ) ;
409
+ }
410
+
411
+ #[ test]
412
+ #[ should_panic( expected = "Invalid buffer index at 0: got index 3 but only has 1 buffers" ) ]
413
+ fn new_with_invalid_view_data ( ) {
414
+ let v = "large payload over 12 bytes" ;
415
+ let view = ByteView {
416
+ length : 13 ,
417
+ prefix : u32:: from_le_bytes ( v. as_bytes ( ) [ 0 ..4 ] . try_into ( ) . unwrap ( ) ) ,
418
+ buffer_index : 3 ,
419
+ offset : 1 ,
420
+ } ;
421
+ let views = ScalarBuffer :: from ( vec ! [ view. into( ) ] ) ;
422
+ let buffers = vec ! [ Buffer :: from_slice_ref( v) ] ;
423
+ StringViewArray :: new ( views, buffers, None ) ;
424
+ }
425
+
426
+ #[ test]
427
+ #[ should_panic(
428
+ expected = "Encountered non-UTF-8 data at index 0: invalid utf-8 sequence of 1 bytes from index 0"
429
+ ) ]
430
+ fn new_with_invalid_utf8_data ( ) {
431
+ let v: Vec < u8 > = vec ! [ 0xf0 , 0x80 , 0x80 , 0x80 ] ;
432
+ let view = ByteView {
433
+ length : v. len ( ) as u32 ,
434
+ prefix : u32:: from_le_bytes ( v[ 0 ..4 ] . try_into ( ) . unwrap ( ) ) ,
435
+ buffer_index : 0 ,
436
+ offset : 0 ,
437
+ } ;
438
+ let views = ScalarBuffer :: from ( vec ! [ view. into( ) ] ) ;
439
+ let buffers = vec ! [ Buffer :: from_slice_ref( v) ] ;
440
+ StringViewArray :: new ( views, buffers, None ) ;
390
441
}
391
442
}
0 commit comments