@@ -25,7 +25,7 @@ mod stream;
25
25
pub use stream:: * ;
26
26
27
27
use flatbuffers:: { VectorIter , VerifierOptions } ;
28
- use std:: collections:: HashMap ;
28
+ use std:: collections:: { HashMap , VecDeque } ;
29
29
use std:: fmt;
30
30
use std:: io:: { BufReader , Read , Seek , SeekFrom } ;
31
31
use std:: sync:: Arc ;
@@ -64,14 +64,21 @@ fn read_buffer(
64
64
65
65
/// Coordinates reading arrays based on data types.
66
66
///
67
+ /// `variadic_counts` encodes the number of buffers to read for variadic types (e.g., Utf8View, BinaryView).
68
+ /// When we encounter such a type, we pop from the front of the queue to get the number of buffers to read.
69
+ ///
67
70
/// Notes:
68
71
/// * In the IPC format, null buffers are always set, but may be empty. We discard them if an array has 0 nulls
69
72
/// * Numeric values inside list arrays are often stored as 64-bit values regardless of their data type size.
70
73
/// We thus:
71
74
/// - check if the bit width of non-64-bit numbers is 64, and
72
75
/// - read the buffer as 64-bit (signed integer or float), and
73
76
/// - cast the 64-bit array to the appropriate data type
74
- fn create_array ( reader : & mut ArrayReader , field : & Field ) -> Result < ArrayRef , ArrowError > {
77
+ fn create_array (
78
+ reader : & mut ArrayReader ,
79
+ field : & Field ,
80
+ variadic_counts : & mut VecDeque < i64 > ,
81
+ ) -> Result < ArrayRef , ArrowError > {
75
82
let data_type = field. data_type ( ) ;
76
83
match data_type {
77
84
Utf8 | Binary | LargeBinary | LargeUtf8 => create_primitive_array (
@@ -83,6 +90,18 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
83
90
reader. next_buffer ( ) ?,
84
91
] ,
85
92
) ,
93
+ BinaryView | Utf8View => {
94
+ let count = variadic_counts
95
+ . pop_front ( )
96
+ . ok_or ( ArrowError :: IpcError ( format ! (
97
+ "Missing variadic count for {data_type} column"
98
+ ) ) ) ?;
99
+ let count = count + 2 ; // view and null buffer.
100
+ let buffers = ( 0 ..count)
101
+ . map ( |_| reader. next_buffer ( ) )
102
+ . collect :: < Result < Vec < _ > , _ > > ( ) ?;
103
+ create_primitive_array ( reader. next_node ( field) ?, data_type, & buffers)
104
+ }
86
105
FixedSizeBinary ( _) => create_primitive_array (
87
106
reader. next_node ( field) ?,
88
107
data_type,
@@ -91,13 +110,13 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
91
110
List ( ref list_field) | LargeList ( ref list_field) | Map ( ref list_field, _) => {
92
111
let list_node = reader. next_node ( field) ?;
93
112
let list_buffers = [ reader. next_buffer ( ) ?, reader. next_buffer ( ) ?] ;
94
- let values = create_array ( reader, list_field) ?;
113
+ let values = create_array ( reader, list_field, variadic_counts ) ?;
95
114
create_list_array ( list_node, data_type, & list_buffers, values)
96
115
}
97
116
FixedSizeList ( ref list_field, _) => {
98
117
let list_node = reader. next_node ( field) ?;
99
118
let list_buffers = [ reader. next_buffer ( ) ?] ;
100
- let values = create_array ( reader, list_field) ?;
119
+ let values = create_array ( reader, list_field, variadic_counts ) ?;
101
120
create_list_array ( list_node, data_type, & list_buffers, values)
102
121
}
103
122
Struct ( struct_fields) => {
@@ -109,7 +128,7 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
109
128
// TODO investigate whether just knowing the number of buffers could
110
129
// still work
111
130
for struct_field in struct_fields {
112
- let child = create_array ( reader, struct_field) ?;
131
+ let child = create_array ( reader, struct_field, variadic_counts ) ?;
113
132
struct_arrays. push ( ( struct_field. clone ( ) , child) ) ;
114
133
}
115
134
let null_count = struct_node. null_count ( ) as usize ;
@@ -123,8 +142,8 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
123
142
}
124
143
RunEndEncoded ( run_ends_field, values_field) => {
125
144
let run_node = reader. next_node ( field) ?;
126
- let run_ends = create_array ( reader, run_ends_field) ?;
127
- let values = create_array ( reader, values_field) ?;
145
+ let run_ends = create_array ( reader, run_ends_field, variadic_counts ) ?;
146
+ let values = create_array ( reader, values_field, variadic_counts ) ?;
128
147
129
148
let run_array_length = run_node. length ( ) as usize ;
130
149
let data = ArrayData :: builder ( data_type. clone ( ) )
@@ -177,7 +196,7 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
177
196
let mut ids = Vec :: with_capacity ( fields. len ( ) ) ;
178
197
179
198
for ( id, field) in fields. iter ( ) {
180
- let child = create_array ( reader, field) ?;
199
+ let child = create_array ( reader, field, variadic_counts ) ?;
181
200
children. push ( ( field. as_ref ( ) . clone ( ) , child) ) ;
182
201
ids. push ( id) ;
183
202
}
@@ -230,6 +249,11 @@ fn create_primitive_array(
230
249
. null_bit_buffer ( null_buffer)
231
250
. build_aligned ( ) ?
232
251
}
252
+ BinaryView | Utf8View => ArrayData :: builder ( data_type. clone ( ) )
253
+ . len ( length)
254
+ . buffers ( buffers[ 1 ..] . to_vec ( ) )
255
+ . null_bit_buffer ( null_buffer)
256
+ . build_aligned ( ) ?,
233
257
_ if data_type. is_primitive ( ) || matches ! ( data_type, Boolean | FixedSizeBinary ( _) ) => {
234
258
// read 2 buffers: null buffer (optional) and data buffer
235
259
ArrayData :: builder ( data_type. clone ( ) )
@@ -328,7 +352,11 @@ impl<'a> ArrayReader<'a> {
328
352
} )
329
353
}
330
354
331
- fn skip_field ( & mut self , field : & Field ) -> Result < ( ) , ArrowError > {
355
+ fn skip_field (
356
+ & mut self ,
357
+ field : & Field ,
358
+ variadic_count : & mut VecDeque < i64 > ,
359
+ ) -> Result < ( ) , ArrowError > {
332
360
self . next_node ( field) ?;
333
361
334
362
match field. data_type ( ) {
@@ -337,30 +365,42 @@ impl<'a> ArrayReader<'a> {
337
365
self . skip_buffer ( )
338
366
}
339
367
}
368
+ Utf8View | BinaryView => {
369
+ let count = variadic_count
370
+ . pop_front ( )
371
+ . ok_or ( ArrowError :: IpcError ( format ! (
372
+ "Missing variadic count for {} column" ,
373
+ field. data_type( )
374
+ ) ) ) ?;
375
+ let count = count + 2 ; // view and null buffer.
376
+ for _i in 0 ..count {
377
+ self . skip_buffer ( )
378
+ }
379
+ }
340
380
FixedSizeBinary ( _) => {
341
381
self . skip_buffer ( ) ;
342
382
self . skip_buffer ( ) ;
343
383
}
344
384
List ( list_field) | LargeList ( list_field) | Map ( list_field, _) => {
345
385
self . skip_buffer ( ) ;
346
386
self . skip_buffer ( ) ;
347
- self . skip_field ( list_field) ?;
387
+ self . skip_field ( list_field, variadic_count ) ?;
348
388
}
349
389
FixedSizeList ( list_field, _) => {
350
390
self . skip_buffer ( ) ;
351
- self . skip_field ( list_field) ?;
391
+ self . skip_field ( list_field, variadic_count ) ?;
352
392
}
353
393
Struct ( struct_fields) => {
354
394
self . skip_buffer ( ) ;
355
395
356
396
// skip for each field
357
397
for struct_field in struct_fields {
358
- self . skip_field ( struct_field) ?
398
+ self . skip_field ( struct_field, variadic_count ) ?
359
399
}
360
400
}
361
401
RunEndEncoded ( run_ends_field, values_field) => {
362
- self . skip_field ( run_ends_field) ?;
363
- self . skip_field ( values_field) ?;
402
+ self . skip_field ( run_ends_field, variadic_count ) ?;
403
+ self . skip_field ( values_field, variadic_count ) ?;
364
404
}
365
405
Dictionary ( _, _) => {
366
406
self . skip_buffer ( ) ; // Nulls
@@ -375,7 +415,7 @@ impl<'a> ArrayReader<'a> {
375
415
} ;
376
416
377
417
for ( _, field) in fields. iter ( ) {
378
- self . skip_field ( field) ?
418
+ self . skip_field ( field, variadic_count ) ?
379
419
}
380
420
}
381
421
Null => { } // No buffer increases
@@ -403,6 +443,10 @@ pub fn read_record_batch(
403
443
let field_nodes = batch. nodes ( ) . ok_or_else ( || {
404
444
ArrowError :: IpcError ( "Unable to get field nodes from IPC RecordBatch" . to_string ( ) )
405
445
} ) ?;
446
+
447
+ let mut variadic_counts: VecDeque < i64 > =
448
+ batch. variadicBufferCounts ( ) . into_iter ( ) . flatten ( ) . collect ( ) ;
449
+
406
450
let batch_compression = batch. compression ( ) ;
407
451
let compression = batch_compression
408
452
. map ( |batch_compression| batch_compression. codec ( ) . try_into ( ) )
@@ -425,12 +469,13 @@ pub fn read_record_batch(
425
469
for ( idx, field) in schema. fields ( ) . iter ( ) . enumerate ( ) {
426
470
// Create array for projected field
427
471
if let Some ( proj_idx) = projection. iter ( ) . position ( |p| p == & idx) {
428
- let child = create_array ( & mut reader, field) ?;
472
+ let child = create_array ( & mut reader, field, & mut variadic_counts ) ?;
429
473
arrays. push ( ( proj_idx, child) ) ;
430
474
} else {
431
- reader. skip_field ( field) ?;
475
+ reader. skip_field ( field, & mut variadic_counts ) ?;
432
476
}
433
477
}
478
+ assert ! ( variadic_counts. is_empty( ) ) ;
434
479
arrays. sort_by_key ( |t| t. 0 ) ;
435
480
RecordBatch :: try_new_with_options (
436
481
Arc :: new ( schema. project ( projection) ?) ,
@@ -441,9 +486,10 @@ pub fn read_record_batch(
441
486
let mut children = vec ! [ ] ;
442
487
// keep track of index as lists require more than one node
443
488
for field in schema. fields ( ) {
444
- let child = create_array ( & mut reader, field) ?;
489
+ let child = create_array ( & mut reader, field, & mut variadic_counts ) ?;
445
490
children. push ( child) ;
446
491
}
492
+ assert ! ( variadic_counts. is_empty( ) ) ;
447
493
RecordBatch :: try_new_with_options ( schema, children, & options)
448
494
}
449
495
}
@@ -1759,6 +1805,121 @@ mod tests {
1759
1805
assert_eq ! ( input_batch, output_batch) ;
1760
1806
}
1761
1807
1808
// Shared fixture string (63 bytes) used by the view-type round-trip tests in this module.
// NOTE(review): presumably chosen to be long enough that a view array stores it in a
// separate data buffer instead of inline in the view; confirm against the Arrow view layout.
+ const LONG_TEST_STRING : & str =
1809
+ "This is a long string to make sure binary view array handles it" ;
1810
+
1811
/// Builds a three-column batch (`BinaryView`, `Utf8`, `Utf8View`) and checks that
/// both the whole batch and a two-row slice of it come back unchanged from
/// `roundtrip_ipc` and `roundtrip_ipc_stream`.
#[test]
fn test_roundtrip_view_types() {
    // Every column carries the same logical rows: two short values, a null,
    // and one long value.
    let binary_rows: Vec<Option<&[u8]>> = vec![
        Some(b"foo"),
        None,
        Some(b"bar"),
        Some(LONG_TEST_STRING.as_bytes()),
    ];
    let string_rows: Vec<Option<&str>> =
        vec![Some("foo"), None, Some("bar"), Some(LONG_TEST_STRING)];

    let fields = vec![
        Field::new("field_1", DataType::BinaryView, true),
        Field::new("field_2", DataType::Utf8, true),
        Field::new("field_3", DataType::Utf8View, true),
    ];

    let record_batch = RecordBatch::try_new(
        Arc::new(Schema::new(fields)),
        vec![
            Arc::new(BinaryViewArray::from_iter(binary_rows)),
            // Borrowing iterator first, so `string_rows` can be consumed below.
            Arc::new(StringArray::from_iter(string_rows.iter())),
            Arc::new(StringViewArray::from_iter(string_rows)),
        ],
    )
    .unwrap();

    // Same pair of checks for every batch under test.
    let assert_roundtrips = |batch: &RecordBatch| {
        assert_eq!(*batch, roundtrip_ipc(batch));
        assert_eq!(*batch, roundtrip_ipc_stream(batch));
    };

    assert_roundtrips(&record_batch);
    // Rows 1..3 of the original batch.
    assert_roundtrips(&record_batch.slice(1, 2));
}
1846
+
1847
/// Round-trips a dictionary-encoded map column whose entries are a struct of
/// dictionary-encoded `Utf8View` keys and dictionary-encoded `BinaryView` values.
/// The whole batch and a two-row slice of it must come back unchanged from
/// `roundtrip_ipc` and `roundtrip_ipc_stream`.
#[test]
fn test_roundtrip_view_types_nested_dict() {
    let binary_rows: Vec<Option<&[u8]>> = vec![
        Some(b"foo"),
        None,
        Some(b"bar"),
        Some(LONG_TEST_STRING.as_bytes()),
        Some(b"field"),
    ];
    let string_rows: Vec<Option<&str>> = vec![
        Some("foo"),
        None,
        Some("bar"),
        Some(LONG_TEST_STRING),
        Some("field"),
    ];

    // Map keys: Int8 dictionary over the Utf8View values (dictionary id 1).
    let keys_field = Arc::new(Field::new_dict(
        "keys",
        DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)),
        true,
        1,
        false,
    ));
    let keys_dict = DictionaryArray::new(
        Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]),
        Arc::new(StringViewArray::from_iter(string_rows)),
    );

    // Map values: Int8 dictionary over the BinaryView values (dictionary id 2).
    let values_field = Arc::new(Field::new_dict(
        "values",
        DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::BinaryView)),
        true,
        2,
        false,
    ));
    let values_dict = DictionaryArray::new(
        Int8Array::from_iter_values([0, 3, 0, 1, 2, 0, 1]),
        Arc::new(BinaryViewArray::from_iter(binary_rows)),
    );

    // Seven key/value entries packed into one struct array.
    let entries = StructArray::from(vec![
        (keys_field, make_array(keys_dict.into_data())),
        (values_field, make_array(values_dict.into_data())),
    ]);

    // Three map slots over the seven entries, delimited by offsets [0, 2, 4, 7].
    let entries_field = Field::new("entries", entries.data_type().clone(), false);
    let map_data = ArrayData::builder(DataType::Map(Arc::new(entries_field), false))
        .len(3)
        .add_buffer(Buffer::from_slice_ref([0, 2, 4, 7]))
        .add_child_data(entries.into_data())
        .build()
        .unwrap();

    // Outer layer: an Int8 dictionary whose values are the map array.
    let outer_dict = DictionaryArray::new(
        Int8Array::from_iter_values([0, 1, 0, 1, 1, 2, 0, 1, 2]),
        Arc::new(MapArray::from(map_data)),
    );

    let schema = Arc::new(Schema::new(vec![Field::new(
        "f1",
        outer_dict.data_type().clone(),
        false,
    )]));
    let batch = RecordBatch::try_new(schema, vec![Arc::new(outer_dict)]).unwrap();

    // Same pair of checks for every batch under test.
    let assert_roundtrips = |candidate: &RecordBatch| {
        assert_eq!(*candidate, roundtrip_ipc(candidate));
        assert_eq!(*candidate, roundtrip_ipc_stream(candidate));
    };

    assert_roundtrips(&batch);
    // Rows 1..3 of the original batch.
    assert_roundtrips(&batch.slice(1, 2));
}
1922
+
1762
1923
#[ test]
1763
1924
fn test_no_columns_batch ( ) {
1764
1925
let schema = Arc :: new ( Schema :: empty ( ) ) ;
0 commit comments