
Commit 17058c7

XiangpengHao, bkietz, tustvold, and alamb authored
IPC format support for StringViewArray and BinaryViewArray (#5525)
* check in ipc format for view types
* update tests
* fix variadic counting
* fix linting, address comments
* Apply suggestions from code review
  Co-authored-by: Benjamin Kietzman <[email protected]>
  Co-authored-by: Raphael Taylor-Davies <[email protected]>
* address some review comments
* update comments
* Add tests and fix bugs with dict types
* make clippy happy
* update test cases

---------

Co-authored-by: Benjamin Kietzman <[email protected]>
Co-authored-by: Raphael Taylor-Davies <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
1 parent 9f36c88 commit 17058c7
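
For orientation, here is a minimal sketch (not part of this commit) of the round trip the new tests exercise, assuming a release of arrow-array/arrow-ipc that includes this change; the column name and values are illustrative only:

use std::io::Cursor;
use std::sync::Arc;

use arrow_array::{RecordBatch, StringViewArray};
use arrow_ipc::reader::StreamReader;
use arrow_ipc::writer::StreamWriter;
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // A Utf8View column with both inline (<= 12 byte) and longer, buffer-backed values.
    let values = StringViewArray::from_iter(vec![
        Some("foo"),
        None,
        Some("This string is long enough to spill into a variadic data buffer"),
    ]);
    let schema = Schema::new(vec![Field::new("v", DataType::Utf8View, true)]);
    let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(values)]).unwrap();

    // Write the batch with the IPC stream format ...
    let mut bytes = Vec::new();
    {
        let mut writer = StreamWriter::try_new(&mut bytes, &schema).unwrap();
        writer.write(&batch).unwrap();
        writer.finish().unwrap();
    }

    // ... and read it back.
    let mut reader = StreamReader::try_new(Cursor::new(bytes), None).unwrap();
    let read_back = reader.next().unwrap().unwrap();
    assert_eq!(batch, read_back);
}

The writer records how many variadic data buffers each view column carries in the message's variadicBufferCounts field; the reader changes below drain that list while reconstructing the arrays.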

File tree

3 files changed: +304 additions, -19 deletions


arrow-ipc/src/convert.rs

Lines changed: 14 additions & 1 deletion
@@ -247,8 +247,10 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat
             }
         }
         crate::Type::Binary => DataType::Binary,
+        crate::Type::BinaryView => DataType::BinaryView,
         crate::Type::LargeBinary => DataType::LargeBinary,
         crate::Type::Utf8 => DataType::Utf8,
+        crate::Type::Utf8View => DataType::Utf8View,
         crate::Type::LargeUtf8 => DataType::LargeUtf8,
         crate::Type::FixedSizeBinary => {
             let fsb = field.type_as_fixed_size_binary().unwrap();
@@ -548,7 +550,16 @@ pub(crate) fn get_fb_field_type<'a>(
                 .as_union_value(),
             children: Some(fbb.create_vector(&empty_fields[..])),
         },
-        BinaryView | Utf8View => unimplemented!("unimplemented"),
+        BinaryView => FBFieldType {
+            type_type: crate::Type::BinaryView,
+            type_: crate::BinaryViewBuilder::new(fbb).finish().as_union_value(),
+            children: Some(fbb.create_vector(&empty_fields[..])),
+        },
+        Utf8View => FBFieldType {
+            type_type: crate::Type::Utf8View,
+            type_: crate::Utf8ViewBuilder::new(fbb).finish().as_union_value(),
+            children: Some(fbb.create_vector(&empty_fields[..])),
+        },
         Utf8 => FBFieldType {
             type_type: crate::Type::Utf8,
             type_: crate::Utf8Builder::new(fbb).finish().as_union_value(),
@@ -921,7 +932,9 @@ mod tests {
                 true,
             ),
             Field::new("utf8", DataType::Utf8, false),
+            Field::new("utf8_view", DataType::Utf8View, false),
             Field::new("binary", DataType::Binary, false),
+            Field::new("binary_view", DataType::BinaryView, false),
             Field::new_list("list[u8]", Field::new("item", DataType::UInt8, false), true),
             Field::new_fixed_size_list(
                 "fixed_size_list[u8]",

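The changes above cover both directions of the schema metadata mapping: get_data_type now decodes Type::Utf8View and Type::BinaryView from flatbuffers, and get_fb_field_type encodes them. A minimal sketch (not part of the commit) of that mapping, assuming the crate's public schema_to_fb / fb_to_schema helpers and the generated root_as_schema accessor used by the surrounding test module:

use arrow_ipc::convert::{fb_to_schema, schema_to_fb};
use arrow_schema::{DataType, Field, Schema};

fn main() {
    // Schema containing the two newly supported view types.
    let schema = Schema::new(vec![
        Field::new("utf8_view", DataType::Utf8View, true),
        Field::new("binary_view", DataType::BinaryView, true),
    ]);

    // DataType -> flatbuffers (the get_fb_field_type path) ...
    let fbb = schema_to_fb(&schema);

    // ... and flatbuffers -> DataType (the get_data_type path).
    let ipc_schema = arrow_ipc::root_as_schema(fbb.finished_data()).unwrap();
    let round_tripped = fb_to_schema(ipc_schema);

    assert_eq!(schema, round_tripped);
}
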
arrow-ipc/src/reader.rs

Lines changed: 179 additions & 18 deletions
@@ -25,7 +25,7 @@ mod stream;
 pub use stream::*;
 
 use flatbuffers::{VectorIter, VerifierOptions};
-use std::collections::HashMap;
+use std::collections::{HashMap, VecDeque};
 use std::fmt;
 use std::io::{BufReader, Read, Seek, SeekFrom};
 use std::sync::Arc;
@@ -64,14 +64,21 @@ fn read_buffer(
 
 /// Coordinates reading arrays based on data types.
 ///
+/// `variadic_counts` encodes the number of buffers to read for variadic types (e.g., Utf8View, BinaryView).
+/// When we encounter such a type, we pop the number of buffers to read from the front of the queue.
+///
 /// Notes:
 /// * In the IPC format, null buffers are always set, but may be empty. We discard them if an array has 0 nulls
 /// * Numeric values inside list arrays are often stored as 64-bit values regardless of their data type size.
 /// We thus:
 /// - check if the bit width of non-64-bit numbers is 64, and
 /// - read the buffer as 64-bit (signed integer or float), and
 /// - cast the 64-bit array to the appropriate data type
-fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, ArrowError> {
+fn create_array(
+    reader: &mut ArrayReader,
+    field: &Field,
+    variadic_counts: &mut VecDeque<i64>,
+) -> Result<ArrayRef, ArrowError> {
     let data_type = field.data_type();
     match data_type {
         Utf8 | Binary | LargeBinary | LargeUtf8 => create_primitive_array(
@@ -83,6 +90,18 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
                 reader.next_buffer()?,
             ],
         ),
+        BinaryView | Utf8View => {
+            let count = variadic_counts
+                .pop_front()
+                .ok_or(ArrowError::IpcError(format!(
+                    "Missing variadic count for {data_type} column"
+                )))?;
+            let count = count + 2; // view and null buffer.
+            let buffers = (0..count)
+                .map(|_| reader.next_buffer())
+                .collect::<Result<Vec<_>, _>>()?;
+            create_primitive_array(reader.next_node(field)?, data_type, &buffers)
+        }
         FixedSizeBinary(_) => create_primitive_array(
             reader.next_node(field)?,
             data_type,
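
The count + 2 arithmetic above mirrors the IPC buffer layout for a view column: a validity buffer and a views buffer, followed by however many variadic data buffers the array carries; that last number is what the message's variadicBufferCounts records for each view column. A small sketch (not from the commit) inspecting those pieces on an in-memory array, assuming the views()/data_buffers() accessors exposed by the view array types:

use arrow_array::StringViewArray;

fn main() {
    // Values of 12 bytes or less are inlined in the views; longer values
    // land in one or more variadic data buffers.
    let array = StringViewArray::from_iter(vec![
        Some("short"),
        None,
        Some("This is a long string to make sure binary view array handles it"),
    ]);

    // One fixed-size view per element ...
    assert_eq!(array.views().len(), 3);

    // ... plus the variadic data buffers. The IPC writer records this count in
    // variadicBufferCounts, and the reader above fetches `count + 2` buffers
    // (validity + views + data buffers) for each view column.
    println!("variadic data buffers: {}", array.data_buffers().len());
}
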
@@ -91,13 +110,13 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
         List(ref list_field) | LargeList(ref list_field) | Map(ref list_field, _) => {
             let list_node = reader.next_node(field)?;
             let list_buffers = [reader.next_buffer()?, reader.next_buffer()?];
-            let values = create_array(reader, list_field)?;
+            let values = create_array(reader, list_field, variadic_counts)?;
             create_list_array(list_node, data_type, &list_buffers, values)
         }
         FixedSizeList(ref list_field, _) => {
             let list_node = reader.next_node(field)?;
             let list_buffers = [reader.next_buffer()?];
-            let values = create_array(reader, list_field)?;
+            let values = create_array(reader, list_field, variadic_counts)?;
             create_list_array(list_node, data_type, &list_buffers, values)
         }
         Struct(struct_fields) => {
@@ -109,7 +128,7 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
             // TODO investigate whether just knowing the number of buffers could
             // still work
             for struct_field in struct_fields {
-                let child = create_array(reader, struct_field)?;
+                let child = create_array(reader, struct_field, variadic_counts)?;
                 struct_arrays.push((struct_field.clone(), child));
             }
             let null_count = struct_node.null_count() as usize;
@@ -123,8 +142,8 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
         }
         RunEndEncoded(run_ends_field, values_field) => {
             let run_node = reader.next_node(field)?;
-            let run_ends = create_array(reader, run_ends_field)?;
-            let values = create_array(reader, values_field)?;
+            let run_ends = create_array(reader, run_ends_field, variadic_counts)?;
+            let values = create_array(reader, values_field, variadic_counts)?;
 
             let run_array_length = run_node.length() as usize;
             let data = ArrayData::builder(data_type.clone())
@@ -177,7 +196,7 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result<ArrayRef, Arr
             let mut ids = Vec::with_capacity(fields.len());
 
             for (id, field) in fields.iter() {
-                let child = create_array(reader, field)?;
+                let child = create_array(reader, field, variadic_counts)?;
                 children.push((field.as_ref().clone(), child));
                 ids.push(id);
             }
@@ -230,6 +249,11 @@ fn create_primitive_array(
                 .null_bit_buffer(null_buffer)
                 .build_aligned()?
         }
+        BinaryView | Utf8View => ArrayData::builder(data_type.clone())
+            .len(length)
+            .buffers(buffers[1..].to_vec())
+            .null_bit_buffer(null_buffer)
+            .build_aligned()?,
         _ if data_type.is_primitive() || matches!(data_type, Boolean | FixedSizeBinary(_)) => {
            // read 2 buffers: null buffer (optional) and data buffer
            ArrayData::builder(data_type.clone())
@@ -328,7 +352,11 @@ impl<'a> ArrayReader<'a> {
        })
    }
 
-    fn skip_field(&mut self, field: &Field) -> Result<(), ArrowError> {
+    fn skip_field(
+        &mut self,
+        field: &Field,
+        variadic_count: &mut VecDeque<i64>,
+    ) -> Result<(), ArrowError> {
        self.next_node(field)?;
 
        match field.data_type() {
@@ -337,30 +365,42 @@
                    self.skip_buffer()
                }
            }
+            Utf8View | BinaryView => {
+                let count = variadic_count
+                    .pop_front()
+                    .ok_or(ArrowError::IpcError(format!(
+                        "Missing variadic count for {} column",
+                        field.data_type()
+                    )))?;
+                let count = count + 2; // view and null buffer.
+                for _i in 0..count {
+                    self.skip_buffer()
+                }
+            }
            FixedSizeBinary(_) => {
                self.skip_buffer();
                self.skip_buffer();
            }
            List(list_field) | LargeList(list_field) | Map(list_field, _) => {
                self.skip_buffer();
                self.skip_buffer();
-                self.skip_field(list_field)?;
+                self.skip_field(list_field, variadic_count)?;
            }
            FixedSizeList(list_field, _) => {
                self.skip_buffer();
-                self.skip_field(list_field)?;
+                self.skip_field(list_field, variadic_count)?;
            }
            Struct(struct_fields) => {
                self.skip_buffer();
 
                // skip for each field
                for struct_field in struct_fields {
-                    self.skip_field(struct_field)?
+                    self.skip_field(struct_field, variadic_count)?
                }
            }
            RunEndEncoded(run_ends_field, values_field) => {
-                self.skip_field(run_ends_field)?;
-                self.skip_field(values_field)?;
+                self.skip_field(run_ends_field, variadic_count)?;
+                self.skip_field(values_field, variadic_count)?;
            }
            Dictionary(_, _) => {
                self.skip_buffer(); // Nulls
@@ -375,7 +415,7 @@ impl<'a> ArrayReader<'a> {
                };
 
                for (_, field) in fields.iter() {
-                    self.skip_field(field)?
+                    self.skip_field(field, variadic_count)?
                }
            }
            Null => {} // No buffer increases
@@ -403,6 +443,10 @@ pub fn read_record_batch(
    let field_nodes = batch.nodes().ok_or_else(|| {
        ArrowError::IpcError("Unable to get field nodes from IPC RecordBatch".to_string())
    })?;
+
+    let mut variadic_counts: VecDeque<i64> =
+        batch.variadicBufferCounts().into_iter().flatten().collect();
+
    let batch_compression = batch.compression();
    let compression = batch_compression
        .map(|batch_compression| batch_compression.codec().try_into())
@@ -425,12 +469,13 @@ pub fn read_record_batch(
        for (idx, field) in schema.fields().iter().enumerate() {
            // Create array for projected field
            if let Some(proj_idx) = projection.iter().position(|p| p == &idx) {
-                let child = create_array(&mut reader, field)?;
+                let child = create_array(&mut reader, field, &mut variadic_counts)?;
                arrays.push((proj_idx, child));
            } else {
-                reader.skip_field(field)?;
+                reader.skip_field(field, &mut variadic_counts)?;
            }
        }
+        assert!(variadic_counts.is_empty());
        arrays.sort_by_key(|t| t.0);
        RecordBatch::try_new_with_options(
            Arc::new(schema.project(projection)?),
@@ -441,9 +486,10 @@ pub fn read_record_batch(
        let mut children = vec![];
        // keep track of index as lists require more than one node
        for field in schema.fields() {
-            let child = create_array(&mut reader, field)?;
+            let child = create_array(&mut reader, field, &mut variadic_counts)?;
            children.push(child);
        }
+        assert!(variadic_counts.is_empty());
        RecordBatch::try_new_with_options(schema, children, &options)
    }
 }
@@ -1759,6 +1805,121 @@ mod tests {
         assert_eq!(input_batch, output_batch);
     }
 
+    const LONG_TEST_STRING: &str =
+        "This is a long string to make sure binary view array handles it";
+
+    #[test]
+    fn test_roundtrip_view_types() {
+        let schema = Schema::new(vec![
+            Field::new("field_1", DataType::BinaryView, true),
+            Field::new("field_2", DataType::Utf8, true),
+            Field::new("field_3", DataType::Utf8View, true),
+        ]);
+        let bin_values: Vec<Option<&[u8]>> = vec![
+            Some(b"foo"),
+            None,
+            Some(b"bar"),
+            Some(LONG_TEST_STRING.as_bytes()),
+        ];
+        let utf8_values: Vec<Option<&str>> =
+            vec![Some("foo"), None, Some("bar"), Some(LONG_TEST_STRING)];
+        let bin_view_array = BinaryViewArray::from_iter(bin_values);
+        let utf8_array = StringArray::from_iter(utf8_values.iter());
+        let utf8_view_array = StringViewArray::from_iter(utf8_values);
+        let record_batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![
+                Arc::new(bin_view_array),
+                Arc::new(utf8_array),
+                Arc::new(utf8_view_array),
+            ],
+        )
+        .unwrap();
+
+        assert_eq!(record_batch, roundtrip_ipc(&record_batch));
+        assert_eq!(record_batch, roundtrip_ipc_stream(&record_batch));
+
+        let sliced_batch = record_batch.slice(1, 2);
+        assert_eq!(sliced_batch, roundtrip_ipc(&sliced_batch));
+        assert_eq!(sliced_batch, roundtrip_ipc_stream(&sliced_batch));
+    }
+
+    #[test]
+    fn test_roundtrip_view_types_nested_dict() {
+        let bin_values: Vec<Option<&[u8]>> = vec![
+            Some(b"foo"),
+            None,
+            Some(b"bar"),
+            Some(LONG_TEST_STRING.as_bytes()),
+            Some(b"field"),
+        ];
+        let utf8_values: Vec<Option<&str>> = vec![
+            Some("foo"),
+            None,
+            Some("bar"),
+            Some(LONG_TEST_STRING),
+            Some("field"),
+        ];
+        let bin_view_array = Arc::new(BinaryViewArray::from_iter(bin_values));
+        let utf8_view_array = Arc::new(StringViewArray::from_iter(utf8_values));
+
+        let key_dict_keys = Int8Array::from_iter_values([0, 0, 1, 2, 0, 1, 3]);
+        let key_dict_array = DictionaryArray::new(key_dict_keys, utf8_view_array.clone());
+        let keys_field = Arc::new(Field::new_dict(
+            "keys",
+            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::Utf8View)),
+            true,
+            1,
+            false,
+        ));
+
+        let value_dict_keys = Int8Array::from_iter_values([0, 3, 0, 1, 2, 0, 1]);
+        let value_dict_array = DictionaryArray::new(value_dict_keys, bin_view_array);
+        let values_field = Arc::new(Field::new_dict(
+            "values",
+            DataType::Dictionary(Box::new(DataType::Int8), Box::new(DataType::BinaryView)),
+            true,
+            2,
+            false,
+        ));
+        let entry_struct = StructArray::from(vec![
+            (keys_field, make_array(key_dict_array.into_data())),
+            (values_field, make_array(value_dict_array.into_data())),
+        ]);
+
+        let map_data_type = DataType::Map(
+            Arc::new(Field::new(
+                "entries",
+                entry_struct.data_type().clone(),
+                false,
+            )),
+            false,
+        );
+        let entry_offsets = Buffer::from_slice_ref([0, 2, 4, 7]);
+        let map_data = ArrayData::builder(map_data_type)
+            .len(3)
+            .add_buffer(entry_offsets)
+            .add_child_data(entry_struct.into_data())
+            .build()
+            .unwrap();
+        let map_array = MapArray::from(map_data);
+
+        let dict_keys = Int8Array::from_iter_values([0, 1, 0, 1, 1, 2, 0, 1, 2]);
+        let dict_dict_array = DictionaryArray::new(dict_keys, Arc::new(map_array));
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "f1",
+            dict_dict_array.data_type().clone(),
+            false,
+        )]));
+        let batch = RecordBatch::try_new(schema, vec![Arc::new(dict_dict_array)]).unwrap();
+        assert_eq!(batch, roundtrip_ipc(&batch));
+        assert_eq!(batch, roundtrip_ipc_stream(&batch));
+
+        let sliced_batch = batch.slice(1, 2);
+        assert_eq!(sliced_batch, roundtrip_ipc(&sliced_batch));
+        assert_eq!(sliced_batch, roundtrip_ipc_stream(&sliced_batch));
+    }
+
     #[test]
     fn test_no_columns_batch() {
         let schema = Arc::new(Schema::empty());
