Skip to content

Commit e6bd74b

Browse files
authored
Add support for StringView and BinaryView statistics in StatisticsConverter (#6181)
* Add StringView and BinaryView support for the macro `get_statistics` * Add StringView and BinaryView support for the macro `get_data_page_statistics` * add tests to cover the support for StringView and BinaryView in the macro get_data_page_statistics * found potential bugs and ignore the tests * fake alarm! no bugs, fix the code by initiating all batches to have 5 rows * make the get_stat StringView and BinaryView tests cover bytes greater than 12
1 parent 36d567b commit e6bd74b

File tree

3 files changed

+270
-20
lines changed

3 files changed

+270
-20
lines changed

parquet/src/arrow/arrow_reader/statistics.rs

Lines changed: 164 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex};
2626
use crate::file::statistics::Statistics as ParquetStatistics;
2727
use crate::schema::types::SchemaDescriptor;
2828
use arrow_array::builder::{
29-
BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
29+
BinaryViewBuilder, BooleanBuilder, FixedSizeBinaryBuilder, LargeStringBuilder, StringBuilder,
30+
StringViewBuilder,
3031
};
3132
use arrow_array::{
3233
new_empty_array, new_null_array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array,
@@ -446,14 +447,43 @@ macro_rules! get_statistics {
446447
},
447448
DataType::Dictionary(_, value_type) => {
448449
[<$stat_type_prefix:lower _ statistics>](value_type, $iterator)
450+
},
451+
DataType::Utf8View => {
452+
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
453+
let mut builder = StringViewBuilder::new();
454+
for x in iterator {
455+
let Some(x) = x else {
456+
builder.append_null(); // no statistics value
457+
continue;
458+
};
459+
460+
let Ok(x) = std::str::from_utf8(x) else {
461+
builder.append_null();
462+
continue;
463+
};
464+
465+
builder.append_value(x);
466+
}
467+
Ok(Arc::new(builder.finish()))
468+
},
469+
DataType::BinaryView => {
470+
let iterator = [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator);
471+
let mut builder = BinaryViewBuilder::new();
472+
for x in iterator {
473+
let Some(x) = x else {
474+
builder.append_null(); // no statistics value
475+
continue;
476+
};
477+
478+
builder.append_value(x);
479+
}
480+
Ok(Arc::new(builder.finish()))
449481
}
450482

451483
DataType::Map(_,_) |
452484
DataType::Duration(_) |
453485
DataType::Interval(_) |
454486
DataType::Null |
455-
DataType::BinaryView |
456-
DataType::Utf8View |
457487
DataType::List(_) |
458488
DataType::ListView(_) |
459489
DataType::FixedSizeList(_, _) |
@@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics {
919949
}
920950
})
921951
},
922-
Some(DataType::FixedSizeBinary(size)) => {
952+
Some(DataType::FixedSizeBinary(size)) => {
923953
let mut builder = FixedSizeBinaryBuilder::new(*size);
924954
let iterator = [<$stat_type_prefix FixedLenByteArrayDataPageStatsIterator>]::new($iterator);
925955
for x in iterator {
@@ -943,7 +973,58 @@ macro_rules! get_data_page_statistics {
943973
}
944974
Ok(Arc::new(builder.finish()))
945975
},
946-
_ => unimplemented!()
976+
Some(DataType::Utf8View) => {
977+
let mut builder = StringViewBuilder::new();
978+
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
979+
for x in iterator {
980+
for x in x.into_iter() {
981+
let Some(x) = x else {
982+
builder.append_null(); // no statistics value
983+
continue;
984+
};
985+
986+
let Ok(x) = std::str::from_utf8(x.data()) else {
987+
builder.append_null();
988+
continue;
989+
};
990+
991+
builder.append_value(x);
992+
}
993+
}
994+
Ok(Arc::new(builder.finish()))
995+
},
996+
Some(DataType::BinaryView) => {
997+
let mut builder = BinaryViewBuilder::new();
998+
let iterator = [<$stat_type_prefix ByteArrayDataPageStatsIterator>]::new($iterator);
999+
for x in iterator {
1000+
for x in x.into_iter() {
1001+
let Some(x) = x else {
1002+
builder.append_null(); // no statistics value
1003+
continue;
1004+
};
1005+
1006+
builder.append_value(x);
1007+
}
1008+
}
1009+
Ok(Arc::new(builder.finish()))
1010+
},
1011+
Some(DataType::Null) |
1012+
Some(DataType::Duration(_)) |
1013+
Some(DataType::Interval(_)) |
1014+
Some(DataType::List(_)) |
1015+
Some(DataType::ListView(_)) |
1016+
Some(DataType::FixedSizeList(_, _)) |
1017+
Some(DataType::LargeList(_)) |
1018+
Some(DataType::LargeListView(_)) |
1019+
Some(DataType::Struct(_)) |
1020+
Some(DataType::Union(_, _)) |
1021+
Some(DataType::Map(_, _)) |
1022+
Some(DataType::RunEndEncoded(_, _)) => {
1023+
let len = $iterator.count();
1024+
// don't know how to extract statistics, so return a null array
1025+
Ok(new_null_array($data_type.unwrap(), len))
1026+
},
1027+
None => unimplemented!() // not sure how to handle this
9471028
}
9481029
}
9491030
}
@@ -1499,10 +1580,10 @@ mod test {
14991580
use arrow::datatypes::{i256, Date32Type, Date64Type};
15001581
use arrow::util::test_util::parquet_test_data;
15011582
use arrow_array::{
1502-
new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
1503-
Date64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int16Array,
1504-
Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch, StringArray, StructArray,
1505-
TimestampNanosecondArray,
1583+
new_empty_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray,
1584+
BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Float32Array,
1585+
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray, RecordBatch,
1586+
StringArray, StringViewArray, StructArray, TimestampNanosecondArray,
15061587
};
15071588
use arrow_schema::{DataType, Field, SchemaRef};
15081589
use bytes::Bytes;
@@ -1916,6 +1997,65 @@ mod test {
19161997
.run()
19171998
}
19181999

2000+
#[test]
2001+
fn roundtrip_string_view() {
2002+
Test {
2003+
input: string_view_array([
2004+
// row group 1
2005+
Some("A"),
2006+
None,
2007+
Some("Q"),
2008+
// row group 2
2009+
Some("ZZ"),
2010+
Some("A_longerthan12"),
2011+
None,
2012+
// row group 3
2013+
Some("A_longerthan12"),
2014+
None,
2015+
None,
2016+
]),
2017+
expected_min: string_view_array([
2018+
Some("A"),
2019+
Some("A_longerthan12"),
2020+
Some("A_longerthan12"),
2021+
]),
2022+
expected_max: string_view_array([Some("Q"), Some("ZZ"), Some("A_longerthan12")]),
2023+
}
2024+
.run()
2025+
}
2026+
2027+
#[test]
2028+
fn roundtrip_binary_view() {
2029+
let input: Vec<Option<&[u8]>> = vec![
2030+
// row group 1
2031+
Some(b"A"),
2032+
None,
2033+
Some(b"Q"),
2034+
// row group 2
2035+
Some(b"ZZ"),
2036+
Some(b"A_longerthan12"),
2037+
None,
2038+
// row group 3
2039+
Some(b"A_longerthan12"),
2040+
None,
2041+
None,
2042+
];
2043+
2044+
let expected_min: Vec<Option<&[u8]>> =
2045+
vec![Some(b"A"), Some(b"A_longerthan12"), Some(b"A_longerthan12")];
2046+
let expected_max: Vec<Option<&[u8]>> =
2047+
vec![Some(b"Q"), Some(b"ZZ"), Some(b"A_longerthan12")];
2048+
2049+
let array = binary_view_array(input);
2050+
2051+
Test {
2052+
input: array,
2053+
expected_min: binary_view_array(expected_min),
2054+
expected_max: binary_view_array(expected_max),
2055+
}
2056+
.run()
2057+
}
2058+
19192059
#[test]
19202060
fn roundtrip_struct() {
19212061
let mut test = Test {
@@ -2539,4 +2679,19 @@ mod test {
25392679

25402680
Arc::new(array)
25412681
}
2682+
2683+
fn string_view_array<'a>(input: impl IntoIterator<Item = Option<&'a str>>) -> ArrayRef {
2684+
let array: StringViewArray = input
2685+
.into_iter()
2686+
.map(|s| s.map(|s| s.to_string()))
2687+
.collect();
2688+
2689+
Arc::new(array)
2690+
}
2691+
2692+
fn binary_view_array(input: Vec<Option<&[u8]>>) -> ArrayRef {
2693+
let array = BinaryViewArray::from(input.into_iter().collect::<Vec<Option<&[u8]>>>());
2694+
2695+
Arc::new(array)
2696+
}
25422697
}

parquet/tests/arrow_reader/mod.rs

Lines changed: 48 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717

1818
use arrow_array::types::{Int32Type, Int8Type};
1919
use arrow_array::{
20-
Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array,
21-
Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array, Float32Array,
22-
Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
23-
LargeStringArray, RecordBatch, StringArray, StructArray, Time32MillisecondArray,
24-
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
25-
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
26-
UInt32Array, UInt64Array, UInt8Array,
20+
Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray, Date32Array, Date64Array,
21+
Decimal128Array, Decimal256Array, DictionaryArray, FixedSizeBinaryArray, Float16Array,
22+
Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
23+
LargeStringArray, RecordBatch, StringArray, StringViewArray, StructArray,
24+
Time32MillisecondArray, Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray,
25+
TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray,
26+
TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array,
2727
};
2828
use arrow_buffer::i256;
2929
use arrow_schema::{DataType, Field, Schema, TimeUnit};
@@ -88,6 +88,8 @@ enum Scenario {
8888
PeriodsInColumnNames,
8989
StructArray,
9090
UTF8,
91+
UTF8View,
92+
BinaryView,
9193
}
9294

9395
fn make_boolean_batch(v: Vec<Option<bool>>) -> RecordBatch {
@@ -589,6 +591,16 @@ fn make_utf8_batch(value: Vec<Option<&str>>) -> RecordBatch {
589591
.unwrap()
590592
}
591593

594+
fn make_utf8_view_batch(value: Vec<Option<&str>>) -> RecordBatch {
595+
let utf8_view = StringViewArray::from(value);
596+
RecordBatch::try_from_iter(vec![("utf8_view", Arc::new(utf8_view) as _)]).unwrap()
597+
}
598+
599+
fn make_binary_view_batch(value: Vec<Option<&[u8]>>) -> RecordBatch {
600+
let binary_view = BinaryViewArray::from(value);
601+
RecordBatch::try_from_iter(vec![("binary_view", Arc::new(binary_view) as _)]).unwrap()
602+
}
603+
592604
fn make_dict_batch() -> RecordBatch {
593605
let values = [
594606
Some("abc"),
@@ -972,6 +984,35 @@ fn create_data_batch(scenario: Scenario) -> Vec<RecordBatch> {
972984
make_utf8_batch(vec![Some("e"), Some("f"), Some("g"), Some("h"), Some("i")]),
973985
]
974986
}
987+
Scenario::UTF8View => {
988+
// Make utf8_view batches including strings shorter and longer than 12 bytes
989+
// as the internal representation of StringView differs for strings
990+
// shorter and longer than that length
991+
vec![
992+
make_utf8_view_batch(vec![Some("a"), Some("b"), Some("c"), Some("d"), None]),
993+
make_utf8_view_batch(vec![Some("a"), Some("e_longerthan12"), None, None, None]),
994+
make_utf8_view_batch(vec![
995+
Some("e_longerthan12"),
996+
Some("f_longerthan12"),
997+
Some("g_longerthan12"),
998+
Some("h_longerthan12"),
999+
Some("i_longerthan12"),
1000+
]),
1001+
]
1002+
}
1003+
Scenario::BinaryView => {
1004+
vec![
1005+
make_binary_view_batch(vec![Some(b"a"), Some(b"b"), Some(b"c"), Some(b"d"), None]),
1006+
make_binary_view_batch(vec![Some(b"a"), Some(b"e_longerthan12"), None, None, None]),
1007+
make_binary_view_batch(vec![
1008+
Some(b"e_longerthan12"),
1009+
Some(b"f_longerthan12"),
1010+
Some(b"g_longerthan12"),
1011+
Some(b"h_longerthan12"),
1012+
Some(b"i_longerthan12"),
1013+
]),
1014+
]
1015+
}
9751016
}
9761017
}
9771018

parquet/tests/arrow_reader/statistics.rs

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@ use arrow::datatypes::{
2929
TimestampNanosecondType, TimestampSecondType,
3030
};
3131
use arrow_array::{
32-
make_array, new_null_array, Array, ArrayRef, BinaryArray, BooleanArray, Date32Array,
33-
Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array,
32+
make_array, new_null_array, Array, ArrayRef, BinaryArray, BinaryViewArray, BooleanArray,
33+
Date32Array, Date64Array, Decimal128Array, Decimal256Array, FixedSizeBinaryArray, Float16Array,
3434
Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, LargeBinaryArray,
35-
LargeStringArray, RecordBatch, StringArray, Time32MillisecondArray, Time32SecondArray,
36-
Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
35+
LargeStringArray, RecordBatch, StringArray, StringViewArray, Time32MillisecondArray,
36+
Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray,
3737
TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array,
3838
UInt32Array, UInt64Array, UInt8Array,
3939
};
@@ -2059,6 +2059,60 @@ async fn test_utf8() {
20592059
.run();
20602060
}
20612061

2062+
// UTF8View
2063+
#[tokio::test]
2064+
async fn test_utf8_view() {
2065+
let reader = TestReader {
2066+
scenario: Scenario::UTF8View,
2067+
row_per_group: 5,
2068+
}
2069+
.build()
2070+
.await;
2071+
2072+
// test for utf8_view
2073+
Test {
2074+
reader: &reader,
2075+
expected_min: Arc::new(StringViewArray::from(vec!["a", "a", "e_longerthan12"])),
2076+
expected_max: Arc::new(StringViewArray::from(vec![
2077+
"d",
2078+
"e_longerthan12",
2079+
"i_longerthan12",
2080+
])),
2081+
expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
2082+
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
2083+
column_name: "utf8_view",
2084+
check: Check::Both,
2085+
}
2086+
.run()
2087+
}
2088+
2089+
// BinaryView
2090+
#[tokio::test]
2091+
async fn test_binary_view() {
2092+
let reader = TestReader {
2093+
scenario: Scenario::BinaryView,
2094+
row_per_group: 5,
2095+
}
2096+
.build()
2097+
.await;
2098+
2099+
let expected_min: Vec<Option<&[u8]>> = vec![Some(b"a"), Some(b"a"), Some(b"e_longerthan12")];
2100+
let expected_max: Vec<Option<&[u8]>> =
2101+
vec![Some(b"d"), Some(b"e_longerthan12"), Some(b"i_longerthan12")];
2102+
2103+
// test for binary_view
2104+
Test {
2105+
reader: &reader,
2106+
expected_min: Arc::new(BinaryViewArray::from(expected_min)),
2107+
expected_max: Arc::new(BinaryViewArray::from(expected_max)),
2108+
expected_null_counts: UInt64Array::from(vec![1, 3, 0]),
2109+
expected_row_counts: Some(UInt64Array::from(vec![5, 5, 5])),
2110+
column_name: "binary_view",
2111+
check: Check::Both,
2112+
}
2113+
.run()
2114+
}
2115+
20622116
////// Files with missing statistics ///////
20632117

20642118
#[tokio::test]

0 commit comments

Comments
 (0)