@@ -26,7 +26,8 @@ use crate::file::page_index::index::{Index, PageIndex};
26
26
use crate :: file:: statistics:: Statistics as ParquetStatistics ;
27
27
use crate :: schema:: types:: SchemaDescriptor ;
28
28
use arrow_array:: builder:: {
29
- BooleanBuilder , FixedSizeBinaryBuilder , LargeStringBuilder , StringBuilder ,
29
+ BinaryViewBuilder , BooleanBuilder , FixedSizeBinaryBuilder , LargeStringBuilder , StringBuilder ,
30
+ StringViewBuilder ,
30
31
} ;
31
32
use arrow_array:: {
32
33
new_empty_array, new_null_array, ArrayRef , BinaryArray , BooleanArray , Date32Array , Date64Array ,
@@ -446,14 +447,43 @@ macro_rules! get_statistics {
446
447
} ,
447
448
DataType :: Dictionary ( _, value_type) => {
448
449
[ <$stat_type_prefix: lower _ statistics>] ( value_type, $iterator)
450
+ } ,
451
+ DataType :: Utf8View => {
452
+ let iterator = [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) ;
453
+ let mut builder = StringViewBuilder :: new( ) ;
454
+ for x in iterator {
455
+ let Some ( x) = x else {
456
+ builder. append_null( ) ; // no statistics value
457
+ continue ;
458
+ } ;
459
+
460
+ let Ok ( x) = std:: str :: from_utf8( x) else {
461
+ builder. append_null( ) ;
462
+ continue ;
463
+ } ;
464
+
465
+ builder. append_value( x) ;
466
+ }
467
+ Ok ( Arc :: new( builder. finish( ) ) )
468
+ } ,
469
+ DataType :: BinaryView => {
470
+ let iterator = [ <$stat_type_prefix ByteArrayStatsIterator >] :: new( $iterator) ;
471
+ let mut builder = BinaryViewBuilder :: new( ) ;
472
+ for x in iterator {
473
+ let Some ( x) = x else {
474
+ builder. append_null( ) ; // no statistics value
475
+ continue ;
476
+ } ;
477
+
478
+ builder. append_value( x) ;
479
+ }
480
+ Ok ( Arc :: new( builder. finish( ) ) )
449
481
}
450
482
451
483
DataType :: Map ( _, _) |
452
484
DataType :: Duration ( _) |
453
485
DataType :: Interval ( _) |
454
486
DataType :: Null |
455
- DataType :: BinaryView |
456
- DataType :: Utf8View |
457
487
DataType :: List ( _) |
458
488
DataType :: ListView ( _) |
459
489
DataType :: FixedSizeList ( _, _) |
@@ -919,7 +949,7 @@ macro_rules! get_data_page_statistics {
919
949
}
920
950
} )
921
951
} ,
922
- Some ( DataType :: FixedSizeBinary ( size) ) => {
952
+ Some ( DataType :: FixedSizeBinary ( size) ) => {
923
953
let mut builder = FixedSizeBinaryBuilder :: new( * size) ;
924
954
let iterator = [ <$stat_type_prefix FixedLenByteArrayDataPageStatsIterator >] :: new( $iterator) ;
925
955
for x in iterator {
@@ -943,7 +973,58 @@ macro_rules! get_data_page_statistics {
943
973
}
944
974
Ok ( Arc :: new( builder. finish( ) ) )
945
975
} ,
946
- _ => unimplemented!( )
976
+ Some ( DataType :: Utf8View ) => {
977
+ let mut builder = StringViewBuilder :: new( ) ;
978
+ let iterator = [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) ;
979
+ for x in iterator {
980
+ for x in x. into_iter( ) {
981
+ let Some ( x) = x else {
982
+ builder. append_null( ) ; // no statistics value
983
+ continue ;
984
+ } ;
985
+
986
+ let Ok ( x) = std:: str :: from_utf8( x. data( ) ) else {
987
+ builder. append_null( ) ;
988
+ continue ;
989
+ } ;
990
+
991
+ builder. append_value( x) ;
992
+ }
993
+ }
994
+ Ok ( Arc :: new( builder. finish( ) ) )
995
+ } ,
996
+ Some ( DataType :: BinaryView ) => {
997
+ let mut builder = BinaryViewBuilder :: new( ) ;
998
+ let iterator = [ <$stat_type_prefix ByteArrayDataPageStatsIterator >] :: new( $iterator) ;
999
+ for x in iterator {
1000
+ for x in x. into_iter( ) {
1001
+ let Some ( x) = x else {
1002
+ builder. append_null( ) ; // no statistics value
1003
+ continue ;
1004
+ } ;
1005
+
1006
+ builder. append_value( x) ;
1007
+ }
1008
+ }
1009
+ Ok ( Arc :: new( builder. finish( ) ) )
1010
+ } ,
1011
+ Some ( DataType :: Null ) |
1012
+ Some ( DataType :: Duration ( _) ) |
1013
+ Some ( DataType :: Interval ( _) ) |
1014
+ Some ( DataType :: List ( _) ) |
1015
+ Some ( DataType :: ListView ( _) ) |
1016
+ Some ( DataType :: FixedSizeList ( _, _) ) |
1017
+ Some ( DataType :: LargeList ( _) ) |
1018
+ Some ( DataType :: LargeListView ( _) ) |
1019
+ Some ( DataType :: Struct ( _) ) |
1020
+ Some ( DataType :: Union ( _, _) ) |
1021
+ Some ( DataType :: Map ( _, _) ) |
1022
+ Some ( DataType :: RunEndEncoded ( _, _) ) => {
1023
+ let len = $iterator. count( ) ;
1024
+ // don't know how to extract statistics, so return a null array
1025
+ Ok ( new_null_array( $data_type. unwrap( ) , len) )
1026
+ } ,
1027
+ None => unimplemented!( ) // not sure how to handle this
947
1028
}
948
1029
}
949
1030
}
@@ -1499,10 +1580,10 @@ mod test {
1499
1580
use arrow:: datatypes:: { i256, Date32Type , Date64Type } ;
1500
1581
use arrow:: util:: test_util:: parquet_test_data;
1501
1582
use arrow_array:: {
1502
- new_empty_array, new_null_array, Array , ArrayRef , BinaryArray , BooleanArray , Date32Array ,
1503
- Date64Array , Decimal128Array , Decimal256Array , Float32Array , Float64Array , Int16Array ,
1504
- Int32Array , Int64Array , Int8Array , LargeBinaryArray , RecordBatch , StringArray , StructArray ,
1505
- TimestampNanosecondArray ,
1583
+ new_empty_array, new_null_array, Array , ArrayRef , BinaryArray , BinaryViewArray ,
1584
+ BooleanArray , Date32Array , Date64Array , Decimal128Array , Decimal256Array , Float32Array ,
1585
+ Float64Array , Int16Array , Int32Array , Int64Array , Int8Array , LargeBinaryArray , RecordBatch ,
1586
+ StringArray , StringViewArray , StructArray , TimestampNanosecondArray ,
1506
1587
} ;
1507
1588
use arrow_schema:: { DataType , Field , SchemaRef } ;
1508
1589
use bytes:: Bytes ;
@@ -1916,6 +1997,65 @@ mod test {
1916
1997
. run ( )
1917
1998
}
1918
1999
2000
/// Runs the `Test` harness over a string-view column and checks the
/// expected minimum and maximum for each of the three row groups.
///
/// The input mixes short values, nulls, and a value whose literal
/// (`"A_longerthan12"`) is longer than 12 bytes. The third row group holds a
/// single non-null value, so its expected min and max are the same string.
#[test]
fn roundtrip_string_view() {
    let input = string_view_array([
        // row group 1
        Some("A"),
        None,
        Some("Q"),
        // row group 2
        Some("ZZ"),
        Some("A_longerthan12"),
        None,
        // row group 3
        Some("A_longerthan12"),
        None,
        None,
    ]);

    // One expected entry per row group, in row-group order.
    let expected_min = string_view_array([
        Some("A"),
        Some("A_longerthan12"),
        Some("A_longerthan12"),
    ]);
    let expected_max = string_view_array([Some("Q"), Some("ZZ"), Some("A_longerthan12")]);

    Test {
        input,
        expected_min,
        expected_max,
    }
    .run()
}
2026
+
2027
/// Runs the `Test` harness over a binary-view column and checks the
/// expected minimum and maximum for each of the three row groups.
///
/// Uses the same values as the string-view round trip, expressed as byte
/// strings. The explicit `Vec<Option<&[u8]>>` annotations let byte-string
/// literals of different lengths share one element type.
#[test]
fn roundtrip_binary_view() {
    let rows: Vec<Option<&[u8]>> = vec![
        // row group 1
        Some(b"A"),
        None,
        Some(b"Q"),
        // row group 2
        Some(b"ZZ"),
        Some(b"A_longerthan12"),
        None,
        // row group 3
        Some(b"A_longerthan12"),
        None,
        None,
    ];

    // One expected entry per row group, in row-group order.
    let mins: Vec<Option<&[u8]>> =
        vec![Some(b"A"), Some(b"A_longerthan12"), Some(b"A_longerthan12")];
    let maxes: Vec<Option<&[u8]>> = vec![Some(b"Q"), Some(b"ZZ"), Some(b"A_longerthan12")];

    let input = binary_view_array(rows);
    let expected_min = binary_view_array(mins);
    let expected_max = binary_view_array(maxes);

    Test {
        input,
        expected_min,
        expected_max,
    }
    .run()
}
2058
+
1919
2059
#[ test]
1920
2060
fn roundtrip_struct ( ) {
1921
2061
let mut test = Test {
@@ -2539,4 +2679,19 @@ mod test {
2539
2679
2540
2680
Arc :: new ( array)
2541
2681
}
2682
+
2683
+ fn string_view_array < ' a > ( input : impl IntoIterator < Item = Option < & ' a str > > ) -> ArrayRef {
2684
+ let array: StringViewArray = input
2685
+ . into_iter ( )
2686
+ . map ( |s| s. map ( |s| s. to_string ( ) ) )
2687
+ . collect ( ) ;
2688
+
2689
+ Arc :: new ( array)
2690
+ }
2691
+
2692
+ fn binary_view_array ( input : Vec < Option < & [ u8 ] > > ) -> ArrayRef {
2693
+ let array = BinaryViewArray :: from ( input. into_iter ( ) . collect :: < Vec < Option < & [ u8 ] > > > ( ) ) ;
2694
+
2695
+ Arc :: new ( array)
2696
+ }
2542
2697
}
0 commit comments