@@ -21,10 +21,8 @@ use arrow_array::cast::AsArray;
21
21
use arrow_array:: Array ;
22
22
use arrow_array:: { RecordBatch , RecordBatchReader } ;
23
23
use arrow_schema:: { ArrowError , DataType as ArrowType , Schema , SchemaRef } ;
24
- use arrow_select:: filter:: prep_null_mask_filter;
25
24
pub use filter:: { ArrowPredicate , ArrowPredicateFn , RowFilter } ;
26
25
pub use selection:: { RowSelection , RowSelector } ;
27
- use std:: collections:: VecDeque ;
28
26
use std:: sync:: Arc ;
29
27
30
28
pub use crate :: arrow:: array_reader:: RowGroups ;
@@ -39,7 +37,10 @@ use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
39
37
use crate :: file:: reader:: { ChunkReader , SerializedPageReader } ;
40
38
use crate :: schema:: types:: SchemaDescriptor ;
41
39
40
+ use read_plan:: { ReadPlan , ReadPlanBuilder } ;
41
+
42
42
mod filter;
43
+ pub ( crate ) mod read_plan;
43
44
mod selection;
44
45
pub mod statistics;
45
46
@@ -679,38 +680,33 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
679
680
} ;
680
681
681
682
let mut filter = self . filter ;
682
- let mut selection = self . selection ;
683
+ let mut plan_builder = ReadPlanBuilder :: new ( batch_size ) . with_selection ( self . selection ) ;
683
684
685
+ // Update selection based on any filters
684
686
if let Some ( filter) = filter. as_mut ( ) {
685
687
for predicate in filter. predicates . iter_mut ( ) {
686
- if !selects_any ( selection. as_ref ( ) ) {
688
+ // break early if we have already ruled out all rows
689
+ if !plan_builder. selects_any ( ) {
687
690
break ;
688
691
}
689
692
693
+ // TODO move this into the read_plan
690
694
let array_reader =
691
695
build_array_reader ( self . fields . as_deref ( ) , predicate. projection ( ) , & reader) ?;
692
696
693
- selection = Some ( evaluate_predicate (
694
- batch_size,
695
- array_reader,
696
- selection,
697
- predicate. as_mut ( ) ,
698
- ) ?) ;
697
+ plan_builder = plan_builder. with_predicate ( array_reader, predicate. as_mut ( ) ) ?;
699
698
}
700
699
}
701
700
702
701
let array_reader = build_array_reader ( self . fields . as_deref ( ) , & self . projection , & reader) ?;
702
+ let read_plan = plan_builder
703
+ . limited ( reader. num_rows ( ) )
704
+ . with_offset ( self . offset )
705
+ . with_limit ( self . limit )
706
+ . build_limited ( )
707
+ . build ( ) ;
703
708
704
- // If selection is empty, truncate
705
- if !selects_any ( selection. as_ref ( ) ) {
706
- selection = Some ( RowSelection :: from ( vec ! [ ] ) ) ;
707
- }
708
-
709
- Ok ( ParquetRecordBatchReader :: new (
710
- batch_size,
711
- array_reader,
712
- apply_range ( selection, reader. num_rows ( ) , self . offset , self . limit ) ,
713
- ) )
709
+ Ok ( ParquetRecordBatchReader :: new ( array_reader, read_plan) )
714
710
}
715
711
}
716
712
@@ -789,20 +785,20 @@ impl<T: ChunkReader + 'static> PageIterator for ReaderPageIterator<T> {}
789
785
/// An `Iterator<Item = ArrowResult<RecordBatch>>` that yields [`RecordBatch`]
790
786
/// read from a parquet data source
791
787
pub struct ParquetRecordBatchReader {
792
- batch_size : usize ,
793
788
array_reader : Box < dyn ArrayReader > ,
794
789
schema : SchemaRef ,
795
- selection : Option < VecDeque < RowSelector > > ,
790
+ read_plan : ReadPlan ,
796
791
}
797
792
798
793
impl Iterator for ParquetRecordBatchReader {
799
794
type Item = Result < RecordBatch , ArrowError > ;
800
795
801
796
fn next ( & mut self ) -> Option < Self :: Item > {
802
797
let mut read_records = 0 ;
803
- match self . selection . as_mut ( ) {
798
+ let batch_size = self . batch_size ( ) ;
799
+ match self . read_plan . selection_mut ( ) {
804
800
Some ( selection) => {
805
- while read_records < self . batch_size && !selection. is_empty ( ) {
801
+ while read_records < batch_size && !selection. is_empty ( ) {
806
802
let front = selection. pop_front ( ) . unwrap ( ) ;
807
803
if front. skip {
808
804
let skipped = match self . array_reader . skip_records ( front. row_count ) {
@@ -828,7 +824,7 @@ impl Iterator for ParquetRecordBatchReader {
828
824
}
829
825
830
826
// try to read record
831
- let need_read = self . batch_size - read_records;
827
+ let need_read = batch_size - read_records;
832
828
let to_read = match front. row_count . checked_sub ( need_read) {
833
829
Some ( remaining) if remaining != 0 => {
834
830
// if page row count less than batch_size we must set batch size to page row count.
@@ -846,7 +842,7 @@ impl Iterator for ParquetRecordBatchReader {
846
842
}
847
843
}
848
844
None => {
849
- if let Err ( error) = self . array_reader . read_records ( self . batch_size ) {
845
+ if let Err ( error) = self . array_reader . read_records ( self . batch_size ( ) ) {
850
846
return Some ( Err ( error. into ( ) ) ) ;
851
847
}
852
848
}
@@ -903,116 +899,37 @@ impl ParquetRecordBatchReader {
903
899
let array_reader =
904
900
build_array_reader ( levels. levels . as_ref ( ) , & ProjectionMask :: all ( ) , row_groups) ?;
905
901
902
+ let read_plan = ReadPlanBuilder :: new ( batch_size)
903
+ . with_selection ( selection)
904
+ . build ( ) ;
905
+
906
906
Ok ( Self {
907
- batch_size,
908
907
array_reader,
909
908
schema : Arc :: new ( Schema :: new ( levels. fields . clone ( ) ) ) ,
910
- selection : selection . map ( |s| s . trim ( ) . into ( ) ) ,
909
+ read_plan ,
911
910
} )
912
911
}
913
912
914
913
/// Create a new [`ParquetRecordBatchReader`] that will read at most `batch_size` rows at
915
914
/// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None`
916
915
/// all rows will be returned
917
- pub ( crate ) fn new (
918
- batch_size : usize ,
919
- array_reader : Box < dyn ArrayReader > ,
920
- selection : Option < RowSelection > ,
921
- ) -> Self {
916
+ pub ( crate ) fn new ( array_reader : Box < dyn ArrayReader > , read_plan : ReadPlan ) -> Self {
922
917
let schema = match array_reader. get_data_type ( ) {
923
918
ArrowType :: Struct ( ref fields) => Schema :: new ( fields. clone ( ) ) ,
924
919
_ => unreachable ! ( "Struct array reader's data type is not struct!" ) ,
925
920
} ;
926
921
927
922
Self {
928
- batch_size,
929
923
array_reader,
930
924
schema : Arc :: new ( schema) ,
931
- selection : selection . map ( |s| s . trim ( ) . into ( ) ) ,
925
+ read_plan ,
932
926
}
933
927
}
934
- }
935
928
936
- /// Returns `true` if `selection` is `None` or selects some rows
937
- pub ( crate ) fn selects_any ( selection : Option < & RowSelection > ) -> bool {
938
- selection. map ( |x| x. selects_any ( ) ) . unwrap_or ( true )
939
- }
940
-
941
- /// Applies an optional offset and limit to an optional [`RowSelection`]
942
- pub ( crate ) fn apply_range (
943
- mut selection : Option < RowSelection > ,
944
- row_count : usize ,
945
- offset : Option < usize > ,
946
- limit : Option < usize > ,
947
- ) -> Option < RowSelection > {
948
- // If an offset is defined, apply it to the `selection`
949
- if let Some ( offset) = offset {
950
- selection = Some ( match row_count. checked_sub ( offset) {
951
- None => RowSelection :: from ( vec ! [ ] ) ,
952
- Some ( remaining) => selection
953
- . map ( |selection| selection. offset ( offset) )
954
- . unwrap_or_else ( || {
955
- RowSelection :: from ( vec ! [
956
- RowSelector :: skip( offset) ,
957
- RowSelector :: select( remaining) ,
958
- ] )
959
- } ) ,
960
- } ) ;
961
- }
962
-
963
- // If a limit is defined, apply it to the final `selection`
964
- if let Some ( limit) = limit {
965
- selection = Some (
966
- selection
967
- . map ( |selection| selection. limit ( limit) )
968
- . unwrap_or_else ( || {
969
- RowSelection :: from ( vec ! [ RowSelector :: select( limit. min( row_count) ) ] )
970
- } ) ,
971
- ) ;
972
- }
973
- selection
974
- }
975
-
976
- /// Evaluates an [`ArrowPredicate`], returning a [`RowSelection`] indicating
977
- /// which rows to return.
978
- ///
979
- /// `input_selection`: Optional pre-existing selection. If `Some`, then the
980
- /// final [`RowSelection`] will be the conjunction of it and the rows selected
981
- /// by `predicate`.
982
- ///
983
- /// Note: A pre-existing selection may come from evaluating a previous predicate
984
- /// or if the [`ParquetRecordBatchReader`] specified an explicit
985
- /// [`RowSelection`] in addition to one or more predicates.
986
- pub ( crate ) fn evaluate_predicate (
987
- batch_size : usize ,
988
- array_reader : Box < dyn ArrayReader > ,
989
- input_selection : Option < RowSelection > ,
990
- predicate : & mut dyn ArrowPredicate ,
991
- ) -> Result < RowSelection > {
992
- let reader = ParquetRecordBatchReader :: new ( batch_size, array_reader, input_selection. clone ( ) ) ;
993
- let mut filters = vec ! [ ] ;
994
- for maybe_batch in reader {
995
- let maybe_batch = maybe_batch?;
996
- let input_rows = maybe_batch. num_rows ( ) ;
997
- let filter = predicate. evaluate ( maybe_batch) ?;
998
- // Since user supplied predicate, check error here to catch bugs quickly
999
- if filter. len ( ) != input_rows {
1000
- return Err ( arrow_err ! (
1001
- "ArrowPredicate predicate returned {} rows, expected {input_rows}" ,
1002
- filter. len( )
1003
- ) ) ;
1004
- }
1005
- match filter. null_count ( ) {
1006
- 0 => filters. push ( filter) ,
1007
- _ => filters. push ( prep_null_mask_filter ( & filter) ) ,
1008
- } ;
929
+ #[ inline( always) ]
930
+ pub ( crate ) fn batch_size ( & self ) -> usize {
931
+ self . read_plan . batch_size ( )
1009
932
}
1010
-
1011
- let raw = RowSelection :: from_filters ( & filters) ;
1012
- Ok ( match input_selection {
1013
- Some ( selection) => selection. and_then ( & raw ) ,
1014
- None => raw,
1015
- } )
1016
933
}
1017
934
1018
935
#[ cfg( test) ]
@@ -3991,7 +3908,7 @@ mod tests {
3991
3908
. build ( )
3992
3909
. unwrap ( ) ;
3993
3910
assert_ne ! ( 1024 , num_rows) ;
3994
- assert_eq ! ( reader. batch_size, num_rows as usize ) ;
3911
+ assert_eq ! ( reader. read_plan . batch_size( ) , num_rows as usize ) ;
3995
3912
}
3996
3913
3997
3914
#[ test]
0 commit comments