@@ -21,10 +21,8 @@ use arrow_array::cast::AsArray;
21
21
use arrow_array:: Array ;
22
22
use arrow_array:: { RecordBatch , RecordBatchReader } ;
23
23
use arrow_schema:: { ArrowError , DataType as ArrowType , Schema , SchemaRef } ;
24
- use arrow_select:: filter:: prep_null_mask_filter;
25
24
pub use filter:: { ArrowPredicate , ArrowPredicateFn , RowFilter } ;
26
25
pub use selection:: { RowSelection , RowSelector } ;
27
- use std:: collections:: VecDeque ;
28
26
use std:: sync:: Arc ;
29
27
30
28
pub use crate :: arrow:: array_reader:: RowGroups ;
@@ -39,7 +37,10 @@ use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
39
37
use crate :: file:: reader:: { ChunkReader , SerializedPageReader } ;
40
38
use crate :: schema:: types:: SchemaDescriptor ;
41
39
40
+ use read_plan:: { ReadPlan , ReadPlanBuilder } ;
41
+
42
42
mod filter;
43
+ pub ( crate ) mod read_plan;
43
44
mod selection;
44
45
pub mod statistics;
45
46
@@ -679,38 +680,33 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
679
680
} ;
680
681
681
682
let mut filter = self . filter ;
682
- let mut selection = self . selection ;
683
+ let mut plan_builder = ReadPlanBuilder :: new ( batch_size ) . with_selection ( self . selection ) ;
683
684
685
+ // Update selection based on any filters
684
686
if let Some ( filter) = filter. as_mut ( ) {
685
687
for predicate in filter. predicates . iter_mut ( ) {
686
- if !selects_any ( selection. as_ref ( ) ) {
688
+ // break early if we have already ruled out all rows
689
+ if !plan_builder. selects_any ( ) {
687
690
break ;
688
691
}
689
692
693
+ // TODO move this into the read_plan
690
694
let array_reader =
691
695
build_array_reader ( self . fields . as_deref ( ) , predicate. projection ( ) , & reader) ?;
692
696
693
- selection = Some ( evaluate_predicate (
694
- batch_size,
695
- array_reader,
696
- selection,
697
- predicate. as_mut ( ) ,
698
- ) ?) ;
697
+ plan_builder = plan_builder. with_predicate ( array_reader, predicate. as_mut ( ) ) ?;
699
698
}
700
699
}
701
700
702
701
let array_reader = build_array_reader ( self . fields . as_deref ( ) , & self . projection , & reader) ?;
702
+ let read_plan = plan_builder
703
+ . limited ( reader. num_rows ( ) )
704
+ . with_offset ( self . offset )
705
+ . with_limit ( self . limit )
706
+ . build_limited ( )
707
+ . build ( ) ;
703
708
704
- // If selection is empty, truncate
705
- if !selects_any ( selection. as_ref ( ) ) {
706
- selection = Some ( RowSelection :: from ( vec ! [ ] ) ) ;
707
- }
708
-
709
- Ok ( ParquetRecordBatchReader :: new (
710
- batch_size,
711
- array_reader,
712
- apply_range ( selection, reader. num_rows ( ) , self . offset , self . limit ) ,
713
- ) )
709
+ Ok ( ParquetRecordBatchReader :: new ( array_reader, read_plan) )
714
710
}
715
711
}
716
712
@@ -789,21 +785,20 @@ impl<T: ChunkReader + 'static> PageIterator for ReaderPageIterator<T> {}
789
785
/// An `Iterator<Item = ArrowResult<RecordBatch>>` that yields [`RecordBatch`]
790
786
/// read from a parquet data source
791
787
pub struct ParquetRecordBatchReader {
792
- batch_size : usize ,
793
788
array_reader : Box < dyn ArrayReader > ,
794
789
schema : SchemaRef ,
795
- /// Row ranges to be selected from the data source
796
- selection : Option < VecDeque < RowSelector > > ,
790
+ read_plan : ReadPlan ,
797
791
}
798
792
799
793
impl Iterator for ParquetRecordBatchReader {
800
794
type Item = Result < RecordBatch , ArrowError > ;
801
795
802
796
fn next ( & mut self ) -> Option < Self :: Item > {
803
797
let mut read_records = 0 ;
804
- match self . selection . as_mut ( ) {
798
+ let batch_size = self . batch_size ( ) ;
799
+ match self . read_plan . selection_mut ( ) {
805
800
Some ( selection) => {
806
- while read_records < self . batch_size && !selection. is_empty ( ) {
801
+ while read_records < batch_size && !selection. is_empty ( ) {
807
802
let front = selection. pop_front ( ) . unwrap ( ) ;
808
803
if front. skip {
809
804
let skipped = match self . array_reader . skip_records ( front. row_count ) {
@@ -829,7 +824,7 @@ impl Iterator for ParquetRecordBatchReader {
829
824
}
830
825
831
826
// try to read record
832
- let need_read = self . batch_size - read_records;
827
+ let need_read = batch_size - read_records;
833
828
let to_read = match front. row_count . checked_sub ( need_read) {
834
829
Some ( remaining) if remaining != 0 => {
835
830
// if page row count less than batch_size we must set batch size to page row count.
@@ -847,7 +842,7 @@ impl Iterator for ParquetRecordBatchReader {
847
842
}
848
843
}
849
844
None => {
850
- if let Err ( error) = self . array_reader . read_records ( self . batch_size ) {
845
+ if let Err ( error) = self . array_reader . read_records ( self . batch_size ( ) ) {
851
846
return Some ( Err ( error. into ( ) ) ) ;
852
847
}
853
848
}
@@ -904,116 +899,37 @@ impl ParquetRecordBatchReader {
904
899
let array_reader =
905
900
build_array_reader ( levels. levels . as_ref ( ) , & ProjectionMask :: all ( ) , row_groups) ?;
906
901
902
+ let read_plan = ReadPlanBuilder :: new ( batch_size)
903
+ . with_selection ( selection)
904
+ . build ( ) ;
905
+
907
906
Ok ( Self {
908
- batch_size,
909
907
array_reader,
910
908
schema : Arc :: new ( Schema :: new ( levels. fields . clone ( ) ) ) ,
911
- selection : selection . map ( |s| s . trim ( ) . into ( ) ) ,
909
+ read_plan ,
912
910
} )
913
911
}
914
912
915
913
/// Create a new [`ParquetRecordBatchReader`] that will read at most `batch_size` rows at
916
914
/// a time from [`ArrayReader`] based on the configured `selection`. If `selection` is `None`
917
915
/// all rows will be returned
918
- pub ( crate ) fn new (
919
- batch_size : usize ,
920
- array_reader : Box < dyn ArrayReader > ,
921
- selection : Option < RowSelection > ,
922
- ) -> Self {
916
+ pub ( crate ) fn new ( array_reader : Box < dyn ArrayReader > , read_plan : ReadPlan ) -> Self {
923
917
let schema = match array_reader. get_data_type ( ) {
924
918
ArrowType :: Struct ( ref fields) => Schema :: new ( fields. clone ( ) ) ,
925
919
_ => unreachable ! ( "Struct array reader's data type is not struct!" ) ,
926
920
} ;
927
921
928
922
Self {
929
- batch_size,
930
923
array_reader,
931
924
schema : Arc :: new ( schema) ,
932
- selection : selection . map ( |s| s . trim ( ) . into ( ) ) ,
925
+ read_plan ,
933
926
}
934
927
}
935
- }
936
928
937
- /// Returns `true` if `selection` is `None` or selects some rows
938
- pub ( crate ) fn selects_any ( selection : Option < & RowSelection > ) -> bool {
939
- selection. map ( |x| x. selects_any ( ) ) . unwrap_or ( true )
940
- }
941
-
942
- /// Applies an optional offset and limit to an optional [`RowSelection`]
943
- pub ( crate ) fn apply_range (
944
- mut selection : Option < RowSelection > ,
945
- row_count : usize ,
946
- offset : Option < usize > ,
947
- limit : Option < usize > ,
948
- ) -> Option < RowSelection > {
949
- // If an offset is defined, apply it to the `selection`
950
- if let Some ( offset) = offset {
951
- selection = Some ( match row_count. checked_sub ( offset) {
952
- None => RowSelection :: from ( vec ! [ ] ) ,
953
- Some ( remaining) => selection
954
- . map ( |selection| selection. offset ( offset) )
955
- . unwrap_or_else ( || {
956
- RowSelection :: from ( vec ! [
957
- RowSelector :: skip( offset) ,
958
- RowSelector :: select( remaining) ,
959
- ] )
960
- } ) ,
961
- } ) ;
962
- }
963
-
964
- // If a limit is defined, apply it to the final `selection`
965
- if let Some ( limit) = limit {
966
- selection = Some (
967
- selection
968
- . map ( |selection| selection. limit ( limit) )
969
- . unwrap_or_else ( || {
970
- RowSelection :: from ( vec ! [ RowSelector :: select( limit. min( row_count) ) ] )
971
- } ) ,
972
- ) ;
973
- }
974
- selection
975
- }
976
-
977
- /// Evaluates an [`ArrowPredicate`], returning a [`RowSelection`] indicating
978
- /// which rows to return.
979
- ///
980
- /// `input_selection`: Optional pre-existing selection. If `Some`, then the
981
- /// final [`RowSelection`] will be the conjunction of it and the rows selected
982
- /// by `predicate`.
983
- ///
984
- /// Note: A pre-existing selection may come from evaluating a previous predicate
985
- /// or if the [`ParquetRecordBatchReader`] specified an explicit
986
- /// [`RowSelection`] in addition to one or more predicates.
987
- pub ( crate ) fn evaluate_predicate (
988
- batch_size : usize ,
989
- array_reader : Box < dyn ArrayReader > ,
990
- input_selection : Option < RowSelection > ,
991
- predicate : & mut dyn ArrowPredicate ,
992
- ) -> Result < RowSelection > {
993
- let reader = ParquetRecordBatchReader :: new ( batch_size, array_reader, input_selection. clone ( ) ) ;
994
- let mut filters = vec ! [ ] ;
995
- for maybe_batch in reader {
996
- let maybe_batch = maybe_batch?;
997
- let input_rows = maybe_batch. num_rows ( ) ;
998
- let filter = predicate. evaluate ( maybe_batch) ?;
999
- // Since user supplied predicate, check error here to catch bugs quickly
1000
- if filter. len ( ) != input_rows {
1001
- return Err ( arrow_err ! (
1002
- "ArrowPredicate predicate returned {} rows, expected {input_rows}" ,
1003
- filter. len( )
1004
- ) ) ;
1005
- }
1006
- match filter. null_count ( ) {
1007
- 0 => filters. push ( filter) ,
1008
- _ => filters. push ( prep_null_mask_filter ( & filter) ) ,
1009
- } ;
929
+ #[ inline( always) ]
930
+ pub ( crate ) fn batch_size ( & self ) -> usize {
931
+ self . read_plan . batch_size ( )
1010
932
}
1011
-
1012
- let raw = RowSelection :: from_filters ( & filters) ;
1013
- Ok ( match input_selection {
1014
- Some ( selection) => selection. and_then ( & raw ) ,
1015
- None => raw,
1016
- } )
1017
933
}
1018
934
1019
935
#[ cfg( test) ]
@@ -3992,7 +3908,7 @@ mod tests {
3992
3908
. build ( )
3993
3909
. unwrap ( ) ;
3994
3910
assert_ne ! ( 1024 , num_rows) ;
3995
- assert_eq ! ( reader. batch_size, num_rows as usize ) ;
3911
+ assert_eq ! ( reader. read_plan . batch_size( ) , num_rows as usize ) ;
3996
3912
}
3997
3913
3998
3914
#[ test]
0 commit comments