@@ -648,12 +648,12 @@ def _read_deletes(fs: FileSystem, data_file: DataFile) -> Dict[str, pa.ChunkedArray]:
     }
 
 
-def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], rows: int) -> pa.Array:
+def _combine_positional_deletes(positional_deletes: List[pa.ChunkedArray], start_index: int, end_index: int) -> pa.Array:
     if len(positional_deletes) == 1:
         all_chunks = positional_deletes[0]
     else:
         all_chunks = pa.chunked_array(itertools.chain(*[arr.chunks for arr in positional_deletes]))
-    return np.setdiff1d(np.arange(rows), all_chunks, assume_unique=False)
+    return np.subtract(np.setdiff1d(np.arange(start_index, end_index), all_chunks, assume_unique=False), start_index)
 
 
 def pyarrow_to_schema(schema: pa.Schema, name_mapping: Optional[NameMapping] = None) -> Schema:
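The new signature makes the helper batch-aware: it computes the surviving global row positions inside [start_index, end_index) and then shifts them back by start_index, so the result can be passed straight to a RecordBatch take(). A minimal sketch of that arithmetic with illustrative values (not part of the diff):

    import numpy as np

    # Global delete positions {3, 5}; a batch covering global rows 4..7 (start_index=4, end_index=8).
    deletes = np.array([3, 5])
    kept_global = np.setdiff1d(np.arange(4, 8), deletes, assume_unique=False)  # -> [4, 6, 7]
    kept_in_batch = np.subtract(kept_global, 4)                                # -> [0, 2, 3]
    # batch.take(kept_in_batch) keeps rows 0, 2 and 3 of the batch, i.e. global rows 4, 6 and 7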
@@ -960,17 +960,16 @@ def _field_id(self, field: pa.Field) -> int:
         return -1
 
 
-def _task_to_table(
+def _task_to_record_batches(
     fs: FileSystem,
     task: FileScanTask,
     bound_row_filter: BooleanExpression,
     projected_schema: Schema,
     projected_field_ids: Set[int],
     positional_deletes: Optional[List[ChunkedArray]],
     case_sensitive: bool,
-    limit: Optional[int] = None,
     name_mapping: Optional[NameMapping] = None,
-) -> Optional[pa.Table]:
+) -> Iterator[pa.RecordBatch]:
     _, _, path = PyArrowFileIO.parse_location(task.file.file_path)
     arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))
     with fs.open_input_file(path) as fin:
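Changing the return type from Optional[pa.Table] to Iterator[pa.RecordBatch] turns the task reader into a generator, so the caller decides whether to stream the batches or materialize them. A caller that still wants the old table-or-None behaviour could wrap the generator roughly like this (hypothetical helper, not part of this diff):

    def _task_to_table_compat(*args, **kwargs):  # hypothetical name, same parameters as _task_to_record_batches
        # Materialize the streamed batches; return None when every row was deleted or filtered out,
        # mirroring the old Optional[pa.Table] contract.
        batches = list(_task_to_record_batches(*args, **kwargs))
        return pa.Table.from_batches(batches) if batches else None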
@@ -998,36 +997,27 @@ def _task_to_table(
             columns=[col.name for col in file_project_schema.columns],
         )
 
-        if positional_deletes:
-            # Create the mask of indices that we're interested in
-            indices = _combine_positional_deletes(positional_deletes, fragment.count_rows())
-
-            if limit:
-                if pyarrow_filter is not None:
-                    # In case of the filter, we don't exactly know how many rows
-                    # we need to fetch upfront, can be optimized in the future:
-                    # https://github.com/apache/arrow/issues/35301
-                    arrow_table = fragment_scanner.take(indices)
-                    arrow_table = arrow_table.filter(pyarrow_filter)
-                    arrow_table = arrow_table.slice(0, limit)
-                else:
-                    arrow_table = fragment_scanner.take(indices[0:limit])
-            else:
-                arrow_table = fragment_scanner.take(indices)
+        current_index = 0
+        batches = fragment_scanner.to_batches()
+        for batch in batches:
+            if positional_deletes:
+                # Create the mask of indices that we're interested in
+                indices = _combine_positional_deletes(positional_deletes, current_index, current_index + len(batch))
+
+                batch = batch.take(indices)
             # Apply the user filter
             if pyarrow_filter is not None:
+                # we need to switch back and forth between RecordBatch and Table
+                # as Expression filter isn't yet supported in RecordBatch
+                # https://github.com/apache/arrow/issues/39220
+                arrow_table = pa.Table.from_batches([batch])
                 arrow_table = arrow_table.filter(pyarrow_filter)
-        else:
-            # If there are no deletes, we can just take the head
-            # and the user-filter is already applied
-            if limit:
-                arrow_table = fragment_scanner.head(limit)
+                arrow_batches = arrow_table.to_batches()
+                for arrow_batch in arrow_batches:
+                    yield to_requested_schema(projected_schema, file_project_schema, arrow_batch)
             else:
-                arrow_table = fragment_scanner.to_table()
-
-    if len(arrow_table) < 1:
-        return None
-    return to_requested_schema(projected_schema, file_project_schema, arrow_table)
+                yield to_requested_schema(projected_schema, file_project_schema, batch)
+            current_index += len(batch)
 
 
 def _read_all_delete_files(fs: FileSystem, tasks: Iterable[FileScanTask]) -> Dict[str, List[ChunkedArray]]:
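Note that the limit handling disappears from this function along with the batching rewrite: the generator simply yields every surviving, projected batch, and any row limit has to be enforced by the consumer of the stream. A rough sketch of that consumer-side truncation (hypothetical helper, not part of the diff):

    def _limit_batches(batches, limit):  # hypothetical name
        remaining = limit
        for batch in batches:
            if remaining <= 0:
                return
            yield batch.slice(0, remaining)  # slice length is clamped to the batch length
            remaining -= min(remaining, len(batch))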
@@ -1140,7 +1130,7 @@ def project_table(
     return result
 
 
-def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table:
+def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.RecordBatch) -> pa.RecordBatch:
     struct_array = visit_with_partner(requested_schema, table, ArrowProjectionVisitor(file_schema), ArrowAccessor(file_schema))
 
     arrays = []
@@ -1149,7 +1139,7 @@ def to_requested_schema(requested_schema: Schema, file_schema: Schema, table: pa.Table) -> pa.Table:
         array = struct_array.field(pos)
         arrays.append(array)
         fields.append(pa.field(field.name, array.type, field.optional))
-    return pa.Table.from_arrays(arrays, schema=pa.schema(fields))
+    return pa.RecordBatch.from_arrays(arrays, schema=pa.schema(fields))
 
 
 class ArrowProjectionVisitor(SchemaWithPartnerVisitor[pa.Array, Optional[pa.Array]]):
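The projected output is now assembled with pa.RecordBatch.from_arrays instead of pa.Table.from_arrays; the call shape stays the same. A self-contained illustration with made-up columns (not taken from the diff):

    import pyarrow as pa

    arrays = [pa.array([1, 2, 3]), pa.array(["a", "b", "c"])]
    fields = [pa.field("id", pa.int64(), False), pa.field("name", pa.string(), True)]
    batch = pa.RecordBatch.from_arrays(arrays, schema=pa.schema(fields))  # a single RecordBatch with the given schema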