@@ -876,42 +876,47 @@ fn spawn_parquet_parallel_serialization_task(
876
876
) ?;
877
877
let mut current_rg_rows = 0 ;
878
878
879
- while let Some ( rb) = data. recv ( ) . await {
880
- if current_rg_rows + rb. num_rows ( ) < max_row_group_rows {
881
- send_arrays_to_col_writers ( & col_array_channels, & rb, schema. clone ( ) )
882
- . await ?;
883
- current_rg_rows += rb. num_rows ( ) ;
884
- } else {
885
- let rows_left = max_row_group_rows - current_rg_rows;
886
- let a = rb. slice ( 0 , rows_left) ;
887
- send_arrays_to_col_writers ( & col_array_channels, & a, schema. clone ( ) )
888
- . await ?;
879
+ while let Some ( mut rb) = data. recv ( ) . await {
880
+ // This loop allows the "else" block to repeatedly split the RecordBatch to handle the case
881
+ // when max_row_group_rows < execution.batch_size as an alternative to a recursive async
882
+ // function.
883
+ loop {
884
+ if current_rg_rows + rb. num_rows ( ) < max_row_group_rows {
885
+ send_arrays_to_col_writers ( & col_array_channels, & rb, schema. clone ( ) )
886
+ . await ?;
887
+ current_rg_rows += rb. num_rows ( ) ;
888
+ break ;
889
+ } else {
890
+ let rows_left = max_row_group_rows - current_rg_rows;
891
+ let a = rb. slice ( 0 , rows_left) ;
892
+ send_arrays_to_col_writers ( & col_array_channels, & a, schema. clone ( ) )
893
+ . await ?;
894
+
895
+ // Signal the parallel column writers that the RowGroup is done, join and finalize RowGroup
896
+ // on a separate task, so that we can immediately start on the next RG before waiting
897
+ // for the current one to finish.
898
+ drop ( col_array_channels) ;
899
+ let finalize_rg_task = spawn_rg_join_and_finalize_task (
900
+ column_writer_handles,
901
+ max_row_group_rows,
902
+ ) ;
903
+
904
+ serialize_tx. send ( finalize_rg_task) . await . map_err ( |_| {
905
+ DataFusionError :: Internal (
906
+ "Unable to send closed RG to concat task!" . into ( ) ,
907
+ )
908
+ } ) ?;
889
909
890
- // Signal the parallel column writers that the RowGroup is done, join and finalize RowGroup
891
- // on a separate task, so that we can immediately start on the next RG before waiting
892
- // for the current one to finish.
893
- drop ( col_array_channels) ;
894
- let finalize_rg_task = spawn_rg_join_and_finalize_task (
895
- column_writer_handles,
896
- max_row_group_rows,
897
- ) ;
898
-
899
- serialize_tx. send ( finalize_rg_task) . await . map_err ( |_| {
900
- DataFusionError :: Internal (
901
- "Unable to send closed RG to concat task!" . into ( ) ,
902
- )
903
- } ) ?;
910
+ current_rg_rows = 0 ;
911
+ rb = rb. slice ( rows_left, rb. num_rows ( ) - rows_left) ;
904
912
905
- let b = rb. slice ( rows_left, rb. num_rows ( ) - rows_left) ;
906
- ( column_writer_handles, col_array_channels) =
907
- spawn_column_parallel_row_group_writer (
908
- schema. clone ( ) ,
909
- writer_props. clone ( ) ,
910
- max_buffer_rb,
911
- ) ?;
912
- send_arrays_to_col_writers ( & col_array_channels, & b, schema. clone ( ) )
913
- . await ?;
914
- current_rg_rows = b. num_rows ( ) ;
913
+ ( column_writer_handles, col_array_channels) =
914
+ spawn_column_parallel_row_group_writer (
915
+ schema. clone ( ) ,
916
+ writer_props. clone ( ) ,
917
+ max_buffer_rb,
918
+ ) ?;
919
+ }
915
920
}
916
921
}
917
922
0 commit comments