@@ -220,10 +220,8 @@ struct ExternalSorter {
     // STATE BUFFERS:
     // Fields that hold intermediate data during sorting
     // ========================================================================
-    /// Potentially unsorted in memory buffer
+    /// Unsorted input batches stored in the memory buffer
     in_mem_batches: Vec<RecordBatch>,
-    /// if `Self::in_mem_batches` are sorted
-    in_mem_batches_sorted: bool,
 
     /// During external sorting, in-memory intermediate data will be appended to
     /// this file incrementally. Once finished, this file will be moved to [`Self::finished_spill_files`].
@@ -304,7 +302,6 @@ impl ExternalSorter {
         Ok(Self {
             schema,
             in_mem_batches: vec![],
-            in_mem_batches_sorted: false,
             in_progress_spill_file: None,
             finished_spill_files: vec![],
             expr: expr.into(),
@@ -341,7 +338,6 @@ impl ExternalSorter {
         }
 
         self.in_mem_batches.push(input);
-        self.in_mem_batches_sorted = false;
         Ok(())
     }
 
@@ -418,16 +414,13 @@ impl ExternalSorter {
         self.metrics.spill_metrics.spill_file_count.value()
     }
 
-    /// When calling, all `in_mem_batches` must be sorted (*), and then all of them will
-    /// be appended to the in-progress spill file.
-    ///
-    /// (*) 'Sorted' here means globally sorted for all buffered batches when the
-    /// memory limit is reached, instead of partially sorted within the batch.
-    async fn spill_append(&mut self) -> Result<()> {
-        assert!(self.in_mem_batches_sorted);
-
-        // we could always get a chance to free some memory as long as we are holding some
-        if self.in_mem_batches.is_empty() {
+    /// Appends globally sorted batches to the in-progress spill file, and clears
+    /// `globally_sorted_batches` (and its memory reservation) afterwards.
+    async fn consume_and_spill_append(
+        &mut self,
+        globally_sorted_batches: &mut Vec<RecordBatch>,
+    ) -> Result<()> {
+        if globally_sorted_batches.is_empty() {
             return Ok(());
         }
 
@@ -437,21 +430,25 @@ impl ExternalSorter {
             Some(self.spill_manager.create_in_progress_file("Sorting")?);
         }
 
-        self.organize_stringview_arrays()?;
+        Self::organize_stringview_arrays(globally_sorted_batches)?;
 
         debug!("Spilling sort data of ExternalSorter to disk whilst inserting");
 
-        let batches = std::mem::take(&mut self.in_mem_batches);
+        let batches_to_spill = std::mem::take(globally_sorted_batches);
         self.reservation.free();
 
         let in_progress_file = self.in_progress_spill_file.as_mut().ok_or_else(|| {
             internal_datafusion_err!("In-progress spill file should be initialized")
         })?;
 
-        for batch in batches {
+        for batch in batches_to_spill {
             in_progress_file.append_batch(&batch)?;
         }
 
+        if !globally_sorted_batches.is_empty() {
+            return internal_err!("This function consumes globally_sorted_batches, so it should be empty after taking.");
+        }
+
         Ok(())
     }
 
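Aside, not part of the patch: a minimal standalone sketch of the consume-and-append contract used by `consume_and_spill_append` above. `std::mem::take` moves the buffered batches out and leaves the caller's `Vec` empty, which is what makes the defensive `is_empty()` check above meaningful. The `consume_and_append` helper and the `String` stand-in for `RecordBatch` are hypothetical.

```rust
// Illustrative sketch only; `consume_and_append` and the String/Vec stand-ins
// are hypothetical and mirror `consume_and_spill_append` and the spill file.
fn consume_and_append(buffer: &mut Vec<String>, spill_sink: &mut Vec<String>) {
    if buffer.is_empty() {
        return; // nothing buffered, nothing to spill
    }
    // Move everything out of `buffer`; it is left as an empty Vec afterwards.
    let batches_to_spill = std::mem::take(buffer);
    for batch in batches_to_spill {
        spill_sink.push(batch); // stands in for `in_progress_file.append_batch(&batch)?`
    }
    // Mirrors the defensive check in the patch: the input must be fully consumed.
    assert!(buffer.is_empty());
}

fn main() {
    let mut globally_sorted_batches = vec!["batch-1".to_string(), "batch-2".to_string()];
    let mut spill_file = Vec::new();
    consume_and_append(&mut globally_sorted_batches, &mut spill_file);
    assert!(globally_sorted_batches.is_empty());
    assert_eq!(spill_file.len(), 2);
}
```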
@@ -470,7 +467,7 @@ impl ExternalSorter {
         Ok(())
     }
 
-    /// Reconstruct `self.in_mem_batches` to organize the payload buffers of each
+    /// Reconstruct `globally_sorted_batches` to organize the payload buffers of each
     /// `StringViewArray` in sequential order by calling `gc()` on them.
     ///
     /// Note this is a workaround until <https://github.com/apache/arrow-rs/issues/7185> is
@@ -499,10 +496,12 @@ impl ExternalSorter {
     ///
     /// Then when spilling each batch, the writer has to write all referenced buffers
     /// repeatedly.
-    fn organize_stringview_arrays(&mut self) -> Result<()> {
-        let mut organized_batches = Vec::with_capacity(self.in_mem_batches.len());
+    fn organize_stringview_arrays(
+        globally_sorted_batches: &mut Vec<RecordBatch>,
+    ) -> Result<()> {
+        let mut organized_batches = Vec::with_capacity(globally_sorted_batches.len());
 
-        for batch in self.in_mem_batches.drain(..) {
+        for batch in globally_sorted_batches.drain(..) {
             let mut new_columns: Vec<Arc<dyn Array>> =
                 Vec::with_capacity(batch.num_columns());
 
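Aside, not part of the patch: what `gc()` does for a `StringViewArray`, as referenced in the doc comment above. A standalone sketch assuming the `arrow` crate is available; the example strings and the buffer-count printout are illustrative only.

```rust
// Standalone sketch (assumes the `arrow` crate): `gc()` copies only the bytes
// actually referenced by the views into fresh, compact payload buffers, so a
// spill writer no longer has to re-serialize unreferenced buffer data.
use arrow::array::{Array, StringViewArray};

fn main() {
    let array = StringViewArray::from_iter_values([
        "this string is long enough to be stored in a payload buffer",
        "so is this one; short strings are inlined into the 16-byte view",
    ]);
    // A slice still references the original (larger) payload buffers...
    let sliced = array.slice(0, 1);
    // ...while gc() rewrites it against buffers holding only the referenced bytes.
    let compacted = sliced.gc();
    assert_eq!(sliced.len(), compacted.len());
    println!(
        "payload buffers before gc: {}, after gc: {}",
        sliced.data_buffers().len(),
        compacted.data_buffers().len()
    );
}
```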
@@ -528,20 +527,17 @@ impl ExternalSorter {
             organized_batches.push(organized_batch);
         }
 
-        self.in_mem_batches = organized_batches;
+        *globally_sorted_batches = organized_batches;
 
         Ok(())
     }
 
-    /// Sorts the in_mem_batches in place
+    /// Sorts the in_mem_batches and potentially spills the sorted batches.
     ///
-    /// Sorting may have freed memory, especially if fetch is `Some`. If
-    /// the memory usage has dropped by a factor of 2, then we don't have
-    /// to spill. Otherwise, we spill to free up memory for inserting
-    /// more batches.
-    /// The factor of 2 aims to avoid a degenerate case where the
-    /// memory required for `fetch` is just under the memory available,
-    /// causing repeated re-sorting of data
+    /// If the memory usage has dropped by a factor of 2, it might be a sort with
+    /// fetch (e.g. sorting 1M rows but only keeping the top 100), so we keep the
+    /// sorted entries inside `in_mem_batches` to be sorted in the next iteration.
+    /// Otherwise, we spill the sorted run to free up memory for inserting more batches.
     ///
     /// # Arguments
     ///
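Aside, not part of the patch: a toy numeric illustration of the factor-of-2 heuristic described in the new doc comment above, matching the `self.reservation.size() > before / 2` check further down in this diff. The function name and byte counts are made up.

```rust
// Toy illustration (names and numbers are made up): the spill decision keeps
// data in memory only when sorting released more than half of the reservation,
// which is the typical signature of a sort with a small `fetch` (top-K).
fn should_spill(before_bytes: usize, after_bytes: usize, force_spill: bool) -> bool {
    (after_bytes > before_bytes / 2) || force_spill
}

fn main() {
    // Sort with fetch(100): ~1M buffered rows shrink to the top 100 after sorting,
    // so the sorted result stays buffered in `in_mem_batches` for the next merge.
    assert!(!should_spill(100_000_000, 4_000, false));
    // Plain sort: output is about as large as the input, so spill it to disk.
    assert!(should_spill(100_000_000, 99_000_000, false));
}
```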
@@ -560,10 +556,18 @@ impl ExternalSorter {
 
         let mut sorted_stream =
             self.in_mem_sort_stream(self.metrics.baseline.intermediate())?;
+        // After `in_mem_sort_stream()` is constructed, all `in_mem_batches` are taken
+        // to construct a globally sorted stream.
+        if !self.in_mem_batches.is_empty() {
+            return internal_err!(
+                "in_mem_batches should be empty after constructing sorted stream"
+            );
+        }
+        // 'Global' here refers to all buffered batches when the memory limit is
+        // reached. This variable will buffer the sorted batches after the
+        // sort-preserving merge and incrementally append them to spill files.
+        let mut globally_sorted_batches: Vec<RecordBatch> = vec![];
 
-        // `self.in_mem_batches` is already taken away by the sort_stream, now it is empty.
-        // We'll gradually collect the sorted stream into self.in_mem_batches, or directly
-        // write sorted batches to disk when the memory is insufficient.
         let mut spilled = false;
         while let Some(batch) = sorted_stream.next().await {
             let batch = batch?;
@@ -572,12 +576,12 @@ impl ExternalSorter {
                 // Although the reservation is not enough, the batch is
                 // already in memory, so it's okay to combine it with previously
                 // sorted batches, and spill together.
-                self.in_mem_batches.push(batch);
-                self.spill_append().await?; // reservation is freed in spill()
+                globally_sorted_batches.push(batch);
+                self.consume_and_spill_append(&mut globally_sorted_batches)
+                    .await?; // reservation is freed in spill()
                 spilled = true;
             } else {
-                self.in_mem_batches.push(batch);
-                self.in_mem_batches_sorted = true;
+                globally_sorted_batches.push(batch);
             }
         }
 
@@ -591,12 +595,27 @@ impl ExternalSorter {
         if (self.reservation.size() > before / 2) || force_spill {
             // We have not freed more than 50% of the memory, so we have to spill to
             // free up more memory
-            self.spill_append().await?;
+            self.consume_and_spill_append(&mut globally_sorted_batches)
+                .await?;
             spilled = true;
         }
 
         if spilled {
+            // There might be some buffered batches that haven't triggered a spill yet.
+            self.consume_and_spill_append(&mut globally_sorted_batches)
+                .await?;
             self.spill_finish().await?;
+        } else {
+            // If the memory limit was reached before calling this function and it
+            // didn't spill anything, it means this is a sort with a fetch / top-K
+            // limit: after sorting, only the top K elements will be kept in memory.
+            // For simplicity, those sorted top-K entries are put back into the unsorted
+            // `in_mem_batches` to be consumed by the next sort/merge.
+            if !self.in_mem_batches.is_empty() {
+                return internal_err!("in_mem_batches should be cleared before");
+            }
+
+            self.in_mem_batches = std::mem::take(&mut globally_sorted_batches);
         }
 
         // Reserve headroom for next sort/merge
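Aside, not part of the patch: an end-to-end sketch of the control flow after this change, using stand-in types. `Sorter`, `sort_and_maybe_spill`, and the `Vec<i32>` batches are hypothetical; they mirror the roles of `in_mem_sort_stream`, `consume_and_spill_append` / `spill_finish`, and the top-K put-back into `in_mem_batches` shown above.

```rust
// Hypothetical stand-ins, not DataFusion types: sketches the new flow of
// "merge everything into a globally sorted run, then either spill it or keep it".
struct Sorter {
    in_mem_batches: Vec<Vec<i32>>, // stand-in for Vec<RecordBatch>
    spilled_runs: Vec<Vec<i32>>,   // stand-in for finished spill files
}

impl Sorter {
    fn sort_and_maybe_spill(&mut self, memory_pressure: bool) {
        // Merge all buffered batches into one globally sorted run
        // (mirrors draining `in_mem_batches` into the sort-preserving merge stream).
        let mut merged: Vec<i32> = self.in_mem_batches.drain(..).flatten().collect();
        merged.sort_unstable();
        let mut globally_sorted_batches = vec![merged];

        if memory_pressure {
            // Spill path: consume the sorted run and persist it.
            self.spilled_runs.append(&mut globally_sorted_batches);
        } else {
            // Top-K / fetch path: keep the (now much smaller) sorted data buffered,
            // to be merged again with future input.
            self.in_mem_batches = std::mem::take(&mut globally_sorted_batches);
        }
    }
}

fn main() {
    let mut sorter = Sorter { in_mem_batches: vec![vec![3, 1], vec![2]], spilled_runs: vec![] };
    sorter.sort_and_maybe_spill(false);
    assert_eq!(sorter.in_mem_batches, vec![vec![1, 2, 3]]);
}
```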