@@ -24,7 +24,7 @@ use std::fmt;
 use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
-use crate::common::{spawn_buffered, IPCWriter};
+use crate::common::spawn_buffered;
 use crate::execution_plan::{Boundedness, CardinalityEffect, EmissionType};
 use crate::expressions::PhysicalSortExpr;
 use crate::limit::LimitStream;
@@ -44,7 +44,9 @@ use crate::{
     Statistics,
 };
 
-use arrow::array::{Array, RecordBatch, RecordBatchOptions, UInt32Array};
+use arrow::array::{
+    Array, RecordBatch, RecordBatchOptions, StringViewArray, UInt32Array,
+};
 use arrow::compute::{concat_batches, lexsort_to_indices, take_arrays, SortColumn};
 use arrow::datatypes::{DataType, SchemaRef};
 use arrow::row::{RowConverter, SortField};
@@ -300,6 +302,7 @@ impl ExternalSorter {
         if input.num_rows() == 0 {
             return Ok(());
         }
+
         self.reserve_memory_for_merge()?;
 
         let size = get_reserved_byte_for_record_batch(&input);
@@ -397,6 +400,8 @@ impl ExternalSorter {
             return Ok(0);
         }
 
+        self.organize_stringview_arrays()?;
+
         debug!("Spilling sort data of ExternalSorter to disk whilst inserting");
 
         let spill_file = self.runtime.disk_manager.create_tmp_file("Sorting")?;
@@ -414,6 +419,69 @@ impl ExternalSorter {
         Ok(used)
     }
 
+    /// Reconstruct `self.in_mem_batches` to organize the payload buffers of each
+    /// `StringViewArray` in sequential order by calling `gc()` on them.
+    ///
+    /// Note this is a workaround until <https://github.com/apache/arrow-rs/issues/7185> is
+    /// available.
+    ///
+    /// # Rationale
+    /// After (merge-based) sorting, all batches are sorted into a single run,
+    /// but physically this sorted run is chunked into many small batches. For
+    /// `StringViewArray`s inside each sorted run, their inner buffers are not
+    /// reconstructed by default, leading to non-sequential payload locations
+    /// (permuted by the `interleave()` Arrow kernel). A single payload buffer might
+    /// be shared by multiple `RecordBatch`es.
+    /// When writing each batch to disk, the writer has to write all referenced buffers,
+    /// because the batches have to be read back one by one to reduce memory usage. This
+    /// causes extra disk reads and writes, and potentially execution failure.
+    ///
+    /// # Example
+    /// Before sorting:
+    /// batch1 -> buffer1
+    /// batch2 -> buffer2
+    /// After sorting:
+    /// sorted_batch1 -> buffer1
+    ///               -> buffer2
+    /// sorted_batch2 -> buffer1
+    ///               -> buffer2
+    ///
+    /// Then, when spilling each batch, the writer has to write all referenced buffers
+    /// repeatedly.
+    fn organize_stringview_arrays(&mut self) -> Result<()> {
+        let mut organized_batches = Vec::with_capacity(self.in_mem_batches.len());
+
+        for batch in self.in_mem_batches.drain(..) {
+            let mut new_columns: Vec<Arc<dyn Array>> =
+                Vec::with_capacity(batch.num_columns());
+
+            let mut arr_mutated = false;
+            for array in batch.columns() {
+                if let Some(string_view_array) =
+                    array.as_any().downcast_ref::<StringViewArray>()
+                {
+                    let new_array = string_view_array.gc();
+                    new_columns.push(Arc::new(new_array));
+                    arr_mutated = true;
+                } else {
+                    new_columns.push(Arc::clone(array));
+                }
+            }
+
+            let organized_batch = if arr_mutated {
+                RecordBatch::try_new(batch.schema(), new_columns)?
+            } else {
+                batch
+            };
+
+            organized_batches.push(organized_batch);
+        }
+
+        self.in_mem_batches = organized_batches;
+
+        Ok(())
+    }
+
     /// Sorts the in_mem_batches in place
     ///
     /// Sorting may have freed memory, especially if fetch is `Some`. If
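Reviewer note: for readers unfamiliar with `StringViewArray::gc()`, here is a standalone sketch of the buffer-sharing problem the doc comment above describes. It is illustrative, not part of this patch; the exact buffer counts assume values longer than the 12-byte inline limit and arrow-rs's current `interleave` behavior for view arrays.

```rust
use arrow::array::{Array, StringViewArray};
use arrow::compute::interleave;

fn main() -> Result<(), arrow::error::ArrowError> {
    // Two source arrays, each owning one payload buffer (the values are
    // longer than 12 bytes, so they are stored out of line).
    let a = StringViewArray::from_iter_values(["first string, longer than 12 bytes"; 4]);
    let b = StringViewArray::from_iter_values(["second string, also stored out of line"; 4]);

    // `interleave` (what a merge-sorted run effectively applies) yields views
    // that still point into *both* source buffers.
    let merged = interleave(&[&a as &dyn Array, &b], &[(0, 0), (1, 0), (0, 1), (1, 1)])?;
    let merged = merged.as_any().downcast_ref::<StringViewArray>().unwrap();
    assert_eq!(merged.data_buffers().len(), 2);

    // `gc()` copies only the referenced bytes into a fresh, compact buffer,
    // so spilling this array no longer drags both source buffers to disk.
    let compacted = merged.gc();
    assert_eq!(compacted.data_buffers().len(), 1);
    Ok(())
}
```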
@@ -439,54 +507,29 @@ impl ExternalSorter {
         // `self.in_mem_batches` is already taken away by the sort_stream, now it is empty.
         // We'll gradually collect the sorted stream into self.in_mem_batches, or directly
         // write sorted batches to disk when the memory is insufficient.
-        let mut spill_writer: Option<IPCWriter> = None;
         while let Some(batch) = sorted_stream.next().await {
             let batch = batch?;
-            match &mut spill_writer {
-                None => {
-                    let sorted_size = get_reserved_byte_for_record_batch(&batch);
-                    if self.reservation.try_grow(sorted_size).is_err() {
-                        // Directly write in_mem_batches as well as all the remaining batches in
-                        // sorted_stream to disk. Further batches fetched from `sorted_stream` will
-                        // be handled by the `Some(writer)` matching arm.
-                        let spill_file =
-                            self.runtime.disk_manager.create_tmp_file("Sorting")?;
-                        let mut writer = IPCWriter::new(spill_file.path(), &self.schema)?;
-                        // Flush everything in memory to the spill file
-                        for batch in self.in_mem_batches.drain(..) {
-                            writer.write(&batch)?;
-                        }
-                        // as well as the newly sorted batch
-                        writer.write(&batch)?;
-                        spill_writer = Some(writer);
-                        self.reservation.free();
-                        self.spills.push(spill_file);
-                    } else {
-                        self.in_mem_batches.push(batch);
-                        self.in_mem_batches_sorted = true;
-                    }
-                }
-                Some(writer) => {
-                    writer.write(&batch)?;
-                }
+            let sorted_size = get_reserved_byte_for_record_batch(&batch);
+            if self.reservation.try_grow(sorted_size).is_err() {
+                // Although the reservation is not enough, the batch is
+                // already in memory, so it's okay to combine it with previously
+                // sorted batches and spill them together.
+                self.in_mem_batches.push(batch);
+                self.spill().await?; // reservation is freed in spill()
+            } else {
+                self.in_mem_batches.push(batch);
+                self.in_mem_batches_sorted = true;
             }
         }
 
         // Drop early to free up memory reserved by the sorted stream, otherwise the
         // upcoming `self.reserve_memory_for_merge()` may fail due to insufficient memory.
         drop(sorted_stream);
 
-        if let Some(writer) = &mut spill_writer {
-            writer.finish()?;
-            self.metrics.spill_count.add(1);
-            self.metrics.spilled_rows.add(writer.num_rows);
-            self.metrics.spilled_bytes.add(writer.num_bytes);
-        }
-
         // Sorting may free up some memory especially when fetch is `Some`. If we have
         // not freed more than 50% of the memory, then we have to spill to free up more
         // memory for inserting more batches.
-        if spill_writer.is_none() && self.reservation.size() > before / 2 {
+        if self.reservation.size() > before / 2 {
             // We have not freed more than 50% of the memory, so we have to spill to
             // free up more memory
             self.spill().await?;
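Reviewer note: the rewritten loop now relies on `spill()` to flush everything, including the just-pushed batch, and to free the reservation afterwards. Below is a minimal standalone sketch of that "grow or spill" accounting pattern; `Reservation`, `spill_to_disk`, and `insert_sorted_batch` are toy stand-ins, not DataFusion's actual types.

```rust
/// Toy memory reservation; DataFusion's real `MemoryReservation` is richer.
struct Reservation {
    used: usize,
    limit: usize,
}

impl Reservation {
    fn try_grow(&mut self, bytes: usize) -> Result<(), ()> {
        if self.used + bytes > self.limit {
            Err(())
        } else {
            self.used += bytes;
            Ok(())
        }
    }

    fn free(&mut self) {
        self.used = 0;
    }
}

/// Stand-in for writing an IPC spill file and clearing the in-memory run.
fn spill_to_disk(buffered: &mut Vec<Vec<u8>>, reservation: &mut Reservation) {
    buffered.clear();
    reservation.free(); // mirrors "reservation is freed in spill()"
}

fn insert_sorted_batch(
    buffered: &mut Vec<Vec<u8>>,
    reservation: &mut Reservation,
    batch: Vec<u8>,
) {
    if reservation.try_grow(batch.len()).is_err() {
        // The batch already exists in memory even though the reservation
        // could not grow, so it is safe to enqueue it and spill everything
        // in a single pass.
        buffered.push(batch);
        spill_to_disk(buffered, reservation);
    } else {
        buffered.push(batch);
    }
}
```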
@@ -1422,10 +1465,14 @@ mod tests {
         let spill_count = metrics.spill_count().unwrap();
         let spilled_rows = metrics.spilled_rows().unwrap();
         let spilled_bytes = metrics.spilled_bytes().unwrap();
-        // Processing 840 KB of data using 400 KB of memory requires at least 2 spills
-        // It will spill roughly 18000 rows and 800 KBytes.
-        // We leave a little wiggle room for the actual numbers.
-        assert!((2..=10).contains(&spill_count));
+
+        // This test case processes 840KB of data using 400KB of memory. Note
+        // that buffered batches can't be dropped until all sorted batches are
+        // generated, so we can only buffer `sort_spill_reservation_bytes` worth
+        // of sorted batches.
+        // The number of spills is roughly:
+        // `number_of_batches / (sort_spill_reservation_bytes / batch_size)`
+        assert!((12..=18).contains(&spill_count));
         assert!((15000..=20000).contains(&spilled_rows));
         assert!((700000..=900000).contains(&spilled_bytes));
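To make the new spill-count range concrete with purely illustrative numbers (the actual test parameters are not visible in this diff): if the 840KB input arrives as roughly 100 batches of about 8.4KB each, and `sort_spill_reservation_bytes` allows buffering about 60KB (around 7 batches) of sorted output before each flush, the formula gives 100 / 7 ≈ 14 spills, comfortably inside the asserted `12..=18` range.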