@@ -43,7 +43,6 @@ pub(crate) struct CachedPredicateResultBuilder {
43
43
/// <https://github.com/apache/arrow-rs/issues/6692>
44
44
in_progress_arrays : Vec < Box < dyn InProgressArray > > ,
45
45
filters : Vec < BooleanArray > ,
46
- batch_size : usize ,
47
46
}
48
47
49
48
impl CachedPredicateResultBuilder {
@@ -79,7 +78,6 @@ impl CachedPredicateResultBuilder {
79
78
Self {
80
79
in_progress_arrays,
81
80
filters : vec ! [ ] ,
82
- batch_size,
83
81
}
84
82
} ;
85
83
}
@@ -106,7 +104,6 @@ impl CachedPredicateResultBuilder {
106
104
Self {
107
105
in_progress_arrays,
108
106
filters : vec ! [ ] ,
109
- batch_size,
110
107
}
111
108
}
112
109
@@ -135,7 +132,6 @@ impl CachedPredicateResultBuilder {
135
132
let Self {
136
133
in_progress_arrays,
137
134
filters,
138
- batch_size : _,
139
135
} = self ;
140
136
141
137
let new_selection = RowSelection :: from_filters ( & filters) ;
@@ -316,29 +312,36 @@ trait InProgressArray {
316
312
/// results is not possible.
317
313
fn create_in_progress_array (
318
314
in_projection : bool ,
319
- _data_type : & DataType ,
315
+ data_type : & DataType ,
320
316
batch_size : usize ,
321
317
) -> Box < dyn InProgressArray > {
322
- if in_projection {
323
- Box :: new ( InProgressArrayImpl :: new ( batch_size, GenericArrayBuilder :: new ( ) ) )
324
-
325
- } else {
318
+ if !in_projection {
326
319
// column is not in the projection, so no need to cache
327
- Box :: new ( NoOpInProgressArray :: new ( ) )
328
-
320
+ return Box :: new ( NoOpInProgressArray :: new ( ) )
321
+ }
322
+
323
+ match data_type {
324
+ DataType :: Utf8View => {
325
+ Box :: new ( InProgressArrayImpl :: new ( batch_size,
326
+ InProgressStringViewBuilder :: new_with_capacity ( batch_size) ) )
327
+ }
328
+ _ => {
329
+ // TODO implement more specific types
330
+ Box :: new ( InProgressArrayImpl :: new ( batch_size, GenericArrayBuilder :: new ( ) ) )
331
+ }
329
332
}
330
333
}
331
334
332
335
333
336
/// A builder for creating an InProgressArray. Trait so we can use Dyn dispatch
334
337
trait InProgressArrayBuilder {
335
338
/// Appends all values of the array to the in progress array
336
- ///
339
+ ///
337
340
/// TODO: potentially pass in filter and unfiltered array to avoid a copy
338
341
fn append ( & mut self , array : ArrayRef ) ;
339
342
340
343
/// Finalizes the in progress array, resetting state and returning the new array.
341
- ///
344
+ ///
342
345
/// Returns None if there are no rows in progress
343
346
fn try_build ( & mut self ) -> Result < Option < ArrayRef > > ;
344
347
}
@@ -363,7 +366,7 @@ impl <B: InProgressArrayBuilder> InProgressArrayImpl<B> {
363
366
inner,
364
367
}
365
368
}
366
-
369
+
367
370
/// Combines all arrays in `current` into a new array in `finished` and returns the
368
371
/// number of rows in the array added to `self.finished`
369
372
fn finish_current ( & mut self ) -> Result < usize > {
@@ -372,11 +375,11 @@ impl <B: InProgressArrayBuilder> InProgressArrayImpl<B> {
372
375
return Ok ( 0 ) ;
373
376
}
374
377
let Some ( new_array) = self . inner . try_build ( ) ? else {
375
- // no rows in current
378
+ // no rows in current
376
379
self . current_rows = 0 ;
377
380
return Ok ( 0 ) ;
378
381
} ;
379
-
382
+
380
383
let num_rows = new_array. len ( ) ;
381
384
self . finished . push ( new_array) ;
382
385
self . current_rows = 0 ;
@@ -442,7 +445,7 @@ impl NoOpInProgressArray {
442
445
}
443
446
444
447
/// Implements a GenericArrayBuilder used for any array type by using buffering and `concat`
445
- ///
448
+ ///
446
449
/// TODO avoid this by using type specific array builders
447
450
struct GenericArrayBuilder {
448
451
arrays : Vec < ArrayRef >
@@ -480,38 +483,42 @@ impl InProgressArrayBuilder for GenericArrayBuilder {
480
483
}
481
484
482
485
483
-
484
-
485
486
/// An implementation of InProgressArray for StringViewArray
486
487
/// that knows how to efficiently and incrementally concatenate arrays
487
- struct StringViewInProgressArray {
488
+ ///
489
+ /// TODO move this to StringViewBuilder (basically probably add `append_array` to it)
490
+ struct InProgressStringViewBuilder {
488
491
new_views : Vec < u128 > ,
489
492
null_buffer_builder : NullBufferBuilder ,
490
493
buffers : Vec < Buffer > ,
494
+ initial_capacity : usize ,
491
495
}
492
496
493
- impl StringViewInProgressArray {
494
- fn new_with_capacity ( num_rows : usize ) -> Self {
495
- todo ! ( )
496
- }
497
+ impl InProgressStringViewBuilder {
498
+ fn new_with_capacity ( initial_capacity : usize ) -> Self {
499
+ Self {
500
+ new_views : Vec :: with_capacity ( initial_capacity) ,
501
+ null_buffer_builder : NullBufferBuilder :: new ( initial_capacity) ,
502
+ buffers : Vec :: with_capacity ( 100 ) , // TODO better estimate of number of buffers
503
+ initial_capacity,
504
+ }
505
+ }
497
506
}
498
507
499
- fn concat_string_view_arrays ( arrays : & [ ArrayRef ] ) -> Result < ArrayRef > {
500
- // Special case for StringViewArray inspired by DataFusion:
501
- // https://github.com/apache/datafusion/blob/9d2f04996604e709ee440b65f41e7b882f50b788/datafusion/physical-plan/src/coalesce/mod.rs#L222-L221
502
-
503
- let total_rows = arrays. iter ( ) . map ( |a| a. len ( ) ) . sum ( ) ;
504
- let mut new_views = Vec :: with_capacity ( total_rows) ;
505
- let mut null_buffer_builder = NullBufferBuilder :: new ( total_rows) ;
506
- let mut buffers = Vec :: with_capacity ( 100 ) ; // better estimate of buffer sizes
507
-
508
- // copy each input array into the to the output, one at a time
509
- for array in arrays. iter ( ) {
508
+ impl InProgressArrayBuilder for InProgressStringViewBuilder {
509
+ fn append ( & mut self , array : ArrayRef ) {
510
+ // Special case for StringViewArray inspired by DataFusion:
511
+ // https://github.com/apache/datafusion/blob/9d2f04996604e709ee440b65f41e7b882f50b788/datafusion/physical-plan/src/coalesce/mod.rs#L222-L221
510
512
let num_rows = array. len ( ) ;
511
513
if num_rows == 0 {
512
- continue ;
514
+ return ; // nothing to do
513
515
}
514
516
let array = array. as_string_view ( ) ;
517
+
518
+ let null_buffer_builder = & mut self . null_buffer_builder ;
519
+ let buffers = & mut self . buffers ;
520
+ let new_views = & mut self . new_views ;
521
+
515
522
// Copy nulls
516
523
if let Some ( nulls) = array. nulls ( ) {
517
524
null_buffer_builder. append_buffer ( nulls) ;
@@ -550,7 +557,7 @@ fn concat_string_view_arrays(arrays: &[ArrayRef]) -> Result<ArrayRef> {
550
557
// buffer as well as updating the views
551
558
let mut new_buffer: Vec < u8 > = Vec :: with_capacity ( ideal_buffer_size) ;
552
559
let new_buffer_index = buffers. len ( ) as u32 ; // making one new buffer
553
- // Update any views that point to the old buffers.
560
+ // Update any views that point to the old buffers.
554
561
for v in new_views[ starting_view..] . iter_mut ( ) {
555
562
let view_len = * v as u32 ;
556
563
// if view_len is 12 or less, data is inlined and doesn't need an update
@@ -572,12 +579,18 @@ fn concat_string_view_arrays(arrays: &[ArrayRef]) -> Result<ArrayRef> {
572
579
}
573
580
}
574
581
575
- // Form output array
576
- let nulls = null_buffer_builder. finish ( ) ;
577
- // safety: we know what we are doing above
578
- let new_array =
579
- unsafe { StringViewArray :: new_unchecked ( ScalarBuffer :: from ( new_views) , buffers, nulls) } ;
580
- Ok ( Arc :: new ( new_array) )
582
+ fn try_build ( & mut self ) -> Result < Option < ArrayRef > > {
583
+ // Form output array
584
+ let nulls = self . null_buffer_builder . finish ( ) ;
585
+ let new_views = std:: mem:: replace ( & mut self . new_views , Vec :: with_capacity ( self . initial_capacity ) ) ;
586
+ let buffers= std:: mem:: replace ( & mut self . buffers , Vec :: with_capacity ( 100 ) ) ; // TODO better buffer estimate
587
+
588
+ // safety: we know what we are doing above
589
+ let new_array =
590
+ unsafe { StringViewArray :: new_unchecked ( ScalarBuffer :: from ( new_views) , buffers, nulls) } ;
591
+
592
+ Ok ( Some ( Arc :: new ( new_array) ) )
593
+ }
581
594
}
582
595
583
596
/// return the size required for buffers to hold all strings
0 commit comments