68
68
69
69
use std:: sync:: Arc ;
70
70
71
+ use arrow:: array:: StringViewArray ;
71
72
use arrow:: {
72
73
array:: { DictionaryArray , Float64Array , Int64Array , StringArray } ,
73
74
compute:: SortOptions ,
74
75
datatypes:: { Int32Type , Schema } ,
75
76
record_batch:: RecordBatch ,
76
77
} ;
77
-
78
78
use datafusion:: physical_plan:: sorts:: sort:: SortExec ;
79
79
use datafusion:: {
80
80
execution:: context:: TaskContext ,
@@ -114,11 +114,24 @@ fn criterion_benchmark(c: &mut Criterion) {
114
114
( "f64" , & f64_streams) ,
115
115
( "utf8 low cardinality" , & utf8_low_cardinality_streams) ,
116
116
( "utf8 high cardinality" , & utf8_high_cardinality_streams) ,
117
+ (
118
+ "utf8 view low cardinality" ,
119
+ & utf8_view_low_cardinality_streams,
120
+ ) ,
121
+ (
122
+ "utf8 view high cardinality" ,
123
+ & utf8_view_high_cardinality_streams,
124
+ ) ,
117
125
( "utf8 tuple" , & utf8_tuple_streams) ,
126
+ ( "utf8 view tuple" , & utf8_view_tuple_streams) ,
118
127
( "utf8 dictionary" , & dictionary_streams) ,
119
128
( "utf8 dictionary tuple" , & dictionary_tuple_streams) ,
120
129
( "mixed dictionary tuple" , & mixed_dictionary_tuple_streams) ,
121
130
( "mixed tuple" , & mixed_tuple_streams) ,
131
+ (
132
+ "mixed tuple with utf8 view" ,
133
+ & mixed_tuple_with_utf8_view_streams,
134
+ ) ,
122
135
] ;
123
136
124
137
for ( name, f) in cases {
@@ -308,6 +321,30 @@ fn utf8_low_cardinality_streams(sorted: bool) -> PartitionedBatches {
308
321
} )
309
322
}
310
323
324
+ /// Create streams of random low cardinality utf8_view values
325
+ fn utf8_view_low_cardinality_streams ( sorted : bool ) -> PartitionedBatches {
326
+ let mut values = DataGenerator :: new ( ) . utf8_low_cardinality_values ( ) ;
327
+ if sorted {
328
+ values. sort_unstable ( ) ;
329
+ }
330
+ split_tuples ( values, |v| {
331
+ let array: StringViewArray = v. into_iter ( ) . collect ( ) ;
332
+ RecordBatch :: try_from_iter ( vec ! [ ( "utf_view_low" , Arc :: new( array) as _) ] ) . unwrap ( )
333
+ } )
334
+ }
335
+
336
+ /// Create streams of high cardinality (~ no duplicates) utf8_view values
337
+ fn utf8_view_high_cardinality_streams ( sorted : bool ) -> PartitionedBatches {
338
+ let mut values = DataGenerator :: new ( ) . utf8_high_cardinality_values ( ) ;
339
+ if sorted {
340
+ values. sort_unstable ( ) ;
341
+ }
342
+ split_tuples ( values, |v| {
343
+ let array: StringViewArray = v. into_iter ( ) . collect ( ) ;
344
+ RecordBatch :: try_from_iter ( vec ! [ ( "utf_view_high" , Arc :: new( array) as _) ] ) . unwrap ( )
345
+ } )
346
+ }
347
+
311
348
/// Create streams of high cardinality (~ no duplicates) utf8 values
312
349
fn utf8_high_cardinality_streams ( sorted : bool ) -> PartitionedBatches {
313
350
let mut values = DataGenerator :: new ( ) . utf8_high_cardinality_values ( ) ;
@@ -353,6 +390,39 @@ fn utf8_tuple_streams(sorted: bool) -> PartitionedBatches {
353
390
} )
354
391
}
355
392
393
+ /// Create a batch of (utf8_view_low, utf8_view_low, utf8_view_high)
394
+ fn utf8_view_tuple_streams ( sorted : bool ) -> PartitionedBatches {
395
+ let mut gen = DataGenerator :: new ( ) ;
396
+
397
+ // need to sort by the combined key, so combine them together
398
+ let mut tuples: Vec < _ > = gen
399
+ . utf8_low_cardinality_values ( )
400
+ . into_iter ( )
401
+ . zip ( gen. utf8_low_cardinality_values ( ) )
402
+ . zip ( gen. utf8_high_cardinality_values ( ) )
403
+ . collect ( ) ;
404
+
405
+ if sorted {
406
+ tuples. sort_unstable ( ) ;
407
+ }
408
+
409
+ split_tuples ( tuples, |tuples| {
410
+ let ( tuples, utf8_high) : ( Vec < _ > , Vec < _ > ) = tuples. into_iter ( ) . unzip ( ) ;
411
+ let ( utf8_low1, utf8_low2) : ( Vec < _ > , Vec < _ > ) = tuples. into_iter ( ) . unzip ( ) ;
412
+
413
+ let utf8_view_high: StringViewArray = utf8_high. into_iter ( ) . collect ( ) ;
414
+ let utf8_view_low1: StringViewArray = utf8_low1. into_iter ( ) . collect ( ) ;
415
+ let utf8_view_low2: StringViewArray = utf8_low2. into_iter ( ) . collect ( ) ;
416
+
417
+ RecordBatch :: try_from_iter ( vec ! [
418
+ ( "utf_view_low1" , Arc :: new( utf8_view_low1) as _) ,
419
+ ( "utf_view_low2" , Arc :: new( utf8_view_low2) as _) ,
420
+ ( "utf_view_high" , Arc :: new( utf8_view_high) as _) ,
421
+ ] )
422
+ . unwrap ( )
423
+ } )
424
+ }
425
+
356
426
/// Create a batch of (f64, utf8_low, utf8_low, i64)
357
427
fn mixed_tuple_streams ( sorted : bool ) -> PartitionedBatches {
358
428
let mut gen = DataGenerator :: new ( ) ;
@@ -391,6 +461,44 @@ fn mixed_tuple_streams(sorted: bool) -> PartitionedBatches {
391
461
} )
392
462
}
393
463
464
+ /// Create a batch of (f64, utf8_view_low, utf8_view_low, i64)
465
+ fn mixed_tuple_with_utf8_view_streams ( sorted : bool ) -> PartitionedBatches {
466
+ let mut gen = DataGenerator :: new ( ) ;
467
+
468
+ // need to sort by the combined key, so combine them together
469
+ let mut tuples: Vec < _ > = gen
470
+ . i64_values ( )
471
+ . into_iter ( )
472
+ . zip ( gen. utf8_low_cardinality_values ( ) )
473
+ . zip ( gen. utf8_low_cardinality_values ( ) )
474
+ . zip ( gen. i64_values ( ) )
475
+ . collect ( ) ;
476
+
477
+ if sorted {
478
+ tuples. sort_unstable ( ) ;
479
+ }
480
+
481
+ split_tuples ( tuples, |tuples| {
482
+ let ( tuples, i64_values) : ( Vec < _ > , Vec < _ > ) = tuples. into_iter ( ) . unzip ( ) ;
483
+ let ( tuples, utf8_low2) : ( Vec < _ > , Vec < _ > ) = tuples. into_iter ( ) . unzip ( ) ;
484
+ let ( f64_values, utf8_low1) : ( Vec < _ > , Vec < _ > ) = tuples. into_iter ( ) . unzip ( ) ;
485
+
486
+ let f64_values: Float64Array = f64_values. into_iter ( ) . map ( |v| v as f64 ) . collect ( ) ;
487
+
488
+ let utf8_view_low1: StringViewArray = utf8_low1. into_iter ( ) . collect ( ) ;
489
+ let utf8_view_low2: StringViewArray = utf8_low2. into_iter ( ) . collect ( ) ;
490
+ let i64_values: Int64Array = i64_values. into_iter ( ) . collect ( ) ;
491
+
492
+ RecordBatch :: try_from_iter ( vec ! [
493
+ ( "f64" , Arc :: new( f64_values) as _) ,
494
+ ( "utf_view_low1" , Arc :: new( utf8_view_low1) as _) ,
495
+ ( "utf_view_low2" , Arc :: new( utf8_view_low2) as _) ,
496
+ ( "i64" , Arc :: new( i64_values) as _) ,
497
+ ] )
498
+ . unwrap ( )
499
+ } )
500
+ }
501
+
394
502
/// Create a batch of (utf8_dict)
395
503
fn dictionary_streams ( sorted : bool ) -> PartitionedBatches {
396
504
let mut gen = DataGenerator :: new ( ) ;
@@ -402,7 +510,6 @@ fn dictionary_streams(sorted: bool) -> PartitionedBatches {
402
510
split_tuples ( values, |v| {
403
511
let dictionary: DictionaryArray < Int32Type > =
404
512
v. iter ( ) . map ( Option :: as_deref) . collect ( ) ;
405
-
406
513
RecordBatch :: try_from_iter ( vec ! [ ( "dict" , Arc :: new( dictionary) as _) ] ) . unwrap ( )
407
514
} )
408
515
}
0 commit comments