@@ -215,10 +215,6 @@ async fn test_semi_join_1k() {
215
215
.await
216
216
}
217
217
218
- // The test is flaky
219
- // https://github.com/apache/datafusion/issues/10886
220
- // SMJ produces 1 more row in the output
221
- #[ignore]
222
218
#[tokio::test]
223
219
async fn test_semi_join_1k_filtered() {
224
220
JoinFuzzTestCase::new(
@@ -442,18 +438,45 @@ impl JoinFuzzTestCase {
442
438
443
439
if debug {
444
440
println!("The debug is ON. Input data will be saved");
445
- let out_dir_name = &format!("fuzz_test_debug_batch_size_{batch_size}");
446
- Self::save_as_parquet(&self.input1, out_dir_name, "input1");
447
- Self::save_as_parquet(&self.input2, out_dir_name, "input2");
441
+ let fuzz_debug = "fuzz_test_debug";
442
+ std::fs::remove_dir_all(fuzz_debug).unwrap_or(());
443
+ std::fs::create_dir_all(fuzz_debug).unwrap();
444
+ let out_dir_name = &format!("{fuzz_debug}/batch_size_{batch_size}");
445
+ Self::save_partitioned_batches_as_parquet(
446
+ &self.input1,
447
+ out_dir_name,
448
+ "input1",
449
+ );
450
+ Self::save_partitioned_batches_as_parquet(
451
+ &self.input2,
452
+ out_dir_name,
453
+ "input2",
454
+ );
448
455
449
456
if join_tests.contains(&JoinTestType::NljHj) {
450
- Self::save_as_parquet(&nlj_collected, out_dir_name, "nlj");
451
- Self::save_as_parquet(&hj_collected, out_dir_name, "hj");
457
+ Self::save_partitioned_batches_as_parquet(
458
+ &nlj_collected,
459
+ out_dir_name,
460
+ "nlj",
461
+ );
462
+ Self::save_partitioned_batches_as_parquet(
463
+ &hj_collected,
464
+ out_dir_name,
465
+ "hj",
466
+ );
452
467
}
453
468
454
469
if join_tests.contains(&JoinTestType::HjSmj) {
455
- Self::save_as_parquet(&hj_collected, out_dir_name, "hj");
456
- Self::save_as_parquet(&smj_collected, out_dir_name, "smj");
470
+ Self::save_partitioned_batches_as_parquet(
471
+ &hj_collected,
472
+ out_dir_name,
473
+ "hj",
474
+ );
475
+ Self::save_partitioned_batches_as_parquet(
476
+ &smj_collected,
477
+ out_dir_name,
478
+ "smj",
479
+ );
457
480
}
458
481
}
459
482
@@ -527,11 +550,26 @@ impl JoinFuzzTestCase {
527
550
/// as parquet files, preserving partitioning.
528
551
/// Once the data is saved it is possible to run a custom test on top of the saved data and debug
529
552
///
553
+ /// #[tokio::test]
554
+ /// async fn test1() {
555
+ /// let left: Vec<RecordBatch> = JoinFuzzTestCase::load_partitioned_batches_from_parquet("fuzz_test_debug/batch_size_2/input1").await.unwrap();
556
+ /// let right: Vec<RecordBatch> = JoinFuzzTestCase::load_partitioned_batches_from_parquet("fuzz_test_debug/batch_size_2/input2").await.unwrap();
557
+ ///
558
+ /// JoinFuzzTestCase::new(
559
+ /// left,
560
+ /// right,
561
+ /// JoinType::LeftSemi,
562
+ /// Some(Box::new(col_lt_col_filter)),
563
+ /// )
564
+ /// .run_test(&[JoinTestType::HjSmj], false)
565
+ /// .await
566
+ /// }
567
+ ///
530
568
/// let ctx: SessionContext = SessionContext::new();
531
569
/// let df = ctx
532
570
/// .read_parquet(
533
571
/// "/tmp/input1/*.parquet",
534
- /// ParquetReadOptions::default(),
572
+ /// datafusion::prelude::ParquetReadOptions::default(),
535
573
/// )
536
574
/// .await
537
575
/// .unwrap();
@@ -540,7 +578,7 @@ impl JoinFuzzTestCase {
540
578
/// let df = ctx
541
579
/// .read_parquet(
542
580
/// "/tmp/input2/*.parquet",
543
- /// ParquetReadOptions::default(),
581
+ /// datafusion::prelude::ParquetReadOptions::default(),
544
582
/// )
545
583
/// .await
546
584
/// .unwrap();
@@ -554,8 +592,11 @@ impl JoinFuzzTestCase {
554
592
/// )
555
593
/// .run_test()
556
594
/// .await
557
- /// }
558
- fn save_as_parquet(input: &[RecordBatch], output_dir: &str, out_name: &str) {
595
+ fn save_partitioned_batches_as_parquet(
596
+ input: &[RecordBatch],
597
+ output_dir: &str,
598
+ out_name: &str,
599
+ ) {
559
600
let out_path = &format!("{output_dir}/{out_name}");
560
601
std::fs::remove_dir_all(out_path).unwrap_or(());
561
602
std::fs::create_dir_all(out_path).unwrap();
@@ -576,6 +617,39 @@ impl JoinFuzzTestCase {
576
617
577
618
println!("The data {out_name} saved as parquet into {out_path}");
578
619
}
620
+
621
+ /// Read parquet files preserving partitions, i.e. 1 file -> 1 partition
622
+ /// Files can be of different sizes
623
+ /// The method can be useful to read partitions that have been saved by `save_partitioned_batches_as_parquet`
624
+ /// for test debugging purposes
625
+ #[allow(dead_code)]
626
+ async fn load_partitioned_batches_from_parquet(
627
+ dir: &str,
628
+ ) -> std::io::Result<Vec<RecordBatch>> {
629
+ let ctx: SessionContext = SessionContext::new();
630
+ let mut batches: Vec<RecordBatch> = vec![];
631
+
632
+ for entry in std::fs::read_dir(dir)? {
633
+ let entry = entry?;
634
+ let path = entry.path();
635
+
636
+ if path.is_file() {
637
+ let mut batch = ctx
638
+ .read_parquet(
639
+ path.to_str().unwrap(),
640
+ datafusion::prelude::ParquetReadOptions::default(),
641
+ )
642
+ .await
643
+ .unwrap()
644
+ .collect()
645
+ .await
646
+ .unwrap();
647
+
648
+ batches.append(&mut batch);
649
+ }
650
+ }
651
+ Ok(batches)
652
+ }
579
653
}
580
654
581
655
/// Return randomly sized record batches with:
0 commit comments