@@ -33,12 +33,18 @@ use datafusion::common::tree_node::{Transformed, TreeNode, TreeNodeRecursion, Tr
33
33
// use datafusion::config::ConfigFileType;
34
34
use datafusion:: error:: { DataFusionError , Result } ;
35
35
use datafusion:: execution:: disk_manager:: DiskManagerConfig ;
36
+ use datafusion:: execution:: runtime_env:: { RuntimeEnv , RuntimeEnvBuilder } ;
36
37
// use datafusion::execution::runtime_env::RuntimeEnvBuilder;
37
38
use datafusion:: execution:: SessionStateBuilder ;
38
39
use datafusion:: logical_expr:: expr:: Alias ;
39
40
use datafusion:: logical_expr:: {
40
41
Aggregate , Explain , Filter , LogicalPlan , PlanType , Projection , ToStringifiedPlan ,
41
42
} ;
43
+ use datafusion:: physical_expr:: create_physical_expr;
44
+ use datafusion:: physical_plan:: coalesce_batches:: CoalesceBatchesExec ;
45
+ use datafusion:: physical_plan:: repartition:: RepartitionExec ;
46
+ use datafusion:: physical_plan:: { collect as PhysicalPlanCollect , ExecutionPlan , Partitioning } ;
47
+ use datafusion:: physical_plan:: filter:: FilterExec ;
42
48
// use datafusion::physical_plan::execution_plan::EmissionType;
43
49
// use datafusion::physical_plan::{collect, execute_stream, ExecutionPlanProperties};
44
50
use datafusion:: prelude:: * ;
@@ -759,50 +765,133 @@ struct BenchmarkResult {
759
765
elapsed_seconds : f64 ,
760
766
}
761
767
762
- #[ derive( Debug , Serialize ) ]
763
- struct BenchmarkResponse {
764
- results : Vec < BenchmarkResult > ,
765
- }
766
768
767
769
pub async fn run_benchmark ( ) {
768
770
const TRIES : usize = 1 ;
769
771
770
772
let mut results = Vec :: new ( ) ;
771
773
let mut query_num = 1 ;
774
+
775
+ // 1. Configure Runtime Environment with parallelism
776
+ let runtime_config = RuntimeEnvBuilder :: new ( ) // Number of partitions for parallel processing
777
+ . with_disk_manager ( DiskManagerConfig :: NewOs ) ;
778
+
779
+ let runtime = RuntimeEnv :: new ( runtime_config) . unwrap ( ) ;
780
+
781
+
772
782
// Create session context
773
- let ctx = SessionContext :: new ( ) ;
783
+ let mut config = SessionConfig :: new ( ) . with_coalesce_batches ( true )
784
+ . with_target_partitions ( 8 )
785
+ . with_batch_size ( 50000 ) ;
786
+ config. options_mut ( ) . execution . parquet . binary_as_string = true ;
787
+ config. options_mut ( ) . execution . use_row_number_estimates_to_optimize_partitioning = true ;
788
+ config. options_mut ( ) . execution . parquet . pushdown_filters = true ;
789
+ config. options_mut ( ) . execution . parquet . enable_page_index = true ;
790
+ config. options_mut ( ) . execution . parquet . pruning = true ;
791
+ config. options_mut ( ) . execution . parquet . reorder_filters = true ;
792
+ let state = SessionStateBuilder :: new ( )
793
+ . with_default_features ( )
794
+ . with_config ( config)
795
+ . with_runtime_env ( Arc :: new ( runtime) )
796
+ . build ( ) ;
797
+ let ctx = SessionContext :: new_with_state ( state) ;
774
798
let sql = "CREATE EXTERNAL TABLE hits STORED AS PARQUET LOCATION '/home/ubuntu/clickbench/hits.parquet' OPTIONS ('binary_as_string' 'true')" ;
775
799
let _ = ctx. sql ( & sql) . await . unwrap ( ) . collect ( ) . await . unwrap ( ) ;
776
800
// Read queries from file
777
801
let queries = fs:: read_to_string ( "/home/ubuntu/queries.sql" ) . unwrap ( ) ;
778
802
779
803
780
804
for query in queries. lines ( ) {
781
- // Write current query to temporary file
782
805
fs:: write ( "/tmp/query.sql" , & query) . unwrap ( ) ;
783
-
806
+
784
807
for iteration in 1 ..=TRIES {
785
- // Clear caches
786
808
clear_caches ( ) . unwrap ( ) ;
809
+
810
+
811
+ // Create the query plan
812
+ let df = ctx. sql ( & query) . await . unwrap ( ) ;
813
+ let logical_plan = df. logical_plan ( ) . clone ( ) ;
814
+ let physical_plan = df. create_physical_plan ( ) . await . unwrap ( ) ;
815
+
816
+ // Add coalesce
817
+ let mut exec_plan: Arc < dyn ExecutionPlan > = Arc :: new ( CoalesceBatchesExec :: new ( physical_plan, 50000 ) ) ;
818
+
819
+ // Check if plan contains filter and add FilterExec
820
+ fn has_filter ( plan : & LogicalPlan ) -> bool {
821
+ match plan {
822
+ LogicalPlan :: Filter ( _) => true ,
823
+ LogicalPlan :: Projection ( proj) => has_filter ( proj. input . as_ref ( ) ) ,
824
+ LogicalPlan :: Aggregate ( agg) => has_filter ( agg. input . as_ref ( ) ) ,
825
+ LogicalPlan :: Join ( join) => {
826
+ has_filter ( join. left . as_ref ( ) ) || has_filter ( join. right . as_ref ( ) )
827
+ } ,
828
+ LogicalPlan :: Window ( window) => has_filter ( window. input . as_ref ( ) ) ,
829
+ LogicalPlan :: Sort ( sort) => has_filter ( sort. input . as_ref ( ) ) ,
830
+ LogicalPlan :: Limit ( limit) => has_filter ( limit. input . as_ref ( ) ) ,
831
+ _ => false ,
832
+ }
833
+ }
834
+
835
+ // Extract filter expressions from logical plan
836
+ fn extract_filters ( plan : & LogicalPlan ) -> Vec < Expr > {
837
+ match plan {
838
+ LogicalPlan :: Filter ( filter) => vec ! [ filter. predicate. clone( ) ] ,
839
+ LogicalPlan :: Projection ( proj) => extract_filters ( proj. input . as_ref ( ) ) ,
840
+ LogicalPlan :: Aggregate ( agg) => extract_filters ( agg. input . as_ref ( ) ) ,
841
+ LogicalPlan :: Join ( join) => {
842
+ let mut filters = extract_filters ( join. left . as_ref ( ) ) ;
843
+ filters. extend ( extract_filters ( join. right . as_ref ( ) ) ) ;
844
+ filters
845
+ } ,
846
+ _ => vec ! [ ] ,
847
+ }
848
+ }
849
+
850
+ if has_filter ( & logical_plan) {
851
+ let filters = extract_filters ( & logical_plan) ;
852
+ for filter_expr in filters {
853
+
854
+
855
+ if let Ok ( physical_filter_expr) = create_physical_expr (
856
+ & filter_expr,
857
+ & logical_plan. schema ( ) ,
858
+ & ctx. state ( ) . execution_props ( ) ,
859
+
860
+ ) {
861
+ exec_plan = Arc :: new ( FilterExec :: try_new (
862
+ physical_filter_expr,
863
+ exec_plan,
864
+ ) . unwrap ( ) ) ;
865
+ }
787
866
788
- // Execute and time the query
867
+
868
+ }
869
+ }
870
+
871
+ // Execute the plan
872
+ let task_ctx = ctx. task_ctx ( ) ;
789
873
let start = Instant :: now ( ) ;
790
- ctx. sql ( & query) . await . unwrap ( ) . collect ( ) . await . unwrap ( ) ;
874
+
875
+ //let _ = execute_parallel(exec_plan.clone(), ctx.task_ctx()).await.unwrap();
876
+ // Add repartitioning for better parallelism
877
+ let repartitioned = Arc :: new ( RepartitionExec :: try_new (
878
+ exec_plan,
879
+ Partitioning :: RoundRobinBatch ( 8 ) ,
880
+ ) . unwrap ( ) ) ;
881
+ let _ = PhysicalPlanCollect ( repartitioned, task_ctx) . await . unwrap ( ) ;
882
+
791
883
let elapsed = start. elapsed ( ) . as_secs_f64 ( ) ;
792
884
let benchmark_result = BenchmarkResult {
793
885
query_num,
794
886
iteration,
795
887
elapsed_seconds : elapsed,
796
888
} ;
797
889
println ! ( "Query {query_num} iteration {iteration} took {elapsed} seconds" ) ;
798
- // Record result
799
890
results. push ( benchmark_result) ;
800
-
801
891
}
802
892
query_num += 1 ;
803
893
}
804
894
805
- println ! ( "{}" , serde_json:: to_string( & BenchmarkResponse { results } ) . unwrap( ) ) ;
806
895
}
807
896
808
897
fn clear_caches ( ) -> io:: Result < ( ) > {
0 commit comments