@@ -156,7 +156,8 @@ impl Default for DataFrameWriteOptions {
156
156
/// ```
157
157
#[ derive( Debug , Clone ) ]
158
158
pub struct DataFrame {
159
- session_state : SessionState ,
159
+ // Box the (large) SessionState to reduce the size of DataFrame on the stack
160
+ session_state : Box < SessionState > ,
160
161
plan : LogicalPlan ,
161
162
}
162
163
@@ -168,7 +169,7 @@ impl DataFrame {
168
169
/// `DataFrame` from an existing datasource.
169
170
pub fn new ( session_state : SessionState , plan : LogicalPlan ) -> Self {
170
171
Self {
171
- session_state,
172
+ session_state : Box :: new ( session_state ) ,
172
173
plan,
173
174
}
174
175
}
@@ -230,7 +231,10 @@ impl DataFrame {
230
231
} ;
231
232
let project_plan = LogicalPlanBuilder :: from ( plan) . project ( expr_list) ?. build ( ) ?;
232
233
233
- Ok ( DataFrame :: new ( self . session_state , project_plan) )
234
+ Ok ( DataFrame {
235
+ session_state : self . session_state ,
236
+ plan : project_plan,
237
+ } )
234
238
}
235
239
236
240
/// Expand each list element of a column to multiple rows.
@@ -269,7 +273,10 @@ impl DataFrame {
269
273
let plan = LogicalPlanBuilder :: from ( self . plan )
270
274
. unnest_column_with_options ( column, options) ?
271
275
. build ( ) ?;
272
- Ok ( DataFrame :: new ( self . session_state , plan) )
276
+ Ok ( DataFrame {
277
+ session_state : self . session_state ,
278
+ plan,
279
+ } )
273
280
}
274
281
275
282
/// Return a DataFrame with only rows for which `predicate` evaluates to
@@ -294,7 +301,10 @@ impl DataFrame {
294
301
let plan = LogicalPlanBuilder :: from ( self . plan )
295
302
. filter ( predicate) ?
296
303
. build ( ) ?;
297
- Ok ( DataFrame :: new ( self . session_state , plan) )
304
+ Ok ( DataFrame {
305
+ session_state : self . session_state ,
306
+ plan,
307
+ } )
298
308
}
299
309
300
310
/// Return a new `DataFrame` that aggregates the rows of the current
@@ -325,7 +335,10 @@ impl DataFrame {
325
335
let plan = LogicalPlanBuilder :: from ( self . plan )
326
336
. aggregate ( group_expr, aggr_expr) ?
327
337
. build ( ) ?;
328
- Ok ( DataFrame :: new ( self . session_state , plan) )
338
+ Ok ( DataFrame {
339
+ session_state : self . session_state ,
340
+ plan,
341
+ } )
329
342
}
330
343
331
344
/// Return a new DataFrame that adds the result of evaluating one or more
@@ -334,7 +347,10 @@ impl DataFrame {
334
347
let plan = LogicalPlanBuilder :: from ( self . plan )
335
348
. window ( window_exprs) ?
336
349
. build ( ) ?;
337
- Ok ( DataFrame :: new ( self . session_state , plan) )
350
+ Ok ( DataFrame {
351
+ session_state : self . session_state ,
352
+ plan,
353
+ } )
338
354
}
339
355
340
356
/// Returns a new `DataFrame` with a limited number of rows.
@@ -359,7 +375,10 @@ impl DataFrame {
359
375
let plan = LogicalPlanBuilder :: from ( self . plan )
360
376
. limit ( skip, fetch) ?
361
377
. build ( ) ?;
362
- Ok ( DataFrame :: new ( self . session_state , plan) )
378
+ Ok ( DataFrame {
379
+ session_state : self . session_state ,
380
+ plan,
381
+ } )
363
382
}
364
383
365
384
/// Calculate the union of two [`DataFrame`]s, preserving duplicate rows.
@@ -383,7 +402,10 @@ impl DataFrame {
383
402
let plan = LogicalPlanBuilder :: from ( self . plan )
384
403
. union ( dataframe. plan ) ?
385
404
. build ( ) ?;
386
- Ok ( DataFrame :: new ( self . session_state , plan) )
405
+ Ok ( DataFrame {
406
+ session_state : self . session_state ,
407
+ plan,
408
+ } )
387
409
}
388
410
389
411
/// Calculate the distinct union of two [`DataFrame`]s.
@@ -405,12 +427,13 @@ impl DataFrame {
405
427
/// # }
406
428
/// ```
407
429
pub fn union_distinct ( self , dataframe : DataFrame ) -> Result < DataFrame > {
408
- Ok ( DataFrame :: new (
409
- self . session_state ,
410
- LogicalPlanBuilder :: from ( self . plan )
411
- . union_distinct ( dataframe. plan ) ?
412
- . build ( ) ?,
413
- ) )
430
+ let plan = LogicalPlanBuilder :: from ( self . plan )
431
+ . union_distinct ( dataframe. plan ) ?
432
+ . build ( ) ?;
433
+ Ok ( DataFrame {
434
+ session_state : self . session_state ,
435
+ plan,
436
+ } )
414
437
}
415
438
416
439
/// Return a new `DataFrame` with all duplicated rows removed.
@@ -428,10 +451,11 @@ impl DataFrame {
428
451
/// # }
429
452
/// ```
430
453
pub fn distinct ( self ) -> Result < DataFrame > {
431
- Ok ( DataFrame :: new (
432
- self . session_state ,
433
- LogicalPlanBuilder :: from ( self . plan ) . distinct ( ) ?. build ( ) ?,
434
- ) )
454
+ let plan = LogicalPlanBuilder :: from ( self . plan ) . distinct ( ) ?. build ( ) ?;
455
+ Ok ( DataFrame {
456
+ session_state : self . session_state ,
457
+ plan,
458
+ } )
435
459
}
436
460
437
461
/// Return a new `DataFrame` that has statistics for a DataFrame.
@@ -599,15 +623,18 @@ impl DataFrame {
599
623
describe_record_batch. schema ( ) ,
600
624
vec ! [ vec![ describe_record_batch] ] ,
601
625
) ?;
602
- Ok ( DataFrame :: new (
603
- self . session_state ,
604
- LogicalPlanBuilder :: scan (
605
- UNNAMED_TABLE ,
606
- provider_as_source ( Arc :: new ( provider) ) ,
607
- None ,
608
- ) ?
609
- . build ( ) ?,
610
- ) )
626
+
627
+ let plan = LogicalPlanBuilder :: scan (
628
+ UNNAMED_TABLE ,
629
+ provider_as_source ( Arc :: new ( provider) ) ,
630
+ None ,
631
+ ) ?
632
+ . build ( ) ?;
633
+
634
+ Ok ( DataFrame {
635
+ session_state : self . session_state ,
636
+ plan,
637
+ } )
611
638
}
612
639
613
640
/// Sort the DataFrame by the specified sorting expressions.
@@ -633,7 +660,10 @@ impl DataFrame {
633
660
/// ```
634
661
pub fn sort ( self , expr : Vec < Expr > ) -> Result < DataFrame > {
635
662
let plan = LogicalPlanBuilder :: from ( self . plan ) . sort ( expr) ?. build ( ) ?;
636
- Ok ( DataFrame :: new ( self . session_state , plan) )
663
+ Ok ( DataFrame {
664
+ session_state : self . session_state ,
665
+ plan,
666
+ } )
637
667
}
638
668
639
669
/// Join this `DataFrame` with another `DataFrame` using explicitly specified
@@ -687,7 +717,10 @@ impl DataFrame {
687
717
filter,
688
718
) ?
689
719
. build ( ) ?;
690
- Ok ( DataFrame :: new ( self . session_state , plan) )
720
+ Ok ( DataFrame {
721
+ session_state : self . session_state ,
722
+ plan,
723
+ } )
691
724
}
692
725
693
726
/// Join this `DataFrame` with another `DataFrame` using the specified
@@ -737,7 +770,10 @@ impl DataFrame {
737
770
let plan = LogicalPlanBuilder :: from ( self . plan )
738
771
. join_on ( right. plan , join_type, expr) ?
739
772
. build ( ) ?;
740
- Ok ( DataFrame :: new ( self . session_state , plan) )
773
+ Ok ( DataFrame {
774
+ session_state : self . session_state ,
775
+ plan,
776
+ } )
741
777
}
742
778
743
779
/// Repartition a DataFrame based on a logical partitioning scheme.
@@ -758,7 +794,10 @@ impl DataFrame {
758
794
let plan = LogicalPlanBuilder :: from ( self . plan )
759
795
. repartition ( partitioning_scheme) ?
760
796
. build ( ) ?;
761
- Ok ( DataFrame :: new ( self . session_state , plan) )
797
+ Ok ( DataFrame {
798
+ session_state : self . session_state ,
799
+ plan,
800
+ } )
762
801
}
763
802
764
803
/// Return the total number of rows in this `DataFrame`.
@@ -863,7 +902,7 @@ impl DataFrame {
863
902
864
903
/// Return a new [`TaskContext`] which would be used to execute this DataFrame
865
904
pub fn task_ctx ( & self ) -> TaskContext {
866
- TaskContext :: from ( & self . session_state )
905
+ TaskContext :: from ( self . session_state . as_ref ( ) )
867
906
}
868
907
869
908
/// Executes this DataFrame and returns a stream over a single partition
@@ -969,7 +1008,7 @@ impl DataFrame {
969
1008
970
1009
/// Returns both the [`LogicalPlan`] and [`SessionState`] that comprise this [`DataFrame`]
971
1010
pub fn into_parts ( self ) -> ( SessionState , LogicalPlan ) {
972
- ( self . session_state , self . plan )
1011
+ ( * self . session_state , self . plan )
973
1012
}
974
1013
975
1014
/// Return the [`LogicalPlan`] represented by this DataFrame without running
@@ -1023,7 +1062,10 @@ impl DataFrame {
1023
1062
let plan = LogicalPlanBuilder :: from ( self . plan )
1024
1063
. explain ( verbose, analyze) ?
1025
1064
. build ( ) ?;
1026
- Ok ( DataFrame :: new ( self . session_state , plan) )
1065
+ Ok ( DataFrame {
1066
+ session_state : self . session_state ,
1067
+ plan,
1068
+ } )
1027
1069
}
1028
1070
1029
1071
/// Return a `FunctionRegistry` used to plan udf's calls
@@ -1042,7 +1084,7 @@ impl DataFrame {
1042
1084
/// # }
1043
1085
/// ```
1044
1086
pub fn registry ( & self ) -> & dyn FunctionRegistry {
1045
- & self . session_state
1087
+ self . session_state . as_ref ( )
1046
1088
}
1047
1089
1048
1090
/// Calculate the intersection of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
@@ -1062,10 +1104,11 @@ impl DataFrame {
1062
1104
pub fn intersect ( self , dataframe : DataFrame ) -> Result < DataFrame > {
1063
1105
let left_plan = self . plan ;
1064
1106
let right_plan = dataframe. plan ;
1065
- Ok ( DataFrame :: new (
1066
- self . session_state ,
1067
- LogicalPlanBuilder :: intersect ( left_plan, right_plan, true ) ?,
1068
- ) )
1107
+ let plan = LogicalPlanBuilder :: intersect ( left_plan, right_plan, true ) ?;
1108
+ Ok ( DataFrame {
1109
+ session_state : self . session_state ,
1110
+ plan,
1111
+ } )
1069
1112
}
1070
1113
1071
1114
/// Calculate the exception of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
@@ -1085,11 +1128,11 @@ impl DataFrame {
1085
1128
pub fn except ( self , dataframe : DataFrame ) -> Result < DataFrame > {
1086
1129
let left_plan = self . plan ;
1087
1130
let right_plan = dataframe. plan ;
1088
-
1089
- Ok ( DataFrame :: new (
1090
- self . session_state ,
1091
- LogicalPlanBuilder :: except ( left_plan , right_plan , true ) ? ,
1092
- ) )
1131
+ let plan = LogicalPlanBuilder :: except ( left_plan , right_plan , true ) ? ;
1132
+ Ok ( DataFrame {
1133
+ session_state : self . session_state ,
1134
+ plan ,
1135
+ } )
1093
1136
}
1094
1137
1095
1138
/// Execute this `DataFrame` and write the results to `table_name`.
@@ -1114,7 +1157,13 @@ impl DataFrame {
1114
1157
write_options. overwrite ,
1115
1158
) ?
1116
1159
. build ( ) ?;
1117
- DataFrame :: new ( self . session_state , plan) . collect ( ) . await
1160
+
1161
+ DataFrame {
1162
+ session_state : self . session_state ,
1163
+ plan,
1164
+ }
1165
+ . collect ( )
1166
+ . await
1118
1167
}
1119
1168
1120
1169
/// Execute the `DataFrame` and write the results to CSV file(s).
@@ -1162,7 +1211,13 @@ impl DataFrame {
1162
1211
options. partition_by ,
1163
1212
) ?
1164
1213
. build ( ) ?;
1165
- DataFrame :: new ( self . session_state , plan) . collect ( ) . await
1214
+
1215
+ DataFrame {
1216
+ session_state : self . session_state ,
1217
+ plan,
1218
+ }
1219
+ . collect ( )
1220
+ . await
1166
1221
}
1167
1222
1168
1223
/// Execute the `DataFrame` and write the results to JSON file(s).
@@ -1211,7 +1266,13 @@ impl DataFrame {
1211
1266
options. partition_by ,
1212
1267
) ?
1213
1268
. build ( ) ?;
1214
- DataFrame :: new ( self . session_state , plan) . collect ( ) . await
1269
+
1270
+ DataFrame {
1271
+ session_state : self . session_state ,
1272
+ plan,
1273
+ }
1274
+ . collect ( )
1275
+ . await
1215
1276
}
1216
1277
1217
1278
/// Add an additional column to the DataFrame.
@@ -1258,7 +1319,10 @@ impl DataFrame {
1258
1319
1259
1320
let project_plan = LogicalPlanBuilder :: from ( plan) . project ( fields) ?. build ( ) ?;
1260
1321
1261
- Ok ( DataFrame :: new ( self . session_state , project_plan) )
1322
+ Ok ( DataFrame {
1323
+ session_state : self . session_state ,
1324
+ plan : project_plan,
1325
+ } )
1262
1326
}
1263
1327
1264
1328
/// Rename one column by applying a new projection. This is a no-op if the column to be
@@ -1322,7 +1386,10 @@ impl DataFrame {
1322
1386
let project_plan = LogicalPlanBuilder :: from ( self . plan )
1323
1387
. project ( projection) ?
1324
1388
. build ( ) ?;
1325
- Ok ( DataFrame :: new ( self . session_state , project_plan) )
1389
+ Ok ( DataFrame {
1390
+ session_state : self . session_state ,
1391
+ plan : project_plan,
1392
+ } )
1326
1393
}
1327
1394
1328
1395
/// Replace all parameters in logical plan with the specified
@@ -1384,7 +1451,10 @@ impl DataFrame {
1384
1451
/// ```
1385
1452
pub fn with_param_values ( self , query_values : impl Into < ParamValues > ) -> Result < Self > {
1386
1453
let plan = self . plan . with_param_values ( query_values) ?;
1387
- Ok ( Self :: new ( self . session_state , plan) )
1454
+ Ok ( DataFrame {
1455
+ session_state : self . session_state ,
1456
+ plan,
1457
+ } )
1388
1458
}
1389
1459
1390
1460
/// Cache DataFrame as a memory table.
@@ -1401,7 +1471,7 @@ impl DataFrame {
1401
1471
/// # }
1402
1472
/// ```
1403
1473
pub async fn cache ( self ) -> Result < DataFrame > {
1404
- let context = SessionContext :: new_with_state ( self . session_state . clone ( ) ) ;
1474
+ let context = SessionContext :: new_with_state ( ( * self . session_state ) . clone ( ) ) ;
1405
1475
// The schema is consistent with the output
1406
1476
let plan = self . clone ( ) . create_physical_plan ( ) . await ?;
1407
1477
let schema = plan. schema ( ) ;
0 commit comments