Skip to content

Commit d4eb72c

Browse files
authored
Reduce DataFrame stack size and fix large futures warnings (#10123)
1 parent 1f71d79 commit d4eb72c

File tree

3 files changed

+158
-70
lines changed

3 files changed

+158
-70
lines changed

datafusion/core/src/dataframe/mod.rs

Lines changed: 122 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,8 @@ impl Default for DataFrameWriteOptions {
156156
/// ```
157157
#[derive(Debug, Clone)]
158158
pub struct DataFrame {
159-
session_state: SessionState,
159+
// Box the (large) SessionState to reduce the size of DataFrame on the stack
160+
session_state: Box<SessionState>,
160161
plan: LogicalPlan,
161162
}
162163

@@ -168,7 +169,7 @@ impl DataFrame {
168169
/// `DataFrame` from an existing datasource.
169170
pub fn new(session_state: SessionState, plan: LogicalPlan) -> Self {
170171
Self {
171-
session_state,
172+
session_state: Box::new(session_state),
172173
plan,
173174
}
174175
}
@@ -230,7 +231,10 @@ impl DataFrame {
230231
};
231232
let project_plan = LogicalPlanBuilder::from(plan).project(expr_list)?.build()?;
232233

233-
Ok(DataFrame::new(self.session_state, project_plan))
234+
Ok(DataFrame {
235+
session_state: self.session_state,
236+
plan: project_plan,
237+
})
234238
}
235239

236240
/// Expand each list element of a column to multiple rows.
@@ -269,7 +273,10 @@ impl DataFrame {
269273
let plan = LogicalPlanBuilder::from(self.plan)
270274
.unnest_column_with_options(column, options)?
271275
.build()?;
272-
Ok(DataFrame::new(self.session_state, plan))
276+
Ok(DataFrame {
277+
session_state: self.session_state,
278+
plan,
279+
})
273280
}
274281

275282
/// Return a DataFrame with only rows for which `predicate` evaluates to
@@ -294,7 +301,10 @@ impl DataFrame {
294301
let plan = LogicalPlanBuilder::from(self.plan)
295302
.filter(predicate)?
296303
.build()?;
297-
Ok(DataFrame::new(self.session_state, plan))
304+
Ok(DataFrame {
305+
session_state: self.session_state,
306+
plan,
307+
})
298308
}
299309

300310
/// Return a new `DataFrame` that aggregates the rows of the current
@@ -325,7 +335,10 @@ impl DataFrame {
325335
let plan = LogicalPlanBuilder::from(self.plan)
326336
.aggregate(group_expr, aggr_expr)?
327337
.build()?;
328-
Ok(DataFrame::new(self.session_state, plan))
338+
Ok(DataFrame {
339+
session_state: self.session_state,
340+
plan,
341+
})
329342
}
330343

331344
/// Return a new DataFrame that adds the result of evaluating one or more
@@ -334,7 +347,10 @@ impl DataFrame {
334347
let plan = LogicalPlanBuilder::from(self.plan)
335348
.window(window_exprs)?
336349
.build()?;
337-
Ok(DataFrame::new(self.session_state, plan))
350+
Ok(DataFrame {
351+
session_state: self.session_state,
352+
plan,
353+
})
338354
}
339355

340356
/// Returns a new `DataFrame` with a limited number of rows.
@@ -359,7 +375,10 @@ impl DataFrame {
359375
let plan = LogicalPlanBuilder::from(self.plan)
360376
.limit(skip, fetch)?
361377
.build()?;
362-
Ok(DataFrame::new(self.session_state, plan))
378+
Ok(DataFrame {
379+
session_state: self.session_state,
380+
plan,
381+
})
363382
}
364383

365384
/// Calculate the union of two [`DataFrame`]s, preserving duplicate rows.
@@ -383,7 +402,10 @@ impl DataFrame {
383402
let plan = LogicalPlanBuilder::from(self.plan)
384403
.union(dataframe.plan)?
385404
.build()?;
386-
Ok(DataFrame::new(self.session_state, plan))
405+
Ok(DataFrame {
406+
session_state: self.session_state,
407+
plan,
408+
})
387409
}
388410

389411
/// Calculate the distinct union of two [`DataFrame`]s.
@@ -405,12 +427,13 @@ impl DataFrame {
405427
/// # }
406428
/// ```
407429
pub fn union_distinct(self, dataframe: DataFrame) -> Result<DataFrame> {
408-
Ok(DataFrame::new(
409-
self.session_state,
410-
LogicalPlanBuilder::from(self.plan)
411-
.union_distinct(dataframe.plan)?
412-
.build()?,
413-
))
430+
let plan = LogicalPlanBuilder::from(self.plan)
431+
.union_distinct(dataframe.plan)?
432+
.build()?;
433+
Ok(DataFrame {
434+
session_state: self.session_state,
435+
plan,
436+
})
414437
}
415438

416439
/// Return a new `DataFrame` with all duplicated rows removed.
@@ -428,10 +451,11 @@ impl DataFrame {
428451
/// # }
429452
/// ```
430453
pub fn distinct(self) -> Result<DataFrame> {
431-
Ok(DataFrame::new(
432-
self.session_state,
433-
LogicalPlanBuilder::from(self.plan).distinct()?.build()?,
434-
))
454+
let plan = LogicalPlanBuilder::from(self.plan).distinct()?.build()?;
455+
Ok(DataFrame {
456+
session_state: self.session_state,
457+
plan,
458+
})
435459
}
436460

437461
/// Return a new `DataFrame` that has statistics for a DataFrame.
@@ -599,15 +623,18 @@ impl DataFrame {
599623
describe_record_batch.schema(),
600624
vec![vec![describe_record_batch]],
601625
)?;
602-
Ok(DataFrame::new(
603-
self.session_state,
604-
LogicalPlanBuilder::scan(
605-
UNNAMED_TABLE,
606-
provider_as_source(Arc::new(provider)),
607-
None,
608-
)?
609-
.build()?,
610-
))
626+
627+
let plan = LogicalPlanBuilder::scan(
628+
UNNAMED_TABLE,
629+
provider_as_source(Arc::new(provider)),
630+
None,
631+
)?
632+
.build()?;
633+
634+
Ok(DataFrame {
635+
session_state: self.session_state,
636+
plan,
637+
})
611638
}
612639

613640
/// Sort the DataFrame by the specified sorting expressions.
@@ -633,7 +660,10 @@ impl DataFrame {
633660
/// ```
634661
pub fn sort(self, expr: Vec<Expr>) -> Result<DataFrame> {
635662
let plan = LogicalPlanBuilder::from(self.plan).sort(expr)?.build()?;
636-
Ok(DataFrame::new(self.session_state, plan))
663+
Ok(DataFrame {
664+
session_state: self.session_state,
665+
plan,
666+
})
637667
}
638668

639669
/// Join this `DataFrame` with another `DataFrame` using explicitly specified
@@ -687,7 +717,10 @@ impl DataFrame {
687717
filter,
688718
)?
689719
.build()?;
690-
Ok(DataFrame::new(self.session_state, plan))
720+
Ok(DataFrame {
721+
session_state: self.session_state,
722+
plan,
723+
})
691724
}
692725

693726
/// Join this `DataFrame` with another `DataFrame` using the specified
@@ -737,7 +770,10 @@ impl DataFrame {
737770
let plan = LogicalPlanBuilder::from(self.plan)
738771
.join_on(right.plan, join_type, expr)?
739772
.build()?;
740-
Ok(DataFrame::new(self.session_state, plan))
773+
Ok(DataFrame {
774+
session_state: self.session_state,
775+
plan,
776+
})
741777
}
742778

743779
/// Repartition a DataFrame based on a logical partitioning scheme.
@@ -758,7 +794,10 @@ impl DataFrame {
758794
let plan = LogicalPlanBuilder::from(self.plan)
759795
.repartition(partitioning_scheme)?
760796
.build()?;
761-
Ok(DataFrame::new(self.session_state, plan))
797+
Ok(DataFrame {
798+
session_state: self.session_state,
799+
plan,
800+
})
762801
}
763802

764803
/// Return the total number of rows in this `DataFrame`.
@@ -863,7 +902,7 @@ impl DataFrame {
863902

864903
/// Return a new [`TaskContext`] which would be used to execute this DataFrame
865904
pub fn task_ctx(&self) -> TaskContext {
866-
TaskContext::from(&self.session_state)
905+
TaskContext::from(self.session_state.as_ref())
867906
}
868907

869908
/// Executes this DataFrame and returns a stream over a single partition
@@ -969,7 +1008,7 @@ impl DataFrame {
9691008

9701009
/// Returns both the [`SessionState`] and the [`LogicalPlan`] that comprise this [`DataFrame`], in that order
9711010
pub fn into_parts(self) -> (SessionState, LogicalPlan) {
972-
(self.session_state, self.plan)
1011+
(*self.session_state, self.plan)
9731012
}
9741013

9751014
/// Return the [`LogicalPlan`] represented by this DataFrame without running
@@ -1023,7 +1062,10 @@ impl DataFrame {
10231062
let plan = LogicalPlanBuilder::from(self.plan)
10241063
.explain(verbose, analyze)?
10251064
.build()?;
1026-
Ok(DataFrame::new(self.session_state, plan))
1065+
Ok(DataFrame {
1066+
session_state: self.session_state,
1067+
plan,
1068+
})
10271069
}
10281070

10291071
/// Return a `FunctionRegistry` used to plan UDF calls
@@ -1042,7 +1084,7 @@ impl DataFrame {
10421084
/// # }
10431085
/// ```
10441086
pub fn registry(&self) -> &dyn FunctionRegistry {
1045-
&self.session_state
1087+
self.session_state.as_ref()
10461088
}
10471089

10481090
/// Calculate the intersection of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
@@ -1062,10 +1104,11 @@ impl DataFrame {
10621104
pub fn intersect(self, dataframe: DataFrame) -> Result<DataFrame> {
10631105
let left_plan = self.plan;
10641106
let right_plan = dataframe.plan;
1065-
Ok(DataFrame::new(
1066-
self.session_state,
1067-
LogicalPlanBuilder::intersect(left_plan, right_plan, true)?,
1068-
))
1107+
let plan = LogicalPlanBuilder::intersect(left_plan, right_plan, true)?;
1108+
Ok(DataFrame {
1109+
session_state: self.session_state,
1110+
plan,
1111+
})
10691112
}
10701113

10711114
/// Calculate the difference (SQL `EXCEPT`) of two [`DataFrame`]s. The two [`DataFrame`]s must have exactly the same schema
@@ -1085,11 +1128,11 @@ impl DataFrame {
10851128
pub fn except(self, dataframe: DataFrame) -> Result<DataFrame> {
10861129
let left_plan = self.plan;
10871130
let right_plan = dataframe.plan;
1088-
1089-
Ok(DataFrame::new(
1090-
self.session_state,
1091-
LogicalPlanBuilder::except(left_plan, right_plan, true)?,
1092-
))
1131+
let plan = LogicalPlanBuilder::except(left_plan, right_plan, true)?;
1132+
Ok(DataFrame {
1133+
session_state: self.session_state,
1134+
plan,
1135+
})
10931136
}
10941137

10951138
/// Execute this `DataFrame` and write the results to `table_name`.
@@ -1114,7 +1157,13 @@ impl DataFrame {
11141157
write_options.overwrite,
11151158
)?
11161159
.build()?;
1117-
DataFrame::new(self.session_state, plan).collect().await
1160+
1161+
DataFrame {
1162+
session_state: self.session_state,
1163+
plan,
1164+
}
1165+
.collect()
1166+
.await
11181167
}
11191168

11201169
/// Execute the `DataFrame` and write the results to CSV file(s).
@@ -1162,7 +1211,13 @@ impl DataFrame {
11621211
options.partition_by,
11631212
)?
11641213
.build()?;
1165-
DataFrame::new(self.session_state, plan).collect().await
1214+
1215+
DataFrame {
1216+
session_state: self.session_state,
1217+
plan,
1218+
}
1219+
.collect()
1220+
.await
11661221
}
11671222

11681223
/// Execute the `DataFrame` and write the results to JSON file(s).
@@ -1211,7 +1266,13 @@ impl DataFrame {
12111266
options.partition_by,
12121267
)?
12131268
.build()?;
1214-
DataFrame::new(self.session_state, plan).collect().await
1269+
1270+
DataFrame {
1271+
session_state: self.session_state,
1272+
plan,
1273+
}
1274+
.collect()
1275+
.await
12151276
}
12161277

12171278
/// Add an additional column to the DataFrame.
@@ -1258,7 +1319,10 @@ impl DataFrame {
12581319

12591320
let project_plan = LogicalPlanBuilder::from(plan).project(fields)?.build()?;
12601321

1261-
Ok(DataFrame::new(self.session_state, project_plan))
1322+
Ok(DataFrame {
1323+
session_state: self.session_state,
1324+
plan: project_plan,
1325+
})
12621326
}
12631327

12641328
/// Rename one column by applying a new projection. This is a no-op if the column to be
@@ -1322,7 +1386,10 @@ impl DataFrame {
13221386
let project_plan = LogicalPlanBuilder::from(self.plan)
13231387
.project(projection)?
13241388
.build()?;
1325-
Ok(DataFrame::new(self.session_state, project_plan))
1389+
Ok(DataFrame {
1390+
session_state: self.session_state,
1391+
plan: project_plan,
1392+
})
13261393
}
13271394

13281395
/// Replace all parameters in logical plan with the specified
@@ -1384,7 +1451,10 @@ impl DataFrame {
13841451
/// ```
13851452
pub fn with_param_values(self, query_values: impl Into<ParamValues>) -> Result<Self> {
13861453
let plan = self.plan.with_param_values(query_values)?;
1387-
Ok(Self::new(self.session_state, plan))
1454+
Ok(DataFrame {
1455+
session_state: self.session_state,
1456+
plan,
1457+
})
13881458
}
13891459

13901460
/// Cache DataFrame as a memory table.
@@ -1401,7 +1471,7 @@ impl DataFrame {
14011471
/// # }
14021472
/// ```
14031473
pub async fn cache(self) -> Result<DataFrame> {
1404-
let context = SessionContext::new_with_state(self.session_state.clone());
1474+
let context = SessionContext::new_with_state((*self.session_state).clone());
14051475
// The schema is consistent with the output
14061476
let plan = self.clone().create_physical_plan().await?;
14071477
let schema = plan.schema();

datafusion/core/src/dataframe/parquet.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,12 @@ impl DataFrame {
6868
options.partition_by,
6969
)?
7070
.build()?;
71-
DataFrame::new(self.session_state, plan).collect().await
71+
DataFrame {
72+
session_state: self.session_state,
73+
plan,
74+
}
75+
.collect()
76+
.await
7277
}
7378
}
7479

0 commit comments

Comments
 (0)