Skip to content

Commit 6d517fd

Browse files
alamb and jonahgao
authored
Improve SQL Planner docs (#14669)
* Improve SQL Planner docs * fix docs * Apply suggestions from code review Co-authored-by: Jonah Gao <[email protected]> * Restore builtin term * Add docs to `is_system_variable` * clarify type checking * fix rendering --------- Co-authored-by: Jonah Gao <[email protected]>
1 parent 78e8493 commit 6d517fd

File tree

4 files changed

+87
-45
lines changed

4 files changed

+87
-45
lines changed

datafusion/core/src/lib.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,9 @@
229229
//! 1. The query string is parsed to an Abstract Syntax Tree (AST)
230230
//! [`Statement`] using [sqlparser].
231231
//!
232-
//! 2. The AST is converted to a [`LogicalPlan`] and logical
233-
//! expressions [`Expr`]s to compute the desired result by the
234-
//! [`SqlToRel`] planner.
232+
//! 2. The AST is converted to a [`LogicalPlan`] and logical expressions
233+
//! [`Expr`]s to compute the desired result by [`SqlToRel`]. This phase
234+
//! also includes name and type resolution ("binding").
235235
//!
236236
//! [`Statement`]: https://docs.rs/sqlparser/latest/sqlparser/ast/enum.Statement.html
237237
//!

datafusion/expr/src/planner.rs

Lines changed: 60 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,18 @@ use sqlparser::ast;
2929

3030
use crate::{AggregateUDF, Expr, GetFieldAccess, ScalarUDF, TableSource, WindowUDF};
3131

32-
/// Provides the `SQL` query planner meta-data about tables and
33-
/// functions referenced in SQL statements, without a direct dependency on other
34-
/// DataFusion structures
32+
/// Provides the `SQL` query planner meta-data about tables and
33+
/// functions referenced in SQL statements, without a direct dependency on the
34+
/// `datafusion` Catalog structures such as [`TableProvider`]
35+
///
36+
/// [`TableProvider`]: https://docs.rs/datafusion/latest/datafusion/catalog/trait.TableProvider.html
3537
pub trait ContextProvider {
36-
/// Getter for a datasource
38+
/// Returns a table by reference, if it exists
3739
fn get_table_source(&self, name: TableReference) -> Result<Arc<dyn TableSource>>;
3840

41+
/// Return the type of a file based on its extension (e.g. `.parquet`)
42+
///
43+
/// This is used to plan `COPY` statements
3944
fn get_file_type(&self, _ext: &str) -> Result<Arc<dyn FileType>> {
4045
not_impl_err!("Registered file types are not supported")
4146
}
@@ -49,11 +54,20 @@ pub trait ContextProvider {
4954
not_impl_err!("Table Functions are not supported")
5055
}
5156

52-
/// This provides a worktable (an intermediate table that is used to store the results of a CTE during execution)
53-
/// We don't directly implement this in the logical plan's ['SqlToRel`]
54-
/// because the sql code needs access to a table that contains execution-related types that can't be a direct dependency
55-
/// of the sql crate (namely, the `CteWorktable`).
57+
/// Provides an intermediate table that is used to store the results of a CTE during execution
58+
///
59+
/// CTE stands for "Common Table Expression"
60+
///
61+
/// # Notes
62+
/// We don't directly implement this in [`SqlToRel`] as implementing this function
63+
/// often requires access to a table that contains
64+
/// execution-related types that can't be a direct dependency
65+
/// of the sql crate (for example [`CteWorkTable`]).
66+
///
5667
/// The [`ContextProvider`] provides a way to "hide" this dependency.
68+
///
69+
/// [`SqlToRel`]: https://docs.rs/datafusion/latest/datafusion/sql/planner/struct.SqlToRel.html
70+
/// [`CteWorkTable`]: https://docs.rs/datafusion/latest/datafusion/datasource/cte_worktable/struct.CteWorkTable.html
5771
fn create_cte_work_table(
5872
&self,
5973
_name: &str,
@@ -62,39 +76,44 @@ pub trait ContextProvider {
6276
not_impl_err!("Recursive CTE is not implemented")
6377
}
6478

65-
/// Getter for expr planners
79+
/// Return [`ExprPlanner`] extensions for planning expressions
6680
fn get_expr_planners(&self) -> &[Arc<dyn ExprPlanner>] {
6781
&[]
6882
}
6983

70-
/// Getter for the data type planner
84+
/// Return [`TypePlanner`] extensions for planning data types
7185
fn get_type_planner(&self) -> Option<Arc<dyn TypePlanner>> {
7286
None
7387
}
7488

75-
/// Getter for a UDF description
89+
/// Return the scalar function with a given name, if any
7690
fn get_function_meta(&self, name: &str) -> Option<Arc<ScalarUDF>>;
77-
/// Getter for a UDAF description
91+
92+
/// Return the aggregate function with a given name, if any
7893
fn get_aggregate_meta(&self, name: &str) -> Option<Arc<AggregateUDF>>;
79-
/// Getter for a UDWF
94+
95+
/// Return the window function with a given name, if any
8096
fn get_window_meta(&self, name: &str) -> Option<Arc<WindowUDF>>;
81-
/// Getter for system/user-defined variable type
97+
98+
/// Return the system/user-defined variable type, if any
99+
///
100+
/// A user defined variable is typically accessed via `@var_name`
82101
fn get_variable_type(&self, variable_names: &[String]) -> Option<DataType>;
83102

84-
/// Get configuration options
103+
/// Return overall configuration options
85104
fn options(&self) -> &ConfigOptions;
86105

87-
/// Get all user defined scalar function names
106+
/// Return all scalar function names
88107
fn udf_names(&self) -> Vec<String>;
89108

90-
/// Get all user defined aggregate function names
109+
/// Return all aggregate function names
91110
fn udaf_names(&self) -> Vec<String>;
92111

93-
/// Get all user defined window function names
112+
/// Return all window function names
94113
fn udwf_names(&self) -> Vec<String>;
95114
}
96115

97-
/// This trait allows users to customize the behavior of the SQL planner
116+
/// Customize planning of SQL AST expressions to [`Expr`]s
98117
pub trait ExprPlanner: Debug + Send + Sync {
99118
/// Plan the binary operation between two expressions, returns original
100119
/// BinaryExpr if not possible
@@ -106,9 +125,9 @@ pub trait ExprPlanner: Debug + Send + Sync {
106125
Ok(PlannerResult::Original(expr))
107126
}
108127

109-
/// Plan the field access expression
128+
/// Plan the field access expression, such as `foo.bar`
110129
///
111-
/// returns original FieldAccessExpr if not possible
130+
/// Returns the original [`RawFieldAccessExpr`] if not possible
112131
fn plan_field_access(
113132
&self,
114133
expr: RawFieldAccessExpr,
@@ -117,7 +136,7 @@ pub trait ExprPlanner: Debug + Send + Sync {
117136
Ok(PlannerResult::Original(expr))
118137
}
119138

120-
/// Plan the array literal, returns OriginalArray if not possible
139+
/// Plan an array literal, such as `[1, 2, 3]`
121140
///
122141
/// Returns the original expression arguments if not possible
123142
fn plan_array_literal(
@@ -128,13 +147,14 @@ pub trait ExprPlanner: Debug + Send + Sync {
128147
Ok(PlannerResult::Original(exprs))
129148
}
130149

131-
// Plan the POSITION expression, e.g., POSITION(<expr> in <expr>)
132-
// returns origin expression arguments if not possible
150+
/// Plan a `POSITION` expression, such as `POSITION(<expr> in <expr>)`
151+
///
152+
/// Returns the original expression arguments if not possible
133153
fn plan_position(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
134154
Ok(PlannerResult::Original(args))
135155
}
136156

137-
/// Plan the dictionary literal `{ key: value, ...}`
157+
/// Plan a dictionary literal, such as `{ key: value, ...}`
138158
///
139159
/// Returns the original expression arguments if not possible
140160
fn plan_dictionary_literal(
@@ -145,27 +165,26 @@ pub trait ExprPlanner: Debug + Send + Sync {
145165
Ok(PlannerResult::Original(expr))
146166
}
147167

148-
/// Plan an extract expression, e.g., `EXTRACT(month FROM foo)`
168+
/// Plan an extract expression, such as `EXTRACT(month FROM foo)`
149169
///
150170
/// Returns the original expression arguments if not possible
151171
fn plan_extract(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
152172
Ok(PlannerResult::Original(args))
153173
}
154174

155-
/// Plan an substring expression, e.g., `SUBSTRING(<expr> [FROM <expr>] [FOR <expr>])`
175+
/// Plan a substring expression, such as `SUBSTRING(<expr> [FROM <expr>] [FOR <expr>])`
156176
///
157177
/// Returns the original expression arguments if not possible
158178
fn plan_substring(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
159179
Ok(PlannerResult::Original(args))
160180
}
161181

162-
/// Plans a struct `struct(expression1[, ..., expression_n])`
163-
/// literal based on the given input expressions.
164-
/// This function takes a vector of expressions and a boolean flag indicating whether
165-
/// the struct uses the optional name
182+
/// Plans a struct literal, such as `{'field1' : expr1, 'field2' : expr2, ...}`
183+
///
184+
/// This function takes a vector of expressions and a boolean flag
185+
/// indicating whether the struct uses the optional name
166186
///
167-
/// Returns a `PlannerResult` containing either the planned struct expressions or the original
168-
/// input expressions if planning is not possible.
187+
/// Returns the original input expressions if planning is not possible.
169188
fn plan_struct_literal(
170189
&self,
171190
args: Vec<Expr>,
@@ -174,26 +193,26 @@ pub trait ExprPlanner: Debug + Send + Sync {
174193
Ok(PlannerResult::Original(args))
175194
}
176195

177-
/// Plans an overlay expression eg `overlay(str PLACING substr FROM pos [FOR count])`
196+
/// Plans an overlay expression, such as `overlay(str PLACING substr FROM pos [FOR count])`
178197
///
179198
/// Returns the original expression arguments if not possible
180199
fn plan_overlay(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
181200
Ok(PlannerResult::Original(args))
182201
}
183202

184-
/// Plan a make_map expression, e.g., `make_map(key1, value1, key2, value2, ...)`
203+
/// Plans a `make_map` expression, such as `make_map(key1, value1, key2, value2, ...)`
185204
///
186205
/// Returns the original expression arguments if not possible
187206
fn plan_make_map(&self, args: Vec<Expr>) -> Result<PlannerResult<Vec<Expr>>> {
188207
Ok(PlannerResult::Original(args))
189208
}
190209

191-
/// Plans compound identifier eg `db.schema.table` for non-empty nested names
210+
/// Plans a compound identifier, such as `db.schema.table`, for non-empty nested names
192211
///
193-
/// Note:
212+
/// # Note
194213
/// Currently compound identifier for outer query schema is not supported.
195214
///
196-
/// Returns planned expression
215+
/// Returns original expression if not possible
197216
fn plan_compound_identifier(
198217
&self,
199218
_field: &Field,
@@ -205,7 +224,7 @@ pub trait ExprPlanner: Debug + Send + Sync {
205224
)
206225
}
207226

208-
/// Plans `ANY` expression, e.g., `expr = ANY(array_expr)`
227+
/// Plans an `ANY` expression, such as `expr = ANY(array_expr)`
209228
///
210229
/// Returns the original binary expression if not possible
211230
fn plan_any(&self, expr: RawBinaryExpr) -> Result<PlannerResult<RawBinaryExpr>> {
@@ -256,9 +275,9 @@ pub enum PlannerResult<T> {
256275
Original(T),
257276
}
258277

259-
/// This trait allows users to customize the behavior of the data type planning
278+
/// Customize planning SQL types to DataFusion (Arrow) types.
260279
pub trait TypePlanner: Debug + Send + Sync {
261-
/// Plan SQL type to DataFusion data type
280+
/// Plan SQL [`ast::DataType`] to DataFusion [`DataType`]
262281
///
263282
/// Returns None if not possible
264283
fn plan_type(&self, _sql_type: &ast::DataType) -> Result<Option<DataType>> {

datafusion/expr/src/var_provider.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,12 @@ pub trait VarProvider: std::fmt::Debug {
3838
fn get_type(&self, var_names: &[String]) -> Option<DataType>;
3939
}
4040

41+
/// Returns true if the first of the specified variable names starts with
42+
/// `@@`, which marks a "system" variable such as `@@version`
43+
///
44+
/// See [`SessionContext::register_variable`] for more details
45+
///
46+
/// [`SessionContext::register_variable`]: https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.register_variable
4147
pub fn is_system_variables(variable_names: &[String]) -> bool {
4248
!variable_names.is_empty() && variable_names[0].get(0..2) == Some("@@")
4349
}

datafusion/sql/src/planner.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,24 @@ impl PlannerContext {
224224
}
225225
}
226226

227-
/// SQL query planner
227+
/// SQL query planner and binder
228+
///
229+
/// This struct is used to convert a SQL AST into a [`LogicalPlan`].
230+
///
231+
/// You can control the behavior of the planner by providing [`ParserOptions`].
232+
///
233+
/// It performs the following tasks:
234+
///
235+
/// 1. Name and type resolution (called "binding" in other systems). This
236+
/// phase looks up table and column names using the [`ContextProvider`].
237+
/// 2. Mechanical translation of the AST into a [`LogicalPlan`].
238+
///
239+
/// It does not perform type coercion, or perform optimization, which are done
240+
/// by subsequent passes.
241+
///
242+
/// Key interfaces are:
243+
/// * [`Self::sql_statement_to_plan`]: Convert a statement (e.g. `SELECT ...`) into a [`LogicalPlan`]
244+
/// * [`Self::sql_to_expr`]: Convert an expression (e.g. `1 + 2`) into an [`Expr`]
228245
pub struct SqlToRel<'a, S: ContextProvider> {
229246
pub(crate) context_provider: &'a S,
230247
pub(crate) options: ParserOptions,

0 commit comments

Comments
 (0)