Skip to content

Commit d452d51

Browse files
xinlifoobar and alamb authored
Add parser option enable_options_value_normalization (#11330)
* draft option enable_options_value_normalization
* Add unit tests
* Fix ci
* Fix bad merge
* Update configs.md
* Fix ci 2
* Fix doc gen
* Fix comments
* Fix ut
* fix format
* fix fmt

---------

Co-authored-by: Andrew Lamb <[email protected]>
1 parent 6fd57b2 commit d452d51

File tree

12 files changed

+191
-91
lines changed

12 files changed

+191
-91
lines changed

datafusion/common/src/config.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,9 @@ config_namespace! {
210210
/// When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted)
211211
pub enable_ident_normalization: bool, default = true
212212

213+
/// When set to true, SQL parser will normalize options value (convert value to lowercase)
214+
pub enable_options_value_normalization: bool, default = true
215+
213216
/// Configure the SQL dialect used by DataFusion's parser; supported values include: Generic,
214217
/// MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi.
215218
pub dialect: String, default = "generic".to_string()

datafusion/core/src/execution/session_state.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,8 @@ impl SessionState {
512512
ParserOptions {
513513
parse_float_as_decimal: sql_parser_options.parse_float_as_decimal,
514514
enable_ident_normalization: sql_parser_options.enable_ident_normalization,
515+
enable_options_value_normalization: sql_parser_options
516+
.enable_options_value_normalization,
515517
support_varchar_with_length: sql_parser_options.support_varchar_with_length,
516518
}
517519
}

datafusion/sql/src/cte.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
3838
// Process CTEs from top to bottom
3939
for cte in with.cte_tables {
4040
// A `WITH` block can't use the same name more than once
41-
let cte_name = self.normalizer.normalize(cte.alias.name.clone());
41+
let cte_name = self.ident_normalizer.normalize(cte.alias.name.clone());
4242
if planner_context.contains_cte(&cte_name) {
4343
return plan_err!(
4444
"WITH query name {cte_name:?} specified more than once"

datafusion/sql/src/expr/identifier.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
5050
// interpret names with '.' as if they were
5151
// compound identifiers, but this is not a compound
5252
// identifier. (e.g. it is "foo.bar" not foo.bar)
53-
let normalize_ident = self.normalizer.normalize(id);
53+
let normalize_ident = self.ident_normalizer.normalize(id);
5454

5555
// Check for qualified field with unqualified name
5656
if let Ok((qualifier, _)) =
@@ -96,7 +96,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
9696
if ids[0].value.starts_with('@') {
9797
let var_names: Vec<_> = ids
9898
.into_iter()
99-
.map(|id| self.normalizer.normalize(id))
99+
.map(|id| self.ident_normalizer.normalize(id))
100100
.collect();
101101
let ty = self
102102
.context_provider
@@ -110,7 +110,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
110110
} else {
111111
let ids = ids
112112
.into_iter()
113-
.map(|id| self.normalizer.normalize(id))
113+
.map(|id| self.ident_normalizer.normalize(id))
114114
.collect::<Vec<_>>();
115115

116116
// Currently not supporting more than one nested level

datafusion/sql/src/planner.rs

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,10 @@ use arrow_schema::*;
2424
use datafusion_common::{
2525
field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError,
2626
};
27-
use sqlparser::ast::TimezoneInfo;
2827
use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo};
2928
use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption};
3029
use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias};
30+
use sqlparser::ast::{TimezoneInfo, Value};
3131

3232
use datafusion_common::TableReference;
3333
use datafusion_common::{
@@ -38,8 +38,7 @@ use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder};
3838
use datafusion_expr::utils::find_column_exprs;
3939
use datafusion_expr::{col, Expr};
4040

41-
use crate::utils::make_decimal_type;
42-
41+
use crate::utils::{make_decimal_type, value_to_string};
4342
pub use datafusion_expr::planner::ContextProvider;
4443

4544
/// SQL parser options
@@ -48,6 +47,7 @@ pub struct ParserOptions {
4847
pub parse_float_as_decimal: bool,
4948
pub enable_ident_normalization: bool,
5049
pub support_varchar_with_length: bool,
50+
pub enable_options_value_normalization: bool,
5151
}
5252

5353
impl Default for ParserOptions {
@@ -56,6 +56,7 @@ impl Default for ParserOptions {
5656
parse_float_as_decimal: false,
5757
enable_ident_normalization: true,
5858
support_varchar_with_length: true,
59+
enable_options_value_normalization: true,
5960
}
6061
}
6162
}
@@ -86,6 +87,32 @@ impl IdentNormalizer {
8687
}
8788
}
8889

90+
/// Value Normalizer
91+
#[derive(Debug)]
92+
pub struct ValueNormalizer {
93+
normalize: bool,
94+
}
95+
96+
impl Default for ValueNormalizer {
97+
fn default() -> Self {
98+
Self { normalize: true }
99+
}
100+
}
101+
102+
impl ValueNormalizer {
103+
pub fn new(normalize: bool) -> Self {
104+
Self { normalize }
105+
}
106+
107+
pub fn normalize(&self, value: Value) -> Option<String> {
108+
match (value_to_string(&value), self.normalize) {
109+
(Some(s), true) => Some(s.to_ascii_lowercase()),
110+
(Some(s), false) => Some(s),
111+
(None, _) => None,
112+
}
113+
}
114+
}
115+
89116
/// Struct to store the states used by the Planner. The Planner will leverage the states to resolve
90117
/// CTEs, Views, subqueries and PREPARE statements. The states include
91118
/// Common Table Expression (CTE) provided with WITH clause and
@@ -184,7 +211,8 @@ impl PlannerContext {
184211
pub struct SqlToRel<'a, S: ContextProvider> {
185212
pub(crate) context_provider: &'a S,
186213
pub(crate) options: ParserOptions,
187-
pub(crate) normalizer: IdentNormalizer,
214+
pub(crate) ident_normalizer: IdentNormalizer,
215+
pub(crate) value_normalizer: ValueNormalizer,
188216
}
189217

190218
impl<'a, S: ContextProvider> SqlToRel<'a, S> {
@@ -195,12 +223,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
195223

196224
/// Create a new query planner
197225
pub fn new_with_options(context_provider: &'a S, options: ParserOptions) -> Self {
198-
let normalize = options.enable_ident_normalization;
226+
let ident_normalize = options.enable_ident_normalization;
227+
let options_value_normalize = options.enable_options_value_normalization;
199228

200229
SqlToRel {
201230
context_provider,
202231
options,
203-
normalizer: IdentNormalizer::new(normalize),
232+
ident_normalizer: IdentNormalizer::new(ident_normalize),
233+
value_normalizer: ValueNormalizer::new(options_value_normalize),
204234
}
205235
}
206236

@@ -214,7 +244,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
214244
.iter()
215245
.any(|x| x.option == ColumnOption::NotNull);
216246
fields.push(Field::new(
217-
self.normalizer.normalize(column.name),
247+
self.ident_normalizer.normalize(column.name),
218248
data_type,
219249
!not_nullable,
220250
));
@@ -252,8 +282,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
252282
let default_expr = self
253283
.sql_to_expr(default_sql_expr.clone(), &empty_schema, planner_context)
254284
.map_err(error_desc)?;
255-
column_defaults
256-
.push((self.normalizer.normalize(column.name.clone()), default_expr));
285+
column_defaults.push((
286+
self.ident_normalizer.normalize(column.name.clone()),
287+
default_expr,
288+
));
257289
}
258290
}
259291
Ok(column_defaults)
@@ -268,7 +300,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
268300
let plan = self.apply_expr_alias(plan, alias.columns)?;
269301

270302
LogicalPlanBuilder::from(plan)
271-
.alias(TableReference::bare(self.normalizer.normalize(alias.name)))?
303+
.alias(TableReference::bare(
304+
self.ident_normalizer.normalize(alias.name),
305+
))?
272306
.build()
273307
}
274308

@@ -289,7 +323,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
289323
let fields = plan.schema().fields().clone();
290324
LogicalPlanBuilder::from(plan)
291325
.project(fields.iter().zip(idents.into_iter()).map(|(field, ident)| {
292-
col(field.name()).alias(self.normalizer.normalize(ident))
326+
col(field.name()).alias(self.ident_normalizer.normalize(ident))
293327
}))?
294328
.build()
295329
}
@@ -415,7 +449,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
415449
None => Ident::new(format!("c{idx}"))
416450
};
417451
Ok(Arc::new(Field::new(
418-
self.normalizer.normalize(field_name),
452+
self.ident_normalizer.normalize(field_name),
419453
data_type,
420454
true,
421455
)))

datafusion/sql/src/relation/join.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
115115
JoinConstraint::Using(idents) => {
116116
let keys: Vec<Column> = idents
117117
.into_iter()
118-
.map(|x| Column::from_name(self.normalizer.normalize(x)))
118+
.map(|x| Column::from_name(self.ident_normalizer.normalize(x)))
119119
.collect();
120120
LogicalPlanBuilder::from(left)
121121
.join_using(right, join_type, keys)?

datafusion/sql/src/select.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -576,7 +576,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
576576
&[&[plan.schema()]],
577577
&plan.using_columns()?,
578578
)?;
579-
let name = self.normalizer.normalize(alias);
579+
let name = self.ident_normalizer.normalize(alias);
580580
// avoiding adding an alias if the column name is the same.
581581
let expr = match &col {
582582
Expr::Column(column) if column.name.eq(&name) => col,

datafusion/sql/src/statement.rs

Lines changed: 37 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -66,30 +66,6 @@ fn ident_to_string(ident: &Ident) -> String {
6666
normalize_ident(ident.to_owned())
6767
}
6868

69-
/// Converts a SQL literal `Value` into a plain `String`, when supported.
///
/// Single-quoted and dollar-quoted strings yield `s.to_string()` of their
/// payload; numbers and booleans yield `to_string()` of the whole value.
/// Every other variant listed in the match (the remaining string-literal
/// flavours, `Null` and placeholders) yields `None`.
fn value_to_string(value: &Value) -> Option<String> {
70-
match value {
71-
Value::SingleQuotedString(s) => Some(s.to_string()),
72-
Value::DollarQuotedString(s) => Some(s.to_string()),
73-
Value::Number(_, _) | Value::Boolean(_) => Some(value.to_string()),
74-
Value::DoubleQuotedString(_)
75-
| Value::EscapedStringLiteral(_)
76-
| Value::NationalStringLiteral(_)
77-
| Value::SingleQuotedByteStringLiteral(_)
78-
| Value::DoubleQuotedByteStringLiteral(_)
79-
| Value::TripleSingleQuotedString(_)
80-
| Value::TripleDoubleQuotedString(_)
81-
| Value::TripleSingleQuotedByteStringLiteral(_)
82-
| Value::TripleDoubleQuotedByteStringLiteral(_)
83-
| Value::SingleQuotedRawStringLiteral(_)
84-
| Value::DoubleQuotedRawStringLiteral(_)
85-
| Value::TripleSingleQuotedRawStringLiteral(_)
86-
| Value::TripleDoubleQuotedRawStringLiteral(_)
87-
| Value::HexStringLiteral(_)
88-
| Value::Null
89-
| Value::Placeholder(_) => None,
90-
}
91-
}
92-
9369
fn object_name_to_string(object_name: &ObjectName) -> String {
9470
object_name
9571
.0
@@ -881,25 +857,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
881857
}
882858
};
883859

884-
let mut options = HashMap::new();
885-
for (key, value) in statement.options {
886-
let value_string = match value_to_string(&value) {
887-
None => {
888-
return plan_err!("Unsupported Value in COPY statement {}", value);
889-
}
890-
Some(v) => v,
891-
};
892-
893-
if !(&key.contains('.')) {
894-
// If config does not belong to any namespace, assume it is
895-
// a format option and apply the format prefix for backwards
896-
// compatibility.
897-
let renamed_key = format!("format.{}", key);
898-
options.insert(renamed_key.to_lowercase(), value_string.to_lowercase());
899-
} else {
900-
options.insert(key.to_lowercase(), value_string.to_lowercase());
901-
}
902-
}
860+
let options_map = self.parse_options_map(statement.options, true)?;
903861

904862
let maybe_file_type = if let Some(stored_as) = &statement.stored_as {
905863
if let Ok(ext_file_type) = self.context_provider.get_file_type(stored_as) {
@@ -946,7 +904,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
946904
output_url: statement.target,
947905
file_type,
948906
partition_by,
949-
options,
907+
options: options_map,
950908
}))
951909
}
952910

@@ -1007,29 +965,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
1007965
let inline_constraints = calc_inline_constraints_from_columns(&columns);
1008966
all_constraints.extend(inline_constraints);
1009967

1010-
let mut options_map = HashMap::<String, String>::new();
1011-
for (key, value) in options {
1012-
if options_map.contains_key(&key) {
1013-
return plan_err!("Option {key} is specified multiple times");
1014-
}
1015-
1016-
let Some(value_string) = value_to_string(&value) else {
1017-
return plan_err!(
1018-
"Unsupported Value in CREATE EXTERNAL TABLE statement {}",
1019-
value
1020-
);
1021-
};
1022-
1023-
if !(&key.contains('.')) {
1024-
// If a config does not belong to any namespace, we assume it is
1025-
// a format option and apply the format prefix for backwards
1026-
// compatibility.
1027-
let renamed_key = format!("format.{}", key.to_lowercase());
1028-
options_map.insert(renamed_key, value_string.to_lowercase());
1029-
} else {
1030-
options_map.insert(key.to_lowercase(), value_string.to_lowercase());
1031-
}
1032-
}
968+
let options_map = self.parse_options_map(options, false)?;
1033969

1034970
let compression = options_map
1035971
.get("format.compression")
@@ -1081,6 +1017,36 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
10811017
)))
10821018
}
10831019

1020+
fn parse_options_map(
1021+
&self,
1022+
options: Vec<(String, Value)>,
1023+
allow_duplicates: bool,
1024+
) -> Result<HashMap<String, String>> {
1025+
let mut options_map = HashMap::new();
1026+
for (key, value) in options {
1027+
if !allow_duplicates && options_map.contains_key(&key) {
1028+
return plan_err!("Option {key} is specified multiple times");
1029+
}
1030+
1031+
let Some(value_string) = self.value_normalizer.normalize(value.clone())
1032+
else {
1033+
return plan_err!("Unsupported Value {}", value);
1034+
};
1035+
1036+
if !(&key.contains('.')) {
1037+
// If config does not belong to any namespace, assume it is
1038+
// a format option and apply the format prefix for backwards
1039+
// compatibility.
1040+
let renamed_key = format!("format.{}", key);
1041+
options_map.insert(renamed_key.to_lowercase(), value_string);
1042+
} else {
1043+
options_map.insert(key.to_lowercase(), value_string);
1044+
}
1045+
}
1046+
1047+
Ok(options_map)
1048+
}
1049+
10841050
/// Generate a plan for EXPLAIN ... that will print out a plan
10851051
///
10861052
/// Note this is the sqlparser explain statement, not the
@@ -1204,7 +1170,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
12041170
// parse value string from Expr
12051171
let value_string = match &value[0] {
12061172
SQLExpr::Identifier(i) => ident_to_string(i),
1207-
SQLExpr::Value(v) => match value_to_string(v) {
1173+
SQLExpr::Value(v) => match crate::utils::value_to_string(v) {
12081174
None => {
12091175
return plan_err!("Unsupported Value {}", value[0]);
12101176
}
@@ -1365,8 +1331,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
13651331
None => {
13661332
// If the target table has an alias, use it to qualify the column name
13671333
if let Some(alias) = &table_alias {
1368-
Expr::Column(Column::new(
1369-
Some(self.normalizer.normalize(alias.name.clone())),
1334+
datafusion_expr::Expr::Column(Column::new(
1335+
Some(self.ident_normalizer.normalize(alias.name.clone())),
13701336
field.name(),
13711337
))
13721338
} else {
@@ -1421,7 +1387,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
14211387
let mut value_indices = vec![None; table_schema.fields().len()];
14221388
let fields = columns
14231389
.into_iter()
1424-
.map(|c| self.normalizer.normalize(c))
1390+
.map(|c| self.ident_normalizer.normalize(c))
14251391
.enumerate()
14261392
.map(|(i, c)| {
14271393
let column_index = table_schema

0 commit comments

Comments
 (0)