Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ info:
success: true
exit_code: 0
----- stdout -----
[{"Utf8(\"\\\")":"\\","Utf8(\"\\\\\")":"\\\\","Utf8(\"\\\\\\\\\\\")":"\\\\\\\\\\","Utf8(\"dsdsds\\\\\\\\\")":"dsdsds\\\\\\\\","Utf8(\"\\t\")":"\\t","Utf8(\"\\0\")":"\\0","Utf8(\"\\n\")":"\\n"}]
[{"Utf8(\"\\\")":"\\","Utf8(\"\\\\\")":"\\\\","Utf8(\"\\\\\\\\\")":"\\\\\\\\","Utf8(\"dsdsds\\\\\\\\\")":"dsdsds\\\\\\\\","Utf8(\"\\t\")":"\\t","Utf8(\"\\0\")":"\\0","Utf8(\"\\n\")":"\\n"}]

----- stderr -----
2 changes: 1 addition & 1 deletion datafusion-cli/tests/sql/backslash.sql
Original file line number Diff line number Diff line change
@@ -1 +1 @@
select '\', '\\', '\\\\\', 'dsdsds\\\\', '\t', '\0', '\n';
select '\', '\\', '\\\\', 'dsdsds\\\\', '\t', '\0', '\n'
5 changes: 5 additions & 0 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,11 @@ config_namespace! {
/// Default is true.
pub map_string_types_to_utf8view: bool, default = true

/// When set to true, SQL string literals use Spark-compatible backslash
/// escape handling during SQL planning. This should only be enabled for
/// Spark compatibility mode.
pub spark_string_literal_unescape: bool, default = false

/// When set to true, the source locations relative to the original SQL
/// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected
/// and recorded in the logical plan nodes.
Expand Down
2 changes: 2 additions & 0 deletions datafusion/core/src/execution/session_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,8 @@ impl SessionState {
.enable_options_value_normalization,
support_varchar_with_length: sql_parser_options.support_varchar_with_length,
map_string_types_to_utf8view: sql_parser_options.map_string_types_to_utf8view,
spark_string_literal_unescape: sql_parser_options
.spark_string_literal_unescape,
collect_spans: sql_parser_options.collect_spans,
default_null_ordering: sql_parser_options
.default_null_ordering
Expand Down
6 changes: 6 additions & 0 deletions datafusion/spark/src/session_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ impl SessionStateBuilderSpark for SessionStateBuilder {
.map(|f| (f.name().to_string(), f)),
);

self.config()
.get_or_insert_with(Default::default)
.options_mut()
.sql_parser
.spark_string_literal_unescape = true;

self
}
}
Expand Down
104 changes: 100 additions & 4 deletions datafusion/sql/src/expr/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,14 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
param_data_types: &[FieldRef],
) -> Result<Expr> {
match value {
Value::Number(n, _) => self.parse_sql_number(&n, false),
Value::SingleQuotedString(s) | Value::DoubleQuotedString(s) => Ok(lit(s)),
Value::Number(n,_) => self.parse_sql_number(&n, false),
Value::SingleQuotedString(s) | Value::DoubleQuotedString(s) => {
if self.options.spark_string_literal_unescape {
Ok(lit(unescape_string_literal(&s)?))
} else {
Ok(lit(s))
}
}
Value::Null => Ok(Expr::Literal(ScalarValue::Null, None)),
Value::Boolean(n) => Ok(lit(n)),
Value::Placeholder(param) => {
Expand All @@ -63,7 +69,13 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
}
}
Value::DollarQuotedString(s) => Ok(lit(s.value)),
Value::EscapedStringLiteral(s) => Ok(lit(s)),
Value::EscapedStringLiteral(s) => {
if self.options.spark_string_literal_unescape {
Ok(lit(unescape_string_literal(&s)?))
} else {
Ok(lit(s))
}
}
_ => plan_err!("Unsupported Value '{value:?}'"),
}
}
Expand Down Expand Up @@ -305,6 +317,62 @@ fn interval_literal(interval_value: SQLExpr, negative: bool) -> Result<String> {
if negative { Ok(format!("-{s}")) } else { Ok(s) }
}

/// Decodes backslash escape sequences in a SQL string literal, following the
/// rules Spark applies to string literals.
///
/// Handled sequences:
/// - `\0` (NUL), `\b` (backspace), `\n`, `\r`, `\t`, `\Z` (U+001A).
/// - `\\`, `\'`, `\"` produce the escaped character itself.
/// - `\%` and `\_` are kept verbatim (backslash included) so that a later
///   `LIKE` evaluation still sees an escaped wildcard.
/// - `\0dd` (a zero followed by exactly two octal digits) and `\d`, `\dd`,
///   `\ddd` with a leading digit in `1..=7` are octal character codes.
/// - Any other `\c` yields `c` with the backslash removed.
/// - A lone trailing backslash is preserved.
///
/// # Errors
///
/// Returns a parser error when an octal escape encodes a value above `0o377`
/// (it does not fit in a single byte).
fn unescape_string_literal(s: &str) -> Result<String> {
    // The decoded text is never longer than the input.
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch != '\\' {
            out.push(ch);
            continue;
        }

        // A backslash at the very end has nothing to escape; keep it as-is.
        let Some(next) = chars.next() else {
            out.push('\\');
            break;
        };

        match next {
            '0' => {
                // `\0` is NUL unless it starts a full three-digit octal escape
                // such as `\012`; look two characters ahead without consuming.
                let mut ahead = chars.clone();
                match (ahead.next(), ahead.next()) {
                    (Some(hi @ '0'..='7'), Some(lo @ '0'..='7')) => {
                        // At most 0o077, so this cannot overflow a `u8`.
                        let value = (hi as u8 - b'0') * 8 + (lo as u8 - b'0');
                        out.push(char::from(value));
                        chars = ahead;
                    }
                    _ => out.push('\0'),
                }
            }
            'b' => out.push('\u{0008}'),
            'n' => out.push('\n'),
            'r' => out.push('\r'),
            't' => out.push('\t'),
            'Z' => out.push('\u{001A}'),
            '\\' => out.push('\\'),
            '\'' => out.push('\''),
            '"' => out.push('"'),
            // Keep the backslash: these stay escaped for LIKE pattern matching.
            '%' => out.push_str("\\%"),
            '_' => out.push_str("\\_"),

            '1'..='7' => {
                // Greedily take up to two more octal digits (three in total).
                let mut octal = String::with_capacity(3);
                octal.push(next);
                while octal.len() < 3 {
                    match chars.next_if(|c| matches!(c, '0'..='7')) {
                        Some(digit) => octal.push(digit),
                        None => break,
                    }
                }

                // Values above 0o377 do not fit in a byte and are rejected.
                let value = u8::from_str_radix(&octal, 8).map_err(|_| {
                    DataFusionError::from(ParserError(format!(
                        "Invalid octal escape sequence: \\{octal}"
                    )))
                })?;
                out.push(char::from(value));
            }

            // Unknown escapes drop the backslash and keep the character.
            other => out.push(other),
        }
    }

    Ok(out)
}

/// Try to decode bytes from hex literal string.
///
/// None will be returned if the input literal is hex-invalid.
Expand Down Expand Up @@ -422,6 +490,34 @@ fn parse_decimal(unsigned_number: &str, negative: bool) -> Result<Expr> {
mod tests {
use super::*;

#[test]
fn test_unescape_string_literal_basic_escapes() {
    // Each pair is (raw literal text, expected decoded text).
    let cases = [
        (r"\t hello", "\t hello"),
        (r"\n hello", "\n hello"),
        (r"\r hello", "\r hello"),
        (r"\\", "\\"),
        (r"it\'s", "it's"),
        (r#"a\"b"#, "a\"b"),
    ];

    for (raw, expected) in cases {
        let decoded = unescape_string_literal(raw).unwrap();
        assert_eq!(decoded, expected, "input: {raw:?}");
    }
}

#[test]
fn test_unescape_string_literal_octal() {
    // Octal escapes with one and three digits: (raw literal, decoded text).
    let cases = [(r"\101", "A"), (r"\141", "a"), (r"\7", "\x07")];

    for (raw, expected) in cases {
        let decoded = unescape_string_literal(raw).unwrap();
        assert_eq!(decoded, expected, "input: {raw:?}");
    }
}

#[test]
fn test_unescape_string_literal_unknown_escape() {
    // An unrecognized escape keeps the character and drops the backslash.
    let cases = [(r"\x", "x"), (r"abc\qdef", "abcqdef")];

    for (raw, expected) in cases {
        let decoded = unescape_string_literal(raw).unwrap();
        assert_eq!(decoded, expected, "input: {raw:?}");
    }
}

#[test]
fn test_unescape_string_literal_trailing_backslash() {
    // A backslash with nothing after it is passed through unchanged.
    let raw = "abc\\";
    let decoded = unescape_string_literal(raw).unwrap();
    assert_eq!(decoded, "abc\\");
}

#[test]
fn test_decode_hex_literal() {
let cases = [
Expand Down Expand Up @@ -517,4 +613,4 @@ mod tests {
"This feature is not implemented: Decimal precision 77 exceeds the maximum supported precision: 76"
);
}
}
}
2 changes: 1 addition & 1 deletion datafusion/sql/src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2203,4 +2203,4 @@ mod tests {
"Expected: end of expression, found: bar",
)
}
}
}
10 changes: 10 additions & 0 deletions datafusion/sql/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ pub struct ParserOptions {
pub collect_spans: bool,
/// Whether string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning.
pub map_string_types_to_utf8view: bool,
/// Whether to use Spark-compatible string literal unescaping.
pub spark_string_literal_unescape: bool,
/// Default null ordering for sorting expressions.
pub default_null_ordering: NullOrdering,
}
Expand All @@ -78,6 +80,7 @@ impl ParserOptions {
map_string_types_to_utf8view: true,
enable_options_value_normalization: false,
collect_spans: false,
spark_string_literal_unescape: false,
// By default, `nulls_max` is used to follow Postgres's behavior.
// postgres rule: https://www.postgresql.org/docs/current/queries-order.html
default_null_ordering: NullOrdering::NullsMax,
Expand Down Expand Up @@ -124,6 +127,12 @@ impl ParserOptions {
self
}

/// Sets the spark_string_literal_unescape option.
pub fn with_spark_string_literal_unescape(mut self, value: bool) -> Self {
self.spark_string_literal_unescape = value;
self
}

/// Sets the `enable_options_value_normalization` option.
pub fn with_enable_options_value_normalization(mut self, value: bool) -> Self {
self.enable_options_value_normalization = value;
Expand Down Expand Up @@ -159,6 +168,7 @@ impl From<&SqlParserOptions> for ParserOptions {
enable_options_value_normalization: options
.enable_options_value_normalization,
collect_spans: options.collect_spans,
spark_string_literal_unescape: options.spark_string_literal_unescape,
default_null_ordering: options.default_null_ordering.as_str().into(),
}
}
Expand Down
3 changes: 3 additions & 0 deletions datafusion/sql/tests/sql_integration.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3571,6 +3571,7 @@ fn parse_decimals_parser_options() -> ParserOptions {
map_string_types_to_utf8view: true,
enable_options_value_normalization: false,
collect_spans: false,
spark_string_literal_unescape: false,
default_null_ordering: NullOrdering::NullsMax,
}
}
Expand All @@ -3583,6 +3584,7 @@ fn ident_normalization_parser_options_no_ident_normalization() -> ParserOptions
map_string_types_to_utf8view: true,
enable_options_value_normalization: false,
collect_spans: false,
spark_string_literal_unescape: false,
default_null_ordering: NullOrdering::NullsMax,
}
}
Expand All @@ -3595,6 +3597,7 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions {
map_string_types_to_utf8view: true,
enable_options_value_normalization: false,
collect_spans: false,
spark_string_literal_unescape: false,
default_null_ordering: NullOrdering::NullsMax,
}
}
Expand Down
2 changes: 2 additions & 0 deletions datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,7 @@ datafusion.sql_parser.enable_options_value_normalization false
datafusion.sql_parser.map_string_types_to_utf8view true
datafusion.sql_parser.parse_float_as_decimal false
datafusion.sql_parser.recursion_limit 50
datafusion.sql_parser.spark_string_literal_unescape false
datafusion.sql_parser.support_varchar_with_length true

# show all variables with verbose
Expand Down Expand Up @@ -488,6 +489,7 @@ datafusion.sql_parser.enable_options_value_normalization false When set to true,
datafusion.sql_parser.map_string_types_to_utf8view true If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true.
datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type
datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries
datafusion.sql_parser.spark_string_literal_unescape false When set to true, SQL string literals use Spark-compatible backslash escape handling during SQL planning. This should only be enabled for Spark compatibility mode.
datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits.

# show_variable_in_config_options
Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/spark/array/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ SELECT array(1, NULL, 3);
[1, NULL, 3]


query ?
SELECT array['hello', '', null, 'nULl', 'nULlx', 'aa"bb', 'mm\nn', 'uu,vv', 'yy zz'];
query I
SELECT ascii(substr(array['hello', '', null, 'nULl', 'nULlx', 'aa"bb', 'mm\nn', 'uu,vv', 'yy zz'][7], 3 ,1));
----
[hello, , NULL, nULl, nULlx, aa"bb, mm\nn, uu,vv, yy zz]
10

query ?
SELECT array(array(1,2),array(3,4));
Expand Down
12 changes: 6 additions & 6 deletions datafusion/sqllogictest/test_files/spark/string/soundex.slt
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,10 @@ SELECT soundex(' hello');
----
hello

query T
SELECT soundex('\thello');
query I
SELECT ascii(substr(soundex('\thello'),1 ,1));
----
\thello
9

query T
SELECT soundex('😀hello');
Expand Down Expand Up @@ -190,10 +190,10 @@ SELECT soundex('#');
----
#

query T
SELECT soundex('\nhello');
query I
SELECT ascii(substr(soundex('\nhello'),1 ,1));
----
\nhello
10

query T
SELECT concat(soundex(' '), 'Spark')
Expand Down