Skip to content

Commit b7fb32a

Browse files
authored
feat: parse_float_as_decimal supports scientific notation and Decimal256 (#13806)
* feat: `parse_float_as_decimal` supports scientific notation and Decimal256 * Fix test * Add test * Add test * Refine negative scales * Update comment * Refine bigint_to_i256 * UT for bigint_to_i256 * Add ut for parse_decimal
1 parent 32a13d8 commit b7fb32a

File tree

4 files changed

+245
-47
lines changed

4 files changed

+245
-47
lines changed

datafusion-cli/Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

datafusion/sql/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ unparser = []
4444
arrow = { workspace = true }
4545
arrow-array = { workspace = true }
4646
arrow-schema = { workspace = true }
47+
bigdecimal = { workspace = true }
4748
datafusion-common = { workspace = true, default-features = true }
4849
datafusion-expr = { workspace = true }
4950
indexmap = { workspace = true }

datafusion/sql/src/expr/value.rs

Lines changed: 157 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
1919
use arrow::compute::kernels::cast_utils::{
2020
parse_interval_month_day_nano_config, IntervalParseConfig, IntervalUnit,
2121
};
22-
use arrow::datatypes::DECIMAL128_MAX_PRECISION;
23-
use arrow_schema::DataType;
22+
use arrow::datatypes::{i256, DECIMAL128_MAX_PRECISION};
23+
use arrow_schema::{DataType, DECIMAL256_MAX_PRECISION};
24+
use bigdecimal::num_bigint::BigInt;
25+
use bigdecimal::{BigDecimal, Signed, ToPrimitive};
2426
use datafusion_common::{
25-
internal_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result, ScalarValue,
27+
internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema,
28+
DataFusionError, Result, ScalarValue,
2629
};
2730
use datafusion_expr::expr::{BinaryExpr, Placeholder};
2831
use datafusion_expr::planner::PlannerResult;
@@ -31,6 +34,9 @@ use log::debug;
3134
use sqlparser::ast::{BinaryOperator, Expr as SQLExpr, Interval, UnaryOperator, Value};
3235
use sqlparser::parser::ParserError::ParserError;
3336
use std::borrow::Cow;
37+
use std::cmp::Ordering;
38+
use std::ops::Neg;
39+
use std::str::FromStr;
3440

3541
impl<S: ContextProvider> SqlToRel<'_, S> {
3642
pub(crate) fn parse_value(
@@ -84,7 +90,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
8490
}
8591

8692
if self.options.parse_float_as_decimal {
87-
parse_decimal_128(unsigned_number, negative)
93+
parse_decimal(unsigned_number, negative)
8894
} else {
8995
signed_number.parse::<f64>().map(lit).map_err(|_| {
9096
DataFusionError::from(ParserError(format!(
@@ -315,45 +321,84 @@ const fn try_decode_hex_char(c: u8) -> Option<u8> {
315321
}
316322
}
317323

318-
/// Parse Decimal128 from a string
319-
///
320-
/// TODO: support parsing from scientific notation
321-
fn parse_decimal_128(unsigned_number: &str, negative: bool) -> Result<Expr> {
322-
// remove leading zeroes
323-
let trimmed = unsigned_number.trim_start_matches('0');
324-
// Parse precision and scale, remove decimal point if exists
325-
let (precision, scale, replaced_str) = if trimmed == "." {
326-
// Special cases for numbers such as “0.”, “000.”, and so on.
327-
(1, 0, Cow::Borrowed("0"))
328-
} else if let Some(i) = trimmed.find('.') {
329-
(
330-
trimmed.len() - 1,
331-
trimmed.len() - i - 1,
332-
Cow::Owned(trimmed.replace('.', "")),
333-
)
334-
} else {
335-
// No decimal point, keep as is
336-
(trimmed.len(), 0, Cow::Borrowed(trimmed))
337-
};
324+
/// Returns None if the value can't be converted to i256.
325+
/// Modified from <https://github.com/apache/arrow-rs/blob/c4dbf0d8af6ca5a19b8b2ea777da3c276807fc5e/arrow-buffer/src/bigint/mod.rs#L303>
326+
fn bigint_to_i256(v: &BigInt) -> Option<i256> {
327+
let v_bytes = v.to_signed_bytes_le();
328+
match v_bytes.len().cmp(&32) {
329+
Ordering::Less => {
330+
let mut bytes = if v.is_negative() {
331+
[255_u8; 32]
332+
} else {
333+
[0; 32]
334+
};
335+
bytes[0..v_bytes.len()].copy_from_slice(&v_bytes[..v_bytes.len()]);
336+
Some(i256::from_le_bytes(bytes))
337+
}
338+
Ordering::Equal => Some(i256::from_le_bytes(v_bytes.try_into().unwrap())),
339+
Ordering::Greater => None,
340+
}
341+
}
338342

339-
let number = replaced_str.parse::<i128>().map_err(|e| {
343+
fn parse_decimal(unsigned_number: &str, negative: bool) -> Result<Expr> {
344+
let mut dec = BigDecimal::from_str(unsigned_number).map_err(|e| {
340345
DataFusionError::from(ParserError(format!(
341-
"Cannot parse {replaced_str} as i128 when building decimal: {e}"
346+
"Cannot parse {unsigned_number} as BigDecimal: {e}"
342347
)))
343348
})?;
344-
345-
// Check precision overflow
346-
if precision as u8 > DECIMAL128_MAX_PRECISION {
347-
return Err(DataFusionError::from(ParserError(format!(
348-
"Cannot parse {replaced_str} as i128 when building decimal: precision overflow"
349-
))));
349+
if negative {
350+
dec = dec.neg();
350351
}
351352

352-
Ok(Expr::Literal(ScalarValue::Decimal128(
353-
Some(if negative { -number } else { number }),
354-
precision as u8,
355-
scale as i8,
356-
)))
353+
let digits = dec.digits();
354+
let (int_val, scale) = dec.into_bigint_and_exponent();
355+
if scale < i8::MIN as i64 {
356+
return not_impl_err!(
357+
"Decimal scale {} exceeds the minimum supported scale: {}",
358+
scale,
359+
i8::MIN
360+
);
361+
}
362+
let precision = if scale > 0 {
363+
// arrow-rs requires the precision to include the positive scale.
364+
// See <https://github.com/apache/arrow-rs/blob/123045cc766d42d1eb06ee8bb3f09e39ea995ddc/arrow-array/src/types.rs#L1230>
365+
std::cmp::max(digits, scale.unsigned_abs())
366+
} else {
367+
digits
368+
};
369+
if precision <= DECIMAL128_MAX_PRECISION as u64 {
370+
let val = int_val.to_i128().ok_or_else(|| {
371+
// Failures are unexpected here as we have already checked the precision
372+
internal_datafusion_err!(
373+
"Unexpected overflow when converting {} to i128",
374+
int_val
375+
)
376+
})?;
377+
Ok(Expr::Literal(ScalarValue::Decimal128(
378+
Some(val),
379+
precision as u8,
380+
scale as i8,
381+
)))
382+
} else if precision <= DECIMAL256_MAX_PRECISION as u64 {
383+
let val = bigint_to_i256(&int_val).ok_or_else(|| {
384+
// Failures are unexpected here as we have already checked the precision
385+
internal_datafusion_err!(
386+
"Unexpected overflow when converting {} to i256",
387+
int_val
388+
)
389+
})?;
390+
Ok(Expr::Literal(ScalarValue::Decimal256(
391+
Some(val),
392+
precision as u8,
393+
scale as i8,
394+
)))
395+
} else {
396+
not_impl_err!(
397+
"Decimal precision {} exceeds the maximum supported precision: {}",
398+
precision,
399+
DECIMAL256_MAX_PRECISION
400+
)
401+
}
357402
}
358403

359404
#[cfg(test)]
@@ -379,4 +424,79 @@ mod tests {
379424
assert_eq!(output, expect);
380425
}
381426
}
427+
428+
#[test]
fn test_bigint_to_i256() {
    // Builds the `BigInt` equal to an `i256` by going through its decimal string.
    let as_bigint = |n: i256| BigInt::from_str(n.to_string().as_str()).unwrap();
    // `i256::MAX` with one extra trailing digit: can't fit into i256.
    let too_large = BigInt::from_str((i256::MAX.to_string() + "1").as_str()).unwrap();

    let cases = [
        (BigInt::from(0), Some(i256::from(0))),
        (BigInt::from(1), Some(i256::from(1))),
        (BigInt::from(-1), Some(i256::from(-1))),
        (as_bigint(i256::MAX), Some(i256::MAX)),
        (as_bigint(i256::MIN), Some(i256::MIN)),
        (too_large, None),
    ];

    for (input, expect) in cases {
        assert_eq!(bigint_to_i256(&input), expect);
    }
}
454+
455+
#[test]
fn test_parse_decimal() {
    // 39 integer digits plus 5 fractional digits: too wide for Decimal128.
    let wide_input = "9".repeat(39) + "." + "99999";
    let wide_value = i256::from_string(&"9".repeat(44)).unwrap();

    // Supported cases
    let cases = [
        ("0", ScalarValue::Decimal128(Some(0), 1, 0)),
        ("1", ScalarValue::Decimal128(Some(1), 1, 0)),
        ("123.45", ScalarValue::Decimal128(Some(12345), 5, 2)),
        // Digit count is less than scale
        ("0.001", ScalarValue::Decimal128(Some(1), 3, 3)),
        // Scientific notation
        ("123.456e-2", ScalarValue::Decimal128(Some(123456), 6, 5)),
        // Negative scale
        ("123456e128", ScalarValue::Decimal128(Some(123456), 6, -128)),
        // Decimal256
        (
            wide_input.as_str(),
            ScalarValue::Decimal256(Some(wide_value), 44, 5),
        ),
    ];
    for (input, expect) in cases {
        // The negated literal is checked first, then the plain one.
        let negated = expect.arithmetic_negate().unwrap();
        assert_eq!(parse_decimal(input, true).unwrap(), Expr::Literal(negated));
        assert_eq!(parse_decimal(input, false).unwrap(), Expr::Literal(expect));
    }

    // scale < i8::MIN
    let scale_err = parse_decimal("1e129", false)
        .unwrap_err()
        .strip_backtrace();
    assert_eq!(
        scale_err,
        "This feature is not implemented: Decimal scale -129 exceeds the minimum supported scale: -128"
    );

    // Unsupported precision
    let precision_err = parse_decimal(&"1".repeat(77), false)
        .unwrap_err()
        .strip_backtrace();
    assert_eq!(
        precision_err,
        "This feature is not implemented: Decimal precision 77 exceeds the maximum supported precision: 76"
    );
}
382502
}

datafusion/sqllogictest/test_files/options.slt

Lines changed: 86 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -192,19 +192,95 @@ select arrow_typeof(00009999999999999999999999999999999999.9999),
192192
----
193193
Decimal128(38, 4) Decimal128(38, 4) Decimal128(20, 0)
194194

195-
# precision overflow
196-
statement error DataFusion error: SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
197-
select 123456789.012345678901234567890123456789
195+
# scientific notation
196+
query RTRTRT
197+
select 1.23e3, arrow_typeof(1.23e3),
198+
+1.23e1, arrow_typeof(+1.23e1),
199+
-1234.56e-3, arrow_typeof(-1234.56e-3)
200+
----
201+
1230 Decimal128(3, -1) 12.3 Decimal128(3, 1) -1.23456 Decimal128(6, 5)
202+
203+
query RTRTRT
204+
select 1.23e-2, arrow_typeof(1.23e-2),
205+
1.23456e0, arrow_typeof(1.23456e0),
206+
-.0123e2, arrow_typeof(-.0123e2)
207+
----
208+
0.0123 Decimal128(4, 4) 1.23456 Decimal128(6, 5) -1.23 Decimal128(3, 2)
209+
210+
# Decimal256 cases
211+
query RT
212+
select 123456789.0123456789012345678901234567890,
213+
arrow_typeof(123456789.0123456789012345678901234567890)
214+
----
215+
123456789.012345678901 Decimal256(40, 31)
216+
217+
query RT
218+
select -123456789.0123456789012345678901234567890,
219+
arrow_typeof(-123456789.0123456789012345678901234567890)
220+
----
221+
-123456789.012345678901 Decimal256(40, 31)
222+
223+
# max precision and scale of Decimal256
224+
query RTRT
225+
select -1e-76, arrow_typeof(-1e-76),
226+
-1.234567e-70, arrow_typeof(-1.234567e-70)
227+
----
228+
0 Decimal256(76, 76) 0 Decimal256(76, 76)
198229

199-
statement error SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
200-
select -123456789.012345678901234567890123456789
230+
# Decimal256::MAX for nonnegative scale
231+
query RT
232+
select 9999999999999999999999999999999999999999999999999999999999999999999999999999,
233+
arrow_typeof(9999999999999999999999999999999999999999999999999999999999999999999999999999);
234+
----
235+
9999999999999999999999999999999999999999999999999999999999999999999999999999 Decimal256(76, 0)
201236

202-
# can not fit in i128
203-
statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\)
204-
select 123456789.0123456789012345678901234567890
237+
# Decimal256::MIN
238+
query RT
239+
select -9999999999999999999999999999999999999999999999999999999999999999999999999999,
240+
arrow_typeof(-9999999999999999999999999999999999999999999999999999999999999999999999999999);
241+
----
242+
-9999999999999999999999999999999999999999999999999999999999999999999999999999 Decimal256(76, 0)
205243

206-
statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\)
207-
select -123456789.0123456789012345678901234567890
244+
# boundaries between decimal128 and decimal256
245+
query RTRT
246+
select 1e-38, arrow_typeof(1e-38),
247+
1e-39, arrow_typeof(1e-39);
248+
----
249+
0 Decimal128(38, 38) 0 Decimal256(39, 39)
250+
251+
query RTRT
252+
select -1e-38, arrow_typeof(-1e-38),
253+
-1e-39, arrow_typeof(-1e-39);
254+
----
255+
0 Decimal128(38, 38) 0 Decimal256(39, 39)
256+
257+
# unsupported precision
258+
query error Decimal precision 77 exceeds the maximum supported precision: 76
259+
select -1e-77;
260+
261+
query error Decimal precision 79 exceeds the maximum supported precision: 76
262+
select 1.000000000000000000000000000000000000000000000000000000000000000000000000000001;
263+
264+
# negative scales
265+
query TR
266+
select arrow_typeof(1e77), 1e77
267+
----
268+
Decimal128(1, -77) 100000000000000000000000000000000000000000000000000000000000000000000000000000
269+
270+
query T
271+
select arrow_typeof(1e128)
272+
----
273+
Decimal128(1, -128)
274+
275+
query error Decimal scale \-129 exceeds the minimum supported scale: \-128
276+
select 1e129
277+
278+
# simple arithmetic
279+
query RTRT
280+
select 1e40 + 1e40, arrow_typeof(1e40 + 1e40),
281+
1e-40 + -1e-40, arrow_typeof(1e-40 + -1e-40)
282+
----
283+
20000000000000000000000000000000000000000 Decimal128(2, -40) 0 Decimal256(41, 40)
208284

209285
# Restore option to default value
210286
statement ok

0 commit comments

Comments
 (0)