-
Notifications
You must be signed in to change notification settings - Fork 984
feat: add rounding logic and scale zero fix parse_decimal to match parse_string_to_decimal_native behavior #7179
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
bef2992
8b81039
136c0a6
ef07780
47b0541
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -850,7 +850,18 @@ fn parse_e_notation<T: DecimalType>( | |
} | ||
|
||
if exp < 0 { | ||
result = result.div_wrapping(base.pow_wrapping(-exp as _)); | ||
let result_with_scale = result.div_wrapping(base.pow_wrapping(-exp as _)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. does this change the behavior of parsing e notation? If so I didn't see any tests There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. I missed porting the tests while splitting the large PR. It rounds instead of current behavior - truncate. I'll add the tests. |
||
let result_with_one_scale_up = | ||
result.div_wrapping(base.pow_wrapping(-exp.add_wrapping(1) as _)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume this logic is correct, but just for me to understand. E.g. for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
before this check, now, to round up or down, we need to capture the digit next to last digit in the result, in this case 3. How we get it is if rounding_digit >=5 then we add +1 to the result |
||
let rounding_digit = | ||
result_with_one_scale_up.sub_wrapping(result_with_scale.mul_wrapping(base)); | ||
//rounding digit is the next digit after result with scale, it helps in rounding to nearest integer | ||
// with scale 1 rounding digit for 247e-2 is 7, hence result is 2.5, whereas rounding digit for 244e-2 is 4, hence result is 2.4 | ||
if rounding_digit >= T::Native::usize_as(5) { | ||
result = result_with_scale.add_wrapping(T::Native::usize_as(1)); | ||
} else { | ||
result = result_with_scale; | ||
} | ||
} else { | ||
result = result.mul_wrapping(base.pow_wrapping(exp as _)); | ||
} | ||
|
@@ -866,8 +877,9 @@ pub fn parse_decimal<T: DecimalType>( | |
scale: i8, | ||
) -> Result<T::Native, ArrowError> { | ||
let mut result = T::Native::usize_as(0); | ||
let mut fractionals: i8 = 0; | ||
let mut digits: u8 = 0; | ||
let mut fractionals: i16 = 0; | ||
let mut digits: u16 = 0; | ||
let mut rounding_digit = -1; // to store digit after the scale for rounding | ||
let base = T::Native::usize_as(10); | ||
|
||
let bs = s.as_bytes(); | ||
|
@@ -897,6 +909,13 @@ pub fn parse_decimal<T: DecimalType>( | |
// Ignore leading zeros. | ||
continue; | ||
} | ||
if fractionals == scale as i16 && scale != 0 { | ||
// Capture the rounding digit once | ||
if rounding_digit < 0 { | ||
rounding_digit = (b - b'0') as i8; | ||
} | ||
continue; | ||
} | ||
digits += 1; | ||
result = result.mul_wrapping(base); | ||
result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); | ||
|
@@ -909,8 +928,8 @@ pub fn parse_decimal<T: DecimalType>( | |
if *b == b'e' || *b == b'E' { | ||
result = parse_e_notation::<T>( | ||
s, | ||
digits as u16, | ||
fractionals as i16, | ||
digits, | ||
fractionals, | ||
result, | ||
point_index, | ||
precision as u16, | ||
|
@@ -925,11 +944,17 @@ pub fn parse_decimal<T: DecimalType>( | |
"can't parse the string value {s} to decimal" | ||
))); | ||
} | ||
if fractionals == scale && scale != 0 { | ||
if fractionals == scale as i16 { | ||
// Capture the rounding digit once | ||
if rounding_digit < 0 { | ||
rounding_digit = (b - b'0') as i8; | ||
} | ||
// We have processed all the digits that we need. All that | ||
// is left is to validate that the rest of the string contains | ||
// valid digits. | ||
continue; | ||
if scale != 0 { | ||
continue; | ||
} | ||
} | ||
fractionals += 1; | ||
digits += 1; | ||
|
@@ -951,8 +976,8 @@ pub fn parse_decimal<T: DecimalType>( | |
b'e' | b'E' => { | ||
result = parse_e_notation::<T>( | ||
s, | ||
digits as u16, | ||
fractionals as i16, | ||
digits, | ||
fractionals, | ||
result, | ||
index, | ||
precision as u16, | ||
|
@@ -972,20 +997,28 @@ pub fn parse_decimal<T: DecimalType>( | |
} | ||
|
||
if !is_e_notation { | ||
if fractionals < scale { | ||
let exp = scale - fractionals; | ||
if exp as u8 + digits > precision { | ||
if fractionals < scale as i16 { | ||
let exp = scale as i16 - fractionals; | ||
if exp + digits as i16 > precision as i16 { | ||
return Err(ArrowError::ParseError(format!( | ||
"parse decimal overflow ({s})" | ||
))); | ||
} | ||
let mul = base.pow_wrapping(exp as _); | ||
result = result.mul_wrapping(mul); | ||
} else if digits > precision { | ||
} else if digits > precision as u16 { | ||
return Err(ArrowError::ParseError(format!( | ||
"parse decimal overflow ({s})" | ||
))); | ||
} | ||
if scale == 0 { | ||
result = result.div_wrapping(base.pow_wrapping(fractionals as u32)) | ||
} | ||
//rounding digit is the next digit after result with scale, it is used to do rounding to nearest integer | ||
// with scale 1 rounding digit for 2.47 is 7, hence result is 2.5, whereas rounding digit for 2.44 is 4,hence result is 2.4 | ||
if rounding_digit >= 5 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wondering where There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. first we figure out what is the rounding_digit - digit which is next to the last digit in the final result (without rounding logic applied), if the value of the rounding_digit is >=5, then we add +1 to round up the result, else it remains same. "1265E-4" -> with scale 3 -> 0.127
in scale 3 the number would be 0.126 and rounding digit will be 5, as rounding digit >= 5, the result becomes 0.127
1264E-4" -> with scale 3 -> 0.126
here rounding_digit is 4, which is less than 5, so no need to add 1. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah ok, makes sense There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps we could add a comment in the code to explain this point to future readers who may have the same question |
||
result = result.add_wrapping(T::Native::usize_as(1)); | ||
} | ||
} | ||
|
||
Ok(if negative { | ||
|
@@ -2564,6 +2597,18 @@ mod tests { | |
assert_eq!(i256::from_i128(i), result_256.unwrap()); | ||
} | ||
|
||
let tests_with_varying_scale = [ | ||
("123.4567891", 12345679_i128, 5), | ||
("123.4567891", 123_i128, 0), | ||
("123.45", 12345000_i128, 5), | ||
("-2.5", -3_i128, 0), | ||
("-2.49", -2_i128, 0), | ||
]; | ||
for (str, e, scale) in tests_with_varying_scale { | ||
let result_128_a = parse_decimal::<Decimal128Type>(str, 20, scale); | ||
assert_eq!(result_128_a.unwrap(), e); | ||
} | ||
|
||
let e_notation_tests = [ | ||
("1.23e3", "1230.0", 2), | ||
("5.6714e+2", "567.14", 4), | ||
|
@@ -2599,6 +2644,9 @@ mod tests { | |
("000001.1034567002e0", "000001.1034567002", 3), | ||
("1.234e16", "12340000000000000", 0), | ||
("123.4e16", "1234000000000000000", 0), | ||
("4e+5", "400000", 4), | ||
("4e7", "40000000", 2), | ||
("1265E-4", ".1265", 3), | ||
]; | ||
for (e, d, scale) in e_notation_tests { | ||
let result_128_e = parse_decimal::<Decimal128Type>(e, 20, scale); | ||
|
@@ -2608,6 +2656,7 @@ mod tests { | |
let result_256_d = parse_decimal::<Decimal256Type>(d, 20, scale); | ||
assert_eq!(result_256_e.unwrap(), result_256_d.unwrap()); | ||
} | ||
|
||
let can_not_parse_tests = [ | ||
"123,123", | ||
".", | ||
|
@@ -2780,6 +2829,55 @@ mod tests { | |
} | ||
} | ||
|
||
#[test] | ||
fn test_parse_decimal_rounding() { | ||
let test_rounding_for_e_notation_varying_scale = [ | ||
("1.2345e4", "12345", 2), | ||
("12345e-5", "0.12", 2), | ||
("12345E-5", "0.123", 3), | ||
("12345e-5", "0.1235", 4), | ||
("1265E-4", ".127", 3), | ||
("12.345e3", "12345.000", 3), | ||
("1.2345e4", "12345", 0), | ||
("1.2345e3", "1235", 0), | ||
("1.23e-3", "0", 0), | ||
("123e-2", "1", 0), | ||
("-1e-15", "-0.0000000000", 10), | ||
("1e-15", "0.0000000000", 10), | ||
("1e15", "1000000000000000", 2), | ||
]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. lets also add couple more tests on negative
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there are existing tests for such scenarios here. Nevertheless, I'll add these as well. |
||
|
||
for (e, d, scale) in test_rounding_for_e_notation_varying_scale { | ||
let result_128_e = parse_decimal::<Decimal128Type>(e, 38, scale); | ||
let result_128_d = parse_decimal::<Decimal128Type>(d, 38, scale); | ||
assert_eq!(result_128_e.unwrap(), result_128_d.unwrap()); | ||
let result_256_e = parse_decimal::<Decimal256Type>(e, 38, scale); | ||
let result_256_d = parse_decimal::<Decimal256Type>(d, 38, scale); | ||
assert_eq!(result_256_e.unwrap(), result_256_d.unwrap()); | ||
} | ||
|
||
let edge_tests_256_error = [ | ||
(&f64::INFINITY.to_string(), 0), | ||
(&f64::NEG_INFINITY.to_string(), 0), | ||
]; | ||
for (s, scale) in edge_tests_256_error { | ||
let result = parse_decimal::<Decimal256Type>(s, 76, scale); | ||
assert_eq!( | ||
format!("Parser error: can't parse the string value {s} to decimal"), | ||
result.unwrap_err().to_string() | ||
); | ||
} | ||
|
||
let edge_tests_256_overflow = [(&f64::MIN.to_string(), 0), (&f64::MAX.to_string(), 0)]; | ||
for (s, scale) in edge_tests_256_overflow { | ||
let result = parse_decimal::<Decimal256Type>(s, 76, scale); | ||
assert_eq!( | ||
format!("Parser error: parse decimal overflow ({s})"), | ||
result.unwrap_err().to_string() | ||
); | ||
} | ||
} | ||
|
||
#[test] | ||
fn test_parse_empty() { | ||
assert_eq!(Int32Type::parse(""), None); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1286,7 +1286,7 @@ mod tests { | |
assert_eq!("53.002666", lat.value_as_string(1)); | ||
assert_eq!("52.412811", lat.value_as_string(2)); | ||
assert_eq!("51.481583", lat.value_as_string(3)); | ||
assert_eq!("12.123456", lat.value_as_string(4)); | ||
assert_eq!("12.123457", lat.value_as_string(4)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alamb you can see the behavior change in this test of arrow-csv reader which uses parse_decimal |
||
assert_eq!("50.760000", lat.value_as_string(5)); | ||
assert_eq!("0.123000", lat.value_as_string(6)); | ||
assert_eq!("123.000000", lat.value_as_string(7)); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Having the same behavior in these two functions seems like a reasonable change to me
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
once we are able to move to using parse_decimal for casting and deprecate parse_string_to_decimal_native , these tests will be changed to assert the value in the message section of the assert.