Skip to content

Commit a1949e4

Browse files
gruuyamartin-g
andauthored
fix: liberal parsing of zero scale decimals (#8700)
# Which issue does this PR close? - Closes #8699. Alternative to #8702 (see #8699 for a discussion). # Rationale for this change Make parsing of zero-scale decimals more correct/consistent. # What changes are included in this PR? When a decimal with scale 0 is being parsed, but it has some digits pass the decimal point those will effectively be discarded. # Are these changes tested? Yes. # Are there any user-facing changes? Aligned parsing of zero-scale decimals. --------- Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
1 parent ba21a83 commit a1949e4

File tree

1 file changed

+92
-61
lines changed

1 file changed

+92
-61
lines changed

arrow-cast/src/parse.rs

Lines changed: 92 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -741,72 +741,57 @@ fn parse_e_notation<T: DecimalType>(
741741
let mut exp: i16 = 0;
742742
let base = T::Native::usize_as(10);
743743

744-
let mut exp_start: bool = false;
745744
// e has a plus sign
746745
let mut pos_shift_direction: bool = true;
747746

748-
// skip to point or exponent index
749-
let mut bs;
750-
if fractionals > 0 {
751-
// it's a fraction, so the point index needs to be skipped, so +1
752-
bs = s.as_bytes().iter().skip(index + fractionals as usize + 1);
753-
} else {
754-
// it's actually an integer that is already written into the result, so let's skip on to e
755-
bs = s.as_bytes().iter().skip(index);
756-
}
747+
// skip to the exponent index directly or just after any processed fractionals
748+
let mut bs = s.as_bytes().iter().skip(index + fractionals as usize);
757749

758-
while let Some(b) = bs.next() {
750+
// This function is only called from `parse_decimal`, in which we skip parsing any fractionals
751+
// after we reach `scale` digits, not knowing ahead of time whether the decimal contains an
752+
// e-notation or not.
753+
// So once we do hit into an e-notation, and drop down into this function, we need to parse the
754+
// remaining unprocessed fractionals too, since otherwise we might lose precision.
755+
for b in bs.by_ref() {
759756
match b {
760757
b'0'..=b'9' => {
761758
result = result.mul_wrapping(base);
762759
result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize));
763-
if fractionals > 0 {
764-
fractionals += 1;
765-
}
760+
fractionals += 1;
766761
digits += 1;
767762
}
768-
&b'e' | &b'E' => {
769-
exp_start = true;
763+
b'e' | b'E' => {
764+
break;
770765
}
771766
_ => {
772767
return Err(ArrowError::ParseError(format!(
773768
"can't parse the string value {s} to decimal"
774769
)));
775770
}
776771
};
772+
}
777773

778-
if exp_start {
779-
pos_shift_direction = match bs.next() {
780-
Some(&b'-') => false,
781-
Some(&b'+') => true,
782-
Some(b) => {
783-
if !b.is_ascii_digit() {
784-
return Err(ArrowError::ParseError(format!(
785-
"can't parse the string value {s} to decimal"
786-
)));
787-
}
788-
789-
exp *= 10;
790-
exp += (b - b'0') as i16;
791-
792-
true
793-
}
794-
None => {
795-
return Err(ArrowError::ParseError(format!(
796-
"can't parse the string value {s} to decimal"
797-
)));
798-
}
799-
};
800-
801-
for b in bs.by_ref() {
802-
if !b.is_ascii_digit() {
803-
return Err(ArrowError::ParseError(format!(
804-
"can't parse the string value {s} to decimal"
805-
)));
806-
}
774+
// parse the exponent itself
775+
let mut signed = false;
776+
for b in bs {
777+
match b {
778+
b'-' if !signed => {
779+
pos_shift_direction = false;
780+
signed = true;
781+
}
782+
b'+' if !signed => {
783+
pos_shift_direction = true;
784+
signed = true;
785+
}
786+
b if b.is_ascii_digit() => {
807787
exp *= 10;
808788
exp += (b - b'0') as i16;
809789
}
790+
_ => {
791+
return Err(ArrowError::ParseError(format!(
792+
"can't parse the string value {s} to decimal"
793+
)));
794+
}
810795
}
811796
}
812797

@@ -850,7 +835,11 @@ fn parse_e_notation<T: DecimalType>(
850835
}
851836

852837
/// Parse the string format decimal value to i128/i256 format and checking the precision and scale.
853-
/// The result value can't be out of bounds.
838+
/// Expected behavior:
839+
/// - The result value can't be out of bounds.
840+
/// - When parsing a decimal with scale 0, all fractional digits will be discarded. The final
841+
/// fractional digits may be a subset or a superset of the digits after the decimal point when
842+
/// e-notation is used.
854843
pub fn parse_decimal<T: DecimalType>(
855844
s: &str,
856845
precision: u8,
@@ -862,18 +851,24 @@ pub fn parse_decimal<T: DecimalType>(
862851
let base = T::Native::usize_as(10);
863852

864853
let bs = s.as_bytes();
865-
let (signed, negative) = match bs.first() {
866-
Some(b'-') => (true, true),
867-
Some(b'+') => (true, false),
868-
_ => (false, false),
869-
};
870854

871-
if bs.is_empty() || signed && bs.len() == 1 {
855+
if !bs
856+
.last()
857+
.is_some_and(|b| b.is_ascii_digit() || (b == &b'.' && s.len() > 1))
858+
{
859+
// If the last character is not a digit (or a decimal point prefixed with some digits), then
860+
// it's not a valid decimal.
872861
return Err(ArrowError::ParseError(format!(
873862
"can't parse the string value {s} to decimal"
874863
)));
875864
}
876865

866+
let (signed, negative) = match bs.first() {
867+
Some(b'-') => (true, true),
868+
Some(b'+') => (true, false),
869+
_ => (false, false),
870+
};
871+
877872
// Iterate over the raw input bytes, skipping the sign if any
878873
let mut bs = bs.iter().enumerate().skip(signed as usize);
879874

@@ -903,7 +898,7 @@ pub fn parse_decimal<T: DecimalType>(
903898
digits as u16,
904899
fractionals as i16,
905900
result,
906-
point_index,
901+
point_index + 1,
907902
precision as u16,
908903
scale as i16,
909904
)?;
@@ -916,7 +911,7 @@ pub fn parse_decimal<T: DecimalType>(
916911
"can't parse the string value {s} to decimal"
917912
)));
918913
}
919-
if fractionals == scale && scale != 0 {
914+
if fractionals == scale {
920915
// We have processed all the digits that we need. All that
921916
// is left is to validate that the rest of the string contains
922917
// valid digits.
@@ -931,13 +926,6 @@ pub fn parse_decimal<T: DecimalType>(
931926
if is_e_notation {
932927
break;
933928
}
934-
935-
// Fail on "."
936-
if digits == 0 {
937-
return Err(ArrowError::ParseError(format!(
938-
"can't parse the string value {s} to decimal"
939-
)));
940-
}
941929
}
942930
b'e' | b'E' => {
943931
result = parse_e_notation::<T>(
@@ -2613,6 +2601,9 @@ mod tests {
26132601
"1.1e.12",
26142602
"1.23e+3.",
26152603
"1.23e+3.1",
2604+
"1e",
2605+
"1e+",
2606+
"1e-",
26162607
];
26172608
for s in can_not_parse_tests {
26182609
let result_128 = parse_decimal::<Decimal128Type>(s, 20, 3);
@@ -2636,6 +2627,7 @@ mod tests {
26362627
("12345678908765.123456", 3),
26372628
("123456789087651234.56e-4", 3),
26382629
("1234560000000", 0),
2630+
("12345678900.0", 0),
26392631
("1.23456e12", 0),
26402632
];
26412633
for (s, scale) in overflow_parse_tests {
@@ -2752,6 +2744,45 @@ mod tests {
27522744
let result = parse_decimal::<Decimal256Type>(s, 76, scale);
27532745
assert_eq!(i, result.unwrap());
27542746
}
2747+
2748+
let zero_scale_tests = [
2749+
(".123", 0, 3),
2750+
("0.123", 0, 3),
2751+
("1.0", 1, 3),
2752+
("1.2", 1, 3),
2753+
("1.00", 1, 3),
2754+
("1.23", 1, 3),
2755+
("1.000", 1, 3),
2756+
("1.123", 1, 3),
2757+
("123.0", 123, 3),
2758+
("123.4", 123, 3),
2759+
("123.00", 123, 3),
2760+
("123.45", 123, 3),
2761+
("123.000000000000000000004", 123, 3),
2762+
("0.123e2", 12, 3),
2763+
("0.123e4", 1230, 10),
2764+
("1.23e4", 12300, 10),
2765+
("12.3e4", 123000, 10),
2766+
("123e4", 1230000, 10),
2767+
(
2768+
"20000000000000000000000000000000000002.0",
2769+
20000000000000000000000000000000000002,
2770+
38,
2771+
),
2772+
];
2773+
for (s, i, precision) in zero_scale_tests {
2774+
let result_128 = parse_decimal::<Decimal128Type>(s, precision, 0).unwrap();
2775+
assert_eq!(i, result_128);
2776+
}
2777+
2778+
let can_not_parse_zero_scale = [".", "blag", "", "+", "-", "e"];
2779+
for s in can_not_parse_zero_scale {
2780+
let result_128 = parse_decimal::<Decimal128Type>(s, 5, 0);
2781+
assert_eq!(
2782+
format!("Parser error: can't parse the string value {s} to decimal"),
2783+
result_128.unwrap_err().to_string(),
2784+
);
2785+
}
27552786
}
27562787

27572788
#[test]

0 commit comments

Comments
 (0)