fix: liberal parsing of zero scale decimals (#8700)

gruuya · martin-g · web-flow · commit a1949e46f31f · 2026-01-19T10:08:13.000-05:00
# Which issue does this PR close? - Closes #8699. Alternative to #8702 (see #8699 for a discussion). # Rationale for this change Make parsing of zero-scale decimals more correct/consistent. # What changes are included in this PR? When a decimal with scale 0 is being parsed, but it has some digits pass the decimal point those will effectively be discarded. # Are these changes tested? Yes. # Are there any user-facing changes? Aligned parsing of zero-scale decimals. --------- Co-authored-by: Martin Grigorov <martin-g@users.noreply.github.com>
diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
@@ -741,72 +741,57 @@ fn parse_e_notation<T: DecimalType>(
     let mut exp: i16 = 0;
     let base = T::Native::usize_as(10);
 
-    let mut exp_start: bool = false;
     // e has a plus sign
     let mut pos_shift_direction: bool = true;
 
-    // skip to point or exponent index
-    let mut bs;
-    if fractionals > 0 {
-        // it's a fraction, so the point index needs to be skipped, so +1
-        bs = s.as_bytes().iter().skip(index + fractionals as usize + 1);
-    } else {
-        // it's actually an integer that is already written into the result, so let's skip on to e
-        bs = s.as_bytes().iter().skip(index);
-    }
+    // skip to the exponent index directly or just after any processed fractionals
+    let mut bs = s.as_bytes().iter().skip(index + fractionals as usize);
 
-    while let Some(b) = bs.next() {
+    // This function is only called from `parse_decimal`, in which we skip parsing any fractionals
+    // after we reach `scale` digits, not knowing ahead of time whether the decimal contains an
+    // e-notation or not.
+    // So once we do hit into an e-notation, and drop down into this function, we need to parse the
+    // remaining unprocessed fractionals too, since otherwise we might lose precision.
+    for b in bs.by_ref() {
         match b {
             b'0'..=b'9' => {
                 result = result.mul_wrapping(base);
                 result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize));
-                if fractionals > 0 {
-                    fractionals += 1;
-                }
+                fractionals += 1;
                 digits += 1;
             }
-            &b'e' | &b'E' => {
-                exp_start = true;
+            b'e' | b'E' => {
+                break;
             }
             _ => {
                 return Err(ArrowError::ParseError(format!(
                     "can't parse the string value {s} to decimal"
                 )));
             }
         };
+    }
 
-        if exp_start {
-            pos_shift_direction = match bs.next() {
-                Some(&b'-') => false,
-                Some(&b'+') => true,
-                Some(b) => {
-                    if !b.is_ascii_digit() {
-                        return Err(ArrowError::ParseError(format!(
-                            "can't parse the string value {s} to decimal"
-                        )));
-                    }
-
-                    exp *= 10;
-                    exp += (b - b'0') as i16;
-
-                    true
-                }
-                None => {
-                    return Err(ArrowError::ParseError(format!(
-                        "can't parse the string value {s} to decimal"
-                    )));
-                }
-            };
-
-            for b in bs.by_ref() {
-                if !b.is_ascii_digit() {
-                    return Err(ArrowError::ParseError(format!(
-                        "can't parse the string value {s} to decimal"
-                    )));
-                }
+    // parse the exponent itself
+    let mut signed = false;
+    for b in bs {
+        match b {
+            b'-' if !signed => {
+                pos_shift_direction = false;
+                signed = true;
+            }
+            b'+' if !signed => {
+                pos_shift_direction = true;
+                signed = true;
+            }
+            b if b.is_ascii_digit() => {
                 exp *= 10;
                 exp += (b - b'0') as i16;
             }
+            _ => {
+                return Err(ArrowError::ParseError(format!(
+                    "can't parse the string value {s} to decimal"
+                )));
+            }
         }
     }
 
@@ -850,7 +835,11 @@ fn parse_e_notation<T: DecimalType>(
 }
 
 /// Parse the string format decimal value to i128/i256 format and checking the precision and scale.
-/// The result value can't be out of bounds.
+/// Expected behavior:
+/// - The result value can't be out of bounds.
+/// - When parsing a decimal with scale 0, all fractional digits will be discarded. The final
+///   fractional digits may be a subset or a superset of the digits after the decimal point when
+///   e-notation is used.
 pub fn parse_decimal<T: DecimalType>(
     s: &str,
     precision: u8,
@@ -862,18 +851,24 @@ pub fn parse_decimal<T: DecimalType>(
     let base = T::Native::usize_as(10);
 
     let bs = s.as_bytes();
-    let (signed, negative) = match bs.first() {
-        Some(b'-') => (true, true),
-        Some(b'+') => (true, false),
-        _ => (false, false),
-    };
 
-    if bs.is_empty() || signed && bs.len() == 1 {
+    if !bs
+        .last()
+        .is_some_and(|b| b.is_ascii_digit() || (b == &b'.' && s.len() > 1))
+    {
+        // If the last character is not a digit (or a decimal point prefixed with some digits), then
+        // it's not a valid decimal.
         return Err(ArrowError::ParseError(format!(
             "can't parse the string value {s} to decimal"
         )));
     }
 
+    let (signed, negative) = match bs.first() {
+        Some(b'-') => (true, true),
+        Some(b'+') => (true, false),
+        _ => (false, false),
+    };
+
     // Iterate over the raw input bytes, skipping the sign if any
     let mut bs = bs.iter().enumerate().skip(signed as usize);
 
@@ -903,7 +898,7 @@ pub fn parse_decimal<T: DecimalType>(
                                 digits as u16,
                                 fractionals as i16,
                                 result,
-                                point_index,
+                                point_index + 1,
                                 precision as u16,
                                 scale as i16,
                             )?;
@@ -916,7 +911,7 @@ pub fn parse_decimal<T: DecimalType>(
                             "can't parse the string value {s} to decimal"
                         )));
                     }
-                    if fractionals == scale && scale != 0 {
+                    if fractionals == scale {
                         // We have processed all the digits that we need. All that
                         // is left is to validate that the rest of the string contains
                         // valid digits.
@@ -931,13 +926,6 @@ pub fn parse_decimal<T: DecimalType>(
                 if is_e_notation {
                     break;
                 }
-
-                // Fail on "."
-                if digits == 0 {
-                    return Err(ArrowError::ParseError(format!(
-                        "can't parse the string value {s} to decimal"
-                    )));
-                }
             }
             b'e' | b'E' => {
                 result = parse_e_notation::<T>(
@@ -2613,6 +2601,9 @@ mod tests {
             "1.1e.12",
             "1.23e+3.",
             "1.23e+3.1",
+            "1e",
+            "1e+",
+            "1e-",
         ];
         for s in can_not_parse_tests {
             let result_128 = parse_decimal::<Decimal128Type>(s, 20, 3);
@@ -2636,6 +2627,7 @@ mod tests {
             ("12345678908765.123456", 3),
             ("123456789087651234.56e-4", 3),
             ("1234560000000", 0),
+            ("12345678900.0", 0),
             ("1.23456e12", 0),
         ];
         for (s, scale) in overflow_parse_tests {
@@ -2752,6 +2744,45 @@ mod tests {
             let result = parse_decimal::<Decimal256Type>(s, 76, scale);
             assert_eq!(i, result.unwrap());
         }
+
+        let zero_scale_tests = [
+            (".123", 0, 3),
+            ("0.123", 0, 3),
+            ("1.0", 1, 3),
+            ("1.2", 1, 3),
+            ("1.00", 1, 3),
+            ("1.23", 1, 3),
+            ("1.000", 1, 3),
+            ("1.123", 1, 3),
+            ("123.0", 123, 3),
+            ("123.4", 123, 3),
+            ("123.00", 123, 3),
+            ("123.45", 123, 3),
+            ("123.000000000000000000004", 123, 3),
+            ("0.123e2", 12, 3),
+            ("0.123e4", 1230, 10),
+            ("1.23e4", 12300, 10),
+            ("12.3e4", 123000, 10),
+            ("123e4", 1230000, 10),
+            (
+                "20000000000000000000000000000000000002.0",
+                20000000000000000000000000000000000002,
+                38,
+            ),
+        ];
+        for (s, i, precision) in zero_scale_tests {
+            let result_128 = parse_decimal::<Decimal128Type>(s, precision, 0).unwrap();
+            assert_eq!(i, result_128);
+        }
+
+        let can_not_parse_zero_scale = [".", "blag", "", "+", "-", "e"];
+        for s in can_not_parse_zero_scale {
+            let result_128 = parse_decimal::<Decimal128Type>(s, 5, 0);
+            assert_eq!(
+                format!("Parser error: can't parse the string value {s} to decimal"),
+                result_128.unwrap_err().to_string(),
+            );
+        }
     }
 
     #[test]