[integer] Optimize BigUInt addition and subtraction with SIMD and early stop tricks (#101)

forfudan · web-flow · commit 88e9f7b6c64a · 2025-07-07T20:50:40.000+02:00
This pull request introduces significant updates to the `bench_biguint`
benchmarking suite, optimizes arithmetic operations in the `BigUInt` and
`BigDecimal` modules, and renames methods for clarity and consistency.
Additionally, new benchmark cases and constants are added to improve
performance testing and support for larger numbers.

1. Use SIMD to accelerate BigUInt addition and in-place addition. The
speed gain is 2x to 4x for large numbers.
2. Refine the BigUInt subtraction and in-place addition with
carry-handling tricks so that floor_divide and modulo operations are
replaced by addition and subtraction.
3. Use a trick to first do a parallel word-by-word addition, and then
normalize the carries in a single loop.

### Arithmetic Optimizations:
*
[`src/decimojo/bigdecimal/arithmetics.mojo`](diffhunk://#diff-f79534f4e7fdd891932ce9d015c50bd3c8a72c4a1689f0cb55524490ffc0458dL73-R74):
Replaced calls to `scale_up_by_power_of_10` and
`scale_down_by_power_of_10` with the renamed `multiply_by_power_of_ten`
and `floor_divide_by_power_of_ten`, improving naming consistency and
clarity.
[[1]](diffhunk://#diff-f79534f4e7fdd891932ce9d015c50bd3c8a72c4a1689f0cb55524490ffc0458dL73-R74)
[[2]](diffhunk://#diff-f79534f4e7fdd891932ce9d015c50bd3c8a72c4a1689f0cb55524490ffc0458dL304-R304)
[[3]](diffhunk://#diff-f79534f4e7fdd891932ce9d015c50bd3c8a72c4a1689f0cb55524490ffc0458dL440-R445)
*
[`src/decimojo/biguint/biguint.mojo`](diffhunk://#diff-f9432b9b2671643af91201f9e3f011551a3d3b0e6d7b256d0d4569f5ae59848aL1003-L1010):
Removed redundant `add_inplace_by_1` method and replaced it with a more
general `add_inplace_by_uint32` for optimized addition operations.
[[1]](diffhunk://#diff-f9432b9b2671643af91201f9e3f011551a3d3b0e6d7b256d0d4569f5ae59848aL1003-L1010)
[[2]](diffhunk://#diff-f9432b9b2671643af91201f9e3f011551a3d3b0e6d7b256d0d4569f5ae59848aL1437-R1439)

### Refactoring and Enhancements:
*
[`src/decimojo/biguint/biguint.mojo`](diffhunk://#diff-f9432b9b2671643af91201f9e3f011551a3d3b0e6d7b256d0d4569f5ae59848aR74-R75):
Renamed methods (e.g., `scale_up_by_power_of_10` →
`multiply_by_power_of_ten`) for consistency across the codebase and
introduced `VECTOR_WIDTH` constant for SIMD-based arithmetic
optimizations.
[[1]](diffhunk://#diff-f9432b9b2671643af91201f9e3f011551a3d3b0e6d7b256d0d4569f5ae59848aR74-R75)
[[2]](diffhunk://#diff-f9432b9b2671643af91201f9e3f011551a3d3b0e6d7b256d0d4569f5ae59848aL1070-R1095)
*
[`src/decimojo/bigdecimal/comparison.mojo`](diffhunk://#diff-04237ffa697ff22a4879812f65a72c23bc5d3e183b58f11e437c94836bd43da3L66-R70):
Updated comparison logic to use the newly renamed
`multiply_by_power_of_ten` method for scaling coefficients.

### Benchmarking Updates:
*
[`benches/biguint/bench_biguint_add.mojo`](diffhunk://#diff-967ad165864a3f276ee27b8eca0721f132d904f71ffb3da60003a75aec8837efR460-R509):
Added five new addition benchmark cases for larger word sizes (e.g.,
4096 words + 2048 words) to test scalability.
*
[`benches/biguint/bench_biguint_multiply.mojo`](diffhunk://#diff-3fba3fe441d30e17e77d7e18b33b2508452b08f07e7af177d413c08b5b5c88c2L463-R584):
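Expanded multiplication benchmarks to include 12 new cases with word
sizes doubling from 2 to 4096 words (replacing the two previous
digit-count-based large-number cases), and introduced a reduced
iteration count (`iterations_large`) for operands of 128 words or more
to avoid long runtimes.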
*
[`benches/biguint/bench_biguint_multiply_complexity.mojo`](diffhunk://#diff-d0d1723b5108046f6dc332ce4cf856979f00576061017e71660d38bcd536b31fL132-R132):
Adjusted test sizes to start from 8 words instead of 32 and updated
iteration logic for benchmarking complexity.
[[1]](diffhunk://#diff-d0d1723b5108046f6dc332ce4cf856979f00576061017e71660d38bcd536b31fL132-R132)
[[2]](diffhunk://#diff-d0d1723b5108046f6dc332ce4cf856979f00576061017e71660d38bcd536b31fL141-R144)

These changes collectively enhance the code's readability, scalability,
and performance, especially for operations involving large numbers and
benchmarking scenarios.
diff --git a/benches/biguint/bench_biguint_add.mojo b/benches/biguint/bench_biguint_add.mojo
@@ -21,6 +21,8 @@ fn open_log_file() raises -> PythonObject:
     """
     var python = Python.import_module("builtins")
     var datetime = Python.import_module("datetime")
+    var pysys = Python.import_module("sys")
+    pysys.set_int_max_str_digits(1000000)
 
     # Create logs directory if it doesn't exist
     var log_dir = "./logs"
@@ -455,6 +457,56 @@ fn main() raises:
         speedup_factors,
     )
 
+    # Case 31: Addition with 64 words + 32 words
+    run_benchmark_add(
+        "Addition with 64 words + 32 words",
+        "123456789" * 64,
+        "987654321" * 32,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 32: Addition with 256 words + 128 words
+    run_benchmark_add(
+        "Addition with 256 words + 128 words",
+        "123456789" * 256,
+        "987654321" * 128,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 33: Addition with 1024 words + 512 words
+    run_benchmark_add(
+        "Addition with 1024 words + 512 words",
+        "123456789" * 1024,
+        "987654321" * 512,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 34: Addition with 4096 words + 2048 words
+    run_benchmark_add(
+        "Addition with 4096 words + 2048 words",
+        "123456789" * 4096,
+        "987654321" * 2048,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 35: Addition with 16384 words + 8192 words
+    run_benchmark_add(
+        "Addition with 16384 words + 8192 words",
+        "123456789" * 16384,
+        "987654321" * 8192,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
     # Calculate average speedup factor
     var sum_speedup: Float64 = 0.0
     for i in range(len(speedup_factors)):
diff --git a/benches/biguint/bench_biguint_multiply.mojo b/benches/biguint/bench_biguint_multiply.mojo
@@ -150,7 +150,9 @@ fn main() raises:
         log_print("Could not retrieve system information", log_file)
 
     # Use fewer iterations for multiplication as it's more compute-intensive
+    # For large numbers, we reduce iterations to avoid long runtimes
     var iterations = 100
+    var iterations_large = 20
 
     # Define benchmark cases
     log_print(
@@ -460,26 +462,126 @@ fn main() raises:
         speedup_factors,
     )
 
-    # Case 31: Very, very large numbers multiplication
+    # Case 31: 2 words * 2 words multiplication
     run_benchmark_multiply(
-        "Extreme large numbers multiplication (9000 digits * 9000 digits)",
-        "123456789" * 1000,  # 9000 digits
-        "987654321" * 1000,  # 9000 digits
+        "2 words * 2 words multiplication",
+        "123456789" * 2,
+        "987654321" * 2,
         iterations,
         log_file,
         speedup_factors,
     )
 
-    # Case 32: Extremely large numbers multiplication
+    # Case 32: 4 words * 4 words multiplication
     run_benchmark_multiply(
-        "Extreme large numbers multiplication (36000 digits * 36000 digits)",
-        "123456789" * 4000,  # 36000 digits
-        "987654321" * 4000,  # 36000 digits
+        "4 words * 4 words multiplication",
+        "123456789" * 4,
+        "987654321" * 4,
         iterations,
         log_file,
         speedup_factors,
     )
 
+    # Case 33: 8 words * 8 words multiplication
+    run_benchmark_multiply(
+        "8 words * 8 words multiplication",
+        "123456789" * 8,
+        "987654321" * 8,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 34: 16 words * 16 words multiplication
+    run_benchmark_multiply(
+        "16 words * 16 words multiplication",
+        "123456789" * 16,
+        "987654321" * 16,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 35: 32 words * 32 words multiplication
+    run_benchmark_multiply(
+        "32 words * 32 words multiplication",
+        "123456789" * 32,
+        "987654321" * 32,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 36: 64 words * 64 words multiplication
+    run_benchmark_multiply(
+        "64 words * 64 words multiplication",
+        "123456789" * 64,
+        "987654321" * 64,
+        iterations,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 37: 128 words * 128 words multiplication
+    run_benchmark_multiply(
+        "128 words * 128 words multiplication",
+        "123456789" * 128,
+        "987654321" * 128,
+        iterations_large,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 38: 256 words * 256 words multiplication
+    run_benchmark_multiply(
+        "256 words * 256 words multiplication",
+        "123456789" * 256,
+        "987654321" * 256,
+        iterations_large,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 39: 512 words * 512 words multiplication
+    run_benchmark_multiply(
+        "512 words * 512 words multiplication",
+        "123456789" * 512,
+        "987654321" * 512,
+        iterations_large,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 40: 1024 words * 1024 words multiplication
+    run_benchmark_multiply(
+        "1024 words * 1024 words multiplication",
+        "123456789" * 1024,
+        "987654321" * 1024,
+        iterations_large,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 41: 2048 words * 2048 words multiplication
+    run_benchmark_multiply(
+        "2048 words * 2048 words multiplication",
+        "123456789" * 2048,
+        "987654321" * 2048,
+        iterations_large,
+        log_file,
+        speedup_factors,
+    )
+
+    # Case 42: 4096 words * 4096 words multiplication
+    run_benchmark_multiply(
+        "4096 words * 4096 words multiplication",
+        "123456789" * 4096,
+        "987654321" * 4096,
+        iterations_large,
+        log_file,
+        speedup_factors,
+    )
+
     # Calculate average speedup factor
     var sum_speedup: Float64 = 0.0
     for i in range(len(speedup_factors)):
diff --git a/benches/biguint/bench_biguint_multiply_complexity.mojo b/benches/biguint/bench_biguint_multiply_complexity.mojo
@@ -129,7 +129,7 @@ fn main() raises:
 
     log_print("", log_file)
     log_print(
-        "Testing word sizes from 32 to 262144 words (powers of 2)", log_file
+        "Testing word sizes from 8 to 262144 words (powers of 2)", log_file
     )
     log_print("Each test uses 5 iterations for averaging", log_file)
     log_print(
@@ -138,8 +138,10 @@ fn main() raises:
     )
     log_print("", log_file)
 
-    # Test sizes: powers of 2 from 32 to 262144
+    # Test sizes: powers of 2 from 8 to 262144
     var test_sizes = List[Int]()
+    test_sizes.append(8)
+    test_sizes.append(16)
     test_sizes.append(32)
     test_sizes.append(64)
     test_sizes.append(128)
diff --git a/src/decimojo/bigdecimal/arithmetics.mojo b/src/decimojo/bigdecimal/arithmetics.mojo
@@ -70,8 +70,8 @@ fn add(x1: BigDecimal, x2: BigDecimal) raises -> BigDecimal:
         return x1.extend_precision(scale_factor1)
 
     # Scale coefficients to match
-    var coef1 = x1.coefficient.scale_up_by_power_of_10(scale_factor1)
-    var coef2 = x2.coefficient.scale_up_by_power_of_10(scale_factor2)
+    var coef1 = x1.coefficient.multiply_by_power_of_ten(scale_factor1)
+    var coef2 = x2.coefficient.multiply_by_power_of_ten(scale_factor2)
 
     # Handle addition based on signs
     if x1.sign == x2.sign:
@@ -135,8 +135,8 @@ fn subtract(x1: BigDecimal, x2: BigDecimal) raises -> BigDecimal:
         return result^
 
     # Scale coefficients to match
-    var coef1 = x1.coefficient.scale_up_by_power_of_10(scale_factor1)
-    var coef2 = x2.coefficient.scale_up_by_power_of_10(scale_factor2)
+    var coef1 = x1.coefficient.multiply_by_power_of_ten(scale_factor1)
+    var coef2 = x2.coefficient.multiply_by_power_of_ten(scale_factor2)
 
     # Handle subtraction based on signs
     if x1.sign != x2.sign:
@@ -278,7 +278,7 @@ fn true_divide(
     # Scale up the dividend to ensure sufficient precision
     var scaled_x1 = x1.coefficient
     if additional_digits > 0:
-        scaled_x1.scale_up_inplace_by_power_of_10(additional_digits)
+        scaled_x1.multiply_inplace_by_power_of_ten(additional_digits)
 
     # Perform division
     var quotient: BigUInt
@@ -301,7 +301,7 @@ fn true_divide(
     if is_exact:
         var num_trailing_zeros = quotient.number_of_trailing_zeros()
         if num_trailing_zeros > 0:
-            quotient = quotient.scale_down_by_power_of_10(num_trailing_zeros)
+            quotient = quotient.floor_divide_by_power_of_ten(num_trailing_zeros)
             result_scale -= num_trailing_zeros
             # Recalculate digits after removing trailing zeros
             result_digits = quotient.number_of_digits()
@@ -382,7 +382,7 @@ fn true_divide_inexact(
     # Scale up the dividend to ensure sufficient precision
     var scaled_x1 = x1.coefficient
     if buffer_digits > 0:
-        scaled_x1.scale_up_inplace_by_power_of_10(buffer_digits)
+        scaled_x1.multiply_inplace_by_power_of_ten(buffer_digits)
 
     # Perform division
     var quotient: BigUInt = scaled_x1 // x2.coefficient
@@ -437,12 +437,12 @@ fn truncate_divide(x1: BigDecimal, x2: BigDecimal) raises -> BigDecimal:
     # If scale_diff is positive, we need to scale up the dividend
     # If scale_diff is negative, we need to scale up the divisor
     if scale_diff > 0:
-        var divisor = x2.coefficient.scale_up_by_power_of_10(scale_diff)
+        var divisor = x2.coefficient.multiply_by_power_of_ten(scale_diff)
         var quotient = x1.coefficient.truncate_divide(divisor)
         return BigDecimal(quotient^, 0, x1.sign != x2.sign)
 
     else:  # scale_diff < 0
-        var dividend = x1.coefficient.scale_up_by_power_of_10(-scale_diff)
+        var dividend = x1.coefficient.multiply_by_power_of_ten(-scale_diff)
         var quotient = dividend.truncate_divide(x2.coefficient)
         return BigDecimal(quotient^, 0, x1.sign != x2.sign)
 
diff --git a/src/decimojo/bigdecimal/comparison.mojo b/src/decimojo/bigdecimal/comparison.mojo
@@ -63,11 +63,11 @@ fn compare_absolute(x1: BigDecimal, x2: BigDecimal) -> Int8:
 
     if scale_diff > 0:
         # x1 has larger scale (more decimal places)
-        var scaled_x2 = x2.coefficient.scale_up_by_power_of_10(scale_diff)
+        var scaled_x2 = x2.coefficient.multiply_by_power_of_ten(scale_diff)
         return x1.coefficient.compare(scaled_x2^)
     else:
         # x2 has larger scale (more decimal places)
-        var scaled_x1 = x1.coefficient.scale_up_by_power_of_10(-scale_diff)
+        var scaled_x1 = x1.coefficient.multiply_by_power_of_ten(-scale_diff)
         return scaled_x1.compare(x2.coefficient)
 
 
diff --git a/src/decimojo/bigdecimal/exponential.mojo b/src/decimojo/bigdecimal/exponential.mojo
@@ -156,11 +156,13 @@ fn integer_power(
     var abs_exp = abs(exponent)
     var exp_value: BigUInt
     if abs_exp.scale > 0:
-        exp_value = abs_exp.coefficient.scale_down_by_power_of_10(abs_exp.scale)
+        exp_value = abs_exp.coefficient.floor_divide_by_power_of_ten(
+            abs_exp.scale
+        )
     elif abs_exp.scale == 0:
         exp_value = abs_exp.coefficient
     else:
-        exp_value = abs_exp.coefficient.scale_up_by_power_of_10(-abs_exp.scale)
+        exp_value = abs_exp.coefficient.multiply_by_power_of_ten(-abs_exp.scale)
 
     var result = BigDecimal(BigUInt.ONE, 0, False)
     var current_power = base
@@ -357,7 +359,7 @@ fn integer_root(
         # Convert n to integer to check odd/even
         var n_uint: BigUInt
         if n.scale > 0:
-            n_uint = n.coefficient.scale_down_by_power_of_10(n.scale)
+            n_uint = n.coefficient.floor_divide_by_power_of_ten(n.scale)
         else:  # n.scale <= 0
             n_uint = n.coefficient
 
diff --git a/src/decimojo/bigint/bigint.mojo b/src/decimojo/bigint/bigint.mojo
@@ -523,8 +523,10 @@ struct BigInt(Absable, IntableRaising, Representable, Stringable, Writable):
     @always_inline
     fn __iadd__(mut self, other: Int) raises:
         # Optimize the case `i += 1`
-        if (self >= 0) and (other == 1):
-            self.magnitude.add_inplace_by_1()
+        if (self >= 0) and (other >= 0) and (other <= 999_999_999):
+            decimojo.biguint.arithmetics.add_inplace_by_uint32(
+                self.magnitude, UInt32(other)
+            )
         else:
             decimojo.bigint.arithmetics.add_inplace(self, other)
 
diff --git a/src/decimojo/biguint/arithmetics.mojo b/src/decimojo/biguint/arithmetics.mojo
diff --git a/src/decimojo/biguint/biguint.mojo b/src/decimojo/biguint/biguint.mojo