From 5731bfc397637e4304d6398b866bacebf6a256fd Mon Sep 17 00:00:00 2001
From: Jack Kleeman <jackkleeman@gmail.com>
Date: Fri, 22 Aug 2025 16:49:25 +0100
Subject: [PATCH 1/2] Use multiply shift instead of u128 division to 2x
 encoding speed

```
encode/standard_new_fixed
                        time:   [32.628 ns 32.801 ns 33.085 ns]
                        change: [-41.146% -39.914% -39.003%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  2 (2.00%) high mild
  2 (2.00%) high severe
encode/standard_new_random
                        time:   [30.778 ns 31.021 ns 31.250 ns]
                        change: [-46.061% -45.407% -44.802%] (p = 0.00 < 0.05)
                        Performance has improved.
encode/standard_bytes_fixed
                        time:   [21.184 ns 21.255 ns 21.352 ns]
                        change: [-50.594% -49.762% -48.248%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) high mild
  2 (2.00%) high severe
encode/standard_bytes_random
                        time:   [23.588 ns 23.664 ns 23.744 ns]
                        change: [-49.891% -49.451% -48.838%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  3 (3.00%) high mild
  1 (1.00%) high severe
encode/standard_buf_fixed
                        time:   [36.390 ns 36.488 ns 36.590 ns]
                        change: [-35.510% -35.272% -34.976%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  2 (2.00%) high mild
  1 (1.00%) high severe
encode/standard_buf_random
                        time:   [23.543 ns 23.669 ns 23.813 ns]
                        change: [-48.243% -47.933% -47.579%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  1 (1.00%) high mild
  3 (3.00%) high severe
encode/alternative_new_fixed
                        time:   [32.984 ns 33.088 ns 33.180 ns]
                        change: [-39.498% -39.270% -39.035%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high severe
encode/alternative_new_random
                        time:   [30.405 ns 30.618 ns 30.859 ns]
                        change: [-45.348% -44.161% -42.669%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) high mild
  2 (2.00%) high severe
encode/alternative_bytes_fixed
                        time:   [21.306 ns 21.358 ns 21.419 ns]
                        change: [-50.234% -50.088% -49.940%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) high mild
  1 (1.00%) high severe
encode/alternative_bytes_random
                        time:   [24.073 ns 24.297 ns 24.546 ns]
                        change: [-49.165% -48.685% -48.177%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 4 outliers among 100 measurements (4.00%)
  2 (2.00%) high mild
  2 (2.00%) high severe
encode/alternative_buf_fixed
                        time:   [36.533 ns 36.621 ns 36.718 ns]
                        change: [-35.979% -35.560% -35.046%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 2 outliers among 100 measurements (2.00%)
  1 (1.00%) high mild
  1 (1.00%) high severe
encode/alternative_buf_random
                        time:   [23.507 ns 23.579 ns 23.658 ns]
                        change: [-49.875% -49.461% -49.169%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
```
---
 src/lib.rs | 46 ++++++++++++++++++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 10 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 155d55a..3ec6792 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -46,6 +46,11 @@ const BASE_TO_19: u128 = BASE_TO_18 * BASE as u128;
 const BASE_TO_20: u128 = BASE_TO_19 * BASE as u128;
 const BASE_TO_21: u128 = BASE_TO_20 * BASE as u128;
 
+// Rust does not strength-reduce u128 division by a constant; it calls the very slow `__udivti3` intrinsic instead.
+// Multiplying by this constant and shifting the 256-bit product right by 128 + DIV_BASE_TO_10_SHIFT bits is equivalent to dividing by BASE_TO_10.
+const DIV_BASE_TO_10_MULTIPLY: u128 = 233718071534448225491982379416108680074;
+const DIV_BASE_TO_10_SHIFT: u8 = 59;
+
 struct Base62Tables {
     standard: [u8; 62],
     alternative: [u8; 62],
@@ -530,16 +535,11 @@ pub fn decode_alternative<T: AsRef<[u8]>>(input: T) -> Result<u128, DecodeError>
 }
 
 // Common encoding function
-unsafe fn encode_impl(
-    mut num: u128,
-    digits: usize,
-    buf: &mut [u8],
-    encode_table: &[u8; 62],
-) -> usize {
+unsafe fn encode_impl(num: u128, digits: usize, buf: &mut [u8], encode_table: &[u8; 62]) -> usize {
     let mut write_idx = digits;
     let mut digit_index = 0_usize;
-    let mut u64_num = (num % BASE_TO_10) as u64;
-    num /= BASE_TO_10;
+
+    let (mut num, mut u64_num) = div_base_to_10(num);
 
     while digit_index < digits {
         write_idx = write_idx.wrapping_sub(1);
@@ -552,8 +552,7 @@ unsafe fn encode_impl(
         digit_index = digit_index.wrapping_add(1);
         match digit_index {
             10 => {
-                u64_num = (num % BASE_TO_10) as u64;
-                num /= BASE_TO_10;
+                (num, u64_num) = div_base_to_10(num);
             }
             20 => u64_num = num as u64,
             _ => u64_num = quotient,
@@ -563,6 +562,33 @@ unsafe fn encode_impl(
     digits
 }
 
+fn div_base_to_10(num: u128) -> (u128, u64) {
+    // q = (MULTIPLY * num) >> (128 + SHIFT): `mulh` supplies the `>> 128`, the explicit shift does the rest.
+    let q = mulh(DIV_BASE_TO_10_MULTIPLY, num) >> DIV_BASE_TO_10_SHIFT;
+    (q, (num - q * BASE_TO_10) as u64) // (quotient, remainder) of num / BASE_TO_10
+}
+
+// High 128 bits of the 256-bit product `x * y`, built from four 64x64-bit partial products.
+const fn mulh(x: u128, y: u128) -> u128 {
+    const LIMB_MASK: u128 = u64::MAX as u128;
+
+    // Split each operand into 64-bit limbs; every limb product then fits in a u128.
+    let (x_hi, x_lo) = (x >> 64, x & LIMB_MASK);
+    let (y_hi, y_lo) = (y >> 64, y & LIMB_MASK);
+
+    // Only the carry out of the lowest partial product reaches the upper half.
+    let carry_lo = x_lo.wrapping_mul(y_lo) >> 64;
+
+    // First cross term plus that carry: < 2^128, so the plain `+` cannot overflow.
+    let cross_a = x_hi.wrapping_mul(y_lo) + carry_lo;
+    let (cross_a_hi, cross_a_lo) = (cross_a >> 64, cross_a & LIMB_MASK);
+
+    // Second cross term plus the low limb of the first; keep only its carry.
+    let carry_mid = (x_lo.wrapping_mul(y_hi) + cross_a_lo) >> 64;
+
+    x_hi.wrapping_mul(y_hi) + cross_a_hi + carry_mid
+}
+
 unsafe fn _encode_buf(num: u128, digits: usize, buf: &mut [u8]) -> usize {
     encode_impl(num, digits, buf, &TABLES.standard)
 }

From 830bb015f7f9df92b5aeb1c3322fafeb6c6f2884 Mon Sep 17 00:00:00 2001
From: Jack Kleeman <jackkleeman@gmail.com>
Date: Sat, 23 Aug 2025 08:39:12 +0100
Subject: [PATCH 2/2] Add quickcheck for fast division

---
 src/lib.rs | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/lib.rs b/src/lib.rs
index 3ec6792..cc4f8cf 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -824,6 +824,16 @@ mod tests {
                 }
             }
         }
+
+        quickcheck! {
+            // Property: the multiply-shift fast path agrees with the
+            // built-in u128 `/` and `%` operators for arbitrary inputs.
+            fn div_base_to_10_matches_std(num: u128) -> bool {
+                let expected = (num / BASE_TO_10, (num % BASE_TO_10) as u64);
+
+                div_base_to_10(num) == expected
+            }
+        }
     }
 
     // Pure computation tests that don't need allocation