From 5731bfc397637e4304d6398b866bacebf6a256fd Mon Sep 17 00:00:00 2001 From: Jack Kleeman Date: Fri, 22 Aug 2025 16:49:25 +0100 Subject: [PATCH 1/2] Use multiply shift instead of u128 division to 2x encoding speed ``` encode/standard_new_fixed time: [32.628 ns 32.801 ns 33.085 ns] change: [-41.146% -39.914% -39.003%] (p = 0.00 < 0.05) Performance has improved. Found 4 outliers among 100 measurements (4.00%) 2 (2.00%) high mild 2 (2.00%) high severe encode/standard_new_random time: [30.778 ns 31.021 ns 31.250 ns] change: [-46.061% -45.407% -44.802%] (p = 0.00 < 0.05) Performance has improved. encode/standard_bytes_fixed time: [21.184 ns 21.255 ns 21.352 ns] change: [-50.594% -49.762% -48.248%] (p = 0.00 < 0.05) Performance has improved. Found 3 outliers among 100 measurements (3.00%) 1 (1.00%) high mild 2 (2.00%) high severe encode/standard_bytes_random time: [23.588 ns 23.664 ns 23.744 ns] change: [-49.891% -49.451% -48.838%] (p = 0.00 < 0.05) Performance has improved. Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe encode/standard_buf_fixed time: [36.390 ns 36.488 ns 36.590 ns] change: [-35.510% -35.272% -34.976%] (p = 0.00 < 0.05) Performance has improved. Found 3 outliers among 100 measurements (3.00%) 2 (2.00%) high mild 1 (1.00%) high severe encode/standard_buf_random time: [23.543 ns 23.669 ns 23.813 ns] change: [-48.243% -47.933% -47.579%] (p = 0.00 < 0.05) Performance has improved. Found 4 outliers among 100 measurements (4.00%) 1 (1.00%) high mild 3 (3.00%) high severe encode/alternative_new_fixed time: [32.984 ns 33.088 ns 33.180 ns] change: [-39.498% -39.270% -39.035%] (p = 0.00 < 0.05) Performance has improved. Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high severe encode/alternative_new_random time: [30.405 ns 30.618 ns 30.859 ns] change: [-45.348% -44.161% -42.669%] (p = 0.00 < 0.05) Performance has improved. 
Found 3 outliers among 100 measurements (3.00%) 1 (1.00%) high mild 2 (2.00%) high severe encode/alternative_bytes_fixed time: [21.306 ns 21.358 ns 21.419 ns] change: [-50.234% -50.088% -49.940%] (p = 0.00 < 0.05) Performance has improved. Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) high mild 1 (1.00%) high severe encode/alternative_bytes_random time: [24.073 ns 24.297 ns 24.546 ns] change: [-49.165% -48.685% -48.177%] (p = 0.00 < 0.05) Performance has improved. Found 4 outliers among 100 measurements (4.00%) 2 (2.00%) high mild 2 (2.00%) high severe encode/alternative_buf_fixed time: [36.533 ns 36.621 ns 36.718 ns] change: [-35.979% -35.560% -35.046%] (p = 0.00 < 0.05) Performance has improved. Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) high mild 1 (1.00%) high severe encode/alternative_buf_random time: [23.507 ns 23.579 ns 23.658 ns] change: [-49.875% -49.461% -49.169%] (p = 0.00 < 0.05) Performance has improved. Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high mild ``` --- src/lib.rs | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 155d55a..3ec6792 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -46,6 +46,11 @@ const BASE_TO_19: u128 = BASE_TO_18 * BASE as u128; const BASE_TO_20: u128 = BASE_TO_19 * BASE as u128; const BASE_TO_21: u128 = BASE_TO_20 * BASE as u128; +// Rust does not apply strength reduction to u128 integer division; instead, it calls the very slow __udivti3 intrinsic. +// This multiply and shift (high half of the 256-bit product, then a right shift) is equivalent to dividing by BASE_TO_10. 
+const DIV_BASE_TO_10_MULTIPLY: u128 = 233718071534448225491982379416108680074; +const DIV_BASE_TO_10_SHIFT: u8 = 59; + struct Base62Tables { standard: [u8; 62], alternative: [u8; 62], @@ -530,16 +535,11 @@ pub fn decode_alternative>(input: T) -> Result } // Common encoding function -unsafe fn encode_impl( - mut num: u128, - digits: usize, - buf: &mut [u8], - encode_table: &[u8; 62], -) -> usize { +unsafe fn encode_impl(num: u128, digits: usize, buf: &mut [u8], encode_table: &[u8; 62]) -> usize { let mut write_idx = digits; let mut digit_index = 0_usize; - let mut u64_num = (num % BASE_TO_10) as u64; - num /= BASE_TO_10; + + let (mut num, mut u64_num) = div_base_to_10(num); while digit_index < digits { write_idx = write_idx.wrapping_sub(1); @@ -552,8 +552,7 @@ unsafe fn encode_impl( digit_index = digit_index.wrapping_add(1); match digit_index { 10 => { - u64_num = (num % BASE_TO_10) as u64; - num /= BASE_TO_10; + (num, u64_num) = div_base_to_10(num); } 20 => u64_num = num as u64, _ => u64_num = quotient, @@ -563,6 +562,33 @@ unsafe fn encode_impl( digits } +fn div_base_to_10(num: u128) -> (u128, u64) { + let quotient = mulh(DIV_BASE_TO_10_MULTIPLY, num) >> DIV_BASE_TO_10_SHIFT; + let remainder = num - BASE_TO_10 * quotient; + (quotient, remainder as u64) +} + +// Multiply two u128 together, returning only the top half of the product. 
+const fn mulh(x: u128, y: u128) -> u128 { + const LOWER_HALF_MASK: u128 = (1 << 64) - 1; + + let x_low = x & LOWER_HALF_MASK; + let y_low = y & LOWER_HALF_MASK; + let t = x_low.wrapping_mul(y_low); + let k = t >> 64; + + let x_high = x >> 64; + let t = x_high.wrapping_mul(y_low) + k; + let k = t & LOWER_HALF_MASK; + let w1 = t >> 64; + + let y_high = y >> 64; + let t = x_low.wrapping_mul(y_high) + k; + let k = t >> 64; + + x_high.wrapping_mul(y_high) + w1 + k +} + unsafe fn _encode_buf(num: u128, digits: usize, buf: &mut [u8]) -> usize { encode_impl(num, digits, buf, &TABLES.standard) } From 830bb015f7f9df92b5aeb1c3322fafeb6c6f2884 Mon Sep 17 00:00:00 2001 From: Jack Kleeman Date: Sat, 23 Aug 2025 08:39:12 +0100 Subject: [PATCH 2/2] Add quickcheck for fast division --- src/lib.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 3ec6792..cc4f8cf 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -824,6 +824,16 @@ mod tests { } } } + + quickcheck! { + fn div_base_to_10_matches_std(num: u128) -> bool { + let (quotient_fast, remainder_fast) = div_base_to_10(num); + let quotient = num / BASE_TO_10; + let remainder = (num % BASE_TO_10) as u64; + + quotient == quotient_fast && remainder == remainder_fast + } + } } // Pure computation tests that don't need allocation