From 2e44f6742eaf9c84fad9fe1c4dc883ee138a3be5 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Mon, 15 Dec 2025 19:29:20 +0800
Subject: [PATCH 01/12] M31: init

---
 Cargo.toml       |  2 +-
 cm31/Cargo.toml  | 14 ++++++++
 cm31/src/lib.rs  | 69 ++++++++++++++++++++++++++++++++++++
 cm31/src/main.rs | 14 ++++++++
 cm31/src/u32.rs  | 92 ++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 cm31/Cargo.toml
 create mode 100644 cm31/src/lib.rs
 create mode 100644 cm31/src/main.rs
 create mode 100644 cm31/src/u32.rs
diff --git a/Cargo.toml b/Cargo.toml
index 72dfd4db..71e5202b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,7 +14,7 @@ members = [
   "tooling/provekit-bench",
   "tooling/provekit-gnark",
   "tooling/verifier-server",
-  "ntt",
+  "ntt", "cm31",
 ]
 exclude = [
   "playground/passport-input-gen",
diff --git a/cm31/Cargo.toml b/cm31/Cargo.toml
new file mode 100644
index 00000000..465dc292
--- /dev/null
+++ b/cm31/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "cm31"
+version = "0.1.0"
+edition.workspace = true
+rust-version.workspace = true
+authors.workspace = true
+license.workspace = true
+homepage.workspace = true
+repository.workspace = true
+
+[dependencies]
+
+[lints]
+workspace = true
diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
new file mode 100644
index 00000000..b2cceb27
--- /dev/null
+++ b/cm31/src/lib.rs
@@ -0,0 +1,69 @@
+use std::ops::{Add, Mul, Neg, Shl, Sub};
+
+struct M31I<T>(T);
+
+impl<T: Into<u64>> Add<M31I<T>> for M31I<u64> {
+    type Output = M31I<u64>;
+
+    fn add(self, rhs: M31I<T>) -> Self::Output {
+        Self(self.0 + rhs.0.into())
+    }
+}
+
+trait Reduce {
+    fn reduce_u32(self) -> M31I<u32>;
+}
+
+impl Reduce for M31I<u64> {
+    // Assumes bits(hi) < 31
+    // Safety: this should only be called on M31I<u64> whose range is [61:0] with
+    // the top bits zero.
+    // The only way the upper bits can be set is due to additions. After a
+    // multiplication will go to 34 bits.
+    fn reduce_u32(self) -> M31I<u32> {
+        // TODO add debug assertion for top bits
+        // these operations are only intended for kernel
+        let reduced = reduce(self.0);
+        M31I(reduced as u32)
+    }
+}
+
+impl Reduce for M31I<u32> {
+    #[inline]
+    fn reduce_u32(self) -> M31I<u32> {
+        self
+    }
+}
+
+// Output bits
+// max(lo, hi) + 1
+// where lo max 31 bits and hi max 33 bits
+// so maximum is 34 bits
+fn reduce(r: u64) -> u64 {
+    // What are the precedence rules for shift?
+    // Three cheap operations mask, shift, addition
+    let lo = r & ((1 << 31) - 1);
+    let hi = r >> 31;
+    hi + lo
+}
+
+// Might not be the right type for the multiplication in a kernel.
+impl<T: Reduce> Mul<T> for M31I<u64> {
+    type Output = M31I<u64>;
+
+    fn mul(self, rhs: T) -> Self::Output {
+        let lhs = self.reduce_u32();
+        let rhs = rhs.reduce_u32();
+        let res = lhs.0 as u64 * rhs.0 as u64;
+        // After reduction 34 bits
+        M31I(reduce(res))
+    }
+}
+
+impl Sub for M31I<u64> {
+    type Output = M31I<u64>;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        todo!()
+    }
+}
diff --git a/cm31/src/main.rs b/cm31/src/main.rs
new file mode 100644
index 00000000..059d6a3a
--- /dev/null
+++ b/cm31/src/main.rs
@@ -0,0 +1,14 @@
+// Type your code here, or load an example.
+
+// As of Rust 1.75, small functions are automatically
+// marked as `#[inline]` so they will not show up in
+// the output when compiling with optimisations. Use
+// `#[no_mangle]` or `#[inline(never)]` to work around
+// this issue.
+// See https://github.com/compiler-explorer/compiler-explorer/issues/5939
+use std::hint::black_box;
+// If you use `main()`, declare it as `pub` to see it in the output:
+pub fn main() {
+    let (lo, hi) = (u32::carrying_mul_add(black_box(5), black_box(6), 0, 0));
+    let _res = black_box(2 * hi + lo);
+}
diff --git a/cm31/src/u32.rs b/cm31/src/u32.rs
new file mode 100644
index 00000000..e976984b
--- /dev/null
+++ b/cm31/src/u32.rs
@@ -0,0 +1,92 @@
+const P3: u64 = 3 * (1 << 31 - 1);
+
+// Tag can also be the internal type
+// 64 byte, 32 byte, 31 byte, 31 byte zero check. De laatste twee is
+// waarschijnlijk geen interessant ondersheid Aan de andere kant lijkt het geen
+struct M31(u32);
+
+// Introduce tag to track in which state the M31 is?
+// State
+impl M31 {
+    // Needs to return a value if it's going to be tagged with the internal state.
+    fn reduce_u31(&mut self) {
+        let tmp = self.0;
+        // TODO: mask
+        let tmp = tmp >> 31 + ((tmp << 1) >> 1);
+        let tmp = tmp >> 31 + ((tmp << 1) >> 1);
+        self.0 = tmp;
+    }
+}
+
+impl Add for M31 {
+    type Output = M31;
+
+    // 32 bit sized because otherwise there might be a masking operations
+    // while adds W will zero it out.
+    #[inline]
+    fn add(self, rhs: Self) -> Self::Output {
+        // Different ways of presenting it
+        // 32 bytes
+        // 64 bytes
+        // adds w
+        let (tmp, o) = self.0.overflowing_add(rhs.0);
+        // can't use adcs as it needs to be shifted first
+        // so 64 bit might be better as it doesn't require moving the carry to another
+        // register.
+        // Interested in how the compiler takes this?
+        let (tmp, o) = tmp.overflowing_add(2 * o as u32);
+        let out = tmp + 2 * o as u32;
+
+        Self(out)
+    }
+}
+
+impl Sub for M31 {
+    type Output = M31;
+
+    fn sub(self, rhs: Self) -> Self::Output {
+        // 33 bits + 32 bits -> 34 bits
+        let tmp = P3 + self.0 as u64 - rhs.0 as u64;
+        // 34 bits -> 32 bits
+        let hi = (tmp >> 32) as u32;
+        let lo = tmp as u32;
+        Self(2 * hi + lo)
+    }
+}
+
+impl Neg for M31 {
+    type Output = M31;
+
+    fn neg(self) -> Self::Output {
+        let tmp = P3 - self.0 as u64;
+        let hi = (tmp >> 32) as u32;
+        let lo = tmp as u32;
+        Self(2 * hi + lo)
+    }
+}
+
+impl Mul for M31 {
+    type Output = M31;
+
+    fn mul(self, rhs: Self) -> Self::Output {
+        let (lo, hi) = u32::carrying_mul_add(self.0, rhs.0, 0, 0);
+        // shift addition
+        // shift, shift, addition
+        // 64bits -> 34 bits
+        // 2 * hi -> 33 bits; 33 bits + 32 bits -> 34 bits
+        let tmp = 2 * hi as u64 + lo as u64;
+        // 34 bits -> 32 bits
+        let hi = (tmp >> 32) as u32;
+        let lo = tmp as u32;
+        Self(2 * hi + lo)
+    }
+}
+
+impl Shl<u8> for M31 {
+    type Output = M31;
+
+    fn shl(mut self, rhs: u8) -> Self::Output {
+        self.reduce_u31();
+        todo!();
+    }
+}

From 8eb50e99924e95e748c471fd1a419f0ba5a5a0c6 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 16 Dec 2025 10:36:13 +0800
Subject: [PATCH 02/12] M31: signed integers support

---
 cm31/src/lib.rs | 128 +++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 105 insertions(+), 23 deletions(-)

diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index b2cceb27..d4281c53 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -1,58 +1,132 @@
 use std::ops::{Add, Mul, Neg, Shl, Sub};
 
+#[derive(Copy, Clone)]
 struct M31I<T>(T);
 
-impl<T: Into<u64>> Add<M31I<T>> for M31I<u64> {
-    type Output = M31I<u64>;
-
-    fn add(self, rhs: M31I<T>) -> Self::Output {
-        Self(self.0 + rhs.0.into())
+trait Reduce {
+    fn reduce_u32(&self) -> M31I<u32>;
+    // Assumes >= 62 bits representation, "enforced" by the allowed trait
+    // implementations. And also not being i32.
+    fn full_reduce(&self) -> u32 {
+        let tmp = self.reduce_u32().0;
+        let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
+        let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
+        // branch should become CSEL
+        if tmp == ((1 << 31) - 1) {
+            0
+        } else {
+            tmp
+        }
     }
 }
 
-trait Reduce {
-    fn reduce_u32(self) -> M31I<u32>;
-}
+// Working with two redundant forms i64 and u32
+// i64 to deal with subtractions
+// u32 for storage, i32 is not interesting as we want to be able to add two 31
+// bits values together. i32 would only allow for a subtraction.
+// internally u64 is also used
+
+// For unsigned value smaller than 61 bits conversion to i64 is problem free
 
-impl Reduce for M31I<u64> {
+// i signifies two complement form
+// u signifies valid one complement form.
+// for unsigned integers two complement and one complement is the same. Which
+// means that conversion back and forth can be done. Whenever i is used we can
+// expect the value to be negative so it first needs to be converted to
+// one-complement form before reduction. That is why u64 exist internally. And
+// why after reduction we move back to i64 again.
+//
+// i32 signifies fully reduced but that should maybe not be inside a M31, but
+// just be a i32 return. i32 signifying that the upper bit is not used.
+
+impl Reduce for M31I<i64> {
     // Assumes bits(hi) < 31
     // Safety: this should only be called on M31I<u64> whose range is [61:0] with
     // the top bits zero.
     // The only way the upper bits can be set is due to additions. After a
     // multiplication will go to 34 bits.
-    fn reduce_u32(self) -> M31I<u32> {
+    fn reduce_u32(&self) -> M31I<u32> {
         // TODO add debug assertion for top bits
         // these operations are only intended for kernel
-        let reduced = reduce(self.0);
-        M31I(reduced as u32)
+        let reduced = reduce(one_complement(self.0)) as u32;
+        M31I(reduced)
     }
 }
 
 impl Reduce for M31I<u32> {
     #[inline]
-    fn reduce_u32(self) -> M31I<u32> {
-        self
+    fn reduce_u32(&self) -> M31I<u32> {
+        *self
     }
 }
 
+// switching between i64 and u64 because of the sign shift.
+
+// After multiplication this can use the full range.
+// When coming from one complement this is u32.
 // Output bits
 // max(lo, hi) + 1
 // where lo max 31 bits and hi max 33 bits
 // so maximum is 34 bits
-fn reduce(r: u64) -> u64 {
+fn reduce(r: u64) -> i64 {
     // What are the precedence rules for shift?
     // Three cheap operations mask, shift, addition
     let lo = r & ((1 << 31) - 1);
     let hi = r >> 31;
-    hi + lo
+    (hi + lo) as i64
+}
+
+// Other option is to keep everything in unsigned and only do on complement on
+// subtraction and negation. That however would requiring three operations. not,
+// and, add instead of just sub.
+
+// i64 signed representation taking full 64 bits
+// u64 one complement representation. By going to one complement we have a valid
+// mersenne number. So it actually does two things.
+// It can potentially take up 64 bits but those will all be over. It works
+// because we work on a Once in one complement form you can go back and forward
+// between the two representations. But not when it's in two complement.
+
+// limit to 62 bits otherwise two reduction steps need to be done.
+fn one_complement(r: i64) -> u64 {
+    // A shift of 61 and 62 might look strange at first, but that is because in
+    // contrary to other shifting operations there is no splitting into an upper
+    // and lower part. It's just for checking the MSB and perform an operation
+    // basedon that.
+
+    // Would checking the N-flag
+    // be better? Does the compiler do this optimisation?
+    let sign = r >> 61;
+    // Turn into one complement and clear out the top two bits. These top bits would
+    // lead to wrong calculations as we only do not contain any information.
+    ((r + sign) & ((1 << 62) - 1)) as u64
 }
 
-// Might not be the right type for the multiplication in a kernel.
-impl<T: Reduce> Mul<T> for M31I<u64> {
-    type Output = M31I<u64>;
+// The following traits are to make working with M31 across different
+// representation easier.
+// On ARM64 conversion from u32 to i64 is free as a u32 already takes up a full
+// 64 bit register.
+// Monomorphisation should optimise out when the reductions is
+// not needed.
+
+// M31I<i64> and M31I<u32> are split up becaues rust otherwise requires a dyn
+// which would break monomorphisation.
+impl<T: Reduce> Mul<T> for M31I<i64> {
+    type Output = M31I<i64>;
 
     fn mul(self, rhs: T) -> Self::Output {
         let lhs = self.reduce_u32();
+        lhs * rhs
+    }
+}
+
+impl<T: Reduce> Mul<T> for M31I<u32> {
+    type Output = M31I<i64>;
+
+    // Mul returns a partial reduced to 34 bits as to only have to do a single
+    // round.
+    fn mul(self, rhs: T) -> Self::Output {
+        let lhs = self;
         let rhs = rhs.reduce_u32();
         let res = lhs.0 as u64 * rhs.0 as u64;
         // After reduction 34 bits
@@ -60,10 +134,18 @@ impl<T: Reduce> Mul<T> for M31I<u64> {
     }
 }
 
-impl Sub for M31I<u64> {
-    type Output = M31I<u64>;
+impl<T: Into<i64>, K: Into<i64>> Sub<M31I<T>> for M31I<K> {
+    type Output = M31I<i64>;
+
+    fn sub(self, rhs: M31I<T>) -> Self::Output {
+        M31I(self.0.into() - rhs.0.into())
+    }
+}
+
+impl<T: Into<i64>, K: Into<i64>> Add<M31I<T>> for M31I<K> {
+    type Output = M31I<i64>;
 
-    fn sub(self, rhs: Self) -> Self::Output {
-        todo!()
+    fn add(self, rhs: M31I<T>) -> Self::Output {
+        M31I(self.0.into() + rhs.0.into())
     }
 }

From 25048cb6c275f4c633997c358d5c764a56240acd Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 16 Dec 2025 13:38:55 +0800
Subject: [PATCH 03/12] M31: Add proptest and kani

---
 cm31/Cargo.toml |  3 +++
 cm31/src/lib.rs | 67 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/cm31/Cargo.toml b/cm31/Cargo.toml
index 465dc292..b6667d48 100644
--- a/cm31/Cargo.toml
+++ b/cm31/Cargo.toml
@@ -10,5 +10,8 @@ repository.workspace = true
 
 [dependencies]
 
+[dev-dependencies]
+proptest.workspace = true
+
 [lints]
 workspace = true
diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index d4281c53..b093560b 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -149,3 +149,70 @@ impl<T: Into<i64>, K: Into<i64>> Add<M31I<T>> for M31I<K> {
         M31I(self.0.into() + rhs.0.into())
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use {
+        crate::{reduce, Reduce, M31I},
+        proptest::prelude::*,
+    };
+    proptest! {
+        #[test]
+        fn reduce_u64(x: u64){
+            assert_eq!(x.rem_euclid((1<<31) - 1) as u32, M31I(reduce(x)).full_reduce())
+        }
+    }
+    proptest! {
+        // Use Kani
+        fn reduce_i64(x: i64){
+            assert_eq!(x.rem_euclid((1<<31) - 1) as u32, M31I(x).full_reduce())
+        }
+    }
+    proptest! {
+        fn reduce_u32(x: u32){
+            assert_eq!(x.rem_euclid((1<<31) - 1) as u32, M31I(x).full_reduce())
+        }
+    }
+}
+
+// Takes a 62bit number and sign extends it into a valid i64
+fn sign_extend(x: u64) -> i64 {
+    let sign = x >> 61;
+    let sign = sign << 2 | sign << 1 | sign;
+    ((sign << 61) | x) as i64
+}
+
+#[kani::proof]
+fn reduce_i64() {
+    let x: u64 = kani::any::<u64>() & ((1 << 61) - 1);
+    let x = sign_extend(x);
+    assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
+}
+
+// Checks the reduction inside the multiplier
+#[kani::proof]
+fn reduce_u64() {
+    let x: u64 = kani::any::<u64>();
+    assert_eq!(
+        x.rem_euclid((1 << 31) - 1) as u32,
+        M31I(reduce(x)).full_reduce()
+    )
+}
+
+// TOO slow
+// #[kani::proof]
+// fn reduce_mul_u32() {
+//     let x: u32 = kani::any::<u32>();
+//     let y: u32 = kani::any::<u32>();
+//     let z = (x as u64) * (y as u64);
+//     assert_eq!(
+//         z.rem_euclid((1 << 31) - 1) as u32,
+//         (M31I(x) * M31I(y)).full_reduce()
+//     )
+// }
+
+#[kani::proof]
+fn reduce_u32() {
+    let x = kani::any::<u32>();
+    assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
+}

From 4412ddeb04bef24e9351134c3b5acdce4640a786 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 16 Dec 2025 13:57:46 +0800
Subject: [PATCH 04/12] cm31: remove unnecessary reduction in i64

---
 cm31/src/lib.rs | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index b093560b..08bafdf0 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -7,17 +7,7 @@ trait Reduce {
     fn reduce_u32(&self) -> M31I<u32>;
     // Assumes >= 62 bits representation, "enforced" by the allowed trait
     // implementations. And also not being i32.
-    fn full_reduce(&self) -> u32 {
-        let tmp = self.reduce_u32().0;
-        let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
-        let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
-        // branch should become CSEL
-        if tmp == ((1 << 31) - 1) {
-            0
-        } else {
-            tmp
-        }
-    }
+    fn full_reduce(&self) -> u32;
 }
 
 // Working with two redundant forms i64 and u32
@@ -51,6 +41,17 @@ impl Reduce for M31I<i64> {
         let reduced = reduce(one_complement(self.0)) as u32;
         M31I(reduced)
     }
+
+    fn full_reduce(&self) -> u32 {
+        let tmp = self.reduce_u32().0;
+        let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
+        // branch should become CSEL
+        if tmp == ((1 << 31) - 1) {
+            0
+        } else {
+            tmp
+        }
+    }
 }
 
 impl Reduce for M31I<u32> {
@@ -58,6 +59,18 @@ impl Reduce for M31I<u32> {
     fn reduce_u32(&self) -> M31I<u32> {
         *self
     }
+
+    fn full_reduce(&self) -> u32 {
+        let tmp = self.0;
+        let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
+        let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
+        // branch should become CSEL
+        if tmp == ((1 << 31) - 1) {
+            0
+        } else {
+            tmp
+        }
+    }
 }
 
 // switching between i64 and u64 because of the sign shift.

From b356893d96533200b7830701e32bfc2558939371 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 16 Dec 2025 14:27:17 +0800
Subject: [PATCH 05/12] m31: remove proptest

kani provides better coverage than proptest
---
 cm31/Cargo.toml |  1 -
 cm31/src/lib.rs | 97 ++++++++++++++++++-------------------------------
 2 files changed, 36 insertions(+), 62 deletions(-)

diff --git a/cm31/Cargo.toml b/cm31/Cargo.toml
index b6667d48..dee4cafc 100644
--- a/cm31/Cargo.toml
+++ b/cm31/Cargo.toml
@@ -11,7 +11,6 @@ repository.workspace = true
 [dependencies]
 
 [dev-dependencies]
-proptest.workspace = true
 
 [lints]
 workspace = true
diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index 08bafdf0..cf6cd720 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -1,3 +1,4 @@
+/// When modifying this file, rerun with Kani
 use std::ops::{Add, Mul, Neg, Shl, Sub};
 
 #[derive(Copy, Clone)]
@@ -43,6 +44,7 @@ impl Reduce for M31I<i64> {
     }
 
     fn full_reduce(&self) -> u32 {
+        // Only need two rounds to fully reduce down
         let tmp = self.reduce_u32().0;
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
         // branch should become CSEL
@@ -54,6 +56,8 @@ impl Reduce for M31I<i64> {
     }
 }
 
+// u32 is mostly for storage. Staying within u32 has no benefit on 64 bit
+// machines
 impl Reduce for M31I<u32> {
     #[inline]
     fn reduce_u32(&self) -> M31I<u32> {
@@ -109,6 +113,7 @@ fn one_complement(r: i64) -> u64 {
 
     // Would checking the N-flag
     // be better? Does the compiler do this optimisation?
+    // Relies on the *arithmic* shift left to maintain the sign.
     let sign = r >> 61;
     // Turn into one complement and clear out the top two bits. These top bits would
     // lead to wrong calculations as we only do not contain any information.
@@ -163,69 +168,39 @@ impl<T: Into<i64>, K: Into<i64>> Add<M31I<T>> for M31I<K> {
     }
 }
 
-#[cfg(test)]
-mod tests {
-    use {
-        crate::{reduce, Reduce, M31I},
-        proptest::prelude::*,
-    };
-    proptest! {
-        #[test]
-        fn reduce_u64(x: u64){
-            assert_eq!(x.rem_euclid((1<<31) - 1) as u32, M31I(reduce(x)).full_reduce())
-        }
-    }
-    proptest! {
-        // Use Kani
-        fn reduce_i64(x: i64){
-            assert_eq!(x.rem_euclid((1<<31) - 1) as u32, M31I(x).full_reduce())
-        }
+#[cfg(kani)]
+mod verification {
+    use crate::{reduce, Reduce, M31I};
+    // Takes a 62bit number and sign extends it into a valid i64
+    fn sign_extend(x: u64) -> i64 {
+        let sign = x >> 61;
+        let sign = sign << 2 | sign << 1 | sign;
+        ((sign << 61) | x) as i64
     }
-    proptest! {
-        fn reduce_u32(x: u32){
-            assert_eq!(x.rem_euclid((1<<31) - 1) as u32, M31I(x).full_reduce())
-        }
-    }
-}
 
-// Takes a 62bit number and sign extends it into a valid i64
-fn sign_extend(x: u64) -> i64 {
-    let sign = x >> 61;
-    let sign = sign << 2 | sign << 1 | sign;
-    ((sign << 61) | x) as i64
-}
-
-#[kani::proof]
-fn reduce_i64() {
-    let x: u64 = kani::any::<u64>() & ((1 << 61) - 1);
-    let x = sign_extend(x);
-    assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
-}
+    #[kani::proof]
+    fn reduce_i64() {
+        let x: u64 = kani::any::<u64>() & ((1 << 62) - 1);
+        let x = sign_extend(x);
+        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
+    }
 
-// Checks the reduction inside the multiplier
-#[kani::proof]
-fn reduce_u64() {
-    let x: u64 = kani::any::<u64>();
-    assert_eq!(
-        x.rem_euclid((1 << 31) - 1) as u32,
-        M31I(reduce(x)).full_reduce()
-    )
-}
+    // The proof for the multiplier is too slow. Therefore we model check the inner
+    // part. The input slightly larger than what it would be within the multiplier.
+    #[kani::proof]
+    fn reduce_u64() {
+        let x: u64 = kani::any::<u64>();
+        assert_eq!(
+            x.rem_euclid((1 << 31) - 1) as u32,
+            M31I(reduce(x)).full_reduce()
+        )
+    }
 
-// TOO slow
-// #[kani::proof]
-// fn reduce_mul_u32() {
-//     let x: u32 = kani::any::<u32>();
-//     let y: u32 = kani::any::<u32>();
-//     let z = (x as u64) * (y as u64);
-//     assert_eq!(
-//         z.rem_euclid((1 << 31) - 1) as u32,
-//         (M31I(x) * M31I(y)).full_reduce()
-//     )
-// }
-
-#[kani::proof]
-fn reduce_u32() {
-    let x = kani::any::<u32>();
-    assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
+    // Checking the reduction of i64 is not enough as it will not cover the full
+    // range of u32. Other libraries use
+    #[kani::proof]
+    fn reduce_u32() {
+        let x = kani::any::<u32>();
+        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
+    }
 }

From 4b585f48fd4f3032fbd5e0b6b2e16ff59dbc8c51 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 16 Dec 2025 15:11:04 +0800
Subject: [PATCH 06/12] m31: Add left shift

---
 cm31/src/lib.rs | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index cf6cd720..43040fdb 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -36,6 +36,7 @@ impl Reduce for M31I<i64> {
     // the top bits zero.
     // The only way the upper bits can be set is due to additions. After a
     // multiplication will go to 34 bits.
+    #[inline(always)]
     fn reduce_u32(&self) -> M31I<u32> {
         // TODO add debug assertion for top bits
         // these operations are only intended for kernel
@@ -43,6 +44,7 @@ impl Reduce for M31I<i64> {
         M31I(reduced)
     }
 
+    #[inline(always)]
     fn full_reduce(&self) -> u32 {
         // Only need two rounds to fully reduce down
         let tmp = self.reduce_u32().0;
@@ -59,11 +61,12 @@ impl Reduce for M31I<i64> {
 // u32 is mostly for storage. Staying within u32 has no benefit on 64 bit
 // machines
 impl Reduce for M31I<u32> {
-    #[inline]
+    #[inline(always)]
     fn reduce_u32(&self) -> M31I<u32> {
         *self
     }
 
+    #[inline(always)]
     fn full_reduce(&self) -> u32 {
         let tmp = self.0;
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
@@ -85,6 +88,7 @@ impl Reduce for M31I<u32> {
 // max(lo, hi) + 1
 // where lo max 31 bits and hi max 33 bits
 // so maximum is 34 bits
+#[inline(always)]
 fn reduce(r: u64) -> i64 {
     // What are the precedence rules for shift?
     // Three cheap operations mask, shift, addition
@@ -105,6 +109,7 @@ fn reduce(r: u64) -> i64 {
 // between the two representations. But not when it's in two complement.
 
 // limit to 62 bits otherwise two reduction steps need to be done.
+// implicitly says that the result we take in is two complement
 fn one_complement(r: i64) -> u64 {
     // A shift of 61 and 62 might look strange at first, but that is because in
     // contrary to other shifting operations there is no splitting into an upper
@@ -132,6 +137,7 @@ fn one_complement(r: i64) -> u64 {
 impl<T: Reduce> Mul<T> for M31I<i64> {
     type Output = M31I<i64>;
 
+    #[inline(always)]
     fn mul(self, rhs: T) -> Self::Output {
         let lhs = self.reduce_u32();
         lhs * rhs
@@ -143,6 +149,7 @@ impl<T: Reduce> Mul<T> for M31I<u32> {
 
     // Mul returns a partial reduced to 34 bits as to only have to do a single
     // round.
+    #[inline(always)]
     fn mul(self, rhs: T) -> Self::Output {
         let lhs = self;
         let rhs = rhs.reduce_u32();
@@ -168,6 +175,15 @@ impl<T: Into<i64>, K: Into<i64>> Add<M31I<T>> for M31I<K> {
     }
 }
 
+// TODO implement shift left
+impl<K: Into<i64>> Shl<u8> for M31I<K> {
+    type Output = M31I<i64>;
+
+    fn shl(self, rhs: u8) -> Self::Output {
+        M31I(self.0.into() << rhs)
+    }
+}
+
 #[cfg(kani)]
 mod verification {
     use crate::{reduce, Reduce, M31I};

From 1200733d500eb6951813b9754bda93707de28f42 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Tue, 16 Dec 2025 18:21:58 +0800
Subject: [PATCH 07/12] m31: Add documentation

---
 cm31/src/lib.rs | 195 ++++++++++++++++++------------------------------
 1 file changed, 72 insertions(+), 123 deletions(-)

diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index 43040fdb..48d4b6eb 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -1,52 +1,35 @@
-/// When modifying this file, rerun with Kani
-use std::ops::{Add, Mul, Neg, Shl, Sub};
+/// Safety: When modifying this file, rerun with Kani
+use std::ops::{Add, Mul, Shl, Sub};
 
 #[derive(Copy, Clone)]
-struct M31I<T>(T);
+pub struct M31<T>(T);
 
-trait Reduce {
-    fn reduce_u32(&self) -> M31I<u32>;
-    // Assumes >= 62 bits representation, "enforced" by the allowed trait
-    // implementations. And also not being i32.
-    fn full_reduce(&self) -> u32;
+trait M31Reduce {
+    fn reduce_u32(self) -> M31<u32>;
+    fn full_reduce(self) -> u32;
 }
 
-// Working with two redundant forms i64 and u32
-// i64 to deal with subtractions
-// u32 for storage, i32 is not interesting as we want to be able to add two 31
-// bits values together. i32 would only allow for a subtraction.
-// internally u64 is also used
-
-// For unsigned value smaller than 61 bits conversion to i64 is problem free
-
-// i signifies two complement form
-// u signifies valid one complement form.
-// for unsigned integers two complement and one complement is the same. Which
-// means that conversion back and forth can be done. Whenever i is used we can
-// expect the value to be negative so it first needs to be converted to
-// one-complement form before reduction. That is why u64 exist internally. And
-// why after reduction we move back to i64 again.
-//
-// i32 signifies fully reduced but that should maybe not be inside a M31, but
-// just be a i32 return. i32 signifying that the upper bit is not used.
-
-impl Reduce for M31I<i64> {
-    // Assumes bits(hi) < 31
-    // Safety: this should only be called on M31I<u64> whose range is [61:0] with
-    // the top bits zero.
-    // The only way the upper bits can be set is due to additions. After a
-    // multiplication will go to 34 bits.
+/// Code operating on M31 integers will mostly be operation on i62 (i64).
+///
+/// All operations assume that i64 works is being treated as a i62 integer. No
+/// newtype wrapper for i62 has been introduced to keep boilerplate to a
+/// minimum. It is up to the user to make sure that the signed results stays
+/// within 62 bits. As most operations will never reach 62 bits and using the
+/// upper two bits would require an extra reduction round.
+///
+/// One way to think of it is that the upper three bits act as the sign but and
+/// therefore can not be different for each other.
+/// This also allows for efficient implementation on 64 bit register machines
+impl M31Reduce for i64 {
     #[inline(always)]
-    fn reduce_u32(&self) -> M31I<u32> {
-        // TODO add debug assertion for top bits
-        // these operations are only intended for kernel
-        let reduced = reduce(one_complement(self.0)) as u32;
-        M31I(reduced)
+    fn reduce_u32(self) -> M31<u32> {
+        let reduced = reduce_round(one_complement(self)) as u32;
+        M31(reduced)
     }
 
     #[inline(always)]
-    fn full_reduce(&self) -> u32 {
-        // Only need two rounds to fully reduce down
+    fn full_reduce(self) -> u32 {
+        // Two rounds to fully reduce down
         let tmp = self.reduce_u32().0;
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
         // branch should become CSEL
@@ -58,17 +41,19 @@ impl Reduce for M31I<i64> {
     }
 }
 
-// u32 is mostly for storage. Staying within u32 has no benefit on 64 bit
-// machines
-impl Reduce for M31I<u32> {
+/// From an end user perspective u32 is for storage as staying within u32 has no
+/// benefit on 64 bit machines. Internally it is used to reduce the operands
+/// before multiplition such that only a single multiplication has to be done on
+/// 64 bit machines.
+impl M31Reduce for u32 {
     #[inline(always)]
-    fn reduce_u32(&self) -> M31I<u32> {
-        *self
+    fn reduce_u32(self) -> M31<u32> {
+        M31(self)
     }
 
     #[inline(always)]
-    fn full_reduce(&self) -> u32 {
-        let tmp = self.0;
+    fn full_reduce(self) -> u32 {
+        let tmp = self;
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
         // branch should become CSEL
@@ -80,113 +65,77 @@ impl Reduce for M31I<u32> {
     }
 }
 
-// switching between i64 and u64 because of the sign shift.
-
-// After multiplication this can use the full range.
-// When coming from one complement this is u32.
-// Output bits
-// max(lo, hi) + 1
-// where lo max 31 bits and hi max 33 bits
-// so maximum is 34 bits
+/// Performs a single reduction round
+///
+/// Up to the caller to keep track of how many bits are actualy reduced.
 #[inline(always)]
-fn reduce(r: u64) -> i64 {
-    // What are the precedence rules for shift?
-    // Three cheap operations mask, shift, addition
+fn reduce_round(r: u64) -> i64 {
     let lo = r & ((1 << 31) - 1);
     let hi = r >> 31;
     (hi + lo) as i64
 }
 
-// Other option is to keep everything in unsigned and only do on complement on
-// subtraction and negation. That however would requiring three operations. not,
-// and, add instead of just sub.
-
-// i64 signed representation taking full 64 bits
-// u64 one complement representation. By going to one complement we have a valid
-// mersenne number. So it actually does two things.
-// It can potentially take up 64 bits but those will all be over. It works
-// because we work on a Once in one complement form you can go back and forward
-// between the two representations. But not when it's in two complement.
-
-// limit to 62 bits otherwise two reduction steps need to be done.
-// implicitly says that the result we take in is two complement
+/// Assumes that the input fits in i62
+///
+/// By taking the one complement we can make use of the correspondence between
+/// one complement and numbers modulo Mersenne numbers. Another way of seeing
+/// this is going from signed 62 bit value to a unsigned 62 bit number
+/// hence the type change.
 fn one_complement(r: i64) -> u64 {
-    // A shift of 61 and 62 might look strange at first, but that is because in
-    // contrary to other shifting operations there is no splitting into an upper
-    // and lower part. It's just for checking the MSB and perform an operation
-    // basedon that.
-
-    // Would checking the N-flag
-    // be better? Does the compiler do this optimisation?
-    // Relies on the *arithmic* shift left to maintain the sign.
+    // Relies on the *arithmic* shift right to extend the three bit sign bits to an
+    // i64.
     let sign = r >> 61;
-    // Turn into one complement and clear out the top two bits. These top bits would
-    // lead to wrong calculations as we only do not contain any information.
+    // Use the sign information to turn into one complement and clear out the
+    // redudant sign bits as these would lead to an overcorrection.
     ((r + sign) & ((1 << 62) - 1)) as u64
 }
 
-// The following traits are to make working with M31 across different
-// representation easier.
-// On ARM64 conversion from u32 to i64 is free as a u32 already takes up a full
-// 64 bit register.
-// Monomorphisation should optimise out when the reductions is
-// not needed.
+// The following traits reduce the boiler plate when working with M31 by taking
+// on some of the widening and reducing required.
 
-// M31I<i64> and M31I<u32> are split up becaues rust otherwise requires a dyn
-// which would break monomorphisation.
-impl<T: Reduce> Mul<T> for M31I<i64> {
-    type Output = M31I<i64>;
+impl<T: M31Reduce, K: M31Reduce> Mul<M31<T>> for M31<K> {
+    type Output = M31<i64>;
 
     #[inline(always)]
-    fn mul(self, rhs: T) -> Self::Output {
-        let lhs = self.reduce_u32();
-        lhs * rhs
-    }
-}
-
-impl<T: Reduce> Mul<T> for M31I<u32> {
-    type Output = M31I<i64>;
-
-    // Mul returns a partial reduced to 34 bits as to only have to do a single
-    // round.
-    #[inline(always)]
-    fn mul(self, rhs: T) -> Self::Output {
-        let lhs = self;
-        let rhs = rhs.reduce_u32();
+    fn mul(self, rhs: M31<T>) -> Self::Output {
+        let lhs = self.0.reduce_u32();
+        let rhs = rhs.0.reduce_u32();
         let res = lhs.0 as u64 * rhs.0 as u64;
         // After reduction 34 bits
-        M31I(reduce(res))
+        M31(reduce_round(res))
     }
 }
 
-impl<T: Into<i64>, K: Into<i64>> Sub<M31I<T>> for M31I<K> {
-    type Output = M31I<i64>;
+impl<T: Into<i64>, K: Into<i64>> Sub<M31<T>> for M31<K> {
+    type Output = M31<i64>;
 
-    fn sub(self, rhs: M31I<T>) -> Self::Output {
-        M31I(self.0.into() - rhs.0.into())
+    fn sub(self, rhs: M31<T>) -> Self::Output {
+        M31(self.0.into() - rhs.0.into())
     }
 }
 
-impl<T: Into<i64>, K: Into<i64>> Add<M31I<T>> for M31I<K> {
-    type Output = M31I<i64>;
+impl<T: Into<i64>, K: Into<i64>> Add<M31<T>> for M31<K> {
+    type Output = M31<i64>;
 
-    fn add(self, rhs: M31I<T>) -> Self::Output {
-        M31I(self.0.into() + rhs.0.into())
+    fn add(self, rhs: M31<T>) -> Self::Output {
+        M31(self.0.into() + rhs.0.into())
     }
 }
 
 // TODO implement shift left
-impl<K: Into<i64>> Shl<u8> for M31I<K> {
-    type Output = M31I<i64>;
-
+impl<K: Into<i64>> Shl<u8> for M31<K> {
+    type Output = M31<i64>;
+    // A fully reduced m31 has space for two 15 left shifts. So in practice there
+    // might only be space for one left shift. In that case adding a reduction
+    // might be the right thing to do.
     fn shl(self, rhs: u8) -> Self::Output {
-        M31I(self.0.into() << rhs)
+        M31(self.0.into() << rhs)
     }
 }
 
 #[cfg(kani)]
 mod verification {
-    use crate::{reduce, Reduce, M31I};
+    use crate::{reduce_round, M31Reduce, M31};
     // Takes a 62bit number and sign extends it into a valid i64
     fn sign_extend(x: u64) -> i64 {
         let sign = x >> 61;
@@ -198,7 +147,7 @@ mod verification {
     fn reduce_i64() {
         let x: u64 = kani::any::<u64>() & ((1 << 62) - 1);
         let x = sign_extend(x);
-        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
+        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, x.full_reduce())
     }
 
     // The proof for the multiplier is too slow. Therefore we model check the inner
@@ -208,7 +157,7 @@ mod verification {
         let x: u64 = kani::any::<u64>();
         assert_eq!(
             x.rem_euclid((1 << 31) - 1) as u32,
-            M31I(reduce(x)).full_reduce()
+            reduce_round(x).full_reduce()
         )
     }
 
@@ -217,6 +166,6 @@ mod verification {
     #[kani::proof]
     fn reduce_u32() {
         let x = kani::any::<u32>();
-        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, M31I(x).full_reduce())
+        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, x.full_reduce())
     }
 }

From f7f822d5ea5ea01816f37e4aa756e87e6101c53a Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 17 Dec 2025 10:22:53 +0800
Subject: [PATCH 08/12] cm31: Kani lint in Cargo

---
 Cargo.toml       |  5 ++-
 cm31/src/lib.rs  | 40 +++++++++++----------
 cm31/src/main.rs | 14 --------
 cm31/src/u32.rs  | 92 ------------------------------------------------
 4 files changed, 26 insertions(+), 125 deletions(-)
 delete mode 100644 cm31/src/main.rs
 delete mode 100644 cm31/src/u32.rs

diff --git a/Cargo.toml b/Cargo.toml
index 71e5202b..07928ec4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -41,6 +41,9 @@ license = "MIT"
 homepage = "https://github.com/worldfnd/ProveKit"
 repository = "https://github.com/worldfnd/ProveKit"
 
+[workspace.lints.rust]
+unexpected_cfgs = { level = "warn", check-cfg = ['cfg(kani)'] }
+
 [workspace.lints.clippy]
 cargo = "warn"
 perf = "warn"
@@ -151,4 +154,4 @@ spongefish = { git = "https://github.com/arkworks-rs/spongefish", features = [
   "arkworks-algebra",
 ], rev = "ecb4f08373ed930175585c856517efdb1851fb47" }
 spongefish-pow = { git = "https://github.com/arkworks-rs/spongefish", rev = "ecb4f08373ed930175585c856517efdb1851fb47" }
-whir = { git = "https://github.com/WizardOfMenlo/whir/", features = ["tracing"], rev = "15cf6668e904ed2e80c9e6209dcce69f5bcf79b9" }
\ No newline at end of file
+whir = { git = "https://github.com/WizardOfMenlo/whir/", features = ["tracing"], rev = "15cf6668e904ed2e80c9e6209dcce69f5bcf79b9" }
diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index 48d4b6eb..5ac614bd 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -2,24 +2,29 @@
 use std::ops::{Add, Mul, Shl, Sub};
 
 #[derive(Copy, Clone)]
+/// 
 pub struct M31<T>(T);
 
 trait M31Reduce {
+    /// Performs the necessary rounds to get the result to fit inside 32 bits.
     fn reduce_u32(self) -> M31<u32>;
-    fn full_reduce(self) -> u32;
+    /// Performs the necessary rounds to get the result to fit inside 31 bits.
+    fn reduce_fully(self) -> u32;
 }
 
-/// Code operating on M31 integers will mostly be operation on i62 (i64).
+/// Code operating on M31 integers will mostly be operation on i62 (represented
+/// as i64).
 ///
-/// All operations assume that i64 works is being treated as a i62 integer. No
-/// newtype wrapper for i62 has been introduced to keep boilerplate to a
-/// minimum. It is up to the user to make sure that the signed results stays
-/// within 62 bits. As most operations will never reach 62 bits and using the
-/// upper two bits would require an extra reduction round.
+/// All operations assume that i64 is as a i62. No newtype wrapper for i62 has
+/// been introduced to keep boilerplate to a minimum. It is up to the user to
+/// make sure that the signed results stays within 62 bits. As most operations
+/// will never reach 62 bits and using the upper two bits would require an extra
+/// reduction round.
 ///
-/// One way to think of it is that the upper three bits act as the sign but and
-/// therefore can not be different for each other.
-/// This also allows for efficient implementation on 64 bit register machines
+/// One way to think of it is that the upper three bits act as the sign bit and
+/// therefore can not be different from each other.
+/// This representation allows for efficient implementation of i62 on 64b
+/// machines.
 impl M31Reduce for i64 {
     #[inline(always)]
     fn reduce_u32(self) -> M31<u32> {
@@ -28,7 +33,7 @@ impl M31Reduce for i64 {
     }
 
     #[inline(always)]
-    fn full_reduce(self) -> u32 {
+    fn reduce_fully(self) -> u32 {
         // Two rounds to fully reduce down
         let tmp = self.reduce_u32().0;
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
@@ -52,7 +57,7 @@ impl M31Reduce for u32 {
     }
 
     #[inline(always)]
-    fn full_reduce(self) -> u32 {
+    fn reduce_fully(self) -> u32 {
         let tmp = self;
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
         let tmp = (tmp >> 31) + (tmp & ((1 << 31) - 1));
@@ -122,7 +127,6 @@ impl<T: Into<i64>, K: Into<i64>> Add<M31<T>> for M31<K> {
     }
 }
 
-// TODO implement shift left
 impl<K: Into<i64>> Shl<u8> for M31<K> {
     type Output = M31<i64>;
     // A fully reduced m31 has space for two 15 left shifts. So in practice there
@@ -135,8 +139,8 @@ impl<K: Into<i64>> Shl<u8> for M31<K> {
 
 #[cfg(kani)]
 mod verification {
-    use crate::{reduce_round, M31Reduce, M31};
-    // Takes a 62bit number and sign extends it into a valid i64
+    use super::*;
+    // Takes a signed 62bit number and sign extends it into a valid i64
     fn sign_extend(x: u64) -> i64 {
         let sign = x >> 61;
         let sign = sign << 2 | sign << 1 | sign;
@@ -147,7 +151,7 @@ mod verification {
     fn reduce_i64() {
         let x: u64 = kani::any::<u64>() & ((1 << 62) - 1);
         let x = sign_extend(x);
-        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, x.full_reduce())
+        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, x.reduce_fully())
     }
 
     // The proof for the multiplier is too slow. Therefore we model check the inner
@@ -157,7 +161,7 @@ mod verification {
         let x: u64 = kani::any::<u64>();
         assert_eq!(
             x.rem_euclid((1 << 31) - 1) as u32,
-            reduce_round(x).full_reduce()
+            reduce_round(x).reduce_fully()
         )
     }
 
@@ -166,6 +170,6 @@ mod verification {
     #[kani::proof]
     fn reduce_u32() {
         let x = kani::any::<u32>();
-        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, x.full_reduce())
+        assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, x.reduce_fully())
     }
 }
diff --git a/cm31/src/main.rs b/cm31/src/main.rs
deleted file mode 100644
index 059d6a3a..00000000
--- a/cm31/src/main.rs
+++ /dev/null
@@ -1,14 +0,0 @@
-// Type your code here, or load an example.
-
-// As of Rust 1.75, small functions are automatically
-// marked as `#[inline]` so they will not show up in
-// the output when compiling with optimisations. Use
-// `#[no_mangle]` or `#[inline(never)]` to work around
-// this issue.
-// See https://github.com/compiler-explorer/compiler-explorer/issues/5939
-use std::hint::black_box;
-// If you use `main()`, declare it as `pub` to see it in the output:
-pub fn main() {
-    let (lo, hi) = (u32::carrying_mul_add(black_box(5), black_box(6), 0, 0));
-    let _res = black_box(2 * hi + lo);
-}
diff --git a/cm31/src/u32.rs b/cm31/src/u32.rs
deleted file mode 100644
index e976984b..00000000
--- a/cm31/src/u32.rs
+++ /dev/null
@@ -1,92 +0,0 @@
-const P3: u64 = 3 * (1 << 31 - 1);
-
-// Tag can also be the internal type
-// 64 byte, 32 byte, 31 byte, 31 byte zero check. De laatste twee is
-// waarschijnlijk geen interessant ondersheid Aan de andere kant lijkt het geen
-struct M31(u32);
-
-// Introduce tag to track in which state the M31 is?
-// State
-impl M31 {
-    // Needs to return a value if it's going to be tagged with the internal state.
-    fn reduce_u31(&mut self) {
-        let tmp = self.0;
-        // TODO: mask
-        let tmp = tmp >> 31 + ((tmp << 1) >> 1);
-        let tmp = tmp >> 31 + ((tmp << 1) >> 1);
-        self.0 = tmp;
-    }
-}
-
-impl Add for M31 {
-    type Output = M31;
-
-    // 32 bit sized because otherwise there might be a masking operations
-    // while adds W will zero it out.
-    #[inline]
-    fn add(self, rhs: Self) -> Self::Output {
-        // Different ways of presenting it
-        // 32 bytes
-        // 64 bytes
-        // adds w
-        let (tmp, o) = self.0.overflowing_add(rhs.0);
-        // can't use adcs as it needs to be shifted first
-        // so 64 bit might be better as it doesn't require moving the carry to another
-        // register.
-        // Interested in how the compiler takes this?
-        let (tmp, o) = tmp.overflowing_add(2 * o as u32);
-        let out = tmp + 2 * o as u32;
-
-        Self(out)
-    }
-}
-
-impl Sub for M31 {
-    type Output = M31;
-
-    fn sub(self, rhs: Self) -> Self::Output {
-        // 33 bits + 32 bits -> 34 bits
-        let tmp = P3 + self.0 as u64 - rhs.0 as u64;
-        // 34 bits -> 32 bits
-        let hi = (tmp >> 32) as u32;
-        let lo = tmp as u32;
-        Self(2 * hi + lo)
-    }
-}
-
-impl Neg for M31 {
-    type Output = M31;
-
-    fn neg(self) -> Self::Output {
-        let tmp = P3 - self.0 as u64;
-        let hi = (tmp >> 32) as u32;
-        let lo = tmp as u32;
-        Self(2 * hi + lo)
-    }
-}
-
-impl Mul for M31 {
-    type Output = M31;
-
-    fn mul(self, rhs: Self) -> Self::Output {
-        let (lo, hi) = u32::carrying_mul_add(self.0, rhs.0, 0, 0);
-        // shift addition
-        // shift, shift, addition
-        // 64bits -> 34 bits
-        // 2 * hi -> 33 bits; 33 bits + 32 bits -> 34 bits
-        let tmp = 2 * hi as u64 + lo as u64;
-        // 34 bits -> 32 bits
-        let hi = (tmp >> 32) as u32;
-        let lo = tmp as u32;
-        Self(2 * hi + lo)
-    }
-}
-
-impl Shl<u8> for M31 {
-    type Output = M31;
-
-    fn shl(mut self, rhs: u8) -> Self::Output {
-        self.reduce_u31();
-        todo!();
-    }
-}

From cdba6b8c6ae711ae37da01b6066d1b49c40eede5 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 17 Dec 2025 11:24:52 +0800
Subject: [PATCH 09/12] cm31: improve documentation

---
 cm31/src/lib.rs | 114 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 77 insertions(+), 37 deletions(-)

diff --git a/cm31/src/lib.rs b/cm31/src/lib.rs
index 5ac614bd..5796da02 100644
--- a/cm31/src/lib.rs
+++ b/cm31/src/lib.rs
@@ -1,30 +1,59 @@
-/// Safety: When modifying this file, rerun with Kani
+//! Arithmetic over the Mersenne prime p = 2^31 - 1 (“M31”).
+//!
+//! This module provides fast, allocation-free helpers to work with values that
+//! are intended to live in the finite field modulo \(2^{31}-1\). It relies on:
+//! - a 62-bit signed accumulator representation (`i62` stored in `i64`) for
+//!   intermediate results on 64-bit machines, and
+//! - reductions that add the high 31-bit chunk into the low 31-bit chunk until
+//!   the result fits within the desired bits.
+//!
+//! Safety: When modifying this file, rerun with Kani
 use std::ops::{Add, Mul, Shl, Sub};
 
+/// Newtype wrapper for values participating in arithmetic modulo 2^31 - 1.
+///
+/// The generic parameter models the “shape” of the wrapped value:
+/// - `u32`: a value known to fit in 32 bits. Primarily used for storage and to
+///   tighten bounds when reasoning about sizes.
+/// - `i64`: a 62-bit signed accumulator (stored in `i64`). This representation
+///   is used for intermediate results.
+///
+/// In general, operations widen into the accumulator form and should be
+/// followed by an explicit reduction when a canonical representative is
+/// required.
 #[derive(Copy, Clone)]
-/// 
 pub struct M31<T>(T);
 
+/// Trait for types that can be reduced modulo \(2^{31}-1\).
 trait M31Reduce {
-    /// Performs the necessary rounds to get the result to fit inside 32 bits.
+    /// Partially reduces the value so it fits in 32 bits. This is typically one
+    /// reduction round for Mersenne moduli.
     fn reduce_u32(self) -> M31<u32>;
-    /// Performs the necessary rounds to get the result to fit inside 31 bits.
+    /// Fully reduces into the canonical representative in 0..(2^31 - 1).
+    /// Typically requires two reduction rounds
     fn reduce_fully(self) -> u32;
 }
 
-/// Code operating on M31 integers will mostly be operation on i62 (represented
-/// as i64).
+impl<T: M31Reduce> M31Reduce for M31<T> {
+    fn reduce_u32(self) -> M31<u32> {
+        self.0.reduce_u32()
+    }
+
+    fn reduce_fully(self) -> u32 {
+        self.0.reduce_fully()
+    }
+}
+
+/// All operations assume that `i64` values are valid `i62`. No separate newtype
+/// is introduced to keep boilerplate to a minimum. Callers must ensure signed
+/// intermediate results stay within 62 bits.
 ///
-/// All operations assume that i64 is as a i62. No newtype wrapper for i62 has
-/// been introduced to keep boilerplate to a minimum. It is up to the user to
-/// make sure that the signed results stays within 62 bits. As most operations
-/// will never reach 62 bits and using the upper two bits would require an extra
-/// reduction round.
+/// Intuitively, the upper three bits act as a sign extension; they must not
+/// disagree with each other. This enables an efficient `i62` representation on
+/// 64-bit machines.
 ///
-/// One way to think of it is that the upper three bits act as the sign bit and
-/// therefore can not be different from each other.
-/// This representation allows for efficient implementation of i62 on 64b
-/// machines.
+/// 62 bits has been chosen over 64 bits as 64 bits would necessitate another
+/// round of reduction while most algorithms do not require that much space.
 impl M31Reduce for i64 {
     #[inline(always)]
     fn reduce_u32(self) -> M31<u32> {
@@ -46,10 +75,11 @@ impl M31Reduce for i64 {
     }
 }
 
-/// From an end user perspective u32 is for storage as staying within u32 has no
-/// benefit on 64 bit machines. Internally it is used to reduce the operands
-/// before multiplition such that only a single multiplication has to be done on
-/// 64 bit machines.
+/// `u32` is the preferred storage type.
+///
+/// On 64-bit machines, the computational benefit comes from using `i64`
+/// accumulators and reducing operands to 32 bits before multiplication so
+/// that only a single 64-bit multiplication is required.
 impl M31Reduce for u32 {
     #[inline(always)]
     fn reduce_u32(self) -> M31<u32> {
@@ -70,9 +100,9 @@ impl M31Reduce for u32 {
     }
 }
 
-/// Performs a single reduction round
-///
-/// Up to the caller to keep track of how many bits are actualy reduced.
+/// Performs a single reduction round.
+/// The caller is responsible for invoking a sufficient number of
+/// rounds to reach the desired bound.
 #[inline(always)]
 fn reduce_round(r: u64) -> i64 {
     let lo = r & ((1 << 31) - 1);
@@ -80,24 +110,28 @@ fn reduce_round(r: u64) -> i64 {
     (hi + lo) as i64
 }
 
-/// Assumes that the input fits in i62
+/// Converts an `i62` accumulator into its ones’ complement `u62` form.
 ///
-/// By taking the one complement we can make use of the correspondence between
-/// one complement and numbers modulo Mersenne numbers. Another way of seeing
-/// this is going from signed 62 bit value to a unsigned 62 bit number
-/// hence the type change.
+/// By taking the ones’ complement we leverage the correspondence between ones’
+/// complement and arithmetic modulo Mersenne numbers. Equivalently, this maps a
+/// signed 62-bit value to an unsigned 62-bit number while preserving the
+/// residue class modulo \(2^{31}-1\).
 fn one_complement(r: i64) -> u64 {
-    // Relies on the *arithmic* shift right to extend the three bit sign bits to an
+    // Relies on the arithmetic shift right to extend the three sign bits to an
     // i64.
     let sign = r >> 61;
     // Use the sign information to turn into one complement and clear out the
-    // redudant sign bits as these would lead to an overcorrection.
+    // redundant sign bits as these would lead to an overcorrection.
     ((r + sign) & ((1 << 62) - 1)) as u64
 }
 
 // The following traits reduce the boiler plate when working with M31 by taking
 // on some of the widening and reducing required.
 
+/// Multiplication in M31 arithmetic.
+///
+/// Operands are first reduced to 32 bits, multiplied, and then folded
+/// once, yielding an 34 bit result.
 impl<T: M31Reduce, K: M31Reduce> Mul<M31<T>> for M31<K> {
     type Output = M31<i64>;
 
@@ -111,6 +145,7 @@ impl<T: M31Reduce, K: M31Reduce> Mul<M31<T>> for M31<K> {
     }
 }
 
+/// Subtraction in M31 arithmetic. Widens into a `i62`.
 impl<T: Into<i64>, K: Into<i64>> Sub<M31<T>> for M31<K> {
     type Output = M31<i64>;
 
@@ -119,6 +154,7 @@ impl<T: Into<i64>, K: Into<i64>> Sub<M31<T>> for M31<K> {
     }
 }
 
+/// Addition in M31 arithmetic. Widens into an `i64`.
 impl<T: Into<i64>, K: Into<i64>> Add<M31<T>> for M31<K> {
     type Output = M31<i64>;
 
@@ -127,11 +163,12 @@ impl<T: Into<i64>, K: Into<i64>> Add<M31<T>> for M31<K> {
     }
 }
 
+/// Left shift on an `i62` accumulator.
+///
+/// A fully reduced residue has room for at most two 15-bit left shifts (8-th
+/// root of unity).
 impl<K: Into<i64>> Shl<u8> for M31<K> {
     type Output = M31<i64>;
-    // A fully reduced m31 has space for two 15 left shifts. So in practice there
-    // might only be space for one left shift. In that case adding a reduction
-    // might be the right thing to do.
     fn shl(self, rhs: u8) -> Self::Output {
         M31(self.0.into() << rhs)
     }
@@ -140,13 +177,14 @@ impl<K: Into<i64>> Shl<u8> for M31<K> {
 #[cfg(kani)]
 mod verification {
     use super::*;
-    // Takes a signed 62bit number and sign extends it into a valid i64
+    // Takes a signed 62-bit number and sign-extends it into a valid i64.
     fn sign_extend(x: u64) -> i64 {
         let sign = x >> 61;
         let sign = sign << 2 | sign << 1 | sign;
         ((sign << 61) | x) as i64
     }
 
+    /// Proves that reducing an `i62` accumulator matches modulo p semantics.
     #[kani::proof]
     fn reduce_i64() {
         let x: u64 = kani::any::<u64>() & ((1 << 62) - 1);
@@ -154,8 +192,10 @@ mod verification {
         assert_eq!(x.rem_euclid((1 << 31) - 1) as u32, x.reduce_fully())
     }
 
-    // The proof for the multiplier is too slow. Therefore we model check the inner
-    // part. The input slightly larger than what it would be within the multiplier.
+    // The proof for the multiplier is too slow. Therefore we model-check the
+    // inner part. The input is slightly larger than what it would be within the
+    // multiplier.
+    /// Proves that one fold plus full reduction matches modulo p on `u64`.
     #[kani::proof]
     fn reduce_u64() {
         let x: u64 = kani::any::<u64>();
@@ -165,8 +205,8 @@ mod verification {
         )
     }
 
-    // Checking the reduction of i64 is not enough as it will not cover the full
-    // range of u32. Other libraries use
+    // Checking only `i64` does not cover the full `u32` range.
+    /// Proves full reduction on all `u32` inputs.
     #[kani::proof]
     fn reduce_u32() {
         let x = kani::any::<u32>();

From 6402359a7876f17bf6f4f78de0263452e4d9967b Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 17 Dec 2025 12:15:36 +0800
Subject: [PATCH 10/12] CI: add kani support

---
 .github/workflows/kani.yml | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 .github/workflows/kani.yml

diff --git a/.github/workflows/kani.yml b/.github/workflows/kani.yml
new file mode 100644
index 00000000..e2779470
--- /dev/null
+++ b/.github/workflows/kani.yml
@@ -0,0 +1,18 @@
+name: Kani model checking
+
+on:
+  workflow_dispatch:
+  push:
+    branches: ["main"]
+  pull_request:
+    branches: ["main"]
+
+jobs:
+  kani:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Run Kani
+        uses: model-checking/kani-github-action@v1
+        with:
+          # Whitelist the directories that use Kani to prevent it building the entire project.
+          working-directory: "./cm31"

From f26cab6f153b8642d56cf365dc3472f74c9fe6b8 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 17 Dec 2025 12:20:39 +0800
Subject: [PATCH 11/12] fixup! CI: add kani support

---
 .github/workflows/kani.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/kani.yml b/.github/workflows/kani.yml
index e2779470..53208f12 100644
--- a/.github/workflows/kani.yml
+++ b/.github/workflows/kani.yml
@@ -14,5 +14,5 @@ jobs:
       - name: Run Kani
         uses: model-checking/kani-github-action@v1
         with:
-          # Whitelist the directories that use Kani to prevent it building the entire project.
+          # Whitelist directories that use Kani this prevents the entire project from being built.
           working-directory: "./cm31"

From 06f3b18deb332b4e92572f7f5eb97cea447468c5 Mon Sep 17 00:00:00 2001
From: Xander van der Goot <xandervandergoot@gmail.com>
Date: Wed, 24 Dec 2025 15:41:16 +0800
Subject: [PATCH 12/12] CI: path trigger + checkout

---
 .github/workflows/kani.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/kani.yml b/.github/workflows/kani.yml
index 53208f12..1daa5331 100644
--- a/.github/workflows/kani.yml
+++ b/.github/workflows/kani.yml
@@ -1,18 +1,20 @@
 name: Kani model checking
 
 on:
-  workflow_dispatch:
   push:
-    branches: ["main"]
+    paths:
+      - "cm31/**"
   pull_request:
-    branches: ["main"]
+    paths:
+      - "cm31/**"
 
 jobs:
   kani:
     runs-on: ubuntu-latest
     steps:
+      - uses: actions/checkout@v4
       - name: Run Kani
         uses: model-checking/kani-github-action@v1
         with:
           # Whitelist directories that use Kani this prevents the entire project from being built.
-          working-directory: "./cm31"
+          working-directory: "cm31"