Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@ members = [
"gpu-tests",
]
resolver = "2"

[workspace.dependencies]
ec-gpu = { path = "./ec-gpu" }
tracing-profile = { git = "https://github.com/IrreducibleOSS/tracing-profile" }
23 changes: 12 additions & 11 deletions ec-gpu-gen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,34 +12,35 @@ rust-version = "1.83.0"
[dependencies]
bitvec = "1.0.1"
crossbeam-channel = "0.5.1"
ec-gpu = "0.2.0"
ec-gpu.workspace = true
execute = "0.2.9"
ff = { version = "0.13.0", default-features = false }
group = "0.13.0"
hex = "0.4"
# Pin home to version compatible with Rust 1.83 (edition 2021)
home = "=0.5.9"
log = "0.4.14"
num_cpus = "1.13.0"
once_cell = "1.8.0"
rayon = "1.5.1"
rust-gpu-tools = { version = "0.7.0", default-features = false, optional = true }
sha2 = "0.10"
thiserror = "1.0.30"
tracing = "0.1"
yastl = "0.1.2"

ark-bn254 = { version = "0.5.0", optional = true }
ark-std = { version = "0.5.0", optional = true }
ark-ec = "0.5.0"
ark-ff = "0.5.0"
ark-serialize = { version = "0.5.0", optional = true }

[dev-dependencies]
# NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just
# temporarily until https://github.com/zkcrypto/group/pull/29 is fixed. Then
# we won't need the exports of `Fp` and `Fp2` any more.
#blstrs = { version = "0.6.0", features = ["__private_bench"], optional = true }
blstrs = { version = "0.7.0", features = ["__private_bench", "gpu"] }
rand = "0.8"
lazy_static = "1.2"
pairing = "0.23.0"
temp-env = "0.3.0"
rand_core = "0.6.3"
rand_xorshift = "0.3.0"
ark-bn254 = "0.5.0"

[features]
default = []
cuda = ["rust-gpu-tools/cuda"]
opencl = ["rust-gpu-tools/opencl"]
arkworks = ["ec-gpu/arkworks", "dep:ark-bn254", "dep:ark-std", "dep:ark-serialize"]
142 changes: 142 additions & 0 deletions ec-gpu-gen/src/cl/field.cl
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,94 @@ DEVICE void FIELD_reduce(uint32_t accLow[FIELD_LIMBS], uint32_t np0, uint32_t fq
accLow[i]=chain_add(&chain5, accLow[i], highCarry);
}

// Optimized squaring: exploits symmetry a² = a·a
// Cross products aᵢ·aⱼ for i≠j appear twice, so compute once and double.
// Reduces from n² to n(n+1)/2 multiplications (~44% fewer for 8 limbs).
//
// Computes the full double-width product xx = x·x over 2*FIELD_LIMBS
// 32-bit limbs. No modular reduction happens here — the caller reduces
// (see FIELD_sqr_nvidia).
DEVICE inline
void FIELD_sqr_v1(uint32_t *x, uint32_t *xx) {
  const uint32_t xLimbs  = FIELD_LIMBS;
  const uint32_t xxLimbs = FIELD_LIMBS * 2;
  // Working accumulator for the double-width result.
  uint32_t temp[FIELD_LIMBS * 2];
  uint32_t carry = 0;

  #pragma unroll
  for (int32_t i = 0; i < xxLimbs; i++) {
    temp[i] = 0;
  }

  // Step 1: Compute off-diagonal products for odd (i+j) positions
  // Following the same pattern as FIELD_mult_v1 for correctness.
  // The even/odd column split lets each chain_madlo/chain_madhi pair in a
  // single carry chain land on disjoint columns, so carries never collide.
  #pragma unroll
  for (int32_t i = 0; i < xLimbs; i++) {
    chain_t chain1;
    chain_init(&chain1);
    #pragma unroll
    for (int32_t j = i + 1; j < xLimbs; j++) {
      if ((i + j) % 2 == 1) {
        // Written one column low (i+j-1, i+j); the one-limb shift below
        // realigns these partial sums — same fix-up as FIELD_mult_v1.
        temp[i + j - 1] = chain_madlo(&chain1, x[i], x[j], temp[i + j - 1]);
        temp[i + j] = chain_madhi(&chain1, x[i], x[j], temp[i + j]);
      }
    }
    if (i % 2 == 1 && i + 1 < xLimbs) {
      // Flush the chain's trailing carry flag into the top column of this row.
      temp[i + xLimbs - 1] = chain_add(&chain1, 0, 0);
    }
  }

  // Shift right by 1 position (same as mult_v1): realigns the odd-parity
  // partial sums that were deliberately written one column low above.
  #pragma unroll
  for (int32_t i = xxLimbs - 1; i > 0; i--) {
    temp[i] = temp[i - 1];
  }
  temp[0] = 0;

  // Step 2: Compute off-diagonal products for even (i+j) positions,
  // accumulating on top of the realigned odd-parity sums.
  carry = 0;
  #pragma unroll
  for (int32_t i = 0; i < xLimbs; i++) {
    chain_t chain2;
    chain_init(&chain2);

    #pragma unroll
    for (int32_t j = i + 1; j < xLimbs; j++) {
      if ((i + j) % 2 == 0) {
        temp[i + j] = chain_madlo(&chain2, x[i], x[j], temp[i + j]);
        temp[i + j + 1] = chain_madhi(&chain2, x[i], x[j], temp[i + j + 1]);
      }
    }
    // Cross-iteration carry bookkeeping: overflow from this row's chain is
    // threaded into the columns above it via `carry`.
    // NOTE(review): the placement is subtle, and `i != xLimbs - 1` is
    // redundant with `i + 1 < xLimbs` in both guards — presumably copied
    // defensively from mult_v1. Verify against a reference multi-precision
    // squaring for every FIELD_LIMBS value actually instantiated.
    if ((i + xLimbs) % 2 == 0 && i != xLimbs - 1 && i + 1 < xLimbs) {
      temp[i + xLimbs] = chain_add(&chain2, temp[i + xLimbs], carry);
      temp[i + xLimbs + 1] = chain_add(&chain2, temp[i + xLimbs + 1], 0);
      carry = chain_add(&chain2, 0, 0);
    }
    if ((i + xLimbs) % 2 == 1 && i != xLimbs - 1 && i + 1 < xLimbs) {
      carry = chain_add(&chain2, carry, 0);
    }
  }

  // Step 3: Double the off-diagonal products (left shift by 1 bit) —
  // each cross product aᵢ·aⱼ (i≠j) appears twice in the square.
  carry = 0;
  #pragma unroll
  for (int32_t i = 0; i < xxLimbs; i++) {
    uint32_t new_carry = temp[i] >> 31;
    temp[i] = (temp[i] << 1) | carry;
    carry = new_carry;
  }

  // Step 4: Add diagonal products x[i] * x[i]. Each lands on columns
  // (2i, 2i+1), so one carry chain covers the whole pass.
  chain_t chain3;
  chain_init(&chain3);
  #pragma unroll
  for (int32_t i = 0; i < xLimbs; i++) {
    temp[2 * i] = chain_madlo(&chain3, x[i], x[i], temp[2 * i]);
    temp[2 * i + 1] = chain_madhi(&chain3, x[i], x[i], temp[2 * i + 1]);
  }

  // Publish the full 2n-limb square.
  #pragma unroll
  for (int32_t i = 0; i < xxLimbs; i++) {
    xx[i] = temp[i];
  }
}

// Requirement: yLimbs >= xLimbs
DEVICE inline
void FIELD_mult_v1(uint32_t *x, uint32_t *y, uint32_t *xy) {
Expand Down Expand Up @@ -262,6 +350,40 @@ DEVICE FIELD FIELD_mul_nvidia(FIELD a, FIELD b) {
return r;
}

// Modular squaring for the CUDA/NVIDIA path. Mirrors the structure of
// FIELD_mul_nvidia: full-width product, FIELD_reduce on the low half,
// fold into the high half, then one conditional subtraction of the modulus.
DEVICE FIELD FIELD_sqr_nvidia(FIELD a) {
  // Perform optimized squaring: aa = a·a over 2*FIELD_LIMBS limbs.
  limb aa[2 * FIELD_LIMBS];
  FIELD_sqr_v1(a.val, aa);

  // Copy the low half into a scratch buffer for the reduction step.
  uint32_t io[FIELD_LIMBS];
  #pragma unroll
  for(int i=0;i<FIELD_LIMBS;i++) {
    io[i]=aa[i];
  }
  // Reduce the low limbs against the modulus FIELD_P. FIELD_INV is the
  // per-field inverse constant consumed as FIELD_reduce's `np0` parameter
  // — presumably Montgomery's -p⁻¹ mod 2³²; confirm against FIELD_reduce.
  FIELD_reduce(io, FIELD_INV, FIELD_P.val);

  // Add io to the upper words of aa, propagating the hardware carry flag
  // across the high half (add_cc sets it, addc_cc consumes+sets, addc
  // consumes it on the final limb where no further carry is needed).
  aa[FIELD_LIMBS] = add_cc(aa[FIELD_LIMBS], io[0]);
  int j;
  #pragma unroll
  for (j = 1; j < FIELD_LIMBS - 1; j++) {
    aa[j + FIELD_LIMBS] = addc_cc(aa[j + FIELD_LIMBS], io[j]);
  }
  aa[2 * FIELD_LIMBS - 1] = addc(aa[2 * FIELD_LIMBS - 1], io[FIELD_LIMBS - 1]);

  // The reduced result lives in the upper FIELD_LIMBS limbs.
  FIELD r;
  #pragma unroll
  for (int i = 0; i < FIELD_LIMBS; i++) {
    r.val[i] = aa[i + FIELD_LIMBS];
  }

  // Final conditional subtraction brings the result into [0, p).
  if (FIELD_gte(r, FIELD_P)) {
    r = FIELD_sub_(r, FIELD_P);
  }

  return r;
}

#endif

// Modular multiplication
Expand Down Expand Up @@ -310,9 +432,15 @@ DEVICE FIELD FIELD_mul(FIELD a, FIELD b) {

// Squaring is a special case of multiplication which can be done ~1.5x faster.
// https://stackoverflow.com/a/16388571/1348497
#ifdef CUDA
// CUDA builds route through the dedicated squaring path that uses the
// carry-chain madlo/madhi intrinsics (FIELD_sqr_nvidia above).
DEVICE FIELD FIELD_sqr(FIELD a) {
  return FIELD_sqr_nvidia(a);
}
#else
// Non-CUDA backends fall back to a plain self-multiplication.
DEVICE FIELD FIELD_sqr(FIELD a) {
  return FIELD_mul(a, a);
}
#endif

// Left-shift the limbs by one bit and subtract by modulus in case of overflow.
// Faster version of FIELD_add(a, a)
Expand Down Expand Up @@ -375,3 +503,17 @@ DEVICE uint FIELD_get_bits(FIELD l, uint skip, uint window) {
}
return ret;
}

// Return the `i`th bit of the field element, counting from the least
// significant bit (bit 0 of limb 0).
DEVICE bool FIELD_get_bit_lsb(FIELD l, uint i) {
  uint limb_index = i / FIELD_LIMB_BITS;
  uint bit_offset = i % FIELD_LIMB_BITS;
  uint limb = l.val[limb_index];
  return (limb >> bit_offset) & 1;
}

// Extract `window` consecutive bits of the field element, starting at bit
// `skip` from the LSB. The bit at `skip` becomes bit 0 of the result.
DEVICE uint FIELD_get_bits_lsb(FIELD l, uint skip, uint window) {
  uint acc = 0;
  // Walk from the most significant requested bit down to `skip`,
  // shifting previous bits up as each new one is appended.
  for (uint i = window; i > 0; i--) {
    uint bit = (uint)FIELD_get_bit_lsb(l, skip + i - 1);
    acc = (acc << 1) | bit;
  }
  return acc;
}
2 changes: 1 addition & 1 deletion ec-gpu-gen/src/cl/multiexp.cl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ KERNEL void POINT_multiexp(

POINT_jacobian res = POINT_ZERO;
for(uint i = nstart; i < nend; i++) {
uint ind = EXPONENT_get_bits(exps[i], bits, w);
uint ind = EXPONENT_get_bits_lsb(exps[i], bits, w);

#if defined(OPENCL_NVIDIA) || defined(CUDA)
// O_o, weird optimization, having a single special case makes it
Expand Down
53 changes: 19 additions & 34 deletions ec-gpu-gen/src/fft.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::cmp;
use std::sync::{Arc, RwLock};

use ark_ff::FftField;
use ec_gpu::GpuName;
use ff::Field;
use log::{error, info};
use rust_gpu_tools::{program_closures, LocalBuffer, Program};

Expand All @@ -16,17 +16,14 @@ const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 7; // 128
/// FFT kernel for a single GPU.
pub struct SingleFftKernel<'a, F>
where
F: Field + GpuName,
F: FftField + GpuName,
{
program: Program,
/// An optional function which will be called at places where it is possible to abort the FFT
/// calculations. If it returns true, the calculation will be aborted with an
/// [`EcError::Aborted`].
maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
_phantom: std::marker::PhantomData<F>,
}

impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
impl<'a, F: FftField + GpuName> SingleFftKernel<'a, F> {
/// Create a new FFT instance for the given device.
///
/// The `maybe_abort` function is called when it is possible to abort the computation, without
Expand All @@ -48,23 +45,18 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
let closures = program_closures!(|program, input: &mut [F]| -> EcResult<()> {
let n = 1 << log_n;
// All usages are safe as the buffers are initialized from either the host or the GPU
// before they are read.
let mut src_buffer = unsafe { program.create_buffer::<F>(n)? };
let mut dst_buffer = unsafe { program.create_buffer::<F>(n)? };
// The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg`
let max_deg = cmp::min(MAX_LOG2_RADIX, log_n);

// Precalculate:
// [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))]
// Precalculate twiddle factors
let mut pq = vec![F::ZERO; 1 << max_deg >> 1];
let twiddle = omega.pow_vartime([(n >> max_deg) as u64]);
let twiddle = omega.pow([(n >> max_deg) as u64]);
pq[0] = F::ONE;
if max_deg > 1 {
pq[1] = twiddle;
for i in 2..(1 << max_deg >> 1) {
pq[i] = pq[i - 1];
pq[i].mul_assign(&twiddle);
pq[i] = pq[i - 1] * twiddle;
}
}
let pq_buffer = program.create_buffer_from_slice(&pq)?;
Expand All @@ -73,24 +65,21 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
let mut omegas = vec![F::ZERO; 32];
omegas[0] = *omega;
for i in 1..LOG2_MAX_ELEMENTS {
omegas[i] = omegas[i - 1].pow_vartime([2u64]);
omegas[i] = omegas[i - 1].pow([2u64]);
}
let omegas_buffer = program.create_buffer_from_slice(&omegas)?;

program.write_from_buffer(&mut src_buffer, &*input)?;
// Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html)
let mut log_p = 0u32;
// Each iteration performs a FFT round

while log_p < log_n {
if let Some(maybe_abort) = &self.maybe_abort {
if maybe_abort() {
return Err(EcError::Aborted);
}
}

// 1=>radix2, 2=>radix4, 3=>radix8, ...
let deg = cmp::min(max_deg, log_n - log_p);

let n = 1u32 << log_n;
let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE);
let global_work_size = n >> deg;
Expand All @@ -117,7 +106,6 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
}

program.read_into_buffer(&src_buffer, input)?;

Ok(())
});

Expand All @@ -128,14 +116,14 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
/// One FFT kernel for each GPU available.
pub struct FftKernel<'a, F>
where
F: Field + GpuName,
F: FftField + GpuName,
{
kernels: Vec<SingleFftKernel<'a, F>>,
}

impl<'a, F> FftKernel<'a, F>
where
F: Field + GpuName,
F: FftField + GpuName,
{
/// Create new kernels, one for each given device.
pub fn create(programs: Vec<Program>) -> EcResult<Self> {
Expand Down Expand Up @@ -175,28 +163,20 @@ where
if kernels.is_empty() {
return Err(EcError::Simple("No working GPUs found!"));
}
info!("FFT: {} working device(s) selected. ", kernels.len());
info!("FFT: {} working device(s) selected.", kernels.len());
for (i, k) in kernels.iter().enumerate() {
info!("FFT: Device {}: {}", i, k.program.device_name(),);
info!("FFT: Device {}: {}", i, k.program.device_name());
}

Ok(Self { kernels })
}

/// Performs FFT on `input`
/// * `omega` - Special value `omega` is used for FFT over finite-fields
/// * `log_n` - Specifies log2 of number of elements
///
/// Uses the first available GPU.
/// Performs FFT on `input` using the first available GPU.
pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
self.kernels[0].radix_fft(input, omega, log_n)
}

/// Performs FFT on `inputs`
/// * `omega` - Special value `omega` is used for FFT over finite-fields
/// * `log_n` - Specifies log2 of number of elements
///
/// Uses all available GPUs to distribute the work.
/// Performs FFT on `inputs` using all available GPUs.
pub fn radix_fft_many(
&mut self,
inputs: &mut [&mut [F]],
Expand Down Expand Up @@ -237,3 +217,8 @@ where
Arc::try_unwrap(result).unwrap().into_inner().unwrap()
}
}

/// Backward-compatible alias for callers that used the arkworks-specific
/// single-GPU kernel name; it now resolves to the generic [`SingleFftKernel`].
pub type SingleFftKernelArk<'a, F> = SingleFftKernel<'a, F>;
/// Backward-compatible alias for the multi-GPU kernel, resolving to [`FftKernel`].
pub type FftKernelArk<'a, F> = FftKernel<'a, F>;
Loading