diff --git a/Cargo.toml b/Cargo.toml
index a36e52f..86c90fa 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -5,3 +5,7 @@ members = [
   "gpu-tests",
 ]
 resolver = "2"
+
+[workspace.dependencies]
+ec-gpu = { path = "./ec-gpu" }
+tracing-profile = { git = "https://github.com/IrreducibleOSS/tracing-profile" }
diff --git a/ec-gpu-gen/Cargo.toml b/ec-gpu-gen/Cargo.toml
index 51f75b9..8584597 100644
--- a/ec-gpu-gen/Cargo.toml
+++ b/ec-gpu-gen/Cargo.toml
@@ -12,11 +12,11 @@ rust-version = "1.83.0"
 [dependencies]
 bitvec = "1.0.1"
 crossbeam-channel = "0.5.1"
-ec-gpu = "0.2.0"
+ec-gpu.workspace = true
 execute = "0.2.9"
-ff = { version = "0.13.0", default-features = false }
-group = "0.13.0"
 hex = "0.4"
+# Pin home to version compatible with Rust 1.83 (edition 2021)
+home = "=0.5.9"
 log = "0.4.14"
 num_cpus = "1.13.0"
 once_cell = "1.8.0"
@@ -24,22 +24,23 @@ rayon = "1.5.1"
 rust-gpu-tools = { version = "0.7.0", default-features = false, optional = true }
 sha2 = "0.10"
 thiserror = "1.0.30"
+tracing = "0.1"
 yastl = "0.1.2"
 
+ark-bn254 = { version = "0.5.0", optional = true }
+ark-std = { version = "0.5.0", optional = true }
+ark-ec = "0.5.0"
+ark-ff = "0.5.0"
+ark-serialize = { version = "0.5.0", optional = true }
+
 [dev-dependencies]
-# NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just
-# temporarily until https://github.com/zkcrypto/group/pull/29 is fixed. Then
-# we won't need the exports of `Fp` and `Fp2` any more.
-#blstrs = { version = "0.6.0", features = ["__private_bench"], optional = true }
-blstrs = { version = "0.7.0", features = ["__private_bench", "gpu"] }
 rand = "0.8"
 lazy_static = "1.2"
-pairing = "0.23.0"
 temp-env = "0.3.0"
-rand_core = "0.6.3"
-rand_xorshift = "0.3.0"
+ark-bn254 = "0.5.0"
 
 [features]
 default = []
 cuda = ["rust-gpu-tools/cuda"]
 opencl = ["rust-gpu-tools/opencl"]
+arkworks = ["ec-gpu/arkworks", "dep:ark-bn254", "dep:ark-std", "dep:ark-serialize"]
diff --git a/ec-gpu-gen/src/cl/field.cl b/ec-gpu-gen/src/cl/field.cl
index 5846b59..4aa1c77 100644
--- a/ec-gpu-gen/src/cl/field.cl
+++ b/ec-gpu-gen/src/cl/field.cl
@@ -164,6 +164,94 @@ DEVICE void FIELD_reduce(uint32_t accLow[FIELD_LIMBS], uint32_t np0, uint32_t fq
   accLow[i]=chain_add(&chain5, accLow[i], highCarry);
 }
 
+// Schoolbook squaring xx = x * x (FIELD_LIMBS limbs in, 2 * FIELD_LIMBS limbs out) using symmetry:
+// each cross product x[i]*x[j] with i < j is computed once (steps 1-2), that sum is doubled
+// (step 3), then the diagonal x[i]*x[i] is added (step 4): n(n+1)/2 multiplies instead of n^2.
+DEVICE inline
+void FIELD_sqr_v1(uint32_t *x, uint32_t *xx) {
+  const uint32_t xLimbs = FIELD_LIMBS;
+  const uint32_t xxLimbs = FIELD_LIMBS * 2;
+  uint32_t temp[FIELD_LIMBS * 2];
+  uint32_t carry = 0;
+
+  #pragma unroll
+  for (int32_t i = 0; i < xxLimbs; i++) {
+    temp[i] = 0;
+  }
+
+  // Step 1: accumulate cross products x[i]*x[j] (i < j) whose i+j is odd, stored one limb low
+  // (at i+j-1 and i+j) with one carry chain per row, following the pattern of FIELD_mult_v1.
+  #pragma unroll
+  for (int32_t i = 0; i < xLimbs; i++) {
+    chain_t chain1;
+    chain_init(&chain1);
+    #pragma unroll
+    for (int32_t j = i + 1; j < xLimbs; j++) {
+      if ((i + j) % 2 == 1) {
+        temp[i + j - 1] = chain_madlo(&chain1, x[i], x[j], temp[i + j - 1]);
+        temp[i + j]     = chain_madhi(&chain1, x[i], x[j], temp[i + j]);
+      }
+    }
+    if (i % 2 == 1 && i + 1 < xLimbs) {
+      temp[i + xLimbs - 1] = chain_add(&chain1, 0, 0);
+    }
+  }
+
+  // Move every limb up one index (temp[i] = temp[i-1]) to undo the one-limb offset used in step 1.
+  #pragma unroll
+  for (int32_t i = xxLimbs - 1; i > 0; i--) {
+    temp[i] = temp[i - 1];
+  }
+  temp[0] = 0;
+
+  // Step 2: accumulate cross products with even i+j at limbs i+j and i+j+1; `carry` hands a row's carry-out to a later row.
+  carry = 0;
+  #pragma unroll
+  for (int32_t i = 0; i < xLimbs; i++) {
+    chain_t chain2;
+    chain_init(&chain2);
+
+    #pragma unroll
+    for (int32_t j = i + 1; j < xLimbs; j++) {
+      if ((i + j) % 2 == 0) {
+        temp[i + j]     = chain_madlo(&chain2, x[i], x[j], temp[i + j]);
+        temp[i + j + 1] = chain_madhi(&chain2, x[i], x[j], temp[i + j + 1]);
+      }
+    }
+    if ((i + xLimbs) % 2 == 0 && i != xLimbs - 1 && i + 1 < xLimbs) {
+      temp[i + xLimbs]     = chain_add(&chain2, temp[i + xLimbs], carry);
+      temp[i + xLimbs + 1] = chain_add(&chain2, temp[i + xLimbs + 1], 0);
+      carry = chain_add(&chain2, 0, 0);
+    }
+    if ((i + xLimbs) % 2 == 1 && i != xLimbs - 1 && i + 1 < xLimbs) {
+      carry = chain_add(&chain2, carry, 0);
+    }
+  }
+
+  // Step 3: double the cross-product sum by shifting all limbs left one bit; the final carry-out is dropped.
+  carry = 0;
+  #pragma unroll
+  for (int32_t i = 0; i < xxLimbs; i++) {
+    uint32_t new_carry = temp[i] >> 31;
+    temp[i] = (temp[i] << 1) | carry;
+    carry = new_carry;
+  }
+
+  // Step 4: add the diagonal products x[i]*x[i] at limbs 2i and 2i+1 in a single carry chain.
+  chain_t chain3;
+  chain_init(&chain3);
+  #pragma unroll
+  for (int32_t i = 0; i < xLimbs; i++) {
+    temp[2 * i]     = chain_madlo(&chain3, x[i], x[i], temp[2 * i]);
+    temp[2 * i + 1] = chain_madhi(&chain3, x[i], x[i], temp[2 * i + 1]);
+  }
+
+  #pragma unroll
+  for (int32_t i = 0; i < xxLimbs; i++) {
+    xx[i] = temp[i];
+  }
+}
+
 // Requirement: yLimbs >= xLimbs
 DEVICE inline
 void FIELD_mult_v1(uint32_t *x, uint32_t *y, uint32_t *xy) {
@@ -262,6 +350,40 @@ DEVICE FIELD FIELD_mul_nvidia(FIELD a, FIELD b) {
   return r;
 }
 
+DEVICE FIELD FIELD_sqr_nvidia(FIELD a) {
+  // Square `a` into the double-width product `aa` (2 * FIELD_LIMBS limbs) via FIELD_sqr_v1.
+  limb aa[2 * FIELD_LIMBS];
+  FIELD_sqr_v1(a.val, aa);
+
+  uint32_t io[FIELD_LIMBS];
+  #pragma unroll
+  for(int i=0;i<FIELD_LIMBS;i++) {
+    io[i]=aa[i];
+  }
+  FIELD_reduce(io, FIELD_INV, FIELD_P.val);
+
+  // Add io (the FIELD_reduce output for the low limbs) onto the upper FIELD_LIMBS limbs of aa.
+  aa[FIELD_LIMBS] = add_cc(aa[FIELD_LIMBS], io[0]);
+  int j;
+  #pragma unroll
+  for (j = 1; j < FIELD_LIMBS - 1; j++) {
+    aa[j + FIELD_LIMBS] = addc_cc(aa[j + FIELD_LIMBS], io[j]);
+  }
+  aa[2 * FIELD_LIMBS - 1] = addc(aa[2 * FIELD_LIMBS - 1], io[FIELD_LIMBS - 1]);
+
+  FIELD r;
+  #pragma unroll
+  for (int i = 0; i < FIELD_LIMBS; i++) {
+    r.val[i] = aa[i + FIELD_LIMBS];
+  }
+
+  if (FIELD_gte(r, FIELD_P)) {
+    r = FIELD_sub_(r, FIELD_P);
+  }
+
+  return r;
+}
+
 #endif
 
 // Modular multiplication
@@ -310,9 +432,15 @@ DEVICE FIELD FIELD_mul(FIELD a, FIELD b) {
 
 // Squaring is a special case of multiplication which can be done ~1.5x faster.
 // https://stackoverflow.com/a/16388571/1348497
+#ifdef CUDA
+DEVICE FIELD FIELD_sqr(FIELD a) {
+  return FIELD_sqr_nvidia(a);
+}
+#else
 DEVICE FIELD FIELD_sqr(FIELD a) {
   return FIELD_mul(a, a);
 }
+#endif
 
 // Left-shift the limbs by one bit and subtract by modulus in case of overflow.
 // Faster version of FIELD_add(a, a)
@@ -375,3 +503,17 @@ DEVICE uint FIELD_get_bits(FIELD l, uint skip, uint window) {
   }
   return ret;
 }
+
+// Get the `i`th bit of the field element, where bit 0 is the lowest bit of limb `val[0]`.
+DEVICE bool FIELD_get_bit_lsb(FIELD l, uint i) {
+  return (l.val[i / FIELD_LIMB_BITS] >> (i % FIELD_LIMB_BITS)) & 1;
+}
+
+// Get `window` consecutive bits starting at bit `skip` (LSB-based); bit `skip + i` becomes bit `i` of the result.
+DEVICE uint FIELD_get_bits_lsb(FIELD l, uint skip, uint window) {
+  uint ret = 0;
+  for(uint i = 0; i < window; i++) {
+    ret |= ((uint)FIELD_get_bit_lsb(l, skip + i)) << i;
+  }
+  return ret;
+}
diff --git a/ec-gpu-gen/src/cl/multiexp.cl b/ec-gpu-gen/src/cl/multiexp.cl
index 6edc445..2c77693 100644
--- a/ec-gpu-gen/src/cl/multiexp.cl
+++ b/ec-gpu-gen/src/cl/multiexp.cl
@@ -43,7 +43,7 @@ KERNEL void POINT_multiexp(
 
   POINT_jacobian res = POINT_ZERO;
   for(uint i = nstart; i < nend; i++) {
-    uint ind = EXPONENT_get_bits(exps[i], bits, w);
+    uint ind = EXPONENT_get_bits_lsb(exps[i], bits, w);
 
     #if defined(OPENCL_NVIDIA) || defined(CUDA)
       // O_o, weird optimization, having a single special case makes it
diff --git a/ec-gpu-gen/src/fft.rs b/ec-gpu-gen/src/fft.rs
index aef4815..e19d51d 100644
--- a/ec-gpu-gen/src/fft.rs
+++ b/ec-gpu-gen/src/fft.rs
@@ -1,8 +1,8 @@
 use std::cmp;
 use std::sync::{Arc, RwLock};
 
+use ark_ff::FftField;
 use ec_gpu::GpuName;
-use ff::Field;
 use log::{error, info};
 use rust_gpu_tools::{program_closures, LocalBuffer, Program};
 
@@ -16,17 +16,14 @@ const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 7; // 128
 /// FFT kernel for a single GPU.
 pub struct SingleFftKernel<'a, F>
 where
-    F: Field + GpuName,
+    F: FftField + GpuName,
 {
     program: Program,
-    /// An optional function which will be called at places where it is possible to abort the FFT
-    /// calculations. If it returns true, the calculation will be aborted with an
-    /// [`EcError::Aborted`].
     maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
     _phantom: std::marker::PhantomData<F>,
 }
 
-impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
+impl<'a, F: FftField + GpuName> SingleFftKernel<'a, F> {
     /// Create a new FFT instance for the given device.
     ///
     /// The `maybe_abort` function is called when it is possible to abort the computation, without
@@ -48,23 +45,18 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
     pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
         let closures = program_closures!(|program, input: &mut [F]| -> EcResult<()> {
             let n = 1 << log_n;
-            // All usages are safe as the buffers are initialized from either the host or the GPU
-            // before they are read.
             let mut src_buffer = unsafe { program.create_buffer::<F>(n)? };
             let mut dst_buffer = unsafe { program.create_buffer::<F>(n)? };
-            // The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg`
             let max_deg = cmp::min(MAX_LOG2_RADIX, log_n);
 
-            // Precalculate:
-            // [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))]
+            // Precalculate twiddle factors
             let mut pq = vec![F::ZERO; 1 << max_deg >> 1];
-            let twiddle = omega.pow_vartime([(n >> max_deg) as u64]);
+            let twiddle = omega.pow([(n >> max_deg) as u64]);
             pq[0] = F::ONE;
             if max_deg > 1 {
                 pq[1] = twiddle;
                 for i in 2..(1 << max_deg >> 1) {
-                    pq[i] = pq[i - 1];
-                    pq[i].mul_assign(&twiddle);
+                    pq[i] = pq[i - 1] * twiddle;
                 }
             }
             let pq_buffer = program.create_buffer_from_slice(&pq)?;
@@ -73,14 +65,13 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
             let mut omegas = vec![F::ZERO; 32];
             omegas[0] = *omega;
             for i in 1..LOG2_MAX_ELEMENTS {
-                omegas[i] = omegas[i - 1].pow_vartime([2u64]);
+                omegas[i] = omegas[i - 1].pow([2u64]);
             }
             let omegas_buffer = program.create_buffer_from_slice(&omegas)?;
 
             program.write_from_buffer(&mut src_buffer, &*input)?;
-            // Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html)
             let mut log_p = 0u32;
-            // Each iteration performs a FFT round
+
             while log_p < log_n {
                 if let Some(maybe_abort) = &self.maybe_abort {
                     if maybe_abort() {
@@ -88,9 +79,7 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
                     }
                 }
 
-                // 1=>radix2, 2=>radix4, 3=>radix8, ...
                 let deg = cmp::min(max_deg, log_n - log_p);
-
                 let n = 1u32 << log_n;
                 let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE);
                 let global_work_size = n >> deg;
@@ -117,7 +106,6 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
             }
 
             program.read_into_buffer(&src_buffer, input)?;
-
             Ok(())
         });
 
@@ -128,14 +116,14 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
 /// One FFT kernel for each GPU available.
 pub struct FftKernel<'a, F>
 where
-    F: Field + GpuName,
+    F: FftField + GpuName,
 {
     kernels: Vec<SingleFftKernel<'a, F>>,
 }
 
 impl<'a, F> FftKernel<'a, F>
 where
-    F: Field + GpuName,
+    F: FftField + GpuName,
 {
     /// Create new kernels, one for each given device.
     pub fn create(programs: Vec<Program>) -> EcResult<Self> {
@@ -175,28 +163,20 @@ where
         if kernels.is_empty() {
             return Err(EcError::Simple("No working GPUs found!"));
         }
-        info!("FFT: {} working device(s) selected. ", kernels.len());
+        info!("FFT: {} working device(s) selected.", kernels.len());
         for (i, k) in kernels.iter().enumerate() {
-            info!("FFT: Device {}: {}", i, k.program.device_name(),);
+            info!("FFT: Device {}: {}", i, k.program.device_name());
         }
 
         Ok(Self { kernels })
     }
 
-    /// Performs FFT on `input`
-    /// * `omega` - Special value `omega` is used for FFT over finite-fields
-    /// * `log_n` - Specifies log2 of number of elements
-    ///
-    /// Uses the first available GPU.
+    /// Performs FFT on `input` using the first available GPU.
     pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
         self.kernels[0].radix_fft(input, omega, log_n)
     }
 
-    /// Performs FFT on `inputs`
-    /// * `omega` - Special value `omega` is used for FFT over finite-fields
-    /// * `log_n` - Specifies log2 of number of elements
-    ///
-    /// Uses all available GPUs to distribute the work.
+    /// Performs FFT on `inputs` using all available GPUs.
     pub fn radix_fft_many(
         &mut self,
         inputs: &mut [&mut [F]],
@@ -237,3 +217,8 @@ where
         Arc::try_unwrap(result).unwrap().into_inner().unwrap()
     }
 }
+
+/// Type alias for backward compatibility
+pub type SingleFftKernelArk<'a, F> = SingleFftKernel<'a, F>;
+/// Type alias for backward compatibility
+pub type FftKernelArk<'a, F> = FftKernel<'a, F>;
diff --git a/ec-gpu-gen/src/fft_cpu.rs b/ec-gpu-gen/src/fft_cpu.rs
index 777ffb5..acabc4d 100644
--- a/ec-gpu-gen/src/fft_cpu.rs
+++ b/ec-gpu-gen/src/fft_cpu.rs
@@ -1,4 +1,4 @@
-use ff::PrimeField;
+use ark_ff::FftField;
 
 use crate::threadpool::Worker;
 
@@ -7,7 +7,7 @@ use crate::threadpool::Worker;
 /// The input `a` is mutated and contains the result when this function returns. The length of the
 /// input vector must be `2^log_n`.
 #[allow(clippy::many_single_char_names)]
-pub fn serial_fft<F: PrimeField>(a: &mut [F], omega: &F, log_n: u32) {
+pub fn serial_fft<F: FftField>(a: &mut [F], omega: &F, log_n: u32) {
     fn bitreverse(mut n: u32, l: u32) -> u32 {
         let mut r = 0;
         for _ in 0..l {
@@ -29,7 +29,7 @@ pub fn serial_fft<F: PrimeField>(a: &mut [F], omega: &F, log_n: u32) {
 
     let mut m = 1;
     for _ in 0..log_n {
-        let w_m = omega.pow_vartime([u64::from(n / (2 * m))]);
+        let w_m = omega.pow([u64::from(n / (2 * m))]);
 
         let mut k = 0;
         while k < n {
@@ -56,7 +56,7 @@ pub fn serial_fft<F: PrimeField>(a: &mut [F], omega: &F, log_n: u32) {
 /// The result is written to the input `a`.
 /// The number of threads used will be `2^log_threads`.
 /// There must be more items to process than threads.
-pub fn parallel_fft<F: PrimeField>(
+pub fn parallel_fft<F: FftField>(
     a: &mut [F],
     worker: &Worker,
     omega: &F,
@@ -68,7 +68,7 @@ pub fn parallel_fft<F: PrimeField>(
     let num_threads = 1 << log_threads;
     let log_new_n = log_n - log_threads;
     let mut tmp = vec![vec![F::ZERO; 1 << log_new_n]; num_threads];
-    let new_omega = omega.pow_vartime([num_threads as u64]);
+    let new_omega = omega.pow([num_threads as u64]);
 
     worker.scope(0, |scope, _| {
         let a = &*a;
@@ -76,8 +76,8 @@ pub fn parallel_fft<F: PrimeField>(
         for (j, tmp) in tmp.iter_mut().enumerate() {
             scope.execute(move || {
                 // Shuffle into a sub-FFT
-                let omega_j = omega.pow_vartime([j as u64]);
-                let omega_step = omega.pow_vartime([(j as u64) << log_new_n]);
+                let omega_j = omega.pow([j as u64]);
+                let omega_step = omega.pow([(j as u64) << log_new_n]);
 
                 let mut elt = F::ONE;
                 for (i, tmp) in tmp.iter_mut().enumerate() {
@@ -97,7 +97,6 @@ pub fn parallel_fft<F: PrimeField>(
         }
     });
 
-    // TODO: does this hurt or help?
     worker.scope(a.len(), |scope, chunk| {
         let tmp = &tmp;
 
@@ -120,15 +119,13 @@ mod tests {
 
     use std::cmp::min;
 
-    use blstrs::Scalar as Fr;
-    use ff::PrimeField;
-    use rand_core::RngCore;
+    use ark_bn254::Fr;
+    use ark_ff::UniformRand;
 
-    fn omega<F: PrimeField>(num_coeffs: usize) -> F {
-        // Compute omega, the 2^exp primitive root of unity
+    fn omega<F: FftField>(num_coeffs: usize) -> F {
         let exp = (num_coeffs as f32).log2().floor() as u32;
-        let mut omega = F::ROOT_OF_UNITY;
-        for _ in exp..F::S {
+        let mut omega = F::TWO_ADIC_ROOT_OF_UNITY;
+        for _ in exp..F::TWO_ADICITY {
             omega = omega.square();
         }
         omega
@@ -136,30 +133,24 @@ mod tests {
 
     #[test]
     fn parallel_fft_consistency() {
-        fn test_consistency<F: PrimeField, R: RngCore>(rng: &mut R) {
-            let worker = Worker::new();
+        let worker = Worker::new();
+        let mut rng = rand::thread_rng();
 
-            for _ in 0..5 {
-                for log_d in 0..10 {
-                    let d = 1 << log_d;
+        for _ in 0..5 {
+            for log_d in 0..10 {
+                let d = 1 << log_d;
 
-                    let mut v1_coeffs = (0..d).map(|_| F::random(&mut *rng)).collect::<Vec<_>>();
-                    let mut v2_coeffs = v1_coeffs.clone();
-                    let v1_omega = omega::<F>(v1_coeffs.len());
-                    let v2_omega = v1_omega;
+                let mut v1_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::<Vec<_>>();
+                let mut v2_coeffs = v1_coeffs.clone();
+                let fft_omega = omega::<Fr>(v1_coeffs.len());
 
-                    for log_threads in log_d..min(log_d + 1, 3) {
-                        parallel_fft::<F>(&mut v1_coeffs, &worker, &v1_omega, log_d, log_threads);
-                        serial_fft::<F>(&mut v2_coeffs, &v2_omega, log_d);
+                for log_threads in log_d..min(log_d + 1, 3) {
+                    parallel_fft::<Fr>(&mut v1_coeffs, &worker, &fft_omega, log_d, log_threads);
+                    serial_fft::<Fr>(&mut v2_coeffs, &fft_omega, log_d);
 
-                        assert!(v1_coeffs == v2_coeffs);
-                    }
+                    assert!(v1_coeffs == v2_coeffs);
                 }
             }
         }
-
-        let rng = &mut rand::thread_rng();
-
-        test_consistency::<Fr, _>(rng);
     }
 }
diff --git a/ec-gpu-gen/src/lib.rs b/ec-gpu-gen/src/lib.rs
index f487959..77f1735 100644
--- a/ec-gpu-gen/src/lib.rs
+++ b/ec-gpu-gen/src/lib.rs
@@ -4,29 +4,35 @@
 //!
 //! There is also support for Fast Fourier Transform and Multiexponentiation.
 //!
-//! This crate usually creates GPU kernels at compile-time. CUDA generates a [fatbin], which OpenCL only generates the source code, which is then compiled at run-time.
+//! This crate usually creates GPU kernels at compile-time. CUDA generates a [fatbin], while OpenCL
+//! only generates the source code, which is then compiled at run-time.
 //!
-//! In order to make things easier to use, there are helper functions available. You would put some code into `build.rs`, that generates the kernels, and some code into your library which then consumes those generated kernels. The kernels will be directly embedded into your program/library. If something goes wrong, you will get an error at compile-time.
+//! In order to make things easier to use, there are helper functions available. You would put some
+//! code into `build.rs`, that generates the kernels, and some code into your library which then
+//! consumes those generated kernels. The kernels will be directly embedded into your program/library.
+//! If something goes wrong, you will get an error at compile-time.
 //!
 //! In this example we will make use of the FFT functionality. Add to your `build.rs`:
 //!
 //! ```no_run
-//! use blstrs::Scalar;
+//! use ark_bn254::Fr;
 //! use ec_gpu_gen::SourceBuilder;
 //!
-//! let source_builder = SourceBuilder::new().add_fft::<Scalar>();
+//! let source_builder = SourceBuilder::new().add_fft::<Fr>();
 //! ec_gpu_gen::generate(&source_builder);
 //! ```
 //!
-//! The `ec_gpu_gen::generate()` takes care of the actual code generation/compilation. It will automatically create a CUDA and/or OpenCL kernel. It will define two environment variables, which are meant for internal use. `_EC_GPU_CUDA_KERNEL_FATBIN` that points to the compiled CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE` that points to the generated OpenCL source.
+//! The `ec_gpu_gen::generate()` takes care of the actual code generation/compilation. It will
+//! automatically create a CUDA and/or OpenCL kernel. It will define two environment variables,
+//! which are meant for internal use. `_EC_GPU_CUDA_KERNEL_FATBIN` that points to the compiled
+//! CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE` that points to the generated OpenCL source.
 //!
-//! Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a program, for a given GPU device. Using FFT within your library would then look like this:
+//! Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a
+//! program, for a given GPU device. Using FFT within your library would then look like this:
 //!
-//! ```no_compile
-//! use blstrs::Scalar;
-//! use ec_gpu_gen::{
-//!     rust_gpu_tools::Device,
-//! };
+//! ```ignore
+//! use ark_bn254::Fr;
+//! use ec_gpu_gen::{fft::FftKernel, rust_gpu_tools::Device};
 //!
 //! let devices = Device::all();
 //! let programs = devices
@@ -35,7 +41,7 @@
 //!     .collect::<Result<_, _>>()
 //!     .expect("Cannot create programs!");
 //!
-//! let mut kern = FftKernel::<Scalar>::create(programs).expect("Cannot initialize kernel!");
+//! let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
 //! kern.radix_fft_many(&mut [&mut coeffs], &[omega], &[log_d]).expect("GPU FFT failed!");
 //! ```
 //!
@@ -59,8 +65,6 @@ pub mod fft_cpu;
 /// Multiexponentiation on the GPU.
 #[cfg(any(feature = "cuda", feature = "opencl"))]
 pub mod multiexp;
-/// Multiexponentiation on the CPU.
-pub mod multiexp_cpu;
 /// Helpers for multithreaded code.
 pub mod threadpool;
 
@@ -70,3 +74,8 @@ pub use rust_gpu_tools;
 
 pub use error::{EcError, EcResult};
 pub use source::{generate, SourceBuilder};
+
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+pub use fft::{FftKernel, FftKernelArk, SingleFftKernel, SingleFftKernelArk};
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+pub use multiexp::{G1AffineM, G2AffineM, GpuAffine, MultiexpKernel, SingleMultiexpKernel};
diff --git a/ec-gpu-gen/src/multiexp.rs b/ec-gpu-gen/src/multiexp.rs
index 24780b8..8a8a3a8 100644
--- a/ec-gpu-gen/src/multiexp.rs
+++ b/ec-gpu-gen/src/multiexp.rs
@@ -1,11 +1,12 @@
 use std::ops::AddAssign;
 use std::sync::{Arc, RwLock};
 
+use ark_ec::CurveGroup;
+use ark_ff::{AdditiveGroup, BigInteger, PrimeField};
 use ec_gpu::GpuName;
-use ff::PrimeField;
-use group::{prime::PrimeCurveAffine, Group};
 use log::{error, info};
 use rust_gpu_tools::{program_closures, Device, Program};
+use tracing::{debug_span, info_span};
 use yastl::Scope;
 
 use crate::{
@@ -13,6 +14,19 @@ use crate::{
     threadpool::Worker,
 };
 
+/// Trait for curve affine points that have a GPU-compatible representation.
+pub trait GpuAffine: GpuName + Clone + Send + Sync + Sized {
+    /// The GPU-compatible representation type.
+    type GpuRepr: Copy + Clone + Default + Send + Sync;
+    /// The scalar field type.
+    type ScalarField: PrimeField;
+    /// The projective group type.
+    type Group: CurveGroup<ScalarField = Self::ScalarField> + AddAssign;
+
+    /// Convert the affine point to its GPU representation.
+    fn to_gpu(&self) -> Self::GpuRepr;
+}
+
 /// On the GPU, the exponents are split into windows, this is the maximum number of such windows.
 const MAX_WINDOW_SIZE: usize = 10;
 /// In CUDA this is the number of blocks per grid (grid size).
@@ -45,7 +59,7 @@ const fn work_units(compute_units: u32, compute_capabilities: Option<(u32, u32)>
 /// Multiexp kernel for a single GPU.
 pub struct SingleMultiexpKernel<'a, G>
 where
-    G: PrimeCurveAffine,
+    G: GpuAffine,
 {
     program: Program,
     /// The number of exponentiations the GPU can handle in a single execution of the kernel.
@@ -58,18 +72,17 @@ where
     /// [`EcError::Aborted`].
     maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
 
-    _phantom: std::marker::PhantomData<G::Scalar>,
+    _phantom: std::marker::PhantomData<G>,
 }
 
 /// Calculates the maximum number of terms that can be put onto the GPU memory.
 fn calc_chunk_size<G>(mem: u64, work_units: usize) -> usize
 where
-    G: PrimeCurveAffine,
-    G::Scalar: PrimeField,
+    G: GpuAffine,
 {
-    let aff_size = std::mem::size_of::<G>();
-    let exp_size = exp_size::<G::Scalar>();
-    let proj_size = std::mem::size_of::<G::Curve>();
+    let aff_size = std::mem::size_of::<G::GpuRepr>();
+    let exp_size = exp_size::<G::ScalarField>();
+    let proj_size = std::mem::size_of::<G::Group>();
 
     // Leave `MEMORY_PADDING` percent of the memory free.
     let max_memory = ((mem as f64) * (1f64 - MEMORY_PADDING)) as usize;
@@ -88,13 +101,152 @@ where
 /// The size of the exponent in bytes.
 ///
 /// It's the actual bytes size it needs in memory, not it's theoretical bit size.
-fn exp_size<F: PrimeField>() -> usize {
-    std::mem::size_of::<F::Repr>()
+fn exp_size<F: ark_ff::PrimeField>() -> usize {
+    std::mem::size_of::<F::BigInt>()
+}
+
+/// Returns the bit length of the largest scalar in `scalars`.
+///
+/// Every scalar is read as a 32-byte little-endian integer; its bit length is the 1-based index
+/// of its most significant set bit. The result is clamped to at least 1, so an empty slice or a
+/// slice of all-zero scalars yields 1.
+fn compute_max_scalar_bits(scalars: &[[u8; 32]]) -> usize {
+    // Bit length of one scalar: locate the most significant non-zero byte, then discount the
+    // leading zero bits inside that byte. An all-zero scalar has bit length 0.
+    let bit_len = |scalar: &[u8; 32]| -> usize {
+        scalar
+            .iter()
+            .rposition(|&byte| byte != 0)
+            .map_or(0, |top| (top + 1) * 8 - scalar[top].leading_zeros() as usize)
+    };
+
+    let widest = scalars.iter().map(bit_len).max().unwrap_or(0);
+    // Never report zero bits: the caller divides this value into windows.
+    widest.max(1)
+}
+
+/// GPU-compatible representation of an affine point.
+/// Coordinates are stored as 32-byte little-endian field elements in Montgomery form.
+#[repr(C)]
+#[derive(Copy, Clone, Debug, Default)]
+pub struct G1AffineM {
+    /// X coordinate as 32 bytes in little-endian Montgomery form
+    pub x: [u8; 32],
+    /// Y coordinate as 32 bytes in little-endian Montgomery form
+    pub y: [u8; 32],
+}
+
+/// Returns the raw Montgomery-form limbs of `f` as 32 little-endian bytes (least significant
+/// limb first). `Fq` is `Fp(BigInt<4>, _)`, so `f.0 .0` is the limb array; no `unsafe` needed.
+#[cfg(feature = "arkworks")]
+fn fq_to_montgomery_bytes(f: &ark_bn254::Fq) -> [u8; 32] {
+    let limbs: [u64; 4] = f.0 .0;
+    let mut out = [0u8; 32];
+    for (chunk, limb) in out.chunks_exact_mut(8).zip(limbs) {
+        chunk.copy_from_slice(&limb.to_le_bytes());
+    }
+    out
+}
+
+#[cfg(feature = "arkworks")]
+impl From<ark_bn254::G1Affine> for G1AffineM {
+    fn from(p: ark_bn254::G1Affine) -> Self {
+        use ark_ec::AffineRepr;
+
+        if p.is_zero() {
+            return Self::default();
+        }
+
+        Self {
+            x: fq_to_montgomery_bytes(&p.x),
+            y: fq_to_montgomery_bytes(&p.y),
+        }
+    }
+}
+
+#[cfg(feature = "arkworks")]
+impl From<&ark_bn254::G1Affine> for G1AffineM {
+    fn from(p: &ark_bn254::G1Affine) -> Self {
+        (*p).into()
+    }
+}
+
+/// GPU-compatible representation of a G2 affine point.
+/// Coordinates are stored as 64-byte little-endian Fq2 elements (each Fq2 = two 32-byte Fq elements).
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct G2AffineM {
+    /// X coordinate as Fq2 (64 bytes: c0 followed by c1)
+    pub x: [u8; 64],
+    /// Y coordinate as Fq2 (64 bytes: c0 followed by c1)
+    pub y: [u8; 64],
+}
+
+impl Default for G2AffineM {
+    fn default() -> Self {
+        Self {
+            x: [0u8; 64],
+            y: [0u8; 64],
+        }
+    }
+}
+
+#[cfg(feature = "arkworks")]
+fn fq2_to_montgomery_bytes(f: &ark_bn254::Fq2) -> [u8; 64] {
+    let mut out = [0u8; 64];
+    out[..32].copy_from_slice(&fq_to_montgomery_bytes(&f.c0));
+    out[32..].copy_from_slice(&fq_to_montgomery_bytes(&f.c1));
+    out
+}
+
+#[cfg(feature = "arkworks")]
+impl From<ark_bn254::G2Affine> for G2AffineM {
+    fn from(p: ark_bn254::G2Affine) -> Self {
+        use ark_ec::AffineRepr;
+
+        if p.is_zero() {
+            return Self::default();
+        }
+
+        Self {
+            x: fq2_to_montgomery_bytes(&p.x),
+            y: fq2_to_montgomery_bytes(&p.y),
+        }
+    }
+}
+
+#[cfg(feature = "arkworks")]
+impl From<&ark_bn254::G2Affine> for G2AffineM {
+    fn from(p: &ark_bn254::G2Affine) -> Self {
+        (*p).into()
+    }
+}
+
+#[cfg(feature = "arkworks")]
+impl GpuAffine for ec_gpu::arkworks_bn254::G1Affine {
+    type GpuRepr = G1AffineM;
+    type ScalarField = ark_bn254::Fr;
+    type Group = ark_bn254::G1Projective;
+
+    fn to_gpu(&self) -> G1AffineM {
+        self.0.into()
+    }
+}
+
+#[cfg(feature = "arkworks")]
+impl GpuAffine for ec_gpu::arkworks_bn254::G2Affine {
+    type GpuRepr = G2AffineM;
+    type ScalarField = ark_bn254::Fr;
+    type Group = ark_bn254::G2Projective;
+
+    fn to_gpu(&self) -> G2AffineM {
+        self.0.into()
+    }
 }
 
 impl<'a, G> SingleMultiexpKernel<'a, G>
 where
-    G: PrimeCurveAffine + GpuName,
+    G: GpuAffine,
 {
     /// Create a new Multiexp kernel instance for a device.
     ///
@@ -105,6 +257,7 @@ where
         device: &Device,
         maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
     ) -> EcResult<Self> {
+        let _span = debug_span!("single_multiexp_kernel_create").entered();
         let mem = device.memory();
         let compute_units = device.compute_units();
         let compute_capability = device.compute_capability();
@@ -127,19 +280,38 @@ where
     /// running on.
     pub fn multiexp(
         &self,
-        bases: &[G],
-        exponents: &[<G::Scalar as PrimeField>::Repr],
-    ) -> EcResult<G::Curve> {
+        bases: &[G::GpuRepr],
+        exponents: &[<G::ScalarField as ark_ff::PrimeField>::BigInt],
+    ) -> EcResult<G::Group> {
+        let _span = debug_span!("single_multiexp", n = bases.len()).entered();
         assert_eq!(bases.len(), exponents.len());
 
+        let exponents: Vec<_> = {
+            let _span = debug_span!("convert_exponents").entered();
+            exponents
+                .iter()
+                .map(|b| {
+                    let mut out = [0u8; 32];
+                    let le = b.to_bytes_le();
+                    out[..le.len()].copy_from_slice(&le);
+
+                    out
+                })
+                .collect()
+        };
+
         if let Some(maybe_abort) = &self.maybe_abort {
             if maybe_abort() {
                 return Err(EcError::Aborted);
             }
         }
+
+        // Compute actual bit length needed for small scalar optimization
+        let effective_bits = compute_max_scalar_bits(&exponents);
+
         let window_size = self.calc_window_size(bases.len());
-        // windows_size * num_windows needs to be >= 256 in order for the kernel to work correctly.
-        let num_windows = div_ceil(256, window_size);
+        // windows_size * num_windows needs to be >= effective_bits to cover all scalar bits.
+        let num_windows = div_ceil(effective_bits, window_size);
         let num_groups = self.work_units / num_windows;
         let bucket_len = 1 << window_size;
 
@@ -147,36 +319,55 @@ where
         // be `num_groups` * `num_windows` threads in total.
         // Each thread will use `num_groups` * `num_windows` * `bucket_len` buckets.
 
-        let closures = program_closures!(|program, _arg| -> EcResult<Vec<G::Curve>> {
-            let base_buffer = program.create_buffer_from_slice(bases)?;
-            let exp_buffer = program.create_buffer_from_slice(exponents)?;
-
-            // It is safe as the GPU will initialize that buffer
-            let bucket_buffer =
-                unsafe { program.create_buffer::<G::Curve>(self.work_units * bucket_len)? };
-            // It is safe as the GPU will initialize that buffer
-            let result_buffer = unsafe { program.create_buffer::<G::Curve>(self.work_units)? };
+        let closures = program_closures!(|program, _arg| -> EcResult<Vec<G::Group>> {
+            let base_buffer = {
+                let _span = debug_span!("upload_bases").entered();
+                program.create_buffer_from_slice(bases)?
+            };
+            let exp_buffer = {
+                let _span = debug_span!("upload_exponents").entered();
+                program.create_buffer_from_slice(&exponents)?
+            };
+
+            let (bucket_buffer, result_buffer) = {
+                let _span = debug_span!("allocate_gpu_buffers").entered();
+                // It is safe as the GPU will initialize that buffer
+                let bucket_buffer =
+                    unsafe { program.create_buffer::<G::Group>(self.work_units * bucket_len)? };
+                // It is safe as the GPU will initialize that buffer
+                let result_buffer = unsafe { program.create_buffer::<G::Group>(self.work_units)? };
+                (bucket_buffer, result_buffer)
+            };
 
             // The global work size follows CUDA's definition and is the number of
             // `LOCAL_WORK_SIZE` sized thread groups.
             let global_work_size = div_ceil(num_windows * num_groups, LOCAL_WORK_SIZE);
 
-            let kernel_name = format!("{}_multiexp", G::name());
-            let kernel = program.create_kernel(&kernel_name, global_work_size, LOCAL_WORK_SIZE)?;
-
-            kernel
-                .arg(&base_buffer)
-                .arg(&bucket_buffer)
-                .arg(&result_buffer)
-                .arg(&exp_buffer)
-                .arg(&(bases.len() as u32))
-                .arg(&(num_groups as u32))
-                .arg(&(num_windows as u32))
-                .arg(&(window_size as u32))
-                .run()?;
+            let kernel = {
+                let _span = debug_span!("create_kernel").entered();
+                let kernel_name = format!("{}_multiexp", G::name());
+                program.create_kernel(&kernel_name, global_work_size, LOCAL_WORK_SIZE)?
+            };
+
+            {
+                let _span = debug_span!("kernel_run").entered();
+                kernel
+                    .arg(&base_buffer)
+                    .arg(&bucket_buffer)
+                    .arg(&result_buffer)
+                    .arg(&exp_buffer)
+                    .arg(&(bases.len() as u32))
+                    .arg(&(num_groups as u32))
+                    .arg(&(num_windows as u32))
+                    .arg(&(window_size as u32))
+                    .run()?;
+            }
 
-            let mut results = vec![G::Curve::identity(); self.work_units];
-            program.read_into_buffer(&result_buffer, &mut results)?;
+            let mut results = vec![<G::Group as AdditiveGroup>::ZERO; self.work_units];
+            {
+                let _span = debug_span!("download_results").entered();
+                program.read_into_buffer(&result_buffer, &mut results)?;
+            }
 
             Ok(results)
         });
@@ -185,19 +376,23 @@ where
 
         // Using the algorithm below, we can calculate the final result by accumulating the results
         // of those `NUM_GROUPS` * `NUM_WINDOWS` threads.
-        let mut acc = G::Curve::identity();
-        let mut bits = 0;
-        let exp_bits = exp_size::<G::Scalar>() * 8;
-        for i in 0..num_windows {
-            let w = std::cmp::min(window_size, exp_bits - bits);
-            for _ in 0..w {
-                acc = acc.double();
-            }
-            for g in 0..num_groups {
-                acc.add_assign(&results[g * num_windows + i]);
+        // Since we use LSB-first bit extraction, window 0 contains the LSB and window (num_windows-1)
+        // contains the MSB. We process windows in reverse order (MSB first) using Horner's method.
+        let acc = {
+            let _span = debug_span!("cpu_accumulation").entered();
+            let mut acc = <G::Group as AdditiveGroup>::ZERO;
+            for i in (0..num_windows).rev() {
+                // Window i covers bits [i * window_size, min((i+1) * window_size, effective_bits))
+                let w = std::cmp::min(window_size, effective_bits - i * window_size);
+                for _ in 0..w {
+                    acc = acc.double();
+                }
+                for g in 0..num_groups {
+                    acc.add_assign(&results[g * num_windows + i]);
+                }
             }
-            bits += w; // Process the next window
-        }
+            acc
+        };
 
         Ok(acc)
     }
@@ -219,14 +414,14 @@ where
 /// A struct that contains several multiexp kernels for different devices.
 pub struct MultiexpKernel<'a, G>
 where
-    G: PrimeCurveAffine,
+    G: GpuAffine,
 {
     kernels: Vec<SingleMultiexpKernel<'a, G>>,
 }
 
 impl<'a, G> MultiexpKernel<'a, G>
 where
-    G: PrimeCurveAffine + GpuName,
+    G: GpuAffine,
 {
     /// Create new kernels, one for each given device.
     pub fn create(programs: Vec<Program>, devices: &[&Device]) -> EcResult<Self> {
@@ -250,6 +445,7 @@ where
         devices: &[&Device],
         maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
     ) -> EcResult<Self> {
+        let _span = debug_span!("multiexp_kernel_create").entered();
         let kernels: Vec<_> = programs
             .into_iter()
             .zip(devices.iter())
@@ -288,9 +484,9 @@ where
     pub fn parallel_multiexp<'s>(
         &'s mut self,
         scope: &Scope<'s>,
-        bases: &'s [G],
-        exps: &'s [<G::Scalar as PrimeField>::Repr],
-        results: &'s mut [G::Curve],
+        bases: &'s [G::GpuRepr],
+        exps: &'s [<G::ScalarField as ark_ff::PrimeField>::BigInt],
+        results: &'s mut [G::Group],
         error: Arc<RwLock<EcResult<()>>>,
     ) {
         let num_devices = self.kernels.len();
@@ -308,8 +504,11 @@ where
             .zip(results.iter_mut())
         {
             let error = error.clone();
+            // The tracing span is opened inside the closure, i.e. on the worker thread itself.
+
             scope.execute(move || {
-                let mut acc = G::Curve::identity();
+                let _span = debug_span!("gpu_device_multiexp", n = exps.len()).entered();
+                let mut acc = <G::Group as AdditiveGroup>::ZERO;
                 for (bases, exps) in bases.chunks(kern.n).zip(exps.chunks(kern.n)) {
                     if error.read().unwrap().is_err() {
                         break;
@@ -335,10 +534,11 @@ where
     pub fn multiexp(
         &mut self,
         pool: &Worker,
-        bases_arc: Arc<Vec<G>>,
-        exps: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
+        bases_arc: Arc<Vec<G::GpuRepr>>,
+        exps: Arc<Vec<<G::ScalarField as ark_ff::PrimeField>::BigInt>>,
         skip: usize,
-    ) -> EcResult<G::Curve> {
+    ) -> EcResult<G::Group> {
+        let _span = debug_span!("multiexp", n = exps.len()).entered();
         // Bases are skipped by `self.1` elements, when converted from (Arc<Vec<G>>, usize) to Source
         // https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L38
         let bases = &bases_arc[skip..(skip + exps.len())];
@@ -348,7 +548,7 @@ where
         let error = Arc::new(RwLock::new(Ok(())));
 
         pool.scoped(|s| {
-            results = vec![G::Curve::identity(); self.kernels.len()];
+            results = vec![<G::Group as AdditiveGroup>::ZERO; self.kernels.len()];
             self.parallel_multiexp(s, bases, exps, &mut results, error.clone());
         });
 
@@ -357,7 +557,7 @@ where
             .into_inner()
             .unwrap()?;
 
-        let mut acc = G::Curve::identity();
+        let mut acc = <G::Group as AdditiveGroup>::ZERO;
         for r in results {
             acc.add_assign(&r);
         }
diff --git a/ec-gpu-gen/src/multiexp_cpu.rs b/ec-gpu-gen/src/multiexp_cpu.rs
deleted file mode 100644
index 5703af4..0000000
--- a/ec-gpu-gen/src/multiexp_cpu.rs
+++ /dev/null
@@ -1,571 +0,0 @@
-#![allow(missing_docs)]
-use std::convert::TryInto;
-use std::io;
-use std::iter;
-use std::ops::AddAssign;
-use std::sync::Arc;
-
-use bitvec::prelude::{BitVec, Lsb0};
-use ff::{Field, PrimeField};
-use group::{prime::PrimeCurveAffine, Group};
-use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
-
-use crate::error::EcError;
-use crate::threadpool::{Waiter, Worker};
-
-/// An object that builds a source of bases.
-pub trait SourceBuilder<G: PrimeCurveAffine>: Send + Sync + 'static + Clone {
-    type Source: Source<G>;
-
-    #[allow(clippy::wrong_self_convention)]
-    fn new(self) -> Self::Source;
-    fn get(self) -> (Arc<Vec<G>>, usize);
-}
-
-/// A source of bases, like an iterator.
-pub trait Source<G: PrimeCurveAffine> {
-    /// Parses the element from the source. Fails if the point is at infinity.
-    fn add_assign_mixed(&mut self, to: &mut <G as PrimeCurveAffine>::Curve) -> Result<(), EcError>;
-
-    /// Skips `amt` elements from the source, avoiding deserialization.
-    fn skip(&mut self, amt: usize) -> Result<(), EcError>;
-}
-
-impl<G: PrimeCurveAffine> SourceBuilder<G> for (Arc<Vec<G>>, usize) {
-    type Source = (Arc<Vec<G>>, usize);
-
-    fn new(self) -> (Arc<Vec<G>>, usize) {
-        (self.0.clone(), self.1)
-    }
-
-    fn get(self) -> (Arc<Vec<G>>, usize) {
-        (self.0.clone(), self.1)
-    }
-}
-
-impl<G: PrimeCurveAffine> Source<G> for (Arc<Vec<G>>, usize) {
-    fn add_assign_mixed(&mut self, to: &mut <G as PrimeCurveAffine>::Curve) -> Result<(), EcError> {
-        if self.0.len() <= self.1 {
-            return Err(io::Error::new(
-                io::ErrorKind::UnexpectedEof,
-                "Expected more bases from source.",
-            )
-            .into());
-        }
-
-        if self.0[self.1].is_identity().into() {
-            return Err(EcError::Simple(
-                "Encountered an identity element in the CRS.",
-            ));
-        }
-
-        to.add_assign(&self.0[self.1]);
-
-        self.1 += 1;
-
-        Ok(())
-    }
-
-    fn skip(&mut self, amt: usize) -> Result<(), EcError> {
-        if self.0.len() <= self.1 {
-            return Err(io::Error::new(
-                io::ErrorKind::UnexpectedEof,
-                "Expected more bases from source.",
-            )
-            .into());
-        }
-
-        self.1 += amt;
-
-        Ok(())
-    }
-}
-
-pub trait QueryDensity: Sized {
-    /// Returns whether the base exists.
-    type Iter: Iterator<Item = bool>;
-
-    fn iter(self) -> Self::Iter;
-    fn get_query_size(self) -> Option<usize>;
-    fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::Repr>>) -> Arc<Vec<F::Repr>>;
-}
-
-#[derive(Clone)]
-pub struct FullDensity;
-
-impl AsRef<FullDensity> for FullDensity {
-    fn as_ref(&self) -> &FullDensity {
-        self
-    }
-}
-
-impl QueryDensity for &FullDensity {
-    type Iter = iter::Repeat<bool>;
-
-    fn iter(self) -> Self::Iter {
-        iter::repeat(true)
-    }
-
-    fn get_query_size(self) -> Option<usize> {
-        None
-    }
-
-    fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::Repr>>) -> Arc<Vec<F::Repr>> {
-        exponents
-    }
-}
-
-#[derive(Clone, PartialEq, Eq, Debug, Default)]
-pub struct DensityTracker {
-    pub bv: BitVec,
-    pub total_density: usize,
-}
-
-impl<'a> QueryDensity for &'a DensityTracker {
-    type Iter = bitvec::slice::BitValIter<'a, usize, Lsb0>;
-
-    fn iter(self) -> Self::Iter {
-        self.bv.iter().by_vals()
-    }
-
-    fn get_query_size(self) -> Option<usize> {
-        Some(self.bv.len())
-    }
-
-    fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::Repr>>) -> Arc<Vec<F::Repr>> {
-        let exps: Vec<_> = exponents
-            .iter()
-            .zip(self.bv.iter())
-            .filter_map(|(&e, d)| if *d { Some(e) } else { None })
-            .collect();
-
-        Arc::new(exps)
-    }
-}
-
-impl DensityTracker {
-    pub fn new() -> DensityTracker {
-        DensityTracker {
-            bv: BitVec::new(),
-            total_density: 0,
-        }
-    }
-
-    pub fn add_element(&mut self) {
-        self.bv.push(false);
-    }
-
-    pub fn inc(&mut self, idx: usize) {
-        if !self.bv.get(idx).unwrap() {
-            self.bv.set(idx, true);
-            self.total_density += 1;
-        }
-    }
-
-    pub fn get_total_density(&self) -> usize {
-        self.total_density
-    }
-
-    /// Extend by concatenating `other`. If `is_input_density` is true, then we are tracking an input density,
-    /// and other may contain a redundant input for the `One` element. Coalesce those as needed and track the result.
-    pub fn extend(&mut self, other: &Self, is_input_density: bool) {
-        if other.bv.is_empty() {
-            // Nothing to do if other is empty.
-            return;
-        }
-
-        if self.bv.is_empty() {
-            // If self is empty, assume other's density.
-            self.total_density = other.total_density;
-            self.bv.resize(other.bv.len(), false);
-            self.bv.copy_from_bitslice(&*other.bv);
-            return;
-        }
-
-        if is_input_density {
-            // Input densities need special handling to coalesce their first inputs.
-
-            if other.bv[0] {
-                // If other's first bit is set,
-                if self.bv[0] {
-                    // And own first bit is set, then decrement total density so the final sum doesn't overcount.
-                    self.total_density -= 1;
-                } else {
-                    // Otherwise, set own first bit.
-                    self.bv.set(0, true);
-                }
-            }
-            // Now discard other's first bit, having accounted for it above, and extend self by remaining bits.
-            self.bv.extend(other.bv.iter().skip(1));
-        } else {
-            // Not an input density, just extend straightforwardly.
-            self.bv.extend(other.bv.iter());
-        }
-
-        // Since any needed adjustments to total densities have been made, just sum the totals and keep the sum.
-        self.total_density += other.total_density;
-    }
-}
-
-// Right shift the repr of a field element by `n` bits.
-fn shr(le_bytes: &mut [u8], mut n: u32) {
-    if n >= 8 * le_bytes.len() as u32 {
-        le_bytes.iter_mut().for_each(|byte| *byte = 0);
-        return;
-    }
-
-    // Shift each full byte towards the least significant end.
-    while n >= 8 {
-        let mut replacement = 0;
-        for byte in le_bytes.iter_mut().rev() {
-            std::mem::swap(&mut replacement, byte);
-        }
-        n -= 8;
-    }
-
-    // Starting at the most significant byte, shift the byte's `n` least significant bits into the
-    // `n` most significant bits of the next byte.
-    if n > 0 {
-        let mut shift_in = 0;
-        for byte in le_bytes.iter_mut().rev() {
-            // Copy the byte's `n` least significant bits.
-            let shift_out = *byte << (8 - n);
-            // Shift the byte by `n` bits; zeroing its `n` most significant bits.
-            *byte >>= n;
-            // Replace the `n` most significant bits with the bits shifted out of the previous byte.
-            *byte |= shift_in;
-            shift_in = shift_out;
-        }
-    }
-}
-
-fn multiexp_inner<Q, D, G, S>(
-    bases: S,
-    density_map: D,
-    exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
-    c: u32,
-) -> Result<<G as PrimeCurveAffine>::Curve, EcError>
-where
-    for<'a> &'a Q: QueryDensity,
-    D: Send + Sync + 'static + Clone + AsRef<Q>,
-    G: PrimeCurveAffine,
-    S: SourceBuilder<G>,
-{
-    // Perform this region of the multiexp
-    let this = move |bases: S,
-                     density_map: D,
-                     exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
-                     skip: u32|
-          -> Result<_, EcError> {
-        // Accumulate the result
-        let mut acc = G::Curve::identity();
-
-        // Build a source for the bases
-        let mut bases = bases.new();
-
-        // Create space for the buckets
-        let mut buckets = vec![<G as PrimeCurveAffine>::Curve::identity(); (1 << c) - 1];
-
-        let zero = G::Scalar::ZERO.to_repr();
-        let one = G::Scalar::ONE.to_repr();
-
-        // only the first round uses this
-        let handle_trivial = skip == 0;
-
-        // Sort the bases into buckets
-        for (&exp, density) in exponents.iter().zip(density_map.as_ref().iter()) {
-            if density {
-                if exp.as_ref() == zero.as_ref() {
-                    bases.skip(1)?;
-                } else if exp.as_ref() == one.as_ref() {
-                    if handle_trivial {
-                        bases.add_assign_mixed(&mut acc)?;
-                    } else {
-                        bases.skip(1)?;
-                    }
-                } else {
-                    let mut exp = exp;
-                    shr(exp.as_mut(), skip);
-                    let exp = u64::from_le_bytes(exp.as_ref()[..8].try_into().unwrap()) % (1 << c);
-
-                    if exp != 0 {
-                        bases.add_assign_mixed(&mut buckets[(exp - 1) as usize])?;
-                    } else {
-                        bases.skip(1)?;
-                    }
-                }
-            }
-        }
-
-        // Summation by parts
-        // e.g. 3a + 2b + 1c = a +
-        //                    (a) + b +
-        //                    ((a) + b) + c
-        let mut running_sum = G::Curve::identity();
-        for exp in buckets.into_iter().rev() {
-            running_sum.add_assign(&exp);
-            acc.add_assign(&running_sum);
-        }
-
-        Ok(acc)
-    };
-
-    let parts = (0..<G::Scalar as PrimeField>::NUM_BITS)
-        .into_par_iter()
-        .step_by(c as usize)
-        .map(|skip| this(bases.clone(), density_map.clone(), exponents.clone(), skip))
-        .collect::<Vec<Result<_, _>>>();
-
-    parts.into_iter().rev().try_fold(
-        <G as PrimeCurveAffine>::Curve::identity(),
-        |mut acc, part| {
-            for _ in 0..c {
-                acc = acc.double();
-            }
-
-            acc.add_assign(&part?);
-            Ok(acc)
-        },
-    )
-}
-
-/// Perform multi-exponentiation. The caller is responsible for ensuring the
-/// query size is the same as the number of exponents.
-pub fn multiexp_cpu<'b, Q, D, G, S>(
-    pool: &Worker,
-    bases: S,
-    density_map: D,
-    exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
-) -> Waiter<Result<<G as PrimeCurveAffine>::Curve, EcError>>
-where
-    for<'a> &'a Q: QueryDensity,
-    D: Send + Sync + 'static + Clone + AsRef<Q>,
-    G: PrimeCurveAffine,
-    S: SourceBuilder<G>,
-{
-    let c = if exponents.len() < 32 {
-        3u32
-    } else {
-        (f64::from(exponents.len() as u32)).ln().ceil() as u32
-    };
-
-    if let Some(query_size) = density_map.as_ref().get_query_size() {
-        // If the density map has a known query size, it should not be
-        // inconsistent with the number of exponents.
-        assert!(query_size == exponents.len());
-    }
-
-    pool.compute(move || multiexp_inner(bases, density_map, exponents, c))
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use blstrs::Bls12;
-    use group::Curve;
-    use pairing::Engine;
-    use rand::Rng;
-    use rand_core::SeedableRng;
-    use rand_xorshift::XorShiftRng;
-
-    #[test]
-    fn test_with_bls12() {
-        fn naive_multiexp<G: PrimeCurveAffine>(
-            bases: Arc<Vec<G>>,
-            exponents: &[G::Scalar],
-        ) -> G::Curve {
-            assert_eq!(bases.len(), exponents.len());
-
-            let mut acc = G::Curve::identity();
-
-            for (base, exp) in bases.iter().zip(exponents.iter()) {
-                acc.add_assign(&base.mul(*exp));
-            }
-
-            acc
-        }
-
-        const SAMPLES: usize = 1 << 14;
-
-        let rng = &mut rand::thread_rng();
-        let v: Vec<<Bls12 as Engine>::Fr> = (0..SAMPLES)
-            .map(|_| <Bls12 as Engine>::Fr::random(&mut *rng))
-            .collect();
-        let g = Arc::new(
-            (0..SAMPLES)
-                .map(|_| <Bls12 as Engine>::G1::random(&mut *rng).to_affine())
-                .collect::<Vec<_>>(),
-        );
-
-        let now = std::time::Instant::now();
-        let naive = naive_multiexp(g.clone(), &v);
-        println!("Naive: {}", now.elapsed().as_millis());
-
-        let now = std::time::Instant::now();
-        let pool = Worker::new();
-
-        let v = Arc::new(v.into_iter().map(|fr| fr.to_repr()).collect());
-        let fast = multiexp_cpu(&pool, (g, 0), FullDensity, v).wait().unwrap();
-
-        println!("Fast: {}", now.elapsed().as_millis());
-
-        assert_eq!(naive, fast);
-    }
-
-    #[test]
-    fn test_extend_density_regular() {
-        let mut rng = XorShiftRng::from_seed([
-            0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06,
-            0xbc, 0xe5,
-        ]);
-
-        for k in &[2, 4, 8] {
-            for j in &[10, 20, 50] {
-                let count: usize = k * j;
-
-                let mut tracker_full = DensityTracker::new();
-                let mut partial_trackers: Vec<DensityTracker> = Vec::with_capacity(count / k);
-                for i in 0..count {
-                    if i % k == 0 {
-                        partial_trackers.push(DensityTracker::new());
-                    }
-
-                    let index: usize = i / k;
-                    if rng.gen() {
-                        tracker_full.add_element();
-                        partial_trackers[index].add_element();
-                    }
-
-                    if !partial_trackers[index].bv.is_empty() {
-                        let idx = rng.gen_range(0..partial_trackers[index].bv.len());
-                        let offset: usize = partial_trackers
-                            .iter()
-                            .take(index)
-                            .map(|t| t.bv.len())
-                            .sum();
-                        tracker_full.inc(offset + idx);
-                        partial_trackers[index].inc(idx);
-                    }
-                }
-
-                let mut tracker_combined = DensityTracker::new();
-                for tracker in partial_trackers.into_iter() {
-                    tracker_combined.extend(&tracker, false);
-                }
-                assert_eq!(tracker_combined, tracker_full);
-            }
-        }
-    }
-
-    #[test]
-    fn test_extend_density_input() {
-        let mut rng = XorShiftRng::from_seed([
-            0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06,
-            0xbc, 0xe5,
-        ]);
-        let trials = 10;
-        let max_bits = 10;
-        let max_density = max_bits;
-
-        // Create an empty DensityTracker.
-        let empty = DensityTracker::new;
-
-        // Create a random DensityTracker with first bit unset.
-        let unset = |rng: &mut XorShiftRng| {
-            let mut dt = DensityTracker::new();
-            dt.add_element();
-            let n = rng.gen_range(1..max_bits);
-            let target_density = rng.gen_range(0..max_density);
-            for _ in 1..n {
-                dt.add_element();
-            }
-
-            for _ in 0..target_density {
-                if n > 1 {
-                    let to_inc = rng.gen_range(1..n);
-                    dt.inc(to_inc);
-                }
-            }
-            assert!(!dt.bv[0]);
-            assert_eq!(n, dt.bv.len());
-            dbg!(&target_density, &dt.total_density);
-
-            dt
-        };
-
-        // Create a random DensityTracker with first bit set.
-        let set = |rng: &mut XorShiftRng| {
-            let mut dt = unset(rng);
-            dt.inc(0);
-            dt
-        };
-
-        for _ in 0..trials {
-            {
-                // Both empty.
-                let (mut e1, e2) = (empty(), empty());
-                e1.extend(&e2, true);
-                assert_eq!(empty(), e1);
-            }
-            {
-                // First empty, second unset.
-                let (mut e1, u1) = (empty(), unset(&mut rng));
-                e1.extend(&u1.clone(), true);
-                assert_eq!(u1, e1);
-            }
-            {
-                // First empty, second set.
-                let (mut e1, s1) = (empty(), set(&mut rng));
-                e1.extend(&s1.clone(), true);
-                assert_eq!(s1, e1);
-            }
-            {
-                // First set, second empty.
-                let (mut s1, e1) = (set(&mut rng), empty());
-                let s2 = s1.clone();
-                s1.extend(&e1, true);
-                assert_eq!(s1, s2);
-            }
-            {
-                // First unset, second empty.
-                let (mut u1, e1) = (unset(&mut rng), empty());
-                let u2 = u1.clone();
-                u1.extend(&e1, true);
-                assert_eq!(u1, u2);
-            }
-            {
-                // First unset, second unset.
-                let (mut u1, u2) = (unset(&mut rng), unset(&mut rng));
-                let expected_total = u1.total_density + u2.total_density;
-                u1.extend(&u2, true);
-                assert_eq!(expected_total, u1.total_density);
-                assert!(!u1.bv[0]);
-            }
-            {
-                // First unset, second set.
-                let (mut u1, s1) = (unset(&mut rng), set(&mut rng));
-                let expected_total = u1.total_density + s1.total_density;
-                u1.extend(&s1, true);
-                assert_eq!(expected_total, u1.total_density);
-                assert!(u1.bv[0]);
-            }
-            {
-                // First set, second unset.
-                let (mut s1, u1) = (set(&mut rng), unset(&mut rng));
-                let expected_total = s1.total_density + u1.total_density;
-                s1.extend(&u1, true);
-                assert_eq!(expected_total, s1.total_density);
-                assert!(s1.bv[0]);
-            }
-            {
-                // First set, second set.
-                let (mut s1, s2) = (set(&mut rng), set(&mut rng));
-                let expected_total = s1.total_density + s2.total_density - 1;
-                s1.extend(&s2, true);
-                assert_eq!(expected_total, s1.total_density);
-                assert!(s1.bv[0]);
-            }
-        }
-    }
-}
diff --git a/ec-gpu-gen/src/source.rs b/ec-gpu-gen/src/source.rs
index 49fda87..fde587f 100644
--- a/ec-gpu-gen/src/source.rs
+++ b/ec-gpu-gen/src/source.rs
@@ -9,7 +9,6 @@ use std::path::PathBuf;
 use std::{env, fs};
 
 use ec_gpu::{GpuField, GpuName};
-use group::prime::PrimeCurveAffine;
 
 static COMMON_SRC: &str = include_str!("cl/common.cl");
 static FIELD_SRC: &str = include_str!("cl/field.cl");
@@ -194,15 +193,12 @@ impl<P: GpuName, F: GpuName, Exp: GpuName> NameAndSource for Multiexp<P, F, Exp>
 ///
 /// # Example
 ///
-/// ```
-/// use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar};
+/// ```ignore
+/// use ark_bn254::Fr;
 /// use ec_gpu_gen::SourceBuilder;
 ///
-/// # #[cfg(any(feature = "cuda", feature = "opencl"))]
 /// let source = SourceBuilder::new()
-///     .add_fft::<Scalar>()
-///     .add_multiexp::<G1Affine, Fp>()
-///     .add_multiexp::<G2Affine, Fp2>()
+///     .add_fft::<Fr>()
 ///     .build_32_bit_limbs();
 ///```
 // In the `HashSet`s the concrete types cannot be used, as each item of the set should be able to
@@ -264,18 +260,18 @@ impl SourceBuilder {
         config
     }
 
-    /// Add an Multiexp kernel function to the configuration.
+    /// Add a Multiexp kernel function to the configuration.
     ///
-    /// The field must be given explicitly as currently it cannot derived from the curve point
-    /// directly.
-    pub fn add_multiexp<C, F>(self) -> Self
+    /// The field and scalar types must be given explicitly as they currently cannot be derived
+    /// from the curve point directly.
+    pub fn add_multiexp<C, F, S>(self) -> Self
     where
-        C: PrimeCurveAffine + GpuName,
-        C::Scalar: GpuField,
+        C: GpuName + 'static,
+        S: GpuField + 'static,
         F: GpuField + 'static,
     {
-        let mut config = self.add_field::<F>().add_field::<C::Scalar>();
-        let multiexp = Multiexp::<C, F, C::Scalar>::new();
+        let mut config = self.add_field::<F>().add_field::<S>();
+        let multiexp = Multiexp::<C, F, S>::new();
         config.multiexps.insert(Box::new(multiexp));
         config
     }
@@ -711,31 +707,58 @@ mod tests {
     use rust_gpu_tools::opencl;
     use rust_gpu_tools::{program_closures, Device, GPUError, Program};
 
-    use blstrs::Scalar;
-    use ff::{Field as _, PrimeField};
+    use ark_ff::AdditiveGroup;
+    use ark_std::Zero;
+
     use lazy_static::lazy_static;
     use rand::{thread_rng, Rng};
 
     static TEST_SRC: &str = include_str!("./cl/test.cl");
 
+    // #[derive(PartialEq, Debug, Clone, Copy)]
+    // #[repr(transparent)]
+    // pub struct GpuScalar(pub Scalar);
+    // impl Default for GpuScalar {
+    //     fn default() -> Self {
+    //         Self(Scalar::ZERO)
+    //     }
+    // }
+
+    // #[cfg(feature = "cuda")]
+    // impl cuda::KernelArgument for GpuScalar {
+    //     fn as_c_void(&self) -> *mut std::ffi::c_void {
+    //         &self.0 as *const _ as _
+    //     }
+    // }
+
+    // #[cfg(feature = "opencl")]
+    // impl opencl::KernelArgument for GpuScalar {
+    //     fn push(&self, kernel: &mut opencl::Kernel) {
+    //         unsafe { kernel.builder.set_arg(&self.0) };
+    //     }
+    // }
+
+    #[cfg(feature = "arkworks")]
     #[derive(PartialEq, Debug, Clone, Copy)]
     #[repr(transparent)]
-    pub struct GpuScalar(pub Scalar);
-    impl Default for GpuScalar {
+    pub struct GpuScalarBN254(pub ark_bn254::Fr);
+
+    #[cfg(feature = "arkworks")]
+    impl Default for GpuScalarBN254 {
         fn default() -> Self {
-            Self(Scalar::ZERO)
+            Self(ark_bn254::Fr::zero())
         }
     }
 
-    #[cfg(feature = "cuda")]
-    impl cuda::KernelArgument for GpuScalar {
+    #[cfg(all(feature = "arkworks", feature = "cuda"))]
+    impl cuda::KernelArgument for GpuScalarBN254 {
         fn as_c_void(&self) -> *mut std::ffi::c_void {
             &self.0 as *const _ as _
         }
     }
 
-    #[cfg(feature = "opencl")]
-    impl opencl::KernelArgument for GpuScalar {
+    #[cfg(all(feature = "arkworks", feature = "opencl"))]
+    impl opencl::KernelArgument for GpuScalarBN254 {
         fn push(&self, kernel: &mut opencl::Kernel) {
             unsafe { kernel.builder.set_arg(&self.0) };
         }
@@ -750,19 +773,60 @@ mod tests {
         }
     }
 
-    fn test_source() -> SourceBuilder {
-        let test_source = String::from(TEST_SRC).replace("FIELD", &Scalar::name());
+    // Temporarily commented out to avoid version conflicts
+    // fn test_source() -> SourceBuilder {
+    //     let test_source = String::from(TEST_SRC).replace("FIELD", &Scalar::name());
+    //     SourceBuilder::new()
+    //         .add_field::<Scalar>()
+    //         .append_source(test_source)
+    // }
+
+    #[cfg(feature = "arkworks")]
+    fn test_source_ark_bn254() -> SourceBuilder {
+        let test_source = String::from(TEST_SRC).replace("FIELD", &ark_bn254::Fr::name());
         SourceBuilder::new()
-            .add_field::<Scalar>()
+            .add_field::<ark_bn254::Fr>()
             .append_source(test_source)
     }
-
-    #[cfg(feature = "cuda")]
+    //
+    //     #[cfg(feature = "cuda")]
+    //     lazy_static! {
+    //         static ref CUDA_PROGRAM: Mutex<Program> = {
+    //             use std::ffi::CString;
+    //
+    //             let source = test_source();
+    //             let fatbin_path = generate_cuda(&source);
+    //
+    //             let device = *Device::all().first().expect("Cannot get a default device.");
+    //             let cuda_device = device.cuda_device().unwrap();
+    //             let fatbin_path_cstring =
+    //                 CString::new(fatbin_path.to_str().expect("path is not valid UTF-8."))
+    //                     .expect("path contains NULL byte.");
+    //             let program =
+    //                 cuda::Program::from_binary(cuda_device, fatbin_path_cstring.as_c_str()).unwrap();
+    //             Mutex::new(Program::Cuda(program))
+    //         };
+    //     }
+    //
+    //     #[cfg(feature = "opencl")]
+    //     lazy_static! {
+    //         static ref OPENCL_PROGRAM: Mutex<(Program, Program)> = {
+    //             let device = *Device::all().first().expect("Cannot get a default device");
+    //             let opencl_device = device.opencl_device().unwrap();
+    //             let source_32 = test_source().build_32_bit_limbs();
+    //             let program_32 = opencl::Program::from_opencl(opencl_device, &source_32).unwrap();
+    //             let source_64 = test_source().build_64_bit_limbs();
+    //             let program_64 = opencl::Program::from_opencl(opencl_device, &source_64).unwrap();
+    //             Mutex::new((Program::Opencl(program_32), Program::Opencl(program_64)))
+    //         };
+    //     }
+
+    #[cfg(all(feature = "arkworks", feature = "cuda"))]
     lazy_static! {
-        static ref CUDA_PROGRAM: Mutex<Program> = {
+        static ref CUDA_PROGRAM_ARK_BN254: Mutex<Program> = {
             use std::ffi::CString;
 
-            let source = test_source();
+            let source = test_source_ark_bn254();
             let fatbin_path = generate_cuda(&source);
 
             let device = *Device::all().first().expect("Cannot get a default device.");
@@ -776,22 +840,80 @@ mod tests {
         };
     }
 
-    #[cfg(feature = "opencl")]
+    #[cfg(all(feature = "arkworks", feature = "opencl"))]
     lazy_static! {
-        static ref OPENCL_PROGRAM: Mutex<(Program, Program)> = {
+        static ref OPENCL_PROGRAM_ARK_BN254: Mutex<(Program, Program)> = {
             let device = *Device::all().first().expect("Cannot get a default device");
             let opencl_device = device.opencl_device().unwrap();
-            let source_32 = test_source().build_32_bit_limbs();
+            let source_32 = test_source_ark_bn254().build_32_bit_limbs();
             let program_32 = opencl::Program::from_opencl(opencl_device, &source_32).unwrap();
-            let source_64 = test_source().build_64_bit_limbs();
+            let source_64 = test_source_ark_bn254().build_64_bit_limbs();
             let program_64 = opencl::Program::from_opencl(opencl_device, &source_64).unwrap();
             Mutex::new((Program::Opencl(program_32), Program::Opencl(program_64)))
         };
     }
 
-    fn call_kernel(name: &str, scalars: &[GpuScalar], uints: &[u32]) -> Scalar {
-        let closures = program_closures!(|program, _args| -> Result<Scalar, NoError> {
-            let mut cpu_buffer = vec![GpuScalar::default()];
+    //     fn call_kernel(name: &str, scalars: &[GpuScalar], uints: &[u32]) -> Scalar {
+    //         let closures = program_closures!(|program, _args| -> Result<Scalar, NoError> {
+    //             let mut cpu_buffer = vec![GpuScalar::default()];
+    //             let buffer = program.create_buffer_from_slice(&cpu_buffer).unwrap();
+    //
+    //             let mut kernel = program.create_kernel(name, 1, 64).unwrap();
+    //             for scalar in scalars {
+    //                 kernel = kernel.arg(scalar);
+    //             }
+    //             for uint in uints {
+    //                 kernel = kernel.arg(uint);
+    //             }
+    //             kernel.arg(&buffer).run().unwrap();
+    //
+    //             program.read_into_buffer(&buffer, &mut cpu_buffer).unwrap();
+    //             Ok(cpu_buffer[0].0)
+    //         });
+    //
+    //         // For CUDA we only test 32-bit limbs.
+    //         #[cfg(all(feature = "cuda", not(feature = "opencl")))]
+    //         return CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
+    //
+    //         // For OpenCL we test for 32 and 64-bit limbs.
+    //         #[cfg(all(feature = "opencl", not(feature = "cuda")))]
+    //         {
+    //             let result_32 = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
+    //             let result_64 = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
+    //             assert_eq!(
+    //                 result_32, result_64,
+    //                 "Results for 32-bit and 64-bit limbs must be the same."
+    //             );
+    //             result_32
+    //         }
+    //
+    //         // When both features are enabled, check if the results are the same
+    //         #[cfg(all(feature = "cuda", feature = "opencl"))]
+    //         {
+    //             let cuda_result = CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
+    //             let opencl_32_result = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
+    //             let opencl_64_result = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
+    //             assert_eq!(
+    //                 opencl_32_result, opencl_64_result,
+    //                 "Results for 32-bit and 64-bit limbs on OpenCL must be the same."
+    //             );
+    //             assert_eq!(
+    //                 cuda_result, opencl_32_result,
+    //                 "Results for CUDA and OpenCL must be the same."
+    //             );
+    //             cuda_result
+    //         }
+    //     }
+    //
+
+    #[cfg(feature = "arkworks")]
+    fn call_kernel_ark_bn254(
+        name: &str,
+        scalars: &[GpuScalarBN254],
+        uints: &[u32],
+    ) -> ark_bn254::Fr {
+        let closures = program_closures!(|program, _args| -> Result<ark_bn254::Fr, NoError> {
+            let mut cpu_buffer = vec![GpuScalarBN254::default()];
             let buffer = program.create_buffer_from_slice(&cpu_buffer).unwrap();
 
             let mut kernel = program.create_kernel(name, 1, 64).unwrap();
@@ -809,13 +931,27 @@ mod tests {
 
         // For CUDA we only test 32-bit limbs.
         #[cfg(all(feature = "cuda", not(feature = "opencl")))]
-        return CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
+        return CUDA_PROGRAM_ARK_BN254
+            .lock()
+            .unwrap()
+            .run(closures, ())
+            .unwrap();
 
-        // For OpenCL we test for 32 and 64-bi limbs.
+        // For OpenCL we test for 32 and 64-bit limbs.
         #[cfg(all(feature = "opencl", not(feature = "cuda")))]
         {
-            let result_32 = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
-            let result_64 = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
+            let result_32 = OPENCL_PROGRAM_ARK_BN254
+                .lock()
+                .unwrap()
+                .0
+                .run(closures, ())
+                .unwrap();
+            let result_64 = OPENCL_PROGRAM_ARK_BN254
+                .lock()
+                .unwrap()
+                .1
+                .run(closures, ())
+                .unwrap();
             assert_eq!(
                 result_32, result_64,
                 "Results for 32-bit and 64-bit limbs must be the same."
@@ -826,9 +962,23 @@ mod tests {
         // When both features are enabled, check if the results are the same
         #[cfg(all(feature = "cuda", feature = "opencl"))]
         {
-            let cuda_result = CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap();
-            let opencl_32_result = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap();
-            let opencl_64_result = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap();
+            let cuda_result = CUDA_PROGRAM_ARK_BN254
+                .lock()
+                .unwrap()
+                .run(closures, ())
+                .unwrap();
+            let opencl_32_result = OPENCL_PROGRAM_ARK_BN254
+                .lock()
+                .unwrap()
+                .0
+                .run(closures, ())
+                .unwrap();
+            let opencl_64_result = OPENCL_PROGRAM_ARK_BN254
+                .lock()
+                .unwrap()
+                .1
+                .run(closures, ())
+                .unwrap();
             assert_eq!(
                 opencl_32_result, opencl_64_result,
                 "Results for 32-bit and 64-bit limbs on OpenCL must be the same."
@@ -841,101 +991,145 @@ mod tests {
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_add() {
+    fn test_ark_bn254_add() {
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a = Scalar::random(&mut rng);
-            let b = Scalar::random(&mut rng);
+            let a = ark_bn254::Fr::rand(&mut rng);
+            let b = ark_bn254::Fr::rand(&mut rng);
             let c = a + b;
 
             assert_eq!(
-                call_kernel("test_add", &[GpuScalar(a), GpuScalar(b)], &[]),
+                call_kernel_ark_bn254("test_add", &[GpuScalarBN254(a), GpuScalarBN254(b)], &[]),
                 c
             );
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_sub() {
+    fn test_ark_bn254_sub() {
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a = Scalar::random(&mut rng);
-            let b = Scalar::random(&mut rng);
+            let a = ark_bn254::Fr::rand(&mut rng);
+            let b = ark_bn254::Fr::rand(&mut rng);
             let c = a - b;
             assert_eq!(
-                call_kernel("test_sub", &[GpuScalar(a), GpuScalar(b)], &[]),
+                call_kernel_ark_bn254("test_sub", &[GpuScalarBN254(a), GpuScalarBN254(b)], &[]),
                 c
             );
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_mul() {
+    fn test_ark_bn254_mul() {
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a = Scalar::random(&mut rng);
-            let b = Scalar::random(&mut rng);
+            let a = ark_bn254::Fr::rand(&mut rng);
+            let b = ark_bn254::Fr::rand(&mut rng);
             let c = a * b;
 
             assert_eq!(
-                call_kernel("test_mul", &[GpuScalar(a), GpuScalar(b)], &[]),
+                call_kernel_ark_bn254("test_mul", &[GpuScalarBN254(a), GpuScalarBN254(b)], &[]),
                 c
             );
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_pow() {
+    fn test_ark_bn254_pow() {
+        use ark_ff::Field;
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a = Scalar::random(&mut rng);
+            let a = ark_bn254::Fr::rand(&mut rng);
             let b = rng.gen::<u32>();
-            let c = a.pow_vartime([b as u64]);
-            assert_eq!(call_kernel("test_pow", &[GpuScalar(a)], &[b]), c);
+            let c = a.pow([(b as u64)]);
+            assert_eq!(
+                call_kernel_ark_bn254("test_pow", &[GpuScalarBN254(a)], &[b]),
+                c
+            );
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_sqr() {
+    fn test_ark_bn254_sqr() {
+        use ark_ff::Field;
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a = Scalar::random(&mut rng);
+            let a = ark_bn254::Fr::rand(&mut rng);
             let b = a.square();
 
-            assert_eq!(call_kernel("test_sqr", &[GpuScalar(a)], &[]), b);
+            assert_eq!(
+                call_kernel_ark_bn254("test_sqr", &[GpuScalarBN254(a)], &[]),
+                b
+            );
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_double() {
+    fn test_ark_bn254_double() {
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a = Scalar::random(&mut rng);
+            let a = ark_bn254::Fr::rand(&mut rng);
             let b = a.double();
 
-            assert_eq!(call_kernel("test_double", &[GpuScalar(a)], &[]), b);
+            assert_eq!(
+                call_kernel_ark_bn254("test_double", &[GpuScalarBN254(a)], &[]),
+                b
+            );
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_unmont() {
+    fn test_ark_bn254_unmont() {
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a = Scalar::random(&mut rng);
-            let b: Scalar = unsafe { std::mem::transmute(a.to_repr()) };
-            assert_eq!(call_kernel("test_unmont", &[GpuScalar(a)], &[]), b);
+            let a = ark_bn254::Fr::rand(&mut rng);
+            let b: ark_bn254::Fr = unsafe {
+                use ark_ff::{BigInteger, PrimeField};
+                let b: [u8; 32] = a.into_bigint().to_bytes_le().try_into().unwrap();
+                std::mem::transmute(b)
+            };
+            assert_eq!(
+                call_kernel_ark_bn254("test_unmont", &[GpuScalarBN254(a)], &[]),
+                b
+            );
         }
     }
 
+    #[cfg(feature = "arkworks")]
     #[test]
-    fn test_mont() {
+    fn test_ark_bn254_mont() {
+        use ark_std::UniformRand;
         let mut rng = thread_rng();
         for _ in 0..10 {
-            let a_repr = Scalar::random(&mut rng).to_repr();
-            let a: Scalar = unsafe { std::mem::transmute(a_repr) };
-            let b = Scalar::from_repr(a_repr).unwrap();
-            assert_eq!(call_kernel("test_mont", &[GpuScalar(a)], &[]), b);
+            use ark_ff::{BigInteger, PrimeField};
+
+            let a_repr: [u8; 32] = ark_bn254::Fr::rand(&mut rng)
+                .into_bigint()
+                .to_bytes_le()
+                .try_into()
+                .unwrap();
+            let a: ark_bn254::Fr = unsafe { std::mem::transmute(a_repr) };
+            let b = ark_bn254::Fr::from_le_bytes_mod_order(&a_repr);
+
+            assert_eq!(
+                call_kernel_ark_bn254("test_mont", &[GpuScalarBN254(a)], &[]),
+                b
+            );
         }
     }
 }
diff --git a/ec-gpu/Cargo.toml b/ec-gpu/Cargo.toml
index a4184ff..c1f61e4 100644
--- a/ec-gpu/Cargo.toml
+++ b/ec-gpu/Cargo.toml
@@ -9,3 +9,17 @@ repository = "https://github.com/filecoin-project/ff-cl-gen"
 license = "MIT/Apache-2.0"
 
 [dependencies]
+ark-bn254 = { version = "0.5.0", optional = true }
+ark-ec = { version = "0.5.0", optional = true }
+ark-ff = { version = "0.5.0", optional = true }
+ark-serialize = { version = "0.5.0", optional = true }
+ark-std = { version = "0.5.0", optional = true }
+
+[features]
+arkworks = [
+	"dep:ark-bn254",
+	"dep:ark-ec",
+	"dep:ark-ff",
+	"dep:ark-serialize",
+	"dep:ark-std",
+]
diff --git a/ec-gpu/src/arkworks_bn254.rs b/ec-gpu/src/arkworks_bn254.rs
new file mode 100644
index 0000000..66b3d79
--- /dev/null
+++ b/ec-gpu/src/arkworks_bn254.rs
@@ -0,0 +1,161 @@
+use std::ops::{Deref, DerefMut};
+
+use ark_bn254::{Fq, Fq2, FqConfig, Fr, FrConfig};
+use ark_ec::short_weierstrass::Affine;
+use ark_ff::{BigInteger, MontConfig};
+
+use crate::{GpuField, GpuName};
+
+/// Packs little-endian bytes into little-endian `u32` limbs, zero-padding the final limb.
+fn bytes_le_to_u32_limbs(mut bytes: Vec<u8>) -> Vec<u32> {
+    // `next_multiple_of` (unlike `is_multiple_of`) exists on Rust 1.83, the `ec-gpu-gen` MSRV.
+    bytes.resize(bytes.len().next_multiple_of(4), 0);
+    bytes
+        .chunks_exact(4)
+        .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]]))
+        .collect()
+}
+
+fn bigint_to_u32_limbs_le<B: BigInteger>(b: B) -> Vec<u32> {
+    bytes_le_to_u32_limbs(b.to_bytes_le())
+}
+
+impl GpuName for Fq {
+    fn name() -> String {
+        crate::name!()
+    }
+}
+
+impl GpuField for Fq {
+    fn one() -> Vec<u32> {
+        bigint_to_u32_limbs_le(FqConfig::R)
+    }
+
+    fn r2() -> Vec<u32> {
+        bigint_to_u32_limbs_le(FqConfig::R2)
+    }
+
+    fn modulus() -> Vec<u32> {
+        bigint_to_u32_limbs_le(FqConfig::MODULUS)
+    }
+}
+
+impl GpuName for Fq2 {
+    fn name() -> String {
+        crate::name!()
+    }
+}
+
+impl GpuField for Fq2 {
+    fn one() -> Vec<u32> {
+        let n = bigint_to_u32_limbs_le(FqConfig::MODULUS).len();
+        let mut out = vec![0u32; 2 * n];
+        out[..n].copy_from_slice(&bigint_to_u32_limbs_le(FqConfig::R));
+        out
+    }
+
+    fn r2() -> Vec<u32> {
+        let n = bigint_to_u32_limbs_le(FqConfig::MODULUS).len();
+        let mut out = vec![0u32; 2 * n];
+        out[..n].copy_from_slice(&bigint_to_u32_limbs_le(FqConfig::R2));
+        out
+    }
+
+    fn modulus() -> Vec<u32> {
+        bigint_to_u32_limbs_le(FqConfig::MODULUS)
+    }
+
+    fn sub_field_name() -> Option<String> {
+        Some(Fq::name())
+    }
+}
+
+impl GpuName for Fr {
+    fn name() -> String {
+        crate::name!()
+    }
+}
+
+impl GpuField for Fr {
+    fn one() -> Vec<u32> {
+        bigint_to_u32_limbs_le(FrConfig::R)
+    }
+
+    fn r2() -> Vec<u32> {
+        bigint_to_u32_limbs_le(FrConfig::R2)
+    }
+
+    fn modulus() -> Vec<u32> {
+        bigint_to_u32_limbs_le(FrConfig::MODULUS)
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+#[repr(transparent)]
+pub struct G1Affine(pub Affine<ark_bn254::g1::Config>);
+
+impl Deref for G1Affine {
+    type Target = Affine<ark_bn254::g1::Config>;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl DerefMut for G1Affine {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl From<Affine<ark_bn254::g1::Config>> for G1Affine {
+    fn from(p: Affine<ark_bn254::g1::Config>) -> Self {
+        Self(p)
+    }
+}
+
+impl From<G1Affine> for Affine<ark_bn254::g1::Config> {
+    fn from(p: G1Affine) -> Self {
+        p.0
+    }
+}
+
+impl GpuName for G1Affine {
+    fn name() -> String {
+        crate::name!()
+    }
+}
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)]
+#[repr(transparent)]
+pub struct G2Affine(pub Affine<ark_bn254::g2::Config>);
+
+impl Deref for G2Affine {
+    type Target = Affine<ark_bn254::g2::Config>;
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl DerefMut for G2Affine {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl From<Affine<ark_bn254::g2::Config>> for G2Affine {
+    fn from(p: Affine<ark_bn254::g2::Config>) -> Self {
+        Self(p)
+    }
+}
+
+impl From<G2Affine> for Affine<ark_bn254::g2::Config> {
+    fn from(p: G2Affine) -> Self {
+        p.0
+    }
+}
+
+impl GpuName for G2Affine {
+    fn name() -> String {
+        crate::name!()
+    }
+}
diff --git a/ec-gpu/src/arkworks_macros.rs b/ec-gpu/src/arkworks_macros.rs
new file mode 100644
index 0000000..ac27c43
--- /dev/null
+++ b/ec-gpu/src/arkworks_macros.rs
@@ -0,0 +1,128 @@
+/// Helper function to convert arkworks BigInteger to u32 limbs in little-endian order.
+pub fn bigint_to_u32_limbs<B: ark_ff::BigInteger>(b: B) -> Vec<u32> {
+    let bytes = b.to_bytes_le();
+    bytes_to_u32_limbs(bytes)
+}
+
+/// Helper function to convert bytes to u32 limbs in little-endian order.
+///
+/// A trailing partial limb is zero-padded; empty input yields an empty vector.
+pub fn bytes_to_u32_limbs(mut bytes: Vec<u8>) -> Vec<u32> {
+    // Pad to a multiple of 4 bytes; `next_multiple_of` exists on Rust 1.83 (`ec-gpu-gen` MSRV).
+    bytes.resize(bytes.len().next_multiple_of(4), 0);
+    bytes
+        .chunks_exact(4)
+        .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]]))
+        .collect()
+}
+
+/// Implement `GpuName` and `GpuField` for an arkworks prime field.
+///
+/// # Example
+///
+/// ```ignore
+/// use ec_gpu::impl_gpu_field_arkworks;
+/// use ark_bls12_381::{Fr, FrConfig};
+///
+/// impl_gpu_field_arkworks!(Fr, FrConfig);
+/// ```
+#[macro_export]
+macro_rules! impl_gpu_field_arkworks {
+    ($field:ty, $config:ty) => {
+        impl $crate::GpuName for $field {
+            fn name() -> String {
+                $crate::name!()
+            }
+        }
+
+        impl $crate::GpuField for $field {
+            fn one() -> Vec<u32> {
+                use ark_ff::MontConfig;
+                $crate::arkworks_macros::bigint_to_u32_limbs(<$config>::R)
+            }
+
+            fn r2() -> Vec<u32> {
+                use ark_ff::MontConfig;
+                $crate::arkworks_macros::bigint_to_u32_limbs(<$config>::R2)
+            }
+
+            fn modulus() -> Vec<u32> {
+                use ark_ff::MontConfig;
+                $crate::arkworks_macros::bigint_to_u32_limbs(<$config>::MODULUS)
+            }
+        }
+    };
+}
+
+/// Implement `GpuName` for an arkworks curve affine point type.
+///
+/// # Example
+///
+/// ```ignore
+/// use ec_gpu::impl_gpu_name_arkworks_curve;
+/// use ark_bls12_381::G1Affine;
+///
+/// impl_gpu_name_arkworks_curve!(G1Affine);
+/// ```
+#[macro_export]
+macro_rules! impl_gpu_name_arkworks_curve {
+    ($curve:ty) => {
+        impl $crate::GpuName for $curve {
+            fn name() -> String {
+                $crate::name!()
+            }
+        }
+    };
+}
+
+/// Implement `GpuName` and `GpuField` for an arkworks quadratic extension field (like Fq2).
+///
+/// # Example
+///
+/// ```ignore
+/// use ec_gpu::impl_gpu_field_arkworks_ext2;
+/// use ark_bls12_381::{Fq, Fq2, FqConfig};
+///
+/// impl_gpu_field_arkworks_ext2!(Fq2, FqConfig, Fq);
+/// ```
+#[macro_export]
+macro_rules! impl_gpu_field_arkworks_ext2 {
+    ($field2:ty, $base_config:ty, $base_field:ty) => {
+        impl $crate::GpuName for $field2 {
+            fn name() -> String {
+                $crate::name!()
+            }
+        }
+
+        impl $crate::GpuField for $field2 {
+            fn one() -> Vec<u32> {
+                use ark_ff::MontConfig;
+                let n = $crate::arkworks_macros::bigint_to_u32_limbs(<$base_config>::MODULUS).len();
+                let mut out = vec![0u32; 2 * n];
+                out[..n].copy_from_slice(&$crate::arkworks_macros::bigint_to_u32_limbs(
+                    <$base_config>::R,
+                ));
+                out
+            }
+
+            fn r2() -> Vec<u32> {
+                use ark_ff::MontConfig;
+                let n = $crate::arkworks_macros::bigint_to_u32_limbs(<$base_config>::MODULUS).len();
+                let mut out = vec![0u32; 2 * n];
+                out[..n].copy_from_slice(&$crate::arkworks_macros::bigint_to_u32_limbs(
+                    <$base_config>::R2,
+                ));
+                out
+            }
+
+            fn modulus() -> Vec<u32> {
+                use ark_ff::MontConfig;
+                $crate::arkworks_macros::bigint_to_u32_limbs(<$base_config>::MODULUS)
+            }
+
+            fn sub_field_name() -> Option<String> {
+                Some(<$base_field as $crate::GpuName>::name())
+            }
+        }
+    };
+}
diff --git a/ec-gpu/src/lib.rs b/ec-gpu/src/lib.rs
index b7c7b69..1cac7b2 100644
--- a/ec-gpu/src/lib.rs
+++ b/ec-gpu/src/lib.rs
@@ -41,6 +41,12 @@ pub trait GpuField: GpuName {
     }
 }
 
+#[cfg(feature = "arkworks")]
+pub mod arkworks_bn254;
+
+#[cfg(feature = "arkworks")]
+pub mod arkworks_macros;
+
 /// Macro to get a unique name of an item.
 ///
 /// The name is a string that consists of the module path and the type name. All non-alphanumeric
diff --git a/gpu-tests/Cargo.toml b/gpu-tests/Cargo.toml
index bf3e92c..0df3418 100644
--- a/gpu-tests/Cargo.toml
+++ b/gpu-tests/Cargo.toml
@@ -1,6 +1,3 @@
-# NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just temporarily until
-# https://github.com/zkcrypto/group/pull/29 is fixed. Then we won't need the exports of `Fp` and
-# `Fp2` any more.
 [package]
 name = "gpu-tests"
 version = "0.1.0"
@@ -12,26 +9,35 @@ license = "MIT/Apache-2.0"
 publish = false
 
 [dev-dependencies]
-blstrs = { version = "0.7.0", features = ["__private_bench"] }
 criterion = "0.4"
-ec-gpu = "0.2"
-ec-gpu-gen = { path = "../ec-gpu-gen", default-features = false }
-ff = { version = "0.13.0", default-features = false }
+divan = "0.1"
+ec-gpu = { workspace = true, features = ["arkworks"] }
+ec-gpu-gen = { path = "../ec-gpu-gen", default-features = false, features = ["arkworks"] }
 fil_logger = "0.1.6"
-group = "0.13.0"
-pairing = "0.23.0"
 rand = "0.8"
 rayon = "1.5.3"
+ark-ff = "0.5.0"
+ark-ec = "0.5.0"
+ark-bn254 = "0.5.0"
+ark-std = "0.5.0"
+tracing = "0.1"
+tracing-subscriber = "0.3"
+tracing-profile.workspace = true
 
 [build-dependencies]
-blstrs = { version = "0.7.0", features = ["__private_bench"] }
-ec-gpu-gen = { path = "../ec-gpu-gen" }
+ark-bn254 = { version = "0.5.0" }
+ec-gpu = { path = "../ec-gpu", features = ["arkworks"] }
+ec-gpu-gen = { path = "../ec-gpu-gen", features = ["arkworks"] }
 
 [features]
 default = ["cuda", "opencl"]
-cuda = ["blstrs/gpu", "ec-gpu-gen/cuda"]
-opencl = ["blstrs/gpu", "ec-gpu-gen/opencl"]
+cuda = ["ec-gpu-gen/cuda"]
+opencl = ["ec-gpu-gen/opencl"]
 
 [[bench]]
 name = "multiexp"
 harness = false
+
+[[bench]]
+name = "small_scalars"
+harness = false
diff --git a/gpu-tests/benches/multiexp.rs b/gpu-tests/benches/multiexp.rs
index 011c367..797a3de 100644
--- a/gpu-tests/benches/multiexp.rs
+++ b/gpu-tests/benches/multiexp.rs
@@ -1,54 +1,95 @@
 use std::sync::Arc;
 
-use blstrs::Bls12;
+use ark_bn254::{Fr, G1Projective};
+use ark_ec::CurveGroup;
+use ark_ff::{PrimeField, UniformRand};
 use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use ec_gpu::arkworks_bn254::G1Affine;
 use ec_gpu_gen::{
-    multiexp::MultiexpKernel, multiexp_cpu::SourceBuilder, rust_gpu_tools::Device,
+    multiexp::{GpuAffine, MultiexpKernel},
+    rust_gpu_tools::Device,
     threadpool::Worker,
 };
-use ff::{Field, PrimeField};
-use group::{Curve, Group};
-use pairing::Engine;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
 
 /// The power that will be used to define the maximum number of elements. The number of elements
 /// is `2^MAX_ELEMENTS_POWER`.
-const MAX_ELEMENTS_POWER: usize = 29;
+const MAX_ELEMENTS_POWER: usize = 20;
 /// The maximum number of elements for this benchmark.
 const MAX_ELEMENTS: usize = 1 << MAX_ELEMENTS_POWER;
 
+pub trait QueryDensity: Sized {
+    /// Iterator of `bool` flags; presumably one per base, `true` when that base exists.
+    type Iter: Iterator<Item = bool>;
+
+    fn iter(self) -> Self::Iter;
+    fn get_query_size(self) -> Option<usize>;
+    fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::BigInt>>) -> Arc<Vec<F::BigInt>>;
+}
+
+#[derive(Clone)]
+pub struct FullDensity;
+
+impl AsRef<FullDensity> for FullDensity {
+    fn as_ref(&self) -> &FullDensity {
+        self
+    }
+}
+
+impl QueryDensity for &FullDensity {
+    type Iter = std::iter::Repeat<bool>;
+
+    fn iter(self) -> Self::Iter {
+        std::iter::repeat(true)
+    }
+
+    fn get_query_size(self) -> Option<usize> {
+        None
+    }
+
+    fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::BigInt>>) -> Arc<Vec<F::BigInt>> {
+        exponents
+    }
+}
+
 fn bench_multiexp(crit: &mut Criterion) {
     let mut group = crit.benchmark_group("multiexp");
-    // The difference between runs is so little, hence a low sample size is OK.
     group.sample_size(10);
 
     let devices = Device::all();
+
     let programs = devices
         .iter()
         .map(|device| ec_gpu_gen::program!(device))
         .collect::<Result<_, _>>()
         .expect("Cannot create programs!");
-    let mut kern = MultiexpKernel::<<Bls12 as Engine>::G1Affine>::create(programs, &devices)
-        .expect("Cannot initialize kernel!");
+    let mut kern =
+        MultiexpKernel::<G1Affine>::create(programs, &devices).expect("Cannot initialize kernel!");
     let pool = Worker::new();
-    let max_bases: Vec<_> = (0..MAX_ELEMENTS)
+
+    let max_bases: Vec<G1Affine> = (0..MAX_ELEMENTS)
         .into_par_iter()
-        .map(|_| <Bls12 as Engine>::G1::random(rand::thread_rng()).to_affine())
+        .map(|_| G1Affine(G1Projective::rand(&mut rand::thread_rng()).into_affine()))
         .collect();
     let max_exponents: Vec<_> = (0..MAX_ELEMENTS)
         .into_par_iter()
-        .map(|_| <Bls12 as Engine>::Fr::random(rand::thread_rng()).to_repr())
+        .map(|_| Fr::rand(&mut rand::thread_rng()))
         .collect();
 
     let num_elements: Vec<_> = (10..MAX_ELEMENTS_POWER).map(|shift| 1 << shift).collect();
     for num in num_elements {
-        group.bench_with_input(BenchmarkId::from_parameter(num), &num, |bencher, &num| {
-            let (bases, skip) = SourceBuilder::get((Arc::new(max_bases[0..num].to_vec()), 0));
-            let exponents = Arc::new(max_exponents[0..num].to_vec());
+        group.bench_with_input(BenchmarkId::from_parameter(num), &num, |bencher, &num| {
+            // Only the first `num` inputs take part, so each benchmark ID measures its own size.
+            let bases_gpu: Vec<_> = max_bases[..num].iter().map(|p| p.to_gpu()).collect();
+            let bases_gpu = Arc::new(bases_gpu);
+            let exps_bigint: Arc<Vec<_>> =
+                Arc::new(max_exponents[..num].iter().map(|e| e.into_bigint()).collect());
+
+            let exps = FullDensity.as_ref().generate_exps::<Fr>(exps_bigint);
 
             bencher.iter(|| {
-                black_box(
-                    kern.multiexp(&pool, bases.clone(), exponents.clone(), skip)
+                let _ = black_box(
+                    kern.multiexp(&pool, bases_gpu.clone(), exps.clone(), 0)
                         .unwrap(),
                 );
             })
diff --git a/gpu-tests/benches/small_scalars.rs b/gpu-tests/benches/small_scalars.rs
new file mode 100644
index 0000000..0486846
--- /dev/null
+++ b/gpu-tests/benches/small_scalars.rs
@@ -0,0 +1,138 @@
+//! Benchmark comparing MSM performance with full-size vs small scalars.
+//!
+//! This demonstrates the speedup from the small scalar optimization:
+//! when scalars only use 64 bits, we skip processing the upper 190+ zero bits.
+//!
+//! Run with: cargo bench --bench small_scalars
+
+/// Runs the divan benchmarks; without a GPU backend feature only a hint is printed.
+fn main() {
+    #[cfg(not(any(feature = "cuda", feature = "opencl")))]
+    println!("Benchmarks require cuda or opencl feature");
+    #[cfg(any(feature = "cuda", feature = "opencl"))]
+    divan::main();
+}
+
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+use std::sync::Arc;
+
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+use ark_bn254::{Fq, Fr, G1Affine, G1Projective};
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+use ark_ec::CurveGroup;
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+use ark_ff::{PrimeField, UniformRand};
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+use divan::{black_box, Bencher};
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+use ec_gpu_gen::{
+    multiexp::{G1AffineM, MultiexpKernel},
+    rust_gpu_tools::Device,
+    threadpool::Worker,
+};
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+use rand::Rng;
+
+/// Little-endian bytes of the `[u64; 4]` limbs stored inside `x`, limb by limb in storage order.
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+fn fq_to_32_le(x: &Fq) -> [u8; 32] {
+    let limbs: [u64; 4] = x.0 .0; // public fields: no layout-dependent `transmute` needed
+    let mut out = [0u8; 32];
+    for (chunk, limb) in out.chunks_exact_mut(8).zip(limbs) {
+        chunk.copy_from_slice(&limb.to_le_bytes());
+    }
+    out
+}
+
+/// Byte-encodes both coordinates of `p`; yields `None` whenever `xy()` has no coordinates.
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+fn g1_xy_bytes_le(p: &G1Affine) -> Option<([u8; 32], [u8; 32])> {
+    ark_ec::AffineRepr::xy(p).map(|(x, y)| (fq_to_32_le(&x), fq_to_32_le(&y)))
+}
+
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+const NUM_POINTS: usize = 1 << 16; // 65536 points
+
+/// Benchmarks a GPU multiexp over `NUM_POINTS` random bases with uniformly random `Fr` scalars.
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+#[divan::bench]
+fn msm_full_254bit_scalars(bencher: Bencher) {
+    // Build one program per device returned by `Device::all()`.
+    let devices = Device::all();
+    let programs = devices
+        .iter()
+        .map(|device| ec_gpu_gen::program!(device))
+        .collect::<Result<_, _>>()
+        .expect("Cannot create programs!");
+    let mut kern =
+        MultiexpKernel::<G1Affine>::create(programs, &devices).expect("Cannot initialize kernel!");
+    let pool = Worker::new();
+    let mut rng = rand::thread_rng();
+
+    // Sample each base and convert it to the GPU representation in a single pass.
+    let bases_gpu: Arc<Vec<G1AffineM>> = Arc::new(
+        (0..NUM_POINTS)
+            .map(|_| {
+                let point = G1Projective::rand(&mut rng).into_affine();
+                let (x, y) = g1_xy_bytes_le(&point).expect("point not at infinity");
+                G1AffineM { x, y }
+            })
+            .collect(),
+    );
+
+    // Full-width scalars: sampled from the whole field, then turned into big integers.
+    let full_exps: Arc<Vec<_>> = Arc::new(
+        (0..NUM_POINTS)
+            .map(|_| Fr::rand(&mut rng).into_bigint())
+            .collect(),
+    );
+
+    // Only the kernel call sits inside the measured closure; inputs are prepared once above.
+    bencher.bench_local(|| {
+        let sum = kern.multiexp(&pool, bases_gpu.clone(), full_exps.clone(), 0);
+        black_box(sum.unwrap())
+    });
+}
+
+/// Benchmarks a GPU multiexp over `NUM_POINTS` random bases whose scalars are random `u64`
+/// values lifted into `Fr`, i.e. at most 64 significant bits each.
+#[cfg(any(feature = "cuda", feature = "opencl"))]
+#[divan::bench]
+fn msm_small_64bit_scalars(bencher: Bencher) {
+    // Build one program per device returned by `Device::all()`.
+    let devices = Device::all();
+    let programs = devices
+        .iter()
+        .map(|device| ec_gpu_gen::program!(device))
+        .collect::<Result<_, _>>()
+        .expect("Cannot create programs!");
+    let mut kern =
+        MultiexpKernel::<G1Affine>::create(programs, &devices).expect("Cannot initialize kernel!");
+    let pool = Worker::new();
+
+    let mut rng = rand::thread_rng();
+
+    // Sample each base and convert it to the GPU representation in a single pass.
+    let bases_gpu: Arc<Vec<G1AffineM>> = Arc::new(
+        (0..NUM_POINTS)
+            .map(|_| {
+                let point = G1Projective::rand(&mut rng).into_affine();
+                let (x, y) = g1_xy_bytes_le(&point).expect("point not at infinity");
+                G1AffineM { x, y }
+            })
+            .collect(),
+    );
+
+    // Small scalars: a random `u64` per point, converted to `Fr` and then to a big integer.
+    let small_exps: Arc<Vec<_>> = Arc::new(
+        (0..NUM_POINTS)
+            .map(|_| Fr::from(rng.gen::<u64>()).into_bigint())
+            .collect(),
+    );
+
+    // Only the kernel call sits inside the measured closure; inputs are prepared once above.
+    bencher.bench_local(|| {
+        let sum = kern.multiexp(&pool, bases_gpu.clone(), small_exps.clone(), 0);
+        black_box(sum.unwrap())
+    });
+}
diff --git a/gpu-tests/build.rs b/gpu-tests/build.rs
index c1c5a5b..14d39cc 100644
--- a/gpu-tests/build.rs
+++ b/gpu-tests/build.rs
@@ -3,12 +3,13 @@ fn main() {}
 
 #[cfg(any(feature = "cuda", feature = "opencl"))]
 fn main() {
-    use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar};
+    use ark_bn254::{Fq, Fq2, Fr};
+    use ec_gpu::arkworks_bn254::{G1Affine, G2Affine};
     use ec_gpu_gen::SourceBuilder;
 
     let source_builder = SourceBuilder::new()
-        .add_fft::<Scalar>()
-        .add_multiexp::<G1Affine, Fp>()
-        .add_multiexp::<G2Affine, Fp2>();
+        .add_fft::<Fr>()
+        .add_multiexp::<G1Affine, Fq, Fr>()
+        .add_multiexp::<G2Affine, Fq2, Fr>();
     ec_gpu_gen::generate(&source_builder);
 }
diff --git a/gpu-tests/tests/fft.rs b/gpu-tests/tests/fft.rs
index aeda181..4b3641c 100644
--- a/gpu-tests/tests/fft.rs
+++ b/gpu-tests/tests/fft.rs
@@ -2,44 +2,74 @@
 
 use std::time::Instant;
 
-use blstrs::Scalar as Fr;
-use ec_gpu_gen::{
-    fft::FftKernel,
-    fft_cpu::{parallel_fft, serial_fft},
-    rust_gpu_tools::Device,
-    threadpool::Worker,
-};
-use ff::{Field, PrimeField};
-
-fn omega<F: PrimeField>(num_coeffs: usize) -> F {
-    // Compute omega, the 2^exp primitive root of unity
+use ark_bn254::Fr;
+use ark_ff::{FftField, UniformRand};
+use ec_gpu_gen::{fft::FftKernelArk, rust_gpu_tools::Device};
+
+fn omega<F: FftField>(num_coeffs: usize) -> F {
     let exp = (num_coeffs as f32).log2().floor() as u32;
-    let mut omega = F::ROOT_OF_UNITY;
-    for _ in exp..F::S {
+    let mut omega = F::TWO_ADIC_ROOT_OF_UNITY;
+    for _ in exp..F::TWO_ADICITY {
         omega = omega.square();
     }
     omega
 }
 
+/// In-place iterative radix-2 FFT over `a`; panics unless `a.len() == 1 << log_n`.
+fn serial_fft<F: FftField>(a: &mut [F], omega: &F, log_n: u32) {
+    let n = a.len();
+    assert_eq!(n, 1 << log_n);
+    // Bit-reversal permutation; the `k < rk` guard swaps each pair exactly once.
+    for k in 0..n {
+        let rk = bitreverse(k, log_n as usize);
+        if k < rk {
+            a.swap(rk, k);
+        }
+    }
+
+    // Butterfly passes: pass `p` works on blocks of `2 * half` elements with `half = 2^p`.
+    for pass in 0..log_n {
+        let half = 1usize << pass;
+        let twiddle_step = omega.pow([(n / (2 * half)) as u64]);
+        for block in a.chunks_mut(2 * half) {
+            let (lo, hi) = block.split_at_mut(half);
+            let mut w = F::ONE;
+            for (x, y) in lo.iter_mut().zip(hi.iter_mut()) {
+                let t = *y * w;
+                *y = *x - t;
+                *x += t;
+                w *= twiddle_step;
+            }
+        }
+    }
+}
+
+/// Returns the lowest `l` bits of `n` in reversed order; higher bits of `n` are dropped.
+fn bitreverse(n: usize, l: usize) -> usize {
+    // Each step moves the lowest remaining input bit onto the low end of the result.
+    let (reversed, _) = (0..l).fold((0, n), |(rev, rest), _| {
+        ((rev << 1) | (rest & 1), rest >> 1)
+    });
+    reversed
+}
+
 #[test]
 pub fn gpu_fft_consistency() {
     fil_logger::maybe_init();
     let mut rng = rand::thread_rng();
 
-    let worker = Worker::new();
-    let log_threads = worker.log_num_threads();
     let devices = Device::all();
     let programs = devices
         .iter()
         .map(|device| ec_gpu_gen::program!(device))
         .collect::<Result<_, _>>()
         .expect("Cannot create programs!");
-    let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
+    let mut kern = FftKernelArk::<Fr>::create(programs).expect("Cannot initialize kernel!");
 
     for log_d in 1..=20 {
         let d = 1 << log_d;
 
-        let mut v1_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
+        let mut v1_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::<Vec<_>>();
         let v1_omega = omega::<Fr>(v1_coeffs.len());
         let mut v2_coeffs = v1_coeffs.clone();
         let v2_omega = v1_omega;
@@ -53,17 +83,13 @@ pub fn gpu_fft_consistency() {
         println!("GPU took {}ms.", gpu_dur);
 
         now = Instant::now();
-        if log_d <= log_threads {
-            serial_fft::<Fr>(&mut v2_coeffs, &v2_omega, log_d);
-        } else {
-            parallel_fft::<Fr>(&mut v2_coeffs, &worker, &v2_omega, log_d, log_threads);
-        }
+        serial_fft::<Fr>(&mut v2_coeffs, &v2_omega, log_d);
         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
-        println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur);
+        println!("CPU took {}ms.", cpu_dur);
 
         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
 
-        assert!(v1_coeffs == v2_coeffs);
+        assert_eq!(v1_coeffs, v2_coeffs);
         println!("============================");
     }
 }
@@ -73,39 +99,32 @@ pub fn gpu_fft_many_consistency() {
     fil_logger::maybe_init();
     let mut rng = rand::thread_rng();
 
-    let worker = Worker::new();
-    let log_threads = worker.log_num_threads();
     let devices = Device::all();
     let programs = devices
         .iter()
         .map(|device| ec_gpu_gen::program!(device))
         .collect::<Result<_, _>>()
         .expect("Cannot create programs!");
-    let mut kern = FftKernel::<Fr>::create(programs).expect("Cannot initialize kernel!");
+    let mut kern = FftKernelArk::<Fr>::create(programs).expect("Cannot initialize kernel!");
 
     for log_d in 1..=20 {
         let d = 1 << log_d;
 
-        let mut v11_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
-        let mut v12_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
-        let mut v13_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::<Vec<_>>();
-        let v11_omega = omega::<Fr>(v11_coeffs.len());
-        let v12_omega = omega::<Fr>(v12_coeffs.len());
-        let v13_omega = omega::<Fr>(v13_coeffs.len());
+        let mut v11_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::<Vec<_>>();
+        let mut v12_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::<Vec<_>>();
+        let mut v13_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::<Vec<_>>();
+        let fft_omega = omega::<Fr>(d);
 
         let mut v21_coeffs = v11_coeffs.clone();
         let mut v22_coeffs = v12_coeffs.clone();
         let mut v23_coeffs = v13_coeffs.clone();
-        let v21_omega = v11_omega;
-        let v22_omega = v12_omega;
-        let v23_omega = v13_omega;
 
         println!("Testing FFT3 for {} elements...", d);
 
         let mut now = Instant::now();
         kern.radix_fft_many(
             &mut [&mut v11_coeffs, &mut v12_coeffs, &mut v13_coeffs],
-            &[v11_omega, v12_omega, v13_omega],
+            &[fft_omega, fft_omega, fft_omega],
             &[log_d, log_d, log_d],
         )
         .expect("GPU FFT failed!");
@@ -113,23 +132,17 @@ pub fn gpu_fft_many_consistency() {
         println!("GPU took {}ms.", gpu_dur);
 
         now = Instant::now();
-        if log_d <= log_threads {
-            serial_fft::<Fr>(&mut v21_coeffs, &v21_omega, log_d);
-            serial_fft::<Fr>(&mut v22_coeffs, &v22_omega, log_d);
-            serial_fft::<Fr>(&mut v23_coeffs, &v23_omega, log_d);
-        } else {
-            parallel_fft::<Fr>(&mut v21_coeffs, &worker, &v21_omega, log_d, log_threads);
-            parallel_fft::<Fr>(&mut v22_coeffs, &worker, &v22_omega, log_d, log_threads);
-            parallel_fft::<Fr>(&mut v23_coeffs, &worker, &v23_omega, log_d, log_threads);
-        }
+        serial_fft::<Fr>(&mut v21_coeffs, &fft_omega, log_d);
+        serial_fft::<Fr>(&mut v22_coeffs, &fft_omega, log_d);
+        serial_fft::<Fr>(&mut v23_coeffs, &fft_omega, log_d);
         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
-        println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur);
+        println!("CPU took {}ms.", cpu_dur);
 
         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
 
-        assert!(v11_coeffs == v21_coeffs);
-        assert!(v12_coeffs == v22_coeffs);
-        assert!(v13_coeffs == v23_coeffs);
+        assert_eq!(v11_coeffs, v21_coeffs);
+        assert_eq!(v12_coeffs, v22_coeffs);
+        assert_eq!(v13_coeffs, v23_coeffs);
 
         println!("============================");
     }
diff --git a/gpu-tests/tests/multiexp.rs b/gpu-tests/tests/multiexp.rs
index 06360d1..23375a3 100644
--- a/gpu-tests/tests/multiexp.rs
+++ b/gpu-tests/tests/multiexp.rs
@@ -3,86 +3,333 @@
 use std::sync::Arc;
 use std::time::Instant;
 
-use blstrs::Bls12;
-use ec_gpu::GpuName;
-use ec_gpu_gen::multiexp_cpu::{multiexp_cpu, FullDensity, QueryDensity, SourceBuilder};
+use ark_bn254::{Fr, G1Projective};
+use ark_ec::{CurveGroup, VariableBaseMSM};
+use ark_ff::{PrimeField, UniformRand};
+use ec_gpu::arkworks_bn254::{G1Affine, G2Affine};
+use ec_gpu_gen::multiexp::GpuAffine;
 use ec_gpu_gen::{
     multiexp::MultiexpKernel, program, rust_gpu_tools::Device, threadpool::Worker, EcError,
 };
-use ff::{Field, PrimeField};
-use group::Curve;
-use group::{prime::PrimeCurveAffine, Group};
-use pairing::Engine;
+use tracing::debug_span;
+use tracing_profile::{PrintTreeConfig, PrintTreeLayer};
+use tracing_subscriber::{filter::filter_fn, prelude::*};
 
-fn multiexp_gpu<Q, D, G, S>(
+pub trait QueryDensity: Sized {
+    type Iter: Iterator<Item = bool>;
+    // NOTE(review): of these methods, only `generate_exps` is called by `multiexp_gpu` in this file.
+    fn iter(self) -> Self::Iter;
+    fn get_query_size(self) -> Option<usize>;
+    fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::BigInt>>) -> Arc<Vec<F::BigInt>>;
+}
+
+#[derive(Clone)]
+pub struct FullDensity; // unit marker type; carries no data
+/// Identity conversion, so `FullDensity` satisfies the `D: AsRef<Q>` bound of `multiexp_gpu`.
+impl AsRef<FullDensity> for FullDensity {
+    fn as_ref(&self) -> &FullDensity {
+        self
+    }
+}
+/// Implemented on `&FullDensity` to match the `for<'a> &'a Q: QueryDensity` bound of `multiexp_gpu`.
+impl QueryDensity for &FullDensity {
+    type Iter = std::iter::Repeat<bool>;
+    /// Endless iterator of `true`.
+    fn iter(self) -> Self::Iter {
+        std::iter::repeat(true)
+    }
+    /// Always `None`.
+    fn get_query_size(self) -> Option<usize> {
+        None
+    }
+    /// Returns `exponents` unchanged.
+    fn generate_exps<F: PrimeField>(self, exponents: Arc<Vec<F::BigInt>>) -> Arc<Vec<F::BigInt>> {
+        exponents
+    }
+}
+
+fn multiexp_gpu<G, Q, D>(
     pool: &Worker,
-    bases: S,
+    bases: Arc<Vec<G>>,
     density_map: D,
-    exponents: Arc<Vec<<G::Scalar as PrimeField>::Repr>>,
+    exponents: Arc<Vec<Fr>>,
     kern: &mut MultiexpKernel<G>,
-) -> Result<G::Curve, EcError>
+) -> Result<G::Group, EcError>
 where
+    G: GpuAffine<ScalarField = Fr>,
     for<'a> &'a Q: QueryDensity,
     D: Send + Sync + 'static + Clone + AsRef<Q>,
-    G: PrimeCurveAffine + GpuName,
-    S: SourceBuilder<G>,
 {
-    let exps = density_map.as_ref().generate_exps::<G::Scalar>(exponents);
-    let (bss, skip) = bases.get();
-    kern.multiexp(pool, bss, exps, skip).map_err(Into::into)
+    let bases_gpu: Vec<G::GpuRepr> = bases.iter().map(|affine| affine.to_gpu()).collect();
+    let exps_bigint: Arc<Vec<_>> = Arc::new(exponents.iter().map(|e| e.into_bigint()).collect());
+    let exps = density_map.as_ref().generate_exps::<Fr>(exps_bigint);
+    kern.multiexp(pool, Arc::new(bases_gpu), exps, 0)
 }
 
-#[test]
-fn gpu_multiexp_consistency() {
+/// Trait to bridge our newtype wrappers with arkworks types for testing.
+trait TestableAffine: GpuAffine<ScalarField = Fr> + From<Self::ArkAffine> {
+    type ArkAffine: ark_ec::AffineRepr<ScalarField = Fr> + Copy;
+    type ArkProjective: CurveGroup<Affine = Self::ArkAffine, ScalarField = Fr> + UniformRand;
+    /// Group label used in the consistency test's progress and assertion messages.
+    fn group_name() -> &'static str;
+}
+/// G1: pairs the `ec_gpu::arkworks_bn254` wrapper with the `ark_bn254` G1 point types.
+impl TestableAffine for G1Affine {
+    type ArkAffine = ark_bn254::G1Affine;
+    type ArkProjective = ark_bn254::G1Projective;
+    /// Label for G1.
+    fn group_name() -> &'static str {
+        "G1"
+    }
+}
+/// G2: pairs the `ec_gpu::arkworks_bn254` wrapper with the `ark_bn254` G2 point types.
+impl TestableAffine for G2Affine {
+    type ArkAffine = ark_bn254::G2Affine;
+    type ArkProjective = ark_bn254::G2Projective;
+    /// Label for G2.
+    fn group_name() -> &'static str {
+        "G2"
+    }
+}
+
+fn gpu_multiexp_consistency_test<G>(start_log_d: usize, max_log_d: usize)
+where
+    G: TestableAffine,
+    G::Group: PartialEq<G::ArkProjective>,
+{
     fil_logger::maybe_init();
-    const MAX_LOG_D: usize = 16;
-    const START_LOG_D: usize = 10;
+
     let devices = Device::all();
     let programs = devices
         .iter()
         .map(|device| crate::program!(device))
         .collect::<Result<_, _>>()
         .expect("Cannot create programs!");
-    let mut kern = MultiexpKernel::<<Bls12 as Engine>::G1Affine>::create(programs, &devices)
-        .expect("Cannot initialize kernel!");
+    let mut kern =
+        MultiexpKernel::<G>::create(programs, &devices).expect("Cannot initialize kernel!");
     let pool = Worker::new();
 
     let mut rng = rand::thread_rng();
 
-    let mut bases = (0..(1 << START_LOG_D))
-        .map(|_| <Bls12 as Engine>::G1::random(&mut rng).to_affine())
-        .collect::<Vec<_>>();
+    let mut bases_ark: Vec<G::ArkAffine> = (0..(1 << start_log_d))
+        .map(|_| G::ArkProjective::rand(&mut rng).into_affine())
+        .collect();
 
-    for log_d in START_LOG_D..=MAX_LOG_D {
-        let g = Arc::new(bases.clone());
+    for log_d in start_log_d..=max_log_d {
+        let bases: Vec<G> = bases_ark.iter().map(|p| G::from(*p)).collect();
+        let g = Arc::new(bases);
 
         let samples = 1 << log_d;
-        println!("Testing Multiexp for {} elements...", samples);
-
-        let v = Arc::new(
-            (0..samples)
-                .map(|_| <Bls12 as Engine>::Fr::random(&mut rng).to_repr())
-                .collect::<Vec<_>>(),
+        println!(
+            "Testing {} Multiexp for {} elements...",
+            G::group_name(),
+            samples
         );
 
+        let v: Vec<Fr> = (0..samples).map(|_| Fr::rand(&mut rng)).collect();
+        let v_arc = Arc::new(v.clone());
+
         let mut now = Instant::now();
-        let gpu = multiexp_gpu(&pool, (g.clone(), 0), FullDensity, v.clone(), &mut kern).unwrap();
+        let gpu = multiexp_gpu(&pool, g.clone(), FullDensity, v_arc.clone(), &mut kern).unwrap();
         let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
         println!("GPU took {}ms.", gpu_dur);
 
         now = Instant::now();
-        let cpu = multiexp_cpu(&pool, (g.clone(), 0), FullDensity, v.clone())
-            .wait()
-            .unwrap();
+        let cpu: G::ArkProjective =
+            VariableBaseMSM::msm(bases_ark.as_slice(), v.as_slice()).unwrap();
         let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64;
         println!("CPU took {}ms.", cpu_dur);
 
         println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32);
 
-        assert_eq!(cpu, gpu);
+        assert_eq!(
+            gpu,
+            cpu,
+            "GPU and CPU results differ for {} MSM",
+            G::group_name()
+        );
 
         println!("============================");
 
-        bases = [bases.clone(), bases.clone()].concat();
+        bases_ark = [bases_ark.clone(), bases_ark.clone()].concat();
     }
 }
+/// Runs the GPU-vs-CPU MSM consistency check for G1 with `start_log_d = 10`, `max_log_d = 16`.
+#[test]
+fn gpu_multiexp_g1_consistency() {
+    gpu_multiexp_consistency_test::<G1Affine>(10, 16);
+}
+/// Runs the GPU-vs-CPU MSM consistency check for G2 with `start_log_d = 10`, `max_log_d = 16`.
+#[test]
+fn gpu_multiexp_g2_consistency() {
+    gpu_multiexp_consistency_test::<G2Affine>(10, 16);
+}
+
+/// Test that the small scalar optimization works correctly.
+/// When scalars are small (here: 64-bit), the GPU result must still equal the arkworks
+/// CPU MSM over the same bases and scalars.
+#[test]
+fn gpu_multiexp_small_scalars() {
+    use rand::Rng;
+
+    fil_logger::maybe_init();
+    let devices = Device::all();
+    let programs = devices
+        .iter()
+        .map(|device| crate::program!(device))
+        .collect::<Result<_, _>>()
+        .expect("Cannot create programs!");
+    let mut kern =
+        MultiexpKernel::<G1Affine>::create(programs, &devices).expect("Cannot initialize kernel!");
+    let pool = Worker::new();
+    let mut rng = rand::thread_rng();
+
+    let num_points = 1 << 16;
+    println!(
+        "Testing small scalar MSM optimization with {} points...",
+        num_points
+    );
+
+    let bases_ark: Vec<ark_bn254::G1Affine> = (0..num_points)
+        .map(|_| G1Projective::rand(&mut rng).into_affine())
+        .collect();
+
+    // Small scalars: only 64 of the 254 bits are used. Shared via `Arc`, so no deep copy.
+    let scalars: Arc<Vec<Fr>> = Arc::new(
+        (0..num_points)
+            .map(|_| Fr::from(rng.gen::<u64>()))
+            .collect(),
+    );
+
+    // Wrap the arkworks points in the GPU-capable newtype; the vector is moved into the call.
+    let bases: Vec<G1Affine> = bases_ark.iter().copied().map(G1Affine::from).collect();
+
+    let now = Instant::now();
+    let gpu: G1Projective =
+        multiexp_gpu(&pool, Arc::new(bases), FullDensity, scalars.clone(), &mut kern).unwrap();
+    // Read the clock once; two `elapsed()` calls could straddle a second boundary.
+    let gpu_dur = now.elapsed().as_millis();
+    println!("Small scalar GPU MSM took {}ms.", gpu_dur);
+
+    let cpu: G1Projective =
+        VariableBaseMSM::msm(bases_ark.as_slice(), scalars.as_slice()).unwrap();
+
+    assert_eq!(cpu, gpu, "Small scalar MSM mismatch!");
+    println!("Small scalar MSM test passed!");
+}
+
+/// Test edge case with very small scalars (32-bit): the GPU result must equal the CPU MSM.
+#[test]
+fn gpu_multiexp_very_small_scalars() {
+    use rand::Rng;
+
+    fil_logger::maybe_init();
+    let devices = Device::all();
+    let programs = devices
+        .iter()
+        .map(|device| crate::program!(device))
+        .collect::<Result<_, _>>()
+        .expect("Cannot create programs!");
+    let mut kern =
+        MultiexpKernel::<G1Affine>::create(programs, &devices).expect("Cannot initialize kernel!");
+    let pool = Worker::new();
+    let mut rng = rand::thread_rng();
+
+    let num_points = 1 << 14;
+    println!(
+        "Testing very small scalar (32-bit) MSM with {} points...",
+        num_points
+    );
+
+    let bases_ark: Vec<ark_bn254::G1Affine> = (0..num_points)
+        .map(|_| G1Projective::rand(&mut rng).into_affine())
+        .collect();
+
+    // Very small scalars: only 32 bits are used. Shared via `Arc`, so no deep copy.
+    let scalars: Arc<Vec<Fr>> = Arc::new(
+        (0..num_points)
+            .map(|_| Fr::from(u64::from(rng.gen::<u32>())))
+            .collect(),
+    );
+
+    let bases: Vec<G1Affine> = bases_ark.iter().copied().map(G1Affine::from).collect();
+
+    let now = Instant::now();
+    let gpu: G1Projective =
+        multiexp_gpu(&pool, Arc::new(bases), FullDensity, scalars.clone(), &mut kern).unwrap();
+    // Read the clock once; two `elapsed()` calls could straddle a second boundary.
+    let gpu_dur = now.elapsed().as_millis();
+    println!("Very small scalar GPU MSM took {}ms.", gpu_dur);
+
+    let cpu: G1Projective =
+        VariableBaseMSM::msm(bases_ark.as_slice(), scalars.as_slice()).unwrap();
+
+    assert_eq!(cpu, gpu, "Very small scalar MSM mismatch!");
+    println!("Very small scalar MSM test passed!");
+}
+
+/// Runs a single 2^16-point G1 multiexp with a `PrintTreeLayer` subscriber installed.
+/// Each stage (kernel creation, input generation, conversion, the multiexp itself) runs
+/// inside its own `debug_span!`, all nested under one root span.
+#[test]
+fn gpu_multiexp_profile() {
+    use ec_gpu_gen::multiexp::SingleMultiexpKernel;
+
+    // Install the tree layer; the filter keeps spans and drops events.
+    let config = PrintTreeConfig {
+        hide_below_percent: 0.0,
+        accumulate_events: false,
+        ..PrintTreeConfig::default()
+    };
+    let (layer, _guard) = PrintTreeLayer::new(config);
+    tracing_subscriber::registry()
+        .with(layer.with_filter(filter_fn(|metadata| metadata.is_span())))
+        .init();
+
+    // All stage spans below are entered while this root span is active.
+    let root = debug_span!("profile_multiexp");
+    let _root_guard = root.enter();
+
+    // Only the first detected device is used.
+    let devices = Device::all();
+    let device = &devices[0];
+    let program = crate::program!(device).expect("Cannot create program!");
+
+    // Stage: kernel construction.
+    let kern = debug_span!("create_kernel").in_scope(|| {
+        SingleMultiexpKernel::<G1Affine>::create(program, device, None)
+            .expect("Cannot initialize kernel!")
+    });
+
+    let mut rng = rand::thread_rng();
+    let log_n = 16;
+    let n = 1 << log_n;
+
+    // Stage: random arkworks bases.
+    let bases_ark: Vec<ark_bn254::G1Affine> = debug_span!("generate_bases", n = n).in_scope(|| {
+        (0..n)
+            .map(|_| G1Projective::rand(&mut rng).into_affine())
+            .collect()
+    });
+
+    // Stage: conversion of every base to its GPU representation.
+    let bases_gpu: Vec<_> = debug_span!("convert_bases_to_gpu", n = n).in_scope(|| {
+        bases_ark
+            .iter()
+            .map(|p| G1Affine::from(*p).to_gpu())
+            .collect()
+    });
+
+    // Stage: random exponents, already converted to big integers.
+    let exponents: Vec<_> = debug_span!("generate_exponents", n = n).in_scope(|| {
+        (0..n).map(|_| Fr::rand(&mut rng).into_bigint()).collect()
+    });
+
+    // Run multiexp on main thread - this will show all nested spans
+    let _result = debug_span!("run_multiexp", n = n).in_scope(|| {
+        kern.multiexp(&bases_gpu, &exponents)
+            .expect("multiexp failed")
+    });
+
+    drop(_root_guard);
+}
diff --git a/rust-toolchain b/rust-toolchain
deleted file mode 100644
index 6b4de0a..0000000
--- a/rust-toolchain
+++ /dev/null
@@ -1 +0,0 @@
-1.83.0