diff --git a/Cargo.toml b/Cargo.toml index a36e52f..86c90fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,3 +5,7 @@ members = [ "gpu-tests", ] resolver = "2" + +[workspace.dependencies] +ec-gpu = { path = "./ec-gpu" } +tracing-profile = { git = "https://github.com/IrreducibleOSS/tracing-profile" } diff --git a/ec-gpu-gen/Cargo.toml b/ec-gpu-gen/Cargo.toml index 51f75b9..8584597 100644 --- a/ec-gpu-gen/Cargo.toml +++ b/ec-gpu-gen/Cargo.toml @@ -12,11 +12,11 @@ rust-version = "1.83.0" [dependencies] bitvec = "1.0.1" crossbeam-channel = "0.5.1" -ec-gpu = "0.2.0" +ec-gpu.workspace = true execute = "0.2.9" -ff = { version = "0.13.0", default-features = false } -group = "0.13.0" hex = "0.4" +# Pin home to version compatible with Rust 1.83 (edition 2021) +home = "=0.5.9" log = "0.4.14" num_cpus = "1.13.0" once_cell = "1.8.0" @@ -24,22 +24,23 @@ rayon = "1.5.1" rust-gpu-tools = { version = "0.7.0", default-features = false, optional = true } sha2 = "0.10" thiserror = "1.0.30" +tracing = "0.1" yastl = "0.1.2" +ark-bn254 = { version = "0.5.0", optional = true } +ark-std = { version = "0.5.0", optional = true } +ark-ec = "0.5.0" +ark-ff = "0.5.0" +ark-serialize = { version = "0.5.0", optional = true } + [dev-dependencies] -# NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just -# temporarily until https://github.com/zkcrypto/group/pull/29 is fixed. Then -# we won't need the exports of `Fp` and `Fp2` any more. -#blstrs = { version = "0.6.0", features = ["__private_bench"], optional = true } -blstrs = { version = "0.7.0", features = ["__private_bench", "gpu"] } rand = "0.8" lazy_static = "1.2" -pairing = "0.23.0" temp-env = "0.3.0" -rand_core = "0.6.3" -rand_xorshift = "0.3.0" +ark-bn254 = "0.5.0" [features] default = [] cuda = ["rust-gpu-tools/cuda"] opencl = ["rust-gpu-tools/opencl"] +arkworks = ["ec-gpu/arkworks", "dep:ark-bn254", "dep:ark-std", "dep:ark-serialize"] diff --git a/ec-gpu-gen/src/cl/field.cl b/ec-gpu-gen/src/cl/field.cl index 5846b59..4aa1c77 100644 --- a/ec-gpu-gen/src/cl/field.cl +++ b/ec-gpu-gen/src/cl/field.cl @@ -164,6 +164,94 @@ DEVICE void FIELD_reduce(uint32_t accLow[FIELD_LIMBS], uint32_t np0, uint32_t fq accLow[i]=chain_add(&chain5, accLow[i], highCarry); } +// Optimized squaring: exploits symmetry a² = a·a +// Cross products aᵢ·aⱼ for i≠j appear twice, so compute once and double. +// Reduces from n² to n(n+1)/2 multiplications (~44% fewer for 8 limbs). +DEVICE inline +void FIELD_sqr_v1(uint32_t *x, uint32_t *xx) { + const uint32_t xLimbs = FIELD_LIMBS; + const uint32_t xxLimbs = FIELD_LIMBS * 2; + uint32_t temp[FIELD_LIMBS * 2]; + uint32_t carry = 0; + + #pragma unroll + for (int32_t i = 0; i < xxLimbs; i++) { + temp[i] = 0; + } + + // Step 1: Compute off-diagonal products for odd (i+j) positions + // Following the same pattern as FIELD_mult_v1 for correctness + #pragma unroll + for (int32_t i = 0; i < xLimbs; i++) { + chain_t chain1; + chain_init(&chain1); + #pragma unroll + for (int32_t j = i + 1; j < xLimbs; j++) { + if ((i + j) % 2 == 1) { + temp[i + j - 1] = chain_madlo(&chain1, x[i], x[j], temp[i + j - 1]); + temp[i + j] = chain_madhi(&chain1, x[i], x[j], temp[i + j]); + } + } + if (i % 2 == 1 && i + 1 < xLimbs) { + temp[i + xLimbs - 1] = chain_add(&chain1, 0, 0); + } + } + + // Shift right by 1 position (same as mult_v1) + #pragma unroll + for (int32_t i = xxLimbs - 1; i > 0; i--) { + temp[i] = temp[i - 1]; + } + temp[0] = 0; + + // Step 2: Compute off-diagonal products for even (i+j) positions + carry = 0; + #pragma unroll + for (int32_t i = 0; i < xLimbs; i++) { + chain_t chain2; + chain_init(&chain2); + + #pragma unroll + for (int32_t j = i + 1; j < xLimbs; j++) { + if ((i + j) % 2 == 0) { + temp[i + j] = chain_madlo(&chain2, x[i], x[j], temp[i + j]); + temp[i + j + 1] = chain_madhi(&chain2, x[i], x[j], temp[i + j + 1]); + } + } + if ((i + xLimbs) % 2 == 0 && i != xLimbs - 1 && i + 1 < xLimbs) { + temp[i + xLimbs] = chain_add(&chain2, temp[i + xLimbs], carry); + temp[i + xLimbs + 1] = chain_add(&chain2, temp[i + xLimbs + 1], 0); + carry = chain_add(&chain2, 0, 0); + } + if ((i + xLimbs) % 2 == 1 && i != xLimbs - 1 && i + 1 < xLimbs) { + carry = chain_add(&chain2, carry, 0); + } + } + + // Step 3: Double the off-diagonal products (left shift by 1 bit) + carry = 0; + #pragma unroll + for (int32_t i = 0; i < xxLimbs; i++) { + uint32_t new_carry = temp[i] >> 31; + temp[i] = (temp[i] << 1) | carry; + carry = new_carry; + } + + // Step 4: Add diagonal products x[i] * x[i] + chain_t chain3; + chain_init(&chain3); + #pragma unroll + for (int32_t i = 0; i < xLimbs; i++) { + temp[2 * i] = chain_madlo(&chain3, x[i], x[i], temp[2 * i]); + temp[2 * i + 1] = chain_madhi(&chain3, x[i], x[i], temp[2 * i + 1]); + } + + #pragma unroll + for (int32_t i = 0; i < xxLimbs; i++) { + xx[i] = temp[i]; + } +} + // Requirement: yLimbs >= xLimbs DEVICE inline void FIELD_mult_v1(uint32_t *x, uint32_t *y, uint32_t *xy) { @@ -262,6 +350,40 @@ DEVICE FIELD FIELD_mul_nvidia(FIELD a, FIELD b) { return r; } +DEVICE FIELD FIELD_sqr_nvidia(FIELD a) { + // Perform optimized squaring + limb aa[2 * FIELD_LIMBS]; + FIELD_sqr_v1(a.val, aa); + + uint32_t io[FIELD_LIMBS]; + #pragma unroll + for(int i=0;i> (i % FIELD_LIMB_BITS)) & 1; +} + +// Get `window` consecutive bits, (Starting from `skip`th bit from LSB) from the field. +DEVICE uint FIELD_get_bits_lsb(FIELD l, uint skip, uint window) { + uint ret = 0; + for(uint i = 0; i < window; i++) { + ret |= ((uint)FIELD_get_bit_lsb(l, skip + i)) << i; + } + return ret; +} diff --git a/ec-gpu-gen/src/cl/multiexp.cl b/ec-gpu-gen/src/cl/multiexp.cl index 6edc445..2c77693 100644 --- a/ec-gpu-gen/src/cl/multiexp.cl +++ b/ec-gpu-gen/src/cl/multiexp.cl @@ -43,7 +43,7 @@ KERNEL void POINT_multiexp( POINT_jacobian res = POINT_ZERO; for(uint i = nstart; i < nend; i++) { - uint ind = EXPONENT_get_bits(exps[i], bits, w); + uint ind = EXPONENT_get_bits_lsb(exps[i], bits, w); #if defined(OPENCL_NVIDIA) || defined(CUDA) // O_o, weird optimization, having a single special case makes it diff --git a/ec-gpu-gen/src/fft.rs b/ec-gpu-gen/src/fft.rs index aef4815..e19d51d 100644 --- a/ec-gpu-gen/src/fft.rs +++ b/ec-gpu-gen/src/fft.rs @@ -1,8 +1,8 @@ use std::cmp; use std::sync::{Arc, RwLock}; +use ark_ff::FftField; use ec_gpu::GpuName; -use ff::Field; use log::{error, info}; use rust_gpu_tools::{program_closures, LocalBuffer, Program}; @@ -16,17 +16,14 @@ const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 7; // 128 /// FFT kernel for a single GPU. pub struct SingleFftKernel<'a, F> where - F: Field + GpuName, + F: FftField + GpuName, { program: Program, - /// An optional function which will be called at places where it is possible to abort the FFT - /// calculations. If it returns true, the calculation will be aborted with an - /// [`EcError::Aborted`]. maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, _phantom: std::marker::PhantomData, } -impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> { +impl<'a, F: FftField + GpuName> SingleFftKernel<'a, F> { /// Create a new FFT instance for the given device. /// /// The `maybe_abort` function is called when it is possible to abort the computation, without @@ -48,23 +45,18 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> { pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> { let closures = program_closures!(|program, input: &mut [F]| -> EcResult<()> { let n = 1 << log_n; - // All usages are safe as the buffers are initialized from either the host or the GPU - // before they are read. let mut src_buffer = unsafe { program.create_buffer::(n)? }; let mut dst_buffer = unsafe { program.create_buffer::(n)? }; - // The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg` let max_deg = cmp::min(MAX_LOG2_RADIX, log_n); - // Precalculate: - // [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))] + // Precalculate twiddle factors let mut pq = vec![F::ZERO; 1 << max_deg >> 1]; - let twiddle = omega.pow_vartime([(n >> max_deg) as u64]); + let twiddle = omega.pow([(n >> max_deg) as u64]); pq[0] = F::ONE; if max_deg > 1 { pq[1] = twiddle; for i in 2..(1 << max_deg >> 1) { - pq[i] = pq[i - 1]; - pq[i].mul_assign(&twiddle); + pq[i] = pq[i - 1] * twiddle; } } let pq_buffer = program.create_buffer_from_slice(&pq)?; @@ -73,14 +65,13 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> { let mut omegas = vec![F::ZERO; 32]; omegas[0] = *omega; for i in 1..LOG2_MAX_ELEMENTS { - omegas[i] = omegas[i - 1].pow_vartime([2u64]); + omegas[i] = omegas[i - 1].pow([2u64]); } let omegas_buffer = program.create_buffer_from_slice(&omegas)?; program.write_from_buffer(&mut src_buffer, &*input)?; - // Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html) let mut log_p = 0u32; - // Each iteration performs a FFT round + while log_p < log_n { if let Some(maybe_abort) = &self.maybe_abort { if maybe_abort() { @@ -88,9 +79,7 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> { } } - // 1=>radix2, 2=>radix4, 3=>radix8, ... let deg = cmp::min(max_deg, log_n - log_p); - let n = 1u32 << log_n; let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE); let global_work_size = n >> deg; @@ -117,7 +106,6 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> { } program.read_into_buffer(&src_buffer, input)?; - Ok(()) }); @@ -128,14 +116,14 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> { /// One FFT kernel for each GPU available. pub struct FftKernel<'a, F> where - F: Field + GpuName, + F: FftField + GpuName, { kernels: Vec>, } impl<'a, F> FftKernel<'a, F> where - F: Field + GpuName, + F: FftField + GpuName, { /// Create new kernels, one for each given device. pub fn create(programs: Vec) -> EcResult { @@ -175,28 +163,20 @@ where if kernels.is_empty() { return Err(EcError::Simple("No working GPUs found!")); } - info!("FFT: {} working device(s) selected. ", kernels.len()); + info!("FFT: {} working device(s) selected.", kernels.len()); for (i, k) in kernels.iter().enumerate() { - info!("FFT: Device {}: {}", i, k.program.device_name(),); + info!("FFT: Device {}: {}", i, k.program.device_name()); } Ok(Self { kernels }) } - /// Performs FFT on `input` - /// * `omega` - Special value `omega` is used for FFT over finite-fields - /// * `log_n` - Specifies log2 of number of elements - /// - /// Uses the first available GPU. + /// Performs FFT on `input` using the first available GPU. pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> { self.kernels[0].radix_fft(input, omega, log_n) } - /// Performs FFT on `inputs` - /// * `omega` - Special value `omega` is used for FFT over finite-fields - /// * `log_n` - Specifies log2 of number of elements - /// - /// Uses all available GPUs to distribute the work. + /// Performs FFT on `inputs` using all available GPUs. pub fn radix_fft_many( &mut self, inputs: &mut [&mut [F]], @@ -237,3 +217,8 @@ where Arc::try_unwrap(result).unwrap().into_inner().unwrap() } } + +/// Type alias for backward compatibility +pub type SingleFftKernelArk<'a, F> = SingleFftKernel<'a, F>; +/// Type alias for backward compatibility +pub type FftKernelArk<'a, F> = FftKernel<'a, F>; diff --git a/ec-gpu-gen/src/fft_cpu.rs b/ec-gpu-gen/src/fft_cpu.rs index 777ffb5..acabc4d 100644 --- a/ec-gpu-gen/src/fft_cpu.rs +++ b/ec-gpu-gen/src/fft_cpu.rs @@ -1,4 +1,4 @@ -use ff::PrimeField; +use ark_ff::FftField; use crate::threadpool::Worker; @@ -7,7 +7,7 @@ use crate::threadpool::Worker; /// The input `a` is mutated and contains the result when this function returns. The length of the /// input vector must be `2^log_n`. #[allow(clippy::many_single_char_names)] -pub fn serial_fft(a: &mut [F], omega: &F, log_n: u32) { +pub fn serial_fft(a: &mut [F], omega: &F, log_n: u32) { fn bitreverse(mut n: u32, l: u32) -> u32 { let mut r = 0; for _ in 0..l { @@ -29,7 +29,7 @@ pub fn serial_fft(a: &mut [F], omega: &F, log_n: u32) { let mut m = 1; for _ in 0..log_n { - let w_m = omega.pow_vartime([u64::from(n / (2 * m))]); + let w_m = omega.pow([u64::from(n / (2 * m))]); let mut k = 0; while k < n { @@ -56,7 +56,7 @@ pub fn serial_fft(a: &mut [F], omega: &F, log_n: u32) { /// The result is written to the input `a`. /// The number of threads used will be `2^log_threads`. /// There must be more items to process than threads. -pub fn parallel_fft( +pub fn parallel_fft( a: &mut [F], worker: &Worker, omega: &F, @@ -68,7 +68,7 @@ pub fn parallel_fft( let num_threads = 1 << log_threads; let log_new_n = log_n - log_threads; let mut tmp = vec![vec![F::ZERO; 1 << log_new_n]; num_threads]; - let new_omega = omega.pow_vartime([num_threads as u64]); + let new_omega = omega.pow([num_threads as u64]); worker.scope(0, |scope, _| { let a = &*a; @@ -76,8 +76,8 @@ pub fn parallel_fft( for (j, tmp) in tmp.iter_mut().enumerate() { scope.execute(move || { // Shuffle into a sub-FFT - let omega_j = omega.pow_vartime([j as u64]); - let omega_step = omega.pow_vartime([(j as u64) << log_new_n]); + let omega_j = omega.pow([j as u64]); + let omega_step = omega.pow([(j as u64) << log_new_n]); let mut elt = F::ONE; for (i, tmp) in tmp.iter_mut().enumerate() { @@ -97,7 +97,6 @@ pub fn parallel_fft( } }); - // TODO: does this hurt or help? worker.scope(a.len(), |scope, chunk| { let tmp = &tmp; @@ -120,15 +119,13 @@ mod tests { use std::cmp::min; - use blstrs::Scalar as Fr; - use ff::PrimeField; - use rand_core::RngCore; + use ark_bn254::Fr; + use ark_ff::UniformRand; - fn omega(num_coeffs: usize) -> F { - // Compute omega, the 2^exp primitive root of unity + fn omega(num_coeffs: usize) -> F { let exp = (num_coeffs as f32).log2().floor() as u32; - let mut omega = F::ROOT_OF_UNITY; - for _ in exp..F::S { + let mut omega = F::TWO_ADIC_ROOT_OF_UNITY; + for _ in exp..F::TWO_ADICITY { omega = omega.square(); } omega @@ -136,30 +133,24 @@ mod tests { #[test] fn parallel_fft_consistency() { - fn test_consistency(rng: &mut R) { - let worker = Worker::new(); + let worker = Worker::new(); + let mut rng = rand::thread_rng(); - for _ in 0..5 { - for log_d in 0..10 { - let d = 1 << log_d; + for _ in 0..5 { + for log_d in 0..10 { + let d = 1 << log_d; - let mut v1_coeffs = (0..d).map(|_| F::random(&mut *rng)).collect::>(); - let mut v2_coeffs = v1_coeffs.clone(); - let v1_omega = omega::(v1_coeffs.len()); - let v2_omega = v1_omega; + let mut v1_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::>(); + let mut v2_coeffs = v1_coeffs.clone(); + let fft_omega = omega::(v1_coeffs.len()); - for log_threads in log_d..min(log_d + 1, 3) { - parallel_fft::(&mut v1_coeffs, &worker, &v1_omega, log_d, log_threads); - serial_fft::(&mut v2_coeffs, &v2_omega, log_d); + for log_threads in log_d..min(log_d + 1, 3) { + parallel_fft::(&mut v1_coeffs, &worker, &fft_omega, log_d, log_threads); + serial_fft::(&mut v2_coeffs, &fft_omega, log_d); - assert!(v1_coeffs == v2_coeffs); - } + assert!(v1_coeffs == v2_coeffs); } } } - - let rng = &mut rand::thread_rng(); - - test_consistency::(rng); } } diff --git a/ec-gpu-gen/src/lib.rs b/ec-gpu-gen/src/lib.rs index f487959..77f1735 100644 --- a/ec-gpu-gen/src/lib.rs +++ b/ec-gpu-gen/src/lib.rs @@ -4,29 +4,35 @@ //! //! There is also support for Fast Fourier Transform and Multiexponentiation. //! -//! This crate usually creates GPU kernels at compile-time. CUDA generates a [fatbin], which OpenCL only generates the source code, which is then compiled at run-time. +//! This crate usually creates GPU kernels at compile-time. CUDA generates a [fatbin], which OpenCL +//! only generates the source code, which is then compiled at run-time. //! -//! In order to make things easier to use, there are helper functions available. You would put some code into `build.rs`, that generates the kernels, and some code into your library which then consumes those generated kernels. The kernels will be directly embedded into your program/library. If something goes wrong, you will get an error at compile-time. +//! In order to make things easier to use, there are helper functions available. You would put some +//! code into `build.rs`, that generates the kernels, and some code into your library which then +//! consumes those generated kernels. The kernels will be directly embedded into your program/library. +//! If something goes wrong, you will get an error at compile-time. //! //! In this example we will make use of the FFT functionality. Add to your `build.rs`: //! //! ```no_run -//! use blstrs::Scalar; +//! use ark_bn254::Fr; //! use ec_gpu_gen::SourceBuilder; //! -//! let source_builder = SourceBuilder::new().add_fft::(); +//! let source_builder = SourceBuilder::new().add_fft::(); //! ec_gpu_gen::generate(&source_builder); //! ``` //! -//! The `ec_gpu_gen::generate()` takes care of the actual code generation/compilation. It will automatically create a CUDA and/or OpenCL kernel. It will define two environment variables, which are meant for internal use. `_EC_GPU_CUDA_KERNEL_FATBIN` that points to the compiled CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE` that points to the generated OpenCL source. +//! The `ec_gpu_gen::generate()` takes care of the actual code generation/compilation. It will +//! automatically create a CUDA and/or OpenCL kernel. It will define two environment variables, +//! which are meant for internal use. `_EC_GPU_CUDA_KERNEL_FATBIN` that points to the compiled +//! CUDA kernel, and `_EC_GPU_OPENCL_KERNEL_SOURCE` that points to the generated OpenCL source. //! -//! Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a program, for a given GPU device. Using FFT within your library would then look like this: +//! Those variables are then picked up by the `ec_gpu_gen::program!()` macro, which generates a +//! program, for a given GPU device. Using FFT within your library would then look like this: //! -//! ```no_compile -//! use blstrs::Scalar; -//! use ec_gpu_gen::{ -//! rust_gpu_tools::Device, -//! }; +//! ```ignore +//! use ark_bn254::Fr; +//! use ec_gpu_gen::{fft::FftKernel, rust_gpu_tools::Device}; //! //! let devices = Device::all(); //! let programs = devices @@ -35,7 +41,7 @@ //! .collect::>() //! .expect("Cannot create programs!"); //! -//! let mut kern = FftKernel::::create(programs).expect("Cannot initialize kernel!"); +//! let mut kern = FftKernel::::create(programs).expect("Cannot initialize kernel!"); //! kern.radix_fft_many(&mut [&mut coeffs], &[omega], &[log_d]).expect("GPU FFT failed!"); //! ``` //! @@ -59,8 +65,6 @@ pub mod fft_cpu; /// Multiexponentiation on the GPU. #[cfg(any(feature = "cuda", feature = "opencl"))] pub mod multiexp; -/// Multiexponentiation on the CPU. -pub mod multiexp_cpu; /// Helpers for multithreaded code. pub mod threadpool; @@ -70,3 +74,8 @@ pub use rust_gpu_tools; pub use error::{EcError, EcResult}; pub use source::{generate, SourceBuilder}; + +#[cfg(any(feature = "cuda", feature = "opencl"))] +pub use fft::{FftKernel, FftKernelArk, SingleFftKernel, SingleFftKernelArk}; +#[cfg(any(feature = "cuda", feature = "opencl"))] +pub use multiexp::{G1AffineM, G2AffineM, GpuAffine, MultiexpKernel, SingleMultiexpKernel}; diff --git a/ec-gpu-gen/src/multiexp.rs b/ec-gpu-gen/src/multiexp.rs index 24780b8..8a8a3a8 100644 --- a/ec-gpu-gen/src/multiexp.rs +++ b/ec-gpu-gen/src/multiexp.rs @@ -1,11 +1,12 @@ use std::ops::AddAssign; use std::sync::{Arc, RwLock}; +use ark_ec::CurveGroup; +use ark_ff::{AdditiveGroup, BigInteger, PrimeField}; use ec_gpu::GpuName; -use ff::PrimeField; -use group::{prime::PrimeCurveAffine, Group}; use log::{error, info}; use rust_gpu_tools::{program_closures, Device, Program}; +use tracing::{debug_span, info_span}; use yastl::Scope; use crate::{ @@ -13,6 +14,19 @@ use crate::{ threadpool::Worker, }; +/// Trait for curve affine points that have a GPU-compatible representation. +pub trait GpuAffine: GpuName + Clone + Send + Sync + Sized { + /// The GPU-compatible representation type. + type GpuRepr: Copy + Clone + Default + Send + Sync; + /// The scalar field type. + type ScalarField: PrimeField; + /// The projective group type. + type Group: CurveGroup + AddAssign; + + /// Convert the affine point to its GPU representation. + fn to_gpu(&self) -> Self::GpuRepr; +} + /// On the GPU, the exponents are split into windows, this is the maximum number of such windows. const MAX_WINDOW_SIZE: usize = 10; /// In CUDA this is the number of blocks per grid (grid size). @@ -45,7 +59,7 @@ const fn work_units(compute_units: u32, compute_capabilities: Option<(u32, u32)> /// Multiexp kernel for a single GPU. pub struct SingleMultiexpKernel<'a, G> where - G: PrimeCurveAffine, + G: GpuAffine, { program: Program, /// The number of exponentiations the GPU can handle in a single execution of the kernel. @@ -58,18 +72,17 @@ where /// [`EcError::Aborted`]. maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, - _phantom: std::marker::PhantomData, + _phantom: std::marker::PhantomData, } /// Calculates the maximum number of terms that can be put onto the GPU memory. fn calc_chunk_size(mem: u64, work_units: usize) -> usize where - G: PrimeCurveAffine, - G::Scalar: PrimeField, + G: GpuAffine, { - let aff_size = std::mem::size_of::(); - let exp_size = exp_size::(); - let proj_size = std::mem::size_of::(); + let aff_size = std::mem::size_of::(); + let exp_size = exp_size::(); + let proj_size = std::mem::size_of::(); // Leave `MEMORY_PADDING` percent of the memory free. let max_memory = ((mem as f64) * (1f64 - MEMORY_PADDING)) as usize; @@ -88,13 +101,152 @@ where /// The size of the exponent in bytes. /// /// It's the actual bytes size it needs in memory, not it's theoretical bit size. -fn exp_size() -> usize { - std::mem::size_of::() +fn exp_size() -> usize { + std::mem::size_of::() +} + +/// Computes the maximum number of significant bits across all scalar byte arrays. +/// Returns the position of the highest set bit + 1, or 1 if all scalars are zero. +fn compute_max_scalar_bits(scalars: &[[u8; 32]]) -> usize { + let max_bits = scalars + .iter() + .map(|bytes| { + // Scan from MSB to find highest non-zero byte + for (i, &byte) in bytes.iter().enumerate().rev() { + if byte != 0 { + return (i + 1) * 8 - byte.leading_zeros() as usize; + } + } + 0 + }) + .max() + .unwrap_or(0); + // Ensure at least 1 to avoid edge cases + max_bits.max(1) +} + +/// GPU-compatible representation of an affine point. +/// Coordinates are stored as 32-byte little-endian field elements in Montgomery form. +#[repr(C)] +#[derive(Copy, Clone, Debug, Default)] +pub struct G1AffineM { + /// X coordinate as 32 bytes in little-endian Montgomery form + pub x: [u8; 32], + /// Y coordinate as 32 bytes in little-endian Montgomery form + pub y: [u8; 32], +} + +#[cfg(feature = "arkworks")] +fn fq_to_montgomery_bytes(f: &ark_bn254::Fq) -> [u8; 32] { + // Arkworks stores Fq as 4 u64 limbs in Montgomery form + // We need the raw Montgomery representation, not the serialized (standard) form + let limbs: [u64; 4] = unsafe { std::mem::transmute_copy(f) }; + let mut out = [0u8; 32]; + for (i, limb) in limbs.iter().enumerate() { + out[i * 8..(i + 1) * 8].copy_from_slice(&limb.to_le_bytes()); + } + out +} + +#[cfg(feature = "arkworks")] +impl From for G1AffineM { + fn from(p: ark_bn254::G1Affine) -> Self { + use ark_ec::AffineRepr; + + if p.is_zero() { + return Self::default(); + } + + Self { + x: fq_to_montgomery_bytes(&p.x), + y: fq_to_montgomery_bytes(&p.y), + } + } +} + +#[cfg(feature = "arkworks")] +impl From<&ark_bn254::G1Affine> for G1AffineM { + fn from(p: &ark_bn254::G1Affine) -> Self { + (*p).into() + } +} + +/// GPU-compatible representation of a G2 affine point. +/// Coordinates are stored as 64-byte little-endian Fq2 elements (each Fq2 = two 32-byte Fq elements). +#[repr(C)] +#[derive(Copy, Clone, Debug)] +pub struct G2AffineM { + /// X coordinate as Fq2 (64 bytes: c0 followed by c1) + pub x: [u8; 64], + /// Y coordinate as Fq2 (64 bytes: c0 followed by c1) + pub y: [u8; 64], +} + +impl Default for G2AffineM { + fn default() -> Self { + Self { + x: [0u8; 64], + y: [0u8; 64], + } + } +} + +#[cfg(feature = "arkworks")] +fn fq2_to_montgomery_bytes(f: &ark_bn254::Fq2) -> [u8; 64] { + let mut out = [0u8; 64]; + out[..32].copy_from_slice(&fq_to_montgomery_bytes(&f.c0)); + out[32..].copy_from_slice(&fq_to_montgomery_bytes(&f.c1)); + out +} + +#[cfg(feature = "arkworks")] +impl From for G2AffineM { + fn from(p: ark_bn254::G2Affine) -> Self { + use ark_ec::AffineRepr; + + if p.is_zero() { + return Self::default(); + } + + Self { + x: fq2_to_montgomery_bytes(&p.x), + y: fq2_to_montgomery_bytes(&p.y), + } + } +} + +#[cfg(feature = "arkworks")] +impl From<&ark_bn254::G2Affine> for G2AffineM { + fn from(p: &ark_bn254::G2Affine) -> Self { + (*p).into() + } +} + +#[cfg(feature = "arkworks")] +impl GpuAffine for ec_gpu::arkworks_bn254::G1Affine { + type GpuRepr = G1AffineM; + type ScalarField = ark_bn254::Fr; + type Group = ark_bn254::G1Projective; + + fn to_gpu(&self) -> G1AffineM { + self.0.into() + } +} + +#[cfg(feature = "arkworks")] +impl GpuAffine for ec_gpu::arkworks_bn254::G2Affine { + type GpuRepr = G2AffineM; + type ScalarField = ark_bn254::Fr; + type Group = ark_bn254::G2Projective; + + fn to_gpu(&self) -> G2AffineM { + self.0.into() + } } impl<'a, G> SingleMultiexpKernel<'a, G> where - G: PrimeCurveAffine + GpuName, + G: GpuAffine, { /// Create a new Multiexp kernel instance for a device. /// @@ -105,6 +257,7 @@ where device: &Device, maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, ) -> EcResult { + let _span = debug_span!("single_multiexp_kernel_create").entered(); let mem = device.memory(); let compute_units = device.compute_units(); let compute_capability = device.compute_capability(); @@ -127,19 +280,38 @@ where /// running on. pub fn multiexp( &self, - bases: &[G], - exponents: &[::Repr], - ) -> EcResult { + bases: &[G::GpuRepr], + exponents: &[::BigInt], + ) -> EcResult { + let _span = debug_span!("single_multiexp", n = bases.len()).entered(); assert_eq!(bases.len(), exponents.len()); + let exponents: Vec<_> = { + let _span = debug_span!("convert_exponents").entered(); + exponents + .iter() + .map(|b| { + let mut out = [0u8; 32]; + let le = b.to_bytes_le(); + out[..le.len()].copy_from_slice(&le); + + out + }) + .collect() + }; + if let Some(maybe_abort) = &self.maybe_abort { if maybe_abort() { return Err(EcError::Aborted); } } + + // Compute actual bit length needed for small scalar optimization + let effective_bits = compute_max_scalar_bits(&exponents); + let window_size = self.calc_window_size(bases.len()); - // windows_size * num_windows needs to be >= 256 in order for the kernel to work correctly. - let num_windows = div_ceil(256, window_size); + // windows_size * num_windows needs to be >= effective_bits to cover all scalar bits. + let num_windows = div_ceil(effective_bits, window_size); let num_groups = self.work_units / num_windows; let bucket_len = 1 << window_size; @@ -147,36 +319,55 @@ where // be `num_groups` * `num_windows` threads in total. // Each thread will use `num_groups` * `num_windows` * `bucket_len` buckets. - let closures = program_closures!(|program, _arg| -> EcResult> { - let base_buffer = program.create_buffer_from_slice(bases)?; - let exp_buffer = program.create_buffer_from_slice(exponents)?; - - // It is safe as the GPU will initialize that buffer - let bucket_buffer = - unsafe { program.create_buffer::(self.work_units * bucket_len)? }; - // It is safe as the GPU will initialize that buffer - let result_buffer = unsafe { program.create_buffer::(self.work_units)? }; + let closures = program_closures!(|program, _arg| -> EcResult> { + let base_buffer = { + let _span = debug_span!("upload_bases").entered(); + program.create_buffer_from_slice(bases)? + }; + let exp_buffer = { + let _span = debug_span!("upload_exponents").entered(); + program.create_buffer_from_slice(&exponents)? + }; + + let (bucket_buffer, result_buffer) = { + let _span = debug_span!("allocate_gpu_buffers").entered(); + // It is safe as the GPU will initialize that buffer + let bucket_buffer = + unsafe { program.create_buffer::(self.work_units * bucket_len)? }; + // It is safe as the GPU will initialize that buffer + let result_buffer = unsafe { program.create_buffer::(self.work_units)? }; + (bucket_buffer, result_buffer) + }; // The global work size follows CUDA's definition and is the number of // `LOCAL_WORK_SIZE` sized thread groups. let global_work_size = div_ceil(num_windows * num_groups, LOCAL_WORK_SIZE); - let kernel_name = format!("{}_multiexp", G::name()); - let kernel = program.create_kernel(&kernel_name, global_work_size, LOCAL_WORK_SIZE)?; - - kernel - .arg(&base_buffer) - .arg(&bucket_buffer) - .arg(&result_buffer) - .arg(&exp_buffer) - .arg(&(bases.len() as u32)) - .arg(&(num_groups as u32)) - .arg(&(num_windows as u32)) - .arg(&(window_size as u32)) - .run()?; + let kernel = { + let _span = debug_span!("create_kernel").entered(); + let kernel_name = format!("{}_multiexp", G::name()); + program.create_kernel(&kernel_name, global_work_size, LOCAL_WORK_SIZE)? + }; + + { + let _span = debug_span!("kernel_run").entered(); + kernel + .arg(&base_buffer) + .arg(&bucket_buffer) + .arg(&result_buffer) + .arg(&exp_buffer) + .arg(&(bases.len() as u32)) + .arg(&(num_groups as u32)) + .arg(&(num_windows as u32)) + .arg(&(window_size as u32)) + .run()?; + } - let mut results = vec![G::Curve::identity(); self.work_units]; - program.read_into_buffer(&result_buffer, &mut results)?; + let mut results = vec![::ZERO; self.work_units]; + { + let _span = debug_span!("download_results").entered(); + program.read_into_buffer(&result_buffer, &mut results)?; + } Ok(results) }); @@ -185,19 +376,23 @@ where // Using the algorithm below, we can calculate the final result by accumulating the results // of those `NUM_GROUPS` * `NUM_WINDOWS` threads. - let mut acc = G::Curve::identity(); - let mut bits = 0; - let exp_bits = exp_size::() * 8; - for i in 0..num_windows { - let w = std::cmp::min(window_size, exp_bits - bits); - for _ in 0..w { - acc = acc.double(); - } - for g in 0..num_groups { - acc.add_assign(&results[g * num_windows + i]); + // Since we use LSB-first bit extraction, window 0 contains the LSB and window (num_windows-1) + // contains the MSB. We process windows in reverse order (MSB first) using Horner's method. + let acc = { + let _span = debug_span!("cpu_accumulation").entered(); + let mut acc = ::ZERO; + for i in (0..num_windows).rev() { + // Window i covers bits [i * window_size, min((i+1) * window_size, effective_bits)) + let w = std::cmp::min(window_size, effective_bits - i * window_size); + for _ in 0..w { + acc = acc.double(); + } + for g in 0..num_groups { + acc.add_assign(&results[g * num_windows + i]); + } } - bits += w; // Process the next window - } + acc + }; Ok(acc) } @@ -219,14 +414,14 @@ where /// A struct that contains several multiexp kernels for different devices. pub struct MultiexpKernel<'a, G> where - G: PrimeCurveAffine, + G: GpuAffine, { kernels: Vec>, } impl<'a, G> MultiexpKernel<'a, G> where - G: PrimeCurveAffine + GpuName, + G: GpuAffine, { /// Create new kernels, one for each given device. pub fn create(programs: Vec, devices: &[&Device]) -> EcResult { @@ -250,6 +445,7 @@ where devices: &[&Device], maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>, ) -> EcResult { + let _span = debug_span!("multiexp_kernel_create").entered(); let kernels: Vec<_> = programs .into_iter() .zip(devices.iter()) @@ -288,9 +484,9 @@ where pub fn parallel_multiexp<'s>( &'s mut self, scope: &Scope<'s>, - bases: &'s [G], - exps: &'s [::Repr], - results: &'s mut [G::Curve], + bases: &'s [G::GpuRepr], + exps: &'s [::BigInt], + results: &'s mut [G::Group], error: Arc>>, ) { let num_devices = self.kernels.len(); @@ -308,8 +504,11 @@ where .zip(results.iter_mut()) { let error = error.clone(); + // Capture current span to propagate to worker thread + scope.execute(move || { - let mut acc = G::Curve::identity(); + let _span = debug_span!("gpu_device_multiexp", n = exps.len()).entered(); + let mut acc = ::ZERO; for (bases, exps) in bases.chunks(kern.n).zip(exps.chunks(kern.n)) { if error.read().unwrap().is_err() { break; @@ -335,10 +534,11 @@ where pub fn multiexp( &mut self, pool: &Worker, - bases_arc: Arc>, - exps: Arc::Repr>>, + bases_arc: Arc>, + exps: Arc::BigInt>>, skip: usize, - ) -> EcResult { + ) -> EcResult { + let _span = debug_span!("multiexp", n = exps.len()).entered(); // Bases are skipped by `self.1` elements, when converted from (Arc>, usize) to Source // https://github.com/zkcrypto/bellman/blob/10c5010fd9c2ca69442dc9775ea271e286e776d8/src/multiexp.rs#L38 let bases = &bases_arc[skip..(skip + exps.len())]; @@ -348,7 +548,7 @@ where let error = Arc::new(RwLock::new(Ok(()))); pool.scoped(|s| { - results = vec![G::Curve::identity(); self.kernels.len()]; + results = vec![::ZERO; self.kernels.len()]; self.parallel_multiexp(s, bases, exps, &mut results, error.clone()); }); @@ -357,7 +557,7 @@ where .into_inner() .unwrap()?; - let mut acc = G::Curve::identity(); + let mut acc = ::ZERO; for r in results { acc.add_assign(&r); } diff --git a/ec-gpu-gen/src/multiexp_cpu.rs b/ec-gpu-gen/src/multiexp_cpu.rs deleted file mode 100644 index 5703af4..0000000 --- a/ec-gpu-gen/src/multiexp_cpu.rs +++ /dev/null @@ -1,571 +0,0 @@ -#![allow(missing_docs)] -use std::convert::TryInto; -use std::io; -use std::iter; -use std::ops::AddAssign; -use std::sync::Arc; - -use bitvec::prelude::{BitVec, Lsb0}; -use ff::{Field, PrimeField}; -use group::{prime::PrimeCurveAffine, Group}; -use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; - -use crate::error::EcError; -use crate::threadpool::{Waiter, Worker}; - -/// An object that builds a source of bases. -pub trait SourceBuilder: Send + Sync + 'static + Clone { - type Source: Source; - - #[allow(clippy::wrong_self_convention)] - fn new(self) -> Self::Source; - fn get(self) -> (Arc>, usize); -} - -/// A source of bases, like an iterator. -pub trait Source { - /// Parses the element from the source. Fails if the point is at infinity. - fn add_assign_mixed(&mut self, to: &mut ::Curve) -> Result<(), EcError>; - - /// Skips `amt` elements from the source, avoiding deserialization. - fn skip(&mut self, amt: usize) -> Result<(), EcError>; -} - -impl SourceBuilder for (Arc>, usize) { - type Source = (Arc>, usize); - - fn new(self) -> (Arc>, usize) { - (self.0.clone(), self.1) - } - - fn get(self) -> (Arc>, usize) { - (self.0.clone(), self.1) - } -} - -impl Source for (Arc>, usize) { - fn add_assign_mixed(&mut self, to: &mut ::Curve) -> Result<(), EcError> { - if self.0.len() <= self.1 { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "Expected more bases from source.", - ) - .into()); - } - - if self.0[self.1].is_identity().into() { - return Err(EcError::Simple( - "Encountered an identity element in the CRS.", - )); - } - - to.add_assign(&self.0[self.1]); - - self.1 += 1; - - Ok(()) - } - - fn skip(&mut self, amt: usize) -> Result<(), EcError> { - if self.0.len() <= self.1 { - return Err(io::Error::new( - io::ErrorKind::UnexpectedEof, - "Expected more bases from source.", - ) - .into()); - } - - self.1 += amt; - - Ok(()) - } -} - -pub trait QueryDensity: Sized { - /// Returns whether the base exists. - type Iter: Iterator; - - fn iter(self) -> Self::Iter; - fn get_query_size(self) -> Option; - fn generate_exps(self, exponents: Arc>) -> Arc>; -} - -#[derive(Clone)] -pub struct FullDensity; - -impl AsRef for FullDensity { - fn as_ref(&self) -> &FullDensity { - self - } -} - -impl QueryDensity for &FullDensity { - type Iter = iter::Repeat; - - fn iter(self) -> Self::Iter { - iter::repeat(true) - } - - fn get_query_size(self) -> Option { - None - } - - fn generate_exps(self, exponents: Arc>) -> Arc> { - exponents - } -} - -#[derive(Clone, PartialEq, Eq, Debug, Default)] -pub struct DensityTracker { - pub bv: BitVec, - pub total_density: usize, -} - -impl<'a> QueryDensity for &'a DensityTracker { - type Iter = bitvec::slice::BitValIter<'a, usize, Lsb0>; - - fn iter(self) -> Self::Iter { - self.bv.iter().by_vals() - } - - fn get_query_size(self) -> Option { - Some(self.bv.len()) - } - - fn generate_exps(self, exponents: Arc>) -> Arc> { - let exps: Vec<_> = exponents - .iter() - .zip(self.bv.iter()) - .filter_map(|(&e, d)| if *d { Some(e) } else { None }) - .collect(); - - Arc::new(exps) - } -} - -impl DensityTracker { - pub fn new() -> DensityTracker { - DensityTracker { - bv: BitVec::new(), - total_density: 0, - } - } - - pub fn add_element(&mut self) { - self.bv.push(false); - } - - pub fn inc(&mut self, idx: usize) { - if !self.bv.get(idx).unwrap() { - self.bv.set(idx, true); - self.total_density += 1; - } - } - - pub fn get_total_density(&self) -> usize { - self.total_density - } - - /// Extend by concatenating `other`. If `is_input_density` is true, then we are tracking an input density, - /// and other may contain a redundant input for the `One` element. Coalesce those as needed and track the result. - pub fn extend(&mut self, other: &Self, is_input_density: bool) { - if other.bv.is_empty() { - // Nothing to do if other is empty. - return; - } - - if self.bv.is_empty() { - // If self is empty, assume other's density. - self.total_density = other.total_density; - self.bv.resize(other.bv.len(), false); - self.bv.copy_from_bitslice(&*other.bv); - return; - } - - if is_input_density { - // Input densities need special handling to coalesce their first inputs. - - if other.bv[0] { - // If other's first bit is set, - if self.bv[0] { - // And own first bit is set, then decrement total density so the final sum doesn't overcount. - self.total_density -= 1; - } else { - // Otherwise, set own first bit. - self.bv.set(0, true); - } - } - // Now discard other's first bit, having accounted for it above, and extend self by remaining bits. - self.bv.extend(other.bv.iter().skip(1)); - } else { - // Not an input density, just extend straightforwardly. - self.bv.extend(other.bv.iter()); - } - - // Since any needed adjustments to total densities have been made, just sum the totals and keep the sum. - self.total_density += other.total_density; - } -} - -// Right shift the repr of a field element by `n` bits. -fn shr(le_bytes: &mut [u8], mut n: u32) { - if n >= 8 * le_bytes.len() as u32 { - le_bytes.iter_mut().for_each(|byte| *byte = 0); - return; - } - - // Shift each full byte towards the least significant end. - while n >= 8 { - let mut replacement = 0; - for byte in le_bytes.iter_mut().rev() { - std::mem::swap(&mut replacement, byte); - } - n -= 8; - } - - // Starting at the most significant byte, shift the byte's `n` least significant bits into the - // `n` most significant bits of the next byte. - if n > 0 { - let mut shift_in = 0; - for byte in le_bytes.iter_mut().rev() { - // Copy the byte's `n` least significant bits. - let shift_out = *byte << (8 - n); - // Shift the byte by `n` bits; zeroing its `n` most significant bits. - *byte >>= n; - // Replace the `n` most significant bits with the bits shifted out of the previous byte. - *byte |= shift_in; - shift_in = shift_out; - } - } -} - -fn multiexp_inner( - bases: S, - density_map: D, - exponents: Arc::Repr>>, - c: u32, -) -> Result<::Curve, EcError> -where - for<'a> &'a Q: QueryDensity, - D: Send + Sync + 'static + Clone + AsRef, - G: PrimeCurveAffine, - S: SourceBuilder, -{ - // Perform this region of the multiexp - let this = move |bases: S, - density_map: D, - exponents: Arc::Repr>>, - skip: u32| - -> Result<_, EcError> { - // Accumulate the result - let mut acc = G::Curve::identity(); - - // Build a source for the bases - let mut bases = bases.new(); - - // Create space for the buckets - let mut buckets = vec![::Curve::identity(); (1 << c) - 1]; - - let zero = G::Scalar::ZERO.to_repr(); - let one = G::Scalar::ONE.to_repr(); - - // only the first round uses this - let handle_trivial = skip == 0; - - // Sort the bases into buckets - for (&exp, density) in exponents.iter().zip(density_map.as_ref().iter()) { - if density { - if exp.as_ref() == zero.as_ref() { - bases.skip(1)?; - } else if exp.as_ref() == one.as_ref() { - if handle_trivial { - bases.add_assign_mixed(&mut acc)?; - } else { - bases.skip(1)?; - } - } else { - let mut exp = exp; - shr(exp.as_mut(), skip); - let exp = u64::from_le_bytes(exp.as_ref()[..8].try_into().unwrap()) % (1 << c); - - if exp != 0 { - bases.add_assign_mixed(&mut buckets[(exp - 1) as usize])?; - } else { - bases.skip(1)?; - } - } - } - } - - // Summation by parts - // e.g. 3a + 2b + 1c = a + - // (a) + b + - // ((a) + b) + c - let mut running_sum = G::Curve::identity(); - for exp in buckets.into_iter().rev() { - running_sum.add_assign(&exp); - acc.add_assign(&running_sum); - } - - Ok(acc) - }; - - let parts = (0..::NUM_BITS) - .into_par_iter() - .step_by(c as usize) - .map(|skip| this(bases.clone(), density_map.clone(), exponents.clone(), skip)) - .collect::>>(); - - parts.into_iter().rev().try_fold( - ::Curve::identity(), - |mut acc, part| { - for _ in 0..c { - acc = acc.double(); - } - - acc.add_assign(&part?); - Ok(acc) - }, - ) -} - -/// Perform multi-exponentiation. The caller is responsible for ensuring the -/// query size is the same as the number of exponents. -pub fn multiexp_cpu<'b, Q, D, G, S>( - pool: &Worker, - bases: S, - density_map: D, - exponents: Arc::Repr>>, -) -> Waiter::Curve, EcError>> -where - for<'a> &'a Q: QueryDensity, - D: Send + Sync + 'static + Clone + AsRef, - G: PrimeCurveAffine, - S: SourceBuilder, -{ - let c = if exponents.len() < 32 { - 3u32 - } else { - (f64::from(exponents.len() as u32)).ln().ceil() as u32 - }; - - if let Some(query_size) = density_map.as_ref().get_query_size() { - // If the density map has a known query size, it should not be - // inconsistent with the number of exponents. - assert!(query_size == exponents.len()); - } - - pool.compute(move || multiexp_inner(bases, density_map, exponents, c)) -} - -#[cfg(test)] -mod tests { - use super::*; - - use blstrs::Bls12; - use group::Curve; - use pairing::Engine; - use rand::Rng; - use rand_core::SeedableRng; - use rand_xorshift::XorShiftRng; - - #[test] - fn test_with_bls12() { - fn naive_multiexp( - bases: Arc>, - exponents: &[G::Scalar], - ) -> G::Curve { - assert_eq!(bases.len(), exponents.len()); - - let mut acc = G::Curve::identity(); - - for (base, exp) in bases.iter().zip(exponents.iter()) { - acc.add_assign(&base.mul(*exp)); - } - - acc - } - - const SAMPLES: usize = 1 << 14; - - let rng = &mut rand::thread_rng(); - let v: Vec<::Fr> = (0..SAMPLES) - .map(|_| ::Fr::random(&mut *rng)) - .collect(); - let g = Arc::new( - (0..SAMPLES) - .map(|_| ::G1::random(&mut *rng).to_affine()) - .collect::>(), - ); - - let now = std::time::Instant::now(); - let naive = naive_multiexp(g.clone(), &v); - println!("Naive: {}", now.elapsed().as_millis()); - - let now = std::time::Instant::now(); - let pool = Worker::new(); - - let v = Arc::new(v.into_iter().map(|fr| fr.to_repr()).collect()); - let fast = multiexp_cpu(&pool, (g, 0), FullDensity, v).wait().unwrap(); - - println!("Fast: {}", now.elapsed().as_millis()); - - assert_eq!(naive, fast); - } - - #[test] - fn test_extend_density_regular() { - let mut rng = XorShiftRng::from_seed([ - 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, - 0xbc, 0xe5, - ]); - - for k in &[2, 4, 8] { - for j in &[10, 20, 50] { - let count: usize = k * j; - - let mut tracker_full = DensityTracker::new(); - let mut partial_trackers: Vec = Vec::with_capacity(count / k); - for i in 0..count { - if i % k == 0 { - partial_trackers.push(DensityTracker::new()); - } - - let index: usize = i / k; - if rng.gen() { - tracker_full.add_element(); - partial_trackers[index].add_element(); - } - - if !partial_trackers[index].bv.is_empty() { - let idx = rng.gen_range(0..partial_trackers[index].bv.len()); - let offset: usize = partial_trackers - .iter() - .take(index) - .map(|t| t.bv.len()) - .sum(); - tracker_full.inc(offset + idx); - partial_trackers[index].inc(idx); - } - } - - let mut tracker_combined = DensityTracker::new(); - for tracker in partial_trackers.into_iter() { - tracker_combined.extend(&tracker, false); - } - assert_eq!(tracker_combined, tracker_full); - } - } - } - - #[test] - fn test_extend_density_input() { - let mut rng = XorShiftRng::from_seed([ - 0x59, 0x62, 0xbe, 0x5d, 0x76, 0x3d, 0x31, 0x8d, 0x17, 0xdb, 0x37, 0x32, 0x54, 0x06, - 0xbc, 0xe5, - ]); - let trials = 10; - let max_bits = 10; - let max_density = max_bits; - - // Create an empty DensityTracker. - let empty = DensityTracker::new; - - // Create a random DensityTracker with first bit unset. - let unset = |rng: &mut XorShiftRng| { - let mut dt = DensityTracker::new(); - dt.add_element(); - let n = rng.gen_range(1..max_bits); - let target_density = rng.gen_range(0..max_density); - for _ in 1..n { - dt.add_element(); - } - - for _ in 0..target_density { - if n > 1 { - let to_inc = rng.gen_range(1..n); - dt.inc(to_inc); - } - } - assert!(!dt.bv[0]); - assert_eq!(n, dt.bv.len()); - dbg!(&target_density, &dt.total_density); - - dt - }; - - // Create a random DensityTracker with first bit set. - let set = |rng: &mut XorShiftRng| { - let mut dt = unset(rng); - dt.inc(0); - dt - }; - - for _ in 0..trials { - { - // Both empty. - let (mut e1, e2) = (empty(), empty()); - e1.extend(&e2, true); - assert_eq!(empty(), e1); - } - { - // First empty, second unset. - let (mut e1, u1) = (empty(), unset(&mut rng)); - e1.extend(&u1.clone(), true); - assert_eq!(u1, e1); - } - { - // First empty, second set. - let (mut e1, s1) = (empty(), set(&mut rng)); - e1.extend(&s1.clone(), true); - assert_eq!(s1, e1); - } - { - // First set, second empty. - let (mut s1, e1) = (set(&mut rng), empty()); - let s2 = s1.clone(); - s1.extend(&e1, true); - assert_eq!(s1, s2); - } - { - // First unset, second empty. - let (mut u1, e1) = (unset(&mut rng), empty()); - let u2 = u1.clone(); - u1.extend(&e1, true); - assert_eq!(u1, u2); - } - { - // First unset, second unset. - let (mut u1, u2) = (unset(&mut rng), unset(&mut rng)); - let expected_total = u1.total_density + u2.total_density; - u1.extend(&u2, true); - assert_eq!(expected_total, u1.total_density); - assert!(!u1.bv[0]); - } - { - // First unset, second set. - let (mut u1, s1) = (unset(&mut rng), set(&mut rng)); - let expected_total = u1.total_density + s1.total_density; - u1.extend(&s1, true); - assert_eq!(expected_total, u1.total_density); - assert!(u1.bv[0]); - } - { - // First set, second unset. - let (mut s1, u1) = (set(&mut rng), unset(&mut rng)); - let expected_total = s1.total_density + u1.total_density; - s1.extend(&u1, true); - assert_eq!(expected_total, s1.total_density); - assert!(s1.bv[0]); - } - { - // First set, second set. - let (mut s1, s2) = (set(&mut rng), set(&mut rng)); - let expected_total = s1.total_density + s2.total_density - 1; - s1.extend(&s2, true); - assert_eq!(expected_total, s1.total_density); - assert!(s1.bv[0]); - } - } - } -} diff --git a/ec-gpu-gen/src/source.rs b/ec-gpu-gen/src/source.rs index 49fda87..fde587f 100644 --- a/ec-gpu-gen/src/source.rs +++ b/ec-gpu-gen/src/source.rs @@ -9,7 +9,6 @@ use std::path::PathBuf; use std::{env, fs}; use ec_gpu::{GpuField, GpuName}; -use group::prime::PrimeCurveAffine; static COMMON_SRC: &str = include_str!("cl/common.cl"); static FIELD_SRC: &str = include_str!("cl/field.cl"); @@ -194,15 +193,12 @@ impl NameAndSource for Multiexp /// /// # Example /// -/// ``` -/// use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar}; +/// ```ignore +/// use ark_bn254::Fr; /// use ec_gpu_gen::SourceBuilder; /// -/// # #[cfg(any(feature = "cuda", feature = "opencl"))] /// let source = SourceBuilder::new() -/// .add_fft::() -/// .add_multiexp::() -/// .add_multiexp::() +/// .add_fft::() /// .build_32_bit_limbs(); ///``` // In the `HashSet`s the concrete types cannot be used, as each item of the set should be able to @@ -264,18 +260,18 @@ impl SourceBuilder { config } - /// Add an Multiexp kernel function to the configuration. + /// Add a Multiexp kernel function to the configuration. /// - /// The field must be given explicitly as currently it cannot derived from the curve point - /// directly. - pub fn add_multiexp(self) -> Self + /// The field and scalar types must be given explicitly as they currently cannot be derived + /// from the curve point directly. + pub fn add_multiexp(self) -> Self where - C: PrimeCurveAffine + GpuName, - C::Scalar: GpuField, + C: GpuName + 'static, + S: GpuField + 'static, F: GpuField + 'static, { - let mut config = self.add_field::().add_field::(); - let multiexp = Multiexp::::new(); + let mut config = self.add_field::().add_field::(); + let multiexp = Multiexp::::new(); config.multiexps.insert(Box::new(multiexp)); config } @@ -711,31 +707,58 @@ mod tests { use rust_gpu_tools::opencl; use rust_gpu_tools::{program_closures, Device, GPUError, Program}; - use blstrs::Scalar; - use ff::{Field as _, PrimeField}; + use ark_ff::AdditiveGroup; + use ark_std::Zero; + use lazy_static::lazy_static; use rand::{thread_rng, Rng}; static TEST_SRC: &str = include_str!("./cl/test.cl"); + // #[derive(PartialEq, Debug, Clone, Copy)] + // #[repr(transparent)] + // pub struct GpuScalar(pub Scalar); + // impl Default for GpuScalar { + // fn default() -> Self { + // Self(Scalar::ZERO) + // } + // } + + // #[cfg(feature = "cuda")] + // impl cuda::KernelArgument for GpuScalar { + // fn as_c_void(&self) -> *mut std::ffi::c_void { + // &self.0 as *const _ as _ + // } + // } + + // #[cfg(feature = "opencl")] + // impl opencl::KernelArgument for GpuScalar { + // fn push(&self, kernel: &mut opencl::Kernel) { + // unsafe { kernel.builder.set_arg(&self.0) }; + // } + // } + + #[cfg(feature = "arkworks")] #[derive(PartialEq, Debug, Clone, Copy)] #[repr(transparent)] - pub struct GpuScalar(pub Scalar); - impl Default for GpuScalar { + pub struct GpuScalarBN254(pub ark_bn254::Fr); + + #[cfg(feature = "arkworks")] + impl Default for GpuScalarBN254 { fn default() -> Self { - Self(Scalar::ZERO) + Self(ark_bn254::Fr::zero()) } } - #[cfg(feature = "cuda")] - impl cuda::KernelArgument for GpuScalar { + #[cfg(all(feature = "arkworks", feature = "cuda"))] + impl cuda::KernelArgument for GpuScalarBN254 { fn as_c_void(&self) -> *mut std::ffi::c_void { &self.0 as *const _ as _ } } - #[cfg(feature = "opencl")] - impl opencl::KernelArgument for GpuScalar { + #[cfg(all(feature = "arkworks", feature = "opencl"))] + impl opencl::KernelArgument for GpuScalarBN254 { fn push(&self, kernel: &mut opencl::Kernel) { unsafe { kernel.builder.set_arg(&self.0) }; } @@ -750,19 +773,60 @@ mod tests { } } - fn test_source() -> SourceBuilder { - let test_source = String::from(TEST_SRC).replace("FIELD", &Scalar::name()); + // Temporarily commented out to avoid version conflicts + // fn test_source() -> SourceBuilder { + // let test_source = String::from(TEST_SRC).replace("FIELD", &Scalar::name()); + // SourceBuilder::new() + // .add_field::() + // .append_source(test_source) + // } + + #[cfg(feature = "arkworks")] + fn test_source_ark_bn254() -> SourceBuilder { + let test_source = String::from(TEST_SRC).replace("FIELD", &ark_bn254::Fr::name()); SourceBuilder::new() - .add_field::() + .add_field::() .append_source(test_source) } - - #[cfg(feature = "cuda")] + // + // #[cfg(feature = "cuda")] + // lazy_static! { + // static ref CUDA_PROGRAM: Mutex = { + // use std::ffi::CString; + // + // let source = test_source(); + // let fatbin_path = generate_cuda(&source); + // + // let device = *Device::all().first().expect("Cannot get a default device."); + // let cuda_device = device.cuda_device().unwrap(); + // let fatbin_path_cstring = + // CString::new(fatbin_path.to_str().expect("path is not valid UTF-8.")) + // .expect("path contains NULL byte."); + // let program = + // cuda::Program::from_binary(cuda_device, fatbin_path_cstring.as_c_str()).unwrap(); + // Mutex::new(Program::Cuda(program)) + // }; + // } + // + // #[cfg(feature = "opencl")] + // lazy_static! { + // static ref OPENCL_PROGRAM: Mutex<(Program, Program)> = { + // let device = *Device::all().first().expect("Cannot get a default device"); + // let opencl_device = device.opencl_device().unwrap(); + // let source_32 = test_source().build_32_bit_limbs(); + // let program_32 = opencl::Program::from_opencl(opencl_device, &source_32).unwrap(); + // let source_64 = test_source().build_64_bit_limbs(); + // let program_64 = opencl::Program::from_opencl(opencl_device, &source_64).unwrap(); + // Mutex::new((Program::Opencl(program_32), Program::Opencl(program_64))) + // }; + // } + + #[cfg(all(feature = "arkworks", feature = "cuda"))] lazy_static! { - static ref CUDA_PROGRAM: Mutex = { + static ref CUDA_PROGRAM_ARK_BN254: Mutex = { use std::ffi::CString; - let source = test_source(); + let source = test_source_ark_bn254(); let fatbin_path = generate_cuda(&source); let device = *Device::all().first().expect("Cannot get a default device."); @@ -776,22 +840,80 @@ mod tests { }; } - #[cfg(feature = "opencl")] + #[cfg(all(feature = "arkworks", feature = "opencl"))] lazy_static! { - static ref OPENCL_PROGRAM: Mutex<(Program, Program)> = { + static ref OPENCL_PROGRAM_ARK_BN254: Mutex<(Program, Program)> = { let device = *Device::all().first().expect("Cannot get a default device"); let opencl_device = device.opencl_device().unwrap(); - let source_32 = test_source().build_32_bit_limbs(); + let source_32 = test_source_ark_bn254().build_32_bit_limbs(); let program_32 = opencl::Program::from_opencl(opencl_device, &source_32).unwrap(); - let source_64 = test_source().build_64_bit_limbs(); + let source_64 = test_source_ark_bn254().build_64_bit_limbs(); let program_64 = opencl::Program::from_opencl(opencl_device, &source_64).unwrap(); Mutex::new((Program::Opencl(program_32), Program::Opencl(program_64))) }; } - fn call_kernel(name: &str, scalars: &[GpuScalar], uints: &[u32]) -> Scalar { - let closures = program_closures!(|program, _args| -> Result { - let mut cpu_buffer = vec![GpuScalar::default()]; + // fn call_kernel(name: &str, scalars: &[GpuScalar], uints: &[u32]) -> Scalar { + // let closures = program_closures!(|program, _args| -> Result { + // let mut cpu_buffer = vec![GpuScalar::default()]; + // let buffer = program.create_buffer_from_slice(&cpu_buffer).unwrap(); + // + // let mut kernel = program.create_kernel(name, 1, 64).unwrap(); + // for scalar in scalars { + // kernel = kernel.arg(scalar); + // } + // for uint in uints { + // kernel = kernel.arg(uint); + // } + // kernel.arg(&buffer).run().unwrap(); + // + // program.read_into_buffer(&buffer, &mut cpu_buffer).unwrap(); + // Ok(cpu_buffer[0].0) + // }); + // + // // For CUDA we only test 32-bit limbs. + // #[cfg(all(feature = "cuda", not(feature = "opencl")))] + // return CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap(); + // + // // For OpenCL we test for 32 and 64-bi limbs. + // #[cfg(all(feature = "opencl", not(feature = "cuda")))] + // { + // let result_32 = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap(); + // let result_64 = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap(); + // assert_eq!( + // result_32, result_64, + // "Results for 32-bit and 64-bit limbs must be the same." + // ); + // result_32 + // } + // + // // When both features are enabled, check if the results are the same + // #[cfg(all(feature = "cuda", feature = "opencl"))] + // { + // let cuda_result = CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap(); + // let opencl_32_result = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap(); + // let opencl_64_result = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap(); + // assert_eq!( + // opencl_32_result, opencl_64_result, + // "Results for 32-bit and 64-bit limbs on OpenCL must be the same." + // ); + // assert_eq!( + // cuda_result, opencl_32_result, + // "Results for CUDA and OpenCL must be the same." + // ); + // cuda_result + // } + // } + // + + #[cfg(feature = "arkworks")] + fn call_kernel_ark_bn254( + name: &str, + scalars: &[GpuScalarBN254], + uints: &[u32], + ) -> ark_bn254::Fr { + let closures = program_closures!(|program, _args| -> Result { + let mut cpu_buffer = vec![GpuScalarBN254::default()]; let buffer = program.create_buffer_from_slice(&cpu_buffer).unwrap(); let mut kernel = program.create_kernel(name, 1, 64).unwrap(); @@ -809,13 +931,27 @@ mod tests { // For CUDA we only test 32-bit limbs. #[cfg(all(feature = "cuda", not(feature = "opencl")))] - return CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap(); + return CUDA_PROGRAM_ARK_BN254 + .lock() + .unwrap() + .run(closures, ()) + .unwrap(); - // For OpenCL we test for 32 and 64-bi limbs. + // For OpenCL we test for 32 and 64-bit limbs. #[cfg(all(feature = "opencl", not(feature = "cuda")))] { - let result_32 = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap(); - let result_64 = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap(); + let result_32 = OPENCL_PROGRAM_ARK_BN254 + .lock() + .unwrap() + .0 + .run(closures, ()) + .unwrap(); + let result_64 = OPENCL_PROGRAM_ARK_BN254 + .lock() + .unwrap() + .1 + .run(closures, ()) + .unwrap(); assert_eq!( result_32, result_64, "Results for 32-bit and 64-bit limbs must be the same." @@ -826,9 +962,23 @@ mod tests { // When both features are enabled, check if the results are the same #[cfg(all(feature = "cuda", feature = "opencl"))] { - let cuda_result = CUDA_PROGRAM.lock().unwrap().run(closures, ()).unwrap(); - let opencl_32_result = OPENCL_PROGRAM.lock().unwrap().0.run(closures, ()).unwrap(); - let opencl_64_result = OPENCL_PROGRAM.lock().unwrap().1.run(closures, ()).unwrap(); + let cuda_result = CUDA_PROGRAM_ARK_BN254 + .lock() + .unwrap() + .run(closures, ()) + .unwrap(); + let opencl_32_result = OPENCL_PROGRAM_ARK_BN254 + .lock() + .unwrap() + .0 + .run(closures, ()) + .unwrap(); + let opencl_64_result = OPENCL_PROGRAM_ARK_BN254 + .lock() + .unwrap() + .1 + .run(closures, ()) + .unwrap(); assert_eq!( opencl_32_result, opencl_64_result, "Results for 32-bit and 64-bit limbs on OpenCL must be the same." @@ -841,101 +991,145 @@ mod tests { } } + #[cfg(feature = "arkworks")] #[test] - fn test_add() { + fn test_ark_bn254_add() { + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a = Scalar::random(&mut rng); - let b = Scalar::random(&mut rng); + let a = ark_bn254::Fr::rand(&mut rng); + let b = ark_bn254::Fr::rand(&mut rng); let c = a + b; assert_eq!( - call_kernel("test_add", &[GpuScalar(a), GpuScalar(b)], &[]), + call_kernel_ark_bn254("test_add", &[GpuScalarBN254(a), GpuScalarBN254(b)], &[]), c ); } } + #[cfg(feature = "arkworks")] #[test] - fn test_sub() { + fn test_ark_bn254_sub() { + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a = Scalar::random(&mut rng); - let b = Scalar::random(&mut rng); + let a = ark_bn254::Fr::rand(&mut rng); + let b = ark_bn254::Fr::rand(&mut rng); let c = a - b; assert_eq!( - call_kernel("test_sub", &[GpuScalar(a), GpuScalar(b)], &[]), + call_kernel_ark_bn254("test_sub", &[GpuScalarBN254(a), GpuScalarBN254(b)], &[]), c ); } } + #[cfg(feature = "arkworks")] #[test] - fn test_mul() { + fn test_ark_bn254_mul() { + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a = Scalar::random(&mut rng); - let b = Scalar::random(&mut rng); + let a = ark_bn254::Fr::rand(&mut rng); + let b = ark_bn254::Fr::rand(&mut rng); let c = a * b; assert_eq!( - call_kernel("test_mul", &[GpuScalar(a), GpuScalar(b)], &[]), + call_kernel_ark_bn254("test_mul", &[GpuScalarBN254(a), GpuScalarBN254(b)], &[]), c ); } } + #[cfg(feature = "arkworks")] #[test] - fn test_pow() { + fn test_ark_bn254_pow() { + use ark_ff::Field; + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a = Scalar::random(&mut rng); + let a = ark_bn254::Fr::rand(&mut rng); let b = rng.gen::(); - let c = a.pow_vartime([b as u64]); - assert_eq!(call_kernel("test_pow", &[GpuScalar(a)], &[b]), c); + let c = a.pow([(b as u64)]); + assert_eq!( + call_kernel_ark_bn254("test_pow", &[GpuScalarBN254(a)], &[b]), + c + ); } } + #[cfg(feature = "arkworks")] #[test] - fn test_sqr() { + fn test_ark_bn254_sqr() { + use ark_ff::Field; + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a = Scalar::random(&mut rng); + let a = ark_bn254::Fr::rand(&mut rng); let b = a.square(); - assert_eq!(call_kernel("test_sqr", &[GpuScalar(a)], &[]), b); + assert_eq!( + call_kernel_ark_bn254("test_sqr", &[GpuScalarBN254(a)], &[]), + b + ); } } + #[cfg(feature = "arkworks")] #[test] - fn test_double() { + fn test_ark_bn254_double() { + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a = Scalar::random(&mut rng); + let a = ark_bn254::Fr::rand(&mut rng); let b = a.double(); - assert_eq!(call_kernel("test_double", &[GpuScalar(a)], &[]), b); + assert_eq!( + call_kernel_ark_bn254("test_double", &[GpuScalarBN254(a)], &[]), + b + ); } } + #[cfg(feature = "arkworks")] #[test] - fn test_unmont() { + fn test_ark_bn254_unmont() { + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a = Scalar::random(&mut rng); - let b: Scalar = unsafe { std::mem::transmute(a.to_repr()) }; - assert_eq!(call_kernel("test_unmont", &[GpuScalar(a)], &[]), b); + let a = ark_bn254::Fr::rand(&mut rng); + let b: ark_bn254::Fr = unsafe { + use ark_ff::{BigInteger, PrimeField}; + let b: [u8; 32] = a.into_bigint().to_bytes_le().try_into().unwrap(); + std::mem::transmute(b) + }; + assert_eq!( + call_kernel_ark_bn254("test_unmont", &[GpuScalarBN254(a)], &[]), + b + ); } } + #[cfg(feature = "arkworks")] #[test] - fn test_mont() { + fn test_ark_bn254_mont() { + use ark_std::UniformRand; let mut rng = thread_rng(); for _ in 0..10 { - let a_repr = Scalar::random(&mut rng).to_repr(); - let a: Scalar = unsafe { std::mem::transmute(a_repr) }; - let b = Scalar::from_repr(a_repr).unwrap(); - assert_eq!(call_kernel("test_mont", &[GpuScalar(a)], &[]), b); + use ark_ff::{BigInteger, PrimeField}; + + let a_repr: [u8; 32] = ark_bn254::Fr::rand(&mut rng) + .into_bigint() + .to_bytes_le() + .try_into() + .unwrap(); + let a: ark_bn254::Fr = unsafe { std::mem::transmute(a_repr) }; + let b = ark_bn254::Fr::from_le_bytes_mod_order(&a_repr); + + assert_eq!( + call_kernel_ark_bn254("test_mont", &[GpuScalarBN254(a)], &[]), + b + ); } } } diff --git a/ec-gpu/Cargo.toml b/ec-gpu/Cargo.toml index a4184ff..c1f61e4 100644 --- a/ec-gpu/Cargo.toml +++ b/ec-gpu/Cargo.toml @@ -9,3 +9,17 @@ repository = "https://github.com/filecoin-project/ff-cl-gen" license = "MIT/Apache-2.0" [dependencies] +ark-bn254 = { version = "0.5.0", optional = true } +ark-ec = { version = "0.5.0", optional = true } +ark-ff = { version = "0.5.0", optional = true } +ark-serialize = { version = "0.5.0", optional = true } +ark-std = { version = "0.5.0", optional = true } + +[features] +arkworks = [ + "dep:ark-bn254", + "dep:ark-ec", + "dep:ark-ff", + "dep:ark-serialize", + "dep:ark-std", +] diff --git a/ec-gpu/src/arkworks_bn254.rs b/ec-gpu/src/arkworks_bn254.rs new file mode 100644 index 0000000..66b3d79 --- /dev/null +++ b/ec-gpu/src/arkworks_bn254.rs @@ -0,0 +1,161 @@ +use std::ops::{Deref, DerefMut}; + +use ark_bn254::{Fq, Fq2, FqConfig, Fr, FrConfig}; +use ark_ec::short_weierstrass::Affine; +use ark_ff::{BigInteger, MontConfig}; + +use crate::{GpuField, GpuName}; + +fn bytes_le_to_u32_limbs(mut bytes: Vec) -> Vec { + while !bytes.len().is_multiple_of(4) { + bytes.push(0); + } + bytes + .chunks_exact(4) + .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect() +} + +fn bigint_to_u32_limbs_le(b: B) -> Vec { + bytes_le_to_u32_limbs(b.to_bytes_le()) +} + +impl GpuName for Fq { + fn name() -> String { + crate::name!() + } +} + +impl GpuField for Fq { + fn one() -> Vec { + bigint_to_u32_limbs_le(FqConfig::R) + } + + fn r2() -> Vec { + bigint_to_u32_limbs_le(FqConfig::R2) + } + + fn modulus() -> Vec { + bigint_to_u32_limbs_le(FqConfig::MODULUS) + } +} + +impl GpuName for Fq2 { + fn name() -> String { + crate::name!() + } +} + +impl GpuField for Fq2 { + fn one() -> Vec { + let n = bigint_to_u32_limbs_le(FqConfig::MODULUS).len(); + let mut out = vec![0u32; 2 * n]; + out[..n].copy_from_slice(&bigint_to_u32_limbs_le(FqConfig::R)); + out + } + + fn r2() -> Vec { + let n = bigint_to_u32_limbs_le(FqConfig::MODULUS).len(); + let mut out = vec![0u32; 2 * n]; + out[..n].copy_from_slice(&bigint_to_u32_limbs_le(FqConfig::R2)); + out + } + + fn modulus() -> Vec { + bigint_to_u32_limbs_le(FqConfig::MODULUS) + } + + fn sub_field_name() -> Option { + Some(Fq::name()) + } +} + +impl GpuName for Fr { + fn name() -> String { + crate::name!() + } +} + +impl GpuField for Fr { + fn one() -> Vec { + bigint_to_u32_limbs_le(FrConfig::R) + } + + fn r2() -> Vec { + bigint_to_u32_limbs_le(FrConfig::R2) + } + + fn modulus() -> Vec { + bigint_to_u32_limbs_le(FrConfig::MODULUS) + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[repr(transparent)] +pub struct G1Affine(pub Affine); + +impl Deref for G1Affine { + type Target = Affine; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for G1Affine { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl From> for G1Affine { + fn from(p: Affine) -> Self { + Self(p) + } +} + +impl From for Affine { + fn from(p: G1Affine) -> Self { + p.0 + } +} + +impl GpuName for G1Affine { + fn name() -> String { + crate::name!() + } +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Default)] +#[repr(transparent)] +pub struct G2Affine(pub Affine); + +impl Deref for G2Affine { + type Target = Affine; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for G2Affine { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl From> for G2Affine { + fn from(p: Affine) -> Self { + Self(p) + } +} + +impl From for Affine { + fn from(p: G2Affine) -> Self { + p.0 + } +} + +impl GpuName for G2Affine { + fn name() -> String { + crate::name!() + } +} diff --git a/ec-gpu/src/arkworks_macros.rs b/ec-gpu/src/arkworks_macros.rs new file mode 100644 index 0000000..ac27c43 --- /dev/null +++ b/ec-gpu/src/arkworks_macros.rs @@ -0,0 +1,128 @@ +/// Helper function to convert arkworks BigInteger to u32 limbs in little-endian order. +pub fn bigint_to_u32_limbs(b: B) -> Vec { + let bytes = b.to_bytes_le(); + bytes_to_u32_limbs(bytes) +} + +/// Helper function to convert bytes to u32 limbs in little-endian order. +pub fn bytes_to_u32_limbs(mut bytes: Vec) -> Vec { + // Pad to multiple of 4 bytes + while !bytes.len().is_multiple_of(4) { + bytes.push(0); + } + bytes + .chunks_exact(4) + .map(|c| u32::from_le_bytes([c[0], c[1], c[2], c[3]])) + .collect() +} + +/// Implement `GpuName` and `GpuField` for an arkworks prime field. +/// +/// # Example +/// +/// ```ignore +/// use ec_gpu::impl_gpu_field_arkworks; +/// use ark_bls12_381::{Fr, FrConfig}; +/// +/// impl_gpu_field_arkworks!(Fr, FrConfig); +/// ``` +#[macro_export] +macro_rules! impl_gpu_field_arkworks { + ($field:ty, $config:ty) => { + impl $crate::GpuName for $field { + fn name() -> String { + $crate::name!() + } + } + + impl $crate::GpuField for $field { + fn one() -> Vec { + use ark_ff::MontConfig; + $crate::arkworks_macros::bigint_to_u32_limbs(<$config>::R) + } + + fn r2() -> Vec { + use ark_ff::MontConfig; + $crate::arkworks_macros::bigint_to_u32_limbs(<$config>::R2) + } + + fn modulus() -> Vec { + use ark_ff::MontConfig; + $crate::arkworks_macros::bigint_to_u32_limbs(<$config>::MODULUS) + } + } + }; +} + +/// Implement `GpuName` for an arkworks curve affine point type. +/// +/// # Example +/// +/// ```ignore +/// use ec_gpu::impl_gpu_name_arkworks_curve; +/// use ark_bls12_381::G1Affine; +/// +/// impl_gpu_name_arkworks_curve!(G1Affine); +/// ``` +#[macro_export] +macro_rules! impl_gpu_name_arkworks_curve { + ($curve:ty) => { + impl $crate::GpuName for $curve { + fn name() -> String { + $crate::name!() + } + } + }; +} + +/// Implement `GpuName` and `GpuField` for an arkworks quadratic extension field (like Fq2). +/// +/// # Example +/// +/// ```ignore +/// use ec_gpu::impl_gpu_field_arkworks_ext2; +/// use ark_bls12_381::{Fq, Fq2, FqConfig}; +/// +/// impl_gpu_field_arkworks_ext2!(Fq2, FqConfig, Fq); +/// ``` +#[macro_export] +macro_rules! impl_gpu_field_arkworks_ext2 { + ($field2:ty, $base_config:ty, $base_field:ty) => { + impl $crate::GpuName for $field2 { + fn name() -> String { + $crate::name!() + } + } + + impl $crate::GpuField for $field2 { + fn one() -> Vec { + use ark_ff::MontConfig; + let n = $crate::arkworks_macros::bigint_to_u32_limbs(<$base_config>::MODULUS).len(); + let mut out = vec![0u32; 2 * n]; + out[..n].copy_from_slice(&$crate::arkworks_macros::bigint_to_u32_limbs( + <$base_config>::R, + )); + out + } + + fn r2() -> Vec { + use ark_ff::MontConfig; + let n = $crate::arkworks_macros::bigint_to_u32_limbs(<$base_config>::MODULUS).len(); + let mut out = vec![0u32; 2 * n]; + out[..n].copy_from_slice(&$crate::arkworks_macros::bigint_to_u32_limbs( + <$base_config>::R2, + )); + out + } + + fn modulus() -> Vec { + use ark_ff::MontConfig; + $crate::arkworks_macros::bigint_to_u32_limbs(<$base_config>::MODULUS) + } + + fn sub_field_name() -> Option { + Some(<$base_field as $crate::GpuName>::name()) + } + } + }; +} diff --git a/ec-gpu/src/lib.rs b/ec-gpu/src/lib.rs index b7c7b69..1cac7b2 100644 --- a/ec-gpu/src/lib.rs +++ b/ec-gpu/src/lib.rs @@ -41,6 +41,12 @@ pub trait GpuField: GpuName { } } +#[cfg(feature = "arkworks")] +pub mod arkworks_bn254; + +#[cfg(feature = "arkworks")] +pub mod arkworks_macros; + /// Macro to get a unique name of an item. /// /// The name is a string that consists of the module path and the type name. All non-alphanumeric diff --git a/gpu-tests/Cargo.toml b/gpu-tests/Cargo.toml index bf3e92c..0df3418 100644 --- a/gpu-tests/Cargo.toml +++ b/gpu-tests/Cargo.toml @@ -1,6 +1,3 @@ -# NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just temporarily until -# https://github.com/zkcrypto/group/pull/29 is fixed. Then we won't need the exports of `Fp` and -# `Fp2` any more. [package] name = "gpu-tests" version = "0.1.0" @@ -12,26 +9,35 @@ license = "MIT/Apache-2.0" publish = false [dev-dependencies] -blstrs = { version = "0.7.0", features = ["__private_bench"] } criterion = "0.4" -ec-gpu = "0.2" -ec-gpu-gen = { path = "../ec-gpu-gen", default-features = false } -ff = { version = "0.13.0", default-features = false } +divan = "0.1" +ec-gpu = { workspace = true, features = ["arkworks"] } +ec-gpu-gen = { path = "../ec-gpu-gen", default-features = false, features = ["arkworks"] } fil_logger = "0.1.6" -group = "0.13.0" -pairing = "0.23.0" rand = "0.8" rayon = "1.5.3" +ark-ff = "0.5.0" +ark-ec = "0.5.0" +ark-bn254 = "0.5.0" +ark-std = "0.5.0" +tracing = "0.1" +tracing-subscriber = "0.3" +tracing-profile.workspace = true [build-dependencies] -blstrs = { version = "0.7.0", features = ["__private_bench"] } -ec-gpu-gen = { path = "../ec-gpu-gen" } +ark-bn254 = { version = "0.5.0" } +ec-gpu = { path = "../ec-gpu", features = ["arkworks"] } +ec-gpu-gen = { path = "../ec-gpu-gen", features = ["arkworks"] } [features] default = ["cuda", "opencl"] -cuda = ["blstrs/gpu", "ec-gpu-gen/cuda"] -opencl = ["blstrs/gpu", "ec-gpu-gen/opencl"] +cuda = ["ec-gpu-gen/cuda"] +opencl = ["ec-gpu-gen/opencl"] [[bench]] name = "multiexp" harness = false + +[[bench]] +name = "small_scalars" +harness = false diff --git a/gpu-tests/benches/multiexp.rs b/gpu-tests/benches/multiexp.rs index 011c367..797a3de 100644 --- a/gpu-tests/benches/multiexp.rs +++ b/gpu-tests/benches/multiexp.rs @@ -1,54 +1,95 @@ use std::sync::Arc; -use blstrs::Bls12; +use ark_bn254::{Fr, G1Projective}; +use ark_ec::CurveGroup; +use ark_ff::{PrimeField, UniformRand}; use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use ec_gpu::arkworks_bn254::G1Affine; use ec_gpu_gen::{ - multiexp::MultiexpKernel, multiexp_cpu::SourceBuilder, rust_gpu_tools::Device, + multiexp::{GpuAffine, MultiexpKernel}, + rust_gpu_tools::Device, threadpool::Worker, }; -use ff::{Field, PrimeField}; -use group::{Curve, Group}; -use pairing::Engine; use rayon::iter::{IntoParallelIterator, ParallelIterator}; /// The power that will be used to define the maximum number of elements. The number of elements /// is `2^MAX_ELEMENTS_POWER`. -const MAX_ELEMENTS_POWER: usize = 29; +const MAX_ELEMENTS_POWER: usize = 20; /// The maximum number of elements for this benchmark. const MAX_ELEMENTS: usize = 1 << MAX_ELEMENTS_POWER; +pub trait QueryDensity: Sized { + /// Returns whether the base exists. + type Iter: Iterator; + + fn iter(self) -> Self::Iter; + fn get_query_size(self) -> Option; + fn generate_exps(self, exponents: Arc>) -> Arc>; +} + +#[derive(Clone)] +pub struct FullDensity; + +impl AsRef for FullDensity { + fn as_ref(&self) -> &FullDensity { + self + } +} + +impl QueryDensity for &FullDensity { + type Iter = std::iter::Repeat; + + fn iter(self) -> Self::Iter { + std::iter::repeat(true) + } + + fn get_query_size(self) -> Option { + None + } + + fn generate_exps(self, exponents: Arc>) -> Arc> { + exponents + } +} + fn bench_multiexp(crit: &mut Criterion) { let mut group = crit.benchmark_group("multiexp"); - // The difference between runs is so little, hence a low sample size is OK. group.sample_size(10); let devices = Device::all(); + let programs = devices .iter() .map(|device| ec_gpu_gen::program!(device)) .collect::>() .expect("Cannot create programs!"); - let mut kern = MultiexpKernel::<::G1Affine>::create(programs, &devices) - .expect("Cannot initialize kernel!"); + let mut kern = + MultiexpKernel::::create(programs, &devices).expect("Cannot initialize kernel!"); let pool = Worker::new(); - let max_bases: Vec<_> = (0..MAX_ELEMENTS) + + let max_bases: Vec = (0..MAX_ELEMENTS) .into_par_iter() - .map(|_| ::G1::random(rand::thread_rng()).to_affine()) + .map(|_| G1Affine(G1Projective::rand(&mut rand::thread_rng()).into_affine())) .collect(); let max_exponents: Vec<_> = (0..MAX_ELEMENTS) .into_par_iter() - .map(|_| ::Fr::random(rand::thread_rng()).to_repr()) + .map(|_| Fr::rand(&mut rand::thread_rng())) .collect(); let num_elements: Vec<_> = (10..MAX_ELEMENTS_POWER).map(|shift| 1 << shift).collect(); for num in num_elements { - group.bench_with_input(BenchmarkId::from_parameter(num), &num, |bencher, &num| { - let (bases, skip) = SourceBuilder::get((Arc::new(max_bases[0..num].to_vec()), 0)); - let exponents = Arc::new(max_exponents[0..num].to_vec()); + group.bench_with_input(BenchmarkId::from_parameter(num), &num, |bencher, &_num| { + let bases_gpu: Vec<_> = max_bases.iter().map(|p| p.to_gpu()).collect(); + let bases_gpu = Arc::new(bases_gpu); + + let exps_bigint: Arc> = + Arc::new(max_exponents.iter().map(|e| e.into_bigint()).collect()); + + let exps = FullDensity.as_ref().generate_exps::(exps_bigint); bencher.iter(|| { - black_box( - kern.multiexp(&pool, bases.clone(), exponents.clone(), skip) + let _ = black_box( + kern.multiexp(&pool, bases_gpu.clone(), exps.clone(), 0) .unwrap(), ); }) diff --git a/gpu-tests/benches/small_scalars.rs b/gpu-tests/benches/small_scalars.rs new file mode 100644 index 0000000..0486846 --- /dev/null +++ b/gpu-tests/benches/small_scalars.rs @@ -0,0 +1,138 @@ +//! Benchmark comparing MSM performance with full-size vs small scalars. +//! +//! This demonstrates the speedup from the small scalar optimization: +//! when scalars only use 64 bits, we skip processing the upper 190+ zero bits. +//! +//! Run with: cargo bench --bench small_scalars + +fn main() { + #[cfg(any(feature = "cuda", feature = "opencl"))] + divan::main(); + + #[cfg(not(any(feature = "cuda", feature = "opencl")))] + println!("Benchmarks require cuda or opencl feature"); +} + +#[cfg(any(feature = "cuda", feature = "opencl"))] +use std::sync::Arc; + +#[cfg(any(feature = "cuda", feature = "opencl"))] +use ark_bn254::{Fq, Fr, G1Affine, G1Projective}; +#[cfg(any(feature = "cuda", feature = "opencl"))] +use ark_ec::CurveGroup; +#[cfg(any(feature = "cuda", feature = "opencl"))] +use ark_ff::{PrimeField, UniformRand}; +#[cfg(any(feature = "cuda", feature = "opencl"))] +use divan::{black_box, Bencher}; +#[cfg(any(feature = "cuda", feature = "opencl"))] +use ec_gpu_gen::{ + multiexp::{G1AffineM, MultiexpKernel}, + rust_gpu_tools::Device, + threadpool::Worker, +}; +#[cfg(any(feature = "cuda", feature = "opencl"))] +use rand::Rng; + +#[cfg(any(feature = "cuda", feature = "opencl"))] +fn fq_to_32_le(x: &Fq) -> [u8; 32] { + let limbs = unsafe { std::mem::transmute::(*x) }; + let mut out = [0u8; 32]; + for (i, limb) in limbs.iter().enumerate() { + let bytes = limb.to_le_bytes(); + out[i * 8..(i + 1) * 8].copy_from_slice(&bytes); + } + out +} + +#[cfg(any(feature = "cuda", feature = "opencl"))] +fn g1_xy_bytes_le(p: &G1Affine) -> Option<([u8; 32], [u8; 32])> { + use ark_ec::AffineRepr; + p.xy().map(|(x, y)| (fq_to_32_le(&x), fq_to_32_le(&y))) +} + +#[cfg(any(feature = "cuda", feature = "opencl"))] +const NUM_POINTS: usize = 1 << 16; // 65536 points + +#[cfg(any(feature = "cuda", feature = "opencl"))] +#[divan::bench] +fn msm_full_254bit_scalars(bencher: Bencher) { + let devices = Device::all(); + let programs = devices + .iter() + .map(|device| ec_gpu_gen::program!(device)) + .collect::>() + .expect("Cannot create programs!"); + let mut kern = + MultiexpKernel::::create(programs, &devices).expect("Cannot initialize kernel!"); + let pool = Worker::new(); + + let mut rng = rand::thread_rng(); + + // Generate random bases + let bases: Vec = (0..NUM_POINTS) + .map(|_| G1Projective::rand(&mut rng).into_affine()) + .collect(); + + let bases_gpu: Vec = bases + .iter() + .map(|affine| { + let (x, y) = g1_xy_bytes_le(affine).expect("point not at infinity"); + G1AffineM { x, y } + }) + .collect(); + let bases_gpu = Arc::new(bases_gpu); + + // Generate full 254-bit random scalars + let full_scalars: Vec = (0..NUM_POINTS).map(|_| Fr::rand(&mut rng)).collect(); + let full_exps: Arc> = Arc::new(full_scalars.iter().map(|e| e.into_bigint()).collect()); + + bencher.bench_local(|| { + black_box( + kern.multiexp(&pool, bases_gpu.clone(), full_exps.clone(), 0) + .unwrap(), + ) + }); +} + +#[cfg(any(feature = "cuda", feature = "opencl"))] +#[divan::bench] +fn msm_small_64bit_scalars(bencher: Bencher) { + let devices = Device::all(); + let programs = devices + .iter() + .map(|device| ec_gpu_gen::program!(device)) + .collect::>() + .expect("Cannot create programs!"); + let mut kern = + MultiexpKernel::::create(programs, &devices).expect("Cannot initialize kernel!"); + let pool = Worker::new(); + + let mut rng = rand::thread_rng(); + + // Generate random bases + let bases: Vec = (0..NUM_POINTS) + .map(|_| G1Projective::rand(&mut rng).into_affine()) + .collect(); + + let bases_gpu: Vec = bases + .iter() + .map(|affine| { + let (x, y) = g1_xy_bytes_le(affine).expect("point not at infinity"); + G1AffineM { x, y } + }) + .collect(); + let bases_gpu = Arc::new(bases_gpu); + + // Generate small 64-bit scalars (0 to 2^64) + let small_scalars: Vec = (0..NUM_POINTS) + .map(|_| Fr::from(rng.gen::())) + .collect(); + let small_exps: Arc> = Arc::new(small_scalars.iter().map(|e| e.into_bigint()).collect()); + + bencher.bench_local(|| { + black_box( + kern.multiexp(&pool, bases_gpu.clone(), small_exps.clone(), 0) + .unwrap(), + ) + }); +} diff --git a/gpu-tests/build.rs b/gpu-tests/build.rs index c1c5a5b..14d39cc 100644 --- a/gpu-tests/build.rs +++ b/gpu-tests/build.rs @@ -3,12 +3,13 @@ fn main() {} #[cfg(any(feature = "cuda", feature = "opencl"))] fn main() { - use blstrs::{Fp, Fp2, G1Affine, G2Affine, Scalar}; + use ark_bn254::{Fq, Fq2, Fr}; + use ec_gpu::arkworks_bn254::{G1Affine, G2Affine}; use ec_gpu_gen::SourceBuilder; let source_builder = SourceBuilder::new() - .add_fft::() - .add_multiexp::() - .add_multiexp::(); + .add_fft::() + .add_multiexp::() + .add_multiexp::(); ec_gpu_gen::generate(&source_builder); } diff --git a/gpu-tests/tests/fft.rs b/gpu-tests/tests/fft.rs index aeda181..4b3641c 100644 --- a/gpu-tests/tests/fft.rs +++ b/gpu-tests/tests/fft.rs @@ -2,44 +2,74 @@ use std::time::Instant; -use blstrs::Scalar as Fr; -use ec_gpu_gen::{ - fft::FftKernel, - fft_cpu::{parallel_fft, serial_fft}, - rust_gpu_tools::Device, - threadpool::Worker, -}; -use ff::{Field, PrimeField}; - -fn omega(num_coeffs: usize) -> F { - // Compute omega, the 2^exp primitive root of unity +use ark_bn254::Fr; +use ark_ff::{FftField, UniformRand}; +use ec_gpu_gen::{fft::FftKernelArk, rust_gpu_tools::Device}; + +fn omega(num_coeffs: usize) -> F { let exp = (num_coeffs as f32).log2().floor() as u32; - let mut omega = F::ROOT_OF_UNITY; - for _ in exp..F::S { + let mut omega = F::TWO_ADIC_ROOT_OF_UNITY; + for _ in exp..F::TWO_ADICITY { omega = omega.square(); } omega } +fn serial_fft(a: &mut [F], omega: &F, log_n: u32) { + let n = a.len(); + assert_eq!(n, 1 << log_n); + + for k in 0..n { + let rk = bitreverse(k, log_n as usize); + if k < rk { + a.swap(rk, k); + } + } + + let mut m = 1; + for _ in 0..log_n { + let w_m = omega.pow([(n / (2 * m)) as u64]); + let mut k = 0; + while k < n { + let mut w = F::ONE; + for j in 0..m { + let t = a[k + j + m] * w; + a[k + j + m] = a[k + j] - t; + a[k + j] += t; + w *= w_m; + } + k += 2 * m; + } + m *= 2; + } +} + +fn bitreverse(mut n: usize, l: usize) -> usize { + let mut r = 0; + for _ in 0..l { + r = (r << 1) | (n & 1); + n >>= 1; + } + r +} + #[test] pub fn gpu_fft_consistency() { fil_logger::maybe_init(); let mut rng = rand::thread_rng(); - let worker = Worker::new(); - let log_threads = worker.log_num_threads(); let devices = Device::all(); let programs = devices .iter() .map(|device| ec_gpu_gen::program!(device)) .collect::>() .expect("Cannot create programs!"); - let mut kern = FftKernel::::create(programs).expect("Cannot initialize kernel!"); + let mut kern = FftKernelArk::::create(programs).expect("Cannot initialize kernel!"); for log_d in 1..=20 { let d = 1 << log_d; - let mut v1_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::>(); + let mut v1_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::>(); let v1_omega = omega::(v1_coeffs.len()); let mut v2_coeffs = v1_coeffs.clone(); let v2_omega = v1_omega; @@ -53,17 +83,13 @@ pub fn gpu_fft_consistency() { println!("GPU took {}ms.", gpu_dur); now = Instant::now(); - if log_d <= log_threads { - serial_fft::(&mut v2_coeffs, &v2_omega, log_d); - } else { - parallel_fft::(&mut v2_coeffs, &worker, &v2_omega, log_d, log_threads); - } + serial_fft::(&mut v2_coeffs, &v2_omega, log_d); let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64; - println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur); + println!("CPU took {}ms.", cpu_dur); println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32); - assert!(v1_coeffs == v2_coeffs); + assert_eq!(v1_coeffs, v2_coeffs); println!("============================"); } } @@ -73,39 +99,32 @@ pub fn gpu_fft_many_consistency() { fil_logger::maybe_init(); let mut rng = rand::thread_rng(); - let worker = Worker::new(); - let log_threads = worker.log_num_threads(); let devices = Device::all(); let programs = devices .iter() .map(|device| ec_gpu_gen::program!(device)) .collect::>() .expect("Cannot create programs!"); - let mut kern = FftKernel::::create(programs).expect("Cannot initialize kernel!"); + let mut kern = FftKernelArk::::create(programs).expect("Cannot initialize kernel!"); for log_d in 1..=20 { let d = 1 << log_d; - let mut v11_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::>(); - let mut v12_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::>(); - let mut v13_coeffs = (0..d).map(|_| Fr::random(&mut rng)).collect::>(); - let v11_omega = omega::(v11_coeffs.len()); - let v12_omega = omega::(v12_coeffs.len()); - let v13_omega = omega::(v13_coeffs.len()); + let mut v11_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::>(); + let mut v12_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::>(); + let mut v13_coeffs = (0..d).map(|_| Fr::rand(&mut rng)).collect::>(); + let fft_omega = omega::(d); let mut v21_coeffs = v11_coeffs.clone(); let mut v22_coeffs = v12_coeffs.clone(); let mut v23_coeffs = v13_coeffs.clone(); - let v21_omega = v11_omega; - let v22_omega = v12_omega; - let v23_omega = v13_omega; println!("Testing FFT3 for {} elements...", d); let mut now = Instant::now(); kern.radix_fft_many( &mut [&mut v11_coeffs, &mut v12_coeffs, &mut v13_coeffs], - &[v11_omega, v12_omega, v13_omega], + &[fft_omega, fft_omega, fft_omega], &[log_d, log_d, log_d], ) .expect("GPU FFT failed!"); @@ -113,23 +132,17 @@ pub fn gpu_fft_many_consistency() { println!("GPU took {}ms.", gpu_dur); now = Instant::now(); - if log_d <= log_threads { - serial_fft::(&mut v21_coeffs, &v21_omega, log_d); - serial_fft::(&mut v22_coeffs, &v22_omega, log_d); - serial_fft::(&mut v23_coeffs, &v23_omega, log_d); - } else { - parallel_fft::(&mut v21_coeffs, &worker, &v21_omega, log_d, log_threads); - parallel_fft::(&mut v22_coeffs, &worker, &v22_omega, log_d, log_threads); - parallel_fft::(&mut v23_coeffs, &worker, &v23_omega, log_d, log_threads); - } + serial_fft::(&mut v21_coeffs, &fft_omega, log_d); + serial_fft::(&mut v22_coeffs, &fft_omega, log_d); + serial_fft::(&mut v23_coeffs, &fft_omega, log_d); let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64; - println!("CPU ({} cores) took {}ms.", 1 << log_threads, cpu_dur); + println!("CPU took {}ms.", cpu_dur); println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32); - assert!(v11_coeffs == v21_coeffs); - assert!(v12_coeffs == v22_coeffs); - assert!(v13_coeffs == v23_coeffs); + assert_eq!(v11_coeffs, v21_coeffs); + assert_eq!(v12_coeffs, v22_coeffs); + assert_eq!(v13_coeffs, v23_coeffs); println!("============================"); } diff --git a/gpu-tests/tests/multiexp.rs b/gpu-tests/tests/multiexp.rs index 06360d1..23375a3 100644 --- a/gpu-tests/tests/multiexp.rs +++ b/gpu-tests/tests/multiexp.rs @@ -3,86 +3,333 @@ use std::sync::Arc; use std::time::Instant; -use blstrs::Bls12; -use ec_gpu::GpuName; -use ec_gpu_gen::multiexp_cpu::{multiexp_cpu, FullDensity, QueryDensity, SourceBuilder}; +use ark_bn254::{Fr, G1Projective}; +use ark_ec::{CurveGroup, VariableBaseMSM}; +use ark_ff::{PrimeField, UniformRand}; +use ec_gpu::arkworks_bn254::{G1Affine, G2Affine}; +use ec_gpu_gen::multiexp::GpuAffine; use ec_gpu_gen::{ multiexp::MultiexpKernel, program, rust_gpu_tools::Device, threadpool::Worker, EcError, }; -use ff::{Field, PrimeField}; -use group::Curve; -use group::{prime::PrimeCurveAffine, Group}; -use pairing::Engine; +use tracing::debug_span; +use tracing_profile::{PrintTreeConfig, PrintTreeLayer}; +use tracing_subscriber::{filter::filter_fn, prelude::*}; -fn multiexp_gpu( +pub trait QueryDensity: Sized { + type Iter: Iterator; + + fn iter(self) -> Self::Iter; + fn get_query_size(self) -> Option; + fn generate_exps(self, exponents: Arc>) -> Arc>; +} + +#[derive(Clone)] +pub struct FullDensity; + +impl AsRef for FullDensity { + fn as_ref(&self) -> &FullDensity { + self + } +} + +impl QueryDensity for &FullDensity { + type Iter = std::iter::Repeat; + + fn iter(self) -> Self::Iter { + std::iter::repeat(true) + } + + fn get_query_size(self) -> Option { + None + } + + fn generate_exps(self, exponents: Arc>) -> Arc> { + exponents + } +} + +fn multiexp_gpu( pool: &Worker, - bases: S, + bases: Arc>, density_map: D, - exponents: Arc::Repr>>, + exponents: Arc>, kern: &mut MultiexpKernel, -) -> Result +) -> Result where + G: GpuAffine, for<'a> &'a Q: QueryDensity, D: Send + Sync + 'static + Clone + AsRef, - G: PrimeCurveAffine + GpuName, - S: SourceBuilder, { - let exps = density_map.as_ref().generate_exps::(exponents); - let (bss, skip) = bases.get(); - kern.multiexp(pool, bss, exps, skip).map_err(Into::into) + let bases_gpu: Vec = bases.iter().map(|affine| affine.to_gpu()).collect(); + let exps_bigint: Arc> = Arc::new(exponents.iter().map(|e| e.into_bigint()).collect()); + let exps = density_map.as_ref().generate_exps::(exps_bigint); + kern.multiexp(pool, Arc::new(bases_gpu), exps, 0) } -#[test] -fn gpu_multiexp_consistency() { +/// Trait to bridge our newtype wrappers with arkworks types for testing +trait TestableAffine: GpuAffine + From { + type ArkAffine: ark_ec::AffineRepr + Copy; + type ArkProjective: CurveGroup + UniformRand; + + fn group_name() -> &'static str; +} + +impl TestableAffine for G1Affine { + type ArkAffine = ark_bn254::G1Affine; + type ArkProjective = ark_bn254::G1Projective; + + fn group_name() -> &'static str { + "G1" + } +} + +impl TestableAffine for G2Affine { + type ArkAffine = ark_bn254::G2Affine; + type ArkProjective = ark_bn254::G2Projective; + + fn group_name() -> &'static str { + "G2" + } +} + +fn gpu_multiexp_consistency_test(start_log_d: usize, max_log_d: usize) +where + G: TestableAffine, + G::Group: PartialEq, +{ fil_logger::maybe_init(); - const MAX_LOG_D: usize = 16; - const START_LOG_D: usize = 10; + let devices = Device::all(); let programs = devices .iter() .map(|device| crate::program!(device)) .collect::>() .expect("Cannot create programs!"); - let mut kern = MultiexpKernel::<::G1Affine>::create(programs, &devices) - .expect("Cannot initialize kernel!"); + let mut kern = + MultiexpKernel::::create(programs, &devices).expect("Cannot initialize kernel!"); let pool = Worker::new(); let mut rng = rand::thread_rng(); - let mut bases = (0..(1 << START_LOG_D)) - .map(|_| ::G1::random(&mut rng).to_affine()) - .collect::>(); + let mut bases_ark: Vec = (0..(1 << start_log_d)) + .map(|_| G::ArkProjective::rand(&mut rng).into_affine()) + .collect(); - for log_d in START_LOG_D..=MAX_LOG_D { - let g = Arc::new(bases.clone()); + for log_d in start_log_d..=max_log_d { + let bases: Vec = bases_ark.iter().map(|p| G::from(*p)).collect(); + let g = Arc::new(bases); let samples = 1 << log_d; - println!("Testing Multiexp for {} elements...", samples); - - let v = Arc::new( - (0..samples) - .map(|_| ::Fr::random(&mut rng).to_repr()) - .collect::>(), + println!( + "Testing {} Multiexp for {} elements...", + G::group_name(), + samples ); + let v: Vec = (0..samples).map(|_| Fr::rand(&mut rng)).collect(); + let v_arc = Arc::new(v.clone()); + let mut now = Instant::now(); - let gpu = multiexp_gpu(&pool, (g.clone(), 0), FullDensity, v.clone(), &mut kern).unwrap(); + let gpu = multiexp_gpu(&pool, g.clone(), FullDensity, v_arc.clone(), &mut kern).unwrap(); let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64; println!("GPU took {}ms.", gpu_dur); now = Instant::now(); - let cpu = multiexp_cpu(&pool, (g.clone(), 0), FullDensity, v.clone()) - .wait() - .unwrap(); + let cpu: G::ArkProjective = + VariableBaseMSM::msm(bases_ark.as_slice(), v.as_slice()).unwrap(); let cpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64; println!("CPU took {}ms.", cpu_dur); println!("Speedup: x{}", cpu_dur as f32 / gpu_dur as f32); - assert_eq!(cpu, gpu); + assert_eq!( + gpu, + cpu, + "GPU and CPU results differ for {} MSM", + G::group_name() + ); println!("============================"); - bases = [bases.clone(), bases.clone()].concat(); + bases_ark = [bases_ark.clone(), bases_ark.clone()].concat(); } } + +#[test] +fn gpu_multiexp_g1_consistency() { + gpu_multiexp_consistency_test::(10, 16); +} + +#[test] +fn gpu_multiexp_g2_consistency() { + gpu_multiexp_consistency_test::(10, 16); +} + +/// Test that the small scalar optimization works correctly. +/// When scalars are small (e.g., 64-bit), the optimization should skip +/// processing upper zero bits while producing correct results. +#[test] +fn gpu_multiexp_small_scalars() { + fil_logger::maybe_init(); + let devices = Device::all(); + let programs = devices + .iter() + .map(|device| crate::program!(device)) + .collect::>() + .expect("Cannot create programs!"); + let mut kern = + MultiexpKernel::::create(programs, &devices).expect("Cannot initialize kernel!"); + let pool = Worker::new(); + + let mut rng = rand::thread_rng(); + use ark_ff::UniformRand; + use rand::Rng; + + // Test with small scalars (64-bit range) + let num_points = 1 << 16; + println!( + "Testing small scalar MSM optimization with {} points...", + num_points + ); + + let bases_ark: Vec = (0..num_points) + .map(|_| G1Projective::rand(&mut rng).into_affine()) + .collect(); + + // Generate small scalars (only 64 bits used out of 254) + let small_scalars: Vec = (0..num_points) + .map(|_| Fr::from(rng.gen::())) + .collect(); + + let bases: Vec = bases_ark.iter().map(|p| G1Affine::from(*p)).collect(); + let g = Arc::new(bases); + let v_arc: Arc> = Arc::new(small_scalars.clone()); + + let now = Instant::now(); + let gpu: G1Projective = + multiexp_gpu(&pool, g.clone(), FullDensity, v_arc.clone(), &mut kern).unwrap(); + let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64; + println!("Small scalar GPU MSM took {}ms.", gpu_dur); + + let cpu: G1Projective = + VariableBaseMSM::msm(bases_ark.as_slice(), small_scalars.as_slice()).unwrap(); + + assert_eq!(cpu, gpu, "Small scalar MSM mismatch!"); + println!("Small scalar MSM test passed!"); +} + +/// Test edge case with very small scalars (32-bit) +#[test] +fn gpu_multiexp_very_small_scalars() { + fil_logger::maybe_init(); + let devices = Device::all(); + let programs = devices + .iter() + .map(|device| crate::program!(device)) + .collect::>() + .expect("Cannot create programs!"); + let mut kern = + MultiexpKernel::::create(programs, &devices).expect("Cannot initialize kernel!"); + let pool = Worker::new(); + + let mut rng = rand::thread_rng(); + use ark_ff::UniformRand; + use rand::Rng; + + let num_points = 1 << 14; + println!( + "Testing very small scalar (32-bit) MSM with {} points...", + num_points + ); + + let bases_ark: Vec = (0..num_points) + .map(|_| G1Projective::rand(&mut rng).into_affine()) + .collect(); + + // Generate very small scalars (only 32 bits used) + let small_scalars: Vec = (0..num_points) + .map(|_| Fr::from(rng.gen::() as u64)) + .collect(); + + let bases: Vec = bases_ark.iter().map(|p| G1Affine::from(*p)).collect(); + let g = Arc::new(bases); + let v_arc: Arc> = Arc::new(small_scalars.clone()); + + let now = Instant::now(); + let gpu: G1Projective = + multiexp_gpu(&pool, g.clone(), FullDensity, v_arc.clone(), &mut kern).unwrap(); + let gpu_dur = now.elapsed().as_secs() * 1000 + now.elapsed().subsec_millis() as u64; + println!("Very small scalar GPU MSM took {}ms.", gpu_dur); + + let cpu: G1Projective = + VariableBaseMSM::msm(bases_ark.as_slice(), small_scalars.as_slice()).unwrap(); + + assert_eq!(cpu, gpu, "Very small scalar MSM mismatch!"); + println!("Very small scalar MSM test passed!"); +} + +#[test] +fn gpu_multiexp_profile() { + use ec_gpu_gen::multiexp::SingleMultiexpKernel; + + let config = PrintTreeConfig { + hide_below_percent: 0.0, + accumulate_events: false, + ..PrintTreeConfig::default() + }; + let (layer, _guard) = PrintTreeLayer::new(config); + // Filter out events, only keep spans + let layer = layer.with_filter(filter_fn(|metadata| metadata.is_span())); + tracing_subscriber::registry().with(layer).init(); + + let root = debug_span!("profile_multiexp"); + let _root_guard = root.enter(); + + let devices = Device::all(); + let device = &devices[0]; + let program = crate::program!(device).expect("Cannot create program!"); + + let kern = { + let span = debug_span!("create_kernel"); + let _guard = span.enter(); + SingleMultiexpKernel::::create(program, device, None) + .expect("Cannot initialize kernel!") + }; + + let mut rng = rand::thread_rng(); + let log_n = 16; + let n = 1 << log_n; + + let bases_ark: Vec = { + let span = debug_span!("generate_bases", n = n); + let _guard = span.enter(); + (0..n) + .map(|_| G1Projective::rand(&mut rng).into_affine()) + .collect() + }; + + let bases_gpu: Vec<_> = { + let span = debug_span!("convert_bases_to_gpu", n = n); + let _guard = span.enter(); + bases_ark + .iter() + .map(|p| G1Affine::from(*p).to_gpu()) + .collect() + }; + + let exponents: Vec<_> = { + let span = debug_span!("generate_exponents", n = n); + let _guard = span.enter(); + (0..n).map(|_| Fr::rand(&mut rng).into_bigint()).collect() + }; + + // Run multiexp on main thread - this will show all nested spans + let _result = { + let span = debug_span!("run_multiexp", n = n); + let _guard = span.enter(); + kern.multiexp(&bases_gpu, &exponents) + .expect("multiexp failed") + }; + + drop(_root_guard); +} diff --git a/rust-toolchain b/rust-toolchain deleted file mode 100644 index 6b4de0a..0000000 --- a/rust-toolchain +++ /dev/null @@ -1 +0,0 @@ -1.83.0