Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,7 @@ members = [
"gpu-tests",
]
resolver = "2"

[workspace.dependencies]
ec-gpu = { path = "./ec-gpu" }
tracing-profile = { git = "https://github.com/IrreducibleOSS/tracing-profile" }
23 changes: 12 additions & 11 deletions ec-gpu-gen/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,34 +12,35 @@ rust-version = "1.83.0"
[dependencies]
bitvec = "1.0.1"
crossbeam-channel = "0.5.1"
ec-gpu = "0.2.0"
ec-gpu.workspace = true
execute = "0.2.9"
ff = { version = "0.13.0", default-features = false }
group = "0.13.0"
hex = "0.4"
# Pin home to version compatible with Rust 1.83 (edition 2021)
home = "=0.5.9"
log = "0.4.14"
num_cpus = "1.13.0"
once_cell = "1.8.0"
rayon = "1.5.1"
rust-gpu-tools = { version = "0.7.0", default-features = false, optional = true }
sha2 = "0.10"
thiserror = "1.0.30"
tracing = "0.1"
yastl = "0.1.2"

ark-bn254 = { version = "0.5.0", optional = true }
ark-std = { version = "0.5.0", optional = true }
ark-ec = "0.5.0"
ark-ff = "0.5.0"
ark-serialize = { version = "0.5.0", optional = true }

[dev-dependencies]
# NOTE vmx 2022-07-07: Using the `__private_bench` feature of `blstrs` is just
# temporarily until https://github.com/zkcrypto/group/pull/29 is fixed. Then
# we won't need the exports of `Fp` and `Fp2` any more.
#blstrs = { version = "0.6.0", features = ["__private_bench"], optional = true }
blstrs = { version = "0.7.0", features = ["__private_bench", "gpu"] }
rand = "0.8"
lazy_static = "1.2"
pairing = "0.23.0"
temp-env = "0.3.0"
rand_core = "0.6.3"
rand_xorshift = "0.3.0"
ark-bn254 = "0.5.0"

[features]
default = []
cuda = ["rust-gpu-tools/cuda"]
opencl = ["rust-gpu-tools/opencl"]
arkworks = ["ec-gpu/arkworks", "dep:ark-bn254", "dep:ark-std", "dep:ark-serialize"]
142 changes: 142 additions & 0 deletions ec-gpu-gen/src/cl/field.cl
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,94 @@ DEVICE void FIELD_reduce(uint32_t accLow[FIELD_LIMBS], uint32_t np0, uint32_t fq
accLow[i]=chain_add(&chain5, accLow[i], highCarry);
}

// Optimized squaring: exploits symmetry a² = a·a
// Cross products aᵢ·aⱼ for i≠j appear twice, so compute once and double.
// Reduces from n² to n(n+1)/2 multiplications (~44% fewer for 8 limbs).
//
// Computes the full double-width product xx = x·x over 2*FIELD_LIMBS
// 32-bit limbs. No modular reduction happens here — the caller reduces
// (see FIELD_sqr_nvidia).
DEVICE inline
void FIELD_sqr_v1(uint32_t *x, uint32_t *xx) {
  const uint32_t xLimbs  = FIELD_LIMBS;
  const uint32_t xxLimbs = FIELD_LIMBS * 2;
  // Working accumulator for the double-width result.
  uint32_t temp[FIELD_LIMBS * 2];
  uint32_t carry = 0;

  #pragma unroll
  for (int32_t i = 0; i < xxLimbs; i++) {
    temp[i] = 0;
  }

  // Step 1: Compute off-diagonal products for odd (i+j) positions
  // Following the same pattern as FIELD_mult_v1 for correctness.
  // The even/odd column split lets each chain_madlo/chain_madhi pair in a
  // single carry chain land on disjoint columns, so carries never collide.
  #pragma unroll
  for (int32_t i = 0; i < xLimbs; i++) {
    chain_t chain1;
    chain_init(&chain1);
    #pragma unroll
    for (int32_t j = i + 1; j < xLimbs; j++) {
      if ((i + j) % 2 == 1) {
        // Written one column low (i+j-1, i+j); the one-limb shift below
        // realigns these partial sums — same fix-up as FIELD_mult_v1.
        temp[i + j - 1] = chain_madlo(&chain1, x[i], x[j], temp[i + j - 1]);
        temp[i + j] = chain_madhi(&chain1, x[i], x[j], temp[i + j]);
      }
    }
    if (i % 2 == 1 && i + 1 < xLimbs) {
      // Flush the chain's trailing carry flag into the top column of this row.
      temp[i + xLimbs - 1] = chain_add(&chain1, 0, 0);
    }
  }

  // Shift right by 1 position (same as mult_v1): realigns the odd-parity
  // partial sums that were deliberately written one column low above.
  #pragma unroll
  for (int32_t i = xxLimbs - 1; i > 0; i--) {
    temp[i] = temp[i - 1];
  }
  temp[0] = 0;

  // Step 2: Compute off-diagonal products for even (i+j) positions,
  // accumulating on top of the realigned odd-parity sums.
  carry = 0;
  #pragma unroll
  for (int32_t i = 0; i < xLimbs; i++) {
    chain_t chain2;
    chain_init(&chain2);

    #pragma unroll
    for (int32_t j = i + 1; j < xLimbs; j++) {
      if ((i + j) % 2 == 0) {
        temp[i + j] = chain_madlo(&chain2, x[i], x[j], temp[i + j]);
        temp[i + j + 1] = chain_madhi(&chain2, x[i], x[j], temp[i + j + 1]);
      }
    }
    // Cross-iteration carry bookkeeping: overflow from this row's chain is
    // threaded into the columns above it via `carry`.
    // NOTE(review): the placement is subtle, and `i != xLimbs - 1` is
    // redundant with `i + 1 < xLimbs` in both guards — presumably copied
    // defensively from mult_v1. Verify against a reference multi-precision
    // squaring for every FIELD_LIMBS value actually instantiated.
    if ((i + xLimbs) % 2 == 0 && i != xLimbs - 1 && i + 1 < xLimbs) {
      temp[i + xLimbs] = chain_add(&chain2, temp[i + xLimbs], carry);
      temp[i + xLimbs + 1] = chain_add(&chain2, temp[i + xLimbs + 1], 0);
      carry = chain_add(&chain2, 0, 0);
    }
    if ((i + xLimbs) % 2 == 1 && i != xLimbs - 1 && i + 1 < xLimbs) {
      carry = chain_add(&chain2, carry, 0);
    }
  }

  // Step 3: Double the off-diagonal products (left shift by 1 bit) —
  // each cross product aᵢ·aⱼ (i≠j) appears twice in the square.
  carry = 0;
  #pragma unroll
  for (int32_t i = 0; i < xxLimbs; i++) {
    uint32_t new_carry = temp[i] >> 31;
    temp[i] = (temp[i] << 1) | carry;
    carry = new_carry;
  }

  // Step 4: Add diagonal products x[i] * x[i]. Each lands on columns
  // (2i, 2i+1), so one carry chain covers the whole pass.
  chain_t chain3;
  chain_init(&chain3);
  #pragma unroll
  for (int32_t i = 0; i < xLimbs; i++) {
    temp[2 * i] = chain_madlo(&chain3, x[i], x[i], temp[2 * i]);
    temp[2 * i + 1] = chain_madhi(&chain3, x[i], x[i], temp[2 * i + 1]);
  }

  // Publish the full 2n-limb square.
  #pragma unroll
  for (int32_t i = 0; i < xxLimbs; i++) {
    xx[i] = temp[i];
  }
}

// Requirement: yLimbs >= xLimbs
DEVICE inline
void FIELD_mult_v1(uint32_t *x, uint32_t *y, uint32_t *xy) {
Expand Down Expand Up @@ -262,6 +350,40 @@ DEVICE FIELD FIELD_mul_nvidia(FIELD a, FIELD b) {
return r;
}

// Modular squaring for the CUDA/NVIDIA path. Mirrors the structure of
// FIELD_mul_nvidia: full-width product, FIELD_reduce on the low half,
// fold into the high half, then one conditional subtraction of the modulus.
DEVICE FIELD FIELD_sqr_nvidia(FIELD a) {
  // Perform optimized squaring: aa = a·a over 2*FIELD_LIMBS limbs.
  limb aa[2 * FIELD_LIMBS];
  FIELD_sqr_v1(a.val, aa);

  // Copy the low half into a scratch buffer for the reduction step.
  uint32_t io[FIELD_LIMBS];
  #pragma unroll
  for(int i=0;i<FIELD_LIMBS;i++) {
    io[i]=aa[i];
  }
  // Reduce the low limbs against the modulus FIELD_P. FIELD_INV is the
  // per-field inverse constant consumed as FIELD_reduce's `np0` parameter
  // — presumably Montgomery's -p⁻¹ mod 2³²; confirm against FIELD_reduce.
  FIELD_reduce(io, FIELD_INV, FIELD_P.val);

  // Add io to the upper words of aa, propagating the hardware carry flag
  // across the high half (add_cc sets it, addc_cc consumes+sets, addc
  // consumes it on the final limb where no further carry is needed).
  aa[FIELD_LIMBS] = add_cc(aa[FIELD_LIMBS], io[0]);
  int j;
  #pragma unroll
  for (j = 1; j < FIELD_LIMBS - 1; j++) {
    aa[j + FIELD_LIMBS] = addc_cc(aa[j + FIELD_LIMBS], io[j]);
  }
  aa[2 * FIELD_LIMBS - 1] = addc(aa[2 * FIELD_LIMBS - 1], io[FIELD_LIMBS - 1]);

  // The reduced result lives in the upper FIELD_LIMBS limbs.
  FIELD r;
  #pragma unroll
  for (int i = 0; i < FIELD_LIMBS; i++) {
    r.val[i] = aa[i + FIELD_LIMBS];
  }

  // Final conditional subtraction brings the result into [0, p).
  if (FIELD_gte(r, FIELD_P)) {
    r = FIELD_sub_(r, FIELD_P);
  }

  return r;
}

#endif

// Modular multiplication
Expand Down Expand Up @@ -310,9 +432,15 @@ DEVICE FIELD FIELD_mul(FIELD a, FIELD b) {

// Squaring is a special case of multiplication which can be done ~1.5x faster.
// https://stackoverflow.com/a/16388571/1348497
#ifdef CUDA
// CUDA builds route through the dedicated squaring path that uses the
// carry-chain madlo/madhi intrinsics (FIELD_sqr_nvidia above).
DEVICE FIELD FIELD_sqr(FIELD a) {
  return FIELD_sqr_nvidia(a);
}
#else
// Non-CUDA backends fall back to a plain self-multiplication.
DEVICE FIELD FIELD_sqr(FIELD a) {
  return FIELD_mul(a, a);
}
#endif

// Left-shift the limbs by one bit and subtract by modulus in case of overflow.
// Faster version of FIELD_add(a, a)
Expand Down Expand Up @@ -375,3 +503,17 @@ DEVICE uint FIELD_get_bits(FIELD l, uint skip, uint window) {
}
return ret;
}

// Return the `i`th bit of the field element, counting from the least
// significant bit (bit 0 of limb 0).
DEVICE bool FIELD_get_bit_lsb(FIELD l, uint i) {
  uint limb_index = i / FIELD_LIMB_BITS;
  uint bit_offset = i % FIELD_LIMB_BITS;
  uint limb = l.val[limb_index];
  return (limb >> bit_offset) & 1;
}

// Extract `window` consecutive bits of the field element, starting at bit
// `skip` from the LSB. The bit at `skip` becomes bit 0 of the result.
DEVICE uint FIELD_get_bits_lsb(FIELD l, uint skip, uint window) {
  uint acc = 0;
  // Walk from the most significant requested bit down to `skip`,
  // shifting previous bits up as each new one is appended.
  for (uint i = window; i > 0; i--) {
    uint bit = (uint)FIELD_get_bit_lsb(l, skip + i - 1);
    acc = (acc << 1) | bit;
  }
  return acc;
}
2 changes: 1 addition & 1 deletion ec-gpu-gen/src/cl/multiexp.cl
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ KERNEL void POINT_multiexp(

POINT_jacobian res = POINT_ZERO;
for(uint i = nstart; i < nend; i++) {
uint ind = EXPONENT_get_bits(exps[i], bits, w);
uint ind = EXPONENT_get_bits_lsb(exps[i], bits, w);

#if defined(OPENCL_NVIDIA) || defined(CUDA)
// O_o, weird optimization, having a single special case makes it
Expand Down
53 changes: 19 additions & 34 deletions ec-gpu-gen/src/fft.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::cmp;
use std::sync::{Arc, RwLock};

use ark_ff::FftField;
use ec_gpu::GpuName;
use ff::Field;
use log::{error, info};
use rust_gpu_tools::{program_closures, LocalBuffer, Program};

Expand All @@ -16,17 +16,14 @@ const MAX_LOG2_LOCAL_WORK_SIZE: u32 = 7; // 128
/// FFT kernel for a single GPU.
pub struct SingleFftKernel<'a, F>
where
F: Field + GpuName,
F: FftField + GpuName,
{
program: Program,
/// An optional function which will be called at places where it is possible to abort the FFT
/// calculations. If it returns true, the calculation will be aborted with an
/// [`EcError::Aborted`].
maybe_abort: Option<&'a (dyn Fn() -> bool + Send + Sync)>,
_phantom: std::marker::PhantomData<F>,
}

impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
impl<'a, F: FftField + GpuName> SingleFftKernel<'a, F> {
/// Create a new FFT instance for the given device.
///
/// The `maybe_abort` function is called when it is possible to abort the computation, without
Expand All @@ -48,23 +45,18 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
let closures = program_closures!(|program, input: &mut [F]| -> EcResult<()> {
let n = 1 << log_n;
// All usages are safe as the buffers are initialized from either the host or the GPU
// before they are read.
let mut src_buffer = unsafe { program.create_buffer::<F>(n)? };
let mut dst_buffer = unsafe { program.create_buffer::<F>(n)? };
// The precalculated values pq` and `omegas` are valid for radix degrees up to `max_deg`
let max_deg = cmp::min(MAX_LOG2_RADIX, log_n);

// Precalculate:
// [omega^(0/(2^(deg-1))), omega^(1/(2^(deg-1))), ..., omega^((2^(deg-1)-1)/(2^(deg-1)))]
// Precalculate twiddle factors
let mut pq = vec![F::ZERO; 1 << max_deg >> 1];
let twiddle = omega.pow_vartime([(n >> max_deg) as u64]);
let twiddle = omega.pow([(n >> max_deg) as u64]);
pq[0] = F::ONE;
if max_deg > 1 {
pq[1] = twiddle;
for i in 2..(1 << max_deg >> 1) {
pq[i] = pq[i - 1];
pq[i].mul_assign(&twiddle);
pq[i] = pq[i - 1] * twiddle;
}
}
let pq_buffer = program.create_buffer_from_slice(&pq)?;
Expand All @@ -73,24 +65,21 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
let mut omegas = vec![F::ZERO; 32];
omegas[0] = *omega;
for i in 1..LOG2_MAX_ELEMENTS {
omegas[i] = omegas[i - 1].pow_vartime([2u64]);
omegas[i] = omegas[i - 1].pow([2u64]);
}
let omegas_buffer = program.create_buffer_from_slice(&omegas)?;

program.write_from_buffer(&mut src_buffer, &*input)?;
// Specifies log2 of `p`, (http://www.bealto.com/gpu-fft_group-1.html)
let mut log_p = 0u32;
// Each iteration performs a FFT round

while log_p < log_n {
if let Some(maybe_abort) = &self.maybe_abort {
if maybe_abort() {
return Err(EcError::Aborted);
}
}

// 1=>radix2, 2=>radix4, 3=>radix8, ...
let deg = cmp::min(max_deg, log_n - log_p);

let n = 1u32 << log_n;
let local_work_size = 1 << cmp::min(deg - 1, MAX_LOG2_LOCAL_WORK_SIZE);
let global_work_size = n >> deg;
Expand All @@ -117,7 +106,6 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
}

program.read_into_buffer(&src_buffer, input)?;

Ok(())
});

Expand All @@ -128,14 +116,14 @@ impl<'a, F: Field + GpuName> SingleFftKernel<'a, F> {
/// One FFT kernel for each GPU available.
pub struct FftKernel<'a, F>
where
F: Field + GpuName,
F: FftField + GpuName,
{
kernels: Vec<SingleFftKernel<'a, F>>,
}

impl<'a, F> FftKernel<'a, F>
where
F: Field + GpuName,
F: FftField + GpuName,
{
/// Create new kernels, one for each given device.
pub fn create(programs: Vec<Program>) -> EcResult<Self> {
Expand Down Expand Up @@ -175,28 +163,20 @@ where
if kernels.is_empty() {
return Err(EcError::Simple("No working GPUs found!"));
}
info!("FFT: {} working device(s) selected. ", kernels.len());
info!("FFT: {} working device(s) selected.", kernels.len());
for (i, k) in kernels.iter().enumerate() {
info!("FFT: Device {}: {}", i, k.program.device_name(),);
info!("FFT: Device {}: {}", i, k.program.device_name());
}

Ok(Self { kernels })
}

/// Performs FFT on `input`
/// * `omega` - Special value `omega` is used for FFT over finite-fields
/// * `log_n` - Specifies log2 of number of elements
///
/// Uses the first available GPU.
/// Performs FFT on `input` using the first available GPU.
pub fn radix_fft(&mut self, input: &mut [F], omega: &F, log_n: u32) -> EcResult<()> {
self.kernels[0].radix_fft(input, omega, log_n)
}

/// Performs FFT on `inputs`
/// * `omega` - Special value `omega` is used for FFT over finite-fields
/// * `log_n` - Specifies log2 of number of elements
///
/// Uses all available GPUs to distribute the work.
/// Performs FFT on `inputs` using all available GPUs.
pub fn radix_fft_many(
&mut self,
inputs: &mut [&mut [F]],
Expand Down Expand Up @@ -237,3 +217,8 @@ where
Arc::try_unwrap(result).unwrap().into_inner().unwrap()
}
}

/// Backward-compatible alias for callers that used the arkworks-specific
/// single-GPU kernel name; it now resolves to the generic [`SingleFftKernel`].
pub type SingleFftKernelArk<'a, F> = SingleFftKernel<'a, F>;
/// Backward-compatible alias for the multi-GPU kernel, resolving to [`FftKernel`].
pub type FftKernelArk<'a, F> = FftKernel<'a, F>;
Loading