Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
!CMakeLists.txt
!model.py
!demo.sh
!gpu_arch.sh
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,18 @@ NVCC := $(CUDA_PATH)/bin/nvcc
INCLUDES := -I$(CUDA_PATH)/include -I$(LIBTORCH_PATH)/include -I$(LIBTORCH_PATH)/include/torch/csrc/api/include
LIBS := -L$(CUDA_PATH)/lib64 -lcudart -L$(LIBTORCH_PATH)/lib -ltorch -ltorch_cpu -ltorch_cuda -lc10 -lc10_cuda

# Detect GPU arch
GPU_ARCH := $(shell ./gpu_arch.sh)

# Check if the architecture was detected
ifeq ($(GPU_ARCH),)
$(error No CUDA capable GPU detected or unsupported architecture.)
endif

# NVCC compiler flags
# note: -D_GLIBCXX_USE_CXX11_ABI=0 may not be necessary and was only added due to some weird compilation errors with the torch library
# std=c++17 was also needed due to some errors with the torch library
NVCC_FLAGS := -arch=sm_86 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0
NVCC_FLAGS := -v -arch=$(GPU_ARCH) -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0

# Source and object files
CU_SRCS := $(wildcard *.cu)
Expand Down
17 changes: 5 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,39 +57,32 @@ Before running the demo, ensure you have the following requirements:

## Setup & Installation

1. **Configure the GPU Architecture**:
- Set the desired GPU architecture in the `Makefile` by updating the `NVCC_FLAGS`:
```cmake
# NVCC compiler flags
NVCC_FLAGS := -arch=sm_70
```

2. **Set up a Virtual Environment**:
1. **Set up a Virtual Environment**:
- Create and activate the environment using the following command:
```bash
virtualenv --no-download --clear path/to/your/env
source path/to/your/env/bin/activate
```

3. **Install Necessary Packages**:
2. **Install Necessary Packages**:
- Install `numpy` and `torch`. Note that installing `torch` will also include `LibTorch`, the `C++` interface for `torch`, in your virtual environment:
```bash
pip install torch numpy
```

4. **Prepare Your Model and Data by PyTorch**:
3. **Prepare Your Model and Data by PyTorch**:
- To generate example data and a model (`traced_model.pt` and `sample_input.pt`), execute:
```bash
python model.py
```
- Alternatively, you can use your own model and data. Ensure that they are [serialized](https://pytorch.org/docs/stable/notes/serialization.html) to be compatible with the `C++` loader. For this demo, we only support multilayer perceptrons (MLPs) with ReLU activations. The expected input tensor shape is `(batch_size, input_dim)`.

5. **Compile the Demonstration**:
4. **Compile the Demonstration**:
- Run the following command:
```bash
make demo
```
- Please be patient as the compilation might take a while, possibly a few minutes. We're aware of this and are working to enhance the compilation speed.
- CUDA device compute capability is automatically fetched. Please be patient as the compilation might take a while, possibly a few minutes. We're aware of this and are working to enhance the compilation speed.

## Running the Demo

Expand Down
20 changes: 20 additions & 0 deletions gpu_arch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/bin/bash

# Print the nvcc -arch value (e.g. "sm_86") for the first visible GPU.
# All diagnostics go to stderr so that the Makefile's $(shell ./gpu_arch.sh)
# capture stays EMPTY on failure and its ifeq ($(GPU_ARCH),) check fires;
# printing errors to stdout would feed the error text straight into -arch=.

# Ensure the Nvidia driver tooling is available.
if ! command -v nvidia-smi &> /dev/null; then
    echo "nvidia-smi not found, ensure your Nvidia GPU drivers are installed correctly." >&2
    exit 1
fi

# Query the compute capability (e.g. "8.6").
# Multi-GPU hosts print one line per GPU; keep only the first so the result
# is a single token instead of a mangled "sm_86 86"-style string.
COMPUTE_CAPABILITY=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader | head -n 1)

if [ -z "$COMPUTE_CAPABILITY" ]; then
    echo "Failed to detect GPU Compute Capability" >&2
    exit 1
fi

# Format the capability as an nvcc architecture name: "8.6" -> "sm_86".
ARCH="sm_$(echo "$COMPUTE_CAPABILITY" | tr -d '.')"

echo "$ARCH"
111 changes: 69 additions & 42 deletions zkfc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,58 +2,75 @@

#define TILE_WIDTH 16


// Tiled shared-memory matrix multiply over the scalar field: C = A * B.
// A is rowsA x colsA, B is colsA x colsB, C is rowsA x colsB, all row-major.
// Launch with blockDim = (TILE_WIDTH, TILE_WIDTH); one thread per C element.
KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB)
{
    // One shared-memory tile per input matrix.
    // NOTE(review): the previous revision declared [2][TILE_WIDTH][TILE_WIDTH]
    // buffers for "double buffering" but the compute loop read
    // A_tiles[threadIdx.y][k] without the buffer index — that assigns a whole
    // row (an array) to an Fr_t and does not compile. It also never overlapped
    // the load of tile t+1 with the compute of tile t (both sat between the
    // same pair of __syncthreads()), so the second buffer bought nothing.
    // A single buffer is the correct minimal form.
    __shared__ Fr_t A_tile[TILE_WIDTH][TILE_WIDTH];
    __shared__ Fr_t B_tile[TILE_WIDTH][TILE_WIDTH];

    int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
    int col = blockIdx.x * TILE_WIDTH + threadIdx.x;

    Fr_t sum = blstrs__scalar__Scalar_ZERO;

    // Walk the tiles of A (along its row) and of B (along its column).
    for (int t = 0; t < (colsA - 1) / TILE_WIDTH + 1; ++t)
    {
        // Each thread stages one element of each matrix into shared memory,
        // zero-padding out-of-range positions so the inner product below
        // needs no bounds checks. __ldg routes the loads through the
        // read-only data cache.
        if (row < rowsA && t * TILE_WIDTH + threadIdx.x < colsA)
        {
            A_tile[threadIdx.y][threadIdx.x] = __ldg(&A[row * colsA + t * TILE_WIDTH + threadIdx.x]);
        }
        else
        {
            A_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
        }

        if (t * TILE_WIDTH + threadIdx.y < colsA && col < colsB)
        {
            B_tile[threadIdx.y][threadIdx.x] = __ldg(&B[(t * TILE_WIDTH + threadIdx.y) * colsB + col]);
        }
        else
        {
            B_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
        }

        // All loads must land before any thread reads the tiles.
        __syncthreads();

        // Accumulate this tile pair's contribution to the dot product.
#pragma unroll
        for (int k = 0; k < TILE_WIDTH; ++k)
        {
            sum = blstrs__scalar__Scalar_add(sum, blstrs__scalar__Scalar_mul(A_tile[threadIdx.y][k], B_tile[k][threadIdx.x]));
        }

        // The tiles are overwritten next iteration; wait until every thread
        // has finished reading them.
        __syncthreads();
    }

    if (row < rowsA && col < colsB)
    {
        C[row * colsB + col] = sum;
    }
}

// KERNEL void random_init(Fr_t* params, uint num_bits, uint n)
// {
// int tid = blockIdx.x * blockDim.x + threadIdx.x;
// curandState state;

// // Initialize the RNG state for this thread.
// curand_init(1234, tid, 0, &state);
// curand_init(1234, tid, 0, &state);

// if (tid < n) {
// params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
// params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
Expand All @@ -69,53 +86,62 @@ DEVICE Fr_t float_to_Fr(float x)
bool negative = (sign_x < 0);
uint rounded_abs = static_cast<uint>(abs_x);

if (negative){
if (negative)
{
return blstrs__scalar__Scalar_sub({0, 0, 0, 0, 0, 0, 0, 0}, {rounded_abs, 0, 0, 0, 0, 0, 0, 0});
}
else {
else
{
return {rounded_abs, 0, 0, 0, 0, 0, 0, 0};
}
}

KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
{
int tid = blockIdx.x * blockDim.x + threadIdx.x;
uint dim0 = tid / frs_window_size;
uint dim1 = tid % frs_window_size;
if (tid >= frs_num_window * frs_window_size) return;
if (dim0 < fs_num_window && dim1 < fs_window_size) frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
else frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
if (tid >= frs_num_window * frs_window_size)
return;
if (dim0 < fs_num_window && dim1 < fs_window_size)
frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
else
frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
}

zkFC zkFC::from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators)
{
zkFC zkFC::from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators)
{
uint rounded_input_size = 1 << ceilLog2(input_size);
uint rounded_output_size = 1 << ceilLog2(output_size);

FrTensor weights(rounded_input_size * rounded_output_size);
float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size+FrNumThread-1)/FrNumThread,FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size + FrNumThread - 1) / FrNumThread, FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
cudaDeviceSynchronize();
// cout << "Loaded weight is: " << weights << endl;
return zkFC(rounded_input_size, rounded_output_size, weights.mont(), generators);
}

zkFC::zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t)) {
if (t.size != input_size * output_size) throw std::runtime_error("Incompatible dimensions");
zkFC::zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t))
{
if (t.size != input_size * output_size)
throw std::runtime_error("Incompatible dimensions");
}

FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr)
FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr)
{
uint rounded_batch_size = 1 << ceilLog2(batch_size);
uint rounded_input_dim = 1 << ceilLog2(input_dim);
FrTensor t(rounded_batch_size * rounded_input_dim);
float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim+FrNumThread-1)/FrNumThread,FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim + FrNumThread - 1) / FrNumThread, FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
cudaDeviceSynchronize();
// cout << "Loaded input is: " << t << endl;
return t;
}

FrTensor zkFC::operator()(const FrTensor& X) const {
if (X.size % inputSize != 0) throw std::runtime_error("Incompatible dimensions");
FrTensor zkFC::operator()(const FrTensor &X) const
{
if (X.size % inputSize != 0)
throw std::runtime_error("Incompatible dimensions");
uint batchSize = X.size / inputSize;
dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
dim3 gridSize((outputSize + blockSize.x - 1) / blockSize.x, (batchSize + blockSize.y - 1) / blockSize.y);
Expand All @@ -125,9 +151,11 @@ FrTensor zkFC::operator()(const FrTensor& X) const {
return out;
}

void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const {
void zkFC::prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const
{
// cout << X.size << " " << inputSize << endl;
if (X.size % inputSize != 0) {
if (X.size % inputSize != 0)
{
throw std::runtime_error("Incompatible dimensions 1");
}
uint batchSize = X.size / inputSize;
Expand All @@ -143,4 +171,3 @@ void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) c
Z(u_Z);
generators.open(weights, com, concatenate<Fr_t>({u_out_dim, u_in_dim}));
}

30 changes: 15 additions & 15 deletions zkfc.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -6,42 +6,42 @@
#include <cstddef>
#include <cuda_runtime.h>
#include <curand_kernel.h>
#include "bls12-381.cuh" // adjust this to point to the blstrs header file
#include "bls12-381.cuh" // adjust this to point to the blstrs header file
#include "fr-tensor.cuh"
#include "proof.cuh"
#include "commitment.cuh"

#define TILE_WIDTH 16

class zkFC {
class zkFC
{
private:

FrTensor weights;
G1TensorJacobian com;

public:
const uint inputSize;
const uint outputSize;
//zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& generators);
FrTensor operator()(const FrTensor& X) const;
void prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const;
// zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &generators);
FrTensor operator()(const FrTensor &X) const;
void prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const;

// static zkFC random_fc(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
static zkFC from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators);
static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr);
static zkFC from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators);
static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr);
};

KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int colsA, int colsB);
KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB);

// KERNEL void random_init(Fr_t* params, uint num_bits, uint n)
// {
// int tid = blockIdx.x * blockDim.x + threadIdx.x;
// curandState state;

// // Initialize the RNG state for this thread.
// curand_init(1234, tid, 0, &state);
// curand_init(1234, tid, 0, &state);

// if (tid < n) {
// params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
// params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
Expand All @@ -50,6 +50,6 @@ KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int co

DEVICE Fr_t float_to_Fr(float x);

KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);
KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);

#endif // ZKFC_CUH
#endif // ZKFC_CUH