diff --git a/.gitignore b/.gitignore
index e01929b..e3dfab9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@
 !CMakeLists.txt
 !model.py
 !demo.sh
+!gpu_arch.sh
diff --git a/Makefile b/Makefile
index 66e6084..2dae189 100644
--- a/Makefile
+++ b/Makefile
@@ -9,10 +9,18 @@ NVCC := $(CUDA_PATH)/bin/nvcc
 INCLUDES := -I$(CUDA_PATH)/include -I$(LIBTORCH_PATH)/include -I$(LIBTORCH_PATH)/include/torch/csrc/api/include
 LIBS := -L$(CUDA_PATH)/lib64 -lcudart -L$(LIBTORCH_PATH)/lib -ltorch -ltorch_cpu -ltorch_cuda -lc10 -lc10_cuda
 
+# Detect the GPU architecture (e.g. sm_86) via gpu_arch.sh
+GPU_ARCH := $(shell ./gpu_arch.sh)
+
+# Check that the architecture was detected
+ifeq ($(GPU_ARCH),)
+    $(error No CUDA-capable GPU detected or unsupported architecture.)
+endif
+
 # NVCC compiler flags
 # note: -D_GLIBCXX_USE_CXX11_ABI=0 may not be necessary and was only added due to some weird compilation errors with the torch library
 # std=c++17 was also needed due to some errors with the torch library
-NVCC_FLAGS := -arch=sm_86 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0
+NVCC_FLAGS := -v -arch=$(GPU_ARCH) -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0
 
 # Source and object files
 CU_SRCS := $(wildcard *.cu)
diff --git a/README.md b/README.md
index eb1ddf8..a5037bd 100644
--- a/README.md
+++ b/README.md
@@ -57,39 +57,32 @@ Before running the demo, ensure you have the following requirements:
 
 ## Setup & Installation
 
-1. **Configure the GPU Architecture**:
-   - Set the desired GPU architecture in the `Makefile` by updating the `NVCC_FLAGS`:
-     ```cmake
-     # NVCC compiler flags
-     NVCC_FLAGS := -arch=sm_70
-     ```
-
-2. **Set up a Virtual Environment**:
+1. **Set up a Virtual Environment**:
    - Create and activate the environment using the following command:
      ```bash
      virtualenv --no-download --clear path/to/your/env
      source path/to/your/env/bin/activate
      ```
 
-3. **Install Necessary Packages**:
+2. **Install Necessary Packages**:
    - Install `numpy` and `torch`. Note that installing `torch` will also include `LibTorch`, the `C++` interface for `torch`, in your virtual environment:
      ```bash
      pip install torch numpy
      ```
 
-4. **Prepare Your Model and Data by PyTorch**:
+3. **Prepare Your Model and Data with PyTorch**:
    - To generate example data and a model (`traced_model.pt` and `sample_input.pt`), execute:
      ```bash
      python model.py
      ```
    - Alternatively, you can use your own model and data. Ensure that they are [serialized](https://pytorch.org/docs/stable/notes/serialization.html) to be compatible with the `C++` loader. For this demo, we only support multilayer perceptrons (MLPs) with ReLU activations. The expected input tensor shape is `(batch_size, input_dim)`.
 
-5. **Compile the Demonstration**:
+4. **Compile the Demonstration**:
    - Run the following command:
      ```bash
      make demo
      ```
-   - Please be patient as the compilation might take a while, possibly a few minutes. We're aware of this and are working to enhance the compilation speed.
+   - The CUDA device's compute capability is detected automatically at build time. Please be patient as the compilation might take a while, possibly a few minutes. We're aware of this and are working to enhance the compilation speed.
 
 ## Running the Demo
diff --git a/gpu_arch.sh b/gpu_arch.sh
new file mode 100755
index 0000000..b5898e8
--- /dev/null
+++ b/gpu_arch.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Check that nvidia-smi is available
+if ! command -v nvidia-smi &> /dev/null; then
+    echo "nvidia-smi not found, ensure your NVIDIA GPU drivers are installed correctly." >&2
+    exit 1
+fi
+
+# Query the GPU compute capability (e.g. "8.6")
+COMPUTE_CAPABILITY=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader)
+
+if [ -z "$COMPUTE_CAPABILITY" ]; then
+    echo "Failed to detect GPU compute capability" >&2
+    exit 1
+fi
+
+# Build the sm_XY architecture string from the reported compute capability
+ARCH="sm_$(echo $COMPUTE_CAPABILITY | tr -d '.')"
+
+echo $ARCH
\ No newline at end of file
diff --git a/zkfc.cu b/zkfc.cu
index 4154e46..5bd5d13 100644
--- a/zkfc.cu
+++ b/zkfc.cu
@@ -2,47 +2,64 @@
 
 #define TILE_WIDTH 16
 
-
-KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int colsA, int colsB) {
-    __shared__ Fr_t A_tile[TILE_WIDTH][TILE_WIDTH];
-    __shared__ Fr_t B_tile[TILE_WIDTH][TILE_WIDTH];
+KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB)
+{
+    // Use two shared-memory tile buffers, alternating between iterations
+    __shared__ Fr_t A_tiles[2][TILE_WIDTH][TILE_WIDTH];
+    __shared__ Fr_t B_tiles[2][TILE_WIDTH][TILE_WIDTH];
 
     int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
     int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
 
     Fr_t sum = blstrs__scalar__Scalar_ZERO;
-    
+
     // Loop over the tiles of A and B required to compute the block sub-matrix
-    for (int t = 0; t < (colsA - 1)/TILE_WIDTH + 1; ++t) {
+#pragma unroll
+    for (int t = 0; t < (colsA - 1) / TILE_WIDTH + 1; ++t)
+    {
+        // buffer index for this iteration
+        int buffer = t % 2;
         // Load the matrices from device memory to shared memory; each thread loads
         // one element of each matrix
-        if (row < rowsA && t*TILE_WIDTH + threadIdx.x < colsA) {
-            A_tile[threadIdx.y][threadIdx.x] = A[row*colsA + t*TILE_WIDTH + threadIdx.x];
-        } else {
-            A_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
+        if (row < rowsA && t * TILE_WIDTH + threadIdx.x < colsA)
+        {
+            // prefetch matrix A into shared mem
+            A_tiles[buffer][threadIdx.y][threadIdx.x] = __ldg(&A[row * colsA + t * TILE_WIDTH + threadIdx.x]);
         }
-
-        if (t*TILE_WIDTH + threadIdx.y < colsA && col < colsB) {
-            B_tile[threadIdx.y][threadIdx.x] = B[(t*TILE_WIDTH + threadIdx.y)*colsB + col];
-        } else {
-            B_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
+        else
+        {
+            A_tiles[buffer][threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
+        }
+
+        if (t * TILE_WIDTH + threadIdx.y < colsA && col < colsB)
+        {
+            // prefetch matrix B into shared mem
+            B_tiles[buffer][threadIdx.y][threadIdx.x] = __ldg(&B[(t * TILE_WIDTH + threadIdx.y) * colsB + col]);
+        }
+        else
+        {
+            B_tiles[buffer][threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
         }
 
         // Synchronize to ensure all the data in shared memory is available
         __syncthreads();
 
-        // Multiply the two matrices together;
-        for (int k = 0; k < TILE_WIDTH; ++k) {
-            sum = blstrs__scalar__Scalar_add(sum, blstrs__scalar__Scalar_mul(A_tile[threadIdx.y][k], B_tile[k][threadIdx.x]));
+        // multiply the tiles held in the current buffer
+#pragma unroll
+        for (int k = 0; k < TILE_WIDTH; ++k)
+        {
+            Fr_t A_value = A_tiles[buffer][threadIdx.y][k];
+            Fr_t B_value = B_tiles[buffer][k][threadIdx.x];
+            sum = blstrs__scalar__Scalar_add(sum, blstrs__scalar__Scalar_mul(A_value, B_value));
         }
 
-        // Synchronize to ensure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
         __syncthreads();
     }
 
-    if (row < rowsA && col < colsB) {
-        C[row*colsB + col] = sum;
+    if (row < rowsA && col < colsB)
+    {
+        C[row * colsB + col] = sum;
     }
 }
 
@@ -50,10 +67,10 @@ KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int co
 // {
 //     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 //     curandState state;
-    
+
 //     // Initialize the RNG state for this thread.
-//     curand_init(1234, tid, 0, &state);
-    
+//     curand_init(1234, tid, 0, &state);
+    
 //     if (tid < n) {
 //         params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
 //         params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
@@ -69,53 +86,62 @@ DEVICE Fr_t float_to_Fr(float x)
     bool negative = (sign_x < 0);
     uint rounded_abs = static_cast<uint>(abs_x);
 
-    if (negative){
+    if (negative)
+    {
         return blstrs__scalar__Scalar_sub({0, 0, 0, 0, 0, 0, 0, 0}, {rounded_abs, 0, 0, 0, 0, 0, 0, 0});
     }
-    else {
+    else
+    {
         return {rounded_abs, 0, 0, 0, 0, 0, 0, 0};
     }
 }
 
-KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
+KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     uint dim0 = tid / frs_window_size;
     uint dim1 = tid % frs_window_size;
-    if (tid >= frs_num_window * frs_window_size) return;
-    if (dim0 < fs_num_window && dim1 < fs_window_size) frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
-    else frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
+    if (tid >= frs_num_window * frs_window_size)
+        return;
+    if (dim0 < fs_num_window && dim1 < fs_window_size)
+        frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
+    else
+        frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
 }
 
-zkFC zkFC::from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators)
-{
+zkFC zkFC::from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators)
+{
     uint rounded_input_size = 1 << ceilLog2(input_size);
     uint rounded_output_size = 1 << ceilLog2(output_size);
     FrTensor weights(rounded_input_size * rounded_output_size);
-    float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size+FrNumThread-1)/FrNumThread,FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
+    float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size + FrNumThread - 1) / FrNumThread, FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
     cudaDeviceSynchronize();
     // cout << "Loaded weight is: " << weights << endl;
     return zkFC(rounded_input_size, rounded_output_size, weights.mont(), generators);
 }
 
-zkFC::zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t)) {
-    if (t.size != input_size * output_size) throw std::runtime_error("Incompatible dimensions");
+zkFC::zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t))
+{
+    if (t.size != input_size * output_size)
+        throw std::runtime_error("Incompatible dimensions");
 }
 
-FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr)
+FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr)
 {
     uint rounded_batch_size = 1 << ceilLog2(batch_size);
     uint rounded_input_dim = 1 << ceilLog2(input_dim);
     FrTensor t(rounded_batch_size * rounded_input_dim);
-    float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim+FrNumThread-1)/FrNumThread,FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
+    float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim + FrNumThread - 1) / FrNumThread, FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
     cudaDeviceSynchronize();
     // cout << "Loaded input is: " << t << endl;
     return t;
 }
 
-FrTensor zkFC::operator()(const FrTensor& X) const {
-    if (X.size % inputSize != 0) throw std::runtime_error("Incompatible dimensions");
+FrTensor zkFC::operator()(const FrTensor &X) const
+{
+    if (X.size % inputSize != 0)
+        throw std::runtime_error("Incompatible dimensions");
     uint batchSize = X.size / inputSize;
     dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
     dim3 gridSize((outputSize + blockSize.x - 1) / blockSize.x, (batchSize + blockSize.y - 1) / blockSize.y);
@@ -125,9 +151,11 @@ FrTensor zkFC::operator()(const FrTensor& X) const {
     return out;
 }
 
-void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const {
+void zkFC::prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const
+{
     // cout << X.size << " " << inputSize << endl;
-    if (X.size % inputSize != 0) {
+    if (X.size % inputSize != 0)
+    {
         throw std::runtime_error("Incompatible dimensions 1");
     }
     uint batchSize = X.size / inputSize;
@@ -143,4 +171,3 @@ void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) c
     Z(u_Z);
     generators.open(weights, com, concatenate({u_out_dim, u_in_dim}));
 }
-
diff --git a/zkfc.cuh b/zkfc.cuh
index c3db76a..1cf74ac 100644
--- a/zkfc.cuh
+++ b/zkfc.cuh
@@ -6,42 +6,42 @@
 #include
 #include
 #include
-#include "bls12-381.cuh" // adjust this to point to the blstrs header file
+#include "bls12-381.cuh"  // adjust this to point to the blstrs header file
 #include "fr-tensor.cuh"
 #include "proof.cuh"
 #include "commitment.cuh"
 
 #define TILE_WIDTH 16
 
-class zkFC {
+class zkFC
+{
 private:
-    
     FrTensor weights;
     G1TensorJacobian com;
 
 public:
     const uint inputSize;
     const uint outputSize;
-    //zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
-    zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& generators);
-    FrTensor operator()(const FrTensor& X) const;
-    void prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const;
+    // zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
+    zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &generators);
+    FrTensor operator()(const FrTensor &X) const;
+    void prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const;
     // static zkFC random_fc(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
-    static zkFC from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators);
-    static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr);
+    static zkFC from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators);
+    static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr);
 };
 
-KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int colsA, int colsB);
+KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB);
 
 // KERNEL void random_init(Fr_t* params, uint num_bits, uint n)
 // {
 //     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 //     curandState state;
-    
+
 //     // Initialize the RNG state for this thread.
-//     curand_init(1234, tid, 0, &state);
-    
+//     curand_init(1234, tid, 0, &state);
+    
 //     if (tid < n) {
 //         params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
 //         params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
@@ -50,6 +50,6 @@ KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int co
 
 DEVICE Fr_t float_to_Fr(float x);
 
-KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);
+KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);
 
-#endif  // ZKFC_CUH
+#endif // ZKFC_CUH
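For a quick local sanity check of the new detection path, `gpu_arch.sh` can be run on its own before invoking the build. The transcript below is a minimal sketch: the printed value is illustrative and depends on the installed GPU (an RTX 3090, with compute capability 8.6, would report `sm_86`), and the `--query-gpu=compute_cap` query assumes a reasonably recent NVIDIA driver.

```bash
# Illustrative walkthrough of the detection step; the output shown is an example, not a guaranteed value.
./gpu_arch.sh
# -> sm_86   (empty output with exit code 1 means detection failed)

# The Makefile picks up the same value via GPU_ARCH := $(shell ./gpu_arch.sh)
# and forwards it to nvcc as -arch=$(GPU_ARCH) when building the demo.
make demo
```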