diff --git a/.gitignore b/.gitignore
index e01929b..e3dfab9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@
 !CMakeLists.txt
 !model.py
 !demo.sh
+!gpu_arch.sh
diff --git a/Makefile b/Makefile
index 66e6084..2dae189 100644
--- a/Makefile
+++ b/Makefile
@@ -9,10 +9,18 @@ NVCC := $(CUDA_PATH)/bin/nvcc
 INCLUDES := -I$(CUDA_PATH)/include -I$(LIBTORCH_PATH)/include -I$(LIBTORCH_PATH)/include/torch/csrc/api/include
 LIBS := -L$(CUDA_PATH)/lib64 -lcudart -L$(LIBTORCH_PATH)/lib -ltorch -ltorch_cpu -ltorch_cuda -lc10 -lc10_cuda
 
+# Detect the GPU architecture (e.g. sm_86) via gpu_arch.sh
+GPU_ARCH := $(shell ./gpu_arch.sh)
+
+# Check that the architecture was detected
+ifeq ($(GPU_ARCH),)
+    $(error No CUDA-capable GPU detected or unsupported architecture.)
+endif
+
 # NVCC compiler flags
 # note: -D_GLIBCXX_USE_CXX11_ABI=0 may not be necessary and was only added due to some weird compilation errors with the torch library
 # std=c++17 was also needed due to some errors with the torch library
-NVCC_FLAGS := -arch=sm_86 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0
+NVCC_FLAGS := -v -arch=$(GPU_ARCH) -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0
 
 # Source and object files
 CU_SRCS := $(wildcard *.cu)
diff --git a/README.md b/README.md
index eb1ddf8..a5037bd 100644
--- a/README.md
+++ b/README.md
@@ -57,39 +57,32 @@ Before running the demo, ensure you have the following requirements:
 
 ## Setup & Installation
 
-1. **Configure the GPU Architecture**:
-   - Set the desired GPU architecture in the `Makefile` by updating the `NVCC_FLAGS`:
-     ```cmake
-     # NVCC compiler flags
-     NVCC_FLAGS := -arch=sm_70
-     ```
-
-2. **Set up a Virtual Environment**:
+1. **Set up a Virtual Environment**:
    - Create and activate the environment using the following command:
      ```bash
      virtualenv --no-download --clear path/to/your/env
      source path/to/your/env/bin/activate
      ```
 
-3. **Install Necessary Packages**:
+2. **Install Necessary Packages**:
    - Install `numpy` and `torch`. Note that installing `torch` will also include `LibTorch`, the `C++` interface for `torch`, in your virtual environment:
      ```bash
      pip install torch numpy
      ```
 
-4. **Prepare Your Model and Data by PyTorch**:
+3. **Prepare Your Model and Data with PyTorch**:
    - To generate example data and a model (`traced_model.pt` and `sample_input.pt`), execute:
      ```bash
      python model.py
      ```
    - Alternatively, you can use your own model and data. Ensure that they are [serialized](https://pytorch.org/docs/stable/notes/serialization.html) to be compatible with the `C++` loader. For this demo, we only support multilayer perceptrons (MLPs) with ReLU activations. The expected input tensor shape is `(batch_size, input_dim)`.
 
-5. **Compile the Demonstration**:
+4. **Compile the Demonstration**:
    - Run the following command:
      ```bash
      make demo
      ```
-   - Please be patient as the compilation might take a while, possibly a few minutes. We're aware of this and are working to enhance the compilation speed.
+   - The CUDA device's compute capability is detected automatically at build time. Please be patient as the compilation might take a while, possibly a few minutes. We're aware of this and are working to enhance the compilation speed.
 
 ## Running the Demo
diff --git a/gpu_arch.sh b/gpu_arch.sh
new file mode 100755
index 0000000..b5898e8
--- /dev/null
+++ b/gpu_arch.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+# Check that nvidia-smi is available
+if ! command -v nvidia-smi &> /dev/null; then
+    echo "nvidia-smi not found, ensure your NVIDIA GPU drivers are installed correctly." >&2
+    exit 1
+fi
+
+# Query the GPU compute capability (e.g. "8.6")
+COMPUTE_CAPABILITY=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader)
+
+if [ -z "$COMPUTE_CAPABILITY" ]; then
+    echo "Failed to detect GPU compute capability" >&2
+    exit 1
+fi
+
+# Build the sm_XY architecture string from the reported compute capability
+ARCH="sm_$(echo $COMPUTE_CAPABILITY | tr -d '.')"
+
+echo $ARCH
\ No newline at end of file
diff --git a/zkfc.cu b/zkfc.cu
index 4154e46..5bd5d13 100644
--- a/zkfc.cu
+++ b/zkfc.cu
@@ -2,47 +2,64 @@
 
 #define TILE_WIDTH 16
 
-
-KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int colsA, int colsB) {
-    __shared__ Fr_t A_tile[TILE_WIDTH][TILE_WIDTH];
-    __shared__ Fr_t B_tile[TILE_WIDTH][TILE_WIDTH];
+KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB)
+{
+    // Use two shared-memory tile buffers, alternating between iterations
+    __shared__ Fr_t A_tiles[2][TILE_WIDTH][TILE_WIDTH];
+    __shared__ Fr_t B_tiles[2][TILE_WIDTH][TILE_WIDTH];
 
     int row = blockIdx.y * TILE_WIDTH + threadIdx.y;
     int col = blockIdx.x * TILE_WIDTH + threadIdx.x;
 
     Fr_t sum = blstrs__scalar__Scalar_ZERO;
-    
+
     // Loop over the tiles of A and B required to compute the block sub-matrix
-    for (int t = 0; t < (colsA - 1)/TILE_WIDTH + 1; ++t) {
+#pragma unroll
+    for (int t = 0; t < (colsA - 1) / TILE_WIDTH + 1; ++t)
+    {
+        // buffer index for this iteration
+        int buffer = t % 2;
         // Load the matrices from device memory to shared memory; each thread loads
         // one element of each matrix
-        if (row < rowsA && t*TILE_WIDTH + threadIdx.x < colsA) {
-            A_tile[threadIdx.y][threadIdx.x] = A[row*colsA + t*TILE_WIDTH + threadIdx.x];
-        } else {
-            A_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
+        if (row < rowsA && t * TILE_WIDTH + threadIdx.x < colsA)
+        {
+            // prefetch matrix A into shared mem
+            A_tiles[buffer][threadIdx.y][threadIdx.x] = __ldg(&A[row * colsA + t * TILE_WIDTH + threadIdx.x]);
         }
-
-        if (t*TILE_WIDTH + threadIdx.y < colsA && col < colsB) {
-            B_tile[threadIdx.y][threadIdx.x] = B[(t*TILE_WIDTH + threadIdx.y)*colsB + col];
-        } else {
-            B_tile[threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
+        else
+        {
+            A_tiles[buffer][threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
+        }
+
+        if (t * TILE_WIDTH + threadIdx.y < colsA && col < colsB)
+        {
+            // prefetch matrix B into shared mem
+            B_tiles[buffer][threadIdx.y][threadIdx.x] = __ldg(&B[(t * TILE_WIDTH + threadIdx.y) * colsB + col]);
+        }
+        else
+        {
+            B_tiles[buffer][threadIdx.y][threadIdx.x] = blstrs__scalar__Scalar_ZERO;
         }
 
         // Synchronize to ensure all the data in shared memory is available
         __syncthreads();
 
-        // Multiply the two matrices together;
-        for (int k = 0; k < TILE_WIDTH; ++k) {
-            sum = blstrs__scalar__Scalar_add(sum, blstrs__scalar__Scalar_mul(A_tile[threadIdx.y][k], B_tile[k][threadIdx.x]));
+        // multiply the tiles held in the current buffer
+#pragma unroll
+        for (int k = 0; k < TILE_WIDTH; ++k)
+        {
+            Fr_t A_value = A_tiles[buffer][threadIdx.y][k];
+            Fr_t B_value = B_tiles[buffer][k][threadIdx.x];
+            sum = blstrs__scalar__Scalar_add(sum, blstrs__scalar__Scalar_mul(A_value, B_value));
         }
 
-        // Synchronize to ensure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
         __syncthreads();
     }
 
-    if (row < rowsA && col < colsB) {
-        C[row*colsB + col] = sum;
+    if (row < rowsA && col < colsB)
+    {
+        C[row * colsB + col] = sum;
     }
 }
 
@@ -50,10 +67,10 @@ KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int co
 // {
 //     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 //     curandState state;
-    
+
 //     // Initialize the RNG state for this thread.
-//     curand_init(1234, tid, 0, &state);
-    
+//     curand_init(1234, tid, 0, &state);
+    
 //     if (tid < n) {
 //         params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
 //         params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
@@ -69,53 +86,62 @@ DEVICE Fr_t float_to_Fr(float x)
     bool negative = (sign_x < 0);
     uint rounded_abs = static_cast<uint>(abs_x);
 
-    if (negative){
+    if (negative)
+    {
         return blstrs__scalar__Scalar_sub({0, 0, 0, 0, 0, 0, 0, 0}, {rounded_abs, 0, 0, 0, 0, 0, 0, 0});
     }
-    else {
+    else
+    {
         return {rounded_abs, 0, 0, 0, 0, 0, 0, 0};
     }
 }
 
-KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
+KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size)
 {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     uint dim0 = tid / frs_window_size;
     uint dim1 = tid % frs_window_size;
-    if (tid >= frs_num_window * frs_window_size) return;
-    if (dim0 < fs_num_window && dim1 < fs_window_size) frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
-    else frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
+    if (tid >= frs_num_window * frs_window_size)
+        return;
+    if (dim0 < fs_num_window && dim1 < fs_window_size)
+        frs[dim0 * frs_window_size + dim1] = float_to_Fr(fs[dim0 * fs_window_size + dim1]);
+    else
+        frs[tid] = {0, 0, 0, 0, 0, 0, 0, 0};
 }
 
-zkFC zkFC::from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators)
-{
+zkFC zkFC::from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators)
+{
     uint rounded_input_size = 1 << ceilLog2(input_size);
     uint rounded_output_size = 1 << ceilLog2(output_size);
     FrTensor weights(rounded_input_size * rounded_output_size);
-    float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size+FrNumThread-1)/FrNumThread,FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
+    float_to_Fr_kernel<<<(rounded_input_size * rounded_output_size + FrNumThread - 1) / FrNumThread, FrNumThread>>>(float_gpu_ptr, weights.gpu_data, input_size, rounded_input_size, output_size, rounded_output_size);
     cudaDeviceSynchronize();
     // cout << "Loaded weight is: " << weights << endl;
     return zkFC(rounded_input_size, rounded_output_size, weights.mont(), generators);
 }
 
-zkFC::zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t)) {
-    if (t.size != input_size * output_size) throw std::runtime_error("Incompatible dimensions");
+zkFC::zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &c) : inputSize(input_size), outputSize(output_size), weights(t), com(c.commit(t))
+{
+    if (t.size != input_size * output_size)
+        throw std::runtime_error("Incompatible dimensions");
 }
 
-FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr)
+FrTensor zkFC::load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr)
 {
     uint rounded_batch_size = 1 << ceilLog2(batch_size);
     uint rounded_input_dim = 1 << ceilLog2(input_dim);
     FrTensor t(rounded_batch_size * rounded_input_dim);
-    float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim+FrNumThread-1)/FrNumThread,FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
+    float_to_Fr_kernel<<<(rounded_batch_size * rounded_input_dim + FrNumThread - 1) / FrNumThread, FrNumThread>>>(input_ptr, t.gpu_data, batch_size, rounded_batch_size, input_dim, rounded_input_dim);
     cudaDeviceSynchronize();
     // cout << "Loaded input is: " << t << endl;
     return t;
 }
 
-FrTensor zkFC::operator()(const FrTensor& X) const {
-    if (X.size % inputSize != 0) throw std::runtime_error("Incompatible dimensions");
+FrTensor zkFC::operator()(const FrTensor &X) const
+{
+    if (X.size % inputSize != 0)
+        throw std::runtime_error("Incompatible dimensions");
     uint batchSize = X.size / inputSize;
     dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
     dim3 gridSize((outputSize + blockSize.x - 1) / blockSize.x, (batchSize + blockSize.y - 1) / blockSize.y);
@@ -125,9 +151,11 @@ FrTensor zkFC::operator()(const FrTensor& X) const {
     return out;
 }
 
-void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const {
+void zkFC::prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const
+{
     // cout << X.size << " " << inputSize << endl;
-    if (X.size % inputSize != 0) {
+    if (X.size % inputSize != 0)
+    {
         throw std::runtime_error("Incompatible dimensions 1");
     }
     uint batchSize = X.size / inputSize;
@@ -143,4 +171,3 @@ void zkFC::prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) c
     Z(u_Z);
     generators.open(weights, com, concatenate({u_out_dim, u_in_dim}));
 }
-
diff --git a/zkfc.cuh b/zkfc.cuh
index c3db76a..1cf74ac 100644
--- a/zkfc.cuh
+++ b/zkfc.cuh
@@ -6,42 +6,42 @@
 #include
 #include
 #include
-#include "bls12-381.cuh" // adjust this to point to the blstrs header file
+#include "bls12-381.cuh"  // adjust this to point to the blstrs header file
 #include "fr-tensor.cuh"
 #include "proof.cuh"
 #include "commitment.cuh"
 
 #define TILE_WIDTH 16
 
-class zkFC {
+class zkFC
+{
 private:
-    
     FrTensor weights;
     G1TensorJacobian com;
 
 public:
     const uint inputSize;
     const uint outputSize;
-    //zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
-    zkFC(uint input_size, uint output_size, const FrTensor& t, const Commitment& generators);
-    FrTensor operator()(const FrTensor& X) const;
-    void prove(const FrTensor& X, const FrTensor& Z, Commitment& generators) const;
+    // zkFC(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
+    zkFC(uint input_size, uint output_size, const FrTensor &t, const Commitment &generators);
+    FrTensor operator()(const FrTensor &X) const;
+    void prove(const FrTensor &X, const FrTensor &Z, Commitment &generators) const;
     // static zkFC random_fc(uint input_size, uint output_size, uint num_bits, const Commitment& generators);
-    static zkFC from_float_gpu_ptr (uint input_size, uint output_size, float* float_gpu_ptr, const Commitment& generators);
-    static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float* input_ptr);
+    static zkFC from_float_gpu_ptr(uint input_size, uint output_size, float *float_gpu_ptr, const Commitment &generators);
+    static FrTensor load_float_gpu_input(uint batch_size, uint input_dim, float *input_ptr);
 };
 
-KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int colsA, int colsB);
+KERNEL void matrixMultiplyOptimized(Fr_t *A, Fr_t *B, Fr_t *C, int rowsA, int colsA, int colsB);
 
 // KERNEL void random_init(Fr_t* params, uint num_bits, uint n)
 // {
 //     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 //     curandState state;
-    
+
 //     // Initialize the RNG state for this thread.
-//     curand_init(1234, tid, 0, &state);
-    
+//     curand_init(1234, tid, 0, &state);
+    
 //     if (tid < n) {
 //         params[tid] = {curand(&state) & ((1U << num_bits) - 1), 0, 0, 0, 0, 0, 0, 0};
 //         params[tid] = blstrs__scalar__Scalar_mont(blstrs__scalar__Scalar_sub(params[tid], {1U << (num_bits - 1), 0, 0, 0, 0, 0, 0, 0}));
@@ -50,6 +50,6 @@ KERNEL void matrixMultiplyOptimized(Fr_t* A, Fr_t* B, Fr_t* C, int rowsA, int co
 
 DEVICE Fr_t float_to_Fr(float x);
 
-KERNEL void float_to_Fr_kernel(float* fs, Fr_t* frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);
+KERNEL void float_to_Fr_kernel(float *fs, Fr_t *frs, uint fs_num_window, uint frs_num_window, uint fs_window_size, uint frs_window_size);
 
-#endif  // ZKFC_CUH
+#endif // ZKFC_CUH
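For a quick local sanity check of the new detection path, `gpu_arch.sh` can be run on its own before invoking the build. The transcript below is a minimal sketch: the printed value is illustrative and depends on the installed GPU (an RTX 3090, with compute capability 8.6, would report `sm_86`), and the `--query-gpu=compute_cap` query assumes a reasonably recent NVIDIA driver.

```bash
# Illustrative walkthrough of the detection step; the output shown is an example, not a guaranteed value.
./gpu_arch.sh
# -> sm_86   (empty output with exit code 1 means detection failed)

# The Makefile picks up the same value via GPU_ARCH := $(shell ./gpu_arch.sh)
# and forwards it to nvcc as -arch=$(GPU_ARCH) when building the demo.
make demo
```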