diff --git a/buildTensorflow.h b/buildTensorflow.h
index cf2cf4c..88d31a2 100644
--- a/buildTensorflow.h
+++ b/buildTensorflow.h
@@ -1,4 +1,9 @@
+// Build flag: false because this header configures the CPU-only build
+bool gpu = false;
+
 #include "types/tensor.h" 
 #include "overloads/tensor.h"
 #include "operations/operations_Impl.h"
 #include "layers/dense.h"
+#include "optims/sgd.h"
+#include "data/celsius2faranheit.h"
\ No newline at end of file
diff --git a/buildTensorflowGpu.h b/buildTensorflowGpu.h
new file mode 100644
index 0000000..b8d69d1
--- /dev/null
+++ b/buildTensorflowGpu.h
@@ -0,0 +1,11 @@
+// Build flag: true because this header configures the GPU build
+bool gpu = true;
+
+#include "gpu/defn.h" // Includes GPU Kernel Code Defination for Forward pass
+#include "types/tensor.h"
+#include "gpu/impl.h" // Includes GPU Kernel Code Implementation
+#include "overloads/tensor.h"
+#include "operations/operations_Impl.h"
+#include "layers/dense.h"
+#include "optims/sgd.h"
+#include "data/celsius2faranheit.h"
\ No newline at end of file
diff --git a/data/celsius2faranheit.h b/data/celsius2faranheit.h
new file mode 100644
index 0000000..1964abb
--- /dev/null
+++ b/data/celsius2faranheit.h
@@ -0,0 +1,56 @@
+/*
+    This file defines the Celsius To Faranheit DataLoader. Its inputs are the celsius values and
+    its targets are the corresponding faranheit values.
+
+    The way to use this dataset is as follows:
+
+    Celsius2Faranheit<float,float> dataloader;
+    dataloader.create(10); // Creates 10 training examples
+
+    for(auto i: dataloader.data) {
+        auto inp = i.first;
+        auto tar = i.second;
+
+        // And then use this above data in your model for training or inference
+    }
+
+    Note that the data is not returned as tensors. Each value simply has the data type the user
+    specifies in the dataloader definition. In the above case the input and targets are both floats.
+
+*/
+
+#include "data/dataloader.h"
+#include <stdlib.h>
+
+#ifndef __C2F_DATASET_INCLUDED__   
+#define __C2F_DATASET_INCLUDED__ 
+
+template<typename I, typename T>
+class Celsius2Faranheit: public DataLoader<I,T> {
+
+    private:
+    int MAX_CELSIUS = 10; // Largest celsius value that create() generates
+
+    // Helper that maps a celsius value to faranheit: F = 9C/5 + 32
+    T toFaranheit(I input) {
+        return (9*input)/5 + 32;
+    }
+
+    public:
+
+    // Appends one (input, target) example to the dataset
+    void add(I input, T target) {
+        this->data.emplace_back(input,target);
+    }
+
+    // Appends num_examples randomly generated examples to the dataset.
+    void create(int num_examples) {
+        for(int remaining = num_examples; remaining > 0; --remaining) {
+            I celsius = rand() % MAX_CELSIUS + 1; // random int value between 1 and MAX_CELSIUS
+            T faranheit = toFaranheit(celsius);
+            add(celsius, faranheit);
+        }
+    }
+};
+
+#endif
diff --git a/data/dataloader.h b/data/dataloader.h
new file mode 100644
index 0000000..22d9a8e
--- /dev/null
+++ b/data/dataloader.h
@@ -0,0 +1,23 @@
+/*
+    This file defines the base class of each Dataset in the project. The data is stored in a simple
+    vector and each object in the vector is a pair signifying input and target (ground truth).
+*/
+
+#include<iostream>
+#include<vector>
+
+#ifndef __DATALOADER_INCLUDED__   
+#define __DATALOADER_INCLUDED__ 
+
+template<typename I, typename T>
+class DataLoader {
+    public:
+    // All the (input, target) examples of the dataset
+    std::vector<std::pair<I,T>> data;
+
+    // Adds one example to "data"; every dataset provides its own implementation.
+    virtual void add(I input, T target) = 0;
+    virtual ~DataLoader() {} // lets a dataset be deleted through a DataLoader pointer
+};
+
+#endif
diff --git a/gpu/defn.h b/gpu/defn.h
new file mode 100644
index 0000000..2d2d39f
--- /dev/null
+++ b/gpu/defn.h
@@ -0,0 +1 @@
+#include "gpu/dot/defn.h"
\ No newline at end of file
diff --git a/gpu/dot.h b/gpu/dot/defn.h
similarity index 62%
rename from gpu/dot.h
rename to gpu/dot/defn.h
index a16db39..9da72d8 100644
--- a/gpu/dot.h
+++ b/gpu/dot/defn.h
@@ -1,5 +1,7 @@
-#ifndef __GPU_DOT_INCLUDED__   
-#define __GPU_DOT_INCLUDED__  
+#include "utils/common.h"
+
+#ifndef __GPU_DOT_DEFN_INCLUDED__   
+#define __GPU_DOT_DEFN_INCLUDED__  
 
 template<typename T>
 struct Matrix;
@@ -8,4 +10,3 @@ template<typename T>
 void dotGPU(vector<T> &res, const Matrix<T>* lhs, const Matrix<T> &rhs, int start, int startRes);
 
 #endif
-
diff --git a/gpu/dot.cu b/gpu/dot/impl.cuh
similarity index 63%
rename from gpu/dot.cu
rename to gpu/dot/impl.cuh
index 6b9bfc8..2f7a7f7 100644
--- a/gpu/dot.cu
+++ b/gpu/dot/impl.cuh
@@ -1,17 +1,20 @@
-#include "utils/common.h"
-#include "types/matrix.h"
 
+#ifndef __GPU_DOT_IMPL_INCLUDED__   
+#define __GPU_DOT_IMPL_INCLUDED__  
+
+// TODO need to refactor this to different files and 
+// figure out a way to link it for the GPU build 
 template<typename T>
-__global__ void mm(T* a, T* b, T* c, T width) {
+__global__ void mm(T* a, T* b, T* c, int width, int second) { // c = a.b; width: columns of a, second: columns of b and c (ints, as they index memory)
 
     int x = blockIdx.x; // block id
     int y = threadIdx.x; // thread id
     T temp = 0;
     for(int i = 0;i< width;i++) {
-        temp += a[x*width + i]*b[i*width+ y];
+        temp += a[x*width + i]*b[i*second+ y]; // a(x,i) * b(i,y), both stored row by row
     }
 
-    c[x*width + y] = temp;
+    c[x*second + y] = temp; // each thread writes one element: row x (block), column y (thread)
 }
 
 template<typename T>
@@ -27,8 +30,8 @@ void dotGPU(vector<T> &res, const Matrix<T> *lhs, const Matrix<T> &rhs, int star
 
     // Copy to CUDA memory
 
-    T* h_A = lhs->val.data();
-    T* h_B = rhs.val.data();
+    const T* h_A = lhs->val.data();
+    const T* h_B = rhs.val.data();
     T* h_C = res.data();
 
     T *d_a, *d_b, *d_c;
@@ -40,8 +43,16 @@ void dotGPU(vector<T> &res, const Matrix<T> *lhs, const Matrix<T> &rhs, int star
     cudaMemcpy((void *)d_a, h_A + start, sizeof(T)*row1*col1, cudaMemcpyHostToDevice);
     cudaMemcpy((void *)d_b, h_B, sizeof(T)*row2*col2, cudaMemcpyHostToDevice);
 
-    mm<T><<<row1,col2>>>(d_a,d_b,d_c,col1);
+    mm<T><<<row1,col2>>>(d_a,d_b,d_c,col1,col2); // non blocking function
 
     // Copy back from cuda memory
-    cudaMemcpy(h_C+startRes, (void **)d_c, sizeof(T)*row1*col2, cudaMemcpyDeviceToHost);
+    cudaMemcpy(h_C+startRes, (void **)d_c, sizeof(T)*row1*col2, cudaMemcpyDeviceToHost); // waits for kernel to get over
+
+    // Clean Up 
+    cudaFree(d_a);
+    cudaFree(d_b);
+    cudaFree(d_c);
 }
+
+#endif
+
diff --git a/gpu/impl.h b/gpu/impl.h
new file mode 100644
index 0000000..44e3668
--- /dev/null
+++ b/gpu/impl.h
@@ -0,0 +1 @@
+#include "gpu/dot/impl.cuh"
\ No newline at end of file
diff --git a/layers/dense.h b/layers/dense.h
index e3b35a1..b323175 100644
--- a/layers/dense.h
+++ b/layers/dense.h
@@ -37,6 +37,9 @@ class Dense{
         if(init == GLOROT) {
             return utils::glorotInit<T>(fan_in, fan_out);
         }
+
+        // Default return zero vector
+        return vector<T>(fan_in*fan_out,0);
     }
 
     public:
diff --git a/main.cpp b/main.cpp
index 54d6f67..b67c9e9 100644
--- a/main.cpp
+++ b/main.cpp
@@ -1,171 +1,54 @@
 #include "buildTensorflow.h"
 
-void oldSigmoidTest() {
-
-    Tensor<float> w0({2},{1});
-    Tensor<float> x0({-1},{1});
-
-    Tensor<float> w1({-3},{1});
-    Tensor<float> x1({-2},{1});
-
-    Tensor<float> w3({-3},{1});
-    
-    Tensor<float> a = w0*x0;
-    Tensor<float> b = w1*x1;
-    Tensor<float> c = a + b;
-    Tensor<float> d = w3+c;
-    Tensor<float> e({-1}, {1});
-    Tensor<float> f = d*e;
-    Tensor<float> g = f.exp();
-    Tensor<float> h({1}, {1});
-    Tensor<float> i = g + h;
-    Tensor<float> j({1}, {1});
-    Tensor<float> k = j/i;
-    
-    vector<float> vsl = {1};
-    vector<int> sh = {1};
-    auto grad = Matrix<float>(vsl,sh);
-    k.backward(grad);
-
-
-    cout<<w0.grad<<endl;
-    cout<<x0.grad<<endl;
-
-    cout<<w1.grad<<endl;
-    cout<<x1.grad<<endl;
-
-    cout<<w3.grad<<endl;
-}
-
-// WRONG BACKPROP: SOME ERROR WITH POINTERS AND OPERATION OVERLOADING.
-void newSigmoidTest() {
-    Tensor<float> w0({2},{1});
-    Tensor<float> x0({-1},{1});
-
-    Tensor<float> w1({-3},{1});
-    Tensor<float> x1({-2},{1});
-
-    Tensor<float> w3({-3},{1});
-    Tensor<float> e({-1}, {1});
-    Tensor<float> h({1}, {1});
-    Tensor<float> j({1}, {1});
-
-    Tensor<float> a = e*(w0*x0 + w1*x1 + w3);
-    Tensor<float> k = j/(a.exp() + h);
-
-    vector<float> vsl = {1};
-    vector<int> sh = {1};
-    auto grad = Matrix<float>(vsl,sh);
-    k.backward(grad);
-
-    cout<<w0.grad<<endl;
-    cout<<x0.grad<<endl;
-
-    cout<<w1.grad<<endl;
-    cout<<x1.grad<<endl;
-
-    cout<<w3.grad<<endl;
-}
-
-/*
-API guide:
-
-    1. Tensor(vector) - 1 D vector
-    2. Tensor(vector, row, col) - 2 D vector
-    Operations are add, dot, multiply, divide, exponent for tensors
-
-    Matrix size will always be batch size, channels, height width
-    Or batch size, embedding size, y
-
-    Rules during simple add, sub. divide, multiply use broadcasting
-    During matrix multiply, then 2D matrices can only be multiplied. 
-
-    Cases will be multilayer perceptron, batch size, input vector
-    layer weights would be input vector, output layer, hence add one to batch size dim
-    The matrix multiply the uses two interior dims as 2D matrices as input
-    Output will be 
-
-*/
-
-void sigmoidPointerTest() {
-
-    Tensor<float>* w0 = new Tensor<float>({2},{1});
-    Tensor<float>* x0= new Tensor<float>({-1},{1});
-
-    Tensor<float>* w1= new Tensor<float>({-3},{1});
-    Tensor<float>* x1= new Tensor<float>({-2},{1});
-
-    Tensor<float>* w3= new Tensor<float>({-3},{1});
-    
-    auto a = tensorOps::multiply(w0,x0);
-    auto b = tensorOps::multiply(w1,x1);
-    auto c = tensorOps::add(a,b);
-    auto d = tensorOps::add(w3,c);
-
-    Tensor<float>* e = new Tensor<float>({-1}, {1});
-    auto f = tensorOps::multiply(d,e);
-
-    auto g = tensorOps::exp(f); // exponent
-
-    Tensor<float>* h = new Tensor<float>({1}, {1});
-    auto i = tensorOps::add(g,h);
-
-    Tensor<float>* j = new Tensor<float>({1}, {1});
-    auto k = tensorOps::divide(j,i);
-    
-    auto grad = Matrix<float>({1},{1});
-    k->backward(grad);
-
-
-    cout<<w0->grad<<endl;
-    cout<<x0->grad<<endl;
-
-    cout<<w1->grad<<endl;
-    cout<<x1->grad<<endl;
-
-    cout<<w3->grad<<endl;
-}
-
-void updatedSigmoidtest() {
-    Tensor<float>* w0 = new Tensor<float>({2},{1});
-    Tensor<float>* x0= new Tensor<float>({-1},{1});
-
-    Tensor<float>* w1= new Tensor<float>({-3},{1});
-    Tensor<float>* x1= new Tensor<float>({-2},{1});
-
-    Tensor<float>* w3= new Tensor<float>({-3},{1});
-    
-    auto a = tensorOps::multiply(w0,x0);
-    auto b = tensorOps::multiply(w1,x1);
-    auto c = tensorOps::add(a,b);
-    auto d = tensorOps::add(w3,c);
-
-    auto k = tensorOps::sigmoid(d);
-    k->backward();
-
-    cout<<w0->grad<<endl;
-    cout<<x0->grad<<endl;
-
-    cout<<w1->grad<<endl;
-    cout<<x1->grad<<endl;
-
-    cout<<w3->grad<<endl;
-
-}
-
-
+// Example of training a network on the buildTensorflow framework.
 int main() {
-    Tensor<float> a({2},{1});
-    Tensor<float> b({4},{1});
-
-    auto loss = a+b;
-    cout<<loss.val<<endl;
-    loss.backward();
+    // Load Dataset: 5 randomly generated (celsius, faranheit) examples
+    Celsius2Faranheit<float,float> dataset;
+    dataset.create(5);
 
-    oldSigmoidTest();
-    newSigmoidTest();
-    sigmoidPointerTest();
-    updatedSigmoidtest();
+    // Create Model: a dense layer with 1 input, 1 output and no activation
+    Dense<float> fc1(1,1,NO_ACTIVATION);
 
+    // Initialise Optimiser with a learning rate of 0.01
+    SGD<float> sgd(0.01);
+    
+    // Train: 2000 passes over the dataset, one update per example
+    cout<<"Training started"<<endl;
+    for(int j = 0;j<2000;j++) {
+        for(auto i: dataset.data) {
+            // Get data as 1x1 tensors
+            auto inp = new Tensor<float>({i.first}, {1,1});
+            auto tar = new Tensor<float>({i.second}, {1,1});
+
+            // Forward Prop
+            auto out = fc1.forward(inp);
+
+            // Get Loss: (out + (-1)*tar)^2
+            auto l = new Tensor<float>({-1}, {1,1});
+            auto k = tensorOps::multiply(l,tar);
+            auto loss = tensorOps::add(out,k); // error in the prediction: out - tar
+            auto finalLoss = tensorOps::power(loss,(float)2);
+
+            // Compute backProp
+            finalLoss->backward();
+            // cout<<finalLoss->val<<endl;
+
+            // Perform Gradient Descent (also resets the gradients for the next example)
+            sgd.minimise(finalLoss);
+        
+        }
+    }
+
+    cout<<"Training completed"<<endl;
+
+    // Inference
+    float cel = 4;
+    auto test = new Tensor<float>({cel}, {1,1});
+    auto out1 = fc1.forward(test);
+
+    cout<<"The conversion of "<<cel<<" degrees celcius to faranheit is "<<out1->val<<endl; // For 4 Celcius: it's ~39.2
+
+    // Clean up
+    delete out1;
 }
 
diff --git a/main.cu b/main.cu
new file mode 100644
index 0000000..ef148ce
--- /dev/null
+++ b/main.cu
@@ -0,0 +1,54 @@
+#include "buildTensorflowGpu.h"
+
+// Example of training a network on the buildTensorflow framework.
+int main() {
+    // Build a dataset of 5 random (celsius, faranheit) examples
+    Celsius2Faranheit<float,float> dataset;
+    dataset.create(5);
+
+    // Model: one dense layer with 1 input, 1 output and no activation
+    Dense<float> fc1(1,1,NO_ACTIVATION);
+
+    // Optimiser: gradient descent with a learning rate of 0.01
+    SGD<float> sgd(0.01);
+
+    // Train for 2000 passes over the dataset, updating after every example
+    cout<<"Training started"<<endl;
+    for(int epoch = 0; epoch < 2000; epoch++) {
+        for(auto example: dataset.data) {
+            // Wrap the example into 1x1 tensors
+            auto inp = new Tensor<float>({example.first}, {1,1});
+            auto tar = new Tensor<float>({example.second}, {1,1});
+
+            // Forward pass
+            auto out = fc1.forward(inp);
+
+            // Squared error: (out + (-1)*tar)^2
+            auto minusOne = new Tensor<float>({-1}, {1,1});
+            auto negTar = tensorOps::multiply(minusOne,tar);
+            auto error = tensorOps::add(out,negTar);
+            auto finalLoss = tensorOps::power(error,2.0f);
+
+            // Backward pass: compute the gradients of the loss
+            finalLoss->backward();
+
+            // Gradient descent update, which also resets the gradients
+            sgd.minimise(finalLoss);
+            // NOTE(review): tensors created in this iteration are not deleted here - confirm ownership
+        }
+    }
+
+    cout<<"Training completed"<<endl;
+
+    // Inference on a value of 4 degrees celsius
+    float cel = 4;
+    auto test = new Tensor<float>({cel}, {1,1});
+    auto out1 = fc1.forward(test);
+
+    // For 4 Celcius the exact answer is 39.2
+    cout<<"The conversion of "<<cel<<" degrees celcius to faranheit is "<<out1->val<<endl;
+
+    // Clean up
+    delete out1;
+}
+
diff --git a/operations/operation.h b/operations/operation.h
index 0a3f721..901acb1 100644
--- a/operations/operation.h
+++ b/operations/operation.h
@@ -40,6 +40,11 @@ class Operation {
     
     // New API for forward Prop
     virtual Tensor<T>* forward() = 0;
+
+    // Frees both input tensors; virtual so that deleting a derived operation via an
+    // Operation<T>* is well defined. NOTE(review): a tensor used as the input of several
+    // operations would be freed by each of them - confirm the intended ownership.
+    virtual ~Operation() { delete t1; delete t2; }
     
 };
 
diff --git a/operations/operations_Impl.h b/operations/operations_Impl.h
index 7ed6af9..bc788c6 100644
--- a/operations/operations_Impl.h
+++ b/operations/operations_Impl.h
@@ -1,5 +1,6 @@
 /*
-    This file includes all the operator implementations
+    This file includes all the operator implementations. Be sure to include your operation 
+    implementation here for the project to be able to use your operation.
 */
 
 #ifndef __OP_IMPL_INCLUDED__   
@@ -11,6 +12,7 @@
 #include "operations/dotOperation_Impl.h"
 #include "operations/exponentOperation_Impl.h"
 #include "operations/sigmoidOperation_Impl.h"
+#include "operations/powerOperation_Impl.h"
 
 #endif
 
diff --git a/operations/powerOperation.h b/operations/powerOperation.h
new file mode 100644
index 0000000..05d32c1
--- /dev/null
+++ b/operations/powerOperation.h
@@ -0,0 +1,28 @@
+/*
+    This file defines the PowerOperation class which represents the
+    exponentiation of a tensor with a scalar.
+*/
+
+#include "operations/operation.h"
+
+#ifndef __OP_POWER_INCLUDED__
+#define __OP_POWER_INCLUDED__
+
+template <typename T>
+class PowerOperation : public Operation<T> {
+    public:
+    T pow; // scalar exponent that the input tensor is raised to
+
+    // Stores the input tensor and the exponent
+    PowerOperation(Tensor<T> *t1, T pow) : pow(pow) {
+        this->t1 = t1;
+    }
+    void backward(Matrix<T> grad); // passes the gradient on to t1
+
+    Tensor<T> forwardDeprecated(); // old API that returns the tensor by value
+
+    Tensor<T>* forward(); // computes t1^pow and returns a pointer to the result
+};
+
+#endif
+
diff --git a/operations/powerOperation_Impl.h b/operations/powerOperation_Impl.h
new file mode 100644
index 0000000..f16f05e
--- /dev/null
+++ b/operations/powerOperation_Impl.h
@@ -0,0 +1,44 @@
+/*
+    This file contains the implementation of the forward and backward pass of
+    the power operation.
+*/
+
+#include "operations/powerOperation.h"
+
+#ifndef __OP_IMPL_POWER_INCLUDED__
+#define __OP_IMPL_POWER_INCLUDED__
+
+/* 
+    Backpropogation of the power operation.
+    
+    F = x^pow is forward propagation
+    The gradient would be as follows:
+    1. dF/dx = pow*x^(pow-1)
+*/
+template <typename T> void PowerOperation<T>::backward(Matrix<T> grad) {
+    auto localGrad = this->pow * matrixOps::power(this->t1->val, this->pow - 1); // dF/dx = pow*x^(pow-1)
+    this->t1->backward(grad * localGrad); // chain rule: incoming gradient times dF/dx
+}
+
+/* 
+    Forward Propogation of the operation. Returns a tensor.
+
+    TODO: Remove: See addition operation impl for more details
+*/
+template <typename T>
+Tensor<T> PowerOperation<T>::forwardDeprecated() {
+    return NULL; // placeholder only: the deprecated API is not implemented for the power operation
+}
+
+/* 
+    Forward Propogation of the operation. Return pointer to the tensor.
+    Forward propogation is simply y = x^(pow).
+*/
+template <typename T>
+Tensor<T>* PowerOperation<T>::forward() {
+    // The result tensor is built from t1^pow and this operation, and is also cached in t3
+    return this->t3 = new Tensor<T>(matrixOps::power(this->t1->val, this->pow), this);
+}
+
+#endif
+
diff --git a/optims/optim.h b/optims/optim.h
new file mode 100644
index 0000000..64eee83
--- /dev/null
+++ b/optims/optim.h
@@ -0,0 +1,37 @@
+/*
+    This file defines the Base Class for all Optimizers.
+*/
+
+#include "types/tensor.h"
+#include "unordered_set"
+
+#ifndef __OPTIM_BASE_INCLUDED__   
+#define __OPTIM_BASE_INCLUDED__ 
+
+template<typename T>
+class Optimizer {
+    
+    public:
+    // All the tensors that need to be updated via the optimiser
+    unordered_set<Tensor<T>*> params;
+
+    // The learning rate; zero until a derived optimiser assigns it
+    T lr;
+
+    Optimizer() : lr(0) {}
+
+    // Virtual so that an optimiser can be deleted through an Optimizer<T>* pointer
+    virtual ~Optimizer() {}
+
+    // Resets the gradients of the tensors in params to zero for the next forward pass
+    void zeroGrad() {
+        for(auto param : params) {
+            param->zeroGrad();
+        }
+    }
+
+    // Overridden by derived optimisers to perform one optimisation step; does nothing here
+    virtual void step(T learning_rate) {}
+};
+
+#endif
\ No newline at end of file
diff --git a/optims/sgd.h b/optims/sgd.h
new file mode 100644
index 0000000..7c48c9b
--- /dev/null
+++ b/optims/sgd.h
@@ -0,0 +1,102 @@
+/*
+    This file defines the Stochastic Gradient Descent Optimiser. The Stochastic Gradient Descent
+    Optimizer takes the loss computed over a single training example or the averages of the loss
+    computed with multiple training examples and "minimises" the loss.
+
+    By minimising, we mean it finds out all the updatable tensors that contributed towards 
+    computing this loss. Once it has these parameters it performs an update step on each 
+    parameter (Tensor) to tweak them into the right direction to minimise the overall loss.
+
+    It performs this update step by this formula:
+    
+    val = val - learning_rate*gradient_of_val
+    
+    Where val is the value of the tensor and gradient_of_val is the partial gradient of the
+    tensor with respect to the loss.
+*/
+
+#include "optims/optim.h"
+#include "queue"
+
+#ifndef __OPTIM_SGD_INCLUDED__   
+#define __OPTIM_SGD_INCLUDED__ 
+
+
+template<typename T>
+class SGD : public Optimizer<T> {
+
+    public:
+
+    // Creates an optimiser with learning rate lr and an empty set of tracked tensors
+    SGD(T lr) {
+        this->lr = lr;
+        // Start without any tracked tensors
+        this->params.clear();
+    }
+
+    /*
+        Walks the computational graph that produced the Tensor x and stores every
+        Tensor reached through the inputs (t1 and t2) of the backward operations
+        in the params set. The Tensor x itself is not stored.
+
+        The params set is emptied first, so after the call it only describes the
+        graph of x. These are the tensors that step() will update.
+
+        The graph is traversed breadth first and each tensor is visited once.
+    */
+    void getParams(Tensor<T>* x) {
+
+        this->params.clear(); // Forget the tensors of any previous graph
+
+        queue<Tensor<T>*> pending;
+        pending.push(x);
+
+        while(!pending.empty()) {
+
+            Tensor<T>* node = pending.front();
+            pending.pop();
+
+            auto producer = node->backOp;
+            if(!producer) {
+                continue; // no backward operation, so nothing further to follow
+            }
+
+            // Handle the first input (t1) before the second input (t2)
+            Tensor<T>* inputs[2] = {producer->t1, producer->t2};
+            for(auto input : inputs) {
+                // insert() reports through .second whether the tensor is new
+                if(input != NULL && this->params.insert(input).second) {
+                    pending.push(input);
+                }
+            }
+        }
+    }
+
+    /*
+        The function users call to perform one gradient descent update for their
+        model. It works in 3 phases:
+        1. Collects all tensors of the graph of x
+        2. Updates all these tensors via the step function
+        3. Clears the gradients of these tensors for the next step
+
+        The gradients are not computed here: call backward() on x beforehand.
+    */
+    void minimise(Tensor<T>* x) {
+        this->getParams(x);      // phase 1
+        this->step(this->lr);    // phase 2
+        this->zeroGrad();        // phase 3
+    }
+
+    // Performs 1 step of gradient descent on every tensor in params:
+    // val = val - learning_rate*grad
+    void step(T learning_rate) {
+        for(auto it = this->params.begin(); it != this->params.end(); ++it) {
+            Tensor<T>* param = *it;
+            // Move the value against the direction of its gradient
+            param->val = param->val - learning_rate*param->grad;
+        }
+    }
+
+};
+
+#endif
\ No newline at end of file
diff --git a/overloads/matrix.h b/overloads/matrix.h
index dfb5dc8..2ebe183 100644
--- a/overloads/matrix.h
+++ b/overloads/matrix.h
@@ -7,12 +7,20 @@
 #ifndef __MATRIX_OPS_INCLUDED__   
 #define __MATRIX_OPS_INCLUDED__  
 
-// Sigmoid 
+
 namespace matrixOps {
+
+    // Sigmoid Operation
     template<typename T>
     Matrix<T> sigmoid(const Matrix<T> &a) {
         return (T)1/((T)1 + (((T)-1)*a).exp());
     }
+
+    // Power Operation: wrapper around the ^ operator of Matrix, raising a to the scalar pow
+    template<typename T>
+    Matrix<T> power(Matrix<T> &a, T pow) {
+        return a^pow;
+    }
 };
 
 // Overloaded function for printing matrix: cout<<matrix<<endl;
@@ -46,4 +54,12 @@ Matrix<T> operator + (const T t, const Matrix<T> &rhs) {
     return Matrix<T>(res, resShape);
 }
 
+// Subtraction of a matrix from a scalar: the result is built from t - rhs.val
+template<typename T>
+Matrix<T> operator - (const T t, const Matrix<T> &rhs) {
+    auto res =  t-rhs.val; // scalar minus the values held by rhs
+    auto resShape = rhs.shape; // the result has the same shape as rhs
+    return Matrix<T>(res, resShape);
+}
+
 #endif
\ No newline at end of file
diff --git a/overloads/tensor.h b/overloads/tensor.h
index f61733d..695ce0d 100644
--- a/overloads/tensor.h
+++ b/overloads/tensor.h
@@ -76,6 +76,13 @@ namespace tensorOps {
         return one->frontOp->forward();
     }
 
+    // Power: attaches a PowerOperation with exponent t to the tensor and runs its forward pass
+    template<typename T>
+    Tensor<T>* power(Tensor<T>* one, T t) {
+        one->frontOp = new PowerOperation<T>(one, t); // the tensor records the operation it feeds
+        return one->frontOp->forward(); // pointer to the tensor produced by the operation
+    }
+
 };
 
 #endif
diff --git a/overloads/vector.h b/overloads/vector.h
index 9aa14ba..8d8e7c0 100644
--- a/overloads/vector.h
+++ b/overloads/vector.h
@@ -62,6 +62,29 @@ vector<T> operator + (T a, const vector<T> &b) {
     return arr;
 }
 
+// Subtraction: element-wise a - b for two vectors of the same size (asserts on a size mismatch)
+template<typename T>
+vector<T> operator - (const vector<T> &a, const vector<T> &b) {
+    assert("Tensors are not of the same size !" && a.size() == b.size());
+    vector<T> arr;
+    arr.reserve(a.size()); // one allocation for the whole result
+    for(size_t i = 0;i<a.size();i++) {
+        arr.push_back(a[i]-b[i]);
+    }
+    return arr;
+}
+
+// Scalar Subtraction: returns a vector holding a - x for every
+// element x of b, in the same order
+template<typename T>
+vector<T> operator - (T a, const vector<T> &b) {
+    vector<T> diff;
+    for(const T &elem : b) {
+        diff.push_back(a - elem);
+    }
+    return diff;
+}
+
 // Vector Divide
 template<typename T>
 vector<T> operator / (vector<T> &a, const vector<T> &b) {
diff --git a/tests/dense.h b/tests/dense.h
index 18b9f8b..4072a94 100644
--- a/tests/dense.h
+++ b/tests/dense.h
@@ -15,11 +15,14 @@ TEST(DENSE_LAYER_TESTS, SHAPE_CHECKS) {
     Tensor<float>* x1 = new Tensor<float>({1,2},{1,2}); // put 1 by 2 tensor
     auto m = fc1.forward(x1); // should work fine
 
-    Tensor<float>* x2 = new Tensor<float>({1},{1}); // put 1 by 2 tensor
+    delete m;
 
     ASSERT_DEATH({
+       Tensor<float>* x2 = new Tensor<float>({1},{1}); // put 1 by 2 tensor
+       Dense<float> fc1(2,5); // input - 2, output should be 5
        auto m = fc1.forward(x2); // should give error as dot product will not be compatible !
     }, "Shapes aren't compatible for dot product !");
+
 }
 
 /*
@@ -41,4 +44,6 @@ TEST(DENSE_LAYER_TESTS, CORRECTNESS_CHECK) {
     auto expectedVal = matrixOps::sigmoid((x->val).dot(w) + b);
 
     ASSERT_TRUE(testUtils::isMatrixEqual(m->val, expectedVal));
+
+    delete m;
 }
diff --git a/tests/main.cpp b/tests/main.cpp
index cde9948..6065f3c 100644
--- a/tests/main.cpp
+++ b/tests/main.cpp
@@ -7,6 +7,7 @@
 #include "tests/matrix.h"
 #include "tests/tensor.h"
 #include "tests/dense.h"
+#include "tests/sgd.h"
 
 int main(int argc, char **argv) {
     testing::InitGoogleTest(&argc, argv);
diff --git a/tests/matrix.h b/tests/matrix.h
index 8aa14bd..1e4dc54 100644
--- a/tests/matrix.h
+++ b/tests/matrix.h
@@ -127,6 +127,28 @@ TEST( MATRIX_TESTS, MatrixOperationMultiplicationCheck) {
     ASSERT_TRUE(testUtils::isMatrixEqual<int>(ans,res));
 }
 
+/*
+    This test tests the accuracy of the power operation between a matrix and a scalar
+*/
+TEST( MATRIX_TESTS, MatrixOperationPowerCheck) {
+
+    // Cubing through the overloaded ^ operator
+    vector<int> values({1,2,3});
+    vector<int> shape({1,3});
+    Matrix<int> base(values,shape);
+    int exponent = 3;
+    auto cubed = base^exponent;
+    Matrix<int> expectedCubes({1,8,27},{1,3});
+    ASSERT_TRUE(testUtils::isMatrixEqual<int>(cubed,expectedCubes));
+
+    // Squaring through the matrixOps::power wrapper function
+    Matrix<int> m2({1,2,3},{1,3});
+    exponent = 2;
+    Matrix<int> expectedSquares({1,4,9},{1,3});
+    auto squared = matrixOps::power(m2,exponent);
+    ASSERT_TRUE(testUtils::isMatrixEqual<int>(squared,expectedSquares));
+}
+
 /*
     This test tests the accuracy of the division operation between 2 matrices
 */
@@ -143,17 +165,6 @@ TEST( MATRIX_TESTS, MatrixOperationDivisionCheck) {
     ASSERT_TRUE(testUtils::isMatrixEqual<int>(ans,res));
 }
 
-/*
-    This test tests the accuracy of the power operation between matrice and scalar
-*/
-TEST( MATRIX_TESTS, MatrixOperationPowerCheck) {
-    vector<int> a({1,2,3});
-    vector<int> shape1({1,3});
-    Matrix<int> m1(a,shape1);
-    auto ans = m1^2;
-    Matrix<int> res({1,4,9},{1,3});
-    ASSERT_TRUE(testUtils::isMatrixEqual<int>(ans,res));
-}
 
 /*
     This test tests the accuracy of the exponent operation.
diff --git a/tests/sgd.h b/tests/sgd.h
new file mode 100644
index 0000000..3c40d02
--- /dev/null
+++ b/tests/sgd.h
@@ -0,0 +1,54 @@
+/*
+    This file tests the SGD Optimizer layer.
+*/
+
+#include <gtest/gtest.h>
+#include "optims/sgd.h"
+#include "tests/utils.h"
+#include "overloads/tensor.h"
+
+/*
+    Tests that the optimizer layer gets all the tensors that need to be updated.
+*/
+TEST(SGD_OPTIM_TESTS, TENSOR_UPDATE_CHECK) {
+    auto a = new Tensor<float>({2},{1});
+    auto b = new Tensor<float>({4},{1});
+    auto d = new Tensor<float>({3},{1});
+
+    auto c = tensorOps::add(a,b);       // c = a + b
+    auto e = tensorOps::multiply(c,d);  // e = c * d
+    e->backward();
+
+    SGD<float> sgd(0.1);
+    // collect every tensor that e was computed from
+    sgd.getParams(e);
+    unordered_set<Tensor<float>*> expected_res = {a,b,c,d};
+    ASSERT_TRUE(sgd.params == expected_res);
+
+    // Clean up
+    delete e;
+}
+
+/*
+    Tests that the tensor values are updated according to gradient values and learning rate
+*/
+TEST(SGD_OPTIM_TESTS, SGD_STEP_CHECK) {
+    auto a = new Tensor<float>({2},{1});
+    auto b = new Tensor<float>({4},{1});
+    auto d = new Tensor<float>({3},{1});
+
+    auto c = tensorOps::add(a,b);       // c = a + b = 6
+    auto e = tensorOps::multiply(c,d);  // e = c * d
+    e->backward();
+
+    SGD<float> sgd(1);
+    // run one full update: collect the tensors, step, then zero the gradients
+    sgd.minimise(e);
+
+    ASSERT_TRUE(a->val.val[0] == -1); // update = 2 - 1*3
+    ASSERT_TRUE(b->val.val[0] == 1); // update = 4 - 1*3
+    ASSERT_TRUE(d->val.val[0] == -3); // update = 3 -1*6
+
+    // Clean up
+    delete e;
+}
diff --git a/tests/sigmoidTests.h b/tests/sigmoidTests.h
new file mode 100644
index 0000000..8d9a36a
--- /dev/null
+++ b/tests/sigmoidTests.h
@@ -0,0 +1,155 @@
+#include "buildTensorflow.h"
+
+void oldSigmoidTest() {
+    // Computes 1/(1 + exp(-(w0*x0 + w1*x1 + w3))) with value tensors and prints the gradients
+    Tensor<float> w0({2},{1});
+    Tensor<float> x0({-1},{1});
+
+    Tensor<float> w1({-3},{1});
+    Tensor<float> x1({-2},{1});
+
+    Tensor<float> w3({-3},{1});
+    
+    Tensor<float> a = w0*x0;
+    Tensor<float> b = w1*x1;
+    Tensor<float> c = a + b;
+    Tensor<float> d = w3+c; // d = w0*x0 + w1*x1 + w3
+    Tensor<float> e({-1}, {1});
+    Tensor<float> f = d*e; // f = -d
+    Tensor<float> g = f.exp();
+    Tensor<float> h({1}, {1});
+    Tensor<float> i = g + h; // i = exp(-d) + 1
+    Tensor<float> j({1}, {1});
+    Tensor<float> k = j/i; // k = 1/(1 + exp(-d))
+    
+    vector<float> vsl = {1};
+    vector<int> sh = {1};
+    auto grad = Matrix<float>(vsl,sh); // gradient of 1 fed into the output k
+    k.backward(grad);
+
+    // Print the gradient of each of the five leaf tensors
+    cout<<w0.grad<<endl;
+    cout<<x0.grad<<endl;
+
+    cout<<w1.grad<<endl;
+    cout<<x1.grad<<endl;
+
+    cout<<w3.grad<<endl;
+}
+
+// WRONG BACKPROP: SOME ERROR WITH POINTERS AND OPERATION OVERLOADING.
+void newSigmoidTest() {
+    Tensor<float> w0({2},{1});
+    Tensor<float> x0({-1},{1});
+
+    Tensor<float> w1({-3},{1});
+    Tensor<float> x1({-2},{1});
+
+    Tensor<float> w3({-3},{1});
+    Tensor<float> e({-1}, {1}); // constant -1 used to negate the weighted sum
+    Tensor<float> h({1}, {1}); // constant 1 added to the exponential
+    Tensor<float> j({1}, {1}); // constant 1 used as the numerator
+    // Same expression as oldSigmoidTest, written with nested operator expressions
+    Tensor<float> a = e*(w0*x0 + w1*x1 + w3);
+    Tensor<float> k = j/(a.exp() + h); // k = 1/(exp(a) + 1)
+
+    vector<float> vsl = {1};
+    vector<int> sh = {1};
+    auto grad = Matrix<float>(vsl,sh); // gradient of 1 fed into the output k
+    k.backward(grad);
+
+    cout<<w0.grad<<endl;
+    cout<<x0.grad<<endl;
+
+    cout<<w1.grad<<endl;
+    cout<<x1.grad<<endl;
+
+    cout<<w3.grad<<endl;
+}
+
+/*
+API guide:
+
+    1. Tensor(vector) - 1 D vector
+    2. Tensor(vector, row, col) - 2 D vector
+    Operations are add, dot, multiply, divide, exponent for tensors
+
+    Matrix size will always be batch size, channels, height width
+    Or batch size, embedding size, y
+
+    Rules during simple add, sub. divide, multiply use broadcasting
+    During matrix multiply, then 2D matrices can only be multiplied. 
+
+    Cases will be multilayer perceptron, batch size, input vector
+    layer weights would be input vector, output layer, hence add one to batch size dim
+    The matrix multiply the uses two interior dims as 2D matrices as input
+    Output will be 
+
+*/
+
+void sigmoidPointerTest() {
+    // Builds 1/(1 + exp(-(w0*x0 + w1*x1 + w3))) from heap tensors with the tensorOps functions
+    Tensor<float>* w0 = new Tensor<float>({2},{1});
+    Tensor<float>* x0= new Tensor<float>({-1},{1});
+
+    Tensor<float>* w1= new Tensor<float>({-3},{1});
+    Tensor<float>* x1= new Tensor<float>({-2},{1});
+
+    Tensor<float>* w3= new Tensor<float>({-3},{1});
+    
+    auto a = tensorOps::multiply(w0,x0);
+    auto b = tensorOps::multiply(w1,x1);
+    auto c = tensorOps::add(a,b);
+    auto d = tensorOps::add(w3,c);
+
+    Tensor<float>* e = new Tensor<float>({-1}, {1});
+    auto f = tensorOps::multiply(d,e);
+
+    auto g = tensorOps::exp(f); // exponent
+
+    Tensor<float>* h = new Tensor<float>({1}, {1});
+    auto i = tensorOps::add(g,h);
+
+    Tensor<float>* j = new Tensor<float>({1}, {1});
+    auto k = tensorOps::divide(j,i);
+    
+    auto grad = Matrix<float>({1},{1});
+    k->backward(grad);
+
+    cout<<w0->grad<<endl;
+    cout<<x0->grad<<endl;
+    cout<<w1->grad<<endl;
+    cout<<x1->grad<<endl;
+    cout<<w3->grad<<endl;
+
+    // Clean up, in the same way as updatedSigmoidtest does for its output tensor
+    delete k;
+}
+
+void updatedSigmoidtest() {
+    Tensor<float>* w0 = new Tensor<float>({2},{1});
+    Tensor<float>* x0= new Tensor<float>({-1},{1});
+
+    Tensor<float>* w1= new Tensor<float>({-3},{1});
+    Tensor<float>* x1= new Tensor<float>({-2},{1});
+
+    Tensor<float>* w3= new Tensor<float>({-3},{1});
+    // d = w0*x0 + w1*x1 + w3, built with the tensorOps functions
+    auto a = tensorOps::multiply(w0,x0);
+    auto b = tensorOps::multiply(w1,x1);
+    auto c = tensorOps::add(a,b);
+    auto d = tensorOps::add(w3,c);
+
+    auto k = tensorOps::sigmoid(d); // single sigmoid operation instead of the hand-built chain
+    k->backward();
+
+    cout<<w0->grad<<endl;
+    cout<<x0->grad<<endl;
+
+    cout<<w1->grad<<endl;
+    cout<<x1->grad<<endl;
+
+    cout<<w3->grad<<endl;
+    // Clean up
+    delete k;
+}
\ No newline at end of file
diff --git a/tests/tensor.h b/tests/tensor.h
index e97ef9a..4888a64 100644
--- a/tests/tensor.h
+++ b/tests/tensor.h
@@ -16,7 +16,7 @@ TEST( TENSOR_TESTS, TensorCreation) {
     ASSERT_DEATH({
         vector<int> a({1,2,3,4,5,6});
         vector<int> shape1({2,4});
-        Matrix<int> m1(a,shape1);
+        Tensor<int> m1(a,shape1);
     }, "Shape and size of vector are incompatible !");
 
     // testing for no asserts with various dimensions that can used in nd matrix
@@ -24,9 +24,9 @@ TEST( TENSOR_TESTS, TensorCreation) {
     vector<int> shape1({2,3});
     vector<int> shape2({1,1,1,2,3});
     vector<int> shape3({2,3,1,1,1});
-    Matrix<int> m1(a,shape1);
-    m1 = Matrix<int>(a,shape2);
-    m1 = Matrix<int>(a,shape3);
+    Tensor<int> m1(a,shape1);
+    m1 = Tensor<int>(a,shape2);
+    m1 = Tensor<int>(a,shape3);
 }
 
 /*
@@ -42,6 +42,9 @@ TEST( TENSOR_TESTS, TensorAddOperations) {
     Matrix<int> res({2,4,6,8,10},{5});
 
     ASSERT_TRUE(testUtils::isMatrixEqual(ans->val,res));
+
+    // Clean up
+    delete ans;
 }
 
 
@@ -53,6 +56,9 @@ TEST( TENSOR_TESTS, TensorMultiplyOperations) {
     Matrix<int> res({1,4,9,16,25},{5});
 
     ASSERT_TRUE(testUtils::isMatrixEqual(ans->val,res));
+
+    // Clean up
+    delete ans;
 }
 
 TEST( TENSOR_TESTS, TensorDivideOperations) {
@@ -63,6 +69,9 @@ TEST( TENSOR_TESTS, TensorDivideOperations) {
     Matrix<int> res({5,2,5,2,1},{5});
 
     ASSERT_TRUE(testUtils::isMatrixEqual(ans->val,res));
+
+    // Clean up
+    delete ans;
 }
 
 /*
@@ -90,8 +99,33 @@ TEST( TENSOR_TESTS, TensorSigmoidOperations) {
 
     Matrix<float> resGrad({0.196611926}, {1});
     ASSERT_TRUE(testUtils::isMatrixEqual(one->grad,resGrad)); // check back Propogation
+
+    // Clean up
+    delete ans;
 }
 
+/*
+    Power operation test: cubing [2,3,4] must give [8,27,64] on the forward pass and
+    the gradient 3*x^2 = [12,27,48] on the backward pass.
+*/
+TEST( TENSOR_TESTS, TensorPowerOperations) {
+    float exponent = 3;
+    Tensor<float>* base = new Tensor<float>({2,3,4},{1,3});
+    auto cubed = tensorOps::power(base,exponent);
+
+    // Forward propagation
+    Matrix<float> expectedVal({8,27,64}, {1,3});
+    ASSERT_TRUE(testUtils::isMatrixEqual(cubed->val,expectedVal));
+
+    // Backward propagation
+    cubed->backward();
+    Matrix<float> expectedGrad({12,27,48}, {1,3});
+    ASSERT_TRUE(testUtils::isMatrixEqual(base->grad,expectedGrad));
+
+    // Clean up
+    delete cubed;
+}
+
+
 /*
     Test Computational Graph by checking Pointer Values of each
     tensor and operation for a barebones sigmoid function 
@@ -211,6 +245,9 @@ TEST( TENSOR_TESTS, ComputationGraph) {
     ASSERT_TRUE(x1->frontOp == b->backOp);
     ASSERT_TRUE(x1->backOp == NULL);
 
+    // Clean up
+    delete k;
+
 }
 
 /*
@@ -258,4 +295,7 @@ TEST(TENSOR_TESTS, BackwardPropogation) {
 
     res =  Matrix<float>({0.196611971},{1});
     ASSERT_TRUE(testUtils::isMatrixEqual(w3->grad,res));
+
+    // Clean up
+    delete k;
 }
diff --git a/types/matrix.h b/types/matrix.h
index 6c87200..74a947f 100644
--- a/types/matrix.h
+++ b/types/matrix.h
@@ -18,8 +18,7 @@ struct Matrix{
     */
     vector<int> elemsEncounteredPerDim;
 
-    // Check whether GPU is accessible or not
-    bool gpu = false;
+
 
     // Verifies that the shape provided and val vector provided are compatible in size
     bool verifyShape(const vector<T> &val, const vector<int> &shape) {
@@ -245,6 +244,15 @@ struct Matrix{
         return Matrix(res, resShape);
     }
 
+    // Elementwise subtraction: returns a new Matrix holding this->val - rhs.val with
+    // this matrix's shape. Asserts that both shapes pass the elementwise shape check.
+    Matrix<T> operator - (const Matrix<T> &rhs) {
+        assert("Shapes aren't compatible for subtraction !" &&
+         verifyShapeForElementwiseOperation(this->shape, rhs.shape));
+
+        auto res = this->val - rhs.val;
+        return Matrix(res, this->shape);
+    }
 
     // Performs elementwise division
     Matrix<T> operator / (const Matrix<T> &rhs) {
diff --git a/types/tensor.h b/types/tensor.h
index bae4ec1..9c41341 100644
--- a/types/tensor.h
+++ b/types/tensor.h
@@ -25,26 +25,13 @@
 #include "operations/exponentOperation.h"
 #include "operations/dotOperation.h"
 #include "operations/sigmoidOperation.h"
+#include "operations/powerOperation.h"
 
 #ifndef __TENSOR_FLOAT_INCLUDED__   
 #define __TENSOR_FLOAT_INCLUDED__   
 
 template <typename T>
 class Tensor {
-    private:
-
-    /*
-        This function is called during the initilaisation of Tensor. It sets the value of it's gradients to zero. This is needed as 
-        during backPropogation the same tensor can be used for different operation, hence to calculate it's partial gradients
-        each individual operation's gradients have to be summed up. Hence we initialise the tensor's gradients to zero.
-        
-        See constructor for it's usage.
-    */
-    void zeroGrad() {
-        assert(val.shape.size() != 0 && "The value of matrix cannot be uninitialised during initialisng zeros in tensor's gradient");
-        vector<T> g(val.val.size(), 0);
-        this->grad = Matrix<T>(g, val.shape);
-    }
 
     public:
     
@@ -145,6 +132,19 @@ class Tensor {
         }
     }
 
+    /*
+        Resets this tensor's gradient to a zero matrix with the same shape as its value.
+        A tensor may feed several operations, and during back propagation the partial
+        gradient of each of them is summed up, so the gradient has to start from zero.
+        The constructor uses this to initialise the gradient.
+        Requires the value matrix to have a non-empty shape.
+    */
+    void zeroGrad() {
+        assert(val.shape.size() != 0 && "The value of matrix cannot be uninitialised during initialisng zeros in tensor's gradient");
+        vector<T> zeros(val.val.size(), 0);
+        grad = Matrix<T>(zeros, val.shape);
+    }
+
     /*
         From here on, we overload the operators like +, / and * to define what happens when
         we we add, divide and multiply tensors. We also support other operations like dot 
@@ -220,10 +220,14 @@ class Tensor {
         return this->frontOp->forwardDeprecated();
     }
 
-    // Destructor
+    /*
+        Deleting a Tensor deletes its backOp, walking back through the computational graph so
+        every Tensor and Op encountered is freed in a DFS fashion. frontOp is not deleted here.
+        NOTE(review): the recursion presumably continues in the Operation destructors - confirm.
+        TODO: find better way to clear memory of all tensors and prevent memory leaks.
+    */
     ~Tensor() {
-        // delete backOp;
-        // delete frontOp;
+        delete backOp;
     }
 
 };
diff --git a/utils/matrix.h b/utils/matrix.h
index fc79d3e..1e8394e 100644
--- a/utils/matrix.h
+++ b/utils/matrix.h
@@ -24,6 +24,12 @@ namespace utils {
 
         return Matrix<T>(m.val,shape);
     }
+
+    // Builds a matrix with the same shape as m in which every element is zero.
+    template <typename T>
+    Matrix<T> zerosLike(const Matrix<T> &m) {
+        return Matrix<T>(vector<T>(m.val.size(), 0), m.shape);
+    }
 }
 
 #endif
\ No newline at end of file