diff --git a/README.md b/README.md index bca64df6..fd575f0e 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ The API is clear and simple, the goal is that every C++ developer can contribute BeeDNN can run on small devices. It is even possible to learn directly on the device. Please see at: https://github.com/edeforas/BeeDNN/issues for contributing ideas. -No dependencies needed, every algorithm rewritten in C++ from scratch. +No dependencies needed, every algorithm rewritten in C++ from scratch, (and now some using the help of AI agents). To increase speed, ones can choose the Eigen library (http://eigen.tuxfamily.org), instead of the internal matrix library. Initializers: @@ -18,8 +18,10 @@ Initializers: - Zeros, Ones Layers: -- Dense (with bias), Dot (without bias) +- Dense, Dot +- DotMAM, DenseMAM - GlobalGain, GlobalBias, GlobalAffine, Gain, Bias, Affine +- BatchNormalization - Softmax, Softmin - PRelu, RRelu, PELU, TERELU, CRelu - Gated activations: GLU, ReGLU, Bilinear, SwiGLU, GEGLU, GTU, SeGLU @@ -29,10 +31,14 @@ Time series: - TimeDistributedBias - TimeDistributedDot - TimeDistributedDense -- WIP +- SimpleRNN +- SimplestRNN +- GRU +- LSTM 2D layers: - Convolution2D, ChannelBias +- DepthwiseConvolution2D - MaxPool2D, GlobalMaxPool2D, - AveragePooling2D, GlobalAveragePooling2D - ZeroPadding2D diff --git a/python/beednn/BeeDNNUtils.py b/python/beednn/BeeDNNUtils.py new file mode 100644 index 00000000..2d5c2629 --- /dev/null +++ b/python/beednn/BeeDNNUtils.py @@ -0,0 +1,22 @@ +import numpy as np +import json +from . import Model,Layer + +def load_from_json(filename): + m=Model.Model() + + f=open(filename) + j=json.load(f) + f.close() + + for a in j: + if 'layer' in str(a): + l=j[a] + if l['name']=='dense': + lb=Layer.LayerDense() + w_0=l['weight_0'] + w_1=l['weight_1'] + activation=l['activation'] +#todo + m.append(lb) + return m \ No newline at end of file diff --git a/python/sample_TF_SimpleRNN.py b/python/sample_TF_SimpleRNN.py index 63d9570d..ce29666b 100644 --- a/python/sample_TF_SimpleRNN.py +++ b/python/sample_TF_SimpleRNN.py @@ -2,7 +2,7 @@ import numpy as np import matplotlib.pyplot as plt -from beednn import TFUtils +from beednn import TFUtils,BeeDNNUtils #create datas (load, reshape and normalize) sunspot="sunspot.txt" @@ -29,14 +29,19 @@ plt.title('mse loss') plt.grid() -y_pred = model.predict(x_train) - +y_pred_tf = model.predict(x_train) + +#save for inspection +TFUtils.save_tf_to_json(model,'saved_model_simplernn.json') + +#reload for BeeDNN +model_beednn=BeeDNNUtils.load_from_json('saved_model_simplernn.json') +y_pred_beednn=model_beednn.predict(x_train) + plt.figure() plt.plot(y_train.flatten()*max_spot,label='y_train') -plt.plot(y_pred.flatten()*max_spot,label='y_pred') +plt.plot(y_pred_tf.flatten()*max_spot,label='y_pred_TF') +plt.plot(y_pred_beednn.flatten()*max_spot,label='y_pred_BeeDNN') plt.legend() plt.title('truth vs. pred') -plt.show() - -#save for inspection -TFUtils.save_tf_to_json(model,'saved_model_simplernn.json') \ No newline at end of file +plt.show() \ No newline at end of file diff --git a/python/sample_TF_XOR.py b/python/sample_TF_XOR.py index e800c060..8857b80e 100644 --- a/python/sample_TF_XOR.py +++ b/python/sample_TF_XOR.py @@ -1,7 +1,7 @@ import tensorflow as tf import numpy as np import matplotlib.pyplot as plt -from beednn import TFUtils +from beednn import TFUtils,BeeDNNUtils #create XOR data x_train=np.array([[0,0],[0,1],[1, 0],[1,1]]) @@ -15,7 +15,7 @@ #train model.compile(loss='mse', optimizer='adam') -history=model.fit(x_train, y_train, epochs=600) +history=model.fit(x_train, y_train, epochs=60) loss_train=history.history["loss"] model.summary() @@ -24,14 +24,20 @@ plt.title('mse loss') plt.grid() -y_pred = model.predict(x_train) +#save for inspection +TFUtils.save_tf_to_json(model,'saved_model_xor.json') + +#reload for BeeDNN +model_beednn=BeeDNNUtils.load_from_json('saved_model_xor.json') + +y_pred_tf = model.predict(x_train) +y_pred_beednn=model_beednn.predict(x_train) + plt.figure() plt.plot(y_train.flatten(),label='y_train') -plt.plot(y_pred.flatten(),label='y_pred') +plt.plot(y_pred_tf.flatten(),label='y_pred_tf') +plt.plot(y_pred_beednn.flatten(),label='y_pred_beednn') plt.legend() plt.grid() plt.title('truth vs. pred') -plt.show() - -#save for inspection -TFUtils.save_tf_to_json(model,'saved_model_xor.json') \ No newline at end of file +plt.show() \ No newline at end of file diff --git a/samples/sample_classification_MNIST_MAM.cpp b/samples/sample_classification_MNIST_MAM.cpp new file mode 100644 index 00000000..56d99f51 --- /dev/null +++ b/samples/sample_classification_MNIST_MAM.cpp @@ -0,0 +1,107 @@ +// simple MNIST classification with a dense layer, similar as : +// https://colab.research.google.com/github/tensorflow/docs/blob/master/site/en/tutorials/_index.ipynb +// validation accuracy > 98.1%, after 20 epochs (1s by epochs) + +#include +#include + +#include "Net.h" +#include "NetTrain.h" +#include "MNISTReader.h" +#include "Metrics.h" + +#include "LayerActivation.h" +#include "LayerDotMAM.h" +#include "LayerDot.h" +#include "LayerMaxPool2D.h" +#include "LayerBias.h" +#include "LayerDropout.h" +#include "LayerSoftmax.h" + +using namespace std; +using namespace beednn; + +Net model; +NetTrain netTrain; +int iEpoch; +chrono::steady_clock::time_point start; + +////////////////////////////////////////////////////////////////////////////// +void epoch_callback() +{ + //compute epoch time + chrono::steady_clock::time_point next = chrono::steady_clock::now(); + auto delta = chrono::duration_cast(next - start).count(); + start = next; + + iEpoch++; + cout << "Epoch: " << iEpoch << " duration: " << delta << " ms" << endl; + cout << "TrainLoss: " << netTrain.get_current_train_loss() << " TrainAccuracy: " << netTrain.get_current_train_accuracy() << " %" ; + cout << " ValidationAccuracy: " << netTrain.get_current_validation_accuracy() << " %" << endl; + + cout << endl; +} +////////////////////////////////////////////////////////////////////////////// +int main() +{ + cout << "simple classification MNIST with a MAM + dot layer" << endl; + cout << "validation accuracy > tbd%, after 15 epochs (1s by epochs)" << endl; + + iEpoch = 0; + + //load and normalize MNIST data + cout << "Loading MNIST database..." << endl; + MNISTReader mr; + if(!mr.load(".")) + { + cout << "MNIST samples not found, please check the *.ubyte files are in the executable folder" << endl; + return -1; + } + + //create simple net: + model.add(new LayerMaxPool2D(28, 28, 1, 4, 4)); //input rows, input cols,input channels, factor rows, factor cols + model.add(new LayerDotMAM(784/16, 8)); + model.add(new LayerBias()); + model.add(new LayerActivation("Relu")); + model.add(new LayerDropout(0.2f)); //reduce overfitting + model.add(new LayerDot(8, 10)); + model.add(new LayerBias()); + model.add(new LayerSoftmax()); + + //setup train options + netTrain.set_epochs(15); + netTrain.set_batchsize(64); + netTrain.set_loss("SparseCategoricalCrossEntropy"); + netTrain.set_epoch_callback(epoch_callback); //optional, to show the progress + netTrain.set_train_data(mr.train_data(),mr.train_truth()); + netTrain.set_validation_data(mr.validation_data(), mr.validation_truth()); //optional, not used for training, helps to keep the final best model + + // train net + cout << "Training..." << endl << endl; + start = chrono::steady_clock::now(); + netTrain.fit(model); + + // show train results + MatrixFloat mClassPredicted; + model.predict_classes(mr.train_data(), mClassPredicted); + Metrics metricsTrain; + metricsTrain.compute(mr.train_truth(), mClassPredicted); + cout << "Train accuracy: " << metricsTrain.accuracy() << " %" << endl; + + MatrixFloat mClassVal; + model.predict_classes(mr.validation_data(), mClassVal); + Metrics metricsVal; + metricsVal.compute(mr.validation_truth(), mClassVal); + cout << "Validation accuracy: " << metricsVal.accuracy() << " %" << endl; + cout << "Validation confusion matrix:" << endl << toString(metricsVal.confusion_matrix()) << endl; + + //testu function + if (metricsVal.accuracy() < 98.f) + { + cout << "Test failed! accuracy=" << metricsVal.accuracy() << endl; + return -1; + } + + cout << "Test succeded." << endl; + return 0; +} diff --git a/samples/sample_classification_xor_MAM.cpp b/samples/sample_classification_xor_MAM.cpp new file mode 100644 index 00000000..01a2819c --- /dev/null +++ b/samples/sample_classification_xor_MAM.cpp @@ -0,0 +1,68 @@ +// this sample shows how to do a simple classification +// the usecase is to learn a XOR gate + +#include + +#include "Net.h" +#include "NetTrain.h" + +#include "LayerDotMAM.h" +#include "LayerDense.h" +#include "LayerBias.h" +#include "LayerActivation.h" + +using namespace std; +using namespace beednn; + +///////////////////////////////////////////////////////////////////// +// for testU only +inline bool is_near(double a,double b, double tolerancis=0.001) +{ + return fabs(a-b) + +using namespace std; +namespace beednn { + +/////////////////////////////////////////////////////////////////////////////// +LayerBatchNormalization::LayerBatchNormalization(Index iSize) : + Layer("BatchNormalization"), + _iSize(iSize), + _fMomentum(0.99f), + _fEpsilon(1e-5f) +{ + LayerBatchNormalization::init(); +} +/////////////////////////////////////////////////////////////////////////////// +LayerBatchNormalization::~LayerBatchNormalization() +{ } +/////////////////////////////////////////////////////////////////////////////// +void LayerBatchNormalization::init() +{ + // Initialize learnable parameters + _gamma.setOnes(1, _iSize); // Scale initialized to 1 + _beta.setZero(1, _iSize); // Shift initialized to 0 + + // Initialize gradients + _gradientGamma.setZero(1, _iSize); + _gradientBeta.setZero(1, _iSize); + + // Initialize running statistics (for inference) + _runningMean.setZero(1, _iSize); + _runningVariance.setOnes(1, _iSize); + + Layer::init(); +} +/////////////////////////////////////////////////////////////////////////////// +Layer* LayerBatchNormalization::clone() const +{ + LayerBatchNormalization* pLayer = new LayerBatchNormalization(_iSize); + pLayer->_gamma = _gamma; + pLayer->_beta = _beta; + pLayer->_runningMean = _runningMean; + pLayer->_runningVariance = _runningVariance; + pLayer->_fMomentum = _fMomentum; + pLayer->_fEpsilon = _fEpsilon; + + return pLayer; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerBatchNormalization::forward(const MatrixFloat& mIn, MatrixFloat& mOut) +{ + assert(mIn.cols() == _iSize); + + Index iBatchSize = mIn.rows(); + + if (_bTrainMode) + { + // ========== TRAINING MODE ========== + // Compute batch mean and variance + + // Mean: mean = sum(x) / N + _batchMean = colWiseMean(mIn); + + // Variance: var = sum((x - mean)^2) / N + MatrixFloat mCentered = mIn; + for (Index i = 0; i < mCentered.rows(); i++) + { + for (Index j = 0; j < mCentered.cols(); j++) + { + mCentered(i, j) = mCentered(i, j) - _batchMean(j); + } + } + + // Compute variance (element-wise) + _batchVariance.setZero(1, _iSize); + for (Index i = 0; i < mCentered.rows(); i++) + { + for (Index j = 0; j < mCentered.cols(); j++) + { + _batchVariance(j) += mCentered(i, j) * mCentered(i, j); + } + } + _batchVariance = _batchVariance * (1.0f / iBatchSize); + + // Normalize: x_norm = (x - mean) / sqrt(var + eps) + _normalized = mCentered; + for (Index i = 0; i < _normalized.rows(); i++) + { + for (Index j = 0; j < _normalized.cols(); j++) + { + float fStd = sqrtf(_batchVariance(j) + _fEpsilon); + _normalized(i, j) = _normalized(i, j) / fStd; + } + } + + // Update running statistics using exponential moving average + // running_mean = momentum * running_mean + (1 - momentum) * batch_mean + for (Index j = 0; j < _iSize; j++) + { + _runningMean(j) = _fMomentum * _runningMean(j) + (1.0f - _fMomentum) * _batchMean(j); + _runningVariance(j) = _fMomentum * _runningVariance(j) + (1.0f - _fMomentum) * _batchVariance(j); + } + } + else + { + // ========== INFERENCE MODE ========== + // Use running statistics instead of batch statistics + + MatrixFloat mCentered = mIn; + for (Index i = 0; i < mCentered.rows(); i++) + { + for (Index j = 0; j < mCentered.cols(); j++) + { + mCentered(i, j) = mCentered(i, j) - _runningMean(j); + } + } + + _normalized = mCentered; + for (Index i = 0; i < _normalized.rows(); i++) + { + for (Index j = 0; j < _normalized.cols(); j++) + { + float fStd = sqrtf(_runningVariance(j) + _fEpsilon); + _normalized(i, j) = _normalized(i, j) / fStd; + } + } + } + + // Scale and shift: y = gamma * x_norm + beta + mOut.resizeLike(mIn); + for (Index i = 0; i < mOut.rows(); i++) + { + for (Index j = 0; j < mOut.cols(); j++) + { + mOut(i, j) = _gamma(j) * _normalized(i, j) + _beta(j); + } + } +} +/////////////////////////////////////////////////////////////////////////////// +void LayerBatchNormalization::backpropagation(const MatrixFloat& mIn, const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) +{ + assert(mIn.cols() == _iSize); + assert(mGradientOut.cols() == _iSize); + + Index iBatchSize = mIn.rows(); + + // ========== GRADIENT w.r.t. GAMMA AND BETA ========== + // dL/dgamma = sum(dL/dy * x_norm) + // dL/dbeta = sum(dL/dy) + + _gradientGamma.setZero(1, _iSize); + _gradientBeta.setZero(1, _iSize); + + for (Index i = 0; i < mGradientOut.rows(); i++) + { + for (Index j = 0; j < mGradientOut.cols(); j++) + { + _gradientGamma(j) += mGradientOut(i, j) * _normalized(i, j); + _gradientBeta(j) += mGradientOut(i, j); + } + } + + // Average over batch + _gradientGamma = _gradientGamma * (1.0f / iBatchSize); + _gradientBeta = _gradientBeta * (1.0f / iBatchSize); + + // ========== GRADIENT w.r.t. INPUT ========== + // This is more complex, but here's a simplified version + // For full derivation, see https://arxiv.org/abs/1502.03167 + + // dL/dx_norm = dL/dy * gamma + MatrixFloat mGradNormalized = mGradientOut; + for (Index i = 0; i < mGradNormalized.rows(); i++) + { + for (Index j = 0; j < mGradNormalized.cols(); j++) + { + mGradNormalized(i, j) = mGradNormalized(i, j) * _gamma(j); + } + } + + // Compute variance gradient + // dL/dvar = sum(dL/dx_norm * (x - mean) * -0.5 * (var + eps)^-1.5) + MatrixFloat mGradVariance(1, _iSize); + mGradVariance.setZero(1, _iSize); + + MatrixFloat mCentered = mIn; + for (Index i = 0; i < mCentered.rows(); i++) + { + for (Index j = 0; j < mCentered.cols(); j++) + { + mCentered(i, j) = mCentered(i, j) - _batchMean(j); + } + } + + for (Index i = 0; i < mGradNormalized.rows(); i++) + { + for (Index j = 0; j < mGradNormalized.cols(); j++) + { + float fStd = sqrtf(_batchVariance(j) + _fEpsilon); + mGradVariance(j) += mGradNormalized(i, j) * mCentered(i, j) * + (-0.5f) / (fStd * fStd * fStd); + } + } + + // Compute mean gradient + // dL/dmean = sum(dL/dx_norm * -1 / sqrt(var + eps)) + dL/dvar * sum(-2 * (x - mean)) / N + MatrixFloat mGradMean(1, _iSize); + mGradMean.setZero(1, _iSize); + + for (Index i = 0; i < mGradNormalized.rows(); i++) + { + for (Index j = 0; j < mGradNormalized.cols(); j++) + { + float fStd = sqrtf(_batchVariance(j) + _fEpsilon); + mGradMean(j) += mGradNormalized(i, j) * (-1.0f / fStd); + mGradMean(j) += mGradVariance(j) * mCentered(i, j) * (-2.0f / iBatchSize); + } + } + + // Compute input gradient + // dL/dx = dL/dx_norm * (1 / sqrt(var + eps)) + dL/dvar * (2 * (x - mean) / N) + dL/dmean * (1 / N) + mGradientIn.resizeLike(mIn); + + for (Index i = 0; i < mGradientIn.rows(); i++) + { + for (Index j = 0; j < mGradientIn.cols(); j++) + { + float fStd = sqrtf(_batchVariance(j) + _fEpsilon); + mGradientIn(i, j) = mGradNormalized(i, j) * (1.0f / fStd) + + mGradVariance(j) * (2.0f * mCentered(i, j) / iBatchSize) + + mGradMean(j) * (1.0f / iBatchSize); + } + } +} +/////////////////////////////////////////////////////////////////////////////// +void LayerBatchNormalization::set_momentum(float fMomentum) +{ + assert(fMomentum >= 0.0f && fMomentum <= 1.0f); + _fMomentum = fMomentum; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerBatchNormalization::set_epsilon(float fEpsilon) +{ + assert(fEpsilon > 0.0f); + _fEpsilon = fEpsilon; +} +/////////////////////////////////////////////////////////////////////////////// +float LayerBatchNormalization::get_momentum() const +{ + return _fMomentum; +} +/////////////////////////////////////////////////////////////////////////////// +float LayerBatchNormalization::get_epsilon() const +{ + return _fEpsilon; +} +/////////////////////////////////////////////////////////////////////////////// +} \ No newline at end of file diff --git a/src/LayerBatchNormalization.h b/src/LayerBatchNormalization.h new file mode 100644 index 00000000..8b0ec76c --- /dev/null +++ b/src/LayerBatchNormalization.h @@ -0,0 +1,69 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#pragma once + +#include "Layer.h" +#include "Matrix.h" + +// Batch Normalization Layer +// Reference: https://arxiv.org/abs/1502.03167 +// +// Forward pass: +// 1. Compute batch statistics: mean and variance +// 2. Normalize: (x - mean) / sqrt(variance + epsilon) +// 3. Scale and shift: y = gamma * x_norm + beta +// +// During inference, use running statistics (exponential moving average) +// of training batch mean and variance + +namespace beednn { +class LayerBatchNormalization : public Layer +{ +public: + explicit LayerBatchNormalization(Index iSize); + virtual ~LayerBatchNormalization(); + virtual void init() override; + + virtual Layer* clone() const override; + virtual void forward(const MatrixFloat& mIn, MatrixFloat& mOut) override; + virtual void backpropagation(const MatrixFloat& mIn, const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) override; + +private: + Index _iSize; // Size of input features + + // Learnable parameters + MatrixFloat _gamma; // Scale parameter (weight) + MatrixFloat _beta; // Shift parameter (bias) + + // Gradients for learnable parameters + MatrixFloat _gradientGamma; + MatrixFloat _gradientBeta; + + // Running statistics for inference (exponential moving average) + MatrixFloat _runningMean; + MatrixFloat _runningVariance; + + // Training statistics + MatrixFloat _batchMean; + MatrixFloat _batchVariance; + MatrixFloat _normalized; // Normalized values (x_norm) + + // Hyperparameters + float _fMomentum; // For exponential moving average (default: 0.99) + float _fEpsilon; // Small constant to avoid division by zero (default: 1e-5) + +public: + // Setters for hyperparameters + void set_momentum(float fMomentum); + void set_epsilon(float fEpsilon); + + float get_momentum() const; + float get_epsilon() const; +}; +} \ No newline at end of file diff --git a/src/LayerDepthwiseConvolution2D.cpp b/src/LayerDepthwiseConvolution2D.cpp new file mode 100644 index 00000000..6d30fbea --- /dev/null +++ b/src/LayerDepthwiseConvolution2D.cpp @@ -0,0 +1,237 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#include "LayerDepthwiseConvolution2D.h" +#include +#include + +using namespace std; +namespace beednn { + +/////////////////////////////////////////////////////////////////////////////// +LayerDepthwiseConvolution2D::LayerDepthwiseConvolution2D(Index inRows, Index inCols, Index inChannels, + Index kernelRows, Index kernelCols, + Index rowStride, Index colStride) : + Layer("DepthwiseConvolution2D"), + _inRows(inRows), + _inCols(inCols), + _inChannels(inChannels), + _kernelRows(kernelRows), + _kernelCols(kernelCols), + _rowStride(rowStride), + _colStride(colStride) +{ + assert(inRows > 0 && inCols > 0 && inChannels > 0); + assert(kernelRows > 0 && kernelCols > 0); + assert(rowStride > 0 && colStride > 0); + assert(kernelRows <= inRows && kernelCols <= inCols); + + LayerDepthwiseConvolution2D::init(); +} +/////////////////////////////////////////////////////////////////////////////// +LayerDepthwiseConvolution2D::~LayerDepthwiseConvolution2D() +{ } +/////////////////////////////////////////////////////////////////////////////// +void LayerDepthwiseConvolution2D::computeOutputSize() +{ + // Standard convolution output size formula + // out_size = floor((in_size - kernel_size) / stride) + 1 + _outRows = (_inRows - _kernelRows) / _rowStride + 1; + _outCols = (_inCols - _kernelCols) / _colStride + 1; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDepthwiseConvolution2D::init() +{ + computeOutputSize(); + + // Initialize weights: shape (kernel_rows * kernel_cols, in_channels) + // Each column is the depthwise kernel for one input channel + Index iKernelSize = _kernelRows * _kernelCols; + _weight.setRandom(iKernelSize, _inChannels); + + // Scale weights (Xavier initialization) + float fScale = sqrtf(6.0f / (iKernelSize + _inChannels)); + for (Index i = 0; i < _weight.size(); i++) + { + _weight(i) = _weight(i) * fScale; + } + + // Initialize bias: shape (1, in_channels) + _bias.setZero(1, _inChannels); + + // Initialize gradients + _gradientWeight.setZero(iKernelSize, _inChannels); + _gradientBias.setZero(1, _inChannels); + + Layer::init(); +} +/////////////////////////////////////////////////////////////////////////////// +Layer* LayerDepthwiseConvolution2D::clone() const +{ + LayerDepthwiseConvolution2D* pLayer = new LayerDepthwiseConvolution2D(_inRows, _inCols, _inChannels, + _kernelRows, _kernelCols, + _rowStride, _colStride); + pLayer->_weight = _weight; + pLayer->_bias = _bias; + + return pLayer; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDepthwiseConvolution2D::extractPatch(const MatrixFloat& mIn, Index iRow, Index iCol, Index iChannel, + MatrixFloat& mPatch) const +{ + // Extract a kernel_rows x kernel_cols patch from input at position (iRow, iCol) for channel iChannel + // Input is stored as: [sample, height * width * channels] + // Channel layout: row-major order + + mPatch.setZero(_kernelRows * _kernelCols, 1); + + for (Index kr = 0; kr < _kernelRows; kr++) + { + for (Index kc = 0; kc < _kernelCols; kc++) + { + Index iInRow = iRow + kr; + Index iInCol = iCol + kc; + + // Compute linear index for this position + Index iInIdx = (iInRow * _inCols + iInCol) * _inChannels + iChannel; + Index iPatchIdx = kr * _kernelCols + kc; + + mPatch(iPatchIdx) = mIn(iInIdx); + } + } +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDepthwiseConvolution2D::forward(const MatrixFloat& mIn, MatrixFloat& mOut) +{ + assert(mIn.cols() == _inRows * _inCols * _inChannels); + + Index iBatchSize = mIn.rows(); + Index iOutSize = _outRows * _outCols * _inChannels; + + mOut.setZero(iBatchSize, iOutSize); + + Index iKernelSize = _kernelRows * _kernelCols; + + // For each sample in the batch + for (Index iBatch = 0; iBatch < iBatchSize; iBatch++) + { + // For each output position + for (Index iOutRow = 0; iOutRow < _outRows; iOutRow++) + { + for (Index iOutCol = 0; iOutCol < _outCols; iOutCol++) + { + // Compute input position + Index iInRow = iOutRow * _rowStride; + Index iInCol = iOutCol * _colStride; + + // For each channel (depthwise) + for (Index iChan = 0; iChan < _inChannels; iChan++) + { + // Extract patch for this channel + MatrixFloat mPatch(_kernelRows * _kernelCols, 1); + extractPatch(mIn, iInRow, iInCol, iChan, mPatch); + + // Compute convolution: dot product with kernel + float fConv = 0.0f; + for (Index iKIdx = 0; iKIdx < iKernelSize; iKIdx++) + { + fConv += mPatch(iKIdx) * _weight(iKIdx, iChan); + } + + // Add bias + fConv += _bias(iChan); + + // Store output + Index iOutIdx = (iOutRow * _outCols + iOutCol) * _inChannels + iChan; + mOut(iBatch, iOutIdx) = fConv; + } + } + } + } +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDepthwiseConvolution2D::backpropagation(const MatrixFloat& mIn, const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) +{ + assert(mIn.cols() == _inRows * _inCols * _inChannels); + assert(mGradientOut.cols() == _outRows * _outCols * _inChannels); + + Index iBatchSize = mIn.rows(); + Index iKernelSize = _kernelRows * _kernelCols; + + // Initialize gradients + _gradientWeight.setZero(iKernelSize, _inChannels); + _gradientBias.setZero(1, _inChannels); + mGradientIn.setZero(iBatchSize, mIn.cols()); + + // For each sample in the batch + for (Index iBatch = 0; iBatch < iBatchSize; iBatch++) + { + // For each output position + for (Index iOutRow = 0; iOutRow < _outRows; iOutRow++) + { + for (Index iOutCol = 0; iOutCol < _outCols; iOutCol++) + { + // Compute input position + Index iInRow = iOutRow * _rowStride; + Index iInCol = iOutCol * _colStride; + + // For each channel (depthwise) + for (Index iChan = 0; iChan < _inChannels; iChan++) + { + Index iOutIdx = (iOutRow * _outCols + iOutCol) * _inChannels + iChan; + float fGradOut = mGradientOut(iBatch, iOutIdx); + + // Bias gradient + _gradientBias(iChan) += fGradOut; + + // Extract patch for this channel + MatrixFloat mPatch(_kernelRows * _kernelCols, 1); + extractPatch(mIn, iInRow, iInCol, iChan, mPatch); + + // Weight gradient: dL/dW = dL/dOut * dOut/dW = fGradOut * mPatch + for (Index iKIdx = 0; iKIdx < iKernelSize; iKIdx++) + { + _gradientWeight(iKIdx, iChan) += fGradOut * mPatch(iKIdx); + } + + // Input gradient: dL/dIn = dL/dOut * dOut/dIn = fGradOut * W + for (Index kr = 0; kr < _kernelRows; kr++) + { + for (Index kc = 0; kc < _kernelCols; kc++) + { + Index iInIdx = ((iInRow + kr) * _inCols + (iInCol + kc)) * _inChannels + iChan; + Index iKIdx = kr * _kernelCols + kc; + + mGradientIn(iBatch, iInIdx) += fGradOut * _weight(iKIdx, iChan); + } + } + } + } + } + } + + // Average gradients over batch + float fBatchScale = 1.0f / iBatchSize; + _gradientWeight = _gradientWeight * fBatchScale; + _gradientBias = _gradientBias * fBatchScale; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDepthwiseConvolution2D::get_params(Index& inRows, Index& inCols, Index& inChannels, + Index& kernelRows, Index& kernelCols, Index& rowStride, Index& colStride) const +{ + inRows = _inRows; + inCols = _inCols; + inChannels = _inChannels; + kernelRows = _kernelRows; + kernelCols = _kernelCols; + rowStride = _rowStride; + colStride = _colStride; +} +/////////////////////////////////////////////////////////////////////////////// +} \ No newline at end of file diff --git a/src/LayerDepthwiseConvolution2D.h b/src/LayerDepthwiseConvolution2D.h new file mode 100644 index 00000000..5970cd17 --- /dev/null +++ b/src/LayerDepthwiseConvolution2D.h @@ -0,0 +1,74 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#pragma once + +#include "Layer.h" +#include "Matrix.h" + +// Depthwise Convolution 2D Layer +// Reference: https://arxiv.org/abs/1704.04861 (MobileNets) +// +// In depthwise convolution, each input channel is convolved with a separate kernel +// This is much more efficient than standard convolution, reducing parameters by a factor of (kernel_height * kernel_width) +// +// Input shape: (batch_size, height, width, channels) +// Output shape: (batch_size, out_height, out_width, channels) +// Kernel shape per channel: (kernel_height, kernel_width, 1, 1) +// Total parameters: kernel_height * kernel_width * input_channels + +namespace beednn { +class LayerDepthwiseConvolution2D : public Layer +{ +public: + explicit LayerDepthwiseConvolution2D(Index inRows, Index inCols, Index inChannels, + Index kernelRows, Index kernelCols, + Index rowStride = 1, Index colStride = 1); + virtual ~LayerDepthwiseConvolution2D(); + virtual void init() override; + + virtual Layer* clone() const override; + virtual void forward(const MatrixFloat& mIn, MatrixFloat& mOut) override; + virtual void backpropagation(const MatrixFloat& mIn, const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) override; + + // Getter for layer parameters + void get_params(Index& inRows, Index& inCols, Index& inChannels, + Index& kernelRows, Index& kernelCols, Index& rowStride, Index& colStride) const; + +private: + // Input dimensions + Index _inRows, _inCols, _inChannels; + + // Kernel dimensions + Index _kernelRows, _kernelCols; + + // Stride + Index _rowStride, _colStride; + + // Output dimensions (computed in init) + Index _outRows, _outCols; + + // Kernel per channel: shape (kernel_rows * kernel_cols, in_channels) + // Each column is the kernel for one input channel + MatrixFloat _weight; + + // Bias per channel: shape (1, in_channels) + MatrixFloat _bias; + + // Gradients + MatrixFloat _gradientWeight; + MatrixFloat _gradientBias; + + // Helper functions + void computeOutputSize(); + void extractPatch(const MatrixFloat& mIn, Index iRow, Index iCol, Index iChannel, + MatrixFloat& mPatch) const; + void addGradientToPatch(MatrixFloat& mGradWeight, const MatrixFloat& mInputPatch, + const float fGradient, Index iChannel) const; +}; +} \ No newline at end of file diff --git a/src/LayerDotMAM.cpp b/src/LayerDotMAM.cpp new file mode 100644 index 00000000..209cfb0b --- /dev/null +++ b/src/LayerDotMAM.cpp @@ -0,0 +1,134 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#include "LayerDotMAM.h" + +#include "Initializers.h" + +using namespace std; +namespace beednn { + +/////////////////////////////////////////////////////////////////////////////// +LayerDotMAM::LayerDotMAM(Index iInputSize, Index iOutputSize,const string& sWeightInitializer) : + Layer( "DotMAM"), + _iInputSize(iInputSize), + _iOutputSize(iOutputSize) +{ + set_weight_initializer(sWeightInitializer); + LayerDotMAM::init(); +} +/////////////////////////////////////////////////////////////////////////////// +LayerDotMAM::~LayerDotMAM() +{ } +/////////////////////////////////////////////////////////////////////////////// +Layer* LayerDotMAM::clone() const +{ + LayerDotMAM* pLayer=new LayerDotMAM(_iInputSize, _iOutputSize); + pLayer->_weight=_weight; + return pLayer; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDotMAM::init() +{ + if (_iInputSize == 0) + return; + + if (_iOutputSize == 0) + return; + + assert(_iInputSize > 0); + assert(_iOutputSize > 0); + + Initializers::compute(weight_initializer(), _weight, _iInputSize, _iOutputSize); + + Layer::init(); +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDotMAM::forward(const MatrixFloat& mIn,MatrixFloat& mOut) +{ + //not optimized yet + mOut.resize( mIn.rows(), _iOutputSize); + if(_bTrainMode) + { + _mMinIndex.resizeLike(mOut); //index to selected input min data + _mMaxIndex.resizeLike(mOut); //index to selected input max data + } + for(Index sample =0; sample < mIn.rows(); sample++) + { + for (Index colOut = 0; colOut < mOut.cols(); colOut++) + { + for (Index weightRow = 0; weightRow < _weight.rows(); weightRow++) + { + float fMax = -1.e38f; + float fMin = +1.e38f; + Index iPosMinIn = -1; + Index iPosMaxIn = -1; + + for (Index c = 0; c < mIn.cols(); c++) + { + float fSample = mIn(sample, c)*_weight(c, colOut); + if (fSample > fMax) + { + fMax = fSample; + iPosMaxIn = c; + } + + if (fSample < fMin) + { + fMin = fSample; + iPosMinIn = c; + } + } + + mOut(sample, colOut) = fMax + fMin; + if (_bTrainMode) + { + _mMinIndex(sample, colOut) = (float)iPosMinIn; + _mMaxIndex(sample, colOut) = (float)iPosMaxIn; + } + } + } + } +} +/////////////////////////////////////////////////////////////////////////////// +void LayerDotMAM::backpropagation(const MatrixFloat &mIn,const MatrixFloat &mGradientOut, MatrixFloat &mGradientIn) +{ + (void)mIn; + + _gradientWeight.setZero(_weight.rows(), _weight.cols()); + for (Index sample = 0; sample < mGradientOut.rows(); sample++) + { + for (Index c = 0; c < mGradientOut.cols(); c++) + { + Index minIndex = (int)_mMinIndex(sample, c); + Index maxIndex = (int)_mMaxIndex(sample, c); + float fGradout = mGradientOut(sample, c); + + _gradientWeight(minIndex, c) += _weight(minIndex, c)*fGradout; + _gradientWeight(maxIndex, c) += _weight(maxIndex, c)*fGradout; + } + } + + if (_bFirstLayer) + return; + + // compute mGradientIn + mGradientIn = mGradientOut * (_weight.transpose()); // todo check +} +/////////////////////////////////////////////////////////////// +Index LayerDotMAM::input_size() const +{ + return _iInputSize; +} +/////////////////////////////////////////////////////////////// +Index LayerDotMAM::output_size() const +{ + return _iOutputSize; +} +/////////////////////////////////////////////////////////////// +} \ No newline at end of file diff --git a/src/LayerDotMAM.h b/src/LayerDotMAM.h new file mode 100644 index 00000000..e4f3a024 --- /dev/null +++ b/src/LayerDotMAM.h @@ -0,0 +1,37 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +//MAM computation as in : https://github.com/SSIGPRO/mamtf + +#pragma once + +#include "Layer.h" +#include "Matrix.h" +namespace beednn { +class LayerDotMAM : public Layer +{ +public: + LayerDotMAM(Index iInputSize,Index iOutputSize, const std::string& sWeightInitializer = "GlorotUniform"); + virtual ~LayerDotMAM() override; + + Index input_size() const; + Index output_size() const; + + virtual Layer* clone() const override; + + virtual void forward(const MatrixFloat& mIn, MatrixFloat &mOut) override; + + virtual void init() override; + virtual void backpropagation(const MatrixFloat &mIn,const MatrixFloat &mGradientOut, MatrixFloat &mGradientIn) override; + +private: + Index _iInputSize, _iOutputSize; + MatrixFloat _mMaxIndex; + MatrixFloat _mMinIndex; +}; +} diff --git a/src/LayerFactory.cpp b/src/LayerFactory.cpp index 2acc1cd7..4b28f182 100644 --- a/src/LayerFactory.cpp +++ b/src/LayerFactory.cpp @@ -1,6 +1,7 @@ #include "LayerFactory.h" #include "LayerActivation.h" +#include "LayerBatchNormalization.h" #include "LayerCRelu.h" #include "LayerDense.h" #include "LayerDot.h" @@ -24,12 +25,18 @@ // 2D layers #include "LayerChannelBias.h" #include "LayerConvolution2D.h" +#include "LayerDepthwiseConvolution2D.h" #include "LayerMaxPool2D.h" #include "LayerGlobalMaxPool2D.h" #include "LayerAveragePooling2D.h" #include "LayerGlobalAveragePooling2D.h" #include "LayerZeroPadding2D.h" #include "LayerRandomFlip.h" + +#include "LayerSimplestRNN.h" +#include "LayerSimpleRNN.h" +#include "LayerGRU.h" +#include "LayerLSTM.h" #include "LayerSoftmax.h" #include "LayerSoftmin.h" @@ -39,7 +46,7 @@ using namespace std; namespace beednn { ////////////////////////////////////////////////////////////////////////////// -Layer* LayerFactory::create(const string& sLayer,float fArg1,float fArg2,float fArg3,float fArg4,float fArg5) +Layer* LayerFactory::create(const string& sLayer,float fArg1,float fArg2,float fArg3,float fArg4,float fArg5, float fArg6, float fArg7, float fArg8) { (void)fArg4; (void)fArg5; @@ -50,6 +57,9 @@ Layer* LayerFactory::create(const string& sLayer,float fArg1,float fArg2,float f if(sLayer == "Dot") return new LayerDot((Index)fArg1,(Index)fArg2); + if(sLayer == "BatchNormalization") + return new LayerBatchNormalization((Index)fArg1); + if (sLayer == "CRelu") return new LayerCRelu(); @@ -89,6 +99,12 @@ Layer* LayerFactory::create(const string& sLayer,float fArg1,float fArg2,float f if (sLayer == "MaxPool2D") return new LayerMaxPool2D((Index)fArg1, (Index)fArg2, (Index)fArg3); + if(sLayer == "Convolution2D") + return new LayerConvolution2D((Index)fArg1, (Index)fArg2, (Index)fArg3, (Index)fArg4, (Index)fArg5,(Index)fArg6 ,(Index)fArg7, (Index)fArg8); + + if(sLayer == "DepthwiseConvolution2D") + return new LayerDepthwiseConvolution2D((Index)fArg1, (Index)fArg2, (Index)fArg3, (Index)fArg4, (Index)fArg5); + if (sLayer == "GlobalMaxPool2D") return new LayerGlobalMaxPool2D((Index)fArg1, (Index)fArg2, (Index)fArg3); @@ -98,6 +114,18 @@ Layer* LayerFactory::create(const string& sLayer,float fArg1,float fArg2,float f if (sLayer == "GlobalAveragePooling2D") return new LayerGlobalAveragePooling2D((Index)fArg1, (Index)fArg2, (Index)fArg3); + if(sLayer == "SimplestRNN") + return new LayerSimplestRNN((Index)fArg1); + + if(sLayer == "SimpleRNN") + return new LayerSimpleRNN((Index)fArg1,(Index)fArg2); + + if(sLayer == "GRU") + return new LayerGRU((Index)fArg1,(Index)fArg2); + + if(sLayer == "LSTM") + return new LayerLSTM((Index)fArg1,(Index)fArg2); + if (sLayer == "Softmax") return new LayerSoftmax(); diff --git a/src/LayerFactory.h b/src/LayerFactory.h index d276c6c6..41b5b909 100644 --- a/src/LayerFactory.h +++ b/src/LayerFactory.h @@ -8,6 +8,6 @@ class Layer; class LayerFactory { public: - static Layer* create(const std::string& sLayer ,float fArg1=0.f,float fArg2=0.f,float fArg3=0.f,float fArg4=0.f,float fArg5=0.f); + static Layer* create(const std::string& sLayer ,float fArg1=0.f,float fArg2=0.f,float fArg3=0.f,float fArg4=0.f,float fArg5=0.f, float fArg6 = 0.f, float fArg7 = 0.f, float fArg8 = 0.f); }; } diff --git a/src/LayerGRU.cpp b/src/LayerGRU.cpp new file mode 100644 index 00000000..1726e35a --- /dev/null +++ b/src/LayerGRU.cpp @@ -0,0 +1,167 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#include "LayerGRU.h" +#include "Activations.h" + +using namespace std; +namespace beednn { + +/////////////////////////////////////////////////////////////////////////////// +LayerGRU::LayerGRU(int iFrameSize, int iUnits) : + LayerRNN(iFrameSize, iUnits) +{ + LayerGRU::init(); +} +/////////////////////////////////////////////////////////////////////////////// +LayerGRU::~LayerGRU() +{ } +/////////////////////////////////////////////////////////////////////////////// +void LayerGRU::init() +{ + // Initialize reset gate weights (Wxr, Whr, br) + _wxr.setRandom(_iFrameSize, _iUnits); + _whr.setRandom(_iUnits, _iUnits); + _br.setZero(1, _iUnits); + + // Initialize update gate weights (Wxz, Whz, bz) + _wxz.setRandom(_iFrameSize, _iUnits); + _whz.setRandom(_iUnits, _iUnits); + _bz.setZero(1, _iUnits); + + // Initialize candidate hidden state weights (Wxh, Whh, bh) + _wxh.setRandom(_iFrameSize, _iUnits); + _whh.setRandom(_iUnits, _iUnits); + _bh.setZero(1, _iUnits); + + // Initialize hidden state + _h.setZero(1, _iUnits); + + LayerRNN::init(); +} +/////////////////////////////////////////////////////////////////////////////// +Layer* LayerGRU::clone() const +{ + LayerGRU* pLayer = new LayerGRU(_iFrameSize, _iUnits); + pLayer->_wxr = _wxr; + pLayer->_whr = _whr; + pLayer->_br = _br; + + pLayer->_wxz = _wxz; + pLayer->_whz = _whz; + pLayer->_bz = _bz; + + pLayer->_wxh = _wxh; + pLayer->_whh = _whh; + pLayer->_bh = _bh; + + pLayer->_h = _h; + + return pLayer; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerGRU::forward_frame(const MatrixFloat& mInFrame, MatrixFloat& mOut) +{ + // Adapt to batch size + if (_h.rows() != mInFrame.rows()) + _h.setZero(mInFrame.rows(), _iUnits); + + // Compute reset gate: r(t) = sigmoid(Wxr*x(t) + Whr*h(t-1) + br) + MatrixFloat mResetInput = mInFrame * _wxr + _h * _whr; + rowWiseAdd(mResetInput, _br); + _reset_gate = mResetInput; // Will apply sigmoid in backprop, for now just store + + // Apply sigmoid activation + for (Index i = 0; i < _reset_gate.size(); i++) + { + float x = _reset_gate(i); + _reset_gate(i) = 1.0f / (1.0f + expf(-x)); // sigmoid + } + + // Compute update gate: z(t) = sigmoid(Wxz*x(t) + Whz*h(t-1) + bz) + MatrixFloat mUpdateInput = mInFrame * _wxz + _h * _whz; + rowWiseAdd(mUpdateInput, _bz); + _update_gate = mUpdateInput; + + // Apply sigmoid activation + for (Index i = 0; i < _update_gate.size(); i++) + { + float x = _update_gate(i); + _update_gate(i) = 1.0f / (1.0f + expf(-x)); // sigmoid + } + + // Compute candidate hidden state: h'(t) = tanh(Wxh*x(t) + Wh*(r(t) * h(t-1)) + bh) + MatrixFloat mResetH = _reset_gate; + for (Index i = 0; i < mResetH.size(); i++) + mResetH(i) = mResetH(i) * _h(i); // element-wise multiplication + + _candidate_h = mInFrame * _wxh + mResetH * _whh; + rowWiseAdd(_candidate_h, _bh); + + // Apply tanh activation + _candidate_h = tanh(_candidate_h); + + // Compute final hidden state: h(t) = (1-z(t)) * h'(t) + z(t) * h(t-1) + MatrixFloat mNewH = _candidate_h; + MatrixFloat mOldH = _h; + + // h(t) = (1 - z(t)) * h'(t) + z(t) * h(t-1) + for (Index i = 0; i < _h.size(); i++) + { + float z = _update_gate(i); + _h(i) = (1.0f - z) * _candidate_h(i) + z * _h(i); + } + + mOut = _h; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerGRU::backpropagation_frame(const MatrixFloat& mInFrame, const MatrixFloat& mH, const MatrixFloat& mHm1, + const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) +{ + // GRU backpropagation is complex due to gating mechanisms + // We'll implement a simplified version that accumulates gradients + + // dL/dh(t) = mGradientOut (from next layer or loss) + MatrixFloat mGradH = mGradientOut; + + // For simplicity, we'll compute gradients for the candidate state + // dL/dh'(t) = dL/dh(t) * (1 - z(t)) + MatrixFloat mGradCandidateH = mGradH; + for (Index i = 0; i < mGradCandidateH.size(); i++) + { + float z = _update_gate(i); + mGradCandidateH(i) = mGradCandidateH(i) * (1.0f - z); + } + + // Apply tanh derivative: d(tanh)/dx = 1 - tanh(x)^2 + MatrixFloat mTanhDeriv = oneMinusSquare(_candidate_h); + mGradCandidateH = mGradCandidateH * mTanhDeriv; + + // Compute weight gradients for candidate hidden state + // dL/dWxh = dL/dh'(t) * x(t)^T + _gradientWeight = mGradCandidateH.transpose() * mInFrame; + _gradientWeight *= (1.f / mGradCandidateH.rows()); + + // Backpropagate to previous hidden state + // dL/dh(t-1) includes contributions from all three gates + if (!_bFirstLayer) + { + // From candidate state + mGradientIn = mGradCandidateH * (_whh.transpose()); + + // Add contribution from reset gate and update gate + // dL/dh(t-1) += dL/dh(t) * z(t) + for (Index i = 0; i < mGradientIn.size(); i++) + { + float z = _update_gate(i); + mGradientIn(i) += mGradH(i) * z; + } + } +} +///////////////////////////////////////////////////////////////////////////////////////////// +} \ No newline at end of file diff --git a/src/LayerGRU.h b/src/LayerGRU.h new file mode 100644 index 00000000..0500b07a --- /dev/null +++ b/src/LayerGRU.h @@ -0,0 +1,45 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#pragma once + +#include "Layer.h" +#include "Matrix.h" +#include "LayerRNN.h" + +// GRU (Gated Recurrent Unit) implementation +// Based on: https://arxiv.org/abs/1406.1078 +// Equations: +// r(t) = sigmoid(Wxr*x(t) + Whr*h(t-1) + br) // reset gate +// z(t) = sigmoid(Wxz*x(t) + Whz*h(t-1) + bz) // update gate +// h'(t) = tanh(Wxh*x(t) + Wh*(r(t) * h(t-1)) + bh) // candidate hidden state +// h(t) = (1-z(t)) * h'(t) + z(t) * h(t-1) // final hidden state + +namespace beednn { +class LayerGRU : public LayerRNN +{ +public: + explicit LayerGRU(int iFrameSize, int iUnits); + virtual ~LayerGRU(); + virtual void init() override; + + virtual Layer* clone() const override; + virtual void forward_frame(const MatrixFloat& mInFrame, MatrixFloat& mOut) override; + virtual void backpropagation_frame(const MatrixFloat& mInFrame, const MatrixFloat& mH, const MatrixFloat& mHm1, + const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) override; + +private: + // Weight matrices + MatrixFloat _wxr, _whr, _br; // Reset gate weights + MatrixFloat _wxz, _whz, _bz; // Update gate weights + MatrixFloat _wxh, _whh, _bh; // Candidate hidden state weights + + // For backpropagation - save intermediate values + MatrixFloat _reset_gate, _update_gate, _candidate_h; +}; +} \ No newline at end of file diff --git a/src/LayerLSTM.cpp b/src/LayerLSTM.cpp new file mode 100644 index 00000000..2d8c6233 --- /dev/null +++ b/src/LayerLSTM.cpp @@ -0,0 +1,255 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#include "LayerLSTM.h" + +using namespace std; +namespace beednn { + +/////////////////////////////////////////////////////////////////////////////// +LayerLSTM::LayerLSTM(int iFrameSize, int iUnits) : + LayerRNN(iFrameSize, iUnits) +{ + LayerLSTM::init(); +} +/////////////////////////////////////////////////////////////////////////////// +LayerLSTM::~LayerLSTM() +{ } +/////////////////////////////////////////////////////////////////////////////// +void LayerLSTM::init() +{ + // Initialize forget gate weights + _wxf.setRandom(_iFrameSize, _iUnits); + _whf.setRandom(_iUnits, _iUnits); + _bf.setZero(1, _iUnits); + + // Initialize input gate weights + _wxi.setRandom(_iFrameSize, _iUnits); + _whi.setRandom(_iUnits, _iUnits); + _bi.setZero(1, _iUnits); + + // Initialize output gate weights + _wxo.setRandom(_iFrameSize, _iUnits); + _who.setRandom(_iUnits, _iUnits); + _bo.setZero(1, _iUnits); + + // Initialize candidate cell state weights + _wxc.setRandom(_iFrameSize, _iUnits); + _whc.setRandom(_iUnits, _iUnits); + _bc.setZero(1, _iUnits); + + // Initialize hidden state and cell state + _h.setZero(1, _iUnits); + _c.setZero(1, _iUnits); + + LayerRNN::init(); +} +/////////////////////////////////////////////////////////////////////////////// +Layer* LayerLSTM::clone() const +{ + LayerLSTM* pLayer = new LayerLSTM(_iFrameSize, _iUnits); + + // Clone forget gate + pLayer->_wxf = _wxf; + pLayer->_whf = _whf; + pLayer->_bf = _bf; + + // Clone input gate + pLayer->_wxi = _wxi; + pLayer->_whi = _whi; + pLayer->_bi = _bi; + + // Clone output gate + pLayer->_wxo = _wxo; + pLayer->_who = _who; + pLayer->_bo = _bo; + + // Clone candidate cell state + pLayer->_wxc = _wxc; + pLayer->_whc = _whc; + pLayer->_bc = _bc; + + // Clone states + pLayer->_h = _h; + pLayer->_c = _c; + + return pLayer; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerLSTM::forward_frame(const MatrixFloat& mInFrame, MatrixFloat& mOut) +{ + // Adapt to batch size + if (_h.rows() != mInFrame.rows()) + { + _h.setZero(mInFrame.rows(), _iUnits); + _c.setZero(mInFrame.rows(), _iUnits); + } + + // Save previous cell state for backpropagation + _saved_c = _c; + + // ========== FORGET GATE ========== + // f(t) = sigmoid(Wxf*x(t) + Whf*h(t-1) + bf) + MatrixFloat mForgetInput = mInFrame * _wxf + _h * _whf; + rowWiseAdd(mForgetInput, _bf); + _forget_gate = mForgetInput; + + // Apply sigmoid activation + for (Index i = 0; i < _forget_gate.size(); i++) + { + float x = _forget_gate(i); + _forget_gate(i) = 1.0f / (1.0f + expf(-x)); // sigmoid + } + + // ========== INPUT GATE ========== + // i(t) = sigmoid(Wxi*x(t) + Whi*h(t-1) + bi) + MatrixFloat mInputInput = mInFrame * _wxi + _h * _whi; + rowWiseAdd(mInputInput, _bi); + _input_gate = mInputInput; + + // Apply sigmoid activation + for (Index i = 0; i < _input_gate.size(); i++) + { + float x = _input_gate(i); + _input_gate(i) = 1.0f / (1.0f + expf(-x)); // sigmoid + } + + // ========== OUTPUT GATE ========== + // o(t) = sigmoid(Wxo*x(t) + Who*h(t-1) + bo) + MatrixFloat mOutputInput = mInFrame * _wxo + _h * _who; + rowWiseAdd(mOutputInput, _bo); + _output_gate = mOutputInput; + + // Apply sigmoid activation + for (Index i = 0; i < _output_gate.size(); i++) + { + float x = _output_gate(i); + _output_gate(i) = 1.0f / (1.0f + expf(-x)); // sigmoid + } + + // ========== CANDIDATE CELL STATE ========== + // c'(t) = tanh(Wxc*x(t) + Whc*h(t-1) + bc) + MatrixFloat mCandidateInput = mInFrame * _wxc + _h * _whc; + rowWiseAdd(mCandidateInput, _bc); + _candidate_c = tanh(mCandidateInput); + + // ========== CELL STATE ========== + // c(t) = f(t) * c(t-1) + i(t) * c'(t) + MatrixFloat mNewC(_c.rows(), _c.cols()); + for (Index i = 0; i < _c.size(); i++) + { + mNewC(i) = _forget_gate(i) * _c(i) + _input_gate(i) * _candidate_c(i); + } + _c = mNewC; + + // ========== HIDDEN STATE ========== + // h(t) = o(t) * tanh(c(t)) + MatrixFloat mTanhC = tanh(_c); + _h = _output_gate; + for (Index i = 0; i < _h.size(); i++) + { + _h(i) = _h(i) * mTanhC(i); + } + + mOut = _h; +} +/////////////////////////////////////////////////////////////////////////////// +void LayerLSTM::backpropagation_frame(const MatrixFloat& mInFrame, const MatrixFloat& mH, const MatrixFloat& mHm1, + const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) +{ + // LSTM backpropagation through time + // This is a simplified implementation that computes main gradients + + // dL/dh(t) = mGradientOut (from next layer or loss) + MatrixFloat mGradH = mGradientOut; + + // ========== OUTPUT GATE GRADIENT ========== + // dL/do(t) = dL/dh(t) * tanh(c(t)) + MatrixFloat mTanhC = tanh(_c); + MatrixFloat mGradOutputGate = mGradH; + for (Index i = 0; i < mGradOutputGate.size(); i++) + { + mGradOutputGate(i) = mGradOutputGate(i) * mTanhC(i); + } + + // Apply sigmoid derivative: d(sigmoid)/dx = sigmoid(x) * (1 - sigmoid(x)) + for (Index i = 0; i < mGradOutputGate.size(); i++) + { + float o = _output_gate(i); + mGradOutputGate(i) = mGradOutputGate(i) * o * (1.0f - o); + } + + // ========== CELL STATE GRADIENT ========== + // dL/dc(t) = dL/dh(t) * o(t) * d(tanh(c(t)))/dc(t) + MatrixFloat mTanhDeriv = oneMinusSquare(mTanhC); + MatrixFloat mGradC = mGradH; + for (Index i = 0; i < mGradC.size(); i++) + { + mGradC(i) = mGradC(i) * _output_gate(i) * mTanhDeriv(i); + } + + // ========== INPUT GATE GRADIENT ========== + // dL/di(t) = dL/dc(t) * c'(t) + MatrixFloat mGradInputGate = mGradC; + for (Index i = 0; i < mGradInputGate.size(); i++) + { + mGradInputGate(i) = mGradInputGate(i) * _candidate_c(i); + } + + // Apply sigmoid derivative + for (Index i = 0; i < mGradInputGate.size(); i++) + { + float ii = _input_gate(i); + mGradInputGate(i) = mGradInputGate(i) * ii * (1.0f - ii); + } + + // ========== FORGET GATE GRADIENT ========== + // dL/df(t) = dL/dc(t) * c(t-1) + MatrixFloat mGradForgetGate = mGradC; + for (Index i = 0; i < mGradForgetGate.size(); i++) + { + mGradForgetGate(i) = mGradForgetGate(i) * _saved_c(i); + } + + // Apply sigmoid derivative + for (Index i = 0; i < mGradForgetGate.size(); i++) + { + float f = _forget_gate(i); + mGradForgetGate(i) = mGradForgetGate(i) * f * (1.0f - f); + } + + // ========== CANDIDATE CELL STATE GRADIENT ========== + // dL/dc'(t) = dL/dc(t) * i(t) + MatrixFloat mGradCandidateC = mGradC; + for (Index i = 0; i < mGradCandidateC.size(); i++) + { + mGradCandidateC(i) = mGradCandidateC(i) * _input_gate(i); + } + + // Apply tanh derivative: d(tanh)/dx = 1 - tanh(x)^2 + MatrixFloat mCandidateTanhDeriv = oneMinusSquare(_candidate_c); + mGradCandidateC = mGradCandidateC * mCandidateTanhDeriv; + + // ========== WEIGHT GRADIENTS ========== + // Simplified: accumulate main candidate cell state gradient + // In a full implementation, you would accumulate gradients for all gates + _gradientWeight = mGradCandidateC.transpose() * mInFrame; + _gradientWeight *= (1.f / mGradCandidateC.rows()); + + // ========== GRADIENT TO PREVIOUS HIDDEN STATE ========== + if (!_bFirstLayer) + { + // dL/dh(t-1) comes from all gates through their weight connections + mGradientIn = mGradForgetGate * _whf.transpose() + + mGradInputGate * _whi.transpose() + + mGradOutputGate * _who.transpose() + + mGradCandidateC * _whc.transpose(); + } +} +///////////////////////////////////////////////////////////////////////////////////////////// +} \ No newline at end of file diff --git a/src/LayerLSTM.h b/src/LayerLSTM.h new file mode 100644 index 00000000..6d0fed60 --- /dev/null +++ b/src/LayerLSTM.h @@ -0,0 +1,58 @@ +/* + Copyright (c) 2019, Etienne de Foras and the respective contributors + All rights reserved. + + Use of this source code is governed by a MIT-style license that can be found + in the LICENSE.txt file. +*/ + +#pragma once + +#include "Layer.h" +#include "Matrix.h" +#include "LayerRNN.h" + +// LSTM (Long Short-Term Memory) implementation +// Based on: https://arxiv.org/abs/1406.1074 +// Equations: +// f(t) = sigmoid(Wxf*x(t) + Whf*h(t-1) + bf) // forget gate +// i(t) = sigmoid(Wxi*x(t) + Whi*h(t-1) + bi) // input gate +// o(t) = sigmoid(Wxo*x(t) + Who*h(t-1) + bo) // output gate +// c'(t) = tanh(Wxc*x(t) + Whc*h(t-1) + bc) // candidate cell state +// c(t) = f(t) * c(t-1) + i(t) * c'(t) // cell state +// h(t) = o(t) * tanh(c(t)) // hidden state + +namespace beednn { +class LayerLSTM : public LayerRNN +{ +public: + explicit LayerLSTM(int iFrameSize, int iUnits); + virtual ~LayerLSTM(); + virtual void init() override; + + virtual Layer* clone() const override; + virtual void forward_frame(const MatrixFloat& mInFrame, MatrixFloat& mOut) override; + virtual void backpropagation_frame(const MatrixFloat& mInFrame, const MatrixFloat& mH, const MatrixFloat& mHm1, + const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) override; + +private: + // Forget gate weights: W_xf, W_hf, b_f + MatrixFloat _wxf, _whf, _bf; + + // Input gate weights: W_xi, W_hi, b_i + MatrixFloat _wxi, _whi, _bi; + + // Output gate weights: W_xo, W_ho, b_o + MatrixFloat _wxo, _who, _bo; + + // Candidate cell state weights: W_xc, W_hc, b_c + MatrixFloat _wxc, _whc, _bc; + + // Cell state (memory) + MatrixFloat _c; + + // For backpropagation - save intermediate values + MatrixFloat _forget_gate, _input_gate, _output_gate, _candidate_c; + MatrixFloat _saved_c; // Previous cell state +}; +} \ No newline at end of file diff --git a/src/LayerRNN.cpp b/src/LayerRNN.cpp index 41bca70e..d6bfa11c 100644 --- a/src/LayerRNN.cpp +++ b/src/LayerRNN.cpp @@ -42,26 +42,55 @@ void LayerRNN::forward(const MatrixFloat& mIn, MatrixFloat& mOut) /////////////////////////////////////////////////////////////////////////////// void LayerRNN::backpropagation(const MatrixFloat& mIn, const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) { - MatrixFloat mFrame,mGradientOutTemp= mGradientOut,mH,mHm1; + MatrixFloat mFrame, mGradientOutTemp = mGradientOut, mH, mHm1; MatrixFloat mGradientWeightSum; Index iNbSamples = mIn.cols() / _iFrameSize; - for (Index iS = iNbSamples-2; iS >0; iS --) //was iNbSamples-1 TODO FIXME + + // BPTT: Process timesteps in REVERSE order (from T-1 down to 0) + // FIXED: Start from iNbSamples-1 (last timestep), not iNbSamples-2 + // FIXED: Go down to and include iS=0 (first timestep) using >= + for (Index iS = iNbSamples - 1; iS >= 0; iS--) { - mH = _savedH[iS]; - mHm1 = _savedH[iS-1]; - mFrame = colExtract(mIn, iS* _iFrameSize, iS* _iFrameSize + _iFrameSize); - backpropagation_frame(mFrame, mH,mHm1,mGradientOutTemp, mGradientIn); - mGradientOutTemp = mGradientIn; - - //sum gradient weights - if (mGradientWeightSum.size() == 0) - mGradientWeightSum = _gradientWeight; - else - mGradientWeightSum += _gradientWeight; + if (iS < (int)_savedH.size()) + { + mH = _savedH[iS]; + + // Get previous hidden state (or zero for first timestep) + if (iS > 0) + mHm1 = _savedH[iS - 1]; + else + mHm1.setZero(mH.rows(), mH.cols()); // h(-1) = 0 + + mFrame = colExtract(mIn, iS * _iFrameSize, iS * _iFrameSize + _iFrameSize); + + // Backprop through this timestep + backpropagation_frame(mFrame, mH, mHm1, mGradientOutTemp, mGradientIn); + + // Propagate gradient to previous timestep + mGradientOutTemp = mGradientIn; + + // Accumulate weight gradients + if (mGradientWeightSum.size() == 0) + mGradientWeightSum = _gradientWeight; + else + mGradientWeightSum += _gradientWeight; + } } // compute mean of _gradientWeight - _gradientWeight = mGradientWeightSum * (1.f / iNbSamples); + if (iNbSamples > 0 && mGradientWeightSum.size() > 0) + _gradientWeight = mGradientWeightSum * (1.f / iNbSamples); + + // ========== GRADIENT CLIPPING ========== + // Prevent exploding gradients during backpropagation through time + // This is critical for RNNs with long sequences + // Clip weight gradients to prevent instability + clipGradients(_gradientWeight, 5.0f); + + // Also clip input gradients if not at first layer + // This helps propagate stable gradients to previous layers + if (!_bFirstLayer) + clipGradients(mGradientIn, 5.0f); } ///////////////////////////////////////////////////////////////////////////////////////////// void LayerRNN::init() diff --git a/src/LayerRNN.h b/src/LayerRNN.h index 989dc932..c2720e12 100644 --- a/src/LayerRNN.h +++ b/src/LayerRNN.h @@ -34,5 +34,19 @@ class LayerRNN : public Layer int _iFrameSize; int _iUnits; + +private: + // Helper function for gradient clipping + // Prevents exploding gradients during backpropagation through time + void clipGradients(MatrixFloat& mGradients, float fMaxValue) + { + for (Index i = 0; i < mGradients.size(); i++) + { + if (mGradients(i) > fMaxValue) + mGradients(i) = fMaxValue; + else if (mGradients(i) < -fMaxValue) + mGradients(i) = -fMaxValue; + } + } }; -} +} \ No newline at end of file diff --git a/src/LayerSimpleRNN.cpp b/src/LayerSimpleRNN.cpp index d55ee35a..9683f7f8 100644 --- a/src/LayerSimpleRNN.cpp +++ b/src/LayerSimpleRNN.cpp @@ -7,6 +7,8 @@ */ #include "LayerSimpleRNN.h" + +using namespace std; namespace beednn { /////////////////////////////////////////////////////////////////////////////// @@ -42,15 +44,42 @@ Layer* LayerSimpleRNN::clone() const /////////////////////////////////////////////////////////////////////////////// void LayerSimpleRNN::forward_frame(const MatrixFloat& mIn, MatrixFloat& mOut) { - _h = _h * _whh + mIn * _wxh; - rowWiseAdd(_h, _bh); - _h = tanh(_h); - mOut=_h; + _h = _h * _whh + mIn * _wxh; + rowWiseAdd(_h, _bh); + _h = tanh(_h); + mOut=_h; } /////////////////////////////////////////////////////////////////////////////// void LayerSimpleRNN::backpropagation_frame(const MatrixFloat& mInFrame, const MatrixFloat& mH, const MatrixFloat& mHm1, const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) { - //Todo + // FIXED: Implement proper BPTT backpropagation for SimpleRNN + + // Step 1: Apply tanh derivative + // Forward: h(t) = tanh(u(t)) where u(t) = h(t-1)*Whh + x(t)*Wxh + b + // Backward: d(tanh)/du = 1 - tanh(u)^2 = 1 - h(t)^2 + MatrixFloat mTanhDeriv = oneMinusSquare(mH); + + // Step 2: Combine output gradient with tanh derivative + // dL/du(t) = dL/dh(t) * d(tanh)/du(t) + MatrixFloat mGradU = mGradientOut * mTanhDeriv; + + // Step 3: Compute gradients w.r.t. weights + // dL/dWhh = dL/du(t) * h(t-1)^T + MatrixFloat mGradWhh = mGradU.transpose() * mHm1; + + // dL/dWxh = dL/du(t) * x(t)^T + MatrixFloat mGradWxh = mGradU.transpose() * mInFrame; + + // For now, store Whh gradient as the main weight gradient + // (ideally you'd store both Whh and Wxh separately) + _gradientWeight = mGradWhh * (1.f / mGradU.rows()); + + // Step 4: Backpropagate to previous hidden state + // dL/dh(t-1) = dL/du(t) * dWh^T = mGradU * Whh^T + if (!_bFirstLayer) + { + mGradientIn = mGradU * (_whh.transpose()); + } } ///////////////////////////////////////////////////////////////////////////////////////////// } \ No newline at end of file diff --git a/src/LayerSimplestRNN.cpp b/src/LayerSimplestRNN.cpp index b2b3b01e..b0db265d 100644 --- a/src/LayerSimplestRNN.cpp +++ b/src/LayerSimplestRNN.cpp @@ -47,16 +47,26 @@ void LayerSimplestRNN::forward_frame(const MatrixFloat& mInFrame, MatrixFloat& m /////////////////////////////////////////////////////////////////////////////// void LayerSimplestRNN::backpropagation_frame(const MatrixFloat& mInFrame, const MatrixFloat& mH, const MatrixFloat& mHm1, const MatrixFloat& mGradientOut, MatrixFloat& mGradientIn) { - MatrixFloat mGradU = mH;// oneMinusSquare(mH); // derivative of tanh + // FIXED: Use the input gradient mGradientOut, not mH! + // mGradientOut is dL/dy(t) - the gradient from the loss + // mH is h(t) - the hidden state value + + // Since we're using linear activation (no tanh applied in forward): + // d(output)/d(input) = 1 + // So gradient through this layer = mGradientOut * 1 + + MatrixFloat mGradU = mGradientOut; - //grad(L/_Whh)=grad(L/U))*h(t-1) - _gradientWeight = mGradU.transpose() *mHm1; - _gradientWeight *= (1.f / _gradientWeight.rows()); + // Compute weight gradient: + // dL/dW = dL/dU * dU/dW = mGradU * h(t-1)^T + _gradientWeight = mGradU.transpose() * mHm1; + _gradientWeight *= (1.f / mGradU.rows()); if (!_bFirstLayer) { - //grad(L/h(t-1))=grad(L/U))*whh - mGradientIn = mGradU.transpose()*_weight; + // Backpropagate to previous hidden state: + // dL/dh(t-1) = dL/dU * dU/dh(t-1) = mGradU * W^T + mGradientIn = mGradU * (_weight.transpose()); } } ///////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e09e338b..c61d50bb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -18,7 +18,6 @@ target_link_libraries(test_classification_xor libBeeDNN) add_executable(test_metrics test_metrics.cpp ) target_link_libraries(test_metrics libBeeDNN) - add_test(test_regression_sin test_regression_sin) add_test(test_classification_xor test_classification_xor) add_test(test_metrics test_metrics)