diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index f05dd07..9bbbc02 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -7,7 +7,21 @@ namespace nam
 void Conv1D::set_weights_(std::vector<float>::iterator& weights)
 {
-  if (this->_weight.size() > 0)
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: one weight per channel per kernel tap
+    // Weight layout: for each channel c, for each kernel position k
+    const int channels = this->_channels;
+    const size_t kernel_size = this->_depthwise_weight.size();
+    for (int c = 0; c < channels; c++)
+    {
+      for (size_t k = 0; k < kernel_size; k++)
+      {
+        this->_depthwise_weight[k](c) = *(weights++);
+      }
+    }
+  }
+  else if (this->_weight.size() > 0)
   {
     const long out_channels = this->_weight[0].rows();
     const long in_channels = this->_weight[0].cols();
@@ -53,13 +67,39 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
   }
   this->_num_groups = groups;
-  this->_weight.resize(kernel_size);
-  for (size_t i = 0; i < this->_weight.size(); i++)
+  this->_dilation = _dilation;
+
+  // Check for depthwise convolution: groups == in_channels == out_channels
+  // In this case, each channel is processed independently with a single weight per kernel tap,
+  // so we can use efficient element-wise multiplication instead of matrix multiplication.
+  this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise: store one weight vector per kernel tap
+    this->_channels = in_channels;
+    this->_depthwise_weight.resize(kernel_size);
+    for (int i = 0; i < kernel_size; i++)
+    {
+      this->_depthwise_weight[i].resize(in_channels);
+      this->_depthwise_weight[i].setZero();
+    }
+    this->_weight.clear(); // Not used for depthwise
+  }
+  else
   {
-    this->_weight[i].resize(out_channels,
-                            in_channels); // y = Ax, input array (C,L)
-    this->_weight[i].setZero();
+    // Non-depthwise: store full weight matrices (block-diagonal for grouped convolutions)
+    this->_weight.resize(kernel_size);
+    for (int i = 0; i < kernel_size; i++)
+    {
+      this->_weight[i].resize(out_channels,
+                              in_channels); // y = Ax, input array (C,L)
+      this->_weight[i].setZero();
+    }
+    this->_depthwise_weight.clear(); // Not used for non-depthwise
+    this->_channels = 0;
   }
+
   if (do_bias)
   {
     this->_bias.resize(out_channels);
@@ -67,7 +107,6 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
   }
   else
     this->_bias.resize(0);
-  this->_dilation = _dilation;
 }
 
 void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
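
A note on the weight layout consumed above: the channel-major read order (c outer, k inner) matches a PyTorch depthwise Conv1d weight of shape (channels, 1, kernel_size) flattened row-major, assuming the exporter serializes it that way. A minimal standalone sketch of the same unpacking; the function name is illustrative and not part of the diff:

  #include <Eigen/Dense>
  #include <vector>

  // Unpack a channel-major flat stream (index c * K + k) into per-tap
  // weight vectors, mirroring the depthwise branch of Conv1D::set_weights_.
  std::vector<Eigen::VectorXf> unpack_depthwise(const std::vector<float>& flat, int channels, int kernel_size)
  {
    std::vector<Eigen::VectorXf> taps(kernel_size, Eigen::VectorXf(channels));
    auto it = flat.begin();
    for (int c = 0; c < channels; c++)
      for (int k = 0; k < kernel_size; k++)
        taps[k](c) = *(it++); // storage is tap-major, stream is channel-major
    return taps;
  }
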
@@ -114,18 +153,37 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
   // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
   // For kernel tap k with offset, we need to read from _write_pos + offset
   // The offset is negative (looking back), so _write_pos + offset reads from earlier positions
-  //
-  // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
-  // so we can use a single GEMM for all cases. A more advanced implementation could store
-  // compact per-group weight matrices and loop over groups, but at typical model sizes
-  // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
-  // and the single sparse GEMM approach is faster.
-  for (size_t k = 0; k < this->_weight.size(); k++)
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: use efficient element-wise multiplication
+    // Each channel is processed independently with a single weight per kernel tap.
+    // output[c, t] = sum_k(weight[k, c] * input[c, t - (kernel_size - 1 - k) * dilation])
+    const size_t kernel_size = this->_depthwise_weight.size();
+    for (size_t k = 0; k < kernel_size; k++)
+    {
+      const long offset = this->_dilation * (k + 1 - (long)kernel_size);
+      const long lookback = -offset;
+      auto input_block = _input_buffer.Read(num_frames, lookback);
+      // Element-wise multiply: each row of input_block is scaled by the corresponding weight
+      _output.leftCols(num_frames).noalias() +=
+        this->_depthwise_weight[k].asDiagonal() * input_block.leftCols(num_frames);
+    }
+  }
+  else
   {
-    const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
-    const long lookback = -offset;
-    auto input_block = _input_buffer.Read(num_frames, lookback);
-    _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
+    // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
+    // so we can use a single GEMM for all cases. A more advanced implementation could store
+    // compact per-group weight matrices and loop over groups, but at typical model sizes
+    // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
+    // and the single sparse GEMM approach is faster.
+    for (size_t k = 0; k < this->_weight.size(); k++)
+    {
+      const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
+      const long lookback = -offset;
+      auto input_block = _input_buffer.Read(num_frames, lookback);
+      _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
+    }
   }
 
   // Add bias if present
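
For intuition on the asDiagonal() form used in the depthwise branch above: multiplying by a diagonal matrix is exactly a per-row (per-channel) scale, so it is equivalent to a broadcasted element-wise product and costs O(C*T) instead of the O(C^2*T) of the dense GEMM it replaces. A self-contained Eigen check of that equivalence; all names are local to the example:

  #include <Eigen/Dense>
  #include <cassert>

  int main()
  {
    const int C = 4, T = 8;
    Eigen::VectorXf w = Eigen::VectorXf::Random(C);
    Eigen::MatrixXf X = Eigen::MatrixXf::Random(C, T);

    // Diagonal form, as in the diff: row c of X is scaled by w(c).
    Eigen::MatrixXf a = w.asDiagonal() * X;
    // Equivalent broadcasted element-wise product.
    Eigen::MatrixXf b = (X.array().colwise() * w.array()).matrix();

    assert(a.isApprox(b));
    return 0;
  }
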
@@ -141,18 +199,36 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
                       const long j_start) const
 {
-  // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
-  // so we can use a single GEMM for all cases. A more advanced implementation could store
-  // compact per-group weight matrices and loop over groups, but at typical model sizes
-  // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
-  // and the single sparse GEMM approach is faster.
-  for (size_t k = 0; k < this->_weight.size(); k++)
+  if (this->_is_depthwise)
   {
-    const long offset = this->_dilation * (k + 1 - this->_weight.size());
-    if (k == 0)
-      output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
-    else
-      output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
+    // Depthwise convolution: use efficient element-wise multiplication
+    const size_t kernel_size = this->_depthwise_weight.size();
+    for (size_t k = 0; k < kernel_size; k++)
+    {
+      const long offset = this->_dilation * (k + 1 - (long)kernel_size);
+      if (k == 0)
+        output.middleCols(j_start, ncols).noalias() =
+          this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
+      else
+        output.middleCols(j_start, ncols).noalias() +=
+          this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
+    }
+  }
+  else
+  {
+    // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
+    // so we can use a single GEMM for all cases. A more advanced implementation could store
+    // compact per-group weight matrices and loop over groups, but at typical model sizes
+    // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
+    // and the single sparse GEMM approach is faster.
+    for (size_t k = 0; k < this->_weight.size(); k++)
+    {
+      const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
+      if (k == 0)
+        output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
+      else
+        output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
+    }
   }
   if (this->_bias.size() > 0)
   {
@@ -160,10 +236,36 @@ void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, con
   }
 }
 
+long Conv1D::get_in_channels() const
+{
+  if (this->_is_depthwise)
+    return this->_channels;
+  return this->_weight.size() > 0 ? this->_weight[0].cols() : 0;
+}
+
+long Conv1D::get_out_channels() const
+{
+  if (this->_is_depthwise)
+    return this->_channels;
+  return this->_weight.size() > 0 ? this->_weight[0].rows() : 0;
+}
+
+long Conv1D::get_kernel_size() const
+{
+  if (this->_is_depthwise)
+    return this->_depthwise_weight.size();
+  return this->_weight.size();
+}
+
 long Conv1D::get_num_weights() const
 {
   long num_weights = this->_bias.size();
-  if (this->_weight.size() > 0)
+  if (this->_is_depthwise)
+  {
+    // Depthwise: one weight per channel per kernel tap
+    num_weights += this->_channels * this->_depthwise_weight.size();
+  }
+  else if (this->_weight.size() > 0)
   {
     const long out_channels = this->_weight[0].rows();
     const long in_channels = this->_weight[0].cols();
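
The accounting above gives a quick sanity check for exporters: the depthwise branch consumes channels * kernel_size weights (plus out_channels bias terms when present), versus out_channels * in_channels * kernel_size for an ungrouped dense layer. A hypothetical helper expressing that arithmetic; the grouped dense case is left out because the diff truncates that branch of get_num_weights():

  // Hypothetical helper mirroring Conv1D::get_num_weights() for the two
  // cases fully visible in this diff (depthwise and ungrouped dense).
  long expected_num_weights(long in_channels, long out_channels, long kernel_size, bool depthwise, bool bias)
  {
    long n = depthwise ? in_channels * kernel_size // one weight per channel per tap
                       : out_channels * in_channels * kernel_size; // full matrix per tap
    if (bias)
      n += out_channels;
    return n;
  }
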
diff --git a/NAM/conv1d.h b/NAM/conv1d.h
index 8182966..8f00686 100644
--- a/NAM/conv1d.h
+++ b/NAM/conv1d.h
@@ -95,11 +95,11 @@ class Conv1D
                 const long j_start) const;
   /// \brief Get the number of input channels
   /// \return Number of input channels
-  long get_in_channels() const { return this->_weight.size() > 0 ? this->_weight[0].cols() : 0; };
+  long get_in_channels() const;
 
   /// \brief Get the kernel size
   /// \return Kernel size
-  long get_kernel_size() const { return this->_weight.size(); };
+  long get_kernel_size() const;
 
   /// \brief Get the total number of weights
   /// \return Total number of weight parameters
@@ -107,7 +107,7 @@ class Conv1D
 
   /// \brief Get the number of output channels
   /// \return Number of output channels
-  long get_out_channels() const { return this->_weight.size() > 0 ? this->_weight[0].rows() : 0; };
+  long get_out_channels() const;
 
   /// \brief Get the dilation factor
   /// \return Dilation factor
@@ -118,8 +118,13 @@ class Conv1D
   bool has_bias() const { return this->_bias.size() > 0; };
 
 protected:
-  // conv[kernel](cout, cin)
+  // conv[kernel](cout, cin) - used for non-depthwise convolutions
   std::vector<Eigen::MatrixXf> _weight;
+  // For depthwise convolution (groups == in_channels == out_channels):
+  // stores one weight per channel per kernel tap
+  std::vector<Eigen::VectorXf> _depthwise_weight;
+  bool _is_depthwise = false;
+  int _channels = 0; // Used for depthwise case (in_channels == out_channels)
   Eigen::VectorXf _bias;
   int _dilation;
   int _num_groups;
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index b7f5f3f..05dab09 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -331,9 +331,30 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
   }
   this->_num_groups = groups;
-  this->_weight.resize(out_channels, in_channels);
-  this->_weight.setZero();
   this->_do_bias = _bias;
+
+  // Check for depthwise convolution: groups == in_channels == out_channels
+  // In this case, each channel is processed independently with a single weight,
+  // so we can use efficient element-wise multiplication instead of matrix multiplication.
+  this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise: store one weight per channel
+    this->_channels = in_channels;
+    this->_depthwise_weight.resize(in_channels);
+    this->_depthwise_weight.setZero();
+    // Clear the matrix weight (not used)
+    this->_weight.resize(0, 0);
+  }
+  else
+  {
+    // Non-depthwise: store full weight matrix (block-diagonal for grouped convolutions)
+    this->_weight.resize(out_channels, in_channels);
+    this->_weight.setZero();
+    this->_channels = 0;
+  }
+
   if (_bias)
   {
     this->_bias.resize(out_channels);
@@ -349,7 +370,15 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)
 
 void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
 {
-  if (this->_weight.size() > 0)
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: one weight per channel
+    for (int c = 0; c < this->_channels; c++)
+    {
+      this->_depthwise_weight(c) = *(weights++);
+    }
+  }
+  else if (this->_weight.size() > 0)
   {
     const long out_channels = this->_weight.rows();
     const long in_channels = this->_weight.cols();
@@ -376,10 +405,35 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
     this->_bias(i) = *(weights++);
 }
 
+long nam::Conv1x1::get_out_channels() const
+{
+  if (this->_is_depthwise)
+    return this->_channels;
+  return this->_weight.rows();
+}
+
+long nam::Conv1x1::get_in_channels() const
+{
+  if (this->_is_depthwise)
+    return this->_channels;
+  return this->_weight.cols();
+}
+
 Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
 {
-  // Single GEMM for all cases - block-diagonal zero structure handles grouping
-  Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames);
+  Eigen::MatrixXf result(get_out_channels(), num_frames);
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: efficient element-wise multiplication
+    // Each channel is scaled by its corresponding weight
+    result.noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
+  }
+  else
+  {
+    // Single GEMM for all cases - block-diagonal zero structure handles grouping
+    result.noalias() = this->_weight * input.leftCols(num_frames);
+  }
 
   if (this->_do_bias)
     result.colwise() += this->_bias;
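
The predicate (groups == in_channels && in_channels == out_channels) picks out the degenerate end of grouped convolution: with g groups, the stored weight matrix is block-diagonal with (out/g) x (in/g) blocks, and when every block is 1x1 it collapses to a plain diagonal, which is what the depthwise path exploits. A sketch that assembles such a block-diagonal matrix explicitly; the helper is illustrative, not part of the library:

  #include <Eigen/Dense>
  #include <vector>

  // Assemble the block-diagonal weight matrix that the non-depthwise path
  // stores for a grouped 1x1 convolution. With all blocks 1x1, the result
  // is exactly a diagonal matrix.
  Eigen::MatrixXf block_diagonal(const std::vector<Eigen::MatrixXf>& blocks)
  {
    long rows = 0, cols = 0;
    for (const auto& b : blocks)
    {
      rows += b.rows();
      cols += b.cols();
    }
    Eigen::MatrixXf W = Eigen::MatrixXf::Zero(rows, cols);
    long r = 0, c = 0;
    for (const auto& b : blocks)
    {
      W.block(r, c, b.rows(), b.cols()) = b;
      r += b.rows();
      c += b.cols();
    }
    return W;
  }
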
@@ -391,8 +445,17 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
 {
   assert(num_frames <= _output.cols());
 
-  // Single GEMM for all cases - block-diagonal zero structure handles grouping
-  _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: efficient element-wise multiplication
+    // Each channel is scaled by its corresponding weight
+    _output.leftCols(num_frames).noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
+  }
+  else
+  {
+    // Single GEMM for all cases - block-diagonal zero structure handles grouping
+    _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
+  }
 
   if (this->_do_bias)
     _output.leftCols(num_frames).colwise() += this->_bias;
diff --git a/NAM/dsp.h b/NAM/dsp.h
index 8b984d2..1313ad9 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -323,11 +323,17 @@ class Conv1x1
   /// \param num_frames Number of frames to process
   void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);
 
-  long get_out_channels() const { return this->_weight.rows(); };
-  long get_in_channels() const { return this->_weight.cols(); };
+  long get_out_channels() const;
+  long get_in_channels() const;
 
 protected:
+  // Non-depthwise: full weight matrix (out_channels x in_channels)
   Eigen::MatrixXf _weight;
+  // For depthwise convolution (groups == in_channels == out_channels):
+  // stores one weight per channel
+  Eigen::VectorXf _depthwise_weight;
+  bool _is_depthwise = false;
+  int _channels = 0; // Used for depthwise case (in_channels == out_channels)
   Eigen::VectorXf _bias;
   int _num_groups;
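
One way to validate the new path end to end is to express the same depthwise kernel both ways, as per-tap dense (diagonal) matrices fed to the old GEMM-style loop and as per-channel weight vectors fed to the new element-wise loop, and confirm the outputs agree. A self-contained sketch of that check using only Eigen; it mimics the tap/offset indexing from Conv1D::Process without instantiating the NAM classes:

  #include <Eigen/Dense>
  #include <cassert>
  #include <vector>

  int main()
  {
    const int C = 4, K = 3, T = 16, dilation = 2;
    std::vector<Eigen::VectorXf> taps(K);
    for (int k = 0; k < K; k++)
      taps[k] = Eigen::VectorXf::Random(C);

    // Input with enough left-context for the largest lookback.
    const int pad = dilation * (K - 1);
    Eigen::MatrixXf x = Eigen::MatrixXf::Random(C, T + pad);

    Eigen::MatrixXf dense = Eigen::MatrixXf::Zero(C, T);
    Eigen::MatrixXf depthwise = Eigen::MatrixXf::Zero(C, T);
    for (int k = 0; k < K; k++)
    {
      const int offset = dilation * (k + 1 - K); // <= 0, same tap indexing as the diff
      // Old-style path: a per-tap dense matrix (here diagonal) times the shifted block.
      Eigen::MatrixXf W = taps[k].asDiagonal();
      dense += W * x.middleCols(pad + offset, T);
      // New path: per-channel scaling of the same shifted block.
      depthwise += taps[k].asDiagonal() * x.middleCols(pad + offset, T);
    }

    assert(dense.isApprox(depthwise));
    return 0;
  }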