From c20fb86f3dc48706a8544b16abe15dbcd74ce677 Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 28 Jan 2026 23:23:28 -0800 Subject: [PATCH 1/5] Zero out conv weight matrices after resize --- NAM/conv1d.cpp | 6 ++++++ NAM/dsp.cpp | 5 ++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index 6e1835b..4febf91 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -55,10 +55,16 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int this->_num_groups = groups; this->_weight.resize(kernel_size); for (size_t i = 0; i < this->_weight.size(); i++) + { this->_weight[i].resize(out_channels, in_channels); // y = Ax, input array (C,L) + this->_weight[i].setZero(); + } if (do_bias) + { this->_bias.resize(out_channels); + this->_bias.setZero(); + } else this->_bias.resize(0); this->_dilation = _dilation; diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 02a4a13..69ef330 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -332,9 +332,13 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool this->_num_groups = groups; this->_weight.resize(out_channels, in_channels); + this->_weight.setZero(); this->_do_bias = _bias; if (_bias) + { this->_bias.resize(out_channels); + this->_bias.setZero(); + } } @@ -435,7 +439,6 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons else { // Grouped convolution: process each group separately - _output.leftCols(num_frames).setZero(); for (int g = 0; g < numGroups; g++) { // Extract input slice for this group From 546f820929d30e804f638156148a4ad21037fd95 Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 28 Jan 2026 23:31:28 -0800 Subject: [PATCH 2/5] Improve speed of small grouped convolutions with single GEMM --- NAM/dsp.cpp | 76 +++++------------------------------------------------ 1 file changed, 6 insertions(+), 70 deletions(-) diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 69ef330..b7f5f3f 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -378,45 +378,11 @@ void nam::Conv1x1::set_weights_(std::vector::iterator& weights) Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const { - const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); - const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const long out_per_group = out_channels / numGroups; + // Single GEMM for all cases - block-diagonal zero structure handles grouping + Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames); - Eigen::MatrixXf result(out_channels, num_frames); - - if (numGroups == 1) - { - // Standard convolution (no grouping) - if (this->_do_bias) - result = (this->_weight * input.leftCols(num_frames)).colwise() + this->_bias; - else - result = this->_weight * input.leftCols(num_frames); - } - else - { - // Grouped convolution: process each group separately - result.setZero(); - for (int g = 0; g < numGroups; g++) - { - // Extract input slice for this group - auto input_group = input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); - - // Extract weight slice for this group - auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); - - // Extract output slice for this group - auto output_group = result.middleRows(g * out_per_group, out_per_group); - - // Perform grouped convolution: output_group = weight_group * input_group - output_group.noalias() = weight_group * input_group; - } - - // Add bias if present - if (this->_do_bias) - result.colwise() += this->_bias; - } + if (this->_do_bias) + result.colwise() += this->_bias; return result; } @@ -425,39 +391,9 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons { assert(num_frames <= _output.cols()); - const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); - const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const long out_per_group = out_channels / numGroups; + // Single GEMM for all cases - block-diagonal zero structure handles grouping + _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); - if (numGroups == 1) - { - // Standard convolution (no grouping) - _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); - } - else - { - // Grouped convolution: process each group separately - for (int g = 0; g < numGroups; g++) - { - // Extract input slice for this group - auto input_group = input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); - - // Extract weight slice for this group - auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); - - // Extract output slice for this group - auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group); - - // Perform grouped convolution: output_group = weight_group * input_group - output_group.noalias() = weight_group * input_group; - } - } - - // Add bias if present if (this->_do_bias) - { _output.leftCols(num_frames).colwise() += this->_bias; - } } From e78e1917f901178e71ed0b08f7e6a7d4ddfd550d Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 28 Jan 2026 23:41:45 -0800 Subject: [PATCH 3/5] Implement std::vector grouped_weights --- NAM/dsp.cpp | 120 +++++++++++++++++++++++++++++++++++++++++++--------- NAM/dsp.h | 10 ++++- 2 files changed, 107 insertions(+), 23 deletions(-) diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index b7f5f3f..0d35138 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -317,6 +317,10 @@ std::unique_ptr nam::linear::Factory(const nlohmann::json& config, std // Conv1x1 ==================================================================== nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool _bias, const int groups) +: _num_groups(groups) +, _in_channels(in_channels) +, _out_channels(out_channels) +, _do_bias(_bias) { // Validate that channels divide evenly by groups if (in_channels % groups != 0) @@ -330,10 +334,25 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool + std::to_string(groups) + ")"); } - this->_num_groups = groups; - this->_weight.resize(out_channels, in_channels); - this->_weight.setZero(); - this->_do_bias = _bias; + if (groups == 1) + { + // Single group: use single weight matrix + this->_weight.resize(out_channels, in_channels); + this->_weight.setZero(); + } + else + { + // Multiple groups: use per-group weight matrices (more memory efficient) + const int out_per_group = out_channels / groups; + const int in_per_group = in_channels / groups; + this->_group_weights.resize(groups); + for (int g = 0; g < groups; g++) + { + this->_group_weights[g].resize(out_per_group, in_per_group); + this->_group_weights[g].setZero(); + } + } + if (_bias) { this->_bias.resize(out_channels); @@ -349,37 +368,79 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize) void nam::Conv1x1::set_weights_(std::vector::iterator& weights) { - if (this->_weight.size() > 0) + const int numGroups = this->_num_groups; + + if (numGroups == 1) { - const long out_channels = this->_weight.rows(); - const long in_channels = this->_weight.cols(); - const int numGroups = this->_num_groups; - const long out_per_group = out_channels / numGroups; - const long in_per_group = in_channels / numGroups; - - // For grouped convolutions, weights are organized per group - // Weight layout: weights are [group0, group1, ..., groupN-1] - // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups) + // Single group: populate the single weight matrix + if (this->_weight.size() > 0) + { + const long out_channels = this->_weight.rows(); + const long in_channels = this->_weight.cols(); + for (long i = 0; i < out_channels; i++) + { + for (long j = 0; j < in_channels; j++) + { + this->_weight(i, j) = *(weights++); + } + } + } + } + else + { + // Multiple groups: populate per-group weight matrices + const long out_per_group = this->_out_channels / numGroups; + const long in_per_group = this->_in_channels / numGroups; + for (int g = 0; g < numGroups; g++) { - for (auto i = 0; i < out_per_group; i++) + for (long i = 0; i < out_per_group; i++) { - for (auto j = 0; j < in_per_group; j++) + for (long j = 0; j < in_per_group; j++) { - this->_weight(g * out_per_group + i, g * in_per_group + j) = *(weights++); + this->_group_weights[g](i, j) = *(weights++); } } } } + if (this->_do_bias) for (int i = 0; i < this->_bias.size(); i++) this->_bias(i) = *(weights++); } +long nam::Conv1x1::get_out_channels() const +{ + return this->_out_channels; +} + +long nam::Conv1x1::get_in_channels() const +{ + return this->_in_channels; +} + Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const { - // Single GEMM for all cases - block-diagonal zero structure handles grouping - Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames); + const int numGroups = this->_num_groups; + Eigen::MatrixXf result(this->_out_channels, num_frames); + + if (numGroups == 1) + { + // Single GEMM for non-grouped case + result.noalias() = this->_weight * input.leftCols(num_frames); + } + else + { + // Grouped convolution: process each group with compact weight matrices + const long in_per_group = this->_in_channels / numGroups; + const long out_per_group = this->_out_channels / numGroups; + + for (int g = 0; g < numGroups; g++) + { + result.middleRows(g * out_per_group, out_per_group).noalias() = + this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); + } + } if (this->_do_bias) result.colwise() += this->_bias; @@ -391,8 +452,25 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons { assert(num_frames <= _output.cols()); - // Single GEMM for all cases - block-diagonal zero structure handles grouping - _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); + const int numGroups = this->_num_groups; + + if (numGroups == 1) + { + // Single GEMM for non-grouped case + _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); + } + else + { + // Grouped convolution: process each group with compact weight matrices + const long in_per_group = this->_in_channels / numGroups; + const long out_per_group = this->_out_channels / numGroups; + + for (int g = 0; g < numGroups; g++) + { + _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group).noalias() = + this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); + } + } if (this->_do_bias) _output.leftCols(num_frames).colwise() += this->_bias; diff --git a/NAM/dsp.h b/NAM/dsp.h index 8b984d2..7ada6a0 100644 --- a/NAM/dsp.h +++ b/NAM/dsp.h @@ -323,13 +323,19 @@ class Conv1x1 /// \param num_frames Number of frames to process void process_(const Eigen::Ref& input, const int num_frames); - long get_out_channels() const { return this->_weight.rows(); }; - long get_in_channels() const { return this->_weight.cols(); }; + long get_out_channels() const; + long get_in_channels() const; protected: + // For groups == 1: single weight matrix (out_channels x in_channels) + // For groups > 1: empty (use _group_weights instead) Eigen::MatrixXf _weight; + // For groups > 1: vector of per-group weight matrices, each (out_per_group x in_per_group) + std::vector _group_weights; Eigen::VectorXf _bias; int _num_groups; + int _in_channels; + int _out_channels; private: Eigen::MatrixXf _output; From e3be2552d559cc2a520999b307d8b17e023ab4da Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 28 Jan 2026 23:46:36 -0800 Subject: [PATCH 4/5] Revert "Implement std::vector grouped_weights" This reverts commit e78e1917f901178e71ed0b08f7e6a7d4ddfd550d. --- NAM/dsp.cpp | 120 +++++++++------------------------------------------- NAM/dsp.h | 10 +---- 2 files changed, 23 insertions(+), 107 deletions(-) diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 0d35138..b7f5f3f 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -317,10 +317,6 @@ std::unique_ptr nam::linear::Factory(const nlohmann::json& config, std // Conv1x1 ==================================================================== nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool _bias, const int groups) -: _num_groups(groups) -, _in_channels(in_channels) -, _out_channels(out_channels) -, _do_bias(_bias) { // Validate that channels divide evenly by groups if (in_channels % groups != 0) @@ -334,25 +330,10 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool + std::to_string(groups) + ")"); } - if (groups == 1) - { - // Single group: use single weight matrix - this->_weight.resize(out_channels, in_channels); - this->_weight.setZero(); - } - else - { - // Multiple groups: use per-group weight matrices (more memory efficient) - const int out_per_group = out_channels / groups; - const int in_per_group = in_channels / groups; - this->_group_weights.resize(groups); - for (int g = 0; g < groups; g++) - { - this->_group_weights[g].resize(out_per_group, in_per_group); - this->_group_weights[g].setZero(); - } - } - + this->_num_groups = groups; + this->_weight.resize(out_channels, in_channels); + this->_weight.setZero(); + this->_do_bias = _bias; if (_bias) { this->_bias.resize(out_channels); @@ -368,79 +349,37 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize) void nam::Conv1x1::set_weights_(std::vector::iterator& weights) { - const int numGroups = this->_num_groups; - - if (numGroups == 1) + if (this->_weight.size() > 0) { - // Single group: populate the single weight matrix - if (this->_weight.size() > 0) - { - const long out_channels = this->_weight.rows(); - const long in_channels = this->_weight.cols(); - for (long i = 0; i < out_channels; i++) - { - for (long j = 0; j < in_channels; j++) - { - this->_weight(i, j) = *(weights++); - } - } - } - } - else - { - // Multiple groups: populate per-group weight matrices - const long out_per_group = this->_out_channels / numGroups; - const long in_per_group = this->_in_channels / numGroups; - + const long out_channels = this->_weight.rows(); + const long in_channels = this->_weight.cols(); + const int numGroups = this->_num_groups; + const long out_per_group = out_channels / numGroups; + const long in_per_group = in_channels / numGroups; + + // For grouped convolutions, weights are organized per group + // Weight layout: weights are [group0, group1, ..., groupN-1] + // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups) for (int g = 0; g < numGroups; g++) { - for (long i = 0; i < out_per_group; i++) + for (auto i = 0; i < out_per_group; i++) { - for (long j = 0; j < in_per_group; j++) + for (auto j = 0; j < in_per_group; j++) { - this->_group_weights[g](i, j) = *(weights++); + this->_weight(g * out_per_group + i, g * in_per_group + j) = *(weights++); } } } } - if (this->_do_bias) for (int i = 0; i < this->_bias.size(); i++) this->_bias(i) = *(weights++); } -long nam::Conv1x1::get_out_channels() const -{ - return this->_out_channels; -} - -long nam::Conv1x1::get_in_channels() const -{ - return this->_in_channels; -} - Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const { - const int numGroups = this->_num_groups; - Eigen::MatrixXf result(this->_out_channels, num_frames); - - if (numGroups == 1) - { - // Single GEMM for non-grouped case - result.noalias() = this->_weight * input.leftCols(num_frames); - } - else - { - // Grouped convolution: process each group with compact weight matrices - const long in_per_group = this->_in_channels / numGroups; - const long out_per_group = this->_out_channels / numGroups; - - for (int g = 0; g < numGroups; g++) - { - result.middleRows(g * out_per_group, out_per_group).noalias() = - this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); - } - } + // Single GEMM for all cases - block-diagonal zero structure handles grouping + Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames); if (this->_do_bias) result.colwise() += this->_bias; @@ -452,25 +391,8 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons { assert(num_frames <= _output.cols()); - const int numGroups = this->_num_groups; - - if (numGroups == 1) - { - // Single GEMM for non-grouped case - _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); - } - else - { - // Grouped convolution: process each group with compact weight matrices - const long in_per_group = this->_in_channels / numGroups; - const long out_per_group = this->_out_channels / numGroups; - - for (int g = 0; g < numGroups; g++) - { - _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group).noalias() = - this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); - } - } + // Single GEMM for all cases - block-diagonal zero structure handles grouping + _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); if (this->_do_bias) _output.leftCols(num_frames).colwise() += this->_bias; diff --git a/NAM/dsp.h b/NAM/dsp.h index 7ada6a0..8b984d2 100644 --- a/NAM/dsp.h +++ b/NAM/dsp.h @@ -323,19 +323,13 @@ class Conv1x1 /// \param num_frames Number of frames to process void process_(const Eigen::Ref& input, const int num_frames); - long get_out_channels() const; - long get_in_channels() const; + long get_out_channels() const { return this->_weight.rows(); }; + long get_in_channels() const { return this->_weight.cols(); }; protected: - // For groups == 1: single weight matrix (out_channels x in_channels) - // For groups > 1: empty (use _group_weights instead) Eigen::MatrixXf _weight; - // For groups > 1: vector of per-group weight matrices, each (out_per_group x in_per_group) - std::vector _group_weights; Eigen::VectorXf _bias; int _num_groups; - int _in_channels; - int _out_channels; private: Eigen::MatrixXf _output; From 2ad9decd15b3d38e45b7047d3429b4e16660b2e2 Mon Sep 17 00:00:00 2001 From: Steven Atkinson Date: Wed, 28 Jan 2026 23:56:35 -0800 Subject: [PATCH 5/5] Improve grouped convolutions for Conv1D by...ignoring them for now. --- NAM/conv1d.cpp | 107 ++++++++++--------------------------------------- 1 file changed, 22 insertions(+), 85 deletions(-) diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index 4febf91..f05dd07 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -110,54 +110,22 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) // Zero output before processing _output.leftCols(num_frames).setZero(); - const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); - const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const long out_per_group = out_channels / numGroups; - // Process from ring buffer with dilation lookback // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1] // For kernel tap k with offset, we need to read from _write_pos + offset // The offset is negative (looking back), so _write_pos + offset reads from earlier positions - // The original process_() reads: input.middleCols(i_start + offset, ncols) - // where i_start is the current position and offset is negative for lookback - - if (numGroups == 1) - { - // Standard convolution (no grouping) - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - (long)this->_weight.size()); - const long lookback = -offset; - auto input_block = _input_buffer.Read(num_frames, lookback); - _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block; - } - } - else + // + // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal), + // so we can use a single GEMM for all cases. A more advanced implementation could store + // compact per-group weight matrices and loop over groups, but at typical model sizes + // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate + // and the single sparse GEMM approach is faster. + for (size_t k = 0; k < this->_weight.size(); k++) { - // Grouped convolution: process each group separately - for (int g = 0; g < numGroups; g++) - { - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - (long)this->_weight.size()); - const long lookback = -offset; - auto input_block = _input_buffer.Read(num_frames, lookback); - - // Extract input slice for this group - auto input_group = input_block.middleRows(g * in_per_group, in_per_group); - - // Extract weight slice for this group - auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); - - // Extract output slice for this group - auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group); - - // Perform grouped convolution: output_group += weight_group * input_group - output_group.noalias() += weight_group * input_group; - } - } + const long offset = this->_dilation * (k + 1 - (long)this->_weight.size()); + const long lookback = -offset; + auto input_block = _input_buffer.Read(num_frames, lookback); + _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block; } // Add bias if present @@ -173,49 +141,18 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols, const long j_start) const { - const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); - const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const long out_per_group = out_channels / numGroups; - - if (numGroups == 1) + // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal), + // so we can use a single GEMM for all cases. A more advanced implementation could store + // compact per-group weight matrices and loop over groups, but at typical model sizes + // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate + // and the single sparse GEMM approach is faster. + for (size_t k = 0; k < this->_weight.size(); k++) { - // Standard convolution (no grouping) - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - this->_weight.size()); - if (k == 0) - output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols); - else - output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols); - } - } - else - { - // Grouped convolution: process each group separately - for (int g = 0; g < numGroups; g++) - { - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - this->_weight.size()); - - // Extract input slice for this group - auto input_group = input.middleCols(i_start + offset, ncols).middleRows(g * in_per_group, in_per_group); - - // Extract weight slice for this group - auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); - - // Extract output slice for this group - auto output_group = output.middleCols(j_start, ncols).middleRows(g * out_per_group, out_per_group); - - // Perform grouped convolution - if (k == 0) - output_group.noalias() = weight_group * input_group; - else - output_group.noalias() += weight_group * input_group; - } - } + const long offset = this->_dilation * (k + 1 - this->_weight.size()); + if (k == 0) + output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols); + else + output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols); } if (this->_bias.size() > 0) {