From c20fb86f3dc48706a8544b16abe15dbcd74ce677 Mon Sep 17 00:00:00 2001
From: Steven Atkinson <steven@atkinson.mn>
Date: Wed, 28 Jan 2026 23:23:28 -0800
Subject: [PATCH 1/5] Zero out conv weight matrices after resize

---
 NAM/conv1d.cpp | 6 ++++++
 NAM/dsp.cpp    | 5 ++++-
 2 files changed, 10 insertions(+), 1 deletion(-)
diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index 6e1835b..4febf91 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -55,10 +55,16 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
   this->_num_groups = groups;
   this->_weight.resize(kernel_size);
   for (size_t i = 0; i < this->_weight.size(); i++)
+  {
     this->_weight[i].resize(out_channels,
                             in_channels); // y = Ax, input array (C,L)
+    this->_weight[i].setZero();
+  }
   if (do_bias)
+  {
     this->_bias.resize(out_channels);
+    this->_bias.setZero();
+  }
   else
     this->_bias.resize(0);
   this->_dilation = _dilation;
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index 02a4a13..69ef330 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -332,9 +332,13 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
 
   this->_num_groups = groups;
   this->_weight.resize(out_channels, in_channels);
+  this->_weight.setZero();
   this->_do_bias = _bias;
   if (_bias)
+  {
     this->_bias.resize(out_channels);
+    this->_bias.setZero();
+  }
 }
 
 
@@ -435,7 +439,6 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
   else
   {
     // Grouped convolution: process each group separately
-    _output.leftCols(num_frames).setZero();
     for (int g = 0; g < numGroups; g++)
     {
       // Extract input slice for this group

From 546f820929d30e804f638156148a4ad21037fd95 Mon Sep 17 00:00:00 2001
From: Steven Atkinson <steven@atkinson.mn>
Date: Wed, 28 Jan 2026 23:31:28 -0800
Subject: [PATCH 2/5] Improve speed of small grouped convolutions with single
 GEMM

---
 NAM/dsp.cpp | 76 +++++------------------------------------------------
 1 file changed, 6 insertions(+), 70 deletions(-)

diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index 69ef330..b7f5f3f 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -378,45 +378,11 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
 
 Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
 {
-  const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
-  const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
+  // Single GEMM for all cases - block-diagonal zero structure handles grouping
+  Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames);
 
-  Eigen::MatrixXf result(out_channels, num_frames);
-
-  if (numGroups == 1)
-  {
-    // Standard convolution (no grouping)
-    if (this->_do_bias)
-      result = (this->_weight * input.leftCols(num_frames)).colwise() + this->_bias;
-    else
-      result = this->_weight * input.leftCols(num_frames);
-  }
-  else
-  {
-    // Grouped convolution: process each group separately
-    result.setZero();
-    for (int g = 0; g < numGroups; g++)
-    {
-      // Extract input slice for this group
-      auto input_group = input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
-
-      // Extract weight slice for this group
-      auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
-
-      // Extract output slice for this group
-      auto output_group = result.middleRows(g * out_per_group, out_per_group);
-
-      // Perform grouped convolution: output_group = weight_group * input_group
-      output_group.noalias() = weight_group * input_group;
-    }
-
-    // Add bias if present
-    if (this->_do_bias)
-      result.colwise() += this->_bias;
-  }
+  if (this->_do_bias)
+    result.colwise() += this->_bias;
 
   return result;
 }
@@ -425,39 +391,9 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
 {
   assert(num_frames <= _output.cols());
 
-  const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
-  const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
+  // Single GEMM for all cases - block-diagonal zero structure handles grouping
+  _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
 
-  if (numGroups == 1)
-  {
-    // Standard convolution (no grouping)
-    _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
-  }
-  else
-  {
-    // Grouped convolution: process each group separately
-    for (int g = 0; g < numGroups; g++)
-    {
-      // Extract input slice for this group
-      auto input_group = input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
-
-      // Extract weight slice for this group
-      auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
-
-      // Extract output slice for this group
-      auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group);
-
-      // Perform grouped convolution: output_group = weight_group * input_group
-      output_group.noalias() = weight_group * input_group;
-    }
-  }
-
-  // Add bias if present
   if (this->_do_bias)
-  {
     _output.leftCols(num_frames).colwise() += this->_bias;
-  }
 }

From e78e1917f901178e71ed0b08f7e6a7d4ddfd550d Mon Sep 17 00:00:00 2001
From: Steven Atkinson <steven@atkinson.mn>
Date: Wed, 28 Jan 2026 23:41:45 -0800
Subject: [PATCH 3/5] Implement std::vector grouped_weights

---
 NAM/dsp.cpp | 120 +++++++++++++++++++++++++++++++++++++++++++---------
 NAM/dsp.h   |  10 ++++-
 2 files changed, 107 insertions(+), 23 deletions(-)

diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index b7f5f3f..0d35138 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -317,6 +317,10 @@ std::unique_ptr<nam::DSP> nam::linear::Factory(const nlohmann::json& config, std
 // Conv1x1 ====================================================================
 
 nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool _bias, const int groups)
+: _num_groups(groups)
+, _in_channels(in_channels)
+, _out_channels(out_channels)
+, _do_bias(_bias)
 {
   // Validate that channels divide evenly by groups
   if (in_channels % groups != 0)
@@ -330,10 +334,25 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
                              + std::to_string(groups) + ")");
   }
 
-  this->_num_groups = groups;
-  this->_weight.resize(out_channels, in_channels);
-  this->_weight.setZero();
-  this->_do_bias = _bias;
+  if (groups == 1)
+  {
+    // Single group: use single weight matrix
+    this->_weight.resize(out_channels, in_channels);
+    this->_weight.setZero();
+  }
+  else
+  {
+    // Multiple groups: use per-group weight matrices (more memory efficient)
+    const int out_per_group = out_channels / groups;
+    const int in_per_group = in_channels / groups;
+    this->_group_weights.resize(groups);
+    for (int g = 0; g < groups; g++)
+    {
+      this->_group_weights[g].resize(out_per_group, in_per_group);
+      this->_group_weights[g].setZero();
+    }
+  }
+
   if (_bias)
   {
     this->_bias.resize(out_channels);
@@ -349,37 +368,79 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)
 
 void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
 {
-  if (this->_weight.size() > 0)
+  const int numGroups = this->_num_groups;
+
+  if (numGroups == 1)
   {
-    const long out_channels = this->_weight.rows();
-    const long in_channels = this->_weight.cols();
-    const int numGroups = this->_num_groups;
-    const long out_per_group = out_channels / numGroups;
-    const long in_per_group = in_channels / numGroups;
-
-    // For grouped convolutions, weights are organized per group
-    // Weight layout: weights are [group0, group1, ..., groupN-1]
-    // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups)
+    // Single group: populate the single weight matrix
+    if (this->_weight.size() > 0)
+    {
+      const long out_channels = this->_weight.rows();
+      const long in_channels = this->_weight.cols();
+      for (long i = 0; i < out_channels; i++)
+      {
+        for (long j = 0; j < in_channels; j++)
+        {
+          this->_weight(i, j) = *(weights++);
+        }
+      }
+    }
+  }
+  else
+  {
+    // Multiple groups: populate per-group weight matrices
+    const long out_per_group = this->_out_channels / numGroups;
+    const long in_per_group = this->_in_channels / numGroups;
+
     for (int g = 0; g < numGroups; g++)
     {
-      for (auto i = 0; i < out_per_group; i++)
+      for (long i = 0; i < out_per_group; i++)
       {
-        for (auto j = 0; j < in_per_group; j++)
+        for (long j = 0; j < in_per_group; j++)
         {
-          this->_weight(g * out_per_group + i, g * in_per_group + j) = *(weights++);
+          this->_group_weights[g](i, j) = *(weights++);
         }
       }
     }
   }
+
   if (this->_do_bias)
     for (int i = 0; i < this->_bias.size(); i++)
       this->_bias(i) = *(weights++);
 }
 
+long nam::Conv1x1::get_out_channels() const
+{
+  return this->_out_channels;
+}
+
+long nam::Conv1x1::get_in_channels() const
+{
+  return this->_in_channels;
+}
+
 Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
 {
-  // Single GEMM for all cases - block-diagonal zero structure handles grouping
-  Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames);
+  const int numGroups = this->_num_groups;
+  Eigen::MatrixXf result(this->_out_channels, num_frames);
+
+  if (numGroups == 1)
+  {
+    // Single GEMM for non-grouped case
+    result.noalias() = this->_weight * input.leftCols(num_frames);
+  }
+  else
+  {
+    // Grouped convolution: process each group with compact weight matrices
+    const long in_per_group = this->_in_channels / numGroups;
+    const long out_per_group = this->_out_channels / numGroups;
+
+    for (int g = 0; g < numGroups; g++)
+    {
+      result.middleRows(g * out_per_group, out_per_group).noalias() =
+        this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
+    }
+  }
 
   if (this->_do_bias)
     result.colwise() += this->_bias;
@@ -391,8 +452,25 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
 {
   assert(num_frames <= _output.cols());
 
-  // Single GEMM for all cases - block-diagonal zero structure handles grouping
-  _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
+  const int numGroups = this->_num_groups;
+
+  if (numGroups == 1)
+  {
+    // Single GEMM for non-grouped case
+    _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
+  }
+  else
+  {
+    // Grouped convolution: process each group with compact weight matrices
+    const long in_per_group = this->_in_channels / numGroups;
+    const long out_per_group = this->_out_channels / numGroups;
+
+    for (int g = 0; g < numGroups; g++)
+    {
+      _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group).noalias() =
+        this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
+    }
+  }
 
   if (this->_do_bias)
     _output.leftCols(num_frames).colwise() += this->_bias;
diff --git a/NAM/dsp.h b/NAM/dsp.h
index 8b984d2..7ada6a0 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -323,13 +323,19 @@ class Conv1x1
   /// \param num_frames Number of frames to process
   void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);
 
-  long get_out_channels() const { return this->_weight.rows(); };
-  long get_in_channels() const { return this->_weight.cols(); };
+  long get_out_channels() const;
+  long get_in_channels() const;
 
 protected:
+  // For groups == 1: single weight matrix (out_channels x in_channels)
+  // For groups > 1: empty (use _group_weights instead)
   Eigen::MatrixXf _weight;
+  // For groups > 1: vector of per-group weight matrices, each (out_per_group x in_per_group)
+  std::vector<Eigen::MatrixXf> _group_weights;
   Eigen::VectorXf _bias;
   int _num_groups;
+  int _in_channels;
+  int _out_channels;
 
 private:
   Eigen::MatrixXf _output;

From e3be2552d559cc2a520999b307d8b17e023ab4da Mon Sep 17 00:00:00 2001
From: Steven Atkinson <steven@atkinson.mn>
Date: Wed, 28 Jan 2026 23:46:36 -0800
Subject: [PATCH 4/5] Revert "Implement std::vector grouped_weights"

This reverts commit e78e1917f901178e71ed0b08f7e6a7d4ddfd550d.
---
 NAM/dsp.cpp | 120 +++++++++-------------------------------------------
 NAM/dsp.h   |  10 +----
 2 files changed, 23 insertions(+), 107 deletions(-)

diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index 0d35138..b7f5f3f 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -317,10 +317,6 @@ std::unique_ptr<nam::DSP> nam::linear::Factory(const nlohmann::json& config, std
 // Conv1x1 ====================================================================
 
 nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool _bias, const int groups)
-: _num_groups(groups)
-, _in_channels(in_channels)
-, _out_channels(out_channels)
-, _do_bias(_bias)
 {
   // Validate that channels divide evenly by groups
   if (in_channels % groups != 0)
@@ -334,25 +330,10 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
                              + std::to_string(groups) + ")");
   }
 
-  if (groups == 1)
-  {
-    // Single group: use single weight matrix
-    this->_weight.resize(out_channels, in_channels);
-    this->_weight.setZero();
-  }
-  else
-  {
-    // Multiple groups: use per-group weight matrices (more memory efficient)
-    const int out_per_group = out_channels / groups;
-    const int in_per_group = in_channels / groups;
-    this->_group_weights.resize(groups);
-    for (int g = 0; g < groups; g++)
-    {
-      this->_group_weights[g].resize(out_per_group, in_per_group);
-      this->_group_weights[g].setZero();
-    }
-  }
-
+  this->_num_groups = groups;
+  this->_weight.resize(out_channels, in_channels);
+  this->_weight.setZero();
+  this->_do_bias = _bias;
   if (_bias)
   {
     this->_bias.resize(out_channels);
@@ -368,79 +349,37 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)
 
 void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
 {
-  const int numGroups = this->_num_groups;
-
-  if (numGroups == 1)
+  if (this->_weight.size() > 0)
   {
-    // Single group: populate the single weight matrix
-    if (this->_weight.size() > 0)
-    {
-      const long out_channels = this->_weight.rows();
-      const long in_channels = this->_weight.cols();
-      for (long i = 0; i < out_channels; i++)
-      {
-        for (long j = 0; j < in_channels; j++)
-        {
-          this->_weight(i, j) = *(weights++);
-        }
-      }
-    }
-  }
-  else
-  {
-    // Multiple groups: populate per-group weight matrices
-    const long out_per_group = this->_out_channels / numGroups;
-    const long in_per_group = this->_in_channels / numGroups;
-
+    const long out_channels = this->_weight.rows();
+    const long in_channels = this->_weight.cols();
+    const int numGroups = this->_num_groups;
+    const long out_per_group = out_channels / numGroups;
+    const long in_per_group = in_channels / numGroups;
+
+    // For grouped convolutions, weights are organized per group
+    // Weight layout: weights are [group0, group1, ..., groupN-1]
+    // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups)
     for (int g = 0; g < numGroups; g++)
     {
-      for (long i = 0; i < out_per_group; i++)
+      for (auto i = 0; i < out_per_group; i++)
       {
-        for (long j = 0; j < in_per_group; j++)
+        for (auto j = 0; j < in_per_group; j++)
         {
-          this->_group_weights[g](i, j) = *(weights++);
+          this->_weight(g * out_per_group + i, g * in_per_group + j) = *(weights++);
         }
       }
     }
   }
-
   if (this->_do_bias)
     for (int i = 0; i < this->_bias.size(); i++)
       this->_bias(i) = *(weights++);
 }
 
-long nam::Conv1x1::get_out_channels() const
-{
-  return this->_out_channels;
-}
-
-long nam::Conv1x1::get_in_channels() const
-{
-  return this->_in_channels;
-}
-
 Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
 {
-  const int numGroups = this->_num_groups;
-  Eigen::MatrixXf result(this->_out_channels, num_frames);
-
-  if (numGroups == 1)
-  {
-    // Single GEMM for non-grouped case
-    result.noalias() = this->_weight * input.leftCols(num_frames);
-  }
-  else
-  {
-    // Grouped convolution: process each group with compact weight matrices
-    const long in_per_group = this->_in_channels / numGroups;
-    const long out_per_group = this->_out_channels / numGroups;
-
-    for (int g = 0; g < numGroups; g++)
-    {
-      result.middleRows(g * out_per_group, out_per_group).noalias() =
-        this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
-    }
-  }
+  // Single GEMM for all cases - block-diagonal zero structure handles grouping
+  Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames);
 
   if (this->_do_bias)
     result.colwise() += this->_bias;
@@ -452,25 +391,8 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
 {
   assert(num_frames <= _output.cols());
 
-  const int numGroups = this->_num_groups;
-
-  if (numGroups == 1)
-  {
-    // Single GEMM for non-grouped case
-    _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
-  }
-  else
-  {
-    // Grouped convolution: process each group with compact weight matrices
-    const long in_per_group = this->_in_channels / numGroups;
-    const long out_per_group = this->_out_channels / numGroups;
-
-    for (int g = 0; g < numGroups; g++)
-    {
-      _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group).noalias() =
-        this->_group_weights[g] * input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
-    }
-  }
+  // Single GEMM for all cases - block-diagonal zero structure handles grouping
+  _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
 
   if (this->_do_bias)
     _output.leftCols(num_frames).colwise() += this->_bias;
diff --git a/NAM/dsp.h b/NAM/dsp.h
index 7ada6a0..8b984d2 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -323,19 +323,13 @@ class Conv1x1
   /// \param num_frames Number of frames to process
   void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);
 
-  long get_out_channels() const;
-  long get_in_channels() const;
+  long get_out_channels() const { return this->_weight.rows(); };
+  long get_in_channels() const { return this->_weight.cols(); };
 
 protected:
-  // For groups == 1: single weight matrix (out_channels x in_channels)
-  // For groups > 1: empty (use _group_weights instead)
   Eigen::MatrixXf _weight;
-  // For groups > 1: vector of per-group weight matrices, each (out_per_group x in_per_group)
-  std::vector<Eigen::MatrixXf> _group_weights;
   Eigen::VectorXf _bias;
   int _num_groups;
-  int _in_channels;
-  int _out_channels;
 
 private:
   Eigen::MatrixXf _output;

From 2ad9decd15b3d38e45b7047d3429b4e16660b2e2 Mon Sep 17 00:00:00 2001
From: Steven Atkinson <steven@atkinson.mn>
Date: Wed, 28 Jan 2026 23:56:35 -0800
Subject: [PATCH 5/5] Improve grouped convolutions for Conv1D by...ignoring
 them for now.

---
 NAM/conv1d.cpp | 107 ++++++++++---------------------------------------
 1 file changed, 22 insertions(+), 85 deletions(-)

diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index 4febf91..f05dd07 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -110,54 +110,22 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
   // Zero output before processing
   _output.leftCols(num_frames).setZero();
 
-  const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
-  const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
-
   // Process from ring buffer with dilation lookback
   // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
   // For kernel tap k with offset, we need to read from _write_pos + offset
   // The offset is negative (looking back), so _write_pos + offset reads from earlier positions
-  // The original process_() reads: input.middleCols(i_start + offset, ncols)
-  // where i_start is the current position and offset is negative for lookback
-
-  if (numGroups == 1)
-  {
-    // Standard convolution (no grouping)
-    for (size_t k = 0; k < this->_weight.size(); k++)
-    {
-      const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
-      const long lookback = -offset;
-      auto input_block = _input_buffer.Read(num_frames, lookback);
-      _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
-    }
-  }
-  else
+  //
+  // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
+  // so we can use a single GEMM for all cases. A more advanced implementation could store
+  // compact per-group weight matrices and loop over groups, but at typical model sizes
+  // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
+  // and one dense GEMM over the zero-padded block-diagonal matrix is faster.
+  for (size_t k = 0; k < this->_weight.size(); k++)
   {
-    // Grouped convolution: process each group separately
-    for (int g = 0; g < numGroups; g++)
-    {
-      for (size_t k = 0; k < this->_weight.size(); k++)
-      {
-        const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
-        const long lookback = -offset;
-        auto input_block = _input_buffer.Read(num_frames, lookback);
-
-        // Extract input slice for this group
-        auto input_group = input_block.middleRows(g * in_per_group, in_per_group);
-
-        // Extract weight slice for this group
-        auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
-
-        // Extract output slice for this group
-        auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group);
-
-        // Perform grouped convolution: output_group += weight_group * input_group
-        output_group.noalias() += weight_group * input_group;
-      }
-    }
+    const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
+    const long lookback = -offset;
+    auto input_block = _input_buffer.Read(num_frames, lookback);
+    _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
   }
 
   // Add bias if present
@@ -173,49 +141,18 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
                       const long j_start) const
 {
-  const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
-  const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
-
-  if (numGroups == 1)
+  // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
+  // so we can use a single GEMM for all cases. A more advanced implementation could store
+  // compact per-group weight matrices and loop over groups, but at typical model sizes
+  // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
+  // and one dense GEMM over the zero-padded block-diagonal matrix is faster.
+  for (size_t k = 0; k < this->_weight.size(); k++)
   {
-    // Standard convolution (no grouping)
-    for (size_t k = 0; k < this->_weight.size(); k++)
-    {
-      const long offset = this->_dilation * (k + 1 - this->_weight.size());
-      if (k == 0)
-        output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
-      else
-        output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
-    }
-  }
-  else
-  {
-    // Grouped convolution: process each group separately
-    for (int g = 0; g < numGroups; g++)
-    {
-      for (size_t k = 0; k < this->_weight.size(); k++)
-      {
-        const long offset = this->_dilation * (k + 1 - this->_weight.size());
-
-        // Extract input slice for this group
-        auto input_group = input.middleCols(i_start + offset, ncols).middleRows(g * in_per_group, in_per_group);
-
-        // Extract weight slice for this group
-        auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
-
-        // Extract output slice for this group
-        auto output_group = output.middleCols(j_start, ncols).middleRows(g * out_per_group, out_per_group);
-
-        // Perform grouped convolution
-        if (k == 0)
-          output_group.noalias() = weight_group * input_group;
-        else
-          output_group.noalias() += weight_group * input_group;
-      }
-    }
+    const long offset = this->_dilation * (k + 1 - this->_weight.size());
+    if (k == 0)
+      output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
+    else
+      output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
   }
   if (this->_bias.size() > 0)
   {