Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 19 additions & 91 deletions NAM/conv1d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int

this->_num_groups = groups;
this->_weight.resize(kernel_size);
// Initialize weight matrices to zero - critical for block-diagonal structure
// Off-diagonal blocks must be zero for single-matmul grouped convolution
for (size_t i = 0; i < this->_weight.size(); i++)
this->_weight[i].resize(out_channels,
in_channels); // y = Ax, input array (C,L)
this->_weight[i].setZero(out_channels, in_channels);
if (do_bias)
this->_bias.resize(out_channels);
else
Expand Down Expand Up @@ -104,54 +105,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
// Zero output before processing
_output.leftCols(num_frames).setZero();

const int numGroups = this->_num_groups;
const long in_channels = get_in_channels();
const long out_channels = get_out_channels();
const long in_per_group = in_channels / numGroups;
const long out_per_group = out_channels / numGroups;

// Process from ring buffer with dilation lookback
// After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
// For kernel tap k with offset, we need to read from _write_pos + offset
// The offset is negative (looking back), so _write_pos + offset reads from earlier positions
// The original process_() reads: input.middleCols(i_start + offset, ncols)
// where i_start is the current position and offset is negative for lookback

if (numGroups == 1)
{
// Standard convolution (no grouping)
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
}
}
else
// Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs.
// Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result
// as G separate matmuls. This is more efficient because BLAS can optimize larger operations.
for (size_t k = 0; k < this->_weight.size(); k++)
{
// Grouped convolution: process each group separately
for (int g = 0; g < numGroups; g++)
{
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);

// Extract input slice for this group
auto input_group = input_block.middleRows(g * in_per_group, in_per_group);

// Extract weight slice for this group
auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);

// Extract output slice for this group
auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group);

// Perform grouped convolution: output_group += weight_group * input_group
output_group.noalias() += weight_group * input_group;
}
}
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
}

// Add bias if present
Expand All @@ -167,49 +129,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
const long j_start) const
{
const int numGroups = this->_num_groups;
const long in_channels = get_in_channels();
const long out_channels = get_out_channels();
const long in_per_group = in_channels / numGroups;
const long out_per_group = out_channels / numGroups;

if (numGroups == 1)
// Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs.
// Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result.
for (size_t k = 0; k < this->_weight.size(); k++)
{
// Standard convolution (no grouping)
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
}
}
else
{
// Grouped convolution: process each group separately
for (int g = 0; g < numGroups; g++)
{
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - this->_weight.size());

// Extract input slice for this group
auto input_group = input.middleCols(i_start + offset, ncols).middleRows(g * in_per_group, in_per_group);

// Extract weight slice for this group
auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);

// Extract output slice for this group
auto output_group = output.middleCols(j_start, ncols).middleRows(g * out_per_group, out_per_group);

// Perform grouped convolution
if (k == 0)
output_group.noalias() = weight_group * input_group;
else
output_group.noalias() += weight_group * input_group;
}
}
const long offset = this->_dilation * (k + 1 - this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
}
if (this->_bias.size() > 0)
{
Expand Down
53 changes: 53 additions & 0 deletions NAM/conv1d_factory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Conv1D Factory implementation
// Returns dynamic Conv1D wrapped in IConv1D interface

#include "conv1d_factory.h"
#include "conv1d.h"

namespace nam
{

/// \brief Dynamic wrapper for Conv1D implementing IConv1D interface
///
/// This class wraps the existing Conv1D implementation to provide the IConv1D
/// interface for configurations that don't have specialized template instantiations.
class Conv1DDynamicWrapper : public IConv1D
{
public:
Conv1DDynamicWrapper(int in_channels, int out_channels, int kernel_size, int dilation, bool bias, int groups)
{
_conv.set_size_(in_channels, out_channels, kernel_size, bias, dilation, groups);
}

Eigen::MatrixXf& GetOutput() override { return _conv.GetOutput(); }

const Eigen::MatrixXf& GetOutput() const override { return _conv.GetOutput(); }

void SetMaxBufferSize(int maxBufferSize) override { _conv.SetMaxBufferSize(maxBufferSize); }

void set_weights_(std::vector<float>::iterator& weights) override { _conv.set_weights_(weights); }

void Process(const Eigen::MatrixXf& input, int num_frames) override { _conv.Process(input, num_frames); }

long get_out_channels() const override { return _conv.get_out_channels(); }

long get_in_channels() const override { return _conv.get_in_channels(); }

long get_kernel_size() const override { return _conv.get_kernel_size(); }

int get_dilation() const override { return _conv.get_dilation(); }

bool has_bias() const override { return _conv.has_bias(); }

private:
Conv1D _conv;
};

// Factory implementation.
// Always hands back the runtime-sized implementation; callers that need the
// compile-time-optimized variants (e.g. Conv1DFullyFixed) instantiate them
// directly instead of going through this factory.
std::unique_ptr<IConv1D> Conv1DFactory::create(int in_channels, int out_channels, int kernel_size, int dilation,
                                               bool bias, int groups)
{
  // Parameters are forwarded verbatim; the wrapper adapts them to Conv1D.
  return std::make_unique<Conv1DDynamicWrapper>(in_channels, out_channels, kernel_size, dilation, bias, groups);
}

} // namespace nam
33 changes: 33 additions & 0 deletions NAM/conv1d_factory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include <memory>
#include "conv1d_fixed.h"

namespace nam
{

/// \brief Factory for creating Conv1D implementations.
///
/// Returns a dynamic (runtime-configured) Conv1D implementation wrapped in the
/// IConv1D interface. For fully optimized implementations with compile-time
/// known buffer sizes, use Conv1DFullyFixed directly.
class Conv1DFactory
{
public:
  /// \brief Create a Conv1D implementation.
  ///
  /// Returns a dynamic implementation. For maximum performance with known
  /// buffer sizes, use the Conv1DFullyFixed template directly instead of this
  /// factory.
  ///
  /// \param in_channels Number of input channels
  /// \param out_channels Number of output channels
  /// \param kernel_size Size of the convolution kernel
  /// \param dilation Dilation factor for the convolution
  /// \param bias Whether to use bias
  /// \param groups Number of groups for grouped convolution (default: 1)
  /// \return Owning unique pointer to a heap-allocated IConv1D implementation
  static std::unique_ptr<IConv1D> create(int in_channels, int out_channels, int kernel_size, int dilation, bool bias,
                                         int groups = 1);
};

} // namespace nam
Loading