From f247953df0b77f7d4d0e27d7a29c91e6809d1a8c Mon Sep 17 00:00:00 2001
From: Steven Atkinson <steven@atkinson.mn>
Date: Thu, 29 Jan 2026 00:23:14 -0800
Subject: [PATCH] Depthwise convolution implementation

Squashed commit of the following:

commit 79e9f31415cde3ec1430229121751429eb7eff25
Merge: 4d1fd5d 12f93a2
Author: Steven Atkinson <steven@atkinson.mn>
Date:   Thu Jan 29 00:22:38 2026 -0800

    Merge branch 'main' into 215-group-2

commit 4d1fd5d95cfb2b4c02ab542b624a8c98811a40a3
Author: Steven Atkinson <steven@atkinson.mn>
Date:   Thu Jan 29 00:17:36 2026 -0800

    Enhance Conv1x1 and Conv1D classes to support depthwise convolutions. Introduced logic to differentiate between depthwise and non-depthwise configurations, optimizing weight storage and processing methods accordingly. Updated weight setting and processing functions to handle depthwise operations efficiently, ensuring correct handling of input channels and weights.

commit 2ad9decd15b3d38e45b7047d3429b4e16660b2e2
Author: Steven Atkinson <steven@atkinson.mn>
Date:   Wed Jan 28 23:56:35 2026 -0800

    Improve grouped convolutions for Conv1D by...ignoring them for now.

commit e3be2552d559cc2a520999b307d8b17e023ab4da
Author: Steven Atkinson <steven@atkinson.mn>
Date:   Wed Jan 28 23:46:36 2026 -0800

    Revert "Implement std::vector grouped_weights"

    This reverts commit e78e1917f901178e71ed0b08f7e6a7d4ddfd550d.

commit e78e1917f901178e71ed0b08f7e6a7d4ddfd550d
Author: Steven Atkinson <steven@atkinson.mn>
Date:   Wed Jan 28 23:41:45 2026 -0800

    Implement std::vector grouped_weights

commit 546f820929d30e804f638156148a4ad21037fd95
Author: Steven Atkinson <steven@atkinson.mn>
Date:   Wed Jan 28 23:31:28 2026 -0800

    Improve speed of small grouped convolutions with single GEMM

commit c20fb86f3dc48706a8544b16abe15dbcd74ce677
Author: Steven Atkinson <steven@atkinson.mn>
Date:   Wed Jan 28 23:23:28 2026 -0800

    Zero out conv weight matrices after resize
---
 NAM/conv1d.cpp | 162 ++++++++++++++++++++++++++++++++++++++++---------
 NAM/conv1d.h   |  13 ++--
 NAM/dsp.cpp    |  77 ++++++++++++++++++++---
 NAM/dsp.h      |  10 ++-
 4 files changed, 219 insertions(+), 43 deletions(-)
diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index f05dd07..9bbbc02 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -7,7 +7,21 @@ namespace nam
 
 void Conv1D::set_weights_(std::vector<float>::iterator& weights)
 {
-  if (this->_weight.size() > 0)
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: one weight per channel per kernel tap
+    // Weight layout: for each channel c, for each kernel position k
+    const int channels = this->_channels;
+    const size_t kernel_size = this->_depthwise_weight.size();
+    for (int c = 0; c < channels; c++)
+    {
+      for (size_t k = 0; k < kernel_size; k++)
+      {
+        this->_depthwise_weight[k](c) = *(weights++);
+      }
+    }
+  }
+  else if (this->_weight.size() > 0)
   {
     const long out_channels = this->_weight[0].rows();
     const long in_channels = this->_weight[0].cols();
@@ -53,13 +67,39 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
   }
 
   this->_num_groups = groups;
-  this->_weight.resize(kernel_size);
-  for (size_t i = 0; i < this->_weight.size(); i++)
+  this->_dilation = _dilation;
+
+  // Check for depthwise convolution: groups == in_channels == out_channels
+  // In this case, each channel is processed independently with a single weight per kernel tap,
+  // so we can use efficient element-wise multiplication instead of matrix multiplication.
+  this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise: store one weight vector per kernel tap
+    this->_channels = in_channels;
+    this->_depthwise_weight.resize(kernel_size);
+    for (int i = 0; i < kernel_size; i++)
+    {
+      this->_depthwise_weight[i].resize(in_channels);
+      this->_depthwise_weight[i].setZero();
+    }
+    this->_weight.clear(); // Not used for depthwise
+  }
+  else
   {
-    this->_weight[i].resize(out_channels,
-                            in_channels); // y = Ax, input array (C,L)
-    this->_weight[i].setZero();
+    // Non-depthwise: store full weight matrices (block-diagonal for grouped convolutions)
+    this->_weight.resize(kernel_size);
+    for (int i = 0; i < kernel_size; i++)
+    {
+      this->_weight[i].resize(out_channels,
+                              in_channels); // y = Ax, input array (C,L)
+      this->_weight[i].setZero();
+    }
+    this->_depthwise_weight.clear(); // Not used for non-depthwise
+    this->_channels = 0;
   }
+
   if (do_bias)
   {
     this->_bias.resize(out_channels);
@@ -67,7 +107,6 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
   }
   else
     this->_bias.resize(0);
-  this->_dilation = _dilation;
 }
 
 void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
@@ -114,18 +153,37 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
   // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
   // For kernel tap k with offset, we need to read from _write_pos + offset
   // The offset is negative (looking back), so _write_pos + offset reads from earlier positions
-  //
-  // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
-  // so we can use a single GEMM for all cases. A more advanced implementation could store
-  // compact per-group weight matrices and loop over groups, but at typical model sizes
-  // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
-  // and the single sparse GEMM approach is faster.
-  for (size_t k = 0; k < this->_weight.size(); k++)
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: use efficient element-wise multiplication
+    // Each channel is processed independently with a single weight per kernel tap.
+    // output[c, t] = sum_k(weight[k, c] * input[c, t - (kernel_size - 1 - k) * dilation])
+    const size_t kernel_size = this->_depthwise_weight.size();
+    for (size_t k = 0; k < kernel_size; k++)
+    {
+      const long offset = this->_dilation * (k + 1 - (long)kernel_size);
+      const long lookback = -offset;
+      auto input_block = _input_buffer.Read(num_frames, lookback);
+      // Element-wise multiply: each row of input_block is multiplied by corresponding weight
+      _output.leftCols(num_frames).noalias() +=
+        this->_depthwise_weight[k].asDiagonal() * input_block.leftCols(num_frames);
+    }
+  }
+  else
   {
-    const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
-    const long lookback = -offset;
-    auto input_block = _input_buffer.Read(num_frames, lookback);
-    _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
+    // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
+    // so we can use a single GEMM for all cases. A more advanced implementation could store
+    // compact per-group weight matrices and loop over groups, but at typical model sizes
+    // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
+    // and the single sparse GEMM approach is faster.
+    for (size_t k = 0; k < this->_weight.size(); k++)
+    {
+      const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
+      const long lookback = -offset;
+      auto input_block = _input_buffer.Read(num_frames, lookback);
+      _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
+    }
   }
 
   // Add bias if present
@@ -141,18 +199,36 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
                       const long j_start) const
 {
-  // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
-  // so we can use a single GEMM for all cases. A more advanced implementation could store
-  // compact per-group weight matrices and loop over groups, but at typical model sizes
-  // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
-  // and the single sparse GEMM approach is faster.
-  for (size_t k = 0; k < this->_weight.size(); k++)
+  if (this->_is_depthwise)
   {
-    const long offset = this->_dilation * (k + 1 - this->_weight.size());
-    if (k == 0)
-      output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
-    else
-      output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
+    // Depthwise convolution: use efficient element-wise multiplication
+    const size_t kernel_size = this->_depthwise_weight.size();
+    for (size_t k = 0; k < kernel_size; k++)
+    {
+      const long offset = this->_dilation * (k + 1 - (long)kernel_size);
+      if (k == 0)
+        output.middleCols(j_start, ncols).noalias() =
+          this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
+      else
+        output.middleCols(j_start, ncols).noalias() +=
+          this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
+    }
+  }
+  else
+  {
+    // Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
+    // so we can use a single GEMM for all cases. A more advanced implementation could store
+    // compact per-group weight matrices and loop over groups, but at typical model sizes
+    // (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
+    // and the single sparse GEMM approach is faster.
+    for (size_t k = 0; k < this->_weight.size(); k++)
+    {
+      const long offset = this->_dilation * (k + 1 - this->_weight.size());
+      if (k == 0)
+        output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
+      else
+        output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
+    }
   }
   if (this->_bias.size() > 0)
   {
@@ -160,10 +236,36 @@ void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, con
   }
 }
 
+long Conv1D::get_in_channels() const // Number of input channels for either weight layout
+{
+  if (this->_is_depthwise)
+    return this->_channels; // depthwise keeps _weight empty; in_channels == out_channels == _channels
+  return this->_weight.size() > 0 ? this->_weight[0].cols() : 0; // columns of a (cout, cin) tap; 0 if no taps
+}
+
+long Conv1D::get_out_channels() const // Number of output channels for either weight layout
+{
+  if (this->_is_depthwise)
+    return this->_channels; // depthwise keeps _weight empty; in_channels == out_channels == _channels
+  return this->_weight.size() > 0 ? this->_weight[0].rows() : 0; // rows of a (cout, cin) tap; 0 if no taps
+}
+
+long Conv1D::get_kernel_size() const // Number of kernel taps for either weight layout
+{
+  if (this->_is_depthwise)
+    return this->_depthwise_weight.size(); // one per-channel weight vector is stored per tap
+  return this->_weight.size(); // one (cout, cin) matrix is stored per tap
+}
+
 long Conv1D::get_num_weights() const
 {
   long num_weights = this->_bias.size();
-  if (this->_weight.size() > 0)
+  if (this->_is_depthwise)
+  {
+    // Depthwise: one weight per channel per kernel tap
+    num_weights += this->_channels * this->_depthwise_weight.size();
+  }
+  else if (this->_weight.size() > 0)
   {
     const long out_channels = this->_weight[0].rows();
     const long in_channels = this->_weight[0].cols();
diff --git a/NAM/conv1d.h b/NAM/conv1d.h
index 8182966..8f00686 100644
--- a/NAM/conv1d.h
+++ b/NAM/conv1d.h
@@ -95,11 +95,11 @@ class Conv1D
                 const long j_start) const;
   /// \brief Get the number of input channels
   /// \return Number of input channels
-  long get_in_channels() const { return this->_weight.size() > 0 ? this->_weight[0].cols() : 0; };
+  long get_in_channels() const;
 
   /// \brief Get the kernel size
   /// \return Kernel size
-  long get_kernel_size() const { return this->_weight.size(); };
+  long get_kernel_size() const;
 
   /// \brief Get the total number of weights
   /// \return Total number of weight parameters
@@ -107,7 +107,7 @@ class Conv1D
 
   /// \brief Get the number of output channels
   /// \return Number of output channels
-  long get_out_channels() const { return this->_weight.size() > 0 ? this->_weight[0].rows() : 0; };
+  long get_out_channels() const;
 
   /// \brief Get the dilation factor
   /// \return Dilation factor
@@ -118,8 +118,13 @@ class Conv1D
   bool has_bias() const { return this->_bias.size() > 0; };
 
 protected:
-  // conv[kernel](cout, cin)
+  // conv[kernel](cout, cin) - used for non-depthwise convolutions
   std::vector<Eigen::MatrixXf> _weight;
+  // For depthwise convolution (groups == in_channels == out_channels):
+  // stores one weight per channel per kernel tap
+  std::vector<Eigen::VectorXf> _depthwise_weight;
+  bool _is_depthwise = false;
+  int _channels = 0; // Used for depthwise case (in_channels == out_channels)
   Eigen::VectorXf _bias;
   int _dilation;
   int _num_groups;
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index b7f5f3f..05dab09 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -331,9 +331,30 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
   }
 
   this->_num_groups = groups;
-  this->_weight.resize(out_channels, in_channels);
-  this->_weight.setZero();
   this->_do_bias = _bias;
+
+  // Check for depthwise convolution: groups == in_channels == out_channels
+  // In this case, each channel is processed independently with a single weight,
+  // so we can use efficient element-wise multiplication instead of matrix multiplication.
+  this->_is_depthwise = (groups == in_channels && in_channels == out_channels);
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise: store one weight per channel
+    this->_channels = in_channels;
+    this->_depthwise_weight.resize(in_channels);
+    this->_depthwise_weight.setZero();
+    // Clear the matrix weight (not used)
+    this->_weight.resize(0, 0);
+  }
+  else
+  {
+    // Non-depthwise: store full weight matrix (block-diagonal for grouped convolutions)
+    this->_weight.resize(out_channels, in_channels);
+    this->_weight.setZero();
+    this->_channels = 0;
+  }
+
   if (_bias)
   {
     this->_bias.resize(out_channels);
@@ -349,7 +370,15 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)
 
 void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
 {
-  if (this->_weight.size() > 0)
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: one weight per channel
+    for (int c = 0; c < this->_channels; c++)
+    {
+      this->_depthwise_weight(c) = *(weights++);
+    }
+  }
+  else if (this->_weight.size() > 0)
   {
     const long out_channels = this->_weight.rows();
     const long in_channels = this->_weight.cols();
@@ -376,10 +405,35 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
       this->_bias(i) = *(weights++);
 }
 
+long nam::Conv1x1::get_out_channels() const // Number of output channels for either weight layout
+{
+  if (this->_is_depthwise)
+    return this->_channels; // depthwise sizes _weight to 0x0, so the count is kept in _channels
+  return this->_weight.rows(); // _weight is sized (out_channels, in_channels)
+}
+
+long nam::Conv1x1::get_in_channels() const // Number of input channels for either weight layout
+{
+  if (this->_is_depthwise)
+    return this->_channels; // depthwise sizes _weight to 0x0, so the count is kept in _channels
+  return this->_weight.cols(); // _weight is sized (out_channels, in_channels)
+}
+
 Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
 {
-  // Single GEMM for all cases - block-diagonal zero structure handles grouping
-  Eigen::MatrixXf result = this->_weight * input.leftCols(num_frames);
+  Eigen::MatrixXf result(get_out_channels(), num_frames);
+
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: efficient element-wise multiplication
+    // Each channel is scaled by its corresponding weight
+    result.noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
+  }
+  else
+  {
+    // Single GEMM for all non-depthwise cases - block-diagonal zero structure handles grouping
+    result.noalias() = this->_weight * input.leftCols(num_frames);
+  }
 
   if (this->_do_bias)
     result.colwise() += this->_bias;
@@ -391,8 +445,17 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
 {
   assert(num_frames <= _output.cols());
 
-  // Single GEMM for all cases - block-diagonal zero structure handles grouping
-  _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
+  if (this->_is_depthwise)
+  {
+    // Depthwise convolution: efficient element-wise multiplication
+    // Each channel is scaled by its corresponding weight
+    _output.leftCols(num_frames).noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
+  }
+  else
+  {
+    // Single GEMM for all non-depthwise cases - block-diagonal zero structure handles grouping
+    _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
+  }
 
   if (this->_do_bias)
     _output.leftCols(num_frames).colwise() += this->_bias;
diff --git a/NAM/dsp.h b/NAM/dsp.h
index 8b984d2..1313ad9 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -323,11 +323,17 @@ class Conv1x1
   /// \param num_frames Number of frames to process
   void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);
 
-  long get_out_channels() const { return this->_weight.rows(); };
-  long get_in_channels() const { return this->_weight.cols(); };
+  long get_out_channels() const;
+  long get_in_channels() const;
 
 protected:
+  // Non-depthwise: full weight matrix (out_channels x in_channels)
   Eigen::MatrixXf _weight;
+  // For depthwise convolution (groups == in_channels == out_channels):
+  // stores one weight per channel
+  Eigen::VectorXf _depthwise_weight;
+  bool _is_depthwise = false;
+  int _channels = 0; // Used for depthwise case (in_channels == out_channels)
   Eigen::VectorXf _bias;
   int _num_groups;