162 changes: 132 additions & 30 deletions NAM/conv1d.cpp
@@ -7,7 +7,21 @@ namespace nam

void Conv1D::set_weights_(std::vector<float>::iterator& weights)
{
if (this->_is_depthwise)
{
// Depthwise convolution: one weight per channel per kernel tap
// Weight layout: for each channel c, for each kernel position k
const int channels = this->_channels;
const size_t kernel_size = this->_depthwise_weight.size();
for (int c = 0; c < channels; c++)
{
for (size_t k = 0; k < kernel_size; k++)
{
this->_depthwise_weight[k](c) = *(weights++);
}
}
}
else if (this->_weight.size() > 0)
{
const long out_channels = this->_weight[0].rows();
const long in_channels = this->_weight[0].cols();
@@ -53,21 +67,46 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
}

this->_num_groups = groups;
this->_dilation = _dilation;

// Check for depthwise convolution: groups == in_channels == out_channels
// In this case, each channel is processed independently with a single weight per kernel tap,
// so we can use efficient element-wise multiplication instead of matrix multiplication.
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);

if (this->_is_depthwise)
{
// Depthwise: store one weight vector per kernel tap
this->_channels = in_channels;
this->_depthwise_weight.resize(kernel_size);
for (int i = 0; i < kernel_size; i++)
{
this->_depthwise_weight[i].resize(in_channels);
this->_depthwise_weight[i].setZero();
}
this->_weight.clear(); // Not used for depthwise
}
else
{
// Non-depthwise: store full weight matrices (block-diagonal for grouped convolutions)
this->_weight.resize(kernel_size);
for (int i = 0; i < kernel_size; i++)
{
this->_weight[i].resize(out_channels,
in_channels); // y = Ax, input array (C,L)
this->_weight[i].setZero();
}
this->_depthwise_weight.clear(); // Not used for non-depthwise
this->_channels = 0;
}

if (do_bias)
{
this->_bias.resize(out_channels);
this->_bias.setZero();
}
else
this->_bias.resize(0);
}
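For reference, a minimal standalone sketch (not part of the PR; channel count, kernel size, and values are illustrative) of how set_weights_ consumes the flat weight vector in the depthwise case, with channels outermost and kernel taps innermost:

#include <Eigen/Dense>
#include <vector>

int main()
{
  // Assumed layout from set_weights_ above: for each channel c, all taps k.
  const int channels = 2, kernel_size = 3;
  std::vector<float> flat = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
  std::vector<Eigen::VectorXf> depthwise_weight(kernel_size, Eigen::VectorXf(channels));
  auto it = flat.begin();
  for (int c = 0; c < channels; c++)
    for (int k = 0; k < kernel_size; k++)
      depthwise_weight[k](c) = *(it++); // mirrors _depthwise_weight[k](c)
  // Tap 0 now holds (0.1, 0.4): the first weight of each channel.
  return 0;
}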

void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
Expand Down Expand Up @@ -114,18 +153,37 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
// After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
// For kernel tap k with offset, we need to read from _write_pos + offset
// The offset is negative (looking back), so _write_pos + offset reads from earlier positions

if (this->_is_depthwise)
{
// Depthwise convolution: use efficient element-wise multiplication
// Each channel is processed independently with a single weight per kernel tap.
// output[c, t] = sum_k(weight[k, c] * input[c, t - k*dilation])
const size_t kernel_size = this->_depthwise_weight.size();
for (size_t k = 0; k < kernel_size; k++)
{
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
// Element-wise multiply: each row of input_block is multiplied by corresponding weight
_output.leftCols(num_frames).noalias() +=
this->_depthwise_weight[k].asDiagonal() * input_block.leftCols(num_frames);
}
}
else
{
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
// so we can use a single GEMM for all cases. A more advanced implementation could store
// compact per-group weight matrices and loop over groups, but at typical model sizes
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
// and the single sparse GEMM approach is faster.
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
}
}

// Add bias if present
@@ -141,29 +199,73 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
const long j_start) const
{
if (this->_is_depthwise)
{
// Depthwise convolution: use efficient element-wise multiplication
const size_t kernel_size = this->_depthwise_weight.size();
for (size_t k = 0; k < kernel_size; k++)
{
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
if (k == 0)
output.middleCols(j_start, ncols).noalias() =
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() +=
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
}
}
else
{
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
// so we can use a single GEMM for all cases. A more advanced implementation could store
// compact per-group weight matrices and loop over groups, but at typical model sizes
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
// and the single sparse GEMM approach is faster.
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
}
}
if (this->_bias.size() > 0)
{
output.middleCols(j_start, ncols).colwise() += this->_bias;
}
}
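To make the offset arithmetic above concrete, here is a small sketch (hypothetical values: kernel_size = 3, dilation = 2) printing the per-tap offsets and lookbacks:

#include <cstdio>

int main()
{
  const long kernel_size = 3, dilation = 2;
  for (long k = 0; k < kernel_size; k++)
  {
    const long offset = dilation * (k + 1 - kernel_size); // same formula as above
    std::printf("k=%ld offset=%ld lookback=%ld\n", k, offset, -offset);
  }
  // Prints offsets -4, -2, 0: the last tap reads the current frame, and
  // earlier taps look further back, spaced by the dilation.
  return 0;
}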

long Conv1D::get_in_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.size() > 0 ? this->_weight[0].cols() : 0;
}

long Conv1D::get_out_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.size() > 0 ? this->_weight[0].rows() : 0;
}

long Conv1D::get_kernel_size() const
{
if (this->_is_depthwise)
return this->_depthwise_weight.size();
return this->_weight.size();
}

long Conv1D::get_num_weights() const
{
long num_weights = this->_bias.size();
if (this->_weight.size() > 0)
if (this->_is_depthwise)
{
// Depthwise: one weight per channel per kernel tap
num_weights += this->_channels * this->_depthwise_weight.size();
}
else if (this->_weight.size() > 0)
{
const long out_channels = this->_weight[0].rows();
const long in_channels = this->_weight[0].cols();
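A quick spot-check of the weight counting under assumed sizes (8 channels, kernel size 3, bias enabled; all numbers illustrative): depthwise stores channels * kernel_size weights, while the dense layout stores a full out x in matrix per tap.

#include <cassert>

int main()
{
  const long channels = 8, kernel_size = 3, bias = channels;
  const long depthwise = channels * kernel_size + bias;        // 8*3 + 8 = 32
  const long dense = channels * channels * kernel_size + bias; // 8*8*3 + 8 = 200
  assert(depthwise == 32 && dense == 200);
  return 0;
}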
13 changes: 9 additions & 4 deletions NAM/conv1d.h
@@ -95,19 +95,19 @@ class Conv1D
const long j_start) const;
/// \brief Get the number of input channels
/// \return Number of input channels
long get_in_channels() const;

/// \brief Get the kernel size
/// \return Kernel size
long get_kernel_size() const;

/// \brief Get the total number of weights
/// \return Total number of weight parameters
long get_num_weights() const;

/// \brief Get the number of output channels
/// \return Number of output channels
long get_out_channels() const;

/// \brief Get the dilation factor
/// \return Dilation factor
@@ -118,8 +118,13 @@
bool has_bias() const { return this->_bias.size() > 0; };

protected:
// conv[kernel](cout, cin) - used for non-depthwise convolutions
std::vector<Eigen::MatrixXf> _weight;
// For depthwise convolution (groups == in_channels == out_channels):
// stores one weight per channel per kernel tap
std::vector<Eigen::VectorXf> _depthwise_weight;
bool _is_depthwise = false;
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
Eigen::VectorXf _bias;
int _dilation;
int _num_groups;
77 changes: 70 additions & 7 deletions NAM/dsp.cpp
@@ -331,9 +331,30 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
}

this->_num_groups = groups;
this->_do_bias = _bias;

// Check for depthwise convolution: groups == in_channels == out_channels
// In this case, each channel is processed independently with a single weight,
// so we can use efficient element-wise multiplication instead of matrix multiplication.
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);

if (this->_is_depthwise)
{
// Depthwise: store one weight per channel
this->_channels = in_channels;
this->_depthwise_weight.resize(in_channels);
this->_depthwise_weight.setZero();
// Clear the matrix weight (not used)
this->_weight.resize(0, 0);
}
else
{
// Non-depthwise: store full weight matrix (block-diagonal for grouped convolutions)
this->_weight.resize(out_channels, in_channels);
this->_weight.setZero();
this->_channels = 0;
}

if (_bias)
{
this->_bias.resize(out_channels);
@@ -349,7 +370,15 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)

void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
{
if (this->_is_depthwise)
{
// Depthwise convolution: one weight per channel
for (int c = 0; c < this->_channels; c++)
{
this->_depthwise_weight(c) = *(weights++);
}
}
else if (this->_weight.size() > 0)
{
const long out_channels = this->_weight.rows();
const long in_channels = this->_weight.cols();
@@ -376,10 +405,35 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
this->_bias(i) = *(weights++);
}

long nam::Conv1x1::get_out_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.rows();
}

long nam::Conv1x1::get_in_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.cols();
}

Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
{
Eigen::MatrixXf result(get_out_channels(), num_frames);

if (this->_is_depthwise)
{
// Depthwise convolution: efficient element-wise multiplication
// Each channel is scaled by its corresponding weight
result.noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
}
else
{
// Single GEMM for all cases - block-diagonal zero structure handles grouping
result.noalias() = this->_weight * input.leftCols(num_frames);
}

if (this->_do_bias)
result.colwise() += this->_bias;
Expand All @@ -391,8 +445,17 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
{
assert(num_frames <= _output.cols());

if (this->_is_depthwise)
{
// Depthwise convolution: efficient element-wise multiplication
// Each channel is scaled by its corresponding weight
_output.leftCols(num_frames).noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
}
else
{
// Single GEMM for all cases - block-diagonal zero structure handles grouping
_output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
}

if (this->_do_bias)
_output.leftCols(num_frames).colwise() += this->_bias;
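A self-contained check (shapes assumed, not the library's API) that the depthwise fast path is numerically equivalent to the dense GEMM it replaces: for groups == in_channels == out_channels the weight matrix is diagonal, so w.asDiagonal() * X matches W * X with W the densified diagonal.

#include <Eigen/Dense>
#include <iostream>

int main()
{
  const int channels = 4, frames = 5;
  Eigen::VectorXf w = Eigen::VectorXf::Random(channels);
  Eigen::MatrixXf X = Eigen::MatrixXf::Random(channels, frames);

  Eigen::MatrixXf dense = Eigen::MatrixXf(w.asDiagonal()) * X; // old GEMM path
  Eigen::MatrixXf fast = w.asDiagonal() * X;                   // new depthwise path

  std::cout << "max abs diff: " << (dense - fast).cwiseAbs().maxCoeff() << "\n"; // ~0
  return 0;
}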
10 changes: 8 additions & 2 deletions NAM/dsp.h
@@ -323,11 +323,17 @@ class Conv1x1
/// \param num_frames Number of frames to process
void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);

long get_out_channels() const;
long get_in_channels() const;

protected:
// Non-depthwise: full weight matrix (out_channels x in_channels)
Eigen::MatrixXf _weight;
// For depthwise convolution (groups == in_channels == out_channels):
// stores one weight per channel
Eigen::VectorXf _depthwise_weight;
bool _is_depthwise = false;
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
Eigen::VectorXf _bias;
int _num_groups;

Expand Down