162 changes: 132 additions & 30 deletions NAM/conv1d.cpp
@@ -7,7 +7,21 @@ namespace nam

void Conv1D::set_weights_(std::vector<float>::iterator& weights)
{
if (this->_is_depthwise)
{
// Depthwise convolution: one weight per channel per kernel tap
// Weight layout: for each channel c, for each kernel position k
const int channels = this->_channels;
const size_t kernel_size = this->_depthwise_weight.size();
for (int c = 0; c < channels; c++)
{
for (size_t k = 0; k < kernel_size; k++)
{
this->_depthwise_weight[k](c) = *(weights++);
}
}
}
else if (this->_weight.size() > 0)
{
const long out_channels = this->_weight[0].rows();
const long in_channels = this->_weight[0].cols();
@@ -53,21 +67,46 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
}

this->_num_groups = groups;
this->_dilation = _dilation;

// Check for depthwise convolution: groups == in_channels == out_channels
// In this case, each channel is processed independently with a single weight per kernel tap,
// so we can use efficient element-wise multiplication instead of matrix multiplication.
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);

if (this->_is_depthwise)
{
// Depthwise: store one weight vector per kernel tap
this->_channels = in_channels;
this->_depthwise_weight.resize(kernel_size);
for (int i = 0; i < kernel_size; i++)
{
this->_depthwise_weight[i].resize(in_channels);
this->_depthwise_weight[i].setZero();
}
this->_weight.clear(); // Not used for depthwise
}
else
{
// Non-depthwise: store full weight matrices (block-diagonal for grouped convolutions)
this->_weight.resize(kernel_size);
for (int i = 0; i < kernel_size; i++)
{
this->_weight[i].resize(out_channels,
in_channels); // y = Ax, input array (C,L)
this->_weight[i].setZero();
}
this->_depthwise_weight.clear(); // Not used for non-depthwise
this->_channels = 0;
}

if (do_bias)
{
this->_bias.resize(out_channels);
this->_bias.setZero();
}
else
this->_bias.resize(0);
}
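For reference, a minimal standalone sketch (not part of the PR; channel count, kernel size, and values are illustrative) of how set_weights_ consumes the flat weight vector in the depthwise case, with channels outermost and kernel taps innermost:

#include <Eigen/Dense>
#include <vector>

int main()
{
  // Assumed layout from set_weights_ above: for each channel c, all taps k.
  const int channels = 2, kernel_size = 3;
  std::vector<float> flat = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f};
  std::vector<Eigen::VectorXf> depthwise_weight(kernel_size, Eigen::VectorXf(channels));
  auto it = flat.begin();
  for (int c = 0; c < channels; c++)
    for (int k = 0; k < kernel_size; k++)
      depthwise_weight[k](c) = *(it++); // mirrors _depthwise_weight[k](c)
  // Tap 0 now holds (0.1, 0.4): the first weight of each channel.
  return 0;
}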

void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
Expand Down Expand Up @@ -114,18 +153,37 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
// After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
// For kernel tap k with offset, we need to read from _write_pos + offset
// The offset is negative (looking back), so _write_pos + offset reads from earlier positions

if (this->_is_depthwise)
{
// Depthwise convolution: use efficient element-wise multiplication
// Each channel is processed independently with a single weight per kernel tap.
// output[c, t] = sum_k(weight[k, c] * input[c, t - k*dilation])
const size_t kernel_size = this->_depthwise_weight.size();
for (size_t k = 0; k < kernel_size; k++)
{
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
// Element-wise multiply: each row of input_block is multiplied by corresponding weight
_output.leftCols(num_frames).noalias() +=
this->_depthwise_weight[k].asDiagonal() * input_block.leftCols(num_frames);
}
}
else
{
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
// so we can use a single GEMM for all cases. A more advanced implementation could store
// compact per-group weight matrices and loop over groups, but at typical model sizes
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
// and the single sparse GEMM approach is faster.
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
}
}

// Add bias if present
@@ -141,29 +199,73 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
const long j_start) const
{
if (this->_is_depthwise)
{
// Depthwise convolution: use efficient element-wise multiplication
const size_t kernel_size = this->_depthwise_weight.size();
for (size_t k = 0; k < kernel_size; k++)
{
const long offset = this->_dilation * (k + 1 - (long)kernel_size);
if (k == 0)
output.middleCols(j_start, ncols).noalias() =
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() +=
this->_depthwise_weight[k].asDiagonal() * input.middleCols(i_start + offset, ncols);
}
}
else
{
// Grouped convolution note: The weight matrices are block-diagonal (zeros off-diagonal),
// so we can use a single GEMM for all cases. A more advanced implementation could store
// compact per-group weight matrices and loop over groups, but at typical model sizes
// (e.g. 8 channels, 4 groups, 64 samples), the GEMM call overhead tends to dominate
// and the single sparse GEMM approach is faster.
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
}
}
if (this->_bias.size() > 0)
{
output.middleCols(j_start, ncols).colwise() += this->_bias;
}
}
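To make the offset arithmetic above concrete, here is a small sketch (hypothetical values: kernel_size = 3, dilation = 2) printing the per-tap offsets and lookbacks:

#include <cstdio>

int main()
{
  const long kernel_size = 3, dilation = 2;
  for (long k = 0; k < kernel_size; k++)
  {
    const long offset = dilation * (k + 1 - kernel_size); // same formula as above
    std::printf("k=%ld offset=%ld lookback=%ld\n", k, offset, -offset);
  }
  // Prints offsets -4, -2, 0: the last tap reads the current frame, and
  // earlier taps look further back, spaced by the dilation.
  return 0;
}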

long Conv1D::get_in_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.size() > 0 ? this->_weight[0].cols() : 0;
}

long Conv1D::get_out_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.size() > 0 ? this->_weight[0].rows() : 0;
}

long Conv1D::get_kernel_size() const
{
if (this->_is_depthwise)
return this->_depthwise_weight.size();
return this->_weight.size();
}

long Conv1D::get_num_weights() const
{
long num_weights = this->_bias.size();
if (this->_weight.size() > 0)
if (this->_is_depthwise)
{
// Depthwise: one weight per channel per kernel tap
num_weights += this->_channels * this->_depthwise_weight.size();
}
else if (this->_weight.size() > 0)
{
const long out_channels = this->_weight[0].rows();
const long in_channels = this->_weight[0].cols();
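A quick spot-check of the weight counting under assumed sizes (8 channels, kernel size 3, bias enabled; all numbers illustrative): depthwise stores channels * kernel_size weights, while the dense layout stores a full out x in matrix per tap.

#include <cassert>

int main()
{
  const long channels = 8, kernel_size = 3, bias = channels;
  const long depthwise = channels * kernel_size + bias;        // 8*3 + 8 = 32
  const long dense = channels * channels * kernel_size + bias; // 8*8*3 + 8 = 200
  assert(depthwise == 32 && dense == 200);
  return 0;
}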
13 changes: 9 additions & 4 deletions NAM/conv1d.h
@@ -95,19 +95,19 @@ class Conv1D
const long j_start) const;
/// \brief Get the number of input channels
/// \return Number of input channels
long get_in_channels() const;

/// \brief Get the kernel size
/// \return Kernel size
long get_kernel_size() const;

/// \brief Get the total number of weights
/// \return Total number of weight parameters
long get_num_weights() const;

/// \brief Get the number of output channels
/// \return Number of output channels
long get_out_channels() const;

/// \brief Get the dilation factor
/// \return Dilation factor
@@ -118,8 +118,13 @@
bool has_bias() const { return this->_bias.size() > 0; };

protected:
// conv[kernel](cout, cin) - used for non-depthwise convolutions
std::vector<Eigen::MatrixXf> _weight;
// For depthwise convolution (groups == in_channels == out_channels):
// stores one weight per channel per kernel tap
std::vector<Eigen::VectorXf> _depthwise_weight;
bool _is_depthwise = false;
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
Eigen::VectorXf _bias;
int _dilation;
int _num_groups;
77 changes: 70 additions & 7 deletions NAM/dsp.cpp
@@ -331,9 +331,30 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
}

this->_num_groups = groups;
this->_do_bias = _bias;

// Check for depthwise convolution: groups == in_channels == out_channels
// In this case, each channel is processed independently with a single weight,
// so we can use efficient element-wise multiplication instead of matrix multiplication.
this->_is_depthwise = (groups == in_channels && in_channels == out_channels);

if (this->_is_depthwise)
{
// Depthwise: store one weight per channel
this->_channels = in_channels;
this->_depthwise_weight.resize(in_channels);
this->_depthwise_weight.setZero();
// Clear the matrix weight (not used)
this->_weight.resize(0, 0);
}
else
{
// Non-depthwise: store full weight matrix (block-diagonal for grouped convolutions)
this->_weight.resize(out_channels, in_channels);
this->_weight.setZero();
this->_channels = 0;
}

if (_bias)
{
this->_bias.resize(out_channels);
@@ -349,7 +370,15 @@ void nam::Conv1x1::SetMaxBufferSize(const int maxBufferSize)

void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
{
if (this->_is_depthwise)
{
// Depthwise convolution: one weight per channel
for (int c = 0; c < this->_channels; c++)
{
this->_depthwise_weight(c) = *(weights++);
}
}
else if (this->_weight.size() > 0)
{
const long out_channels = this->_weight.rows();
const long in_channels = this->_weight.cols();
@@ -376,10 +405,35 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
this->_bias(i) = *(weights++);
}

long nam::Conv1x1::get_out_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.rows();
}

long nam::Conv1x1::get_in_channels() const
{
if (this->_is_depthwise)
return this->_channels;
return this->_weight.cols();
}

Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
{
Eigen::MatrixXf result(get_out_channels(), num_frames);

if (this->_is_depthwise)
{
// Depthwise convolution: efficient element-wise multiplication
// Each channel is scaled by its corresponding weight
result.noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
}
else
{
// Single GEMM for all cases - block-diagonal zero structure handles grouping
result.noalias() = this->_weight * input.leftCols(num_frames);
}

if (this->_do_bias)
result.colwise() += this->_bias;
Expand All @@ -391,8 +445,17 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
{
assert(num_frames <= _output.cols());

if (this->_is_depthwise)
{
// Depthwise convolution: efficient element-wise multiplication
// Each channel is scaled by its corresponding weight
_output.leftCols(num_frames).noalias() = this->_depthwise_weight.asDiagonal() * input.leftCols(num_frames);
}
else
{
// Single GEMM for all cases - block-diagonal zero structure handles grouping
_output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
}

if (this->_do_bias)
_output.leftCols(num_frames).colwise() += this->_bias;
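A self-contained check (shapes assumed, not the library's API) that the depthwise fast path is numerically equivalent to the dense GEMM it replaces: for groups == in_channels == out_channels the weight matrix is diagonal, so w.asDiagonal() * X matches W * X with W the densified diagonal.

#include <Eigen/Dense>
#include <iostream>

int main()
{
  const int channels = 4, frames = 5;
  Eigen::VectorXf w = Eigen::VectorXf::Random(channels);
  Eigen::MatrixXf X = Eigen::MatrixXf::Random(channels, frames);

  Eigen::MatrixXf dense = Eigen::MatrixXf(w.asDiagonal()) * X; // old GEMM path
  Eigen::MatrixXf fast = w.asDiagonal() * X;                   // new depthwise path

  std::cout << "max abs diff: " << (dense - fast).cwiseAbs().maxCoeff() << "\n"; // ~0
  return 0;
}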
10 changes: 8 additions & 2 deletions NAM/dsp.h
@@ -323,11 +323,17 @@ class Conv1x1
/// \param num_frames Number of frames to process
void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames);

long get_out_channels() const;
long get_in_channels() const;

protected:
// Non-depthwise: full weight matrix (out_channels x in_channels)
Eigen::MatrixXf _weight;
// For depthwise convolution (groups == in_channels == out_channels):
// stores one weight per channel
Eigen::VectorXf _depthwise_weight;
bool _is_depthwise = false;
int _channels = 0; // Used for depthwise case (in_channels == out_channels)
Eigen::VectorXf _bias;
int _num_groups;

Expand Down