Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 19 additions & 91 deletions NAM/conv1d.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,10 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int

this->_num_groups = groups;
this->_weight.resize(kernel_size);
// Initialize weight matrices to zero - critical for block-diagonal structure
// Off-diagonal blocks must be zero for single-matmul grouped convolution
for (size_t i = 0; i < this->_weight.size(); i++)
this->_weight[i].resize(out_channels,
in_channels); // y = Ax, input array (C,L)
this->_weight[i].setZero(out_channels, in_channels);
if (do_bias)
this->_bias.resize(out_channels);
else
Expand Down Expand Up @@ -104,54 +105,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
// Zero output before processing
_output.leftCols(num_frames).setZero();

const int numGroups = this->_num_groups;
const long in_channels = get_in_channels();
const long out_channels = get_out_channels();
const long in_per_group = in_channels / numGroups;
const long out_per_group = out_channels / numGroups;

// Process from ring buffer with dilation lookback
// After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
// For kernel tap k with offset, we need to read from _write_pos + offset
// The offset is negative (looking back), so _write_pos + offset reads from earlier positions
// The original process_() reads: input.middleCols(i_start + offset, ncols)
// where i_start is the current position and offset is negative for lookback

if (numGroups == 1)
{
// Standard convolution (no grouping)
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
}
}
else
// Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs.
// Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result
// as G separate matmuls. This is more efficient because BLAS can optimize larger operations.
for (size_t k = 0; k < this->_weight.size(); k++)
{
// Grouped convolution: process each group separately
for (int g = 0; g < numGroups; g++)
{
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);

// Extract input slice for this group
auto input_group = input_block.middleRows(g * in_per_group, in_per_group);

// Extract weight slice for this group
auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);

// Extract output slice for this group
auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group);

// Perform grouped convolution: output_group += weight_group * input_group
output_group.noalias() += weight_group * input_group;
}
}
const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
const long lookback = -offset;
auto input_block = _input_buffer.Read(num_frames, lookback);
_output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
}

// Add bias if present
Expand All @@ -167,49 +129,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
const long j_start) const
{
const int numGroups = this->_num_groups;
const long in_channels = get_in_channels();
const long out_channels = get_out_channels();
const long in_per_group = in_channels / numGroups;
const long out_per_group = out_channels / numGroups;

if (numGroups == 1)
// Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs.
// Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result.
for (size_t k = 0; k < this->_weight.size(); k++)
{
// Standard convolution (no grouping)
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
}
}
else
{
// Grouped convolution: process each group separately
for (int g = 0; g < numGroups; g++)
{
for (size_t k = 0; k < this->_weight.size(); k++)
{
const long offset = this->_dilation * (k + 1 - this->_weight.size());

// Extract input slice for this group
auto input_group = input.middleCols(i_start + offset, ncols).middleRows(g * in_per_group, in_per_group);

// Extract weight slice for this group
auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);

// Extract output slice for this group
auto output_group = output.middleCols(j_start, ncols).middleRows(g * out_per_group, out_per_group);

// Perform grouped convolution
if (k == 0)
output_group.noalias() = weight_group * input_group;
else
output_group.noalias() += weight_group * input_group;
}
}
const long offset = this->_dilation * (k + 1 - this->_weight.size());
if (k == 0)
output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
else
output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
}
if (this->_bias.size() > 0)
{
Expand Down
53 changes: 53 additions & 0 deletions NAM/conv1d_factory.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Conv1D Factory implementation
// Returns dynamic Conv1D wrapped in IConv1D interface

#include "conv1d_factory.h"
#include "conv1d.h"

namespace nam
{

/// \brief Dynamic wrapper for Conv1D implementing IConv1D interface
///
/// This class wraps the existing Conv1D implementation to provide the IConv1D
/// interface for configurations that don't have specialized template instantiations.
class Conv1DDynamicWrapper : public IConv1D
{
public:
Conv1DDynamicWrapper(int in_channels, int out_channels, int kernel_size, int dilation, bool bias, int groups)
{
_conv.set_size_(in_channels, out_channels, kernel_size, bias, dilation, groups);
}

Eigen::MatrixXf& GetOutput() override { return _conv.GetOutput(); }

const Eigen::MatrixXf& GetOutput() const override { return _conv.GetOutput(); }

void SetMaxBufferSize(int maxBufferSize) override { _conv.SetMaxBufferSize(maxBufferSize); }

void set_weights_(std::vector<float>::iterator& weights) override { _conv.set_weights_(weights); }

void Process(const Eigen::MatrixXf& input, int num_frames) override { _conv.Process(input, num_frames); }

long get_out_channels() const override { return _conv.get_out_channels(); }

long get_in_channels() const override { return _conv.get_in_channels(); }

long get_kernel_size() const override { return _conv.get_kernel_size(); }

int get_dilation() const override { return _conv.get_dilation(); }

bool has_bias() const override { return _conv.has_bias(); }

private:
Conv1D _conv;
};

// Factory implementation.
// Always hands back the runtime-sized implementation; callers that need the
// compile-time-optimized variants (e.g. Conv1DFullyFixed) instantiate them
// directly instead of going through this factory.
std::unique_ptr<IConv1D> Conv1DFactory::create(int in_channels, int out_channels, int kernel_size, int dilation,
                                               bool bias, int groups)
{
  // Parameters are forwarded verbatim; the wrapper adapts them to Conv1D.
  return std::make_unique<Conv1DDynamicWrapper>(in_channels, out_channels, kernel_size, dilation, bias, groups);
}

} // namespace nam
33 changes: 33 additions & 0 deletions NAM/conv1d_factory.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#pragma once

#include <memory>
#include "conv1d_fixed.h"

namespace nam
{

/// \brief Factory for creating Conv1D implementations.
///
/// Returns a dynamic (runtime-configured) Conv1D implementation wrapped in the
/// IConv1D interface. For fully optimized implementations with compile-time
/// known buffer sizes, use Conv1DFullyFixed directly.
class Conv1DFactory
{
public:
  /// \brief Create a Conv1D implementation.
  ///
  /// Returns a dynamic implementation. For maximum performance with known
  /// buffer sizes, use the Conv1DFullyFixed template directly instead of this
  /// factory.
  ///
  /// \param in_channels Number of input channels
  /// \param out_channels Number of output channels
  /// \param kernel_size Size of the convolution kernel
  /// \param dilation Dilation factor for the convolution
  /// \param bias Whether to use bias
  /// \param groups Number of groups for grouped convolution (default: 1)
  /// \return Owning unique pointer to a heap-allocated IConv1D implementation
  static std::unique_ptr<IConv1D> create(int in_channels, int out_channels, int kernel_size, int dilation, bool bias,
                                         int groups = 1);
};

} // namespace nam
Loading