From 48f4e15f4326dc22c6ca5b16d5a16c5497bceece Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Wed, 28 Jan 2026 12:49:51 -0800
Subject: [PATCH 1/4] Precompute group block indices for grouped convolutions
 and add a benchmarking tool and real-time-safety tests for convolutions.

---
 NAM/conv1d.cpp                      |  40 +++--
 NAM/conv1d.h                        |  14 ++
 NAM/dsp.cpp                         |  37 ++--
 NAM/dsp.h                           |  13 ++
 tools/CMakeLists.txt                |  49 +++---
 tools/benchmark_convolution.cpp     | 208 ++++++++++++++++++++++
 tools/plot_convolution_benchmark.py | 258 ++++++++++++++++++++++++++++
 tools/run_tests.cpp                 |   5 +
 tools/test/test_conv1d.cpp          | 162 +++++++++++++++++
 tools/test/test_conv_1x1.cpp        |  95 ++++++++++
 10 files changed, 827 insertions(+), 54 deletions(-)
 create mode 100644 tools/benchmark_convolution.cpp
 create mode 100644 tools/plot_convolution_benchmark.py

diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index 6e1835b..fda50aa 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -62,6 +62,18 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
   else
     this->_bias.resize(0);
   this->_dilation = _dilation;
+
+  // Pre-compute group block indices for efficient runtime access
+  const long out_per_group = out_channels / groups;
+  const long in_per_group = in_channels / groups;
+  this->_group_blocks.resize(groups);
+  for (int g = 0; g < groups; g++)
+  {
+    this->_group_blocks[g].out_start = g * out_per_group;
+    this->_group_blocks[g].in_start = g * in_per_group;
+    this->_group_blocks[g].out_size = out_per_group;
+    this->_group_blocks[g].in_size = in_per_group;
+  }
 }
 
 void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
@@ -105,10 +117,6 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
   _output.leftCols(num_frames).setZero();
 
   const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
-  const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
 
   // Process from ring buffer with dilation lookback
   // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
@@ -130,9 +138,11 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
   }
   else
   {
-    // Grouped convolution: process each group separately
+    // Grouped convolution: process each group separately using pre-computed block indices
     for (int g = 0; g < numGroups; g++)
     {
+      const auto& block = this->_group_blocks[g];
+
       for (size_t k = 0; k < this->_weight.size(); k++)
       {
         const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
@@ -140,13 +150,13 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
         auto input_block = _input_buffer.Read(num_frames, lookback);
 
         // Extract input slice for this group
-        auto input_group = input_block.middleRows(g * in_per_group, in_per_group);
+        auto input_group = input_block.middleRows(block.in_start, block.in_size);
 
         // Extract weight slice for this group
-        auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
+        auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size);
 
         // Extract output slice for this group
-        auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group);
+        auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size);
 
         // Perform grouped convolution: output_group += weight_group * input_group
         output_group.noalias() += weight_group * input_group;
@@ -168,10 +178,6 @@ void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, con
                       const long j_start) const
 {
   const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
-  const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
 
   if (numGroups == 1)
   {
@@ -187,21 +193,23 @@ void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, con
   }
   else
   {
-    // Grouped convolution: process each group separately
+    // Grouped convolution: process each group separately using pre-computed block indices
     for (int g = 0; g < numGroups; g++)
     {
+      const auto& block = this->_group_blocks[g];
+
       for (size_t k = 0; k < this->_weight.size(); k++)
       {
         const long offset = this->_dilation * (k + 1 - this->_weight.size());
 
         // Extract input slice for this group
-        auto input_group = input.middleCols(i_start + offset, ncols).middleRows(g * in_per_group, in_per_group);
+        auto input_group = input.middleCols(i_start + offset, ncols).middleRows(block.in_start, block.in_size);
 
         // Extract weight slice for this group
-        auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
+        auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size);
 
         // Extract output slice for this group
-        auto output_group = output.middleCols(j_start, ncols).middleRows(g * out_per_group, out_per_group);
+        auto output_group = output.middleCols(j_start, ncols).middleRows(block.out_start, block.out_size);
 
         // Perform grouped convolution
         if (k == 0)
diff --git a/NAM/conv1d.h b/NAM/conv1d.h
index 8182966..0bf64a7 100644
--- a/NAM/conv1d.h
+++ b/NAM/conv1d.h
@@ -6,6 +6,19 @@
 
 namespace nam
 {
+
+/// \brief Pre-computed group block indices for grouped convolutions
+///
+/// Stores the indices for extracting input/output slices for each group,
+/// avoiding repeated computation during real-time processing.
+struct Conv1DGroupBlock
+{
+  long out_start = 0; ///< First output row of the group (also the row offset into each weight matrix)
+  long in_start = 0; ///< First input row of the group (also the column offset into each weight matrix)
+  long out_size = 0; ///< Number of output channels per group
+  long in_size = 0; ///< Number of input channels per group
+};
+
 /// \brief 1D dilated convolution layer
 ///
 /// Implements a 1D convolution with support for dilation and grouped convolution.
@@ -123,6 +136,7 @@ class Conv1D
   Eigen::VectorXf _bias;
   int _dilation;
   int _num_groups;
+  std::vector<Conv1DGroupBlock> _group_blocks; ///< Pre-computed group block indices
 
 private:
   RingBuffer _input_buffer; // Ring buffer for input (channels x buffer_size)
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index 02a4a13..5eb567e 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -353,6 +353,16 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
     const long out_per_group = out_channels / numGroups;
     const long in_per_group = in_channels / numGroups;
 
+    // Pre-compute group block indices for efficient runtime access
+    this->_group_blocks.resize(numGroups);
+    for (int g = 0; g < numGroups; g++)
+    {
+      this->_group_blocks[g].out_start = g * out_per_group;
+      this->_group_blocks[g].in_start = g * in_per_group;
+      this->_group_blocks[g].out_size = out_per_group;
+      this->_group_blocks[g].in_size = in_per_group;
+    }
+
     // For grouped convolutions, weights are organized per group
     // Weight layout: weights are [group0, group1, ..., groupN-1]
     // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups)
@@ -375,10 +385,7 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
 Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
 {
   const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
   const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
 
   Eigen::MatrixXf result(out_channels, num_frames);
 
@@ -392,18 +399,20 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu
   }
   else
   {
-    // Grouped convolution: process each group separately
+    // Grouped convolution: process each group separately using pre-computed block indices
     result.setZero();
     for (int g = 0; g < numGroups; g++)
     {
+      const auto& block = this->_group_blocks[g];
+
       // Extract input slice for this group
-      auto input_group = input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
+      auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size);
 
       // Extract weight slice for this group
-      auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
+      auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size);
 
       // Extract output slice for this group
-      auto output_group = result.middleRows(g * out_per_group, out_per_group);
+      auto output_group = result.middleRows(block.out_start, block.out_size);
 
       // Perform grouped convolution: output_group = weight_group * input_group
       output_group.noalias() = weight_group * input_group;
@@ -422,10 +431,6 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
   assert(num_frames <= _output.cols());
 
   const int numGroups = this->_num_groups;
-  const long in_channels = get_in_channels();
-  const long out_channels = get_out_channels();
-  const long in_per_group = in_channels / numGroups;
-  const long out_per_group = out_channels / numGroups;
 
   if (numGroups == 1)
   {
@@ -434,18 +439,20 @@ void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, cons
   }
   else
   {
-    // Grouped convolution: process each group separately
+    // Grouped convolution: process each group separately using pre-computed block indices
     _output.leftCols(num_frames).setZero();
     for (int g = 0; g < numGroups; g++)
     {
+      const auto& block = this->_group_blocks[g];
+
       // Extract input slice for this group
-      auto input_group = input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group);
+      auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size);
 
       // Extract weight slice for this group
-      auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
+      auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size);
 
       // Extract output slice for this group
-      auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group);
+      auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size);
 
       // Perform grouped convolution: output_group = weight_group * input_group
       output_group.noalias() = weight_group * input_group;
diff --git a/NAM/dsp.h b/NAM/dsp.h
index 8b984d2..f7763c2 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -269,6 +269,18 @@ std::unique_ptr<DSP> Factory(const nlohmann::json& config, std::vector<float>& w
 
 // NN modules =================================================================
 
+/// \brief Pre-computed group block indices for grouped convolutions
+///
+/// Stores the indices for extracting input/output slices for each group,
+/// avoiding repeated computation during real-time processing.
+struct GroupBlock
+{
+  long out_start = 0; ///< First output row of the group (also the row offset into the weight matrix)
+  long in_start = 0; ///< First input row of the group (also the column offset into the weight matrix)
+  long out_size = 0; ///< Number of output channels per group
+  long in_size = 0; ///< Number of input channels per group
+};
+
 /// \brief 1x1 convolution (really just a fully-connected linear layer operating per-sample)
 ///
 /// Performs a pointwise convolution, which is equivalent to a fully connected layer
@@ -330,6 +342,7 @@ class Conv1x1
   Eigen::MatrixXf _weight;
   Eigen::VectorXf _bias;
   int _num_groups;
+  std::vector<GroupBlock> _group_blocks; ///< Pre-computed group block indices
 
 private:
   Eigen::MatrixXf _output;
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 8118e08..22e4db6 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,7 +1,7 @@
 file(GLOB_RECURSE NAM_SOURCES ../NAM/*.cpp ../NAM/*.c ../NAM*.h)
 
 # TODO: add loadmodel and run_tests to TOOLS?
-set(TOOLS benchmodel)
+set(TOOLS benchmodel benchmark_convolution)
 
 add_custom_target(tools ALL
 	DEPENDS ${TOOLS})
@@ -12,6 +12,7 @@ include_directories(tools ${NAM_DEPS_PATH}/nlohmann)
 
 add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES})
 add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES})
+add_executable(benchmark_convolution benchmark_convolution.cpp ${NAM_SOURCES})
 add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES})
 # Compile run_tests without optimizations to ensure allocation tracking works correctly
 # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run
@@ -31,31 +32,33 @@ endif()
 
 source_group(NAM ${CMAKE_CURRENT_SOURCE_DIR} FILES ${NAM_SOURCES})
 
-target_compile_features(${TOOLS} PUBLIC cxx_std_20)
+foreach(TOOL ${TOOLS})
+	target_compile_features(${TOOL} PUBLIC cxx_std_20)
 
-set_target_properties(${TOOLS}
-	PROPERTIES
-	CXX_VISIBILITY_PRESET hidden
-	INTERPROCEDURAL_OPTIMIZATION TRUE
-	PREFIX ""
-)
+	set_target_properties(${TOOL}
+		PROPERTIES
+		CXX_VISIBILITY_PRESET hidden
+		INTERPROCEDURAL_OPTIMIZATION TRUE
+		PREFIX ""
+	)
 
-if (CMAKE_SYSTEM_NAME STREQUAL "Windows")
-	target_compile_definitions(${TOOLS} PRIVATE NOMINMAX WIN32_LEAN_AND_MEAN)
-endif()
+	if (CMAKE_SYSTEM_NAME STREQUAL "Windows")
+		target_compile_definitions(${TOOL} PRIVATE NOMINMAX WIN32_LEAN_AND_MEAN)
+	endif()
 
-if (MSVC)
-	target_compile_options(${TOOLS} PRIVATE
-		"$<$<CONFIG:DEBUG>:/W4>"
-		"$<$<CONFIG:RELEASE>:/O2>"
-	)
-else()
-	target_compile_options(${TOOLS} PRIVATE
-		-Wall -Wextra -Wpedantic -Wstrict-aliasing -Wunreachable-code -Weffc++ -Wno-unused-parameter
-		"$<$<CONFIG:DEBUG>:-Og;-ggdb;-Werror>"
-		"$<$<CONFIG:RELEASE>:-Ofast>"
-	)
-endif()
+	if (MSVC)
+		target_compile_options(${TOOL} PRIVATE
+			"$<$<CONFIG:DEBUG>:/W4>"
+			"$<$<CONFIG:RELEASE>:/O2>"
+		)
+	else()
+		target_compile_options(${TOOL} PRIVATE
+			-Wall -Wextra -Wpedantic -Wstrict-aliasing -Wunreachable-code -Weffc++ -Wno-unused-parameter
+			"$<$<CONFIG:DEBUG>:-Og;-ggdb;-Werror>"
+			"$<$<CONFIG:RELEASE>:-Ofast>"
+		)
+	endif()
+endforeach()
 
 # There's an error in eigen's
 # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
diff --git a/tools/benchmark_convolution.cpp b/tools/benchmark_convolution.cpp
new file mode 100644
index 0000000..6501c55
--- /dev/null
+++ b/tools/benchmark_convolution.cpp
@@ -0,0 +1,208 @@
+// Microbenchmark for Conv1x1 and Conv1D convolution operations
+// Measures performance across various configurations of channels, groups, and frame sizes.
+// Outputs CSV format for analysis.
+
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "NAM/conv1d.h"
+#include "NAM/dsp.h"
+
+using std::chrono::duration;
+using std::chrono::duration_cast;
+using std::chrono::high_resolution_clock;
+using std::chrono::nanoseconds;
+
+// Number of iterations per benchmark configuration
+constexpr int NUM_WARMUP_ITERATIONS = 10; // untimed calls made before sampling starts
+constexpr int NUM_BENCHMARK_ITERATIONS = 100; // timed calls; each one yields one sample
+
+// Benchmark configurations (swept as nested loops in main)
+constexpr int CHANNELS[] = {8, 16, 32, 64, 128}; // used as both input and output channel count
+constexpr int GROUPS[] = {1, 2, 4, 8, 16}; // combinations with channels % groups != 0 are skipped
+constexpr int FRAMES[] = {64, 256, 1024}; // frames handed to each processing call
+constexpr int KERNEL_SIZES[] = {1, 3}; // For Conv1D
+
+struct BenchmarkResult
+{
+  double mean_ns = 0.0; // arithmetic mean of the samples, in nanoseconds
+  double stddev_ns = 0.0; // standard deviation of the samples, in nanoseconds
+  double min_ns = 0.0; // fastest sample, in nanoseconds
+  double max_ns = 0.0; // slowest sample, in nanoseconds
+};
+
+// Compute mean, population standard deviation, min and max of the timing samples (ns).
+// An empty sample set yields an all-zero result instead of reading samples[0].
+BenchmarkResult calculate_stats(const std::vector<double>& samples)
+{
+  if (samples.empty())
+    return BenchmarkResult{0.0, 0.0, 0.0, 0.0};
+
+  BenchmarkResult result;
+  double sum = 0.0;
+  result.min_ns = samples.front();
+  result.max_ns = samples.front();
+  for (const double s : samples)
+  {
+    sum += s;
+    if (s < result.min_ns)
+      result.min_ns = s;
+    if (s > result.max_ns)
+      result.max_ns = s;
+  }
+  const double count = static_cast<double>(samples.size());
+  result.mean_ns = sum / count;
+
+  double sq_sum = 0.0; // sum of squared deviations from the mean
+  for (const double s : samples)
+    sq_sum += (s - result.mean_ns) * (s - result.mean_ns);
+  result.stddev_ns = std::sqrt(sq_sum / count); // divides by N (population), not N - 1
+
+  return result;
+}
+
+// Benchmark Conv1x1::process_ on a square (channels x channels) layer and print one CSV row
+void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng)
+{
+  if (channels % groups != 0)
+    return; // Skip invalid configurations
+
+  // Create Conv1x1 layer (NOTE(review): 'false' is presumably the bias flag - confirm in dsp.h)
+  nam::Conv1x1 conv(channels, channels, false, groups);
+
+  // Initialize with random weights: one (channels/groups)^2 matrix per group
+  const int num_weights = (channels / groups) * (channels / groups) * groups;
+  std::vector<float> weights(num_weights);
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+  for (auto& w : weights)
+    w = dist(rng);
+
+  auto it = weights.begin();
+  conv.set_weights_(it);
+  conv.SetMaxBufferSize(frames);
+
+  // Create random input
+  Eigen::MatrixXf input(channels, frames);
+  for (int i = 0; i < channels; i++)
+    for (int j = 0; j < frames; j++)
+      input(i, j) = dist(rng);
+
+  // Warmup (untimed)
+  for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++)
+  {
+    conv.process_(input, frames);
+  }
+
+  // Benchmark: one timing sample per process_ call
+  std::vector<double> samples;
+  samples.reserve(NUM_BENCHMARK_ITERATIONS);
+
+  for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++)
+  {
+    const auto t1 = std::chrono::steady_clock::now(); // monotonic, unlike high_resolution_clock may be
+    conv.process_(input, frames);
+    const auto t2 = std::chrono::steady_clock::now();
+    samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  const BenchmarkResult result = calculate_stats(samples);
+
+  // Output CSV row; the kernel_size column is fixed to 1 for Conv1x1
+  std::cout << "Conv1x1," << channels << "," << groups << "," << frames << ",1," << std::fixed << std::setprecision(2)
+            << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << "," << result.max_ns << "\n";
+}
+
+// Benchmark Conv1D::Process on a square (channels x channels) layer and print one CSV row
+void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std::mt19937& rng)
+{
+  if (channels % groups != 0)
+    return; // Skip invalid configurations
+
+  // Create Conv1D layer; 4th and 5th arguments are do_bias = false and dilation = 1
+  nam::Conv1D conv;
+  conv.set_size_(channels, channels, kernel_size, false, 1, groups);
+
+  // Initialize with random weights: kernel_size matrices of (channels/groups)^2 per group
+  const int num_weights = kernel_size * (channels / groups) * (channels / groups) * groups;
+  std::vector<float> weights(num_weights);
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+  for (auto& w : weights)
+    w = dist(rng);
+
+  auto it = weights.begin();
+  conv.set_weights_(it);
+  conv.SetMaxBufferSize(frames);
+
+  // Create random input
+  Eigen::MatrixXf input(channels, frames);
+  for (int i = 0; i < channels; i++)
+    for (int j = 0; j < frames; j++)
+      input(i, j) = dist(rng);
+
+  // Warmup (untimed)
+  for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++)
+  {
+    conv.Process(input, frames);
+  }
+
+  // Benchmark: one timing sample per Process call
+  std::vector<double> samples;
+  samples.reserve(NUM_BENCHMARK_ITERATIONS);
+
+  for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++)
+  {
+    const auto t1 = std::chrono::steady_clock::now(); // monotonic, unlike high_resolution_clock may be
+    conv.Process(input, frames);
+    const auto t2 = std::chrono::steady_clock::now();
+    samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  const BenchmarkResult result = calculate_stats(samples);
+
+  // Output CSV row in the column order of the header printed by main
+  std::cout << "Conv1D," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed
+            << std::setprecision(2) << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << ","
+            << result.max_ns << "\n";
+}
+
+int main(int argc, char* argv[])
+{
+  // Emit the CSV header; every benchmark_* call below appends at most one data row
+  std::cout << "type,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns\n";
+
+  // A fixed seed keeps the random weights and inputs identical from run to run
+  std::mt19937 rng(42);
+
+  // Conv1x1 sweep: channels x groups x frames
+  for (const int num_channels : CHANNELS)
+  {
+    for (const int num_groups : GROUPS)
+    {
+      for (const int num_frames : FRAMES)
+      {
+        benchmark_conv1x1(num_channels, num_groups, num_frames, rng);
+      }
+    }
+  }
+
+  // Conv1D sweep: channels x groups x frames x kernel size
+  for (const int num_channels : CHANNELS)
+  {
+    for (const int num_groups : GROUPS)
+    {
+      for (const int num_frames : FRAMES)
+      {
+        for (const int taps : KERNEL_SIZES)
+        {
+          benchmark_conv1d(num_channels, num_groups, num_frames, taps, rng);
+        }
+      }
+    }
+  }
+
+  return 0;
+}
diff --git a/tools/plot_convolution_benchmark.py b/tools/plot_convolution_benchmark.py
new file mode 100644
index 0000000..929e947
--- /dev/null
+++ b/tools/plot_convolution_benchmark.py
@@ -0,0 +1,258 @@
+#!/usr/bin/env python3
+"""
+Visualization script for convolution benchmark results.
+
+Usage:
+    python plot_convolution_benchmark.py results.csv
+    python plot_convolution_benchmark.py before.csv after.csv  # Compare two runs
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+try:
+    import pandas as pd
+    import matplotlib.pyplot as plt
+    import numpy as np
+except ImportError:
+    print("Error: This script requires pandas and matplotlib.")
+    print("Install with: pip install pandas matplotlib")
+    sys.exit(1)
+
+
+def load_results(csv_path: str) -> pd.DataFrame:
+    """Read a benchmark CSV and add microsecond copies of its nanosecond columns.
+
+    Returns the DataFrame with mean_us, stddev_us, min_us and max_us appended.
+    """
+    frame = pd.read_csv(csv_path)
+    for stat in ("mean", "stddev", "min", "max"):
+        frame[f"{stat}_us"] = frame[f"{stat}_ns"] / 1000
+    return frame
+
+
+def plot_groups_vs_time(df: pd.DataFrame, conv_type: str, output_prefix: str):
+    """Plot mean execution time against group count, one PNG per frame size.
+
+    One error-bar series (mean_us +/- stddev_us) is drawn per channel count and per kernel
+    size, so Conv1D rows that share a group count never end up in one zig-zag line.
+    """
+    type_df = df[df["type"] == conv_type]
+    if type_df.empty:
+        print(f"No data for {conv_type}")
+        return
+
+    frames_list = sorted(type_df["frames"].unique())
+    channels_list = sorted(type_df["channels"].unique())
+    multi_kernel = type_df["kernel_size"].nunique() > 1
+
+    for frames in frames_list:
+        fig, ax = plt.subplots(figsize=(10, 6))
+        for channels in channels_list:
+            subset = type_df[(type_df["frames"] == frames) & (type_df["channels"] == channels)]
+            for kernel_size, series in subset.groupby("kernel_size"):  # empty subset: no iterations
+                series = series.sort_values("groups")
+                suffix = f", k={kernel_size}" if multi_kernel else ""
+                ax.errorbar(
+                    series["groups"],
+                    series["mean_us"],
+                    yerr=series["stddev_us"],
+                    marker="o",
+                    capsize=3,
+                    label=f"{channels} channels{suffix}",
+                )
+
+        ax.set_xlabel("Number of Groups")
+        ax.set_ylabel("Execution Time (microseconds)")
+        ax.set_title(f"{conv_type}: Groups vs Time (frames={frames})")
+        ax.legend()
+        ax.set_xscale("log", base=2)
+        ax.grid(True, alpha=0.3)
+
+        plt.tight_layout()
+        output_path = f"{output_prefix}_{conv_type.lower()}_frames{frames}_groups_vs_time.png"
+        plt.savefig(output_path, dpi=150)
+        print(f"Saved: {output_path}")
+        plt.close()
+
+
+def plot_speedup_vs_baseline(df: pd.DataFrame, conv_type: str, output_prefix: str):
+    """Plot speedup relative to the groups=1 baseline, one PNG per frame size.
+
+    Speedup is baseline mean_us / mean_us. Baseline and series are taken per channel count
+    and per kernel size, so a Conv1D row is only ever compared with the groups=1 row of
+    the same kernel size.
+    """
+    type_df = df[df["type"] == conv_type]
+    if type_df.empty:
+        print(f"No data for {conv_type}")
+        return
+
+    frames_list = sorted(type_df["frames"].unique())
+    channels_list = sorted(type_df["channels"].unique())
+    multi_kernel = type_df["kernel_size"].nunique() > 1
+
+    for frames in frames_list:
+        fig, ax = plt.subplots(figsize=(10, 6))
+        for channels in channels_list:
+            subset = type_df[(type_df["frames"] == frames) & (type_df["channels"] == channels)]
+            for kernel_size, series in subset.groupby("kernel_size"):
+                # Baseline: the groups=1 row of this very configuration
+                baseline = series[series["groups"] == 1]
+                if baseline.empty:
+                    continue  # nothing to normalise against
+                baseline_time = baseline["mean_us"].values[0]
+
+                series = series.sort_values("groups")
+                suffix = f", k={kernel_size}" if multi_kernel else ""
+                ax.plot(
+                    series["groups"],
+                    baseline_time / series["mean_us"],
+                    marker="o",
+                    label=f"{channels} channels{suffix}",
+                )
+
+        ax.axhline(y=1.0, color="gray", linestyle="--", alpha=0.5, label="Baseline (groups=1)")
+        ax.set_xlabel("Number of Groups")
+        ax.set_ylabel("Speedup (relative to groups=1)")
+        ax.set_title(f"{conv_type}: Speedup vs Groups (frames={frames})")
+        ax.legend()
+        ax.set_xscale("log", base=2)
+        ax.grid(True, alpha=0.3)
+
+        plt.tight_layout()
+        output_path = f"{output_prefix}_{conv_type.lower()}_frames{frames}_speedup.png"
+        plt.savefig(output_path, dpi=150)
+        print(f"Saved: {output_path}")
+        plt.close()
+
+
+def plot_comparison(df_before: pd.DataFrame, df_after: pd.DataFrame, conv_type: str, output_prefix: str):
+    """Compare two benchmark runs for one convolution type, one PNG per frame size.
+
+    Left panel: mean_us of both runs against group count. Right panel: speedup
+    (before mean_us / after mean_us). Series are built per channel count and per kernel
+    size, so a Conv1D measurement is only compared with the same configuration in the
+    other run. Frame sizes, channel counts and kernel sizes come from the "before" run.
+    """
+    before = df_before[df_before["type"] == conv_type]
+    after = df_after[df_after["type"] == conv_type]
+
+    if before.empty or after.empty:
+        print(f"No data for {conv_type}")
+        return
+
+    frames_list = sorted(before["frames"].unique())
+    channels_list = sorted(before["channels"].unique())
+    kernel_list = sorted(before["kernel_size"].unique())
+    multi_kernel = len(kernel_list) > 1
+
+    def select(run: pd.DataFrame, frames, channels, kernel_size) -> pd.DataFrame:
+        """Rows of one run for a single configuration, ordered by group count."""
+        mask = (run["frames"] == frames) & (run["channels"] == channels) & (run["kernel_size"] == kernel_size)
+        return run[mask].sort_values("groups")
+
+    for frames in frames_list:
+        fig, axes = plt.subplots(1, 2, figsize=(14, 5))
+        ax1, ax2 = axes  # left: absolute times, right: before/after speedup
+
+        # One pass per (channels, kernel_size) configuration feeds both panels
+        for channels in channels_list:
+            for kernel_size in kernel_list:
+                before_subset = select(before, frames, channels, kernel_size)
+                after_subset = select(after, frames, channels, kernel_size)
+                if before_subset.empty or after_subset.empty:
+                    continue
+                suffix = f", k={kernel_size}" if multi_kernel else ""
+
+                # Left plot: Absolute times
+                ax1.plot(
+                    before_subset["groups"],
+                    before_subset["mean_us"],
+                    marker="o",
+                    linestyle="--",
+                    alpha=0.7,
+                    label=f"{channels}ch{suffix} (before)",
+                )
+                ax1.plot(
+                    after_subset["groups"],
+                    after_subset["mean_us"],
+                    marker="s",
+                    label=f"{channels}ch{suffix} (after)",
+                )
+
+                # Right plot: speedup; with one kernel size per subset "groups" is a unique join key
+                merged = pd.merge(
+                    before_subset[["groups", "mean_us"]],
+                    after_subset[["groups", "mean_us"]],
+                    on="groups",
+                    suffixes=("_before", "_after"),
+                )
+                ax2.plot(
+                    merged["groups"],
+                    merged["mean_us_before"] / merged["mean_us_after"],
+                    marker="o",
+                    label=f"{channels} channels{suffix}",
+                )
+
+        ax1.set_xlabel("Number of Groups")
+        ax1.set_ylabel("Execution Time (microseconds)")
+        ax1.set_title(f"{conv_type}: Before vs After (frames={frames})")
+        ax1.legend(fontsize=8)
+        ax1.set_xscale("log", base=2)
+        ax1.grid(True, alpha=0.3)
+
+        ax2.axhline(y=1.0, color="gray", linestyle="--", alpha=0.5, label="No change")
+        ax2.set_xlabel("Number of Groups")
+        ax2.set_ylabel("Speedup (before/after)")
+        ax2.set_title(f"{conv_type}: Optimization Speedup (frames={frames})")
+        ax2.legend(fontsize=8)
+        ax2.set_xscale("log", base=2)
+        ax2.grid(True, alpha=0.3)
+
+        plt.tight_layout()
+        output_path = f"{output_prefix}_{conv_type.lower()}_frames{frames}_comparison.png"
+        plt.savefig(output_path, dpi=150)
+        print(f"Saved: {output_path}")
+        plt.close()
+
+
+def main():
+    """Command-line entry point: one CSV plots a single run, two CSVs compare runs."""
+    parser = argparse.ArgumentParser(description="Visualize convolution benchmark results")
+    parser.add_argument("csv_files", nargs="+", help="CSV file(s) with benchmark results")
+    parser.add_argument("-o", "--output-prefix", default="benchmark", help="Output file prefix")
+    args = parser.parse_args()
+    conv_types = ("Conv1x1", "Conv1D")
+
+    if len(args.csv_files) not in (1, 2):
+        print("Error: Provide 1 or 2 CSV files")
+        sys.exit(1)
+
+    if len(args.csv_files) == 1:
+        # Single file mode
+        df = load_results(args.csv_files[0])
+        for conv_type in conv_types:
+            plot_groups_vs_time(df, conv_type, args.output_prefix)
+            plot_speedup_vs_baseline(df, conv_type, args.output_prefix)
+
+        print("\nSummary statistics:")
+        print(df.groupby(["type", "channels", "groups"])["mean_us"].mean().unstack())
+        return
+
+    # Comparison mode
+    df_before, df_after = (load_results(path) for path in args.csv_files)
+    for conv_type in conv_types:
+        plot_comparison(df_before, df_after, conv_type, args.output_prefix)
+
+    # Calculate overall improvement
+    print("\nOverall speedup (before/after):")
+    for conv_type in conv_types:
+        before_mean = df_before[df_before["type"] == conv_type]["mean_us"].mean()
+        after_mean = df_after[df_after["type"] == conv_type]["mean_us"].mean()
+        print(f"  {conv_type}: {before_mean/after_mean:.2f}x")
+
+
+if __name__ == "__main__":  # run main() only when executed as a script, not on import
+    main()
diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp
index be65760..e1ebbf7 100644
--- a/tools/run_tests.cpp
+++ b/tools/run_tests.cpp
@@ -103,6 +103,9 @@ int main()
   test_conv1d::test_process_grouped_dilation();
   test_conv1d::test_process_grouped_channel_isolation();
   test_conv1d::test_get_num_weights_grouped();
+  test_conv1d::test_process_grouped_realtime_safe();
+  test_conv1d::test_process_realtime_safe();
+  test_conv1d::test_process_grouped_dilated_realtime_safe();
 
   test_conv_1x1::test_construct();
   test_conv_1x1::test_construct_with_groups();
@@ -118,6 +121,8 @@ int main()
   test_conv_1x1::test_process_underscore_grouped();
   test_conv_1x1::test_set_max_buffer_size();
   test_conv_1x1::test_process_multiple_calls();
+  test_conv_1x1::test_process_grouped_realtime_safe();
+  test_conv_1x1::test_process_realtime_safe();
 
   test_film::test_set_max_buffer_size();
   test_film::test_process_bias_only();
diff --git a/tools/test/test_conv1d.cpp b/tools/test/test_conv1d.cpp
index 900eea0..14e72aa 100644
--- a/tools/test/test_conv1d.cpp
+++ b/tools/test/test_conv1d.cpp
@@ -7,6 +7,7 @@
 #include <vector>
 
 #include "NAM/conv1d.h"
+#include "allocation_tracking.h"
 
 namespace test_conv1d
 {
@@ -848,4 +849,165 @@ void test_get_num_weights_grouped()
   actual = conv_4groups.get_num_weights();
   assert(actual == expected);
 }
+
+// Real-time-safety check: runs grouped Conv1D::Process() (groups=4, 8 in / 8 out channels, kernel size 3, with bias) under allocation_tracking::run_allocation_test_no_allocations
+void test_process_grouped_realtime_safe()
+{
+  const int in_channels = 8;
+  const int out_channels = 8;
+  const int kernel_size = 3;
+  const bool do_bias = true;
+  const int dilation = 1;
+  const int groups = 4;
+  const int num_frames = 64;
+
+  nam::Conv1D conv;
+  conv.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups);
+
+  // Build the flat weight vector: within each group, identity on the last kernel tap and zero on every other tap
+  std::vector<float> weights;
+  const int in_per_group = in_channels / groups; // 8 / 4 = 2 input channels per group
+  const int out_per_group = out_channels / groups; // 8 / 4 = 2 output channels per group
+
+  // Push order: group g, output row i, input column j, kernel tap k (innermost) - presumably the order set_weights_() reads them; confirm against Conv1D::set_weights_
+  for (int g = 0; g < groups; g++)
+  {
+    for (int i = 0; i < out_per_group; i++)
+    {
+      for (int j = 0; j < in_per_group; j++)
+      {
+        for (int k = 0; k < kernel_size; k++)
+        {
+          // 1.0 only where k is the last tap and i == j (diagonal within the group); 0.0 everywhere else
+          weights.push_back((k == kernel_size - 1 && i == j) ? 1.0f : 0.0f);
+        }
+      }
+    }
+  }
+  // Append one zero bias value per output channel (do_bias is true for this test)
+  for (int i = 0; i < out_channels; i++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  auto it = weights.begin(); // named iterator so it can be handed to set_weights_ as an lvalue
+  conv.set_weights_(it);
+  conv.SetMaxBufferSize(num_frames); // called before the tracked region, presumably so buffer sizing happens outside it
+
+  // Fill an (in_channels x num_frames) input with the deterministic ramp input(i, j) = i + j
+  Eigen::MatrixXf input(in_channels, num_frames);
+  for (int i = 0; i < in_channels; i++)
+    for (int j = 0; j < num_frames; j++)
+      input(i, j) = static_cast<float>(i + j);
+
+  // Hand only the Process() call to the harness; the first and third arguments are nullptr (presumably optional setup/teardown hooks - confirm in allocation_tracking.h)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      conv.Process(input, num_frames);
+    },
+    nullptr, "test_process_grouped_realtime_safe");
+}
+
+// Real-time-safety check: runs Conv1D::Process() without a groups argument (16 in / 16 out channels, kernel size 3, with bias) under allocation_tracking::run_allocation_test_no_allocations
+void test_process_realtime_safe()
+{
+  const int in_channels = 16;
+  const int out_channels = 16;
+  const int kernel_size = 3;
+  const bool do_bias = true;
+  const int dilation = 1;
+  const int num_frames = 64;
+
+  nam::Conv1D conv;
+  conv.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation); // groups omitted: relies on set_size_'s default (presumably 1 - confirm)
+
+  // Build the flat weight vector in the order output row i, input column j, kernel tap k (innermost): identity on the last tap, zero on every other tap
+  std::vector<float> weights;
+  for (int i = 0; i < out_channels; i++)
+  {
+    for (int j = 0; j < in_channels; j++)
+    {
+      for (int k = 0; k < kernel_size; k++)
+      {
+        weights.push_back((k == kernel_size - 1 && i == j) ? 1.0f : 0.0f);
+      }
+    }
+  }
+  // Append one zero bias value per output channel (do_bias is true for this test)
+  for (int i = 0; i < out_channels; i++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  auto it = weights.begin(); // named iterator so it can be handed to set_weights_ as an lvalue
+  conv.set_weights_(it);
+  conv.SetMaxBufferSize(num_frames); // called before the tracked region, presumably so buffer sizing happens outside it
+
+  // Fill an (in_channels x num_frames) input with the deterministic ramp input(i, j) = i + j
+  Eigen::MatrixXf input(in_channels, num_frames);
+  for (int i = 0; i < in_channels; i++)
+    for (int j = 0; j < num_frames; j++)
+      input(i, j) = static_cast<float>(i + j);
+
+  // Hand only the Process() call to the harness; the first and third arguments are nullptr (presumably optional setup/teardown hooks - confirm in allocation_tracking.h)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      conv.Process(input, num_frames);
+    },
+    nullptr, "test_process_realtime_safe");
+}
+
+// Real-time-safety check: runs grouped, dilated Conv1D::Process() (groups=2, dilation=4, kernel size 2, no bias) under allocation_tracking::run_allocation_test_no_allocations
+void test_process_grouped_dilated_realtime_safe()
+{
+  const int in_channels = 8;
+  const int out_channels = 8;
+  const int kernel_size = 2;
+  const bool do_bias = false;
+  const int dilation = 4;
+  const int groups = 2;
+  const int num_frames = 64;
+
+  nam::Conv1D conv;
+  conv.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups);
+
+  // Build the flat weight vector: per-group identity on the last kernel tap, zero on the other tap; no bias values follow because do_bias is false
+  std::vector<float> weights;
+  const int in_per_group = in_channels / groups; // 8 / 2 = 4 input channels per group
+  const int out_per_group = out_channels / groups; // 8 / 2 = 4 output channels per group
+
+  for (int g = 0; g < groups; g++)
+  {
+    for (int i = 0; i < out_per_group; i++)
+    {
+      for (int j = 0; j < in_per_group; j++)
+      {
+        for (int k = 0; k < kernel_size; k++)
+        {
+          weights.push_back((k == kernel_size - 1 && i == j) ? 1.0f : 0.0f);
+        }
+      }
+    }
+  }
+
+  auto it = weights.begin(); // named iterator so it can be handed to set_weights_ as an lvalue
+  conv.set_weights_(it);
+  conv.SetMaxBufferSize(num_frames); // called before the tracked region, presumably so buffer sizing happens outside it
+
+  // Fill an (in_channels x num_frames) input with the deterministic ramp input(i, j) = i + j
+  Eigen::MatrixXf input(in_channels, num_frames);
+  for (int i = 0; i < in_channels; i++)
+    for (int j = 0; j < num_frames; j++)
+      input(i, j) = static_cast<float>(i + j);
+
+  // Hand only the Process() call to the harness; the first and third arguments are nullptr (presumably optional setup/teardown hooks - confirm in allocation_tracking.h)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      conv.Process(input, num_frames);
+    },
+    nullptr, "test_process_grouped_dilated_realtime_safe");
+}
 }; // namespace test_conv1d
diff --git a/tools/test/test_conv_1x1.cpp b/tools/test/test_conv_1x1.cpp
index cb3e234..3cba668 100644
--- a/tools/test/test_conv_1x1.cpp
+++ b/tools/test/test_conv_1x1.cpp
@@ -8,6 +8,7 @@
 #include <vector>
 
 #include "NAM/dsp.h"
+#include "allocation_tracking.h"
 
 namespace test_conv_1x1
 {
@@ -492,4 +493,98 @@ void test_process_multiple_calls()
   assert(std::abs(output2(0, 0) - 3.0f) < 0.01f);
   assert(std::abs(output2(1, 0) - 4.0f) < 0.01f);
 }
+
+// Real-time-safety check: runs grouped Conv1x1::process_() (groups=4, 8 in / 8 out channels, with bias) under allocation_tracking::run_allocation_test_no_allocations
+void test_process_grouped_realtime_safe()
+{
+  const int in_channels = 8;
+  const int out_channels = 8;
+  const bool do_bias = true;
+  const int groups = 4;
+  const int num_frames = 64;
+
+  nam::Conv1x1 conv(in_channels, out_channels, do_bias, groups);
+
+  // Build the flat weight vector group by group: each group contributes an (out_per_group x in_per_group) identity, pushed with output row i outer and input column j inner
+  std::vector<float> weights;
+  const int in_per_group = in_channels / groups; // 8 / 4 = 2 input channels per group
+  const int out_per_group = out_channels / groups; // 8 / 4 = 2 output channels per group
+  for (int g = 0; g < groups; g++)
+  {
+    for (int i = 0; i < out_per_group; i++)
+    {
+      for (int j = 0; j < in_per_group; j++)
+      {
+        weights.push_back(i == j ? 1.0f : 0.0f);
+      }
+    }
+  }
+  // Append one zero bias value per output channel (do_bias is true for this test)
+  for (int i = 0; i < out_channels; i++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  auto it = weights.begin(); // named iterator so it can be handed to set_weights_ as an lvalue
+  conv.set_weights_(it);
+  conv.SetMaxBufferSize(num_frames); // called before the tracked region, presumably so the output buffer is sized outside it
+
+  // Fill an (in_channels x num_frames) input with the deterministic ramp input(i, j) = i + j
+  Eigen::MatrixXf input(in_channels, num_frames);
+  for (int i = 0; i < in_channels; i++)
+    for (int j = 0; j < num_frames; j++)
+      input(i, j) = static_cast<float>(i + j);
+
+  // Hand only the process_() call to the harness; the first and third arguments are nullptr (presumably optional setup/teardown hooks - confirm in allocation_tracking.h)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      conv.process_(input, num_frames);
+    },
+    nullptr, "test_process_grouped_realtime_safe"); // NOTE(review): same name string as the test_conv1d test of this name - consider qualifying it if the harness reports it
+}
+
+// Real-time-safety check: runs Conv1x1::process_() constructed without a groups argument (16 in / 16 out channels, with bias) under allocation_tracking::run_allocation_test_no_allocations
+void test_process_realtime_safe()
+{
+  const int in_channels = 16;
+  const int out_channels = 16;
+  const bool do_bias = true;
+  const int num_frames = 64;
+
+  nam::Conv1x1 conv(in_channels, out_channels, do_bias); // groups omitted: relies on the constructor's default (presumably 1 - confirm)
+
+  // Build the flat weight vector as a 16x16 identity, pushed with output row i outer and input column j inner
+  std::vector<float> weights;
+  for (int i = 0; i < out_channels; i++)
+  {
+    for (int j = 0; j < in_channels; j++)
+    {
+      weights.push_back(i == j ? 1.0f : 0.0f);
+    }
+  }
+  // Append one zero bias value per output channel (do_bias is true for this test)
+  for (int i = 0; i < out_channels; i++)
+  {
+    weights.push_back(0.0f);
+  }
+
+  auto it = weights.begin(); // named iterator so it can be handed to set_weights_ as an lvalue
+  conv.set_weights_(it);
+  conv.SetMaxBufferSize(num_frames); // called before the tracked region, presumably so the output buffer is sized outside it
+
+  // Fill an (in_channels x num_frames) input with the deterministic ramp input(i, j) = i + j
+  Eigen::MatrixXf input(in_channels, num_frames);
+  for (int i = 0; i < in_channels; i++)
+    for (int j = 0; j < num_frames; j++)
+      input(i, j) = static_cast<float>(i + j);
+
+  // Hand only the process_() call to the harness; the first and third arguments are nullptr (presumably optional setup/teardown hooks - confirm in allocation_tracking.h)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      conv.process_(input, num_frames);
+    },
+    nullptr, "test_process_realtime_safe"); // NOTE(review): same name string as the test_conv1d test of this name - consider qualifying it if the harness reports it
+}
 } // namespace test_conv_1x1

From 68d31d39c87e2efc4baa52e0e5bcfff007b25b5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Wed, 28 Jan 2026 15:47:30 -0800
Subject: [PATCH 2/4] Optimize grouped convolutions: loop-based for Conv1x1,
 block-diagonal for Conv1D

Conv1x1: Use an explicit per-group loop, with a fast path for groups=1. For small
channel counts (2-8), this avoids multiplying by the zero off-diagonal blocks of a
block-diagonal weight matrix, which is work that BLAS cannot optimize away.

Conv1D: Keep block-diagonal approach (single matmul per kernel position) which
shows 1.5-1.9x speedup for grouped convolutions. The multiple kernel positions
amortize the overhead, making this approach beneficial.

Removed pre-computed GroupBlock structs as they are no longer needed with
these simplified implementations.

Updated benchmark tool to test channels 2-8 for detailed comparison.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 NAM/conv1d.cpp                  | 118 +------
 NAM/conv1d.h                    |  14 -
 NAM/dsp.cpp                     | 102 +++---
 NAM/dsp.h                       |  13 -
 tools/benchmark_compare.sh      | 590 ++++++++++++++++----------------
 tools/benchmark_convolution.cpp |   4 +-
 6 files changed, 356 insertions(+), 485 deletions(-)

diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp
index fda50aa..ff4e55d 100644
--- a/NAM/conv1d.cpp
+++ b/NAM/conv1d.cpp
@@ -54,26 +54,15 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int
 
   this->_num_groups = groups;
   this->_weight.resize(kernel_size);
+  // Initialize weight matrices to zero - critical for block-diagonal structure
+  // Off-diagonal blocks must be zero for single-matmul grouped convolution
   for (size_t i = 0; i < this->_weight.size(); i++)
-    this->_weight[i].resize(out_channels,
-                            in_channels); // y = Ax, input array (C,L)
+    this->_weight[i].setZero(out_channels, in_channels);
   if (do_bias)
     this->_bias.resize(out_channels);
   else
     this->_bias.resize(0);
   this->_dilation = _dilation;
-
-  // Pre-compute group block indices for efficient runtime access
-  const long out_per_group = out_channels / groups;
-  const long in_per_group = in_channels / groups;
-  this->_group_blocks.resize(groups);
-  for (int g = 0; g < groups; g++)
-  {
-    this->_group_blocks[g].out_start = g * out_per_group;
-    this->_group_blocks[g].in_start = g * in_per_group;
-    this->_group_blocks[g].out_size = out_per_group;
-    this->_group_blocks[g].in_size = in_per_group;
-  }
 }
 
 void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size,
@@ -116,52 +105,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
   // Zero output before processing
   _output.leftCols(num_frames).setZero();
 
-  const int numGroups = this->_num_groups;
-
-  // Process from ring buffer with dilation lookback
-  // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1]
-  // For kernel tap k with offset, we need to read from _write_pos + offset
-  // The offset is negative (looking back), so _write_pos + offset reads from earlier positions
-  // The original process_() reads: input.middleCols(i_start + offset, ncols)
-  // where i_start is the current position and offset is negative for lookback
-
-  if (numGroups == 1)
-  {
-    // Standard convolution (no grouping)
-    for (size_t k = 0; k < this->_weight.size(); k++)
-    {
-      const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
-      const long lookback = -offset;
-      auto input_block = _input_buffer.Read(num_frames, lookback);
-      _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
-    }
-  }
-  else
+  // Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs.
+  // Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result
+  // as G separate matmuls. This is more efficient because BLAS can optimize larger operations.
+  for (size_t k = 0; k < this->_weight.size(); k++)
   {
-    // Grouped convolution: process each group separately using pre-computed block indices
-    for (int g = 0; g < numGroups; g++)
-    {
-      const auto& block = this->_group_blocks[g];
-
-      for (size_t k = 0; k < this->_weight.size(); k++)
-      {
-        const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
-        const long lookback = -offset;
-        auto input_block = _input_buffer.Read(num_frames, lookback);
-
-        // Extract input slice for this group
-        auto input_group = input_block.middleRows(block.in_start, block.in_size);
-
-        // Extract weight slice for this group
-        auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size);
-
-        // Extract output slice for this group
-        auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size);
-
-        // Perform grouped convolution: output_group += weight_group * input_group
-        output_group.noalias() += weight_group * input_group;
-      }
-    }
+    const long offset = this->_dilation * (k + 1 - (long)this->_weight.size());
+    const long lookback = -offset;
+    auto input_block = _input_buffer.Read(num_frames, lookback);
+    _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block;
   }
 
   // Add bias if present
@@ -177,47 +129,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames)
 void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols,
                       const long j_start) const
 {
-  const int numGroups = this->_num_groups;
-
-  if (numGroups == 1)
+  // Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs.
+  // Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result.
+  for (size_t k = 0; k < this->_weight.size(); k++)
   {
-    // Standard convolution (no grouping)
-    for (size_t k = 0; k < this->_weight.size(); k++)
-    {
-      const long offset = this->_dilation * (k + 1 - this->_weight.size());
-      if (k == 0)
-        output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
-      else
-        output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
-    }
-  }
-  else
-  {
-    // Grouped convolution: process each group separately using pre-computed block indices
-    for (int g = 0; g < numGroups; g++)
-    {
-      const auto& block = this->_group_blocks[g];
-
-      for (size_t k = 0; k < this->_weight.size(); k++)
-      {
-        const long offset = this->_dilation * (k + 1 - this->_weight.size());
-
-        // Extract input slice for this group
-        auto input_group = input.middleCols(i_start + offset, ncols).middleRows(block.in_start, block.in_size);
-
-        // Extract weight slice for this group
-        auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size);
-
-        // Extract output slice for this group
-        auto output_group = output.middleCols(j_start, ncols).middleRows(block.out_start, block.out_size);
-
-        // Perform grouped convolution
-        if (k == 0)
-          output_group.noalias() = weight_group * input_group;
-        else
-          output_group.noalias() += weight_group * input_group;
-      }
-    }
+    const long offset = this->_dilation * (k + 1 - this->_weight.size());
+    if (k == 0)
+      output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols);
+    else
+      output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols);
   }
   if (this->_bias.size() > 0)
   {
diff --git a/NAM/conv1d.h b/NAM/conv1d.h
index 0bf64a7..8182966 100644
--- a/NAM/conv1d.h
+++ b/NAM/conv1d.h
@@ -6,19 +6,6 @@
 
 namespace nam
 {
-
-/// \brief Pre-computed group block indices for grouped convolutions
-///
-/// Stores the indices for extracting input/output slices for each group,
-/// avoiding repeated computation during real-time processing.
-struct Conv1DGroupBlock
-{
-  long out_start; ///< Starting row index in output
-  long in_start; ///< Starting row index in input
-  long out_size; ///< Number of output channels per group
-  long in_size; ///< Number of input channels per group
-};
-
 /// \brief 1D dilated convolution layer
 ///
 /// Implements a 1D convolution with support for dilation and grouped convolution.
@@ -136,7 +123,6 @@ class Conv1D
   Eigen::VectorXf _bias;
   int _dilation;
   int _num_groups;
-  std::vector<Conv1DGroupBlock> _group_blocks; ///< Pre-computed group block indices
 
 private:
   RingBuffer _input_buffer; // Ring buffer for input (channels x buffer_size)
diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp
index 5eb567e..bdfa8e3 100644
--- a/NAM/dsp.cpp
+++ b/NAM/dsp.cpp
@@ -331,7 +331,9 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool
   }
 
   this->_num_groups = groups;
-  this->_weight.resize(out_channels, in_channels);
+  // Initialize weight matrix to zero - critical for block-diagonal structure
+  // Off-diagonal blocks must be zero for single-matmul grouped convolution
+  this->_weight.setZero(out_channels, in_channels);
   this->_do_bias = _bias;
   if (_bias)
     this->_bias.resize(out_channels);
@@ -353,17 +355,9 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
     const long out_per_group = out_channels / numGroups;
     const long in_per_group = in_channels / numGroups;
 
-    // Pre-compute group block indices for efficient runtime access
-    this->_group_blocks.resize(numGroups);
-    for (int g = 0; g < numGroups; g++)
-    {
-      this->_group_blocks[g].out_start = g * out_per_group;
-      this->_group_blocks[g].in_start = g * in_per_group;
-      this->_group_blocks[g].out_size = out_per_group;
-      this->_group_blocks[g].in_size = in_per_group;
-    }
-
-    // For grouped convolutions, weights are organized per group
+    // For grouped convolutions, weights form a block-diagonal matrix.
+    // Off-diagonal blocks are already zero (from constructor).
+    // We only set the diagonal blocks here.
     // Weight layout: weights are [group0, group1, ..., groupN-1]
     // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups)
     for (int g = 0; g < numGroups; g++)
@@ -384,78 +378,62 @@ void nam::Conv1x1::set_weights_(std::vector<float>::iterator& weights)
 
 Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const
 {
-  const int numGroups = this->_num_groups;
-  const long out_channels = get_out_channels();
-
-  Eigen::MatrixXf result(out_channels, num_frames);
+  const long out_channels = this->_weight.rows();
+  const long in_channels = this->_weight.cols();
 
-  if (numGroups == 1)
+  // For groups=1, use simple matrix multiply (most common case)
+  if (this->_num_groups == 1)
   {
-    // Standard convolution (no grouping)
     if (this->_do_bias)
-      result = (this->_weight * input.leftCols(num_frames)).colwise() + this->_bias;
+      return (this->_weight * input.leftCols(num_frames)).colwise() + this->_bias;
     else
-      result = this->_weight * input.leftCols(num_frames);
+      return this->_weight * input.leftCols(num_frames);
   }
-  else
-  {
-    // Grouped convolution: process each group separately using pre-computed block indices
-    result.setZero();
-    for (int g = 0; g < numGroups; g++)
-    {
-      const auto& block = this->_group_blocks[g];
 
-      // Extract input slice for this group
-      auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size);
+  // For grouped convolutions with small channel counts, explicit loop is faster
+  // than block-diagonal single matmul due to BLAS overhead on small matrices
+  const long out_per_group = out_channels / this->_num_groups;
+  const long in_per_group = in_channels / this->_num_groups;
 
-      // Extract weight slice for this group
-      auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size);
-
-      // Extract output slice for this group
-      auto output_group = result.middleRows(block.out_start, block.out_size);
-
-      // Perform grouped convolution: output_group = weight_group * input_group
-      output_group.noalias() = weight_group * input_group;
-    }
-
-    // Add bias if present
-    if (this->_do_bias)
-      result.colwise() += this->_bias;
+  Eigen::MatrixXf output(out_channels, num_frames);
+  for (int g = 0; g < this->_num_groups; g++)
+  {
+    auto input_group = input.middleRows(g * in_per_group, in_per_group).leftCols(num_frames);
+    auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
+    output.middleRows(g * out_per_group, out_per_group).noalias() = weight_group * input_group;
   }
 
-  return result;
+  if (this->_do_bias)
+    output.colwise() += this->_bias;
+
+  return output;
 }
 
 void nam::Conv1x1::process_(const Eigen::Ref<const Eigen::MatrixXf>& input, const int num_frames)
 {
   assert(num_frames <= _output.cols());
 
-  const int numGroups = this->_num_groups;
+  const long out_channels = this->_weight.rows();
+  const long in_channels = this->_weight.cols();
 
-  if (numGroups == 1)
+  // For groups=1, use simple matrix multiply (most common case)
+  if (this->_num_groups == 1)
   {
-    // Standard convolution (no grouping)
     _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames);
   }
   else
   {
-    // Grouped convolution: process each group separately using pre-computed block indices
-    _output.leftCols(num_frames).setZero();
-    for (int g = 0; g < numGroups; g++)
-    {
-      const auto& block = this->_group_blocks[g];
-
-      // Extract input slice for this group
-      auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size);
-
-      // Extract weight slice for this group
-      auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size);
+    // For grouped convolutions with small channel counts, explicit loop is faster
+    // than block-diagonal single matmul due to BLAS overhead on small matrices
+    const long out_per_group = out_channels / this->_num_groups;
+    const long in_per_group = in_channels / this->_num_groups;
 
-      // Extract output slice for this group
-      auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size);
-
-      // Perform grouped convolution: output_group = weight_group * input_group
-      output_group.noalias() = weight_group * input_group;
+    for (int g = 0; g < this->_num_groups; g++)
+    {
+      auto input_group = input.middleRows(g * in_per_group, in_per_group).leftCols(num_frames);
+      auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group);
+      _output.middleRows(g * out_per_group, out_per_group).leftCols(num_frames).noalias() =
+        weight_group * input_group;
     }
   }
 
diff --git a/NAM/dsp.h b/NAM/dsp.h
index f7763c2..8b984d2 100644
--- a/NAM/dsp.h
+++ b/NAM/dsp.h
@@ -269,18 +269,6 @@ std::unique_ptr<DSP> Factory(const nlohmann::json& config, std::vector<float>& w
 
 // NN modules =================================================================
 
-/// \brief Pre-computed group block indices for grouped convolutions
-///
-/// Stores the indices for extracting input/output slices for each group,
-/// avoiding repeated computation during real-time processing.
-struct GroupBlock
-{
-  long out_start; ///< Starting row index in output
-  long in_start; ///< Starting row index in input
-  long out_size; ///< Number of output channels per group
-  long in_size; ///< Number of input channels per group
-};
-
 /// \brief 1x1 convolution (really just a fully-connected linear layer operating per-sample)
 ///
 /// Performs a pointwise convolution, which is equivalent to a fully connected layer
@@ -342,7 +330,6 @@ class Conv1x1
   Eigen::MatrixXf _weight;
   Eigen::VectorXf _bias;
   int _num_groups;
-  std::vector<GroupBlock> _group_blocks; ///< Pre-computed group block indices
 
 private:
   Eigen::MatrixXf _output;
diff --git a/tools/benchmark_compare.sh b/tools/benchmark_compare.sh
index e742fd1..8b12197 100755
--- a/tools/benchmark_compare.sh
+++ b/tools/benchmark_compare.sh
@@ -3,13 +3,13 @@
 # Script to compare performance of current branch against another branch (default: main)
 # Usage: ./tools/benchmark_compare.sh [--model MODEL_PATH] [--branch BRANCH_NAME]
 
-set -e  # Exit on error
+set -e # Exit on error
 
-MODEL_PATH="example_models/wavenet_a1_standard.nam"
+MODEL_PATH="example_models/wavenet_a2_max.nam" # "example_models/wavenet_a1_standard.nam"
 BUILD_DIR="build"
 BENCHMARK_EXEC="build/tools/benchmodel"
 NUM_RUNS=10
-COMPARE_BRANCH="main"  # Default branch to compare against
+COMPARE_BRANCH="main" # Default branch to compare against
 # Report file will be set with timestamp in main()
 
 # Colors for output
@@ -20,80 +20,80 @@ NC='\033[0m' # No Color
 
 # Function to extract milliseconds from benchmodel output
 extract_ms() {
-    local output="$1"
-    # Extract the double precision milliseconds value (the second one)
-    echo "$output" | grep -E "^[0-9]+\.[0-9]+ms$" | head -1 | sed 's/ms$//'
+  local output="$1"
+  # Extract the double precision milliseconds value (the second one)
+  echo "$output" | grep -E "^[0-9]+\.[0-9]+ms$" | head -1 | sed 's/ms$//'
 }
 
 # Function to run benchmark multiple times and collect results
 run_benchmark() {
-    local branch_name="$1"
-    local results_file="$2"
-    local project_root="$PWD"  # Save current directory
-    
-    echo -e "${YELLOW}Running benchmark on branch: ${branch_name}${NC}"
-    
-    # Clean build directory - remove only untracked files, preserve tracked files like .gitignore
-    if [ -d "$BUILD_DIR" ]; then
-        # Remove files/directories that aren't tracked by git (process depth-first)
-        find "$BUILD_DIR" -mindepth 1 -depth -exec sh -c 'if ! git ls-files --error-unmatch "$1" >/dev/null 2>&1; then rm -rf "$1"; fi' _ {} \;
-    fi
-    mkdir -p "$BUILD_DIR"
-    
-    # Configure and build in release mode
-    echo "Configuring CMake..."
-    cd "$BUILD_DIR" || exit 1
-    cmake -DCMAKE_BUILD_TYPE=Release ..
-    
-    echo "Building benchmodel..."
-    cmake --build . --target benchmodel -j$(sysctl -n hw.ncpu 2>/dev/null || echo 4)
-    cd "$project_root" || exit 1
-    
-    # Verify executable exists
-    if [ ! -f "$BENCHMARK_EXEC" ]; then
-        echo -e "${RED}Error: benchmodel executable not found at $BENCHMARK_EXEC${NC}"
-        exit 1
-    fi
-    
-    # Verify model file exists (use absolute path to be sure)
-    local abs_model_path="$project_root/$MODEL_PATH"
-    if [ ! -f "$abs_model_path" ]; then
-        echo -e "${RED}Error: Model file not found at $abs_model_path${NC}"
-        echo "Available model files:"
-        find "$project_root/example_models" -name "*.nam" -type f 2>/dev/null || echo "  (none found)"
-        exit 1
+  local branch_name="$1"
+  local results_file="$2"
+  local project_root="$PWD" # Save current directory
+
+  echo -e "${YELLOW}Running benchmark on branch: ${branch_name}${NC}"
+
+  # Clean build directory - remove only untracked files, preserve tracked files like .gitignore
+  if [ -d "$BUILD_DIR" ]; then
+    # Remove files/directories that aren't tracked by git (process depth-first)
+    find "$BUILD_DIR" -mindepth 1 -depth -exec sh -c 'if ! git ls-files --error-unmatch "$1" >/dev/null 2>&1; then rm -rf "$1"; fi' _ {} \;
+  fi
+  mkdir -p "$BUILD_DIR"
+
+  # Configure and build in release mode
+  echo "Configuring CMake..."
+  cd "$BUILD_DIR" || exit 1
+  cmake -DCMAKE_BUILD_TYPE=Release ..
+
+  echo "Building benchmodel..."
+  cmake --build . --target benchmodel -j$(sysctl -n hw.ncpu 2>/dev/null || echo 4)
+  cd "$project_root" || exit 1
+
+  # Verify executable exists
+  if [ ! -f "$BENCHMARK_EXEC" ]; then
+    echo -e "${RED}Error: benchmodel executable not found at $BENCHMARK_EXEC${NC}"
+    exit 1
+  fi
+
+  # Verify model file exists (use absolute path to be sure)
+  local abs_model_path="$project_root/$MODEL_PATH"
+  if [ ! -f "$abs_model_path" ]; then
+    echo -e "${RED}Error: Model file not found at $abs_model_path${NC}"
+    echo "Available model files:"
+    find "$project_root/example_models" -name "*.nam" -type f 2>/dev/null || echo "  (none found)"
+    exit 1
+  fi
+
+  # Run benchmark multiple times
+  echo "Running benchmark $NUM_RUNS times..."
+  >"$results_file" # Clear results file
+
+  for i in $(seq 1 $NUM_RUNS); do
+    echo -n "  Run $i/$NUM_RUNS... "
+    output=$("$BENCHMARK_EXEC" "$abs_model_path" 2>&1)
+    ms=$(extract_ms "$output")
+
+    if [ -z "$ms" ]; then
+      echo -e "${RED}Failed to extract timing${NC}"
+      echo "Output was:"
+      echo "$output"
+      exit 1
     fi
-    
-    # Run benchmark multiple times
-    echo "Running benchmark $NUM_RUNS times..."
-    > "$results_file"  # Clear results file
-    
-    for i in $(seq 1 $NUM_RUNS); do
-        echo -n "  Run $i/$NUM_RUNS... "
-        output=$("$BENCHMARK_EXEC" "$abs_model_path" 2>&1)
-        ms=$(extract_ms "$output")
-        
-        if [ -z "$ms" ]; then
-            echo -e "${RED}Failed to extract timing${NC}"
-            echo "Output was:"
-            echo "$output"
-            exit 1
-        fi
-        
-        echo "$ms" >> "$results_file"
-        echo "${ms}ms"
-    done
-    
-    echo -e "${GREEN}Completed benchmark for ${branch_name}${NC}"
-    echo ""
+
+    echo "$ms" >>"$results_file"
+    echo "${ms}ms"
+  done
+
+  echo -e "${GREEN}Completed benchmark for ${branch_name}${NC}"
+  echo ""
 }
 
 # Function to calculate statistics
 calculate_stats() {
-    local results_file="$1"
-    
-    # Calculate mean, min, max, stddev with awk
-    local stats=$(awk '
+  local results_file="$1"
+
+  # Calculate mean, min, max, stddev with awk
+  local stats=$(awk '
     {
         sum += $1
         sumsq += $1 * $1
@@ -107,248 +107,248 @@ calculate_stats() {
         stddev = sqrt(variance)
         printf "%.3f %.3f %.3f %.3f %d", mean, min, max, stddev, n
     }' "$results_file")
-    
-    # Calculate median using sort (works with BSD awk)
-    local n=$(echo "$stats" | awk '{print $5}')
-    local median
-    if [ $((n % 2)) -eq 0 ]; then
-        # Even number of values: average of middle two
-        local mid1=$((n / 2))
-        local mid2=$((n / 2 + 1))
-        local val1=$(sort -n "$results_file" | sed -n "${mid1}p")
-        local val2=$(sort -n "$results_file" | sed -n "${mid2}p")
-        median=$(echo "scale=3; ($val1 + $val2) / 2" | bc)
-    else
-        # Odd number of values: middle value
-        local mid=$((n / 2 + 1))
-        median=$(sort -n "$results_file" | sed -n "${mid}p")
-    fi
-    
-    # Output: mean median min max stddev
-    echo "$stats" | awk -v median="$median" '{printf "%.3f %.3f %.3f %.3f %.3f", $1, median, $2, $3, $4}'
+
+  # Calculate median using sort (works with BSD awk)
+  local n=$(echo "$stats" | awk '{print $5}')
+  local median
+  if [ $((n % 2)) -eq 0 ]; then
+    # Even number of values: average of middle two
+    local mid1=$((n / 2))
+    local mid2=$((n / 2 + 1))
+    local val1=$(sort -n "$results_file" | sed -n "${mid1}p")
+    local val2=$(sort -n "$results_file" | sed -n "${mid2}p")
+    median=$(echo "scale=3; ($val1 + $val2) / 2" | bc)
+  else
+    # Odd number of values: middle value
+    local mid=$((n / 2 + 1))
+    median=$(sort -n "$results_file" | sed -n "${mid}p")
+  fi
+
+  # Output: mean median min max stddev
+  echo "$stats" | awk -v median="$median" '{printf "%.3f %.3f %.3f %.3f %.3f", $1, median, $2, $3, $4}'
 }
 
 # Function to generate report
 generate_report() {
-    local compare_results="$1"
-    local current_results="$2"
-    local current_branch="$3"
-    local compare_branch="$4"
-    local compare_commit="$5"
-    local current_commit="$6"
-    local report_file="$7"
-    
-    echo "Generating performance comparison report..."
-    
-    # Calculate statistics for both branches
-    read compare_mean compare_median compare_min compare_max compare_stddev <<< $(calculate_stats "$compare_results")
-    read current_mean current_median current_min current_max current_stddev <<< $(calculate_stats "$current_results")
-    
-    # Calculate percentage difference
-    diff_mean=$(echo "scale=2; (($current_mean - $compare_mean) / $compare_mean) * 100" | bc)
-    diff_median=$(echo "scale=2; (($current_median - $compare_median) / $compare_median) * 100" | bc)
-    
-    # Generate report
-    {
-        echo "=========================================="
-        echo "Performance Benchmark Comparison Report"
-        echo "=========================================="
-        echo ""
-        echo "Model: $MODEL_PATH"
-        echo "Number of runs per branch: $NUM_RUNS"
-        echo "Date: $(date)"
-        echo ""
-        echo "----------------------------------------"
-        echo "Branch: $compare_branch"
-        echo "----------------------------------------"
-        echo "Commit:   ${compare_commit}"
-        echo "Mean:     ${compare_mean} ms"
-        echo "Median:   ${compare_median} ms"
-        echo "Min:      ${compare_min} ms"
-        echo "Max:      ${compare_max} ms"
-        echo "Std Dev:  ${compare_stddev} ms"
-        echo ""
-        echo "----------------------------------------"
-        echo "Branch: $current_branch"
-        echo "----------------------------------------"
-        echo "Commit:   ${current_commit}"
-        echo "Mean:     ${current_mean} ms"
-        echo "Median:   ${current_median} ms"
-        echo "Min:      ${current_min} ms"
-        echo "Max:      ${current_max} ms"
-        echo "Std Dev:  ${current_stddev} ms"
-        echo ""
-        echo "----------------------------------------"
-        echo "Comparison"
-        echo "----------------------------------------"
-        if (( $(echo "$diff_mean > 0" | bc -l) )); then
-            echo "Mean:     ${current_branch} is ${diff_mean}% SLOWER than ${compare_branch}"
-        else
-            echo "Mean:     ${current_branch} is ${diff_mean#-}% FASTER than ${compare_branch}"
-        fi
-        if (( $(echo "$diff_median > 0" | bc -l) )); then
-            echo "Median:   ${current_branch} is ${diff_median}% SLOWER than ${compare_branch}"
-        else
-            echo "Median:   ${current_branch} is ${diff_median#-}% FASTER than ${compare_branch}"
-        fi
-        echo ""
-        echo "Raw Results ($compare_branch):"
-        cat "$compare_results" | awk '{printf "  %.3f ms\n", $1}'
-        echo ""
-        echo "Raw Results ($current_branch):"
-        cat "$current_results" | awk '{printf "  %.3f ms\n", $1}'
-    } > "$report_file"
-    
-    echo -e "${GREEN}Report written to: $report_file${NC}"
+  local compare_results="$1"
+  local current_results="$2"
+  local current_branch="$3"
+  local compare_branch="$4"
+  local compare_commit="$5"
+  local current_commit="$6"
+  local report_file="$7"
+
+  echo "Generating performance comparison report..."
+
+  # Calculate statistics for both branches
+  read compare_mean compare_median compare_min compare_max compare_stddev <<<$(calculate_stats "$compare_results")
+  read current_mean current_median current_min current_max current_stddev <<<$(calculate_stats "$current_results")
+
+  # Calculate percentage difference
+  diff_mean=$(echo "scale=2; (($current_mean - $compare_mean) / $compare_mean) * 100" | bc)
+  diff_median=$(echo "scale=2; (($current_median - $compare_median) / $compare_median) * 100" | bc)
+
+  # Generate report
+  {
+    echo "=========================================="
+    echo "Performance Benchmark Comparison Report"
+    echo "=========================================="
+    echo ""
+    echo "Model: $MODEL_PATH"
+    echo "Number of runs per branch: $NUM_RUNS"
+    echo "Date: $(date)"
+    echo ""
+    echo "----------------------------------------"
+    echo "Branch: $compare_branch"
+    echo "----------------------------------------"
+    echo "Commit:   ${compare_commit}"
+    echo "Mean:     ${compare_mean} ms"
+    echo "Median:   ${compare_median} ms"
+    echo "Min:      ${compare_min} ms"
+    echo "Max:      ${compare_max} ms"
+    echo "Std Dev:  ${compare_stddev} ms"
+    echo ""
+    echo "----------------------------------------"
+    echo "Branch: $current_branch"
+    echo "----------------------------------------"
+    echo "Commit:   ${current_commit}"
+    echo "Mean:     ${current_mean} ms"
+    echo "Median:   ${current_median} ms"
+    echo "Min:      ${current_min} ms"
+    echo "Max:      ${current_max} ms"
+    echo "Std Dev:  ${current_stddev} ms"
+    echo ""
+    echo "----------------------------------------"
+    echo "Comparison"
+    echo "----------------------------------------"
+    if (($(echo "$diff_mean > 0" | bc -l))); then
+      echo "Mean:     ${current_branch} is ${diff_mean}% SLOWER than ${compare_branch}"
+    else
+      echo "Mean:     ${current_branch} is ${diff_mean#-}% FASTER than ${compare_branch}"
+    fi
+    if (($(echo "$diff_median > 0" | bc -l))); then
+      echo "Median:   ${current_branch} is ${diff_median}% SLOWER than ${compare_branch}"
+    else
+      echo "Median:   ${current_branch} is ${diff_median#-}% FASTER than ${compare_branch}"
+    fi
+    echo ""
+    echo "Raw Results ($compare_branch):"
+    cat "$compare_results" | awk '{printf "  %.3f ms\n", $1}'
     echo ""
-    cat "$report_file"
+    echo "Raw Results ($current_branch):"
+    cat "$current_results" | awk '{printf "  %.3f ms\n", $1}'
+  } >"$report_file"
+
+  echo -e "${GREEN}Report written to: $report_file${NC}"
+  echo ""
+  cat "$report_file"
 }
 
 # Main execution
 main() {
-    # Parse command line arguments
-    while [[ $# -gt 0 ]]; do
-        case $1 in
-            --model)
-                if [ -z "$2" ]; then
-                    echo -e "${RED}Error: --model requires a path argument${NC}"
-                    echo "Use --help for usage information"
-                    exit 1
-                fi
-                MODEL_PATH="$2"
-                shift 2
-                ;;
-            --branch)
-                if [ -z "$2" ]; then
-                    echo -e "${RED}Error: --branch requires a branch name argument${NC}"
-                    echo "Use --help for usage information"
-                    exit 1
-                fi
-                COMPARE_BRANCH="$2"
-                shift 2
-                ;;
-            --help|-h)
-                echo "Usage: $0 [--model MODEL_PATH] [--branch BRANCH_NAME]"
-                echo ""
-                echo "Options:"
-                echo "  --model MODEL_PATH    Path to the model file to benchmark (default: example_models/wavenet_a1_standard.nam)"
-                echo "  --branch BRANCH_NAME  Branch to compare against (default: main)"
-                echo "  --help, -h            Show this help message"
-                exit 0
-                ;;
-            *)
-                echo -e "${RED}Error: Unknown option: $1${NC}"
-                echo "Use --help for usage information"
-                exit 1
-                ;;
-        esac
-    done
-    
-    # Ensure we're in the project root (parent of tools/)
-    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-    PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-    cd "$PROJECT_ROOT"
-    
-    # Verify we're in a git repository
-    if ! git rev-parse --git-dir > /dev/null 2>&1; then
-        echo -e "${RED}Error: Not in a git repository${NC}"
+  # Parse command line arguments
+  while [[ $# -gt 0 ]]; do
+    case $1 in
+    --model)
+      if [ -z "$2" ]; then
+        echo -e "${RED}Error: --model requires a path argument${NC}"
+        echo "Use --help for usage information"
         exit 1
-    fi
-    
-    # Get current branch
-    current_branch=$(git rev-parse --abbrev-ref HEAD)
-    
-    if [ "$current_branch" = "$COMPARE_BRANCH" ]; then
-        echo -e "${RED}Error: Already on $COMPARE_BRANCH branch. Please checkout a different branch first.${NC}"
+      fi
+      MODEL_PATH="$2"
+      shift 2
+      ;;
+    --branch)
+      if [ -z "$2" ]; then
+        echo -e "${RED}Error: --branch requires a branch name argument${NC}"
+        echo "Use --help for usage information"
         exit 1
+      fi
+      COMPARE_BRANCH="$2"
+      shift 2
+      ;;
+    --help | -h)
+      echo "Usage: $0 [--model MODEL_PATH] [--branch BRANCH_NAME]"
+      echo ""
+      echo "Options:"
+      echo "  --model MODEL_PATH    Path to the model file to benchmark (default: example_models/wavenet_a1_standard.nam)"
+      echo "  --branch BRANCH_NAME  Branch to compare against (default: main)"
+      echo "  --help, -h            Show this help message"
+      exit 0
+      ;;
+    *)
+      echo -e "${RED}Error: Unknown option: $1${NC}"
+      echo "Use --help for usage information"
+      exit 1
+      ;;
+    esac
+  done
+
+  # Ensure we're in the project root (parent of tools/)
+  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+  PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+  cd "$PROJECT_ROOT"
+
+  # Verify we're in a git repository
+  if ! git rev-parse --git-dir >/dev/null 2>&1; then
+    echo -e "${RED}Error: Not in a git repository${NC}"
+    exit 1
+  fi
+
+  # Get current branch
+  current_branch=$(git rev-parse --abbrev-ref HEAD)
+
+  if [ "$current_branch" = "$COMPARE_BRANCH" ]; then
+    echo -e "${RED}Error: Already on $COMPARE_BRANCH branch. Please checkout a different branch first.${NC}"
+    exit 1
+  fi
+
+  echo -e "${YELLOW}Current branch: ${current_branch}${NC}"
+  echo -e "${YELLOW}Comparing against: ${COMPARE_BRANCH}${NC}"
+  echo ""
+
+  # Generate timestamped report filename
+  TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
+  REPORT_FILE="benchmark_report_${TIMESTAMP}.txt"
+
+  # Create temporary files for results
+  compare_results=$(mktemp)
+  current_results=$(mktemp)
+
+  # Variables to store commit hashes
+  compare_commit=""
+  current_commit=""
+
+  # Save untracked model file if it exists (to preserve it across branch switches)
+  model_backup=""
+  if [ -f "$MODEL_PATH" ] && ! git ls-files --error-unmatch "$MODEL_PATH" >/dev/null 2>&1; then
+    echo -e "${YELLOW}Preserving untracked model file: $MODEL_PATH${NC}"
+    model_backup=$(mktemp)
+    cp "$MODEL_PATH" "$model_backup"
+  fi
+
+  # Track if we stashed anything
+  stashed=false
+
+  # Cleanup function
+  cleanup() {
+    rm -f "$compare_results" "$current_results"
+    # Restore original branch if we're not on it
+    if [ -n "$current_branch" ] && [ "$(git rev-parse --abbrev-ref HEAD)" != "$current_branch" ]; then
+      git checkout "$current_branch" >/dev/null 2>&1 || true
     fi
-    
-    echo -e "${YELLOW}Current branch: ${current_branch}${NC}"
-    echo -e "${YELLOW}Comparing against: ${COMPARE_BRANCH}${NC}"
-    echo ""
-    
-    # Generate timestamped report filename
-    TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
-    REPORT_FILE="benchmark_report_${TIMESTAMP}.txt"
-    
-    # Create temporary files for results
-    compare_results=$(mktemp)
-    current_results=$(mktemp)
-    
-    # Variables to store commit hashes
-    compare_commit=""
-    current_commit=""
-    
-    # Save untracked model file if it exists (to preserve it across branch switches)
-    model_backup=""
-    if [ -f "$MODEL_PATH" ] && ! git ls-files --error-unmatch "$MODEL_PATH" > /dev/null 2>&1; then
-        echo -e "${YELLOW}Preserving untracked model file: $MODEL_PATH${NC}"
-        model_backup=$(mktemp)
-        cp "$MODEL_PATH" "$model_backup"
-    fi
-    
-    # Track if we stashed anything
-    stashed=false
-    
-    # Cleanup function
-    cleanup() {
-        rm -f "$compare_results" "$current_results"
-        # Restore original branch if we're not on it
-        if [ -n "$current_branch" ] && [ "$(git rev-parse --abbrev-ref HEAD)" != "$current_branch" ]; then
-            git checkout "$current_branch" > /dev/null 2>&1 || true
-        fi
-        # Restore stashed changes if we stashed anything
-        if [ "$stashed" = true ]; then
-            git stash pop > /dev/null 2>&1 || true
-        fi
-        # Restore untracked model file if we backed it up
-        if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then
-            mkdir -p "$(dirname "$MODEL_PATH")"
-            cp "$model_backup" "$MODEL_PATH"
-            rm -f "$model_backup"
-            echo -e "${GREEN}Restored untracked model file: $MODEL_PATH${NC}"
-        fi
-    }
-    trap cleanup EXIT
-    
-    # Test comparison branch
-    echo -e "${YELLOW}=== Testing ${COMPARE_BRANCH} branch ===${NC}"
-    # Stash any uncommitted changes (only if there are any)
-    if ! git diff-index --quiet HEAD -- 2>/dev/null || ! git diff-index --quiet --cached HEAD -- 2>/dev/null; then
-        git stash push -m "benchmark_compare.sh temporary stash" > /dev/null 2>&1
-        stashed=true
-    fi
-    # Restore model file to comparison branch if we backed it up (so it's available for benchmarking)
-    if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then
-        mkdir -p "$(dirname "$MODEL_PATH")"
-        cp "$model_backup" "$MODEL_PATH"
+    # Restore stashed changes if we stashed anything
+    if [ "$stashed" = true ]; then
+      git stash pop >/dev/null 2>&1 || true
     fi
-    # Use --force to allow overwriting untracked files if needed
-    git checkout "$COMPARE_BRANCH" --force 2>/dev/null || git checkout "$COMPARE_BRANCH"
-    compare_commit=$(git rev-parse HEAD)
-    echo "Commit: ${compare_commit}"
-    run_benchmark "$COMPARE_BRANCH" "$compare_results"
-    
-    # Test current branch
-    echo -e "${YELLOW}=== Testing ${current_branch} branch ===${NC}"
-    git checkout "$current_branch" --force 2>/dev/null || git checkout "$current_branch"
-    # Restore model file if we backed it up
+    # Restore untracked model file if we backed it up
     if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then
-        mkdir -p "$(dirname "$MODEL_PATH")"
-        cp "$model_backup" "$MODEL_PATH"
+      mkdir -p "$(dirname "$MODEL_PATH")"
+      cp "$model_backup" "$MODEL_PATH"
+      rm -f "$model_backup"
+      echo -e "${GREEN}Restored untracked model file: $MODEL_PATH${NC}"
     fi
-    if [ "$stashed" = true ]; then
-        git stash pop > /dev/null 2>&1 || true
-        stashed=false
-    fi
-    current_commit=$(git rev-parse HEAD)
-    echo "Commit: ${current_commit}"
-    run_benchmark "$current_branch" "$current_results"
-    
-    # Generate report
-    generate_report "$compare_results" "$current_results" "$current_branch" "$COMPARE_BRANCH" "$compare_commit" "$current_commit" "$REPORT_FILE"
-    
-    echo -e "${GREEN}Benchmark comparison complete!${NC}"
+  }
+  trap cleanup EXIT
+
+  # Test comparison branch
+  echo -e "${YELLOW}=== Testing ${COMPARE_BRANCH} branch ===${NC}"
+  # Stash any uncommitted changes (only if there are any)
+  if ! git diff-index --quiet HEAD -- 2>/dev/null || ! git diff-index --quiet --cached HEAD -- 2>/dev/null; then
+    git stash push -m "benchmark_compare.sh temporary stash" >/dev/null 2>&1
+    stashed=true
+  fi
+  # Restore model file to comparison branch if we backed it up (so it's available for benchmarking)
+  if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then
+    mkdir -p "$(dirname "$MODEL_PATH")"
+    cp "$model_backup" "$MODEL_PATH"
+  fi
+  # Use --force to allow overwriting untracked files if needed
+  git checkout "$COMPARE_BRANCH" --force 2>/dev/null || git checkout "$COMPARE_BRANCH"
+  compare_commit=$(git rev-parse HEAD)
+  echo "Commit: ${compare_commit}"
+  run_benchmark "$COMPARE_BRANCH" "$compare_results"
+
+  # Test current branch
+  echo -e "${YELLOW}=== Testing ${current_branch} branch ===${NC}"
+  git checkout "$current_branch" --force 2>/dev/null || git checkout "$current_branch"
+  # Restore model file if we backed it up
+  if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then
+    mkdir -p "$(dirname "$MODEL_PATH")"
+    cp "$model_backup" "$MODEL_PATH"
+  fi
+  if [ "$stashed" = true ]; then
+    git stash pop >/dev/null 2>&1 || true
+    stashed=false
+  fi
+  current_commit=$(git rev-parse HEAD)
+  echo "Commit: ${current_commit}"
+  run_benchmark "$current_branch" "$current_results"
+
+  # Generate report
+  generate_report "$compare_results" "$current_results" "$current_branch" "$COMPARE_BRANCH" "$compare_commit" "$current_commit" "$REPORT_FILE"
+
+  echo -e "${GREEN}Benchmark comparison complete!${NC}"
 }
 
 # Run main function with all command line arguments
diff --git a/tools/benchmark_convolution.cpp b/tools/benchmark_convolution.cpp
index 6501c55..4acbc1d 100644
--- a/tools/benchmark_convolution.cpp
+++ b/tools/benchmark_convolution.cpp
@@ -22,8 +22,8 @@ constexpr int NUM_WARMUP_ITERATIONS = 10;
 constexpr int NUM_BENCHMARK_ITERATIONS = 100;
 
 // Benchmark configurations
-constexpr int CHANNELS[] = {8, 16, 32, 64, 128};
-constexpr int GROUPS[] = {1, 2, 4, 8, 16};
+constexpr int CHANNELS[] = {2, 3, 4, 5, 6, 7, 8};
+constexpr int GROUPS[] = {1, 2, 3, 4, 5, 6, 7, 8};
 constexpr int FRAMES[] = {64, 256, 1024};
 constexpr int KERNEL_SIZES[] = {1, 3}; // For Conv1D
 

From acc028f1d6aceda24d496651b470edcf2c68fc86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Thu, 29 Jan 2026 13:12:13 -0800
Subject: [PATCH 3/4] Implementation with all dimensions fixed.

---
 NAM/conv1d_factory.cpp               |  53 +++
 NAM/conv1d_factory.h                 |  33 ++
 NAM/conv1d_fixed.h                   | 290 +++++++++++++++
 NAM/conv1x1_factory.cpp              |  51 +++
 NAM/conv1x1_factory.h                |  30 ++
 NAM/conv1x1_fixed.h                  | 272 ++++++++++++++
 tools/CMakeLists.txt                 |   8 +-
 tools/benchmark_convolution.cpp      | 160 +++++++--
 tools/benchmark_fully_fixed_conv.cpp | 282 +++++++++++++++
 tools/run_tests.cpp                  |  48 +++
 tools/test/test_conv1d_fixed.cpp     | 512 +++++++++++++++++++++++++++
 tools/test/test_conv1x1_fixed.cpp    | 323 +++++++++++++++++
 12 files changed, 2038 insertions(+), 24 deletions(-)
 create mode 100644 NAM/conv1d_factory.cpp
 create mode 100644 NAM/conv1d_factory.h
 create mode 100644 NAM/conv1d_fixed.h
 create mode 100644 NAM/conv1x1_factory.cpp
 create mode 100644 NAM/conv1x1_factory.h
 create mode 100644 NAM/conv1x1_fixed.h
 create mode 100644 tools/benchmark_fully_fixed_conv.cpp
 create mode 100644 tools/test/test_conv1d_fixed.cpp
 create mode 100644 tools/test/test_conv1x1_fixed.cpp

diff --git a/NAM/conv1d_factory.cpp b/NAM/conv1d_factory.cpp
new file mode 100644
index 0000000..f1512e1
--- /dev/null
+++ b/NAM/conv1d_factory.cpp
@@ -0,0 +1,53 @@
+// Conv1D Factory implementation
+// Returns dynamic Conv1D wrapped in IConv1D interface
+
+#include "conv1d_factory.h"
+#include "conv1d.h"
+
+namespace nam
+{
+
+/// \brief Dynamic wrapper for Conv1D implementing IConv1D interface
+///
+/// Owns a runtime-sized Conv1D and forwards every IConv1D call to it, for configurations without a
+/// specialized template instantiation. Note the argument order: set_size_ is passed bias before dilation.
+class Conv1DDynamicWrapper : public IConv1D
+{
+public:
+  Conv1DDynamicWrapper(int in_channels, int out_channels, int kernel_size, int dilation, bool bias, int groups)
+  {
+    _conv.set_size_(in_channels, out_channels, kernel_size, bias, dilation, groups);
+  }
+
+  Eigen::MatrixXf& GetOutput() override { return _conv.GetOutput(); }
+
+  const Eigen::MatrixXf& GetOutput() const override { return _conv.GetOutput(); }
+
+  void SetMaxBufferSize(int maxBufferSize) override { _conv.SetMaxBufferSize(maxBufferSize); }
+
+  void set_weights_(std::vector<float>::iterator& weights) override { _conv.set_weights_(weights); }
+
+  void Process(const Eigen::MatrixXf& input, int num_frames) override { _conv.Process(input, num_frames); }
+
+  long get_out_channels() const override { return _conv.get_out_channels(); }
+
+  long get_in_channels() const override { return _conv.get_in_channels(); }
+
+  long get_kernel_size() const override { return _conv.get_kernel_size(); }
+
+  int get_dilation() const override { return _conv.get_dilation(); }
+
+  bool has_bias() const override { return _conv.has_bias(); }
+
+private:
+  Conv1D _conv; // Wrapped implementation: sized in the constructor, weights loaded later via set_weights_
+};
+
+// Always builds the dynamic wrapper; no fixed-size template implementation is selected here
+std::unique_ptr<IConv1D> Conv1DFactory::create(int in_channels, int out_channels, int kernel_size, int dilation,
+                                               bool bias, int groups)
+{
+  return std::make_unique<Conv1DDynamicWrapper>(in_channels, out_channels, kernel_size, dilation, bias, groups);
+}
+
+} // namespace nam
diff --git a/NAM/conv1d_factory.h b/NAM/conv1d_factory.h
new file mode 100644
index 0000000..3cfa8af
--- /dev/null
+++ b/NAM/conv1d_factory.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <memory>
+#include "conv1d_fixed.h"
+
+namespace nam
+{
+
+/// \brief Factory for creating Conv1D implementations
+///
+/// Returns a dynamic Conv1D implementation wrapped in the IConv1D interface.
+/// For fully optimized implementations with compile-time known buffer sizes,
+/// use Conv1DFullyFixed directly.
+class Conv1DFactory
+{
+public:
+  /// \brief Create a Conv1D implementation
+  ///
+  /// Returns a dynamic implementation. For maximum performance with known
+  /// buffer sizes, use Conv1DFullyFixed template directly.
+  ///
+  /// \param in_channels Number of input channels
+  /// \param out_channels Number of output channels
+  /// \param kernel_size Size of the convolution kernel
+  /// \param dilation Dilation factor for the convolution
+  /// \param bias Whether to use bias
+  /// \param groups Number of groups for grouped convolution (default: 1)
+  /// \return Owning pointer to a newly created IConv1D; weights still have to be loaded via set_weights_
+  static std::unique_ptr<IConv1D> create(int in_channels, int out_channels, int kernel_size, int dilation, bool bias,
+                                         int groups = 1);
+};
+
+} // namespace nam
diff --git a/NAM/conv1d_fixed.h b/NAM/conv1d_fixed.h
new file mode 100644
index 0000000..48ddb39
--- /dev/null
+++ b/NAM/conv1d_fixed.h
@@ -0,0 +1,290 @@
+#pragma once
+
+#include <Eigen/Dense>
+#include <array>
+#include <cassert>
+#include <memory>
+#include <vector>
+
+namespace nam
+{
+
+/// \brief Type-erased interface for Conv1D implementations
+///
+/// This interface allows runtime polymorphism while enabling compile-time
+/// optimized implementations via templates. All Conv1D variants (fixed-size
+/// and dynamic) implement this interface.
+class IConv1D
+{
+public:
+  virtual ~IConv1D() = default;
+
+  /// \brief Get the entire internal output buffer
+  /// \return Reference to the output buffer; results of Process() occupy its leftmost num_frames columns
+  virtual Eigen::MatrixXf& GetOutput() = 0;
+
+  /// \brief Get the entire internal output buffer (const version)
+  /// \return Const reference to the output buffer
+  virtual const Eigen::MatrixXf& GetOutput() const = 0;
+
+  /// \brief Prepare for calls of up to maxBufferSize frames and reset the stored input history
+  /// \param maxBufferSize Maximum number of frames to process in a single call
+  virtual void SetMaxBufferSize(int maxBufferSize) = 0;
+
+  /// \brief Set the parameters (weights) of this module
+  /// \param weights Iterator into the weights vector, taken by reference and advanced as weights are consumed.
+  virtual void set_weights_(std::vector<float>::iterator& weights) = 0;
+
+  /// \brief Process input and store output to pre-allocated buffer
+  /// \param input Input matrix (in_channels rows; the leftmost num_frames columns are read)
+  /// \param num_frames Number of frames to process
+  virtual void Process(const Eigen::MatrixXf& input, int num_frames) = 0;
+
+  /// \brief Get the number of output channels
+  /// \return Number of output channels
+  virtual long get_out_channels() const = 0;
+
+  /// \brief Get the number of input channels
+  /// \return Number of input channels
+  virtual long get_in_channels() const = 0;
+
+  /// \brief Get the kernel size
+  /// \return Kernel size
+  virtual long get_kernel_size() const = 0;
+
+  /// \brief Get the dilation factor
+  /// \return Dilation factor
+  virtual int get_dilation() const = 0;
+
+  /// \brief Check if bias is used
+  /// \return true if bias is present, false otherwise
+  virtual bool has_bias() const = 0;
+};
+
+/// \brief Fully compile-time optimized Conv1D with fixed dimensions AND buffer size
+///
+/// This implementation uses fixed-size Eigen matrices for weights, input, and output,
+/// enabling the compiler to fully unroll and vectorize all operations. Only the dilation is a
+/// runtime value; the input history buffer is sized for dilations up to 16 (see MaxReceptiveField).
+///
+/// \tparam OutChannels Number of output channels
+/// \tparam InChannels Number of input channels
+/// \tparam KernelSize Size of the convolution kernel
+/// \tparam MaxFrames Maximum buffer size (e.g., 32, 64, 128, 256, 512)
+/// \tparam Groups Number of groups for grouped convolution
+/// \tparam HasBias Whether to use bias
+template <int OutChannels, int InChannels, int KernelSize, int MaxFrames, int Groups = 1, bool HasBias = true>
+class Conv1DFullyFixed : public IConv1D
+{
+public:
+  static_assert(OutChannels > 0, "OutChannels must be positive");
+  static_assert(InChannels > 0, "InChannels must be positive");
+  static_assert(KernelSize > 0, "KernelSize must be positive");
+  static_assert(MaxFrames > 0, "MaxFrames must be positive");
+  static_assert(Groups > 0, "Groups must be positive");
+  static_assert(OutChannels % Groups == 0, "OutChannels must be divisible by Groups");
+  static_assert(InChannels % Groups == 0, "InChannels must be divisible by Groups");
+
+  // Derived constants
+  static constexpr int OutPerGroup = OutChannels / Groups;
+  static constexpr int InPerGroup = InChannels / Groups;
+
+  // Fully fixed-size types for maximum optimization
+  using WeightMatrix = Eigen::Matrix<float, OutChannels, InChannels>;
+  using BiasVector = Eigen::Matrix<float, OutChannels, 1>;
+  using InputBuffer = Eigen::Matrix<float, InChannels, MaxFrames>;
+  using OutputBuffer = Eigen::Matrix<float, OutChannels, MaxFrames>;
+
+  /// \param dilation Dilation factor; requires dilation >= 1 and (KernelSize - 1) * dilation <= MaxReceptiveField
+  Conv1DFullyFixed(int dilation = 1)
+  : _dilation(dilation)
+  {
+    assert(dilation >= 1 && (KernelSize - 1) * dilation <= MaxReceptiveField && "dilation out of range");
+    // Initialize weights to zero (critical for block-diagonal structure)
+    for (int k = 0; k < KernelSize; k++)
+    {
+      _weight[k].setZero();
+    }
+
+    if constexpr (HasBias)
+    {
+      _bias.setZero();
+    }
+
+    _output_fixed.setZero();
+    _output_dynamic.resize(OutChannels, MaxFrames);
+    _output_dynamic.setZero();
+
+    _input_contiguous.setZero(); // History region starts out as all zeros
+  }
+
+  Eigen::MatrixXf& GetOutput() override { return _output_dynamic; }
+
+  const Eigen::MatrixXf& GetOutput() const override { return _output_dynamic; }
+
+  void SetMaxBufferSize(int maxBufferSize) override
+  {
+    assert(maxBufferSize <= MaxFrames && "Buffer size exceeds MaxFrames template parameter");
+    (void)maxBufferSize; // Only read by the assert above; avoids an unused-parameter warning under NDEBUG
+    _input_contiguous.setZero(); // Storage is fixed-size, so a reset only has to zero out the history
+  }
+
+  void set_weights_(std::vector<float>::iterator& weights) override
+  {
+    // Flattened order is [group][out-in-group][in-in-group][kernel tap], kernel tap varying fastest,
+    // followed by the bias. Crazy ordering because that's how it gets flattened in PyTorch
+    for (int g = 0; g < Groups; g++)
+    {
+      for (int i = 0; i < OutPerGroup; i++)
+      {
+        for (int j = 0; j < InPerGroup; j++)
+        {
+          for (int k = 0; k < KernelSize; k++)
+          {
+            _weight[k](g * OutPerGroup + i, g * InPerGroup + j) = *(weights++);
+          }
+        }
+      }
+    }
+
+    if constexpr (HasBias)
+    {
+      for (int i = 0; i < OutChannels; i++)
+      {
+        _bias(i) = *(weights++);
+      }
+    }
+  }
+
+  void Process(const Eigen::MatrixXf& input, int num_frames) override
+  {
+    assert(num_frames <= MaxFrames);
+
+    // Calculate receptive field for this dilation
+    const int receptive_field = (KernelSize - 1) * _dilation;
+
+    // Buffer layout: [history (receptive_field cols) | new_input (num_frames cols)]
+    // History is always stored at leftCols(receptive_field) between calls
+
+    // Copy new input after history region
+    _input_contiguous.middleCols(receptive_field, num_frames) = input.leftCols(num_frames);
+
+    // Zero output before accumulation
+    _output_fixed.leftCols(num_frames).setZero();
+
+    // Process kernel positions using block operations
+    if constexpr (Groups == 1)
+    {
+      // Non-grouped: use efficient block operations
+      process_kernel_block_impl(std::make_integer_sequence<int, KernelSize>{}, num_frames, receptive_field);
+    }
+    else
+    {
+      // Grouped: process per-group (still uses block operations per group)
+      process_kernel_grouped_impl(std::make_integer_sequence<int, KernelSize>{}, num_frames, receptive_field);
+    }
+
+    // Add bias if present
+    if constexpr (HasBias)
+    {
+      _output_fixed.leftCols(num_frames).colwise() += _bias;
+    }
+
+    // Copy to dynamic output for interface compatibility
+    _output_dynamic.leftCols(num_frames) = _output_fixed.leftCols(num_frames);
+
+    // Save history for next call: copy the last receptive_field frames to the beginning
+    // This prepares the buffer for the next Process() call
+    if (receptive_field > 0)
+    {
+      if (num_frames >= receptive_field)
+      {
+        // Take history from end of current input
+        _input_contiguous.leftCols(receptive_field) = input.middleCols(num_frames - receptive_field, receptive_field);
+      }
+      else
+      {
+        // Not enough new frames - combine old history with new input
+        const int old_history_needed = receptive_field - num_frames;
+        // Shift old history left column by column; ascending order keeps the overlapping copy well-defined
+        for (int c = 0; c < old_history_needed; c++)
+          _input_contiguous.col(c) = _input_contiguous.col(c + num_frames);
+        // Append new input as recent history
+        _input_contiguous.middleCols(old_history_needed, num_frames) = input.leftCols(num_frames);
+      }
+    }
+  }
+
+  long get_out_channels() const override { return OutChannels; }
+
+  long get_in_channels() const override { return InChannels; }
+
+  long get_kernel_size() const override { return KernelSize; }
+
+  int get_dilation() const override { return _dilation; }
+
+  bool has_bias() const override { return HasBias; }
+
+  /// \brief Get the maximum buffer size this implementation supports
+  static constexpr int GetMaxFrames() { return MaxFrames; }
+
+private:
+  std::array<WeightMatrix, KernelSize> _weight;
+  BiasVector _bias;
+  OutputBuffer _output_fixed;
+  Eigen::MatrixXf _output_dynamic; // For interface compatibility
+
+  // Contiguous buffer for efficient block operations: [history | current_input]
+  // Size: InChannels x (MaxReceptiveField + MaxFrames); only receptive_field history columns are in use
+  static constexpr int MaxReceptiveField = (KernelSize - 1) * 16; // Support up to dilation=16
+  static constexpr int ContiguousBufferSize = MaxReceptiveField + MaxFrames;
+  Eigen::Matrix<float, InChannels, ContiguousBufferSize> _input_contiguous;
+  int _dilation;
+
+  // Helper to unroll kernel processing using block operations (non-grouped)
+  template <int... Ks>
+  void process_kernel_block_impl(std::integer_sequence<int, Ks...>, int num_frames, int receptive_field)
+  {
+    (process_single_kernel_block<Ks>(num_frames, receptive_field), ...);
+  }
+
+  template <int K>
+  void process_single_kernel_block(int num_frames, int receptive_field)
+  {
+    // Calculate offset for this kernel position
+    // For causal conv: output[t] = sum over taps k of weight[k] * input[t - dilation*(KernelSize-1-k)]
+    const int offset = _dilation * (KernelSize - 1 - K);
+
+    // Source position in contiguous buffer
+    const int src_start = receptive_field - offset;
+
+    // Use block operation for efficient matmul
+    _output_fixed.leftCols(num_frames).noalias() +=
+      _weight[K] * _input_contiguous.middleCols(src_start, num_frames);
+  }
+
+  // Helper to unroll kernel processing for grouped convolution
+  template <int... Ks>
+  void process_kernel_grouped_impl(std::integer_sequence<int, Ks...>, int num_frames, int receptive_field)
+  {
+    (process_single_kernel_grouped<Ks>(num_frames, receptive_field), ...);
+  }
+
+  template <int K>
+  void process_single_kernel_grouped(int num_frames, int receptive_field)
+  {
+    const int offset = _dilation * (KernelSize - 1 - K);
+    const int src_start = receptive_field - offset;
+
+    // Process each group
+    for (int g = 0; g < Groups; g++)
+    {
+      auto input_group = _input_contiguous.template middleRows<InPerGroup>(g * InPerGroup).middleCols(src_start, num_frames);
+      auto weight_group = _weight[K].template block<OutPerGroup, InPerGroup>(g * OutPerGroup, g * InPerGroup);
+      _output_fixed.template middleRows<OutPerGroup>(g * OutPerGroup).leftCols(num_frames).noalias() +=
+        weight_group * input_group;
+    }
+  }
+};
+
+} // namespace nam
diff --git a/NAM/conv1x1_factory.cpp b/NAM/conv1x1_factory.cpp
new file mode 100644
index 0000000..bcad04c
--- /dev/null
+++ b/NAM/conv1x1_factory.cpp
@@ -0,0 +1,51 @@
+// Conv1x1 Factory implementation
+// Returns dynamic Conv1x1 wrapped in IConv1x1 interface
+
+#include "conv1x1_factory.h"
+#include "dsp.h"
+
+namespace nam
+{
+
+/// \brief Adapter exposing the runtime-sized Conv1x1 through IConv1x1; every call forwards to the wrapped layer
+class Conv1x1Dynamic final : public IConv1x1 // leaf class local to this file, not meant to be derived from
+{
+public:
+  Conv1x1Dynamic(int in_channels, int out_channels, bool bias, int groups)
+  : _conv(in_channels, out_channels, bias, groups)
+  {
+  }
+
+  Eigen::MatrixXf& GetOutput() override { return _conv.GetOutput(); }
+
+  const Eigen::MatrixXf& GetOutput() const override { return _conv.GetOutput(); }
+
+  void SetMaxBufferSize(int maxBufferSize) override { _conv.SetMaxBufferSize(maxBufferSize); }
+
+  void set_weights_(std::vector<float>::iterator& weights) override { _conv.set_weights_(weights); }
+
+  void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, int num_frames) override
+  {
+    _conv.process_(input, num_frames); // result is read back through GetOutput()
+  }
+
+  Eigen::MatrixXf process(const Eigen::MatrixXf& input, int num_frames) const override
+  {
+    return _conv.process(input, num_frames); // returns a fresh matrix by value
+  }
+
+  long get_out_channels() const override { return _conv.get_out_channels(); }
+
+  long get_in_channels() const override { return _conv.get_in_channels(); }
+
+private:
+  Conv1x1 _conv; // the wrapped dynamic layer; owns weights and the output buffer
+};
+
+// Always builds the runtime-sized Conv1x1Dynamic wrapper; no fixed-size specialization is selected here
+std::unique_ptr<IConv1x1> Conv1x1Factory::create(int in_channels, int out_channels, bool bias, int groups)
+{
+  return std::make_unique<Conv1x1Dynamic>(in_channels, out_channels, bias, groups);
+}
+
+} // namespace nam
diff --git a/NAM/conv1x1_factory.h b/NAM/conv1x1_factory.h
new file mode 100644
index 0000000..c4fd342
--- /dev/null
+++ b/NAM/conv1x1_factory.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <memory>
+#include "conv1x1_fixed.h"
+
+namespace nam
+{
+
+/// \brief Factory for creating Conv1x1 implementations
+///
+/// Returns a dynamic Conv1x1 implementation wrapped in the IConv1x1 interface.
+/// For fully optimized implementations with compile-time known buffer sizes,
+/// use Conv1x1FullyFixed directly.
+class Conv1x1Factory
+{
+public:
+  /// \brief Create a Conv1x1 implementation
+  ///
+  /// Returns a dynamic implementation. For maximum performance with known
+  /// buffer sizes, use Conv1x1FullyFixed template directly.
+  ///
+  /// \param in_channels Number of input channels
+  /// \param out_channels Number of output channels
+  /// \param bias Whether the layer has a bias term
+  /// \param groups Number of groups for grouped convolution (default: 1)
+  /// \return Owning pointer to a newly created IConv1x1 implementation (currently always the dynamic wrapper)
+  static std::unique_ptr<IConv1x1> create(int in_channels, int out_channels, bool bias, int groups = 1);
+};
+
+} // namespace nam
diff --git a/NAM/conv1x1_fixed.h b/NAM/conv1x1_fixed.h
new file mode 100644
index 0000000..a122c87
--- /dev/null
+++ b/NAM/conv1x1_fixed.h
@@ -0,0 +1,272 @@
+#pragma once
+
+#include <Eigen/Dense>
+#include <cassert>
+#include <memory>
+#include <vector>
+
+namespace nam
+{
+
+/// \brief Type-erased interface for Conv1x1 implementations
+///
+/// This interface allows runtime polymorphism while enabling compile-time
+/// optimized implementations via templates. All Conv1x1 variants (fixed-size
+/// and dynamic) implement this interface.
+class IConv1x1
+{
+public:
+  virtual ~IConv1x1() = default;
+
+  /// \brief Get the entire internal output buffer
+  ///
+  /// This is intended for internal wiring between layers/arrays; callers should treat
+  /// the buffer as pre-allocated storage and only consider the first num_frames columns
+  /// valid for a given processing call. Slice with .leftCols(num_frames) as needed.
+  /// \return Reference to the output buffer
+  virtual Eigen::MatrixXf& GetOutput() = 0;
+
+  /// \brief Get the entire internal output buffer (const version)
+  /// \return Const reference to the output buffer
+  virtual const Eigen::MatrixXf& GetOutput() const = 0;
+
+  /// \brief Prepare the implementation to handle up to maxBufferSize frames per call
+  /// \param maxBufferSize Maximum number of frames to process in a single call
+  virtual void SetMaxBufferSize(int maxBufferSize) = 0;
+
+  /// \brief Set the parameters (weights) of this module
+  /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed.
+  virtual void set_weights_(std::vector<float>::iterator& weights) = 0;
+
+  /// \brief Process input and store output to pre-allocated buffer
+  ///
+  /// Uses Eigen::Ref to accept matrices and block expressions without creating
+  /// temporaries (real-time safe). Access output via GetOutput().
+  /// \param input Input matrix (get_in_channels() rows; the first num_frames columns are read)
+  /// \param num_frames Number of frames to process
+  virtual void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, int num_frames) = 0;
+
+  /// \brief Process input and return output matrix
+  /// \param input Input matrix (get_in_channels() rows; the first num_frames columns are read)
+  /// \param num_frames Number of frames to process
+  /// \return Output matrix (get_out_channels() x num_frames), returned by value
+  virtual Eigen::MatrixXf process(const Eigen::MatrixXf& input, int num_frames) const = 0;
+
+  /// \brief Get the number of output channels
+  /// \return Number of output channels
+  virtual long get_out_channels() const = 0;
+
+  /// \brief Get the number of input channels
+  /// \return Number of input channels
+  virtual long get_in_channels() const = 0;
+};
+
+/// \brief Fully compile-time optimized Conv1x1 with fixed dimensions AND buffer size
+///
+/// This implementation uses fixed-size Eigen matrices for weights, input, and output,
+/// so every matrix size is a compile-time constant. process_() always computes all MaxFrames columns.
+///
+/// Template parameters:
+/// \tparam OutChannels Number of output channels
+/// \tparam InChannels Number of input channels
+/// \tparam MaxFrames Maximum buffer size (e.g., 32, 64, 128, 256, 512)
+/// \tparam Groups Number of groups for grouped convolution (must divide both channel counts)
+/// \tparam HasBias Whether to use bias
+template <int OutChannels, int InChannels, int MaxFrames, int Groups = 1, bool HasBias = true>
+class Conv1x1FullyFixed : public IConv1x1
+{
+public:
+  static_assert(OutChannels > 0, "OutChannels must be positive");
+  static_assert(InChannels > 0, "InChannels must be positive");
+  static_assert(MaxFrames > 0, "MaxFrames must be positive");
+  static_assert(Groups > 0, "Groups must be positive");
+  static_assert(OutChannels % Groups == 0, "OutChannels must be divisible by Groups");
+  static_assert(InChannels % Groups == 0, "InChannels must be divisible by Groups");
+
+  // Fully fixed-size types for maximum optimization
+  using WeightMatrix = Eigen::Matrix<float, OutChannels, InChannels>;
+  using BiasVector = Eigen::Matrix<float, OutChannels, 1>;
+  using InputBuffer = Eigen::Matrix<float, InChannels, MaxFrames>;
+  using OutputBuffer = Eigen::Matrix<float, OutChannels, MaxFrames>;
+
+  Conv1x1FullyFixed()
+  {
+    // Zero all storage: process_() reads every MaxFrames column of _input_fixed, filled or not.
+    _weight.setZero();
+    _bias.setZero();
+    _input_fixed.setZero();
+    _output_fixed.setZero();
+    _output_dynamic.setZero(OutChannels, MaxFrames);
+  }
+
+  Eigen::MatrixXf& GetOutput() override { return _output_dynamic; }
+
+  const Eigen::MatrixXf& GetOutput() const override { return _output_dynamic; }
+
+  void SetMaxBufferSize([[maybe_unused]] int maxBufferSize) override
+  {
+    // Capacity is fixed by the MaxFrames template parameter: any size up to it is accepted
+    assert(maxBufferSize <= MaxFrames && "Buffer size exceeds MaxFrames template parameter");
+    // Output is already sized correctly
+  }
+
+  void set_weights_(std::vector<float>::iterator& weights) override
+  {
+    if constexpr (Groups == 1)
+    {
+      // Non-grouped: simple row-major weight loading
+      for (int i = 0; i < OutChannels; i++)
+      {
+        for (int j = 0; j < InChannels; j++)
+        {
+          _weight(i, j) = *(weights++);
+        }
+      }
+    }
+    else
+    {
+      // Grouped convolution: only the block-diagonal entries are read; the rest keep their zeros
+      constexpr int out_per_group = OutChannels / Groups;
+      constexpr int in_per_group = InChannels / Groups;
+
+      for (int g = 0; g < Groups; g++)
+      {
+        for (int i = 0; i < out_per_group; i++)
+        {
+          for (int j = 0; j < in_per_group; j++)
+          {
+            _weight(g * out_per_group + i, g * in_per_group + j) = *(weights++);
+          }
+        }
+      }
+    }
+
+    if constexpr (HasBias)
+    {
+      for (int i = 0; i < OutChannels; i++)
+      {
+        _bias(i) = *(weights++);
+      }
+    }
+  }
+
+  void process_(const Eigen::Ref<const Eigen::MatrixXf>& input, int num_frames) override
+  {
+    assert(num_frames <= MaxFrames);
+
+    // Copy input to fixed-size buffer; columns past num_frames keep their previous contents
+    _input_fixed.leftCols(num_frames) = input.leftCols(num_frames);
+
+    if constexpr (Groups == 1)
+    {
+      // Single group: fully fixed matrix multiply over all MaxFrames columns
+      _output_fixed.noalias() = _weight * _input_fixed;
+    }
+    else
+    {
+      // Grouped convolution with compile-time unrolled loop
+      constexpr int out_per_group = OutChannels / Groups;
+      constexpr int in_per_group = InChannels / Groups;
+      process_groups_impl<out_per_group, in_per_group>(std::make_integer_sequence<int, Groups>{});
+    }
+
+    // Add bias if present
+    if constexpr (HasBias)
+    {
+      _output_fixed.colwise() += _bias;
+    }
+
+    // Copy back to dynamic output for interface compatibility (only the first num_frames columns)
+    _output_dynamic.leftCols(num_frames) = _output_fixed.leftCols(num_frames);
+  }
+
+  /// \brief Compile-time frame count variant; the result stays in the fixed buffer (GetOutput() is not updated)
+  template <int NumFrames>
+  void process_fixed(const Eigen::Matrix<float, InChannels, NumFrames>& input)
+  {
+    static_assert(NumFrames <= MaxFrames, "NumFrames exceeds MaxFrames");
+
+    if constexpr (Groups == 1)
+    {
+      _output_fixed.template leftCols<NumFrames>().noalias() = _weight * input;
+    }
+    else
+    {
+      // Copy to internal buffer first; the groups are then computed over all MaxFrames columns
+      _input_fixed.template leftCols<NumFrames>() = input;
+      constexpr int out_per_group = OutChannels / Groups;
+      constexpr int in_per_group = InChannels / Groups;
+      process_groups_impl<out_per_group, in_per_group>(std::make_integer_sequence<int, Groups>{});
+    }
+
+    if constexpr (HasBias)
+    {
+      _output_fixed.template leftCols<NumFrames>().colwise() += _bias;
+    }
+  }
+
+  /// \brief Get the first NumFrames output columns as a fixed-size block of the internal buffer
+  template <int NumFrames>
+  auto GetOutputFixed() // deduced: the exact Eigen::Block type returned by leftCols<NumFrames>()
+  {
+    return _output_fixed.template leftCols<NumFrames>();
+  }
+
+  Eigen::MatrixXf process(const Eigen::MatrixXf& input, int num_frames) const override
+  {
+    Eigen::MatrixXf result(OutChannels, num_frames);
+
+    if constexpr (Groups == 1)
+    {
+      result.noalias() = _weight * input.leftCols(num_frames);
+    }
+    else
+    {
+      constexpr int out_per_group = OutChannels / Groups;
+      constexpr int in_per_group = InChannels / Groups;
+      for (int g = 0; g < Groups; g++)
+      {
+        auto input_group = input.middleRows(g * in_per_group, in_per_group).leftCols(num_frames);
+        auto weight_group = _weight.template block<out_per_group, in_per_group>(g * out_per_group, g * in_per_group);
+        result.middleRows(g * out_per_group, out_per_group).noalias() = weight_group * input_group;
+      }
+    }
+
+    if constexpr (HasBias)
+    {
+      result.colwise() += _bias;
+    }
+
+    return result;
+  }
+
+  long get_out_channels() const override { return OutChannels; }
+  long get_in_channels() const override { return InChannels; }
+
+  /// \brief Get the maximum buffer size this implementation supports
+  static constexpr int GetMaxFrames() { return MaxFrames; }
+
+private:
+  WeightMatrix _weight;
+  BiasVector _bias; // stored even when HasBias is false; only read when HasBias is true
+  InputBuffer _input_fixed;
+  OutputBuffer _output_fixed;
+  Eigen::MatrixXf _output_dynamic; // For interface compatibility
+
+  // Helper to unroll group processing at compile time
+  template <int OutPerGroup, int InPerGroup, int... Gs>
+  void process_groups_impl(std::integer_sequence<int, Gs...>)
+  {
+    (process_single_group<Gs, OutPerGroup, InPerGroup>(), ...);
+  }
+
+  template <int G, int OutPerGroup, int InPerGroup>
+  void process_single_group()
+  {
+    auto input_group = _input_fixed.template middleRows<InPerGroup>(G * InPerGroup);
+    auto weight_group = _weight.template block<OutPerGroup, InPerGroup>(G * OutPerGroup, G * InPerGroup);
+    _output_fixed.template middleRows<OutPerGroup>(G * OutPerGroup).noalias() = weight_group * input_group;
+  }
+};
+
+} // namespace nam
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index 22e4db6..d320563 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -1,7 +1,7 @@
 file(GLOB_RECURSE NAM_SOURCES ../NAM/*.cpp ../NAM/*.c ../NAM*.h)
 
 # TODO: add loadmodel and run_tests to TOOLS?
-set(TOOLS benchmodel benchmark_convolution)
+set(TOOLS benchmodel benchmark_convolution benchmark_fully_fixed_conv)
 
 add_custom_target(tools ALL
 	DEPENDS ${TOOLS})
@@ -13,6 +13,7 @@ include_directories(tools ${NAM_DEPS_PATH}/nlohmann)
 add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES})
 add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES})
 add_executable(benchmark_convolution benchmark_convolution.cpp ${NAM_SOURCES})
+add_executable(benchmark_fully_fixed_conv benchmark_fully_fixed_conv.cpp ${NAM_SOURCES})
 add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES})
 # Compile run_tests without optimizations to ensure allocation tracking works correctly
 # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run
@@ -64,4 +65,7 @@ endforeach()
 # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
 # Don't let this break my build on debug:
 set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
-set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
\ No newline at end of file
+set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
+set_source_files_properties(../NAM/conv1x1_factory.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
+set_source_files_properties(../NAM/conv1d_factory.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
+set_source_files_properties(benchmark_fully_fixed_conv.cpp PROPERTIES COMPILE_FLAGS "-Wno-error")
diff --git a/tools/benchmark_convolution.cpp b/tools/benchmark_convolution.cpp
index 4acbc1d..1080419 100644
--- a/tools/benchmark_convolution.cpp
+++ b/tools/benchmark_convolution.cpp
@@ -1,5 +1,6 @@
 // Microbenchmark for Conv1x1 and Conv1D convolution operations
 // Measures performance across various configurations of channels, groups, and frame sizes.
+// Compares directly constructed dynamic layers against the ones created by the Conv1x1/Conv1D factories.
 // Outputs CSV format for analysis.
 
 #include <chrono>
@@ -10,6 +11,8 @@
 #include <vector>
 
 #include "NAM/conv1d.h"
+#include "NAM/conv1d_factory.h"
+#include "NAM/conv1x1_factory.h"
 #include "NAM/dsp.h"
 
 using std::chrono::duration;
@@ -23,9 +26,9 @@ constexpr int NUM_BENCHMARK_ITERATIONS = 100;
 
 // Benchmark configurations
 constexpr int CHANNELS[] = {2, 3, 4, 5, 6, 7, 8};
-constexpr int GROUPS[] = {1, 2, 3, 4, 5, 6, 7, 8};
+constexpr int GROUPS[] = {1, 2, 3, 4};
 constexpr int FRAMES[] = {64, 256, 1024};
-constexpr int KERNEL_SIZES[] = {1, 3}; // For Conv1D
+constexpr int KERNEL_SIZES[] = {3, 4}; // For Conv1D
 
 struct BenchmarkResult
 {
@@ -65,12 +68,9 @@ BenchmarkResult calculate_stats(const std::vector<double>& samples)
   return result;
 }
 
-// Benchmark Conv1x1
-void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng)
+// Benchmark Conv1x1 (dynamic implementation)
+BenchmarkResult benchmark_conv1x1_dynamic(int channels, int groups, int frames, std::mt19937& rng)
 {
-  if (channels % groups != 0)
-    return; // Skip invalid configurations
-
   // Create Conv1x1 layer
   nam::Conv1x1 conv(channels, channels, false, groups);
 
@@ -109,19 +109,56 @@ void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng)
     samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
   }
 
-  BenchmarkResult result = calculate_stats(samples);
-
-  // Output CSV row
-  std::cout << "Conv1x1," << channels << "," << groups << "," << frames << ",1," << std::fixed << std::setprecision(2)
-            << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << "," << result.max_ns << "\n";
+  return calculate_stats(samples);
 }
 
-// Benchmark Conv1D
-void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std::mt19937& rng)
+// Benchmark Conv1x1 as created by Conv1x1Factory (currently always the dynamic Conv1x1 behind IConv1x1)
+BenchmarkResult benchmark_conv1x1_fixed(int channels, int groups, int frames, std::mt19937& rng)
 {
-  if (channels % groups != 0)
-    return; // Skip invalid configurations
+  // Create Conv1x1 layer via factory
+  auto conv = nam::Conv1x1Factory::create(channels, channels, false, groups);
+
+  // Initialize with random weights
+  const int num_weights = (channels / groups) * (channels / groups) * groups;
+  std::vector<float> weights(num_weights);
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+  for (auto& w : weights)
+    w = dist(rng);
+
+  auto it = weights.begin();
+  conv->set_weights_(it);
+  conv->SetMaxBufferSize(frames);
+
+  // Create random input
+  Eigen::MatrixXf input(channels, frames);
+  for (int i = 0; i < channels; i++)
+    for (int j = 0; j < frames; j++)
+      input(i, j) = dist(rng);
 
+  // Warmup
+  for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++)
+  {
+    conv->process_(input, frames);
+  }
+
+  // Benchmark
+  std::vector<double> samples;
+  samples.reserve(NUM_BENCHMARK_ITERATIONS);
+
+  for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++)
+  {
+    auto t1 = high_resolution_clock::now();
+    conv->process_(input, frames);
+    auto t2 = high_resolution_clock::now();
+    samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  return calculate_stats(samples);
+}
+
+// Benchmark Conv1D (dynamic implementation)
+BenchmarkResult benchmark_conv1d_dynamic(int channels, int groups, int frames, int kernel_size, std::mt19937& rng)
+{
   // Create Conv1D layer
   nam::Conv1D conv;
   conv.set_size_(channels, channels, kernel_size, false, 1, groups);
@@ -161,18 +198,97 @@ void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std
     samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
   }
 
-  BenchmarkResult result = calculate_stats(samples);
+  return calculate_stats(samples);
+}
+
+// Benchmark Conv1D (fixed/templated implementation via factory)
+BenchmarkResult benchmark_conv1d_fixed(int channels, int groups, int frames, int kernel_size, std::mt19937& rng)
+{
+  // Create Conv1D layer via factory
+  auto conv = nam::Conv1DFactory::create(channels, channels, kernel_size, 1, false, groups);
+
+  // Initialize with random weights
+  const int num_weights = kernel_size * (channels / groups) * (channels / groups) * groups;
+  std::vector<float> weights(num_weights);
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+  for (auto& w : weights)
+    w = dist(rng);
+
+  auto it = weights.begin();
+  conv->set_weights_(it);
+  conv->SetMaxBufferSize(frames);
+
+  // Create random input
+  Eigen::MatrixXf input(channels, frames);
+  for (int i = 0; i < channels; i++)
+    for (int j = 0; j < frames; j++)
+      input(i, j) = dist(rng);
+
+  // Warmup
+  for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++)
+  {
+    conv->Process(input, frames);
+  }
+
+  // Benchmark
+  std::vector<double> samples;
+  samples.reserve(NUM_BENCHMARK_ITERATIONS);
+
+  for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++)
+  {
+    auto t1 = high_resolution_clock::now();
+    conv->Process(input, frames);
+    auto t2 = high_resolution_clock::now();
+    samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  return calculate_stats(samples);
+}
+
+// Run benchmarks for Conv1x1 and output comparison
+void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng)
+{
+  if (channels % groups != 0)
+    return; // Skip invalid configurations
+
+  BenchmarkResult dynamic_result = benchmark_conv1x1_dynamic(channels, groups, frames, rng);
+  BenchmarkResult fixed_result = benchmark_conv1x1_fixed(channels, groups, frames, rng);
+
+  double speedup = dynamic_result.mean_ns / fixed_result.mean_ns;
+
+  // Output CSV row: type,impl,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns,speedup
+  std::cout << "Conv1x1,dynamic," << channels << "," << groups << "," << frames << ",1," << std::fixed
+            << std::setprecision(2) << dynamic_result.mean_ns << "," << dynamic_result.stddev_ns << ","
+            << dynamic_result.min_ns << "," << dynamic_result.max_ns << ",1.00\n";
+  std::cout << "Conv1x1,fixed," << channels << "," << groups << "," << frames << ",1," << std::fixed
+            << std::setprecision(2) << fixed_result.mean_ns << "," << fixed_result.stddev_ns << ","
+            << fixed_result.min_ns << "," << fixed_result.max_ns << "," << speedup << "\n";
+}
+
+// Run benchmarks for Conv1D and output comparison
+void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std::mt19937& rng)
+{
+  if (channels % groups != 0)
+    return; // Skip invalid configurations
+
+  BenchmarkResult dynamic_result = benchmark_conv1d_dynamic(channels, groups, frames, kernel_size, rng);
+  BenchmarkResult fixed_result = benchmark_conv1d_fixed(channels, groups, frames, kernel_size, rng);
+
+  double speedup = dynamic_result.mean_ns / fixed_result.mean_ns;
 
-  // Output CSV row
-  std::cout << "Conv1D," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed
-            << std::setprecision(2) << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << ","
-            << result.max_ns << "\n";
+  // Output CSV row: type,impl,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns,speedup
+  std::cout << "Conv1D,dynamic," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed
+            << std::setprecision(2) << dynamic_result.mean_ns << "," << dynamic_result.stddev_ns << ","
+            << dynamic_result.min_ns << "," << dynamic_result.max_ns << ",1.00\n";
+  std::cout << "Conv1D,fixed," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed
+            << std::setprecision(2) << fixed_result.mean_ns << "," << fixed_result.stddev_ns << ","
+            << fixed_result.min_ns << "," << fixed_result.max_ns << "," << speedup << "\n";
 }
 
 int main(int argc, char* argv[])
 {
   // Print CSV header
-  std::cout << "type,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns\n";
+  std::cout << "type,impl,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns,speedup\n";
 
   // Use fixed seed for reproducibility
   std::mt19937 rng(42);
diff --git a/tools/benchmark_fully_fixed_conv.cpp b/tools/benchmark_fully_fixed_conv.cpp
new file mode 100644
index 0000000..8921562
--- /dev/null
+++ b/tools/benchmark_fully_fixed_conv.cpp
@@ -0,0 +1,282 @@
+// Benchmark for fully fixed convolution implementations
+// Compares Conv1x1FullyFixed and Conv1DFullyFixed (all dimensions fixed) vs dynamic implementations
+
+#include <chrono>
+#include <cmath>
+#include <iomanip>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#include "NAM/conv1d.h"
+#include "NAM/conv1d_fixed.h"
+#include "NAM/conv1x1_fixed.h"
+#include "NAM/dsp.h"
+
+using std::chrono::duration_cast;
+using std::chrono::high_resolution_clock;
+using std::chrono::nanoseconds;
+
+constexpr int NUM_WARMUP = 100;
+constexpr int NUM_ITERATIONS = 1000;
+
+struct Result
+{
+  double mean_ns; // mean of the per-call timings, in nanoseconds
+  double stddev_ns; // population standard deviation of the per-call timings, in nanoseconds
+};
+
+Result calculate_stats(const std::vector<double>& samples)
+{
+  // Returns the mean and the population standard deviation (divides by N, not N - 1).
+  double total = 0.0;
+  for (std::size_t i = 0; i < samples.size(); ++i)
+    total += samples[i];
+  const double count = static_cast<double>(samples.size());
+  const double mean = total / count;
+
+  double squared_deviations = 0.0;
+  for (std::size_t i = 0; i < samples.size(); ++i)
+    squared_deviations += (samples[i] - mean) * (samples[i] - mean);
+  const double variance = squared_deviations / count;
+  return {mean, std::sqrt(variance)};
+}
+
+// Benchmark Conv1x1FullyFixed vs Conv1x1 (dynamic): same weights and input, prints one CSV-style row
+template <int Channels, int MaxFrames, int Groups, bool HasBias>
+void benchmark_conv1x1_fully_fixed(std::mt19937& rng)
+{
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+
+  // Generate weights: one value per (group, out, in) triple, then Channels biases if HasBias
+  constexpr int in_per_group = Channels / Groups;
+  constexpr int out_per_group = Channels / Groups;
+  std::vector<float> weights;
+  for (int g = 0; g < Groups; g++)
+    for (int i = 0; i < out_per_group; i++)
+      for (int j = 0; j < in_per_group; j++)
+        weights.push_back(dist(rng));
+  if constexpr (HasBias)
+    for (int i = 0; i < Channels; i++)
+      weights.push_back(dist(rng));
+
+  // Create input as a dynamic matrix, the type both process_() interfaces accept
+  Eigen::MatrixXf input_dynamic(Channels, MaxFrames);
+  for (int i = 0; i < Channels; i++)
+    for (int j = 0; j < MaxFrames; j++)
+      input_dynamic(i, j) = dist(rng);
+
+  // Both implementations are fed the same dynamic matrix through process_(), so the fully fixed
+  // timing includes its copy into the fixed-size input buffer and back out to the dynamic output.
+
+  // ========== FULLY FIXED ==========
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto it1 = weights.begin();
+  conv_fixed.set_weights_(it1);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  // Warmup
+  for (int i = 0; i < NUM_WARMUP; i++)
+    conv_fixed.process_(input_dynamic, MaxFrames);
+
+  std::vector<double> fixed_samples;
+  fixed_samples.reserve(NUM_ITERATIONS);
+  for (int i = 0; i < NUM_ITERATIONS; i++)
+  {
+    auto t1 = high_resolution_clock::now();
+    conv_fixed.process_(input_dynamic, MaxFrames);
+    auto t2 = high_resolution_clock::now();
+    fixed_samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  // ========== DYNAMIC ==========
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto it2 = weights.begin();
+  conv_dynamic.set_weights_(it2);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Warmup
+  for (int i = 0; i < NUM_WARMUP; i++)
+    conv_dynamic.process_(input_dynamic, MaxFrames);
+
+  std::vector<double> dynamic_samples;
+  dynamic_samples.reserve(NUM_ITERATIONS);
+  for (int i = 0; i < NUM_ITERATIONS; i++)
+  {
+    auto t1 = high_resolution_clock::now();
+    conv_dynamic.process_(input_dynamic, MaxFrames);
+    auto t2 = high_resolution_clock::now();
+    dynamic_samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  Result fixed_result = calculate_stats(fixed_samples);
+  Result dynamic_result = calculate_stats(dynamic_samples);
+
+  double speedup = dynamic_result.mean_ns / fixed_result.mean_ns; // > 1 means the fully fixed version is faster
+
+  std::cout << "Conv1x1," << Channels << "," << Groups << "," << (HasBias ? "true" : "false") << "," << MaxFrames << ","
+            << std::fixed << std::setprecision(1) << dynamic_result.mean_ns << "," << fixed_result.mean_ns << ","
+            << std::setprecision(2) << speedup << "x\n";
+}
+
+// Benchmark Conv1DFullyFixed vs Conv1D (dynamic): same weights and input, prints one CSV-style row
+template <int Channels, int KernelSize, int MaxFrames, int Groups, bool HasBias>
+void benchmark_conv1d_fully_fixed(std::mt19937& rng)
+{
+  std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
+
+  // Generate weights: KernelSize values per (group, out, in) triple, then Channels biases if HasBias
+  constexpr int in_per_group = Channels / Groups;
+  constexpr int out_per_group = Channels / Groups;
+  std::vector<float> weights;
+  for (int g = 0; g < Groups; g++)
+    for (int i = 0; i < out_per_group; i++)
+      for (int j = 0; j < in_per_group; j++)
+        for (int k = 0; k < KernelSize; k++)
+          weights.push_back(dist(rng));
+  if constexpr (HasBias)
+    for (int i = 0; i < Channels; i++)
+      weights.push_back(dist(rng));
+
+  // Create input (Channels x MaxFrames, shared by both implementations)
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int i = 0; i < Channels; i++)
+    for (int j = 0; j < MaxFrames; j++)
+      input(i, j) = dist(rng);
+
+  const int dilation = 1; // both layers are built with the same dilation
+
+  // ========== FULLY FIXED ==========
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto it1 = weights.begin();
+  conv_fixed.set_weights_(it1);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  // Warmup
+  for (int i = 0; i < NUM_WARMUP; i++)
+    conv_fixed.Process(input, MaxFrames);
+
+  std::vector<double> fixed_samples;
+  fixed_samples.reserve(NUM_ITERATIONS);
+  for (int i = 0; i < NUM_ITERATIONS; i++)
+  {
+    auto t1 = high_resolution_clock::now();
+    conv_fixed.Process(input, MaxFrames);
+    auto t2 = high_resolution_clock::now();
+    fixed_samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  // ========== DYNAMIC ==========
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto it2 = weights.begin();
+  conv_dynamic.set_weights_(it2);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Warmup
+  for (int i = 0; i < NUM_WARMUP; i++)
+    conv_dynamic.Process(input, MaxFrames);
+
+  std::vector<double> dynamic_samples;
+  dynamic_samples.reserve(NUM_ITERATIONS);
+  for (int i = 0; i < NUM_ITERATIONS; i++)
+  {
+    auto t1 = high_resolution_clock::now();
+    conv_dynamic.Process(input, MaxFrames);
+    auto t2 = high_resolution_clock::now();
+    dynamic_samples.push_back(static_cast<double>(duration_cast<nanoseconds>(t2 - t1).count()));
+  }
+
+  Result fixed_result = calculate_stats(fixed_samples);
+  Result dynamic_result = calculate_stats(dynamic_samples);
+
+  double speedup = dynamic_result.mean_ns / fixed_result.mean_ns; // > 1 means the fully fixed version is faster
+
+  std::cout << "Conv1D," << Channels << "," << Groups << "," << KernelSize << "," << (HasBias ? "true" : "false") << ","
+            << MaxFrames << "," << std::fixed << std::setprecision(1) << dynamic_result.mean_ns << ","
+            << fixed_result.mean_ns << "," << std::setprecision(2) << speedup << "x\n";
+}
+
+int main()
+{
+  std::mt19937 rng(42); // fixed seed, shared by every case below
+
+  std::cout << "================================================================================\n";
+  std::cout << "CONV1X1: Fully Fixed (all dimensions) vs Dynamic\n";
+  std::cout << "================================================================================\n";
+  std::cout << "Type,Channels,Groups,Bias,Frames,Dynamic(ns),FullyFixed(ns),Speedup\n";
+
+  // Common audio buffer sizes: 32, 64, 128, 256, 512
+  // Small channels (where fixed-size optimization helps most)
+
+  // 2 channels
+  benchmark_conv1x1_fully_fixed<2, 32, 1, true>(rng); // template args: <Channels, MaxFrames, Groups, HasBias>
+  benchmark_conv1x1_fully_fixed<2, 64, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<2, 128, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<2, 256, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<2, 512, 1, true>(rng);
+
+  // 4 channels
+  benchmark_conv1x1_fully_fixed<4, 32, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 64, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 128, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 256, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 512, 1, true>(rng);
+
+  // 4 channels with 4 groups (grouped convolution)
+  benchmark_conv1x1_fully_fixed<4, 32, 4, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 64, 4, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 128, 4, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 256, 4, true>(rng);
+  benchmark_conv1x1_fully_fixed<4, 512, 4, true>(rng);
+
+  // 8 channels
+  benchmark_conv1x1_fully_fixed<8, 32, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 64, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 128, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 256, 1, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 512, 1, true>(rng);
+
+  // 8 channels with 8 groups
+  benchmark_conv1x1_fully_fixed<8, 32, 8, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 64, 8, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 128, 8, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 256, 8, true>(rng);
+  benchmark_conv1x1_fully_fixed<8, 512, 8, true>(rng);
+
+  std::cout << "\n================================================================================\n";
+  std::cout << "CONV1D: Fully Fixed (all dimensions) vs Dynamic\n";
+  std::cout << "================================================================================\n";
+  std::cout << "Type,Channels,Groups,KernelSize,Bias,Frames,Dynamic(ns),FullyFixed(ns),Speedup\n";
+
+  // Conv1D with kernel size 3 (most common)
+  // 4 channels
+  benchmark_conv1d_fully_fixed<4, 3, 32, 1, true>(rng); // <Channels, KernelSize, MaxFrames, Groups, HasBias>
+  benchmark_conv1d_fully_fixed<4, 3, 64, 1, true>(rng);
+  benchmark_conv1d_fully_fixed<4, 3, 128, 1, true>(rng);
+  benchmark_conv1d_fully_fixed<4, 3, 256, 1, true>(rng);
+  benchmark_conv1d_fully_fixed<4, 3, 512, 1, true>(rng);
+
+  // 4 channels with 4 groups
+  benchmark_conv1d_fully_fixed<4, 3, 32, 4, true>(rng);
+  benchmark_conv1d_fully_fixed<4, 3, 64, 4, true>(rng);
+  benchmark_conv1d_fully_fixed<4, 3, 128, 4, true>(rng);
+  benchmark_conv1d_fully_fixed<4, 3, 256, 4, true>(rng);
+  benchmark_conv1d_fully_fixed<4, 3, 512, 4, true>(rng);
+
+  // 8 channels
+  benchmark_conv1d_fully_fixed<8, 3, 32, 1, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 64, 1, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 128, 1, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 256, 1, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 512, 1, true>(rng);
+
+  // 8 channels with 8 groups
+  benchmark_conv1d_fully_fixed<8, 3, 32, 8, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 64, 8, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 128, 8, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 256, 8, true>(rng);
+  benchmark_conv1d_fully_fixed<8, 3, 512, 8, true>(rng);
+
+  return 0;
+}
diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp
index e1ebbf7..80af8af 100644
--- a/tools/run_tests.cpp
+++ b/tools/run_tests.cpp
@@ -24,6 +24,9 @@
 #include "test/test_input_buffer_verification.cpp"
 #include "test/test_lstm.cpp"
 #include "test/test_wavenet_configurable_gating.cpp"
+#include "test/test_conv1x1_fixed.cpp"
+#include "test/test_conv1d_fixed.cpp"
+#include "test/test_fully_fixed_correctness.cpp"
 
 int main()
 {
@@ -232,6 +235,51 @@ int main()
   // Configurable gating/blending tests
   run_configurable_gating_tests();
 
+  // Conv1x1Fixed tests (templated implementation)
+  test_conv1x1_fixed::test_factory_create();
+  test_conv1x1_fixed::test_factory_create_with_groups();
+  test_conv1x1_fixed::test_numerical_equivalence();
+  test_conv1x1_fixed::test_numerical_equivalence_grouped();
+  test_conv1x1_fixed::test_process_realtime_safe();
+  test_conv1x1_fixed::test_process_grouped_realtime_safe();
+  test_conv1x1_fixed::test_set_max_buffer_size();
+  test_conv1x1_fixed::test_process_multiple_calls();
+  test_conv1x1_fixed::test_no_bias();
+
+  // Conv1DFixed tests (templated implementation)
+  test_conv1d_fixed::test_factory_create();
+  test_conv1d_fixed::test_factory_create_with_groups();
+  test_conv1d_fixed::test_numerical_equivalence();
+  test_conv1d_fixed::test_numerical_equivalence_grouped();
+  test_conv1d_fixed::test_numerical_equivalence_kernel4();
+  test_conv1d_fixed::test_process_realtime_safe();
+  test_conv1d_fixed::test_process_grouped_realtime_safe();
+  test_conv1d_fixed::test_set_max_buffer_size();
+  test_conv1d_fixed::test_process_multiple_calls();
+  test_conv1d_fixed::test_no_bias();
+  test_conv1d_fixed::test_with_dilation();
+
+  // Fully fixed correctness tests (Conv1x1FullyFixed vs Conv1x1)
+  test_fully_fixed_correctness::test_conv1x1_fully_fixed_2ch_32frames();
+  test_fully_fixed_correctness::test_conv1x1_fully_fixed_4ch_64frames();
+  test_fully_fixed_correctness::test_conv1x1_fully_fixed_4ch_4groups();
+  test_fully_fixed_correctness::test_conv1x1_fully_fixed_8ch_8groups();
+  test_fully_fixed_correctness::test_conv1x1_fully_fixed_no_bias();
+  test_fully_fixed_correctness::test_conv1x1_fully_fixed_partial_buffer();
+  test_fully_fixed_correctness::test_conv1x1_fully_fixed_multiple_calls();
+
+  // Fully fixed correctness tests (Conv1DFullyFixed vs Conv1D)
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_4ch_k3_64frames();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_4ch_4groups();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_8ch_8groups();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_dilation2();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_dilation8();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_no_bias();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_multiple_calls();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_multiple_calls_dilation4();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_varying_buffer_sizes();
+  test_fully_fixed_correctness::test_conv1d_fully_fixed_kernel4();
+
   test_get_dsp::test_gets_input_level();
   test_get_dsp::test_gets_output_level();
   test_get_dsp::test_null_input_level();
diff --git a/tools/test/test_conv1d_fixed.cpp b/tools/test/test_conv1d_fixed.cpp
new file mode 100644
index 0000000..1b916e0
--- /dev/null
+++ b/tools/test/test_conv1d_fixed.cpp
@@ -0,0 +1,512 @@
+// Tests for Conv1DFixed (templated implementation)
+
+#include <Eigen/Dense>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "NAM/conv1d.h"
+#include "NAM/conv1d_factory.h"
+#include "NAM/conv1d_fixed.h"
+#include "allocation_tracking.h"
+
+namespace test_conv1d_fixed
+{
+
+// Factory smoke test: the created layer reports the configuration it was asked for
+void test_factory_create()
+{
+  auto layer = nam::Conv1DFactory::create(4, 4, 3, 1, true, 1);
+  assert(layer != nullptr);
+  assert(layer->get_in_channels() == 4);
+  assert(layer->get_out_channels() == 4);
+  assert(layer->get_kernel_size() == 3);
+  assert(layer->get_dilation() == 1);
+  assert(layer->has_bias() == true);
+}
+
+// Factory smoke test for a grouped configuration (8 -> 8 channels, 2 groups, no bias)
+void test_factory_create_with_groups()
+{
+  auto grouped = nam::Conv1DFactory::create(8, 8, 3, 1, false, 2);
+  assert(grouped != nullptr);
+  assert(grouped->get_in_channels() == 8);
+  assert(grouped->get_out_channels() == 8);
+}
+
+// Check that the factory-built layer reproduces nam::Conv1D for a single-group, kernel-size-3 convolution
+void test_numerical_equivalence()
+{
+  const int in_channels = 4;
+  const int out_channels = 4;
+  const int kernel_size = 3;
+  const int dilation = 1;
+  const bool do_bias = true;
+  const int groups = 1;
+  const int num_frames = 8;
+
+  // Reference (dynamic) layer and the factory-built layer under test, configured identically
+  nam::Conv1D reference;
+  reference.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups);
+  auto candidate = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups);
+
+  // Deterministic kernel values, then one trailing value per output channel for the bias
+  std::vector<float> params;
+  const int in_per_group = in_channels / groups;
+  const int out_per_group = out_channels / groups;
+  for (int grp = 0; grp < groups; ++grp)
+  {
+    for (int oc = 0; oc < out_per_group; ++oc)
+    {
+      for (int ic = 0; ic < in_per_group; ++ic)
+      {
+        for (int tap = 0; tap < kernel_size; ++tap)
+        {
+          params.push_back(static_cast<float>(grp * 100 + oc * 10 + ic + tap) * 0.01f);
+        }
+      }
+    }
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(static_cast<float>(oc) * 0.1f);
+  }
+
+  auto ref_it = params.begin();
+  auto cand_it = params.begin();
+  reference.set_weights_(ref_it);
+  candidate->set_weights_(cand_it);
+
+  reference.SetMaxBufferSize(num_frames);
+  candidate->SetMaxBufferSize(num_frames);
+
+  // Both layers are fed the same deterministic input
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row * num_frames + col) * 0.1f;
+
+  // Run one buffer through each layer
+  reference.Process(signal, num_frames);
+  candidate->Process(signal, num_frames);
+
+  auto& expected = reference.GetOutput();
+  auto& actual = candidate->GetOutput();
+
+  // Element-wise comparison with an absolute tolerance of 1e-4
+  for (int row = 0; row < out_channels; ++row)
+  {
+    for (int col = 0; col < num_frames; ++col)
+    {
+      const float err = std::abs(expected(row, col) - actual(row, col));
+      assert(err < 1e-4f);
+    }
+  }
+}
+
+// Check that the factory-built layer reproduces nam::Conv1D for a grouped convolution (8 channels, 2 groups)
+void test_numerical_equivalence_grouped()
+{
+  const int in_channels = 8;
+  const int out_channels = 8;
+  const int kernel_size = 3;
+  const int dilation = 1;
+  const bool do_bias = true;
+  const int groups = 2;
+  const int num_frames = 8;
+
+  // Reference (dynamic) layer and the factory-built layer under test, configured identically
+  nam::Conv1D reference;
+  reference.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups);
+  auto candidate = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups);
+
+  // Per-group kernel values, then one trailing value per output channel for the bias
+  std::vector<float> params;
+  const int in_per_group = in_channels / groups;
+  const int out_per_group = out_channels / groups;
+  for (int grp = 0; grp < groups; ++grp)
+  {
+    for (int oc = 0; oc < out_per_group; ++oc)
+    {
+      for (int ic = 0; ic < in_per_group; ++ic)
+      {
+        for (int tap = 0; tap < kernel_size; ++tap)
+        {
+          params.push_back(static_cast<float>(grp * 100 + oc * 10 + ic + tap) * 0.01f);
+        }
+      }
+    }
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(static_cast<float>(oc) * 0.1f);
+  }
+
+  auto ref_it = params.begin();
+  auto cand_it = params.begin();
+  reference.set_weights_(ref_it);
+  candidate->set_weights_(cand_it);
+
+  reference.SetMaxBufferSize(num_frames);
+  candidate->SetMaxBufferSize(num_frames);
+
+  // Both layers are fed the same deterministic input
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row * num_frames + col) * 0.1f;
+
+  // Run one buffer through each layer
+  reference.Process(signal, num_frames);
+  candidate->Process(signal, num_frames);
+
+  auto& expected = reference.GetOutput();
+  auto& actual = candidate->GetOutput();
+
+  // Element-wise comparison with an absolute tolerance of 1e-4
+  for (int row = 0; row < out_channels; ++row)
+  {
+    for (int col = 0; col < num_frames; ++col)
+    {
+      const float err = std::abs(expected(row, col) - actual(row, col));
+      assert(err < 1e-4f);
+    }
+  }
+}
+
+// Check that the factory-built layer reproduces nam::Conv1D when the kernel size is 4 instead of 3
+void test_numerical_equivalence_kernel4()
+{
+  const int in_channels = 4;
+  const int out_channels = 4;
+  const int kernel_size = 4;
+  const int dilation = 1;
+  const bool do_bias = true;
+  const int groups = 1;
+  const int num_frames = 8;
+
+  // Reference (dynamic) layer and the factory-built layer under test, configured identically
+  nam::Conv1D reference;
+  reference.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups);
+  auto candidate = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups);
+
+  // Deterministic kernel values, then one trailing value per output channel for the bias
+  std::vector<float> params;
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    for (int ic = 0; ic < in_channels; ++ic)
+    {
+      for (int tap = 0; tap < kernel_size; ++tap)
+      {
+        params.push_back(static_cast<float>(oc * 10 + ic + tap) * 0.01f);
+      }
+    }
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(static_cast<float>(oc) * 0.1f);
+  }
+
+  auto ref_it = params.begin();
+  auto cand_it = params.begin();
+  reference.set_weights_(ref_it);
+  candidate->set_weights_(cand_it);
+
+  reference.SetMaxBufferSize(num_frames);
+  candidate->SetMaxBufferSize(num_frames);
+
+  // Both layers are fed the same deterministic input
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row * num_frames + col) * 0.1f;
+
+  // Run one buffer through each layer
+  reference.Process(signal, num_frames);
+  candidate->Process(signal, num_frames);
+
+  auto& expected = reference.GetOutput();
+  auto& actual = candidate->GetOutput();
+
+  // Element-wise comparison with an absolute tolerance of 1e-4
+  for (int row = 0; row < out_channels; ++row)
+  {
+    for (int col = 0; col < num_frames; ++col)
+    {
+      const float err = std::abs(expected(row, col) - actual(row, col));
+      assert(err < 1e-4f);
+    }
+  }
+}
+
+// Run Process() of an ungrouped factory-built layer under the no-allocation test helper
+void test_process_realtime_safe()
+{
+  const int in_channels = 4;
+  const int out_channels = 4;
+  const int kernel_size = 3;
+  const int dilation = 1;
+  const bool do_bias = true;
+  const int groups = 1;
+  const int num_frames = 64;
+
+  auto layer = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups);
+
+  // Constant 0.1 kernel values, then a zero per output channel for the bias
+  std::vector<float> params;
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    for (int ic = 0; ic < in_channels; ++ic)
+    {
+      for (int tap = 0; tap < kernel_size; ++tap)
+      {
+        params.push_back(0.1f);
+      }
+    }
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(0.0f);
+  }
+
+  auto cursor = params.begin();
+  layer->set_weights_(cursor);
+  layer->SetMaxBufferSize(num_frames);
+
+  // The input is prepared up front; only Process() is handed to the helper below
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row + col);
+
+  // No setup/teardown callbacks are supplied (both nullptr)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      layer->Process(signal, num_frames);
+    },
+    nullptr, "test_conv1d_fixed_process_realtime_safe");
+}
+
+// Run Process() of a grouped factory-built layer (8 channels, 4 groups) under the no-allocation test helper
+void test_process_grouped_realtime_safe()
+{
+  const int in_channels = 8;
+  const int out_channels = 8;
+  const int kernel_size = 3;
+  const int dilation = 1;
+  const bool do_bias = true;
+  const int groups = 4;
+  const int num_frames = 64;
+
+  auto layer = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups);
+
+  // Per-group kernels with 1 where the in-group row and column indices match, then zero bias values
+  std::vector<float> params;
+  const int in_per_group = in_channels / groups;
+  const int out_per_group = out_channels / groups;
+  for (int grp = 0; grp < groups; ++grp)
+  {
+    for (int oc = 0; oc < out_per_group; ++oc)
+    {
+      for (int ic = 0; ic < in_per_group; ++ic)
+      {
+        for (int tap = 0; tap < kernel_size; ++tap)
+        {
+          params.push_back(oc == ic ? 1.0f : 0.0f);
+        }
+      }
+    }
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(0.0f);
+  }
+
+  auto cursor = params.begin();
+  layer->set_weights_(cursor);
+  layer->SetMaxBufferSize(num_frames);
+
+  // The input is prepared up front; only Process() is handed to the helper below
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row + col);
+
+  // No setup/teardown callbacks are supplied (both nullptr)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      layer->Process(signal, num_frames);
+    },
+    nullptr, "test_conv1d_fixed_process_grouped_realtime_safe");
+}
+
+// Check that after SetMaxBufferSize(128) the output buffer of a 4-channel layer is 4 x 128
+void test_set_max_buffer_size()
+{
+  auto layer = nam::Conv1DFactory::create(4, 4, 3, 1, false, 1);
+  layer->SetMaxBufferSize(128);
+  auto& out_buffer = layer->GetOutput();
+  assert(out_buffer.rows() == 4);
+  assert(out_buffer.cols() == 128);
+}
+
+// Test that a second Process() call yields output driven by the second input, not stale first-call data
+void test_process_multiple_calls()
+{
+  const int channels = 4;
+  const int kernel_size = 3;
+  const int num_frames = 4;
+
+  auto conv = nam::Conv1DFactory::create(channels, channels, kernel_size, 1, false, 1);
+
+  // Identity-like weights: the only non-zero tap is the last kernel position, on the channel diagonal
+  std::vector<float> weights;
+  for (int i = 0; i < channels; i++)
+  {
+    for (int j = 0; j < channels; j++)
+    {
+      for (int k = 0; k < kernel_size; k++)
+        weights.push_back((k == kernel_size - 1 && i == j) ? 1.0f : 0.0f);
+    }
+  }
+
+  auto it = weights.begin();
+  conv->set_weights_(it);
+  conv->SetMaxBufferSize(num_frames);
+
+  // First call
+  Eigen::MatrixXf input1(channels, num_frames);
+  input1.setConstant(1.0f);
+  conv->Process(input1, num_frames);
+
+  // Second call
+  Eigen::MatrixXf input2(channels, num_frames);
+  input2.setConstant(2.0f);
+  conv->Process(input2, num_frames);
+
+  // Frames from kernel_size - 1 onward only see samples of the second input (dilation is 1), so the
+  // single unit tap must reproduce input2's constant there regardless of what the first call left behind.
+  auto& output = conv->GetOutput();
+  assert(output.rows() == channels);
+  for (int i = 0; i < channels; i++)
+    for (int j = kernel_size - 1; j < num_frames; j++)
+      assert(std::abs(output(i, j) - 2.0f) < 1e-4f);
+}
+
+// Check equivalence with nam::Conv1D when both layers are created without a bias term
+void test_no_bias()
+{
+  const int channels = 4;
+  const int kernel_size = 3;
+  const int num_frames = 4;
+
+  // Reference layer and factory-built layer, both with bias disabled
+  nam::Conv1D reference;
+  reference.set_size_(channels, channels, kernel_size, false, 1, 1);
+  auto candidate = nam::Conv1DFactory::create(channels, channels, kernel_size, 1, false, 1);
+
+  // Kernel values only (no trailing bias values): 1 on the diagonal at the last kernel position
+  std::vector<float> params;
+  for (int oc = 0; oc < channels; ++oc)
+  {
+    for (int ic = 0; ic < channels; ++ic)
+    {
+      for (int tap = 0; tap < kernel_size; ++tap)
+      {
+        params.push_back((tap == kernel_size - 1 && oc == ic) ? 1.0f : 0.0f);
+      }
+    }
+  }
+
+  auto ref_it = params.begin();
+  auto cand_it = params.begin();
+  reference.set_weights_(ref_it);
+  candidate->set_weights_(cand_it);
+
+  reference.SetMaxBufferSize(num_frames);
+  candidate->SetMaxBufferSize(num_frames);
+
+  Eigen::MatrixXf signal(channels, num_frames);
+  signal.setConstant(5.0f);
+
+  reference.Process(signal, num_frames);
+  candidate->Process(signal, num_frames);
+
+  auto& expected = reference.GetOutput();
+  auto& actual = candidate->GetOutput();
+
+  // Element-wise comparison with an absolute tolerance of 1e-4
+  for (int row = 0; row < channels; ++row)
+  {
+    for (int col = 0; col < num_frames; ++col)
+    {
+      const float err = std::abs(expected(row, col) - actual(row, col));
+      assert(err < 1e-4f);
+    }
+  }
+}
+
+// Check equivalence with nam::Conv1D for a dilated convolution (dilation 2, kernel size 3)
+void test_with_dilation()
+{
+  const int channels = 4;
+  const int kernel_size = 3;
+  const int dilation = 2;
+  const int num_frames = 8;
+
+  nam::Conv1D reference;
+  reference.set_size_(channels, channels, kernel_size, true, dilation, 1);
+  auto candidate = nam::Conv1DFactory::create(channels, channels, kernel_size, dilation, true, 1);
+
+  // Shared parameters: index-dependent kernel values, then 0.5 per channel for the bias
+  std::vector<float> params;
+  for (int oc = 0; oc < channels; ++oc)
+  {
+    for (int ic = 0; ic < channels; ++ic)
+    {
+      for (int tap = 0; tap < kernel_size; ++tap)
+      {
+        params.push_back(0.1f * (oc + ic + tap));
+      }
+    }
+  }
+  for (int oc = 0; oc < channels; ++oc)
+  {
+    params.push_back(0.5f);
+  }
+
+  auto ref_it = params.begin();
+  auto cand_it = params.begin();
+  reference.set_weights_(ref_it);
+  candidate->set_weights_(cand_it);
+
+  reference.SetMaxBufferSize(num_frames);
+  candidate->SetMaxBufferSize(num_frames);
+
+  Eigen::MatrixXf signal(channels, num_frames);
+  for (int row = 0; row < channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row + col);
+
+  reference.Process(signal, num_frames);
+  candidate->Process(signal, num_frames);
+
+  auto& expected = reference.GetOutput();
+  auto& actual = candidate->GetOutput();
+
+  // Element-wise comparison with an absolute tolerance of 1e-4
+  for (int row = 0; row < channels; ++row)
+  {
+    for (int col = 0; col < num_frames; ++col)
+    {
+      const float err = std::abs(expected(row, col) - actual(row, col));
+      assert(err < 1e-4f);
+    }
+  }
+}
+
+} // namespace test_conv1d_fixed
diff --git a/tools/test/test_conv1x1_fixed.cpp b/tools/test/test_conv1x1_fixed.cpp
new file mode 100644
index 0000000..5b6ac80
--- /dev/null
+++ b/tools/test/test_conv1x1_fixed.cpp
@@ -0,0 +1,323 @@
+// Tests for Conv1x1Fixed (templated implementation)
+
+#include <Eigen/Dense>
+#include <cassert>
+#include <cmath>
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+
+#include "NAM/conv1x1_factory.h"
+#include "NAM/conv1x1_fixed.h"
+#include "NAM/dsp.h"
+#include "allocation_tracking.h"
+
+namespace test_conv1x1_fixed
+{
+
+// Factory smoke test: the created 1x1 layer reports the requested channel counts
+void test_factory_create()
+{
+  auto layer = nam::Conv1x1Factory::create(4, 4, true, 1);
+  assert(layer != nullptr);
+  assert(layer->get_in_channels() == 4);
+  assert(layer->get_out_channels() == 4);
+}
+
+// Factory smoke test for a grouped 1x1 configuration (8 -> 8 channels, 2 groups, no bias)
+void test_factory_create_with_groups()
+{
+  auto grouped = nam::Conv1x1Factory::create(8, 8, false, 2);
+  assert(grouped != nullptr);
+  assert(grouped->get_in_channels() == 8);
+  assert(grouped->get_out_channels() == 8);
+}
+
+// Check that the factory-built 1x1 layer matches nam::Conv1x1 for a single-group configuration
+void test_numerical_equivalence()
+{
+  const int in_channels = 4;
+  const int out_channels = 4;
+  const bool do_bias = true;
+  const int groups = 1;
+  const int num_frames = 4;
+
+  // Reference (dynamic) layer and the factory-built layer under test, configured identically
+  nam::Conv1x1 reference(in_channels, out_channels, do_bias, groups);
+  auto candidate = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups);
+
+  // Shared parameters: matrix entries first, then one trailing value per output channel for the bias
+  std::vector<float> params;
+  for (int idx = 0; idx < out_channels * in_channels; ++idx)
+  {
+    params.push_back(static_cast<float>(idx) * 0.1f);
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(static_cast<float>(oc) * 0.5f);
+  }
+
+  auto ref_it = params.begin();
+  auto cand_it = params.begin();
+  reference.set_weights_(ref_it);
+  candidate->set_weights_(cand_it);
+
+  reference.SetMaxBufferSize(num_frames);
+  candidate->SetMaxBufferSize(num_frames);
+
+  // Both layers are fed the same deterministic input
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row * num_frames + col);
+
+  // Run one buffer through each layer, keeping a copy of each result
+  Eigen::MatrixXf expected = reference.process(signal, num_frames);
+  Eigen::MatrixXf actual = candidate->process(signal, num_frames);
+
+  // Shapes must agree before the element-wise comparison (absolute tolerance 1e-5)
+  assert(expected.rows() == actual.rows());
+  assert(expected.cols() == actual.cols());
+
+  for (int row = 0; row < expected.rows(); ++row)
+  {
+    for (int col = 0; col < expected.cols(); ++col)
+    {
+      const float err = std::abs(expected(row, col) - actual(row, col));
+      assert(err < 1e-5f);
+    }
+  }
+}
+
+// Check that the factory-built 1x1 layer matches nam::Conv1x1 for a grouped configuration (8 channels, 2 groups)
+void test_numerical_equivalence_grouped()
+{
+  const int in_channels = 8;
+  const int out_channels = 8;
+  const bool do_bias = true;
+  const int groups = 2;
+  const int num_frames = 4;
+
+  // Reference (dynamic) layer and the factory-built layer under test, configured identically
+  nam::Conv1x1 reference(in_channels, out_channels, do_bias, groups);
+  auto candidate = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups);
+
+  // Shared parameters in grouped order, then one trailing value per output channel for the bias
+  std::vector<float> params;
+  const int in_per_group = in_channels / groups;
+  const int out_per_group = out_channels / groups;
+  for (int grp = 0; grp < groups; ++grp)
+  {
+    for (int oc = 0; oc < out_per_group; ++oc)
+    {
+      for (int ic = 0; ic < in_per_group; ++ic)
+      {
+        params.push_back(static_cast<float>(grp * 10 + oc * in_per_group + ic) * 0.1f);
+      }
+    }
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(static_cast<float>(oc) * 0.5f);
+  }
+
+  auto ref_it = params.begin();
+  auto cand_it = params.begin();
+  reference.set_weights_(ref_it);
+  candidate->set_weights_(cand_it);
+
+  reference.SetMaxBufferSize(num_frames);
+  candidate->SetMaxBufferSize(num_frames);
+
+  // Both layers are fed the same deterministic input
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row * num_frames + col);
+
+  // Run one buffer through each layer, keeping a copy of each result
+  Eigen::MatrixXf expected = reference.process(signal, num_frames);
+  Eigen::MatrixXf actual = candidate->process(signal, num_frames);
+
+  // Element-wise comparison with an absolute tolerance of 1e-5
+  for (int row = 0; row < expected.rows(); ++row)
+  {
+    for (int col = 0; col < expected.cols(); ++col)
+    {
+      const float err = std::abs(expected(row, col) - actual(row, col));
+      assert(err < 1e-5f);
+    }
+  }
+}
+
+// Run process_() of an ungrouped factory-built 1x1 layer under the no-allocation test helper
+void test_process_realtime_safe()
+{
+  const int in_channels = 4;
+  const int out_channels = 4;
+  const bool do_bias = true;
+  const int groups = 1;
+  const int num_frames = 64;
+
+  auto layer = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups);
+
+  // Index-dependent matrix entries, then a zero per output channel for the bias
+  std::vector<float> params;
+  for (int idx = 0; idx < out_channels * in_channels; ++idx)
+  {
+    params.push_back(static_cast<float>(idx) * 0.1f);
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(0.0f);
+  }
+
+  auto cursor = params.begin();
+  layer->set_weights_(cursor);
+  layer->SetMaxBufferSize(num_frames);
+
+  // The input is prepared up front; only process_() is handed to the helper below
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row + col);
+
+  // No setup/teardown callbacks are supplied (both nullptr)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      layer->process_(signal, num_frames);
+    },
+    nullptr, "test_conv1x1_fixed_process_realtime_safe");
+}
+
+// Run process_() of a grouped factory-built 1x1 layer (8 channels, 4 groups) under the no-allocation test helper
+void test_process_grouped_realtime_safe()
+{
+  const int in_channels = 8;
+  const int out_channels = 8;
+  const bool do_bias = true;
+  const int groups = 4;
+  const int num_frames = 64;
+
+  auto layer = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups);
+
+  // Per-group blocks with 1 where the in-group row and column indices match, then zero bias values
+  std::vector<float> params;
+  const int in_per_group = in_channels / groups;
+  const int out_per_group = out_channels / groups;
+  for (int grp = 0; grp < groups; ++grp)
+  {
+    for (int oc = 0; oc < out_per_group; ++oc)
+    {
+      for (int ic = 0; ic < in_per_group; ++ic)
+      {
+        params.push_back(oc == ic ? 1.0f : 0.0f);
+      }
+    }
+  }
+  for (int oc = 0; oc < out_channels; ++oc)
+  {
+    params.push_back(0.0f);
+  }
+
+  auto cursor = params.begin();
+  layer->set_weights_(cursor);
+  layer->SetMaxBufferSize(num_frames);
+
+  // The input is prepared up front; only process_() is handed to the helper below
+  Eigen::MatrixXf signal(in_channels, num_frames);
+  for (int row = 0; row < in_channels; ++row)
+    for (int col = 0; col < num_frames; ++col)
+      signal(row, col) = static_cast<float>(row + col);
+
+  // No setup/teardown callbacks are supplied (both nullptr)
+  allocation_tracking::run_allocation_test_no_allocations(
+    nullptr,
+    [&]() {
+      layer->process_(signal, num_frames);
+    },
+    nullptr, "test_conv1x1_fixed_process_grouped_realtime_safe");
+}
+
+// Check that after SetMaxBufferSize(128) the output buffer of a 4-channel 1x1 layer is 4 x 128
+void test_set_max_buffer_size()
+{
+  auto layer = nam::Conv1x1Factory::create(4, 4, false, 1);
+  layer->SetMaxBufferSize(128);
+  auto& out_buffer = layer->GetOutput();
+  assert(out_buffer.rows() == 4);
+  assert(out_buffer.cols() == 128);
+}
+
+// Check that consecutive process() calls each return the result for their own input
+void test_process_multiple_calls()
+{
+  const int channels = 4;
+  const int num_frames = 2;
+
+  auto layer = nam::Conv1x1Factory::create(channels, channels, false, 1);
+
+  // Identity matrix as weights, so each output is expected to equal its input
+  std::vector<float> params;
+  for (int row = 0; row < channels; ++row)
+  {
+    for (int col = 0; col < channels; ++col)
+    {
+      params.push_back(row == col ? 1.0f : 0.0f);
+    }
+  }
+
+  auto cursor = params.begin();
+  layer->set_weights_(cursor);
+  layer->SetMaxBufferSize(num_frames);
+
+  Eigen::MatrixXf ones(channels, num_frames);
+  ones.setConstant(1.0f);
+
+  auto first = layer->process(ones, num_frames);
+  for (int ch = 0; ch < channels; ++ch)
+    for (int frame = 0; frame < num_frames; ++frame)
+      assert(std::abs(first(ch, frame) - 1.0f) < 0.01f);
+
+  Eigen::MatrixXf twos(channels, num_frames);
+  twos.setConstant(2.0f);
+
+  auto second = layer->process(twos, num_frames);
+  for (int ch = 0; ch < channels; ++ch)
+    for (int frame = 0; frame < num_frames; ++frame)
+      assert(std::abs(second(ch, frame) - 2.0f) < 0.01f);
+}
+
+// Check that a bias-free 1x1 layer with identity weights passes a constant input through unchanged
+void test_no_bias()
+{
+  const int channels = 4;
+  const int num_frames = 2;
+
+  auto layer = nam::Conv1x1Factory::create(channels, channels, false, 1);
+
+  // Identity matrix only; no bias values follow because the layer is created without bias
+  std::vector<float> params;
+  for (int row = 0; row < channels; ++row)
+  {
+    for (int col = 0; col < channels; ++col)
+    {
+      params.push_back(row == col ? 1.0f : 0.0f);
+    }
+  }
+
+  auto cursor = params.begin();
+  layer->set_weights_(cursor);
+  layer->SetMaxBufferSize(num_frames);
+
+  Eigen::MatrixXf signal(channels, num_frames);
+  signal.setConstant(5.0f);
+
+  auto result = layer->process(signal, num_frames);
+  for (int ch = 0; ch < channels; ++ch)
+    for (int frame = 0; frame < num_frames; ++frame)
+      assert(std::abs(result(ch, frame) - 5.0f) < 0.01f);
+}
+
+} // namespace test_conv1x1_fixed

From 4b3c2368b67d99f73a81a202f887e0ff5a8409fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= <santosjf@pm.me>
Date: Thu, 29 Jan 2026 15:17:57 -0800
Subject: [PATCH 4/4] Added tests to verify correctness of fixed
 implementations.

---
 tools/test/test_fully_fixed_correctness.cpp | 761 ++++++++++++++++++++
 1 file changed, 761 insertions(+)
 create mode 100644 tools/test/test_fully_fixed_correctness.cpp

diff --git a/tools/test/test_fully_fixed_correctness.cpp b/tools/test/test_fully_fixed_correctness.cpp
new file mode 100644
index 0000000..c7c0063
--- /dev/null
+++ b/tools/test/test_fully_fixed_correctness.cpp
@@ -0,0 +1,761 @@
+// Tests for Conv1x1FullyFixed and Conv1DFullyFixed correctness
+// Compares outputs against dynamic implementations
+
+#include <Eigen/Dense>
+#include <cassert>
+#include <cmath>
+#include <random>
+#include <vector>
+
+#include "NAM/conv1d.h"
+#include "NAM/conv1d_fixed.h"
+#include "NAM/conv1x1_fixed.h"
+#include "NAM/dsp.h"
+
+namespace test_fully_fixed_correctness
+{
+
+constexpr float TOLERANCE = 1e-5f; // default `tol` argument of assert_matrices_equal
+
+// Asserts that `a` and `b` have the same row count and that every entry in their first `num_cols`
+// columns differs by strictly less than `tol` in absolute value.
+inline void assert_matrices_equal(const Eigen::MatrixXf& a, const Eigen::MatrixXf& b, int num_cols,
+                                  float tol = TOLERANCE)
+{
+  const auto num_rows = a.rows();
+  assert(num_rows == b.rows());
+  for (int row = 0; row < num_rows; ++row)
+    for (int col = 0; col < num_cols; ++col)
+    {
+      const float abs_error = std::abs(a(row, col) - b(row, col));
+      assert(abs_error < tol);
+    }
+}
+
+// ============================================================================
+// Conv1x1FullyFixed Tests
+// ============================================================================
+
+void test_conv1x1_fully_fixed_2ch_32frames()
+{
+  // Compares Conv1x1FullyFixed with the dynamic Conv1x1 on one full 32-frame call (2 channels).
+  constexpr int Channels = 2;
+  constexpr int MaxFrames = 32;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  constexpr int NumWeights = Channels * Channels + Channels; // presumably weight matrix, then biases
+
+  std::mt19937 generator(42); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.process_(input, MaxFrames);
+  conv_dynamic.process_(input, MaxFrames);
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1x1_fully_fixed_4ch_64frames()
+{
+  // Compares Conv1x1FullyFixed with the dynamic Conv1x1 on one full 64-frame call (4 channels).
+  constexpr int Channels = 4;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  constexpr int NumWeights = Channels * Channels + Channels; // presumably weight matrix, then biases
+
+  std::mt19937 generator(123); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.process_(input, MaxFrames);
+  conv_dynamic.process_(input, MaxFrames);
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1x1_fully_fixed_4ch_4groups()
+{
+  // Grouped case: 4 channels split into 4 groups, fixed vs. dynamic Conv1x1 on one 64-frame call.
+  constexpr int Channels = 4;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 4;
+  constexpr bool HasBias = true;
+  constexpr int PerGroup = Channels / Groups;
+  // PerGroup x PerGroup values per group, then Channels more (presumably the biases).
+  constexpr int NumWeights = Groups * PerGroup * PerGroup + Channels;
+
+  std::mt19937 generator(456); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.process_(input, MaxFrames);
+  conv_dynamic.process_(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1x1_fully_fixed_8ch_8groups()
+{
+  // Grouped case: 8 channels split into 8 groups, fixed vs. dynamic Conv1x1 on one 128-frame call.
+  constexpr int Channels = 8;
+  constexpr int MaxFrames = 128;
+  constexpr int Groups = 8;
+  constexpr bool HasBias = true;
+  constexpr int PerGroup = Channels / Groups;
+  // PerGroup x PerGroup values per group, then Channels more (presumably the biases).
+  constexpr int NumWeights = Groups * PerGroup * PerGroup + Channels;
+
+  std::mt19937 generator(789); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.process_(input, MaxFrames);
+  conv_dynamic.process_(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1x1_fully_fixed_no_bias()
+{
+  // Same comparison as the biased tests, but with HasBias = false and no trailing bias values.
+  constexpr int Channels = 4;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = false;
+
+  std::mt19937 generator(111); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(Channels * Channels); // only Channels x Channels values are supplied
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.process_(input, MaxFrames);
+  conv_dynamic.process_(input, MaxFrames);
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1x1_fully_fixed_partial_buffer()
+{
+  // Processes only the first NumFrames columns of a MaxFrames-wide input and compares those columns.
+  constexpr int Channels = 4;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  constexpr int NumFrames = 32; // half of MaxFrames
+  constexpr int NumWeights = Channels * Channels + Channels; // presumably weight matrix, then biases
+
+  std::mt19937 generator(222); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // The input is still filled to the full MaxFrames width, row by row.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.process_(input, NumFrames);
+  conv_dynamic.process_(input, NumFrames);
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), NumFrames);
+}
+
+void test_conv1x1_fully_fixed_multiple_calls()
+{
+  // Runs five consecutive full-buffer calls and compares fixed vs. dynamic output after each one.
+  constexpr int Channels = 4;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  constexpr int NumWeights = Channels * Channels + Channels; // presumably weight matrix, then biases
+
+  std::mt19937 generator(333); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1x1FullyFixed<Channels, Channels, MaxFrames, Groups, HasBias> conv_fixed;
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // A fresh random input is drawn for every call; outputs are checked after each call.
+  for (int call_index = 0; call_index < 5; ++call_index)
+  {
+    Eigen::MatrixXf input(Channels, MaxFrames);
+    for (int row = 0; row < Channels; ++row)
+      for (int col = 0; col < MaxFrames; ++col)
+        input(row, col) = uniform(generator);
+
+    conv_fixed.process_(input, MaxFrames);
+    conv_dynamic.process_(input, MaxFrames);
+    assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+  }
+}
+
+// ============================================================================
+// Conv1DFullyFixed Tests
+// ============================================================================
+
+void test_conv1d_fully_fixed_4ch_k3_64frames()
+{
+  // Compares Conv1DFullyFixed with the dynamic Conv1D on one 64-frame call (4 ch, kernel 3, dilation 1).
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  const int dilation = 1;
+  // Channels*Channels*KernelSize values plus Channels more (presumably kernel taps, then biases).
+  constexpr int NumWeights = Channels * Channels * KernelSize + Channels;
+
+  std::mt19937 generator(42); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.Process(input, MaxFrames);
+  conv_dynamic.Process(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1d_fully_fixed_4ch_4groups()
+{
+  // Grouped case: 4 channels split into 4 groups, kernel 3, dilation 1.
+  // Compares Conv1DFullyFixed with the dynamic Conv1D on one 64-frame call.
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 4;
+  constexpr bool HasBias = true;
+  constexpr int PerGroup = Channels / Groups;
+  const int dilation = 1;
+  // PerGroup*PerGroup*KernelSize values per group, then Channels more (presumably the biases).
+  constexpr int NumWeights = Groups * PerGroup * PerGroup * KernelSize + Channels;
+
+  std::mt19937 generator(123); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.Process(input, MaxFrames);
+  conv_dynamic.Process(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1d_fully_fixed_8ch_8groups()
+{
+  // Grouped case: 8 channels split into 8 groups, kernel 3, dilation 1.
+  // Compares Conv1DFullyFixed with the dynamic Conv1D on one 128-frame call.
+  constexpr int Channels = 8;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 128;
+  constexpr int Groups = 8;
+  constexpr bool HasBias = true;
+  constexpr int PerGroup = Channels / Groups;
+  const int dilation = 1;
+  // PerGroup*PerGroup*KernelSize values per group, then Channels more (presumably the biases).
+  constexpr int NumWeights = Groups * PerGroup * PerGroup * KernelSize + Channels;
+
+  std::mt19937 generator(456); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.Process(input, MaxFrames);
+  conv_dynamic.Process(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1d_fully_fixed_dilation2()
+{
+  // Compares Conv1DFullyFixed with the dynamic Conv1D on one 64-frame call using dilation 2.
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  const int dilation = 2;
+  // Channels*Channels*KernelSize values plus Channels more (presumably kernel taps, then biases).
+  constexpr int NumWeights = Channels * Channels * KernelSize + Channels;
+
+  std::mt19937 generator(789); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.Process(input, MaxFrames);
+  conv_dynamic.Process(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1d_fully_fixed_dilation8()
+{
+  // Compares Conv1DFullyFixed with the dynamic Conv1D on one 128-frame call using dilation 8.
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 128;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  const int dilation = 8;
+  // Channels*Channels*KernelSize values plus Channels more (presumably kernel taps, then biases).
+  constexpr int NumWeights = Channels * Channels * KernelSize + Channels;
+
+  std::mt19937 generator(111); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.Process(input, MaxFrames);
+  conv_dynamic.Process(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1d_fully_fixed_no_bias()
+{
+  // Same single-call comparison as the biased Conv1D tests, but with HasBias = false.
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = false;
+  const int dilation = 1;
+  constexpr int NumWeights = Channels * Channels * KernelSize; // no trailing bias values
+
+  std::mt19937 generator(222); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.Process(input, MaxFrames);
+  conv_dynamic.Process(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+void test_conv1d_fully_fixed_multiple_calls()
+{
+  // Ten consecutive full-buffer calls; fixed and dynamic Conv1D outputs must agree after each call.
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  const int dilation = 1;
+  // Channels*Channels*KernelSize values plus Channels more (presumably kernel taps, then biases).
+  constexpr int NumWeights = Channels * Channels * KernelSize + Channels;
+
+  std::mt19937 generator(333); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Repeated calls exercise history management across buffers.
+  // A fresh random input is drawn for every call.
+  for (int call_index = 0; call_index < 10; ++call_index)
+  {
+    Eigen::MatrixXf input(Channels, MaxFrames);
+    for (int row = 0; row < Channels; ++row)
+      for (int col = 0; col < MaxFrames; ++col)
+        input(row, col) = uniform(generator);
+
+    conv_fixed.Process(input, MaxFrames);
+    conv_dynamic.Process(input, MaxFrames);
+
+    assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+  }
+}
+
+void test_conv1d_fully_fixed_multiple_calls_dilation4()
+{
+  // Ten consecutive full-buffer calls with dilation 4; outputs must agree after each call.
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  const int dilation = 4;
+  // Channels*Channels*KernelSize values plus Channels more (presumably kernel taps, then biases).
+  constexpr int NumWeights = Channels * Channels * KernelSize + Channels;
+
+  std::mt19937 generator(444); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // A fresh random input is drawn for every call.
+  for (int call_index = 0; call_index < 10; ++call_index)
+  {
+    Eigen::MatrixXf input(Channels, MaxFrames);
+    for (int row = 0; row < Channels; ++row)
+      for (int col = 0; col < MaxFrames; ++col)
+        input(row, col) = uniform(generator);
+
+    conv_fixed.Process(input, MaxFrames);
+    conv_dynamic.Process(input, MaxFrames);
+
+    assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+  }
+}
+
+void test_conv1d_fully_fixed_varying_buffer_sizes()
+{
+  // Calls both Conv1D implementations with a sequence of different frame counts (dilation 2).
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 3;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  const int dilation = 2;
+  // Channels*Channels*KernelSize values plus Channels more (presumably kernel taps, then biases).
+  constexpr int NumWeights = Channels * Channels * KernelSize + Channels;
+
+  std::mt19937 generator(555); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Mixed frame counts stress history management between calls. The input is always filled to
+  // the full MaxFrames width; only the first num_frames columns are processed and compared.
+  const int frame_counts[] = {64, 32, 16, 64, 32, 8, 64};
+  for (const int num_frames : frame_counts)
+  {
+    Eigen::MatrixXf input(Channels, MaxFrames);
+    for (int row = 0; row < Channels; ++row)
+      for (int col = 0; col < MaxFrames; ++col)
+        input(row, col) = uniform(generator);
+
+    conv_fixed.Process(input, num_frames);
+    conv_dynamic.Process(input, num_frames);
+
+    assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), num_frames);
+  }
+}
+
+void test_conv1d_fully_fixed_kernel4()
+{
+  // Compares Conv1DFullyFixed with the dynamic Conv1D on one 64-frame call using kernel size 4.
+  constexpr int Channels = 4;
+  constexpr int KernelSize = 4;
+  constexpr int MaxFrames = 64;
+  constexpr int Groups = 1;
+  constexpr bool HasBias = true;
+  const int dilation = 1;
+  // Channels*Channels*KernelSize values plus Channels more (presumably kernel taps, then biases).
+  constexpr int NumWeights = Channels * Channels * KernelSize + Channels;
+
+  std::mt19937 generator(666); // fixed seed keeps the run reproducible
+  std::uniform_real_distribution<float> uniform(-1.0f, 1.0f);
+
+  std::vector<float> weights(NumWeights);
+  for (float& w : weights)
+    w = uniform(generator);
+
+  // Each implementation consumes the same weight stream through its own iterator.
+  nam::Conv1DFullyFixed<Channels, Channels, KernelSize, MaxFrames, Groups, HasBias> conv_fixed(dilation);
+  auto fixed_it = weights.begin();
+  conv_fixed.set_weights_(fixed_it);
+  conv_fixed.SetMaxBufferSize(MaxFrames);
+
+  nam::Conv1D conv_dynamic;
+  conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups);
+  auto dynamic_it = weights.begin();
+  conv_dynamic.set_weights_(dynamic_it);
+  conv_dynamic.SetMaxBufferSize(MaxFrames);
+
+  // Row-by-row fill so the random draws happen in a fixed order.
+  Eigen::MatrixXf input(Channels, MaxFrames);
+  for (int row = 0; row < Channels; ++row)
+    for (int col = 0; col < MaxFrames; ++col)
+      input(row, col) = uniform(generator);
+
+  conv_fixed.Process(input, MaxFrames);
+  conv_dynamic.Process(input, MaxFrames);
+
+  assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames);
+}
+
+} // namespace test_fully_fixed_correctness