From 48f4e15f4326dc22c6ca5b16d5a16c5497bceece Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Wed, 28 Jan 2026 12:49:51 -0800 Subject: [PATCH 1/4] Implemented some optimizations for grouped convolutions and a new benchmarking tool for convolution performance. --- NAM/conv1d.cpp | 40 +++-- NAM/conv1d.h | 14 ++ NAM/dsp.cpp | 37 ++-- NAM/dsp.h | 13 ++ tools/CMakeLists.txt | 49 +++--- tools/benchmark_convolution.cpp | 208 ++++++++++++++++++++++ tools/plot_convolution_benchmark.py | 258 ++++++++++++++++++++++++++++ tools/run_tests.cpp | 5 + tools/test/test_conv1d.cpp | 162 +++++++++++++++++ tools/test/test_conv_1x1.cpp | 95 ++++++++++ 10 files changed, 827 insertions(+), 54 deletions(-) create mode 100644 tools/benchmark_convolution.cpp create mode 100644 tools/plot_convolution_benchmark.py diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index 6e1835b..fda50aa 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -62,6 +62,18 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int else this->_bias.resize(0); this->_dilation = _dilation; + + // Pre-compute group block indices for efficient runtime access + const long out_per_group = out_channels / groups; + const long in_per_group = in_channels / groups; + this->_group_blocks.resize(groups); + for (int g = 0; g < groups; g++) + { + this->_group_blocks[g].out_start = g * out_per_group; + this->_group_blocks[g].in_start = g * in_per_group; + this->_group_blocks[g].out_size = out_per_group; + this->_group_blocks[g].in_size = in_per_group; + } } void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size, @@ -105,10 +117,6 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) _output.leftCols(num_frames).setZero(); const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); - const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const 
long out_per_group = out_channels / numGroups; // Process from ring buffer with dilation lookback // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1] @@ -130,9 +138,11 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) } else { - // Grouped convolution: process each group separately + // Grouped convolution: process each group separately using pre-computed block indices for (int g = 0; g < numGroups; g++) { + const auto& block = this->_group_blocks[g]; + for (size_t k = 0; k < this->_weight.size(); k++) { const long offset = this->_dilation * (k + 1 - (long)this->_weight.size()); @@ -140,13 +150,13 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) auto input_block = _input_buffer.Read(num_frames, lookback); // Extract input slice for this group - auto input_group = input_block.middleRows(g * in_per_group, in_per_group); + auto input_group = input_block.middleRows(block.in_start, block.in_size); // Extract weight slice for this group - auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); + auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size); // Extract output slice for this group - auto output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group); + auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size); // Perform grouped convolution: output_group += weight_group * input_group output_group.noalias() += weight_group * input_group; @@ -168,10 +178,6 @@ void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, con const long j_start) const { const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); - const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const long out_per_group = out_channels / numGroups; if (numGroups == 1) 
{ @@ -187,21 +193,23 @@ void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, con } else { - // Grouped convolution: process each group separately + // Grouped convolution: process each group separately using pre-computed block indices for (int g = 0; g < numGroups; g++) { + const auto& block = this->_group_blocks[g]; + for (size_t k = 0; k < this->_weight.size(); k++) { const long offset = this->_dilation * (k + 1 - this->_weight.size()); // Extract input slice for this group - auto input_group = input.middleCols(i_start + offset, ncols).middleRows(g * in_per_group, in_per_group); + auto input_group = input.middleCols(i_start + offset, ncols).middleRows(block.in_start, block.in_size); // Extract weight slice for this group - auto weight_group = this->_weight[k].block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); + auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size); // Extract output slice for this group - auto output_group = output.middleCols(j_start, ncols).middleRows(g * out_per_group, out_per_group); + auto output_group = output.middleCols(j_start, ncols).middleRows(block.out_start, block.out_size); // Perform grouped convolution if (k == 0) diff --git a/NAM/conv1d.h b/NAM/conv1d.h index 8182966..0bf64a7 100644 --- a/NAM/conv1d.h +++ b/NAM/conv1d.h @@ -6,6 +6,19 @@ namespace nam { + +/// \brief Pre-computed group block indices for grouped convolutions +/// +/// Stores the indices for extracting input/output slices for each group, +/// avoiding repeated computation during real-time processing. +struct Conv1DGroupBlock +{ + long out_start; ///< Starting row index in output + long in_start; ///< Starting row index in input + long out_size; ///< Number of output channels per group + long in_size; ///< Number of input channels per group +}; + /// \brief 1D dilated convolution layer /// /// Implements a 1D convolution with support for dilation and grouped convolution. 
@@ -123,6 +136,7 @@ class Conv1D Eigen::VectorXf _bias; int _dilation; int _num_groups; + std::vector _group_blocks; ///< Pre-computed group block indices private: RingBuffer _input_buffer; // Ring buffer for input (channels x buffer_size) diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 02a4a13..5eb567e 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -353,6 +353,16 @@ void nam::Conv1x1::set_weights_(std::vector::iterator& weights) const long out_per_group = out_channels / numGroups; const long in_per_group = in_channels / numGroups; + // Pre-compute group block indices for efficient runtime access + this->_group_blocks.resize(numGroups); + for (int g = 0; g < numGroups; g++) + { + this->_group_blocks[g].out_start = g * out_per_group; + this->_group_blocks[g].in_start = g * in_per_group; + this->_group_blocks[g].out_size = out_per_group; + this->_group_blocks[g].in_size = in_per_group; + } + // For grouped convolutions, weights are organized per group // Weight layout: weights are [group0, group1, ..., groupN-1] // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups) @@ -375,10 +385,7 @@ void nam::Conv1x1::set_weights_(std::vector::iterator& weights) Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const { const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const long out_per_group = out_channels / numGroups; Eigen::MatrixXf result(out_channels, num_frames); @@ -392,18 +399,20 @@ Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int nu } else { - // Grouped convolution: process each group separately + // Grouped convolution: process each group separately using pre-computed block indices result.setZero(); for (int g = 0; g < numGroups; g++) { + const auto& block = this->_group_blocks[g]; + // Extract input slice for this group - auto input_group = 
input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); + auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size); // Extract weight slice for this group - auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); + auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size); // Extract output slice for this group - auto output_group = result.middleRows(g * out_per_group, out_per_group); + auto output_group = result.middleRows(block.out_start, block.out_size); // Perform grouped convolution: output_group = weight_group * input_group output_group.noalias() = weight_group * input_group; @@ -422,10 +431,6 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons assert(num_frames <= _output.cols()); const int numGroups = this->_num_groups; - const long in_channels = get_in_channels(); - const long out_channels = get_out_channels(); - const long in_per_group = in_channels / numGroups; - const long out_per_group = out_channels / numGroups; if (numGroups == 1) { @@ -434,18 +439,20 @@ void nam::Conv1x1::process_(const Eigen::Ref& input, cons } else { - // Grouped convolution: process each group separately + // Grouped convolution: process each group separately using pre-computed block indices _output.leftCols(num_frames).setZero(); for (int g = 0; g < numGroups; g++) { + const auto& block = this->_group_blocks[g]; + // Extract input slice for this group - auto input_group = input.leftCols(num_frames).middleRows(g * in_per_group, in_per_group); + auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size); // Extract weight slice for this group - auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); + auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size); // Extract output slice for this group - auto 
output_group = _output.leftCols(num_frames).middleRows(g * out_per_group, out_per_group); + auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size); // Perform grouped convolution: output_group = weight_group * input_group output_group.noalias() = weight_group * input_group; diff --git a/NAM/dsp.h b/NAM/dsp.h index 8b984d2..f7763c2 100644 --- a/NAM/dsp.h +++ b/NAM/dsp.h @@ -269,6 +269,18 @@ std::unique_ptr Factory(const nlohmann::json& config, std::vector& w // NN modules ================================================================= +/// \brief Pre-computed group block indices for grouped convolutions +/// +/// Stores the indices for extracting input/output slices for each group, +/// avoiding repeated computation during real-time processing. +struct GroupBlock +{ + long out_start; ///< Starting row index in output + long in_start; ///< Starting row index in input + long out_size; ///< Number of output channels per group + long in_size; ///< Number of input channels per group +}; + /// \brief 1x1 convolution (really just a fully-connected linear layer operating per-sample) /// /// Performs a pointwise convolution, which is equivalent to a fully connected layer @@ -330,6 +342,7 @@ class Conv1x1 Eigen::MatrixXf _weight; Eigen::VectorXf _bias; int _num_groups; + std::vector _group_blocks; ///< Pre-computed group block indices private: Eigen::MatrixXf _output; diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 8118e08..22e4db6 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,7 +1,7 @@ file(GLOB_RECURSE NAM_SOURCES ../NAM/*.cpp ../NAM/*.c ../NAM*.h) # TODO: add loadmodel and run_tests to TOOLS? 
-set(TOOLS benchmodel) +set(TOOLS benchmodel benchmark_convolution) add_custom_target(tools ALL DEPENDS ${TOOLS}) @@ -12,6 +12,7 @@ include_directories(tools ${NAM_DEPS_PATH}/nlohmann) add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES}) add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES}) +add_executable(benchmark_convolution benchmark_convolution.cpp ${NAM_SOURCES}) add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES}) # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run @@ -31,31 +32,33 @@ endif() source_group(NAM ${CMAKE_CURRENT_SOURCE_DIR} FILES ${NAM_SOURCES}) -target_compile_features(${TOOLS} PUBLIC cxx_std_20) +foreach(TOOL ${TOOLS}) + target_compile_features(${TOOL} PUBLIC cxx_std_20) -set_target_properties(${TOOLS} - PROPERTIES - CXX_VISIBILITY_PRESET hidden - INTERPROCEDURAL_OPTIMIZATION TRUE - PREFIX "" -) + set_target_properties(${TOOL} + PROPERTIES + CXX_VISIBILITY_PRESET hidden + INTERPROCEDURAL_OPTIMIZATION TRUE + PREFIX "" + ) -if (CMAKE_SYSTEM_NAME STREQUAL "Windows") - target_compile_definitions(${TOOLS} PRIVATE NOMINMAX WIN32_LEAN_AND_MEAN) -endif() + if (CMAKE_SYSTEM_NAME STREQUAL "Windows") + target_compile_definitions(${TOOL} PRIVATE NOMINMAX WIN32_LEAN_AND_MEAN) + endif() -if (MSVC) - target_compile_options(${TOOLS} PRIVATE - "$<$:/W4>" - "$<$:/O2>" - ) -else() - target_compile_options(${TOOLS} PRIVATE - -Wall -Wextra -Wpedantic -Wstrict-aliasing -Wunreachable-code -Weffc++ -Wno-unused-parameter - "$<$:-Og;-ggdb;-Werror>" - "$<$:-Ofast>" - ) -endif() + if (MSVC) + target_compile_options(${TOOL} PRIVATE + "$<$:/W4>" + "$<$:/O2>" + ) + else() + target_compile_options(${TOOL} PRIVATE + -Wall -Wextra -Wpedantic -Wstrict-aliasing -Wunreachable-code -Weffc++ -Wno-unused-parameter + "$<$:-Og;-ggdb;-Werror>" + "$<$:-Ofast>" + ) + endif() +endforeach() # There's an error in eigen's # 
/Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h diff --git a/tools/benchmark_convolution.cpp b/tools/benchmark_convolution.cpp new file mode 100644 index 0000000..6501c55 --- /dev/null +++ b/tools/benchmark_convolution.cpp @@ -0,0 +1,208 @@ +// Microbenchmark for Conv1x1 and Conv1D convolution operations +// Measures performance across various configurations of channels, groups, and frame sizes. +// Outputs CSV format for analysis. + +#include +#include +#include +#include +#include +#include + +#include "NAM/conv1d.h" +#include "NAM/dsp.h" + +using std::chrono::duration; +using std::chrono::duration_cast; +using std::chrono::high_resolution_clock; +using std::chrono::nanoseconds; + +// Number of iterations per benchmark configuration +constexpr int NUM_WARMUP_ITERATIONS = 10; +constexpr int NUM_BENCHMARK_ITERATIONS = 100; + +// Benchmark configurations +constexpr int CHANNELS[] = {8, 16, 32, 64, 128}; +constexpr int GROUPS[] = {1, 2, 4, 8, 16}; +constexpr int FRAMES[] = {64, 256, 1024}; +constexpr int KERNEL_SIZES[] = {1, 3}; // For Conv1D + +struct BenchmarkResult +{ + double mean_ns; + double stddev_ns; + double min_ns; + double max_ns; +}; + +// Calculate statistics from timing samples +BenchmarkResult calculate_stats(const std::vector& samples) +{ + BenchmarkResult result; + double sum = 0.0; + result.min_ns = samples[0]; + result.max_ns = samples[0]; + + for (double s : samples) + { + sum += s; + if (s < result.min_ns) + result.min_ns = s; + if (s > result.max_ns) + result.max_ns = s; + } + + result.mean_ns = sum / samples.size(); + + double sq_sum = 0.0; + for (double s : samples) + { + double diff = s - result.mean_ns; + sq_sum += diff * diff; + } + result.stddev_ns = std::sqrt(sq_sum / samples.size()); + + return result; +} + +// Benchmark Conv1x1 +void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng) +{ + if (channels % groups != 0) + return; // Skip invalid 
configurations + + // Create Conv1x1 layer + nam::Conv1x1 conv(channels, channels, false, groups); + + // Initialize with random weights + const int num_weights = (channels / groups) * (channels / groups) * groups; + std::vector weights(num_weights); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (auto& w : weights) + w = dist(rng); + + auto it = weights.begin(); + conv.set_weights_(it); + conv.SetMaxBufferSize(frames); + + // Create random input + Eigen::MatrixXf input(channels, frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < frames; j++) + input(i, j) = dist(rng); + + // Warmup + for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++) + { + conv.process_(input, frames); + } + + // Benchmark + std::vector samples; + samples.reserve(NUM_BENCHMARK_ITERATIONS); + + for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv.process_(input, frames); + auto t2 = high_resolution_clock::now(); + samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + BenchmarkResult result = calculate_stats(samples); + + // Output CSV row + std::cout << "Conv1x1," << channels << "," << groups << "," << frames << ",1," << std::fixed << std::setprecision(2) + << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << "," << result.max_ns << "\n"; +} + +// Benchmark Conv1D +void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std::mt19937& rng) +{ + if (channels % groups != 0) + return; // Skip invalid configurations + + // Create Conv1D layer + nam::Conv1D conv; + conv.set_size_(channels, channels, kernel_size, false, 1, groups); + + // Initialize with random weights + const int num_weights = kernel_size * (channels / groups) * (channels / groups) * groups; + std::vector weights(num_weights); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (auto& w : weights) + w = dist(rng); + + auto it = weights.begin(); + conv.set_weights_(it); + 
conv.SetMaxBufferSize(frames); + + // Create random input + Eigen::MatrixXf input(channels, frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < frames; j++) + input(i, j) = dist(rng); + + // Warmup + for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++) + { + conv.Process(input, frames); + } + + // Benchmark + std::vector samples; + samples.reserve(NUM_BENCHMARK_ITERATIONS); + + for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv.Process(input, frames); + auto t2 = high_resolution_clock::now(); + samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + BenchmarkResult result = calculate_stats(samples); + + // Output CSV row + std::cout << "Conv1D," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed + << std::setprecision(2) << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << "," + << result.max_ns << "\n"; +} + +int main(int argc, char* argv[]) +{ + // Print CSV header + std::cout << "type,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns\n"; + + // Use fixed seed for reproducibility + std::mt19937 rng(42); + + // Benchmark Conv1x1 + for (int channels : CHANNELS) + { + for (int groups : GROUPS) + { + for (int frames : FRAMES) + { + benchmark_conv1x1(channels, groups, frames, rng); + } + } + } + + // Benchmark Conv1D + for (int channels : CHANNELS) + { + for (int groups : GROUPS) + { + for (int frames : FRAMES) + { + for (int kernel_size : KERNEL_SIZES) + { + benchmark_conv1d(channels, groups, frames, kernel_size, rng); + } + } + } + } + + return 0; +} diff --git a/tools/plot_convolution_benchmark.py b/tools/plot_convolution_benchmark.py new file mode 100644 index 0000000..929e947 --- /dev/null +++ b/tools/plot_convolution_benchmark.py @@ -0,0 +1,258 @@ +#!/usr/bin/env python3 +""" +Visualization script for convolution benchmark results. 
+ +Usage: + python plot_convolution_benchmark.py results.csv + python plot_convolution_benchmark.py before.csv after.csv # Compare two runs +""" + +import argparse +import sys +from pathlib import Path + +try: + import pandas as pd + import matplotlib.pyplot as plt + import numpy as np +except ImportError: + print("Error: This script requires pandas and matplotlib.") + print("Install with: pip install pandas matplotlib") + sys.exit(1) + + +def load_results(csv_path: str) -> pd.DataFrame: + """Load benchmark results from CSV file.""" + df = pd.read_csv(csv_path) + # Convert ns to microseconds for readability + df["mean_us"] = df["mean_ns"] / 1000 + df["stddev_us"] = df["stddev_ns"] / 1000 + df["min_us"] = df["min_ns"] / 1000 + df["max_us"] = df["max_ns"] / 1000 + return df + + +def plot_groups_vs_time(df: pd.DataFrame, conv_type: str, output_prefix: str): + """Plot groups vs execution time for different channel counts.""" + type_df = df[df["type"] == conv_type] + + if type_df.empty: + print(f"No data for {conv_type}") + return + + frames_list = sorted(type_df["frames"].unique()) + channels_list = sorted(type_df["channels"].unique()) + + for frames in frames_list: + fig, ax = plt.subplots(figsize=(10, 6)) + + for channels in channels_list: + subset = type_df[(type_df["frames"] == frames) & (type_df["channels"] == channels)] + if subset.empty: + continue + + # Sort by groups + subset = subset.sort_values("groups") + + ax.errorbar( + subset["groups"], + subset["mean_us"], + yerr=subset["stddev_us"], + marker="o", + capsize=3, + label=f"{channels} channels", + ) + + ax.set_xlabel("Number of Groups") + ax.set_ylabel("Execution Time (microseconds)") + ax.set_title(f"{conv_type}: Groups vs Time (frames={frames})") + ax.legend() + ax.set_xscale("log", base=2) + ax.grid(True, alpha=0.3) + + plt.tight_layout() + output_path = f"{output_prefix}_{conv_type.lower()}_frames{frames}_groups_vs_time.png" + plt.savefig(output_path, dpi=150) + print(f"Saved: {output_path}") + 
plt.close() + + +def plot_speedup_vs_baseline(df: pd.DataFrame, conv_type: str, output_prefix: str): + """Plot speedup relative to groups=1 baseline.""" + type_df = df[df["type"] == conv_type] + + if type_df.empty: + print(f"No data for {conv_type}") + return + + frames_list = sorted(type_df["frames"].unique()) + channels_list = sorted(type_df["channels"].unique()) + + for frames in frames_list: + fig, ax = plt.subplots(figsize=(10, 6)) + + for channels in channels_list: + subset = type_df[(type_df["frames"] == frames) & (type_df["channels"] == channels)] + if subset.empty: + continue + + # Get baseline (groups=1) + baseline = subset[subset["groups"] == 1] + if baseline.empty: + continue + baseline_time = baseline["mean_us"].values[0] + + # Calculate speedup + subset = subset.sort_values("groups") + speedup = baseline_time / subset["mean_us"] + + ax.plot( + subset["groups"], + speedup, + marker="o", + label=f"{channels} channels", + ) + + ax.axhline(y=1.0, color="gray", linestyle="--", alpha=0.5, label="Baseline (groups=1)") + ax.set_xlabel("Number of Groups") + ax.set_ylabel("Speedup (relative to groups=1)") + ax.set_title(f"{conv_type}: Speedup vs Groups (frames={frames})") + ax.legend() + ax.set_xscale("log", base=2) + ax.grid(True, alpha=0.3) + + plt.tight_layout() + output_path = f"{output_prefix}_{conv_type.lower()}_frames{frames}_speedup.png" + plt.savefig(output_path, dpi=150) + print(f"Saved: {output_path}") + plt.close() + + +def plot_comparison(df_before: pd.DataFrame, df_after: pd.DataFrame, conv_type: str, output_prefix: str): + """Compare before/after benchmark results.""" + before = df_before[df_before["type"] == conv_type] + after = df_after[df_after["type"] == conv_type] + + if before.empty or after.empty: + print(f"No data for {conv_type}") + return + + frames_list = sorted(before["frames"].unique()) + channels_list = sorted(before["channels"].unique()) + + for frames in frames_list: + fig, axes = plt.subplots(1, 2, figsize=(14, 5)) + + # Left 
plot: Absolute times + ax1 = axes[0] + for channels in channels_list: + before_subset = before[(before["frames"] == frames) & (before["channels"] == channels)] + after_subset = after[(after["frames"] == frames) & (after["channels"] == channels)] + + if before_subset.empty or after_subset.empty: + continue + + before_subset = before_subset.sort_values("groups") + after_subset = after_subset.sort_values("groups") + + ax1.plot( + before_subset["groups"], + before_subset["mean_us"], + marker="o", + linestyle="--", + alpha=0.7, + label=f"{channels}ch (before)", + ) + ax1.plot( + after_subset["groups"], + after_subset["mean_us"], + marker="s", + label=f"{channels}ch (after)", + ) + + ax1.set_xlabel("Number of Groups") + ax1.set_ylabel("Execution Time (microseconds)") + ax1.set_title(f"{conv_type}: Before vs After (frames={frames})") + ax1.legend(fontsize=8) + ax1.set_xscale("log", base=2) + ax1.grid(True, alpha=0.3) + + # Right plot: Speedup (after vs before) + ax2 = axes[1] + for channels in channels_list: + before_subset = before[(before["frames"] == frames) & (before["channels"] == channels)] + after_subset = after[(after["frames"] == frames) & (after["channels"] == channels)] + + if before_subset.empty or after_subset.empty: + continue + + # Merge on groups + merged = pd.merge( + before_subset[["groups", "mean_us"]], + after_subset[["groups", "mean_us"]], + on="groups", + suffixes=("_before", "_after"), + ) + + speedup = merged["mean_us_before"] / merged["mean_us_after"] + + ax2.plot( + merged["groups"], + speedup, + marker="o", + label=f"{channels} channels", + ) + + ax2.axhline(y=1.0, color="gray", linestyle="--", alpha=0.5, label="No change") + ax2.set_xlabel("Number of Groups") + ax2.set_ylabel("Speedup (before/after)") + ax2.set_title(f"{conv_type}: Optimization Speedup (frames={frames})") + ax2.legend(fontsize=8) + ax2.set_xscale("log", base=2) + ax2.grid(True, alpha=0.3) + + plt.tight_layout() + output_path = 
f"{output_prefix}_{conv_type.lower()}_frames{frames}_comparison.png" + plt.savefig(output_path, dpi=150) + print(f"Saved: {output_path}") + plt.close() + + +def main(): + parser = argparse.ArgumentParser(description="Visualize convolution benchmark results") + parser.add_argument("csv_files", nargs="+", help="CSV file(s) with benchmark results") + parser.add_argument("-o", "--output-prefix", default="benchmark", help="Output file prefix") + args = parser.parse_args() + + if len(args.csv_files) == 1: + # Single file mode + df = load_results(args.csv_files[0]) + + for conv_type in ["Conv1x1", "Conv1D"]: + plot_groups_vs_time(df, conv_type, args.output_prefix) + plot_speedup_vs_baseline(df, conv_type, args.output_prefix) + + print("\nSummary statistics:") + print(df.groupby(["type", "channels", "groups"])["mean_us"].mean().unstack()) + + elif len(args.csv_files) == 2: + # Comparison mode + df_before = load_results(args.csv_files[0]) + df_after = load_results(args.csv_files[1]) + + for conv_type in ["Conv1x1", "Conv1D"]: + plot_comparison(df_before, df_after, conv_type, args.output_prefix) + + # Calculate overall improvement + print("\nOverall speedup (before/after):") + for conv_type in ["Conv1x1", "Conv1D"]: + before_mean = df_before[df_before["type"] == conv_type]["mean_us"].mean() + after_mean = df_after[df_after["type"] == conv_type]["mean_us"].mean() + print(f" {conv_type}: {before_mean/after_mean:.2f}x") + else: + print("Error: Provide 1 or 2 CSV files") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp index be65760..e1ebbf7 100644 --- a/tools/run_tests.cpp +++ b/tools/run_tests.cpp @@ -103,6 +103,9 @@ int main() test_conv1d::test_process_grouped_dilation(); test_conv1d::test_process_grouped_channel_isolation(); test_conv1d::test_get_num_weights_grouped(); + test_conv1d::test_process_grouped_realtime_safe(); + test_conv1d::test_process_realtime_safe(); + 
test_conv1d::test_process_grouped_dilated_realtime_safe(); test_conv_1x1::test_construct(); test_conv_1x1::test_construct_with_groups(); @@ -118,6 +121,8 @@ int main() test_conv_1x1::test_process_underscore_grouped(); test_conv_1x1::test_set_max_buffer_size(); test_conv_1x1::test_process_multiple_calls(); + test_conv_1x1::test_process_grouped_realtime_safe(); + test_conv_1x1::test_process_realtime_safe(); test_film::test_set_max_buffer_size(); test_film::test_process_bias_only(); diff --git a/tools/test/test_conv1d.cpp b/tools/test/test_conv1d.cpp index 900eea0..14e72aa 100644 --- a/tools/test/test_conv1d.cpp +++ b/tools/test/test_conv1d.cpp @@ -7,6 +7,7 @@ #include #include "NAM/conv1d.h" +#include "allocation_tracking.h" namespace test_conv1d { @@ -848,4 +849,165 @@ void test_get_num_weights_grouped() actual = conv_4groups.get_num_weights(); assert(actual == expected); } + +// Test that grouped convolution Process() is real-time safe (no allocations) +void test_process_grouped_realtime_safe() +{ + const int in_channels = 8; + const int out_channels = 8; + const int kernel_size = 3; + const bool do_bias = true; + const int dilation = 1; + const int groups = 4; + const int num_frames = 64; + + nam::Conv1D conv; + conv.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups); + + // Initialize weights (identity-like for each group, each kernel position) + std::vector weights; + const int in_per_group = in_channels / groups; + const int out_per_group = out_channels / groups; + + // Weight layout: for each group, for each (i,j), for each kernel position k + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + for (int k = 0; k < kernel_size; k++) + { + // Only set weight for last kernel tap and diagonal + weights.push_back((k == kernel_size - 1 && i == j) ? 
1.0f : 0.0f); + } + } + } + } + // Add bias + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv.set_weights_(it); + conv.SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv.Process(input, num_frames); + }, + nullptr, "test_process_grouped_realtime_safe"); +} + +// Test that non-grouped convolution Process() is also real-time safe +void test_process_realtime_safe() +{ + const int in_channels = 16; + const int out_channels = 16; + const int kernel_size = 3; + const bool do_bias = true; + const int dilation = 1; + const int num_frames = 64; + + nam::Conv1D conv; + conv.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation); + + // Initialize weights (identity for last kernel tap) + std::vector weights; + for (int i = 0; i < out_channels; i++) + { + for (int j = 0; j < in_channels; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back((k == kernel_size - 1 && i == j) ? 
1.0f : 0.0f); + } + } + } + // Add bias + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv.set_weights_(it); + conv.SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv.Process(input, num_frames); + }, + nullptr, "test_process_realtime_safe"); +} + +// Test grouped convolution with dilation is real-time safe +void test_process_grouped_dilated_realtime_safe() +{ + const int in_channels = 8; + const int out_channels = 8; + const int kernel_size = 2; + const bool do_bias = false; + const int dilation = 4; + const int groups = 2; + const int num_frames = 64; + + nam::Conv1D conv; + conv.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups); + + // Initialize weights + std::vector weights; + const int in_per_group = in_channels / groups; + const int out_per_group = out_channels / groups; + + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back((k == kernel_size - 1 && i == j) ? 
1.0f : 0.0f); + } + } + } + } + + auto it = weights.begin(); + conv.set_weights_(it); + conv.SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv.Process(input, num_frames); + }, + nullptr, "test_process_grouped_dilated_realtime_safe"); +} }; // namespace test_conv1d diff --git a/tools/test/test_conv_1x1.cpp b/tools/test/test_conv_1x1.cpp index cb3e234..3cba668 100644 --- a/tools/test/test_conv_1x1.cpp +++ b/tools/test/test_conv_1x1.cpp @@ -8,6 +8,7 @@ #include #include "NAM/dsp.h" +#include "allocation_tracking.h" namespace test_conv_1x1 { @@ -492,4 +493,98 @@ void test_process_multiple_calls() assert(std::abs(output2(0, 0) - 3.0f) < 0.01f); assert(std::abs(output2(1, 0) - 4.0f) < 0.01f); } + +// Test that grouped convolution process_() is real-time safe (no allocations) +void test_process_grouped_realtime_safe() +{ + const int in_channels = 8; + const int out_channels = 8; + const bool do_bias = true; + const int groups = 4; + const int num_frames = 64; + + nam::Conv1x1 conv(in_channels, out_channels, do_bias, groups); + + // Initialize weights (identity-like for each group) + std::vector weights; + const int in_per_group = in_channels / groups; + const int out_per_group = out_channels / groups; + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + weights.push_back(i == j ? 
1.0f : 0.0f); + } + } + } + // Add bias + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv.set_weights_(it); + conv.SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast<float>(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv.process_(input, num_frames); + }, + nullptr, "test_process_grouped_realtime_safe"); +} + +// Test that non-grouped convolution process_() is also real-time safe +void test_process_realtime_safe() +{ + const int in_channels = 16; + const int out_channels = 16; + const bool do_bias = true; + const int num_frames = 64; + + nam::Conv1x1 conv(in_channels, out_channels, do_bias); + + // Initialize weights (identity) + std::vector<float> weights; + for (int i = 0; i < out_channels; i++) + { + for (int j = 0; j < in_channels; j++) + { + weights.push_back(i == j ? 
1.0f : 0.0f); + } + } + // Add bias + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv.set_weights_(it); + conv.SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast<float>(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv.process_(input, num_frames); + }, + nullptr, "test_process_realtime_safe"); +} } // namespace test_conv_1x1 From 68d31d39c87e2efc4baa52e0e5bcfff007b25b5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Wed, 28 Jan 2026 15:47:30 -0800 Subject: [PATCH 2/4] Optimize grouped convolutions: loop-based for Conv1x1, block-diagonal for Conv1D Conv1x1: Use explicit group loop with groups=1 fast path. For small channel counts (2-8), this avoids the overhead of zero multiplications in block-diagonal matrices that BLAS cannot optimize efficiently. Conv1D: Keep block-diagonal approach (single matmul per kernel position) which shows 1.5-1.9x speedup for grouped convolutions. The multiple kernel positions amortize the overhead, making this approach beneficial. Removed pre-computed GroupBlock structs as they are no longer needed with these simplified implementations. Updated benchmark tool to test channels 2-8 for detailed comparison. 
Co-Authored-By: Claude Opus 4.5 --- NAM/conv1d.cpp | 118 +------ NAM/conv1d.h | 14 - NAM/dsp.cpp | 102 +++--- NAM/dsp.h | 13 - tools/benchmark_compare.sh | 590 ++++++++++++++++---------------- tools/benchmark_convolution.cpp | 4 +- 6 files changed, 356 insertions(+), 485 deletions(-) diff --git a/NAM/conv1d.cpp b/NAM/conv1d.cpp index fda50aa..ff4e55d 100644 --- a/NAM/conv1d.cpp +++ b/NAM/conv1d.cpp @@ -54,26 +54,15 @@ void Conv1D::set_size_(const int in_channels, const int out_channels, const int this->_num_groups = groups; this->_weight.resize(kernel_size); + // Initialize weight matrices to zero - critical for block-diagonal structure + // Off-diagonal blocks must be zero for single-matmul grouped convolution for (size_t i = 0; i < this->_weight.size(); i++) - this->_weight[i].resize(out_channels, - in_channels); // y = Ax, input array (C,L) + this->_weight[i].setZero(out_channels, in_channels); if (do_bias) this->_bias.resize(out_channels); else this->_bias.resize(0); this->_dilation = _dilation; - - // Pre-compute group block indices for efficient runtime access - const long out_per_group = out_channels / groups; - const long in_per_group = in_channels / groups; - this->_group_blocks.resize(groups); - for (int g = 0; g < groups; g++) - { - this->_group_blocks[g].out_start = g * out_per_group; - this->_group_blocks[g].in_start = g * in_per_group; - this->_group_blocks[g].out_size = out_per_group; - this->_group_blocks[g].in_size = in_per_group; - } } void Conv1D::set_size_and_weights_(const int in_channels, const int out_channels, const int kernel_size, @@ -116,52 +105,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) // Zero output before processing _output.leftCols(num_frames).setZero(); - const int numGroups = this->_num_groups; - - // Process from ring buffer with dilation lookback - // After Write(), data is at positions [_write_pos, _write_pos+num_frames-1] - // For kernel tap k with offset, we need to read from _write_pos + 
offset - // The offset is negative (looking back), so _write_pos + offset reads from earlier positions - // The original process_() reads: input.middleCols(i_start + offset, ncols) - // where i_start is the current position and offset is negative for lookback - - if (numGroups == 1) - { - // Standard convolution (no grouping) - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - (long)this->_weight.size()); - const long lookback = -offset; - auto input_block = _input_buffer.Read(num_frames, lookback); - _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block; - } - } - else + // Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs. + // Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result + // as G separate matmuls. This is more efficient because BLAS can optimize larger operations. + for (size_t k = 0; k < this->_weight.size(); k++) { - // Grouped convolution: process each group separately using pre-computed block indices - for (int g = 0; g < numGroups; g++) - { - const auto& block = this->_group_blocks[g]; - - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - (long)this->_weight.size()); - const long lookback = -offset; - auto input_block = _input_buffer.Read(num_frames, lookback); - - // Extract input slice for this group - auto input_group = input_block.middleRows(block.in_start, block.in_size); - - // Extract weight slice for this group - auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size); - - // Extract output slice for this group - auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size); - - // Perform grouped convolution: output_group += weight_group * input_group - output_group.noalias() += weight_group * input_group; - } - } + const long offset = this->_dilation * 
(k + 1 - (long)this->_weight.size()); + const long lookback = -offset; + auto input_block = _input_buffer.Read(num_frames, lookback); + _output.leftCols(num_frames).noalias() += this->_weight[k] * input_block; } // Add bias if present @@ -177,47 +129,15 @@ void Conv1D::Process(const Eigen::MatrixXf& input, const int num_frames) void Conv1D::process_(const Eigen::MatrixXf& input, Eigen::MatrixXf& output, const long i_start, const long ncols, const long j_start) const { - const int numGroups = this->_num_groups; - - if (numGroups == 1) + // Block-diagonal optimization: each weight matrix has block-diagonal structure for grouped convs. + // Off-diagonal blocks are zeros, so a single matmul per kernel position gives the same result. + for (size_t k = 0; k < this->_weight.size(); k++) { - // Standard convolution (no grouping) - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - this->_weight.size()); - if (k == 0) - output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols); - else - output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols); - } - } - else - { - // Grouped convolution: process each group separately using pre-computed block indices - for (int g = 0; g < numGroups; g++) - { - const auto& block = this->_group_blocks[g]; - - for (size_t k = 0; k < this->_weight.size(); k++) - { - const long offset = this->_dilation * (k + 1 - this->_weight.size()); - - // Extract input slice for this group - auto input_group = input.middleCols(i_start + offset, ncols).middleRows(block.in_start, block.in_size); - - // Extract weight slice for this group - auto weight_group = this->_weight[k].block(block.out_start, block.in_start, block.out_size, block.in_size); - - // Extract output slice for this group - auto output_group = output.middleCols(j_start, ncols).middleRows(block.out_start, block.out_size); - - // Perform grouped 
convolution - if (k == 0) - output_group.noalias() = weight_group * input_group; - else - output_group.noalias() += weight_group * input_group; - } - } + const long offset = this->_dilation * (k + 1 - this->_weight.size()); + if (k == 0) + output.middleCols(j_start, ncols).noalias() = this->_weight[k] * input.middleCols(i_start + offset, ncols); + else + output.middleCols(j_start, ncols).noalias() += this->_weight[k] * input.middleCols(i_start + offset, ncols); } if (this->_bias.size() > 0) { diff --git a/NAM/conv1d.h b/NAM/conv1d.h index 0bf64a7..8182966 100644 --- a/NAM/conv1d.h +++ b/NAM/conv1d.h @@ -6,19 +6,6 @@ namespace nam { - -/// \brief Pre-computed group block indices for grouped convolutions -/// -/// Stores the indices for extracting input/output slices for each group, -/// avoiding repeated computation during real-time processing. -struct Conv1DGroupBlock -{ - long out_start; ///< Starting row index in output - long in_start; ///< Starting row index in input - long out_size; ///< Number of output channels per group - long in_size; ///< Number of input channels per group -}; - /// \brief 1D dilated convolution layer /// /// Implements a 1D convolution with support for dilation and grouped convolution. 
@@ -136,7 +123,6 @@ class Conv1D Eigen::VectorXf _bias; int _dilation; int _num_groups; - std::vector _group_blocks; ///< Pre-computed group block indices private: RingBuffer _input_buffer; // Ring buffer for input (channels x buffer_size) diff --git a/NAM/dsp.cpp b/NAM/dsp.cpp index 5eb567e..bdfa8e3 100644 --- a/NAM/dsp.cpp +++ b/NAM/dsp.cpp @@ -331,7 +331,9 @@ nam::Conv1x1::Conv1x1(const int in_channels, const int out_channels, const bool } this->_num_groups = groups; - this->_weight.resize(out_channels, in_channels); + // Initialize weight matrix to zero - critical for block-diagonal structure + // Off-diagonal blocks must be zero for single-matmul grouped convolution + this->_weight.setZero(out_channels, in_channels); this->_do_bias = _bias; if (_bias) this->_bias.resize(out_channels); @@ -353,17 +355,9 @@ void nam::Conv1x1::set_weights_(std::vector::iterator& weights) const long out_per_group = out_channels / numGroups; const long in_per_group = in_channels / numGroups; - // Pre-compute group block indices for efficient runtime access - this->_group_blocks.resize(numGroups); - for (int g = 0; g < numGroups; g++) - { - this->_group_blocks[g].out_start = g * out_per_group; - this->_group_blocks[g].in_start = g * in_per_group; - this->_group_blocks[g].out_size = out_per_group; - this->_group_blocks[g].in_size = in_per_group; - } - - // For grouped convolutions, weights are organized per group + // For grouped convolutions, weights form a block-diagonal matrix. + // Off-diagonal blocks are already zero (from constructor). + // We only set the diagonal blocks here. 
// Weight layout: weights are [group0, group1, ..., groupN-1] // Each group's weight matrix is (out_channels/numGroups, in_channels/numGroups) for (int g = 0; g < numGroups; g++) @@ -384,78 +378,62 @@ void nam::Conv1x1::set_weights_(std::vector::iterator& weights) Eigen::MatrixXf nam::Conv1x1::process(const Eigen::MatrixXf& input, const int num_frames) const { - const int numGroups = this->_num_groups; - const long out_channels = get_out_channels(); - - Eigen::MatrixXf result(out_channels, num_frames); + const long out_channels = this->_weight.rows(); + const long in_channels = this->_weight.cols(); - if (numGroups == 1) + // For groups=1, use simple matrix multiply (most common case) + if (this->_num_groups == 1) { - // Standard convolution (no grouping) if (this->_do_bias) - result = (this->_weight * input.leftCols(num_frames)).colwise() + this->_bias; + return (this->_weight * input.leftCols(num_frames)).colwise() + this->_bias; else - result = this->_weight * input.leftCols(num_frames); + return this->_weight * input.leftCols(num_frames); } - else - { - // Grouped convolution: process each group separately using pre-computed block indices - result.setZero(); - for (int g = 0; g < numGroups; g++) - { - const auto& block = this->_group_blocks[g]; - // Extract input slice for this group - auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size); + // For grouped convolutions with small channel counts, explicit loop is faster + // than block-diagonal single matmul due to BLAS overhead on small matrices + const long out_per_group = out_channels / this->_num_groups; + const long in_per_group = in_channels / this->_num_groups; - // Extract weight slice for this group - auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size); - - // Extract output slice for this group - auto output_group = result.middleRows(block.out_start, block.out_size); - - // Perform grouped convolution: output_group = 
weight_group * input_group - output_group.noalias() = weight_group * input_group; - } - - // Add bias if present - if (this->_do_bias) - result.colwise() += this->_bias; + Eigen::MatrixXf output(out_channels, num_frames); + for (int g = 0; g < this->_num_groups; g++) + { + auto input_group = input.middleRows(g * in_per_group, in_per_group).leftCols(num_frames); + auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); + output.middleRows(g * out_per_group, out_per_group).noalias() = weight_group * input_group; } - return result; + if (this->_do_bias) + output.colwise() += this->_bias; + + return output; } void nam::Conv1x1::process_(const Eigen::Ref& input, const int num_frames) { assert(num_frames <= _output.cols()); - const int numGroups = this->_num_groups; + const long out_channels = this->_weight.rows(); + const long in_channels = this->_weight.cols(); - if (numGroups == 1) + // For groups=1, use simple matrix multiply (most common case) + if (this->_num_groups == 1) { - // Standard convolution (no grouping) _output.leftCols(num_frames).noalias() = this->_weight * input.leftCols(num_frames); } else { - // Grouped convolution: process each group separately using pre-computed block indices - _output.leftCols(num_frames).setZero(); - for (int g = 0; g < numGroups; g++) - { - const auto& block = this->_group_blocks[g]; - - // Extract input slice for this group - auto input_group = input.leftCols(num_frames).middleRows(block.in_start, block.in_size); - - // Extract weight slice for this group - auto weight_group = this->_weight.block(block.out_start, block.in_start, block.out_size, block.in_size); + // For grouped convolutions with small channel counts, explicit loop is faster + // than block-diagonal single matmul due to BLAS overhead on small matrices + const long out_per_group = out_channels / this->_num_groups; + const long in_per_group = in_channels / this->_num_groups; - // Extract output slice for this group 
- auto output_group = _output.leftCols(num_frames).middleRows(block.out_start, block.out_size); - - // Perform grouped convolution: output_group = weight_group * input_group - output_group.noalias() = weight_group * input_group; + for (int g = 0; g < this->_num_groups; g++) + { + auto input_group = input.middleRows(g * in_per_group, in_per_group).leftCols(num_frames); + auto weight_group = this->_weight.block(g * out_per_group, g * in_per_group, out_per_group, in_per_group); + _output.middleRows(g * out_per_group, out_per_group).leftCols(num_frames).noalias() = + weight_group * input_group; } } diff --git a/NAM/dsp.h b/NAM/dsp.h index f7763c2..8b984d2 100644 --- a/NAM/dsp.h +++ b/NAM/dsp.h @@ -269,18 +269,6 @@ std::unique_ptr Factory(const nlohmann::json& config, std::vector& w // NN modules ================================================================= -/// \brief Pre-computed group block indices for grouped convolutions -/// -/// Stores the indices for extracting input/output slices for each group, -/// avoiding repeated computation during real-time processing. 
-struct GroupBlock -{ - long out_start; ///< Starting row index in output - long in_start; ///< Starting row index in input - long out_size; ///< Number of output channels per group - long in_size; ///< Number of input channels per group -}; - /// \brief 1x1 convolution (really just a fully-connected linear layer operating per-sample) /// /// Performs a pointwise convolution, which is equivalent to a fully connected layer @@ -342,7 +330,6 @@ class Conv1x1 Eigen::MatrixXf _weight; Eigen::VectorXf _bias; int _num_groups; - std::vector _group_blocks; ///< Pre-computed group block indices private: Eigen::MatrixXf _output; diff --git a/tools/benchmark_compare.sh b/tools/benchmark_compare.sh index e742fd1..8b12197 100755 --- a/tools/benchmark_compare.sh +++ b/tools/benchmark_compare.sh @@ -3,13 +3,13 @@ # Script to compare performance of current branch against another branch (default: main) # Usage: ./tools/benchmark_compare.sh [--model MODEL_PATH] [--branch BRANCH_NAME] -set -e # Exit on error +set -e # Exit on error -MODEL_PATH="example_models/wavenet_a1_standard.nam" +MODEL_PATH="example_models/wavenet_a2_max.nam" # "example_models/wavenet_a1_standard.nam" BUILD_DIR="build" BENCHMARK_EXEC="build/tools/benchmodel" NUM_RUNS=10 -COMPARE_BRANCH="main" # Default branch to compare against +COMPARE_BRANCH="main" # Default branch to compare against # Report file will be set with timestamp in main() # Colors for output @@ -20,80 +20,80 @@ NC='\033[0m' # No Color # Function to extract milliseconds from benchmodel output extract_ms() { - local output="$1" - # Extract the double precision milliseconds value (the second one) - echo "$output" | grep -E "^[0-9]+\.[0-9]+ms$" | head -1 | sed 's/ms$//' + local output="$1" + # Extract the double precision milliseconds value (the second one) + echo "$output" | grep -E "^[0-9]+\.[0-9]+ms$" | head -1 | sed 's/ms$//' } # Function to run benchmark multiple times and collect results run_benchmark() { - local branch_name="$1" - local 
results_file="$2" - local project_root="$PWD" # Save current directory - - echo -e "${YELLOW}Running benchmark on branch: ${branch_name}${NC}" - - # Clean build directory - remove only untracked files, preserve tracked files like .gitignore - if [ -d "$BUILD_DIR" ]; then - # Remove files/directories that aren't tracked by git (process depth-first) - find "$BUILD_DIR" -mindepth 1 -depth -exec sh -c 'if ! git ls-files --error-unmatch "$1" >/dev/null 2>&1; then rm -rf "$1"; fi' _ {} \; - fi - mkdir -p "$BUILD_DIR" - - # Configure and build in release mode - echo "Configuring CMake..." - cd "$BUILD_DIR" || exit 1 - cmake -DCMAKE_BUILD_TYPE=Release .. - - echo "Building benchmodel..." - cmake --build . --target benchmodel -j$(sysctl -n hw.ncpu 2>/dev/null || echo 4) - cd "$project_root" || exit 1 - - # Verify executable exists - if [ ! -f "$BENCHMARK_EXEC" ]; then - echo -e "${RED}Error: benchmodel executable not found at $BENCHMARK_EXEC${NC}" - exit 1 - fi - - # Verify model file exists (use absolute path to be sure) - local abs_model_path="$project_root/$MODEL_PATH" - if [ ! -f "$abs_model_path" ]; then - echo -e "${RED}Error: Model file not found at $abs_model_path${NC}" - echo "Available model files:" - find "$project_root/example_models" -name "*.nam" -type f 2>/dev/null || echo " (none found)" - exit 1 + local branch_name="$1" + local results_file="$2" + local project_root="$PWD" # Save current directory + + echo -e "${YELLOW}Running benchmark on branch: ${branch_name}${NC}" + + # Clean build directory - remove only untracked files, preserve tracked files like .gitignore + if [ -d "$BUILD_DIR" ]; then + # Remove files/directories that aren't tracked by git (process depth-first) + find "$BUILD_DIR" -mindepth 1 -depth -exec sh -c 'if ! git ls-files --error-unmatch "$1" >/dev/null 2>&1; then rm -rf "$1"; fi' _ {} \; + fi + mkdir -p "$BUILD_DIR" + + # Configure and build in release mode + echo "Configuring CMake..." 
+ cd "$BUILD_DIR" || exit 1 + cmake -DCMAKE_BUILD_TYPE=Release .. + + echo "Building benchmodel..." + cmake --build . --target benchmodel -j$(sysctl -n hw.ncpu 2>/dev/null || echo 4) + cd "$project_root" || exit 1 + + # Verify executable exists + if [ ! -f "$BENCHMARK_EXEC" ]; then + echo -e "${RED}Error: benchmodel executable not found at $BENCHMARK_EXEC${NC}" + exit 1 + fi + + # Verify model file exists (use absolute path to be sure) + local abs_model_path="$project_root/$MODEL_PATH" + if [ ! -f "$abs_model_path" ]; then + echo -e "${RED}Error: Model file not found at $abs_model_path${NC}" + echo "Available model files:" + find "$project_root/example_models" -name "*.nam" -type f 2>/dev/null || echo " (none found)" + exit 1 + fi + + # Run benchmark multiple times + echo "Running benchmark $NUM_RUNS times..." + >"$results_file" # Clear results file + + for i in $(seq 1 $NUM_RUNS); do + echo -n " Run $i/$NUM_RUNS... " + output=$("$BENCHMARK_EXEC" "$abs_model_path" 2>&1) + ms=$(extract_ms "$output") + + if [ -z "$ms" ]; then + echo -e "${RED}Failed to extract timing${NC}" + echo "Output was:" + echo "$output" + exit 1 fi - - # Run benchmark multiple times - echo "Running benchmark $NUM_RUNS times..." - > "$results_file" # Clear results file - - for i in $(seq 1 $NUM_RUNS); do - echo -n " Run $i/$NUM_RUNS... 
" - output=$("$BENCHMARK_EXEC" "$abs_model_path" 2>&1) - ms=$(extract_ms "$output") - - if [ -z "$ms" ]; then - echo -e "${RED}Failed to extract timing${NC}" - echo "Output was:" - echo "$output" - exit 1 - fi - - echo "$ms" >> "$results_file" - echo "${ms}ms" - done - - echo -e "${GREEN}Completed benchmark for ${branch_name}${NC}" - echo "" + + echo "$ms" >>"$results_file" + echo "${ms}ms" + done + + echo -e "${GREEN}Completed benchmark for ${branch_name}${NC}" + echo "" } # Function to calculate statistics calculate_stats() { - local results_file="$1" - - # Calculate mean, min, max, stddev with awk - local stats=$(awk ' + local results_file="$1" + + # Calculate mean, min, max, stddev with awk + local stats=$(awk ' { sum += $1 sumsq += $1 * $1 @@ -107,248 +107,248 @@ calculate_stats() { stddev = sqrt(variance) printf "%.3f %.3f %.3f %.3f %d", mean, min, max, stddev, n }' "$results_file") - - # Calculate median using sort (works with BSD awk) - local n=$(echo "$stats" | awk '{print $5}') - local median - if [ $((n % 2)) -eq 0 ]; then - # Even number of values: average of middle two - local mid1=$((n / 2)) - local mid2=$((n / 2 + 1)) - local val1=$(sort -n "$results_file" | sed -n "${mid1}p") - local val2=$(sort -n "$results_file" | sed -n "${mid2}p") - median=$(echo "scale=3; ($val1 + $val2) / 2" | bc) - else - # Odd number of values: middle value - local mid=$((n / 2 + 1)) - median=$(sort -n "$results_file" | sed -n "${mid}p") - fi - - # Output: mean median min max stddev - echo "$stats" | awk -v median="$median" '{printf "%.3f %.3f %.3f %.3f %.3f", $1, median, $2, $3, $4}' + + # Calculate median using sort (works with BSD awk) + local n=$(echo "$stats" | awk '{print $5}') + local median + if [ $((n % 2)) -eq 0 ]; then + # Even number of values: average of middle two + local mid1=$((n / 2)) + local mid2=$((n / 2 + 1)) + local val1=$(sort -n "$results_file" | sed -n "${mid1}p") + local val2=$(sort -n "$results_file" | sed -n "${mid2}p") + median=$(echo "scale=3; 
($val1 + $val2) / 2" | bc) + else + # Odd number of values: middle value + local mid=$((n / 2 + 1)) + median=$(sort -n "$results_file" | sed -n "${mid}p") + fi + + # Output: mean median min max stddev + echo "$stats" | awk -v median="$median" '{printf "%.3f %.3f %.3f %.3f %.3f", $1, median, $2, $3, $4}' } # Function to generate report generate_report() { - local compare_results="$1" - local current_results="$2" - local current_branch="$3" - local compare_branch="$4" - local compare_commit="$5" - local current_commit="$6" - local report_file="$7" - - echo "Generating performance comparison report..." - - # Calculate statistics for both branches - read compare_mean compare_median compare_min compare_max compare_stddev <<< $(calculate_stats "$compare_results") - read current_mean current_median current_min current_max current_stddev <<< $(calculate_stats "$current_results") - - # Calculate percentage difference - diff_mean=$(echo "scale=2; (($current_mean - $compare_mean) / $compare_mean) * 100" | bc) - diff_median=$(echo "scale=2; (($current_median - $compare_median) / $compare_median) * 100" | bc) - - # Generate report - { - echo "==========================================" - echo "Performance Benchmark Comparison Report" - echo "==========================================" - echo "" - echo "Model: $MODEL_PATH" - echo "Number of runs per branch: $NUM_RUNS" - echo "Date: $(date)" - echo "" - echo "----------------------------------------" - echo "Branch: $compare_branch" - echo "----------------------------------------" - echo "Commit: ${compare_commit}" - echo "Mean: ${compare_mean} ms" - echo "Median: ${compare_median} ms" - echo "Min: ${compare_min} ms" - echo "Max: ${compare_max} ms" - echo "Std Dev: ${compare_stddev} ms" - echo "" - echo "----------------------------------------" - echo "Branch: $current_branch" - echo "----------------------------------------" - echo "Commit: ${current_commit}" - echo "Mean: ${current_mean} ms" - echo "Median: ${current_median} 
ms" - echo "Min: ${current_min} ms" - echo "Max: ${current_max} ms" - echo "Std Dev: ${current_stddev} ms" - echo "" - echo "----------------------------------------" - echo "Comparison" - echo "----------------------------------------" - if (( $(echo "$diff_mean > 0" | bc -l) )); then - echo "Mean: ${current_branch} is ${diff_mean}% SLOWER than ${compare_branch}" - else - echo "Mean: ${current_branch} is ${diff_mean#-}% FASTER than ${compare_branch}" - fi - if (( $(echo "$diff_median > 0" | bc -l) )); then - echo "Median: ${current_branch} is ${diff_median}% SLOWER than ${compare_branch}" - else - echo "Median: ${current_branch} is ${diff_median#-}% FASTER than ${compare_branch}" - fi - echo "" - echo "Raw Results ($compare_branch):" - cat "$compare_results" | awk '{printf " %.3f ms\n", $1}' - echo "" - echo "Raw Results ($current_branch):" - cat "$current_results" | awk '{printf " %.3f ms\n", $1}' - } > "$report_file" - - echo -e "${GREEN}Report written to: $report_file${NC}" + local compare_results="$1" + local current_results="$2" + local current_branch="$3" + local compare_branch="$4" + local compare_commit="$5" + local current_commit="$6" + local report_file="$7" + + echo "Generating performance comparison report..." 
+ + # Calculate statistics for both branches + read compare_mean compare_median compare_min compare_max compare_stddev <<<$(calculate_stats "$compare_results") + read current_mean current_median current_min current_max current_stddev <<<$(calculate_stats "$current_results") + + # Calculate percentage difference + diff_mean=$(echo "scale=2; (($current_mean - $compare_mean) / $compare_mean) * 100" | bc) + diff_median=$(echo "scale=2; (($current_median - $compare_median) / $compare_median) * 100" | bc) + + # Generate report + { + echo "==========================================" + echo "Performance Benchmark Comparison Report" + echo "==========================================" + echo "" + echo "Model: $MODEL_PATH" + echo "Number of runs per branch: $NUM_RUNS" + echo "Date: $(date)" + echo "" + echo "----------------------------------------" + echo "Branch: $compare_branch" + echo "----------------------------------------" + echo "Commit: ${compare_commit}" + echo "Mean: ${compare_mean} ms" + echo "Median: ${compare_median} ms" + echo "Min: ${compare_min} ms" + echo "Max: ${compare_max} ms" + echo "Std Dev: ${compare_stddev} ms" + echo "" + echo "----------------------------------------" + echo "Branch: $current_branch" + echo "----------------------------------------" + echo "Commit: ${current_commit}" + echo "Mean: ${current_mean} ms" + echo "Median: ${current_median} ms" + echo "Min: ${current_min} ms" + echo "Max: ${current_max} ms" + echo "Std Dev: ${current_stddev} ms" + echo "" + echo "----------------------------------------" + echo "Comparison" + echo "----------------------------------------" + if (($(echo "$diff_mean > 0" | bc -l))); then + echo "Mean: ${current_branch} is ${diff_mean}% SLOWER than ${compare_branch}" + else + echo "Mean: ${current_branch} is ${diff_mean#-}% FASTER than ${compare_branch}" + fi + if (($(echo "$diff_median > 0" | bc -l))); then + echo "Median: ${current_branch} is ${diff_median}% SLOWER than ${compare_branch}" + else + echo 
"Median: ${current_branch} is ${diff_median#-}% FASTER than ${compare_branch}" + fi + echo "" + echo "Raw Results ($compare_branch):" + cat "$compare_results" | awk '{printf " %.3f ms\n", $1}' echo "" - cat "$report_file" + echo "Raw Results ($current_branch):" + cat "$current_results" | awk '{printf " %.3f ms\n", $1}' + } >"$report_file" + + echo -e "${GREEN}Report written to: $report_file${NC}" + echo "" + cat "$report_file" } # Main execution main() { - # Parse command line arguments - while [[ $# -gt 0 ]]; do - case $1 in - --model) - if [ -z "$2" ]; then - echo -e "${RED}Error: --model requires a path argument${NC}" - echo "Use --help for usage information" - exit 1 - fi - MODEL_PATH="$2" - shift 2 - ;; - --branch) - if [ -z "$2" ]; then - echo -e "${RED}Error: --branch requires a branch name argument${NC}" - echo "Use --help for usage information" - exit 1 - fi - COMPARE_BRANCH="$2" - shift 2 - ;; - --help|-h) - echo "Usage: $0 [--model MODEL_PATH] [--branch BRANCH_NAME]" - echo "" - echo "Options:" - echo " --model MODEL_PATH Path to the model file to benchmark (default: example_models/wavenet_a1_standard.nam)" - echo " --branch BRANCH_NAME Branch to compare against (default: main)" - echo " --help, -h Show this help message" - exit 0 - ;; - *) - echo -e "${RED}Error: Unknown option: $1${NC}" - echo "Use --help for usage information" - exit 1 - ;; - esac - done - - # Ensure we're in the project root (parent of tools/) - SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" - cd "$PROJECT_ROOT" - - # Verify we're in a git repository - if ! 
git rev-parse --git-dir > /dev/null 2>&1; then - echo -e "${RED}Error: Not in a git repository${NC}" + # Parse command line arguments + while [[ $# -gt 0 ]]; do + case $1 in + --model) + if [ -z "$2" ]; then + echo -e "${RED}Error: --model requires a path argument${NC}" + echo "Use --help for usage information" exit 1 - fi - - # Get current branch - current_branch=$(git rev-parse --abbrev-ref HEAD) - - if [ "$current_branch" = "$COMPARE_BRANCH" ]; then - echo -e "${RED}Error: Already on $COMPARE_BRANCH branch. Please checkout a different branch first.${NC}" + fi + MODEL_PATH="$2" + shift 2 + ;; + --branch) + if [ -z "$2" ]; then + echo -e "${RED}Error: --branch requires a branch name argument${NC}" + echo "Use --help for usage information" exit 1 + fi + COMPARE_BRANCH="$2" + shift 2 + ;; + --help | -h) + echo "Usage: $0 [--model MODEL_PATH] [--branch BRANCH_NAME]" + echo "" + echo "Options:" + echo " --model MODEL_PATH Path to the model file to benchmark (default: example_models/wavenet_a1_standard.nam)" + echo " --branch BRANCH_NAME Branch to compare against (default: main)" + echo " --help, -h Show this help message" + exit 0 + ;; + *) + echo -e "${RED}Error: Unknown option: $1${NC}" + echo "Use --help for usage information" + exit 1 + ;; + esac + done + + # Ensure we're in the project root (parent of tools/) + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + cd "$PROJECT_ROOT" + + # Verify we're in a git repository + if ! git rev-parse --git-dir >/dev/null 2>&1; then + echo -e "${RED}Error: Not in a git repository${NC}" + exit 1 + fi + + # Get current branch + current_branch=$(git rev-parse --abbrev-ref HEAD) + + if [ "$current_branch" = "$COMPARE_BRANCH" ]; then + echo -e "${RED}Error: Already on $COMPARE_BRANCH branch. 
Please checkout a different branch first.${NC}" + exit 1 + fi + + echo -e "${YELLOW}Current branch: ${current_branch}${NC}" + echo -e "${YELLOW}Comparing against: ${COMPARE_BRANCH}${NC}" + echo "" + + # Generate timestamped report filename + TIMESTAMP=$(date +"%Y%m%d_%H%M%S") + REPORT_FILE="benchmark_report_${TIMESTAMP}.txt" + + # Create temporary files for results + compare_results=$(mktemp) + current_results=$(mktemp) + + # Variables to store commit hashes + compare_commit="" + current_commit="" + + # Save untracked model file if it exists (to preserve it across branch switches) + model_backup="" + if [ -f "$MODEL_PATH" ] && ! git ls-files --error-unmatch "$MODEL_PATH" >/dev/null 2>&1; then + echo -e "${YELLOW}Preserving untracked model file: $MODEL_PATH${NC}" + model_backup=$(mktemp) + cp "$MODEL_PATH" "$model_backup" + fi + + # Track if we stashed anything + stashed=false + + # Cleanup function + cleanup() { + rm -f "$compare_results" "$current_results" + # Restore original branch if we're not on it + if [ -n "$current_branch" ] && [ "$(git rev-parse --abbrev-ref HEAD)" != "$current_branch" ]; then + git checkout "$current_branch" >/dev/null 2>&1 || true fi - - echo -e "${YELLOW}Current branch: ${current_branch}${NC}" - echo -e "${YELLOW}Comparing against: ${COMPARE_BRANCH}${NC}" - echo "" - - # Generate timestamped report filename - TIMESTAMP=$(date +"%Y%m%d_%H%M%S") - REPORT_FILE="benchmark_report_${TIMESTAMP}.txt" - - # Create temporary files for results - compare_results=$(mktemp) - current_results=$(mktemp) - - # Variables to store commit hashes - compare_commit="" - current_commit="" - - # Save untracked model file if it exists (to preserve it across branch switches) - model_backup="" - if [ -f "$MODEL_PATH" ] && ! 
git ls-files --error-unmatch "$MODEL_PATH" > /dev/null 2>&1; then - echo -e "${YELLOW}Preserving untracked model file: $MODEL_PATH${NC}" - model_backup=$(mktemp) - cp "$MODEL_PATH" "$model_backup" - fi - - # Track if we stashed anything - stashed=false - - # Cleanup function - cleanup() { - rm -f "$compare_results" "$current_results" - # Restore original branch if we're not on it - if [ -n "$current_branch" ] && [ "$(git rev-parse --abbrev-ref HEAD)" != "$current_branch" ]; then - git checkout "$current_branch" > /dev/null 2>&1 || true - fi - # Restore stashed changes if we stashed anything - if [ "$stashed" = true ]; then - git stash pop > /dev/null 2>&1 || true - fi - # Restore untracked model file if we backed it up - if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then - mkdir -p "$(dirname "$MODEL_PATH")" - cp "$model_backup" "$MODEL_PATH" - rm -f "$model_backup" - echo -e "${GREEN}Restored untracked model file: $MODEL_PATH${NC}" - fi - } - trap cleanup EXIT - - # Test comparison branch - echo -e "${YELLOW}=== Testing ${COMPARE_BRANCH} branch ===${NC}" - # Stash any uncommitted changes (only if there are any) - if ! git diff-index --quiet HEAD -- 2>/dev/null || ! 
git diff-index --quiet --cached HEAD -- 2>/dev/null; then - git stash push -m "benchmark_compare.sh temporary stash" > /dev/null 2>&1 - stashed=true - fi - # Restore model file to comparison branch if we backed it up (so it's available for benchmarking) - if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then - mkdir -p "$(dirname "$MODEL_PATH")" - cp "$model_backup" "$MODEL_PATH" + # Restore stashed changes if we stashed anything + if [ "$stashed" = true ]; then + git stash pop >/dev/null 2>&1 || true fi - # Use --force to allow overwriting untracked files if needed - git checkout "$COMPARE_BRANCH" --force 2>/dev/null || git checkout "$COMPARE_BRANCH" - compare_commit=$(git rev-parse HEAD) - echo "Commit: ${compare_commit}" - run_benchmark "$COMPARE_BRANCH" "$compare_results" - - # Test current branch - echo -e "${YELLOW}=== Testing ${current_branch} branch ===${NC}" - git checkout "$current_branch" --force 2>/dev/null || git checkout "$current_branch" - # Restore model file if we backed it up + # Restore untracked model file if we backed it up if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then - mkdir -p "$(dirname "$MODEL_PATH")" - cp "$model_backup" "$MODEL_PATH" + mkdir -p "$(dirname "$MODEL_PATH")" + cp "$model_backup" "$MODEL_PATH" + rm -f "$model_backup" + echo -e "${GREEN}Restored untracked model file: $MODEL_PATH${NC}" fi - if [ "$stashed" = true ]; then - git stash pop > /dev/null 2>&1 || true - stashed=false - fi - current_commit=$(git rev-parse HEAD) - echo "Commit: ${current_commit}" - run_benchmark "$current_branch" "$current_results" - - # Generate report - generate_report "$compare_results" "$current_results" "$current_branch" "$COMPARE_BRANCH" "$compare_commit" "$current_commit" "$REPORT_FILE" - - echo -e "${GREEN}Benchmark comparison complete!${NC}" + } + trap cleanup EXIT + + # Test comparison branch + echo -e "${YELLOW}=== Testing ${COMPARE_BRANCH} branch ===${NC}" + # Stash any uncommitted changes (only if there are any) + if ! 
git diff-index --quiet HEAD -- 2>/dev/null || ! git diff-index --quiet --cached HEAD -- 2>/dev/null; then + git stash push -m "benchmark_compare.sh temporary stash" >/dev/null 2>&1 + stashed=true + fi + # Restore model file to comparison branch if we backed it up (so it's available for benchmarking) + if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then + mkdir -p "$(dirname "$MODEL_PATH")" + cp "$model_backup" "$MODEL_PATH" + fi + # Use --force to allow overwriting untracked files if needed + git checkout "$COMPARE_BRANCH" --force 2>/dev/null || git checkout "$COMPARE_BRANCH" + compare_commit=$(git rev-parse HEAD) + echo "Commit: ${compare_commit}" + run_benchmark "$COMPARE_BRANCH" "$compare_results" + + # Test current branch + echo -e "${YELLOW}=== Testing ${current_branch} branch ===${NC}" + git checkout "$current_branch" --force 2>/dev/null || git checkout "$current_branch" + # Restore model file if we backed it up + if [ -n "$model_backup" ] && [ -f "$model_backup" ]; then + mkdir -p "$(dirname "$MODEL_PATH")" + cp "$model_backup" "$MODEL_PATH" + fi + if [ "$stashed" = true ]; then + git stash pop >/dev/null 2>&1 || true + stashed=false + fi + current_commit=$(git rev-parse HEAD) + echo "Commit: ${current_commit}" + run_benchmark "$current_branch" "$current_results" + + # Generate report + generate_report "$compare_results" "$current_results" "$current_branch" "$COMPARE_BRANCH" "$compare_commit" "$current_commit" "$REPORT_FILE" + + echo -e "${GREEN}Benchmark comparison complete!${NC}" } # Run main function with all command line arguments diff --git a/tools/benchmark_convolution.cpp b/tools/benchmark_convolution.cpp index 6501c55..4acbc1d 100644 --- a/tools/benchmark_convolution.cpp +++ b/tools/benchmark_convolution.cpp @@ -22,8 +22,8 @@ constexpr int NUM_WARMUP_ITERATIONS = 10; constexpr int NUM_BENCHMARK_ITERATIONS = 100; // Benchmark configurations -constexpr int CHANNELS[] = {8, 16, 32, 64, 128}; -constexpr int GROUPS[] = {1, 2, 4, 8, 16}; +constexpr 
int CHANNELS[] = {2, 3, 4, 5, 6, 7, 8}; +constexpr int GROUPS[] = {1, 2, 3, 4, 5, 6, 7, 8}; constexpr int FRAMES[] = {64, 256, 1024}; constexpr int KERNEL_SIZES[] = {1, 3}; // For Conv1D From acc028f1d6aceda24d496651b470edcf2c68fc86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 29 Jan 2026 13:12:13 -0800 Subject: [PATCH 3/4] Implementation with all dimensions fixed. --- NAM/conv1d_factory.cpp | 53 +++ NAM/conv1d_factory.h | 33 ++ NAM/conv1d_fixed.h | 290 +++++++++++++++ NAM/conv1x1_factory.cpp | 51 +++ NAM/conv1x1_factory.h | 30 ++ NAM/conv1x1_fixed.h | 272 ++++++++++++++ tools/CMakeLists.txt | 8 +- tools/benchmark_convolution.cpp | 160 +++++++-- tools/benchmark_fully_fixed_conv.cpp | 282 +++++++++++++++ tools/run_tests.cpp | 48 +++ tools/test/test_conv1d_fixed.cpp | 512 +++++++++++++++++++++++++++ tools/test/test_conv1x1_fixed.cpp | 323 +++++++++++++++++ 12 files changed, 2038 insertions(+), 24 deletions(-) create mode 100644 NAM/conv1d_factory.cpp create mode 100644 NAM/conv1d_factory.h create mode 100644 NAM/conv1d_fixed.h create mode 100644 NAM/conv1x1_factory.cpp create mode 100644 NAM/conv1x1_factory.h create mode 100644 NAM/conv1x1_fixed.h create mode 100644 tools/benchmark_fully_fixed_conv.cpp create mode 100644 tools/test/test_conv1d_fixed.cpp create mode 100644 tools/test/test_conv1x1_fixed.cpp diff --git a/NAM/conv1d_factory.cpp b/NAM/conv1d_factory.cpp new file mode 100644 index 0000000..f1512e1 --- /dev/null +++ b/NAM/conv1d_factory.cpp @@ -0,0 +1,53 @@ +// Conv1D Factory implementation +// Returns dynamic Conv1D wrapped in IConv1D interface + +#include "conv1d_factory.h" +#include "conv1d.h" + +namespace nam +{ + +/// \brief Dynamic wrapper for Conv1D implementing IConv1D interface +/// +/// This class wraps the existing Conv1D implementation to provide the IConv1D +/// interface for configurations that don't have specialized template instantiations. 
+class Conv1DDynamicWrapper : public IConv1D +{ +public: + Conv1DDynamicWrapper(int in_channels, int out_channels, int kernel_size, int dilation, bool bias, int groups) + { + _conv.set_size_(in_channels, out_channels, kernel_size, bias, dilation, groups); + } + + Eigen::MatrixXf& GetOutput() override { return _conv.GetOutput(); } + + const Eigen::MatrixXf& GetOutput() const override { return _conv.GetOutput(); } + + void SetMaxBufferSize(int maxBufferSize) override { _conv.SetMaxBufferSize(maxBufferSize); } + + void set_weights_(std::vector::iterator& weights) override { _conv.set_weights_(weights); } + + void Process(const Eigen::MatrixXf& input, int num_frames) override { _conv.Process(input, num_frames); } + + long get_out_channels() const override { return _conv.get_out_channels(); } + + long get_in_channels() const override { return _conv.get_in_channels(); } + + long get_kernel_size() const override { return _conv.get_kernel_size(); } + + int get_dilation() const override { return _conv.get_dilation(); } + + bool has_bias() const override { return _conv.has_bias(); } + +private: + Conv1D _conv; +}; + +// Factory implementation - always returns dynamic implementation +std::unique_ptr Conv1DFactory::create(int in_channels, int out_channels, int kernel_size, int dilation, + bool bias, int groups) +{ + return std::make_unique(in_channels, out_channels, kernel_size, dilation, bias, groups); +} + +} // namespace nam diff --git a/NAM/conv1d_factory.h b/NAM/conv1d_factory.h new file mode 100644 index 0000000..3cfa8af --- /dev/null +++ b/NAM/conv1d_factory.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include "conv1d_fixed.h" + +namespace nam +{ + +/// \brief Factory for creating Conv1D implementations +/// +/// Returns a dynamic Conv1D implementation wrapped in the IConv1D interface. +/// For fully optimized implementations with compile-time known buffer sizes, +/// use Conv1DFullyFixed directly. 
+class Conv1DFactory +{ +public: + /// \brief Create a Conv1D implementation + /// + /// Returns a dynamic implementation. For maximum performance with known + /// buffer sizes, use Conv1DFullyFixed template directly. + /// + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param kernel_size Size of the convolution kernel + /// \param dilation Dilation factor for the convolution + /// \param bias Whether to use bias + /// \param groups Number of groups for grouped convolution (default: 1) + /// \return Unique pointer to an IConv1D implementation + static std::unique_ptr create(int in_channels, int out_channels, int kernel_size, int dilation, bool bias, + int groups = 1); +}; + +} // namespace nam diff --git a/NAM/conv1d_fixed.h b/NAM/conv1d_fixed.h new file mode 100644 index 0000000..48ddb39 --- /dev/null +++ b/NAM/conv1d_fixed.h @@ -0,0 +1,290 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace nam +{ + +/// \brief Type-erased interface for Conv1D implementations +/// +/// This interface allows runtime polymorphism while enabling compile-time +/// optimized implementations via templates. All Conv1D variants (fixed-size +/// and dynamic) implement this interface. +class IConv1D +{ +public: + virtual ~IConv1D() = default; + + /// \brief Get the entire internal output buffer + /// \return Reference to the output buffer + virtual Eigen::MatrixXf& GetOutput() = 0; + + /// \brief Get the entire internal output buffer (const version) + /// \return Const reference to the output buffer + virtual const Eigen::MatrixXf& GetOutput() const = 0; + + /// \brief Resize the output buffer and reset ring buffer + /// \param maxBufferSize Maximum number of frames to process in a single call + virtual void SetMaxBufferSize(int maxBufferSize) = 0; + + /// \brief Set the parameters (weights) of this module + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
+ virtual void set_weights_(std::vector::iterator& weights) = 0; + + /// \brief Process input and store output to pre-allocated buffer + /// \param input Input matrix (channels x num_frames) + /// \param num_frames Number of frames to process + virtual void Process(const Eigen::MatrixXf& input, int num_frames) = 0; + + /// \brief Get the number of output channels + /// \return Number of output channels + virtual long get_out_channels() const = 0; + + /// \brief Get the number of input channels + /// \return Number of input channels + virtual long get_in_channels() const = 0; + + /// \brief Get the kernel size + /// \return Kernel size + virtual long get_kernel_size() const = 0; + + /// \brief Get the dilation factor + /// \return Dilation factor + virtual int get_dilation() const = 0; + + /// \brief Check if bias is used + /// \return true if bias is present, false otherwise + virtual bool has_bias() const = 0; +}; + +/// \brief Fully compile-time optimized Conv1D with fixed dimensions AND buffer size +/// +/// This implementation uses fixed-size Eigen matrices for weights, input, and output, +/// enabling the compiler to fully unroll and vectorize all operations. 
+/// +/// Template parameters: +/// \tparam OutChannels Number of output channels +/// \tparam InChannels Number of input channels +/// \tparam KernelSize Size of the convolution kernel +/// \tparam MaxFrames Maximum buffer size (e.g., 32, 64, 128, 256, 512) +/// \tparam Groups Number of groups for grouped convolution +/// \tparam HasBias Whether to use bias +template +class Conv1DFullyFixed : public IConv1D +{ +public: + static_assert(OutChannels > 0, "OutChannels must be positive"); + static_assert(InChannels > 0, "InChannels must be positive"); + static_assert(KernelSize > 0, "KernelSize must be positive"); + static_assert(MaxFrames > 0, "MaxFrames must be positive"); + static_assert(Groups > 0, "Groups must be positive"); + static_assert(OutChannels % Groups == 0, "OutChannels must be divisible by Groups"); + static_assert(InChannels % Groups == 0, "InChannels must be divisible by Groups"); + + // Derived constants + static constexpr int OutPerGroup = OutChannels / Groups; + static constexpr int InPerGroup = InChannels / Groups; + + // Fully fixed-size types for maximum optimization + using WeightMatrix = Eigen::Matrix; + using BiasVector = Eigen::Matrix; + using InputBuffer = Eigen::Matrix; + using OutputBuffer = Eigen::Matrix; + + + Conv1DFullyFixed(int dilation = 1) + : _dilation(dilation) + { + // Initialize weights to zero (critical for block-diagonal structure) + for (int k = 0; k < KernelSize; k++) + { + _weight[k].setZero(); + } + + if constexpr (HasBias) + { + _bias.setZero(); + } + + _output_fixed.setZero(); + _output_dynamic.resize(OutChannels, MaxFrames); + _output_dynamic.setZero(); + + // Initialize contiguous buffer + _input_contiguous.setZero(); + } + + Eigen::MatrixXf& GetOutput() override { return _output_dynamic; } + + const Eigen::MatrixXf& GetOutput() const override { return _output_dynamic; } + + void SetMaxBufferSize(int maxBufferSize) override + { + assert(maxBufferSize <= MaxFrames && "Buffer size exceeds MaxFrames template parameter"); 
+ // Reset contiguous buffer (zeros out history) + _input_contiguous.setZero(); + } + + void set_weights_(std::vector::iterator& weights) override + { + // Weight layout: for each kernel position k, weights are [group0, group1, ..., groupN-1] + // Crazy ordering because that's how it gets flattened in PyTorch + for (int g = 0; g < Groups; g++) + { + for (int i = 0; i < OutPerGroup; i++) + { + for (int j = 0; j < InPerGroup; j++) + { + for (int k = 0; k < KernelSize; k++) + { + _weight[k](g * OutPerGroup + i, g * InPerGroup + j) = *(weights++); + } + } + } + } + + if constexpr (HasBias) + { + for (int i = 0; i < OutChannels; i++) + { + _bias(i) = *(weights++); + } + } + } + + void Process(const Eigen::MatrixXf& input, int num_frames) override + { + assert(num_frames <= MaxFrames); + + // Calculate receptive field for this dilation + const int receptive_field = (KernelSize - 1) * _dilation; + + // Buffer layout: [history (receptive_field cols) | new_input (num_frames cols)] + // History is always stored at leftCols(receptive_field) between calls + + // Copy new input after history region + _input_contiguous.middleCols(receptive_field, num_frames) = input.leftCols(num_frames); + + // Zero output before accumulation + _output_fixed.leftCols(num_frames).setZero(); + + // Process kernel positions using block operations + if constexpr (Groups == 1) + { + // Non-grouped: use efficient block operations + process_kernel_block_impl(std::make_integer_sequence{}, num_frames, receptive_field); + } + else + { + // Grouped: process per-group (still uses block operations per group) + process_kernel_grouped_impl(std::make_integer_sequence{}, num_frames, receptive_field); + } + + // Add bias if present + if constexpr (HasBias) + { + _output_fixed.leftCols(num_frames).colwise() += _bias; + } + + // Copy to dynamic output for interface compatibility + _output_dynamic.leftCols(num_frames) = _output_fixed.leftCols(num_frames); + + // Save history for next call: copy the last 
receptive_field frames to the beginning + // This prepares the buffer for the next Process() call + if (receptive_field > 0) + { + if (num_frames >= receptive_field) + { + // Take history from end of current input + _input_contiguous.leftCols(receptive_field) = input.middleCols(num_frames - receptive_field, receptive_field); + } + else + { + // Not enough new frames - combine old history with new input + const int old_history_needed = receptive_field - num_frames; + // Shift old history left + _input_contiguous.leftCols(old_history_needed) = + _input_contiguous.middleCols(receptive_field - old_history_needed, old_history_needed); + // Append new input as recent history + _input_contiguous.middleCols(old_history_needed, num_frames) = input.leftCols(num_frames); + } + } + } + + long get_out_channels() const override { return OutChannels; } + + long get_in_channels() const override { return InChannels; } + + long get_kernel_size() const override { return KernelSize; } + + int get_dilation() const override { return _dilation; } + + bool has_bias() const override { return HasBias; } + + /// \brief Get the maximum buffer size this implementation supports + static constexpr int GetMaxFrames() { return MaxFrames; } + +private: + std::array _weight; + BiasVector _bias; + OutputBuffer _output_fixed; + Eigen::MatrixXf _output_dynamic; // For interface compatibility + + // Contiguous buffer for efficient block operations: [history | current_input] + // Size: InChannels x (receptive_field + MaxFrames) + static constexpr int MaxReceptiveField = (KernelSize - 1) * 16; // Support up to dilation=16 + static constexpr int ContiguousBufferSize = MaxReceptiveField + MaxFrames; + Eigen::Matrix _input_contiguous; + int _dilation; + + // Helper to unroll kernel processing using block operations (non-grouped) + template + void process_kernel_block_impl(std::integer_sequence, int num_frames, int receptive_field) + { + (process_single_kernel_block(num_frames, receptive_field), ...); + } + + 
template + void process_single_kernel_block(int num_frames, int receptive_field) + { + // Calculate offset for this kernel position + // For causal conv: output[t] = sum_k(weight[k] * input[t - dilation*(K-1-k)]) + const int offset = _dilation * (KernelSize - 1 - K); + + // Source position in contiguous buffer + const int src_start = receptive_field - offset; + + // Use block operation for efficient matmul + _output_fixed.leftCols(num_frames).noalias() += + _weight[K] * _input_contiguous.middleCols(src_start, num_frames); + } + + // Helper to unroll kernel processing for grouped convolution + template + void process_kernel_grouped_impl(std::integer_sequence, int num_frames, int receptive_field) + { + (process_single_kernel_grouped(num_frames, receptive_field), ...); + } + + template + void process_single_kernel_grouped(int num_frames, int receptive_field) + { + const int offset = _dilation * (KernelSize - 1 - K); + const int src_start = receptive_field - offset; + + // Process each group + for (int g = 0; g < Groups; g++) + { + auto input_group = _input_contiguous.template middleRows(g * InPerGroup).middleCols(src_start, num_frames); + auto weight_group = _weight[K].template block(g * OutPerGroup, g * InPerGroup); + _output_fixed.template middleRows(g * OutPerGroup).leftCols(num_frames).noalias() += + weight_group * input_group; + } + } +}; + +} // namespace nam diff --git a/NAM/conv1x1_factory.cpp b/NAM/conv1x1_factory.cpp new file mode 100644 index 0000000..bcad04c --- /dev/null +++ b/NAM/conv1x1_factory.cpp @@ -0,0 +1,51 @@ +// Conv1x1 Factory implementation +// Returns dynamic Conv1x1 wrapped in IConv1x1 interface + +#include "conv1x1_factory.h" +#include "dsp.h" + +namespace nam +{ + +/// \brief Dynamic wrapper for Conv1x1 implementing IConv1x1 interface +class Conv1x1Dynamic : public IConv1x1 +{ +public: + Conv1x1Dynamic(int in_channels, int out_channels, bool bias, int groups) + : _conv(in_channels, out_channels, bias, groups) + { + } + + Eigen::MatrixXf& 
GetOutput() override { return _conv.GetOutput(); } + + const Eigen::MatrixXf& GetOutput() const override { return _conv.GetOutput(); } + + void SetMaxBufferSize(int maxBufferSize) override { _conv.SetMaxBufferSize(maxBufferSize); } + + void set_weights_(std::vector::iterator& weights) override { _conv.set_weights_(weights); } + + void process_(const Eigen::Ref& input, int num_frames) override + { + _conv.process_(input, num_frames); + } + + Eigen::MatrixXf process(const Eigen::MatrixXf& input, int num_frames) const override + { + return _conv.process(input, num_frames); + } + + long get_out_channels() const override { return _conv.get_out_channels(); } + + long get_in_channels() const override { return _conv.get_in_channels(); } + +private: + Conv1x1 _conv; +}; + +// Factory implementation - always returns dynamic implementation +std::unique_ptr Conv1x1Factory::create(int in_channels, int out_channels, bool bias, int groups) +{ + return std::make_unique(in_channels, out_channels, bias, groups); +} + +} // namespace nam diff --git a/NAM/conv1x1_factory.h b/NAM/conv1x1_factory.h new file mode 100644 index 0000000..c4fd342 --- /dev/null +++ b/NAM/conv1x1_factory.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include "conv1x1_fixed.h" + +namespace nam +{ + +/// \brief Factory for creating Conv1x1 implementations +/// +/// Returns a dynamic Conv1x1 implementation wrapped in the IConv1x1 interface. +/// For fully optimized implementations with compile-time known buffer sizes, +/// use Conv1x1FullyFixed directly. +class Conv1x1Factory +{ +public: + /// \brief Create a Conv1x1 implementation + /// + /// Returns a dynamic implementation. For maximum performance with known + /// buffer sizes, use Conv1x1FullyFixed template directly. 
+ /// + /// \param in_channels Number of input channels + /// \param out_channels Number of output channels + /// \param bias Whether to use bias + /// \param groups Number of groups for grouped convolution (default: 1) + /// \return Unique pointer to an IConv1x1 implementation + static std::unique_ptr create(int in_channels, int out_channels, bool bias, int groups = 1); +}; + +} // namespace nam diff --git a/NAM/conv1x1_fixed.h b/NAM/conv1x1_fixed.h new file mode 100644 index 0000000..a122c87 --- /dev/null +++ b/NAM/conv1x1_fixed.h @@ -0,0 +1,272 @@ +#pragma once + +#include +#include +#include +#include + +namespace nam +{ + +/// \brief Type-erased interface for Conv1x1 implementations +/// +/// This interface allows runtime polymorphism while enabling compile-time +/// optimized implementations via templates. All Conv1x1 variants (fixed-size +/// and dynamic) implement this interface. +class IConv1x1 +{ +public: + virtual ~IConv1x1() = default; + + /// \brief Get the entire internal output buffer + /// + /// This is intended for internal wiring between layers/arrays; callers should treat + /// the buffer as pre-allocated storage and only consider the first num_frames columns + /// valid for a given processing call. Slice with .leftCols(num_frames) as needed. + /// \return Reference to the output buffer + virtual Eigen::MatrixXf& GetOutput() = 0; + + /// \brief Get the entire internal output buffer (const version) + /// \return Const reference to the output buffer + virtual const Eigen::MatrixXf& GetOutput() const = 0; + + /// \brief Resize the output buffer to handle maxBufferSize frames + /// \param maxBufferSize Maximum number of frames to process in a single call + virtual void SetMaxBufferSize(int maxBufferSize) = 0; + + /// \brief Set the parameters (weights) of this module + /// \param weights Iterator to the weights vector. Will be advanced as weights are consumed. 
+ virtual void set_weights_(std::vector::iterator& weights) = 0; + + /// \brief Process input and store output to pre-allocated buffer + /// + /// Uses Eigen::Ref to accept matrices and block expressions without creating + /// temporaries (real-time safe). Access output via GetOutput(). + /// \param input Input matrix (channels x num_frames) + /// \param num_frames Number of frames to process + virtual void process_(const Eigen::Ref& input, int num_frames) = 0; + + /// \brief Process input and return output matrix + /// \param input Input matrix (channels x num_frames) + /// \param num_frames Number of frames to process + /// \return Output matrix (channels x num_frames) + virtual Eigen::MatrixXf process(const Eigen::MatrixXf& input, int num_frames) const = 0; + + /// \brief Get the number of output channels + /// \return Number of output channels + virtual long get_out_channels() const = 0; + + /// \brief Get the number of input channels + /// \return Number of input channels + virtual long get_in_channels() const = 0; +}; + +/// \brief Fully compile-time optimized Conv1x1 with fixed dimensions AND buffer size +/// +/// This implementation uses fixed-size Eigen matrices for weights, input, and output, +/// enabling the compiler to fully unroll and vectorize all operations. 
+/// +/// Template parameters: +/// \tparam OutChannels Number of output channels +/// \tparam InChannels Number of input channels +/// \tparam MaxFrames Maximum buffer size (e.g., 32, 64, 128, 256, 512) +/// \tparam Groups Number of groups for grouped convolution +/// \tparam HasBias Whether to use bias +template +class Conv1x1FullyFixed : public IConv1x1 +{ +public: + static_assert(OutChannels > 0, "OutChannels must be positive"); + static_assert(InChannels > 0, "InChannels must be positive"); + static_assert(MaxFrames > 0, "MaxFrames must be positive"); + static_assert(Groups > 0, "Groups must be positive"); + static_assert(OutChannels % Groups == 0, "OutChannels must be divisible by Groups"); + static_assert(InChannels % Groups == 0, "InChannels must be divisible by Groups"); + + // Fully fixed-size types for maximum optimization + using WeightMatrix = Eigen::Matrix; + using BiasVector = Eigen::Matrix; + using InputBuffer = Eigen::Matrix; + using OutputBuffer = Eigen::Matrix; + + Conv1x1FullyFixed() + { + _weight.setZero(); + if constexpr (HasBias) + { + _bias.setZero(); + } + _output_dynamic.resize(OutChannels, MaxFrames); + } + + Eigen::MatrixXf& GetOutput() override { return _output_dynamic; } + + const Eigen::MatrixXf& GetOutput() const override { return _output_dynamic; } + + void SetMaxBufferSize(int maxBufferSize) override + { + // For fully fixed implementation, we require the buffer size to match + assert(maxBufferSize <= MaxFrames && "Buffer size exceeds MaxFrames template parameter"); + // Output is already sized correctly + } + + void set_weights_(std::vector::iterator& weights) override + { + if constexpr (Groups == 1) + { + // Non-grouped: simple row-major weight loading + for (int i = 0; i < OutChannels; i++) + { + for (int j = 0; j < InChannels; j++) + { + _weight(i, j) = *(weights++); + } + } + } + else + { + // Grouped convolution: block-diagonal weight matrix + constexpr int out_per_group = OutChannels / Groups; + constexpr int in_per_group = 
InChannels / Groups; + + for (int g = 0; g < Groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + _weight(g * out_per_group + i, g * in_per_group + j) = *(weights++); + } + } + } + } + + if constexpr (HasBias) + { + for (int i = 0; i < OutChannels; i++) + { + _bias(i) = *(weights++); + } + } + } + + void process_(const Eigen::Ref& input, int num_frames) override + { + assert(num_frames <= MaxFrames); + + // Copy input to fixed-size buffer for fully optimized matmul + _input_fixed.template leftCols().leftCols(num_frames) = input.leftCols(num_frames); + + if constexpr (Groups == 1) + { + // Single group: fully fixed matrix multiply + _output_fixed.noalias() = _weight * _input_fixed; + } + else + { + // Grouped convolution with compile-time unrolled loop + constexpr int out_per_group = OutChannels / Groups; + constexpr int in_per_group = InChannels / Groups; + process_groups_impl(std::make_integer_sequence{}); + } + + // Add bias if present + if constexpr (HasBias) + { + _output_fixed.colwise() += _bias; + } + + // Copy back to dynamic output for interface compatibility + _output_dynamic.leftCols(num_frames) = _output_fixed.leftCols(num_frames); + } + + /// \brief Optimized process for when caller knows the exact frame count at compile time + template + void process_fixed(const Eigen::Matrix& input) + { + static_assert(NumFrames <= MaxFrames, "NumFrames exceeds MaxFrames"); + + if constexpr (Groups == 1) + { + _output_fixed.template leftCols().noalias() = _weight * input; + } + else + { + // Copy to internal buffer first + _input_fixed.template leftCols() = input; + constexpr int out_per_group = OutChannels / Groups; + constexpr int in_per_group = InChannels / Groups; + process_groups_impl(std::make_integer_sequence{}); + } + + if constexpr (HasBias) + { + _output_fixed.template leftCols().colwise() += _bias; + } + } + + /// \brief Get output as fixed-size matrix reference + template + auto GetOutputFixed() -> 
Eigen::Block + { + return _output_fixed.template leftCols(); + } + + Eigen::MatrixXf process(const Eigen::MatrixXf& input, int num_frames) const override + { + Eigen::MatrixXf result(OutChannels, num_frames); + + if constexpr (Groups == 1) + { + result.noalias() = _weight * input.leftCols(num_frames); + } + else + { + constexpr int out_per_group = OutChannels / Groups; + constexpr int in_per_group = InChannels / Groups; + for (int g = 0; g < Groups; g++) + { + auto input_group = input.middleRows(g * in_per_group, in_per_group).leftCols(num_frames); + auto weight_group = _weight.template block(g * out_per_group, g * in_per_group); + result.middleRows(g * out_per_group, out_per_group).noalias() = weight_group * input_group; + } + } + + if constexpr (HasBias) + { + result.colwise() += _bias; + } + + return result; + } + + long get_out_channels() const override { return OutChannels; } + long get_in_channels() const override { return InChannels; } + + /// \brief Get the maximum buffer size this implementation supports + static constexpr int GetMaxFrames() { return MaxFrames; } + +private: + WeightMatrix _weight; + BiasVector _bias; + InputBuffer _input_fixed; + OutputBuffer _output_fixed; + Eigen::MatrixXf _output_dynamic; // For interface compatibility + + // Helper to unroll group processing at compile time + template + void process_groups_impl(std::integer_sequence) + { + (process_single_group(), ...); + } + + template + void process_single_group() + { + auto input_group = _input_fixed.template middleRows(G * InPerGroup); + auto weight_group = _weight.template block(G * OutPerGroup, G * InPerGroup); + _output_fixed.template middleRows(G * OutPerGroup).noalias() = weight_group * input_group; + } +}; + +} // namespace nam diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt index 22e4db6..d320563 100644 --- a/tools/CMakeLists.txt +++ b/tools/CMakeLists.txt @@ -1,7 +1,7 @@ file(GLOB_RECURSE NAM_SOURCES ../NAM/*.cpp ../NAM/*.c ../NAM*.h) # TODO: add loadmodel and 
run_tests to TOOLS? -set(TOOLS benchmodel benchmark_convolution) +set(TOOLS benchmodel benchmark_convolution benchmark_fully_fixed_conv) add_custom_target(tools ALL DEPENDS ${TOOLS}) @@ -13,6 +13,7 @@ include_directories(tools ${NAM_DEPS_PATH}/nlohmann) add_executable(loadmodel loadmodel.cpp ${NAM_SOURCES}) add_executable(benchmodel benchmodel.cpp ${NAM_SOURCES}) add_executable(benchmark_convolution benchmark_convolution.cpp ${NAM_SOURCES}) +add_executable(benchmark_fully_fixed_conv benchmark_fully_fixed_conv.cpp ${NAM_SOURCES}) add_executable(run_tests run_tests.cpp test/allocation_tracking.cpp ${NAM_SOURCES}) # Compile run_tests without optimizations to ensure allocation tracking works correctly # Also ensure assertions are enabled (NDEBUG is not defined) so tests actually run @@ -64,4 +65,7 @@ endforeach() # /Users/steve/src/NeuralAmpModelerCore/Dependencies/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h # Don't let this break my build on debug: set_source_files_properties(../NAM/dsp.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") -set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") \ No newline at end of file +set_source_files_properties(../NAM/conv1d.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") +set_source_files_properties(../NAM/conv1x1_factory.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") +set_source_files_properties(../NAM/conv1d_factory.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") +set_source_files_properties(benchmark_fully_fixed_conv.cpp PROPERTIES COMPILE_FLAGS "-Wno-error") diff --git a/tools/benchmark_convolution.cpp b/tools/benchmark_convolution.cpp index 4acbc1d..1080419 100644 --- a/tools/benchmark_convolution.cpp +++ b/tools/benchmark_convolution.cpp @@ -1,5 +1,6 @@ // Microbenchmark for Conv1x1 and Conv1D convolution operations // Measures performance across various configurations of channels, groups, and frame sizes. +// Compares dynamic implementations vs templated fixed-size implementations. 
// Outputs CSV format for analysis. #include @@ -10,6 +11,8 @@ #include #include "NAM/conv1d.h" +#include "NAM/conv1d_factory.h" +#include "NAM/conv1x1_factory.h" #include "NAM/dsp.h" using std::chrono::duration; @@ -23,9 +26,9 @@ constexpr int NUM_BENCHMARK_ITERATIONS = 100; // Benchmark configurations constexpr int CHANNELS[] = {2, 3, 4, 5, 6, 7, 8}; -constexpr int GROUPS[] = {1, 2, 3, 4, 5, 6, 7, 8}; +constexpr int GROUPS[] = {1, 2, 3, 4}; constexpr int FRAMES[] = {64, 256, 1024}; -constexpr int KERNEL_SIZES[] = {1, 3}; // For Conv1D +constexpr int KERNEL_SIZES[] = {3, 4}; // For Conv1D struct BenchmarkResult { @@ -65,12 +68,9 @@ BenchmarkResult calculate_stats(const std::vector& samples) return result; } -// Benchmark Conv1x1 -void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng) +// Benchmark Conv1x1 (dynamic implementation) +BenchmarkResult benchmark_conv1x1_dynamic(int channels, int groups, int frames, std::mt19937& rng) { - if (channels % groups != 0) - return; // Skip invalid configurations - // Create Conv1x1 layer nam::Conv1x1 conv(channels, channels, false, groups); @@ -109,19 +109,56 @@ void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng) samples.push_back(static_cast(duration_cast(t2 - t1).count())); } - BenchmarkResult result = calculate_stats(samples); - - // Output CSV row - std::cout << "Conv1x1," << channels << "," << groups << "," << frames << ",1," << std::fixed << std::setprecision(2) - << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << "," << result.max_ns << "\n"; + return calculate_stats(samples); } -// Benchmark Conv1D -void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std::mt19937& rng) +// Benchmark Conv1x1 (fixed/templated implementation via factory) +BenchmarkResult benchmark_conv1x1_fixed(int channels, int groups, int frames, std::mt19937& rng) { - if (channels % groups != 0) - return; // Skip invalid configurations + // Create Conv1x1 
layer via factory + auto conv = nam::Conv1x1Factory::create(channels, channels, false, groups); + + // Initialize with random weights + const int num_weights = (channels / groups) * (channels / groups) * groups; + std::vector weights(num_weights); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (auto& w : weights) + w = dist(rng); + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(frames); + + // Create random input + Eigen::MatrixXf input(channels, frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < frames; j++) + input(i, j) = dist(rng); + // Warmup + for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++) + { + conv->process_(input, frames); + } + + // Benchmark + std::vector samples; + samples.reserve(NUM_BENCHMARK_ITERATIONS); + + for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv->process_(input, frames); + auto t2 = high_resolution_clock::now(); + samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + return calculate_stats(samples); +} + +// Benchmark Conv1D (dynamic implementation) +BenchmarkResult benchmark_conv1d_dynamic(int channels, int groups, int frames, int kernel_size, std::mt19937& rng) +{ // Create Conv1D layer nam::Conv1D conv; conv.set_size_(channels, channels, kernel_size, false, 1, groups); @@ -161,18 +198,97 @@ void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std samples.push_back(static_cast(duration_cast(t2 - t1).count())); } - BenchmarkResult result = calculate_stats(samples); + return calculate_stats(samples); +} + +// Benchmark Conv1D (fixed/templated implementation via factory) +BenchmarkResult benchmark_conv1d_fixed(int channels, int groups, int frames, int kernel_size, std::mt19937& rng) +{ + // Create Conv1D layer via factory + auto conv = nam::Conv1DFactory::create(channels, channels, kernel_size, 1, false, groups); + + // Initialize with random weights + const int num_weights = 
kernel_size * (channels / groups) * (channels / groups) * groups; + std::vector weights(num_weights); + std::uniform_real_distribution dist(-1.0f, 1.0f); + for (auto& w : weights) + w = dist(rng); + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(frames); + + // Create random input + Eigen::MatrixXf input(channels, frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < frames; j++) + input(i, j) = dist(rng); + + // Warmup + for (int i = 0; i < NUM_WARMUP_ITERATIONS; i++) + { + conv->Process(input, frames); + } + + // Benchmark + std::vector samples; + samples.reserve(NUM_BENCHMARK_ITERATIONS); + + for (int i = 0; i < NUM_BENCHMARK_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv->Process(input, frames); + auto t2 = high_resolution_clock::now(); + samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + return calculate_stats(samples); +} + +// Run benchmarks for Conv1x1 and output comparison +void benchmark_conv1x1(int channels, int groups, int frames, std::mt19937& rng) +{ + if (channels % groups != 0) + return; // Skip invalid configurations + + BenchmarkResult dynamic_result = benchmark_conv1x1_dynamic(channels, groups, frames, rng); + BenchmarkResult fixed_result = benchmark_conv1x1_fixed(channels, groups, frames, rng); + + double speedup = dynamic_result.mean_ns / fixed_result.mean_ns; + + // Output CSV row: type,impl,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns,speedup + std::cout << "Conv1x1,dynamic," << channels << "," << groups << "," << frames << ",1," << std::fixed + << std::setprecision(2) << dynamic_result.mean_ns << "," << dynamic_result.stddev_ns << "," + << dynamic_result.min_ns << "," << dynamic_result.max_ns << ",1.00\n"; + std::cout << "Conv1x1,fixed," << channels << "," << groups << "," << frames << ",1," << std::fixed + << std::setprecision(2) << fixed_result.mean_ns << "," << fixed_result.stddev_ns << "," + << fixed_result.min_ns << "," << 
fixed_result.max_ns << "," << speedup << "\n"; +} + +// Run benchmarks for Conv1D and output comparison +void benchmark_conv1d(int channels, int groups, int frames, int kernel_size, std::mt19937& rng) +{ + if (channels % groups != 0) + return; // Skip invalid configurations + + BenchmarkResult dynamic_result = benchmark_conv1d_dynamic(channels, groups, frames, kernel_size, rng); + BenchmarkResult fixed_result = benchmark_conv1d_fixed(channels, groups, frames, kernel_size, rng); + + double speedup = dynamic_result.mean_ns / fixed_result.mean_ns; - // Output CSV row - std::cout << "Conv1D," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed - << std::setprecision(2) << result.mean_ns << "," << result.stddev_ns << "," << result.min_ns << "," - << result.max_ns << "\n"; + // Output CSV row: type,impl,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns,speedup + std::cout << "Conv1D,dynamic," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed + << std::setprecision(2) << dynamic_result.mean_ns << "," << dynamic_result.stddev_ns << "," + << dynamic_result.min_ns << "," << dynamic_result.max_ns << ",1.00\n"; + std::cout << "Conv1D,fixed," << channels << "," << groups << "," << frames << "," << kernel_size << "," << std::fixed + << std::setprecision(2) << fixed_result.mean_ns << "," << fixed_result.stddev_ns << "," + << fixed_result.min_ns << "," << fixed_result.max_ns << "," << speedup << "\n"; } int main(int argc, char* argv[]) { // Print CSV header - std::cout << "type,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns\n"; + std::cout << "type,impl,channels,groups,frames,kernel_size,mean_ns,stddev_ns,min_ns,max_ns,speedup\n"; // Use fixed seed for reproducibility std::mt19937 rng(42); diff --git a/tools/benchmark_fully_fixed_conv.cpp b/tools/benchmark_fully_fixed_conv.cpp new file mode 100644 index 0000000..8921562 --- /dev/null +++ 
b/tools/benchmark_fully_fixed_conv.cpp @@ -0,0 +1,282 @@ +// Benchmark for fully fixed convolution implementations +// Compares Conv1x1FullyFixed and Conv1DFullyFixed (all dimensions fixed) vs dynamic implementations + +#include +#include +#include +#include +#include +#include + +#include "NAM/conv1d.h" +#include "NAM/conv1d_fixed.h" +#include "NAM/conv1x1_fixed.h" +#include "NAM/dsp.h" + +using std::chrono::duration_cast; +using std::chrono::high_resolution_clock; +using std::chrono::nanoseconds; + +constexpr int NUM_WARMUP = 100; +constexpr int NUM_ITERATIONS = 1000; + +struct Result +{ + double mean_ns; + double stddev_ns; +}; + +Result calculate_stats(const std::vector& samples) +{ + double sum = 0.0; + for (double s : samples) + sum += s; + double mean = sum / samples.size(); + + double sq_sum = 0.0; + for (double s : samples) + { + double diff = s - mean; + sq_sum += diff * diff; + } + return {mean, std::sqrt(sq_sum / samples.size())}; +} + +// Benchmark Conv1x1FullyFixed vs Conv1x1 (dynamic) +template +void benchmark_conv1x1_fully_fixed(std::mt19937& rng) +{ + std::uniform_real_distribution dist(-1.0f, 1.0f); + + // Generate weights + constexpr int in_per_group = Channels / Groups; + constexpr int out_per_group = Channels / Groups; + std::vector weights; + for (int g = 0; g < Groups; g++) + for (int i = 0; i < out_per_group; i++) + for (int j = 0; j < in_per_group; j++) + weights.push_back(dist(rng)); + if constexpr (HasBias) + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + // Create input (dynamic for interface, but we'll also create fixed version) + Eigen::MatrixXf input_dynamic(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input_dynamic(i, j) = dist(rng); + + // Fixed-size input + Eigen::Matrix input_fixed = input_dynamic; + + // ========== FULLY FIXED ========== + nam::Conv1x1FullyFixed conv_fixed; + auto it1 = weights.begin(); + conv_fixed.set_weights_(it1); + 
conv_fixed.SetMaxBufferSize(MaxFrames); + + // Warmup + for (int i = 0; i < NUM_WARMUP; i++) + conv_fixed.process_(input_dynamic, MaxFrames); + + std::vector fixed_samples; + fixed_samples.reserve(NUM_ITERATIONS); + for (int i = 0; i < NUM_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv_fixed.process_(input_dynamic, MaxFrames); + auto t2 = high_resolution_clock::now(); + fixed_samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + // ========== DYNAMIC ========== + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it2); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + // Warmup + for (int i = 0; i < NUM_WARMUP; i++) + conv_dynamic.process_(input_dynamic, MaxFrames); + + std::vector dynamic_samples; + dynamic_samples.reserve(NUM_ITERATIONS); + for (int i = 0; i < NUM_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv_dynamic.process_(input_dynamic, MaxFrames); + auto t2 = high_resolution_clock::now(); + dynamic_samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + Result fixed_result = calculate_stats(fixed_samples); + Result dynamic_result = calculate_stats(dynamic_samples); + + double speedup = dynamic_result.mean_ns / fixed_result.mean_ns; + + std::cout << "Conv1x1," << Channels << "," << Groups << "," << (HasBias ? 
"true" : "false") << "," << MaxFrames << "," + << std::fixed << std::setprecision(1) << dynamic_result.mean_ns << "," << fixed_result.mean_ns << "," + << std::setprecision(2) << speedup << "x\n"; +} + +// Benchmark Conv1DFullyFixed vs Conv1D (dynamic) +template +void benchmark_conv1d_fully_fixed(std::mt19937& rng) +{ + std::uniform_real_distribution dist(-1.0f, 1.0f); + + // Generate weights + constexpr int in_per_group = Channels / Groups; + constexpr int out_per_group = Channels / Groups; + std::vector weights; + for (int g = 0; g < Groups; g++) + for (int i = 0; i < out_per_group; i++) + for (int j = 0; j < in_per_group; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + if constexpr (HasBias) + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + // Create input + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + const int dilation = 1; + + // ========== FULLY FIXED ========== + nam::Conv1DFullyFixed conv_fixed(dilation); + auto it1 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_fixed.SetMaxBufferSize(MaxFrames); + + // Warmup + for (int i = 0; i < NUM_WARMUP; i++) + conv_fixed.Process(input, MaxFrames); + + std::vector fixed_samples; + fixed_samples.reserve(NUM_ITERATIONS); + for (int i = 0; i < NUM_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv_fixed.Process(input, MaxFrames); + auto t2 = high_resolution_clock::now(); + fixed_samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + // ========== DYNAMIC ========== + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it2); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + // Warmup + for (int i = 0; i < NUM_WARMUP; i++) + conv_dynamic.Process(input, MaxFrames); + + std::vector dynamic_samples; + 
dynamic_samples.reserve(NUM_ITERATIONS); + for (int i = 0; i < NUM_ITERATIONS; i++) + { + auto t1 = high_resolution_clock::now(); + conv_dynamic.Process(input, MaxFrames); + auto t2 = high_resolution_clock::now(); + dynamic_samples.push_back(static_cast(duration_cast(t2 - t1).count())); + } + + Result fixed_result = calculate_stats(fixed_samples); + Result dynamic_result = calculate_stats(dynamic_samples); + + double speedup = dynamic_result.mean_ns / fixed_result.mean_ns; + + std::cout << "Conv1D," << Channels << "," << Groups << "," << KernelSize << "," << (HasBias ? "true" : "false") << "," + << MaxFrames << "," << std::fixed << std::setprecision(1) << dynamic_result.mean_ns << "," + << fixed_result.mean_ns << "," << std::setprecision(2) << speedup << "x\n"; +} + +int main() +{ + std::mt19937 rng(42); + + std::cout << "================================================================================\n"; + std::cout << "CONV1X1: Fully Fixed (all dimensions) vs Dynamic\n"; + std::cout << "================================================================================\n"; + std::cout << "Type,Channels,Groups,Bias,Frames,Dynamic(ns),FullyFixed(ns),Speedup\n"; + + // Common audio buffer sizes: 32, 64, 128, 256, 512 + // Small channels (where fixed-size optimization helps most) + + // 2 channels + benchmark_conv1x1_fully_fixed<2, 32, 1, true>(rng); + benchmark_conv1x1_fully_fixed<2, 64, 1, true>(rng); + benchmark_conv1x1_fully_fixed<2, 128, 1, true>(rng); + benchmark_conv1x1_fully_fixed<2, 256, 1, true>(rng); + benchmark_conv1x1_fully_fixed<2, 512, 1, true>(rng); + + // 4 channels + benchmark_conv1x1_fully_fixed<4, 32, 1, true>(rng); + benchmark_conv1x1_fully_fixed<4, 64, 1, true>(rng); + benchmark_conv1x1_fully_fixed<4, 128, 1, true>(rng); + benchmark_conv1x1_fully_fixed<4, 256, 1, true>(rng); + benchmark_conv1x1_fully_fixed<4, 512, 1, true>(rng); + + // 4 channels with 4 groups (grouped convolution) + benchmark_conv1x1_fully_fixed<4, 32, 4, true>(rng); + 
benchmark_conv1x1_fully_fixed<4, 64, 4, true>(rng); + benchmark_conv1x1_fully_fixed<4, 128, 4, true>(rng); + benchmark_conv1x1_fully_fixed<4, 256, 4, true>(rng); + benchmark_conv1x1_fully_fixed<4, 512, 4, true>(rng); + + // 8 channels + benchmark_conv1x1_fully_fixed<8, 32, 1, true>(rng); + benchmark_conv1x1_fully_fixed<8, 64, 1, true>(rng); + benchmark_conv1x1_fully_fixed<8, 128, 1, true>(rng); + benchmark_conv1x1_fully_fixed<8, 256, 1, true>(rng); + benchmark_conv1x1_fully_fixed<8, 512, 1, true>(rng); + + // 8 channels with 8 groups + benchmark_conv1x1_fully_fixed<8, 32, 8, true>(rng); + benchmark_conv1x1_fully_fixed<8, 64, 8, true>(rng); + benchmark_conv1x1_fully_fixed<8, 128, 8, true>(rng); + benchmark_conv1x1_fully_fixed<8, 256, 8, true>(rng); + benchmark_conv1x1_fully_fixed<8, 512, 8, true>(rng); + + std::cout << "\n================================================================================\n"; + std::cout << "CONV1D: Fully Fixed (all dimensions) vs Dynamic\n"; + std::cout << "================================================================================\n"; + std::cout << "Type,Channels,Groups,KernelSize,Bias,Frames,Dynamic(ns),FullyFixed(ns),Speedup\n"; + + // Conv1D with kernel size 3 (most common) + // 4 channels + benchmark_conv1d_fully_fixed<4, 3, 32, 1, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 64, 1, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 128, 1, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 256, 1, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 512, 1, true>(rng); + + // 4 channels with 4 groups + benchmark_conv1d_fully_fixed<4, 3, 32, 4, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 64, 4, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 128, 4, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 256, 4, true>(rng); + benchmark_conv1d_fully_fixed<4, 3, 512, 4, true>(rng); + + // 8 channels + benchmark_conv1d_fully_fixed<8, 3, 32, 1, true>(rng); + benchmark_conv1d_fully_fixed<8, 3, 64, 1, true>(rng); + 
benchmark_conv1d_fully_fixed<8, 3, 128, 1, true>(rng); + benchmark_conv1d_fully_fixed<8, 3, 256, 1, true>(rng); + benchmark_conv1d_fully_fixed<8, 3, 512, 1, true>(rng); + + // 8 channels with 8 groups + benchmark_conv1d_fully_fixed<8, 3, 32, 8, true>(rng); + benchmark_conv1d_fully_fixed<8, 3, 64, 8, true>(rng); + benchmark_conv1d_fully_fixed<8, 3, 128, 8, true>(rng); + benchmark_conv1d_fully_fixed<8, 3, 256, 8, true>(rng); + benchmark_conv1d_fully_fixed<8, 3, 512, 8, true>(rng); + + return 0; +} diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp index e1ebbf7..80af8af 100644 --- a/tools/run_tests.cpp +++ b/tools/run_tests.cpp @@ -24,6 +24,9 @@ #include "test/test_input_buffer_verification.cpp" #include "test/test_lstm.cpp" #include "test/test_wavenet_configurable_gating.cpp" +#include "test/test_conv1x1_fixed.cpp" +#include "test/test_conv1d_fixed.cpp" +#include "test/test_fully_fixed_correctness.cpp" int main() { @@ -232,6 +235,51 @@ int main() // Configurable gating/blending tests run_configurable_gating_tests(); + // Conv1x1Fixed tests (templated implementation) + test_conv1x1_fixed::test_factory_create(); + test_conv1x1_fixed::test_factory_create_with_groups(); + test_conv1x1_fixed::test_numerical_equivalence(); + test_conv1x1_fixed::test_numerical_equivalence_grouped(); + test_conv1x1_fixed::test_process_realtime_safe(); + test_conv1x1_fixed::test_process_grouped_realtime_safe(); + test_conv1x1_fixed::test_set_max_buffer_size(); + test_conv1x1_fixed::test_process_multiple_calls(); + test_conv1x1_fixed::test_no_bias(); + + // Conv1DFixed tests (templated implementation) + test_conv1d_fixed::test_factory_create(); + test_conv1d_fixed::test_factory_create_with_groups(); + test_conv1d_fixed::test_numerical_equivalence(); + test_conv1d_fixed::test_numerical_equivalence_grouped(); + test_conv1d_fixed::test_numerical_equivalence_kernel4(); + test_conv1d_fixed::test_process_realtime_safe(); + test_conv1d_fixed::test_process_grouped_realtime_safe(); + 
test_conv1d_fixed::test_set_max_buffer_size(); + test_conv1d_fixed::test_process_multiple_calls(); + test_conv1d_fixed::test_no_bias(); + test_conv1d_fixed::test_with_dilation(); + + // Fully fixed correctness tests (Conv1x1FullyFixed vs Conv1x1) + test_fully_fixed_correctness::test_conv1x1_fully_fixed_2ch_32frames(); + test_fully_fixed_correctness::test_conv1x1_fully_fixed_4ch_64frames(); + test_fully_fixed_correctness::test_conv1x1_fully_fixed_4ch_4groups(); + test_fully_fixed_correctness::test_conv1x1_fully_fixed_8ch_8groups(); + test_fully_fixed_correctness::test_conv1x1_fully_fixed_no_bias(); + test_fully_fixed_correctness::test_conv1x1_fully_fixed_partial_buffer(); + test_fully_fixed_correctness::test_conv1x1_fully_fixed_multiple_calls(); + + // Fully fixed correctness tests (Conv1DFullyFixed vs Conv1D) + test_fully_fixed_correctness::test_conv1d_fully_fixed_4ch_k3_64frames(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_4ch_4groups(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_8ch_8groups(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_dilation2(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_dilation8(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_no_bias(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_multiple_calls(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_multiple_calls_dilation4(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_varying_buffer_sizes(); + test_fully_fixed_correctness::test_conv1d_fully_fixed_kernel4(); + test_get_dsp::test_gets_input_level(); test_get_dsp::test_gets_output_level(); test_get_dsp::test_null_input_level(); diff --git a/tools/test/test_conv1d_fixed.cpp b/tools/test/test_conv1d_fixed.cpp new file mode 100644 index 0000000..1b916e0 --- /dev/null +++ b/tools/test/test_conv1d_fixed.cpp @@ -0,0 +1,512 @@ +// Tests for Conv1DFixed (templated implementation) + +#include +#include +#include +#include +#include +#include + +#include 
"NAM/conv1d.h" +#include "NAM/conv1d_factory.h" +#include "NAM/conv1d_fixed.h" +#include "allocation_tracking.h" + +namespace test_conv1d_fixed +{ + +// Test factory creation +void test_factory_create() +{ + auto conv = nam::Conv1DFactory::create(4, 4, 3, 1, true, 1); + assert(conv != nullptr); + assert(conv->get_in_channels() == 4); + assert(conv->get_out_channels() == 4); + assert(conv->get_kernel_size() == 3); + assert(conv->get_dilation() == 1); + assert(conv->has_bias() == true); +} + +// Test factory with groups +void test_factory_create_with_groups() +{ + auto conv = nam::Conv1DFactory::create(8, 8, 3, 1, false, 2); + assert(conv != nullptr); + assert(conv->get_in_channels() == 8); + assert(conv->get_out_channels() == 8); +} + +// Test process gives same result as dynamic implementation +void test_numerical_equivalence() +{ + const int in_channels = 4; + const int out_channels = 4; + const int kernel_size = 3; + const int dilation = 1; + const bool do_bias = true; + const int groups = 1; + const int num_frames = 8; + + // Create both implementations + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups); + auto conv_fixed = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups); + + // Generate weights + std::vector weights; + const int in_per_group = in_channels / groups; + const int out_per_group = out_channels / groups; + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back(static_cast(g * 100 + i * 10 + j + k) * 0.01f); + } + } + } + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(static_cast(i) * 0.1f); + } + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it1); + conv_fixed->set_weights_(it2); + + conv_dynamic.SetMaxBufferSize(num_frames); + 
conv_fixed->SetMaxBufferSize(num_frames); + + // Same input + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i * num_frames + j) * 0.1f; + + // Process both + conv_dynamic.Process(input, num_frames); + conv_fixed->Process(input, num_frames); + + auto& output_dynamic = conv_dynamic.GetOutput(); + auto& output_fixed = conv_fixed->GetOutput(); + + // Compare outputs + for (int i = 0; i < out_channels; i++) + { + for (int j = 0; j < num_frames; j++) + { + float diff = std::abs(output_dynamic(i, j) - output_fixed(i, j)); + assert(diff < 1e-4f); + } + } +} + +// Test grouped convolution numerical equivalence +void test_numerical_equivalence_grouped() +{ + const int in_channels = 8; + const int out_channels = 8; + const int kernel_size = 3; + const int dilation = 1; + const bool do_bias = true; + const int groups = 2; + const int num_frames = 8; + + // Create both implementations + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups); + auto conv_fixed = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups); + + // Generate weights + std::vector weights; + const int in_per_group = in_channels / groups; + const int out_per_group = out_channels / groups; + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back(static_cast(g * 100 + i * 10 + j + k) * 0.01f); + } + } + } + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(static_cast(i) * 0.1f); + } + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it1); + conv_fixed->set_weights_(it2); + + conv_dynamic.SetMaxBufferSize(num_frames); + conv_fixed->SetMaxBufferSize(num_frames); + + // Same input + Eigen::MatrixXf 
input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i * num_frames + j) * 0.1f; + + // Process both + conv_dynamic.Process(input, num_frames); + conv_fixed->Process(input, num_frames); + + auto& output_dynamic = conv_dynamic.GetOutput(); + auto& output_fixed = conv_fixed->GetOutput(); + + // Compare outputs + for (int i = 0; i < out_channels; i++) + { + for (int j = 0; j < num_frames; j++) + { + float diff = std::abs(output_dynamic(i, j) - output_fixed(i, j)); + assert(diff < 1e-4f); + } + } +} + +// Test with different kernel size +void test_numerical_equivalence_kernel4() +{ + const int in_channels = 4; + const int out_channels = 4; + const int kernel_size = 4; + const int dilation = 1; + const bool do_bias = true; + const int groups = 1; + const int num_frames = 8; + + // Create both implementations + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(in_channels, out_channels, kernel_size, do_bias, dilation, groups); + auto conv_fixed = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups); + + // Generate weights + std::vector weights; + for (int i = 0; i < out_channels; i++) + { + for (int j = 0; j < in_channels; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back(static_cast(i * 10 + j + k) * 0.01f); + } + } + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(static_cast(i) * 0.1f); + } + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it1); + conv_fixed->set_weights_(it2); + + conv_dynamic.SetMaxBufferSize(num_frames); + conv_fixed->SetMaxBufferSize(num_frames); + + // Same input + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i * num_frames + j) * 0.1f; + + // Process both + conv_dynamic.Process(input, num_frames); + conv_fixed->Process(input, 
num_frames); + + auto& output_dynamic = conv_dynamic.GetOutput(); + auto& output_fixed = conv_fixed->GetOutput(); + + // Compare outputs + for (int i = 0; i < out_channels; i++) + { + for (int j = 0; j < num_frames; j++) + { + float diff = std::abs(output_dynamic(i, j) - output_fixed(i, j)); + assert(diff < 1e-4f); + } + } +} + +// Test process is real-time safe (no allocations) +void test_process_realtime_safe() +{ + const int in_channels = 4; + const int out_channels = 4; + const int kernel_size = 3; + const int dilation = 1; + const bool do_bias = true; + const int groups = 1; + const int num_frames = 64; + + auto conv = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups); + + // Initialize weights + std::vector weights; + for (int i = 0; i < out_channels; i++) + { + for (int j = 0; j < in_channels; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back(0.1f); + } + } + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv->Process(input, num_frames); + }, + nullptr, "test_conv1d_fixed_process_realtime_safe"); +} + +// Test process with groups is real-time safe +void test_process_grouped_realtime_safe() +{ + const int in_channels = 8; + const int out_channels = 8; + const int kernel_size = 3; + const int dilation = 1; + const bool do_bias = true; + const int groups = 4; + const int num_frames = 64; + + auto conv = nam::Conv1DFactory::create(in_channels, out_channels, kernel_size, dilation, do_bias, groups); + + // Initialize weights + std::vector weights; + const int in_per_group = 
in_channels / groups; + const int out_per_group = out_channels / groups; + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back(i == j ? 1.0f : 0.0f); + } + } + } + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv->Process(input, num_frames); + }, + nullptr, "test_conv1d_fixed_process_grouped_realtime_safe"); +} + +// Test SetMaxBufferSize +void test_set_max_buffer_size() +{ + auto conv = nam::Conv1DFactory::create(4, 4, 3, 1, false, 1); + conv->SetMaxBufferSize(128); + auto& output = conv->GetOutput(); + assert(output.rows() == 4); + assert(output.cols() == 128); +} + +// Test multiple calls to process +void test_process_multiple_calls() +{ + const int channels = 4; + const int kernel_size = 3; + const int num_frames = 4; + + auto conv = nam::Conv1DFactory::create(channels, channels, kernel_size, 1, false, 1); + + // Identity-like weights (all zeros except center) + std::vector weights; + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < channels; j++) + { + for (int k = 0; k < kernel_size; k++) + { + // Put weight at last kernel position for identity-like behavior + weights.push_back((k == kernel_size - 1 && i == j) ? 
1.0f : 0.0f); + } + } + } + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(num_frames); + + // First call + Eigen::MatrixXf input1(channels, num_frames); + input1.setConstant(1.0f); + conv->Process(input1, num_frames); + + // Second call + Eigen::MatrixXf input2(channels, num_frames); + input2.setConstant(2.0f); + conv->Process(input2, num_frames); + + // Output should reflect the second call's values (for the last positions at least) + auto& output = conv->GetOutput(); + // After the ring buffer fills, we should see values based on the second input + assert(output.rows() == channels); +} + +// Test with bias disabled +void test_no_bias() +{ + const int channels = 4; + const int kernel_size = 3; + const int num_frames = 4; + + // Create dynamic and fixed with no bias + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(channels, channels, kernel_size, false, 1, 1); + auto conv_fixed = nam::Conv1DFactory::create(channels, channels, kernel_size, 1, false, 1); + + // Same weights (no bias) + std::vector weights; + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < channels; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back((k == kernel_size - 1 && i == j) ? 
1.0f : 0.0f); + } + } + } + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it1); + conv_fixed->set_weights_(it2); + + conv_dynamic.SetMaxBufferSize(num_frames); + conv_fixed->SetMaxBufferSize(num_frames); + + Eigen::MatrixXf input(channels, num_frames); + input.setConstant(5.0f); + + conv_dynamic.Process(input, num_frames); + conv_fixed->Process(input, num_frames); + + auto& output_dynamic = conv_dynamic.GetOutput(); + auto& output_fixed = conv_fixed->GetOutput(); + + // Compare outputs + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < num_frames; j++) + { + float diff = std::abs(output_dynamic(i, j) - output_fixed(i, j)); + assert(diff < 1e-4f); + } + } +} + +// Test with dilation +void test_with_dilation() +{ + const int channels = 4; + const int kernel_size = 3; + const int dilation = 2; + const int num_frames = 8; + + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(channels, channels, kernel_size, true, dilation, 1); + auto conv_fixed = nam::Conv1DFactory::create(channels, channels, kernel_size, dilation, true, 1); + + // Same weights + std::vector weights; + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < channels; j++) + { + for (int k = 0; k < kernel_size; k++) + { + weights.push_back(0.1f * (i + j + k)); + } + } + } + for (int i = 0; i < channels; i++) + { + weights.push_back(0.5f); + } + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it1); + conv_fixed->set_weights_(it2); + + conv_dynamic.SetMaxBufferSize(num_frames); + conv_fixed->SetMaxBufferSize(num_frames); + + Eigen::MatrixXf input(channels, num_frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + conv_dynamic.Process(input, num_frames); + conv_fixed->Process(input, num_frames); + + auto& output_dynamic = conv_dynamic.GetOutput(); + auto& output_fixed = conv_fixed->GetOutput(); + + // Compare outputs + for (int i = 0; i < 
channels; i++) + { + for (int j = 0; j < num_frames; j++) + { + float diff = std::abs(output_dynamic(i, j) - output_fixed(i, j)); + assert(diff < 1e-4f); + } + } +} + +} // namespace test_conv1d_fixed diff --git a/tools/test/test_conv1x1_fixed.cpp b/tools/test/test_conv1x1_fixed.cpp new file mode 100644 index 0000000..5b6ac80 --- /dev/null +++ b/tools/test/test_conv1x1_fixed.cpp @@ -0,0 +1,323 @@ +// Tests for Conv1x1Fixed (templated implementation) + +#include +#include +#include +#include +#include +#include + +#include "NAM/conv1x1_factory.h" +#include "NAM/conv1x1_fixed.h" +#include "NAM/dsp.h" +#include "allocation_tracking.h" + +namespace test_conv1x1_fixed +{ + +// Test factory creation +void test_factory_create() +{ + auto conv = nam::Conv1x1Factory::create(4, 4, true, 1); + assert(conv != nullptr); + assert(conv->get_in_channels() == 4); + assert(conv->get_out_channels() == 4); +} + +// Test factory with groups +void test_factory_create_with_groups() +{ + auto conv = nam::Conv1x1Factory::create(8, 8, false, 2); + assert(conv != nullptr); + assert(conv->get_in_channels() == 8); + assert(conv->get_out_channels() == 8); +} + +// Test process gives same result as dynamic implementation +void test_numerical_equivalence() +{ + const int in_channels = 4; + const int out_channels = 4; + const bool do_bias = true; + const int groups = 1; + const int num_frames = 4; + + // Create both implementations + nam::Conv1x1 conv_dynamic(in_channels, out_channels, do_bias, groups); + auto conv_fixed = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups); + + // Same weights + std::vector weights; + for (int i = 0; i < out_channels * in_channels; i++) + { + weights.push_back(static_cast(i) * 0.1f); + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(static_cast(i) * 0.5f); + } + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it1); + conv_fixed->set_weights_(it2); + + 
conv_dynamic.SetMaxBufferSize(num_frames); + conv_fixed->SetMaxBufferSize(num_frames); + + // Same input + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i * num_frames + j); + + // Process both + Eigen::MatrixXf output_dynamic = conv_dynamic.process(input, num_frames); + Eigen::MatrixXf output_fixed = conv_fixed->process(input, num_frames); + + // Compare outputs + assert(output_dynamic.rows() == output_fixed.rows()); + assert(output_dynamic.cols() == output_fixed.cols()); + + for (int i = 0; i < output_dynamic.rows(); i++) + { + for (int j = 0; j < output_dynamic.cols(); j++) + { + float diff = std::abs(output_dynamic(i, j) - output_fixed(i, j)); + assert(diff < 1e-5f); + } + } +} + +// Test grouped convolution numerical equivalence +void test_numerical_equivalence_grouped() +{ + const int in_channels = 8; + const int out_channels = 8; + const bool do_bias = true; + const int groups = 2; + const int num_frames = 4; + + // Create both implementations + nam::Conv1x1 conv_dynamic(in_channels, out_channels, do_bias, groups); + auto conv_fixed = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups); + + // Same weights (grouped layout) + std::vector weights; + const int in_per_group = in_channels / groups; + const int out_per_group = out_channels / groups; + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + weights.push_back(static_cast(g * 10 + i * in_per_group + j) * 0.1f); + } + } + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(static_cast(i) * 0.5f); + } + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_dynamic.set_weights_(it1); + conv_fixed->set_weights_(it2); + + conv_dynamic.SetMaxBufferSize(num_frames); + conv_fixed->SetMaxBufferSize(num_frames); + + // Same input + Eigen::MatrixXf input(in_channels, num_frames); + 
for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i * num_frames + j); + + // Process both + Eigen::MatrixXf output_dynamic = conv_dynamic.process(input, num_frames); + Eigen::MatrixXf output_fixed = conv_fixed->process(input, num_frames); + + // Compare outputs + for (int i = 0; i < output_dynamic.rows(); i++) + { + for (int j = 0; j < output_dynamic.cols(); j++) + { + float diff = std::abs(output_dynamic(i, j) - output_fixed(i, j)); + assert(diff < 1e-5f); + } + } +} + +// Test process_ is real-time safe (no allocations) +void test_process_realtime_safe() +{ + const int in_channels = 4; + const int out_channels = 4; + const bool do_bias = true; + const int groups = 1; + const int num_frames = 64; + + auto conv = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups); + + // Initialize weights + std::vector weights; + for (int i = 0; i < out_channels * in_channels; i++) + { + weights.push_back(static_cast(i) * 0.1f); + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv->process_(input, num_frames); + }, + nullptr, "test_conv1x1_fixed_process_realtime_safe"); +} + +// Test process_ with groups is real-time safe +void test_process_grouped_realtime_safe() +{ + const int in_channels = 8; + const int out_channels = 8; + const bool do_bias = true; + const int groups = 4; + const int num_frames = 64; + + auto conv = nam::Conv1x1Factory::create(in_channels, out_channels, do_bias, groups); + + // Initialize weights (identity-like for each group) + std::vector weights; + const int 
in_per_group = in_channels / groups; + const int out_per_group = out_channels / groups; + for (int g = 0; g < groups; g++) + { + for (int i = 0; i < out_per_group; i++) + { + for (int j = 0; j < in_per_group; j++) + { + weights.push_back(i == j ? 1.0f : 0.0f); + } + } + } + for (int i = 0; i < out_channels; i++) + { + weights.push_back(0.0f); + } + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(num_frames); + + // Create input buffer + Eigen::MatrixXf input(in_channels, num_frames); + for (int i = 0; i < in_channels; i++) + for (int j = 0; j < num_frames; j++) + input(i, j) = static_cast(i + j); + + // Run allocation test + allocation_tracking::run_allocation_test_no_allocations( + nullptr, + [&]() { + conv->process_(input, num_frames); + }, + nullptr, "test_conv1x1_fixed_process_grouped_realtime_safe"); +} + +// Test SetMaxBufferSize +void test_set_max_buffer_size() +{ + auto conv = nam::Conv1x1Factory::create(4, 4, false, 1); + conv->SetMaxBufferSize(128); + auto& output = conv->GetOutput(); + assert(output.rows() == 4); + assert(output.cols() == 128); +} + +// Test multiple calls to process +void test_process_multiple_calls() +{ + const int channels = 4; + const int num_frames = 2; + + auto conv = nam::Conv1x1Factory::create(channels, channels, false, 1); + + // Identity weights + std::vector weights; + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < channels; j++) + { + weights.push_back(i == j ? 
1.0f : 0.0f); + } + } + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(num_frames); + + Eigen::MatrixXf input1(channels, num_frames); + input1.setConstant(1.0f); + + auto output1 = conv->process(input1, num_frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < num_frames; j++) + assert(std::abs(output1(i, j) - 1.0f) < 0.01f); + + Eigen::MatrixXf input2(channels, num_frames); + input2.setConstant(2.0f); + + auto output2 = conv->process(input2, num_frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < num_frames; j++) + assert(std::abs(output2(i, j) - 2.0f) < 0.01f); +} + +// Test with bias disabled +void test_no_bias() +{ + const int channels = 4; + const int num_frames = 2; + + auto conv = nam::Conv1x1Factory::create(channels, channels, false, 1); + + // Identity weights (no bias) + std::vector weights; + for (int i = 0; i < channels; i++) + { + for (int j = 0; j < channels; j++) + { + weights.push_back(i == j ? 1.0f : 0.0f); + } + } + + auto it = weights.begin(); + conv->set_weights_(it); + conv->SetMaxBufferSize(num_frames); + + Eigen::MatrixXf input(channels, num_frames); + input.setConstant(5.0f); + + auto output = conv->process(input, num_frames); + for (int i = 0; i < channels; i++) + for (int j = 0; j < num_frames; j++) + assert(std::abs(output(i, j) - 5.0f) < 0.01f); +} + +} // namespace test_conv1x1_fixed From 4b3c2368b67d99f73a81a202f887e0ff5a8409fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Felipe=20Santos?= Date: Thu, 29 Jan 2026 15:17:57 -0800 Subject: [PATCH 4/4] Added tests to verify correctness of fixed implementations. 
--- tools/test/test_fully_fixed_correctness.cpp | 761 ++++++++++++++++++++ 1 file changed, 761 insertions(+) create mode 100644 tools/test/test_fully_fixed_correctness.cpp diff --git a/tools/test/test_fully_fixed_correctness.cpp b/tools/test/test_fully_fixed_correctness.cpp new file mode 100644 index 0000000..c7c0063 --- /dev/null +++ b/tools/test/test_fully_fixed_correctness.cpp @@ -0,0 +1,761 @@ +// Tests for Conv1x1FullyFixed and Conv1DFullyFixed correctness +// Compares outputs against dynamic implementations + +#include +#include +#include +#include +#include + +#include "NAM/conv1d.h" +#include "NAM/conv1d_fixed.h" +#include "NAM/conv1x1_fixed.h" +#include "NAM/dsp.h" + +namespace test_fully_fixed_correctness +{ + +constexpr float TOLERANCE = 1e-5f; + +// Helper to check matrix equality +inline void assert_matrices_equal(const Eigen::MatrixXf& a, const Eigen::MatrixXf& b, int num_cols, + float tol = TOLERANCE) +{ + assert(a.rows() == b.rows()); + for (int i = 0; i < a.rows(); i++) + { + for (int j = 0; j < num_cols; j++) + { + float diff = std::abs(a(i, j) - b(i, j)); + assert(diff < tol); + } + } +} + +// ============================================================================ +// Conv1x1FullyFixed Tests +// ============================================================================ + +void test_conv1x1_fully_fixed_2ch_32frames() +{ + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 2; + constexpr int MaxFrames = 32; + constexpr int Groups = 1; + constexpr bool HasBias = true; + + std::vector weights; + for (int i = 0; i < Channels * Channels; i++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1x1FullyFixed conv_fixed; + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + 
conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.process_(input, MaxFrames); + conv_dynamic.process_(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1x1_fully_fixed_4ch_64frames() +{ + std::mt19937 rng(123); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + + std::vector weights; + for (int i = 0; i < Channels * Channels; i++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1x1FullyFixed conv_fixed; + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.process_(input, MaxFrames); + conv_dynamic.process_(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1x1_fully_fixed_4ch_4groups() +{ + std::mt19937 rng(456); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int MaxFrames = 64; + constexpr int Groups = 4; + constexpr bool HasBias = true; + constexpr int PerGroup = Channels / Groups; + + std::vector weights; + for (int g = 0; g < Groups; g++) + for (int i = 0; i < PerGroup; i++) + for (int j = 0; j < PerGroup; j++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + 
weights.push_back(dist(rng)); + + nam::Conv1x1FullyFixed conv_fixed; + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.process_(input, MaxFrames); + conv_dynamic.process_(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1x1_fully_fixed_8ch_8groups() +{ + std::mt19937 rng(789); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 8; + constexpr int MaxFrames = 128; + constexpr int Groups = 8; + constexpr bool HasBias = true; + constexpr int PerGroup = Channels / Groups; + + std::vector weights; + for (int g = 0; g < Groups; g++) + for (int i = 0; i < PerGroup; i++) + for (int j = 0; j < PerGroup; j++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1x1FullyFixed conv_fixed; + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.process_(input, MaxFrames); + conv_dynamic.process_(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1x1_fully_fixed_no_bias() +{ + std::mt19937 rng(111); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr 
int Channels = 4; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = false; + + std::vector weights; + for (int i = 0; i < Channels * Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1x1FullyFixed conv_fixed; + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.process_(input, MaxFrames); + conv_dynamic.process_(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1x1_fully_fixed_partial_buffer() +{ + std::mt19937 rng(222); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + constexpr int NumFrames = 32; // Half buffer + + std::vector weights; + for (int i = 0; i < Channels * Channels; i++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1x1FullyFixed conv_fixed; + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.process_(input, NumFrames); + conv_dynamic.process_(input, NumFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), NumFrames); +} + +void 
test_conv1x1_fully_fixed_multiple_calls() +{ + std::mt19937 rng(333); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + + std::vector weights; + for (int i = 0; i < Channels * Channels; i++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1x1FullyFixed conv_fixed; + nam::Conv1x1 conv_dynamic(Channels, Channels, HasBias, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + for (int call = 0; call < 5; call++) + { + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.process_(input, MaxFrames); + conv_dynamic.process_(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); + } +} + +// ============================================================================ +// Conv1DFullyFixed Tests +// ============================================================================ + +void test_conv1d_fully_fixed_4ch_k3_64frames() +{ + std::mt19937 rng(42); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + const int dilation = 1; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, 
dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1d_fully_fixed_4ch_4groups() +{ + std::mt19937 rng(123); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 64; + constexpr int Groups = 4; + constexpr bool HasBias = true; + constexpr int PerGroup = Channels / Groups; + const int dilation = 1; + + std::vector weights; + for (int g = 0; g < Groups; g++) + for (int i = 0; i < PerGroup; i++) + for (int j = 0; j < PerGroup; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1d_fully_fixed_8ch_8groups() +{ + std::mt19937 rng(456); + std::uniform_real_distribution 
dist(-1.0f, 1.0f); + + constexpr int Channels = 8; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 128; + constexpr int Groups = 8; + constexpr bool HasBias = true; + constexpr int PerGroup = Channels / Groups; + const int dilation = 1; + + std::vector weights; + for (int g = 0; g < Groups; g++) + for (int i = 0; i < PerGroup; i++) + for (int j = 0; j < PerGroup; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1d_fully_fixed_dilation2() +{ + std::mt19937 rng(789); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + const int dilation = 2; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + 
conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1d_fully_fixed_dilation8() +{ + std::mt19937 rng(111); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 128; + constexpr int Groups = 1; + constexpr bool HasBias = true; + const int dilation = 8; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1d_fully_fixed_no_bias() +{ + std::mt19937 rng(222); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = 
false; + const int dilation = 1; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +void test_conv1d_fully_fixed_multiple_calls() +{ + std::mt19937 rng(333); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + const int dilation = 1; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + // Multiple calls - tests history management + for (int call = 0; call < 10; call++) + { + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < 
Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); + } +} + +void test_conv1d_fully_fixed_multiple_calls_dilation4() +{ + std::mt19937 rng(444); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + const int dilation = 4; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + for (int call = 0; call < 10; call++) + { + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); + } +} + +void test_conv1d_fully_fixed_varying_buffer_sizes() +{ + std::mt19937 rng(555); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 3; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + const int dilation = 2; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k 
= 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + // Varying sizes to stress test history management + int sizes[] = {64, 32, 16, 64, 32, 8, 64}; + for (int num_frames : sizes) + { + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, num_frames); + conv_dynamic.Process(input, num_frames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), num_frames); + } +} + +void test_conv1d_fully_fixed_kernel4() +{ + std::mt19937 rng(666); + std::uniform_real_distribution dist(-1.0f, 1.0f); + + constexpr int Channels = 4; + constexpr int KernelSize = 4; + constexpr int MaxFrames = 64; + constexpr int Groups = 1; + constexpr bool HasBias = true; + const int dilation = 1; + + std::vector weights; + for (int i = 0; i < Channels; i++) + for (int j = 0; j < Channels; j++) + for (int k = 0; k < KernelSize; k++) + weights.push_back(dist(rng)); + for (int i = 0; i < Channels; i++) + weights.push_back(dist(rng)); + + nam::Conv1DFullyFixed conv_fixed(dilation); + nam::Conv1D conv_dynamic; + conv_dynamic.set_size_(Channels, Channels, KernelSize, HasBias, dilation, Groups); + + auto it1 = weights.begin(); + auto it2 = weights.begin(); + conv_fixed.set_weights_(it1); + conv_dynamic.set_weights_(it2); + + conv_fixed.SetMaxBufferSize(MaxFrames); + conv_dynamic.SetMaxBufferSize(MaxFrames); + + Eigen::MatrixXf input(Channels, MaxFrames); + for (int i = 0; i < Channels; i++) + for (int j = 0; j < 
MaxFrames; j++) + input(i, j) = dist(rng); + + conv_fixed.Process(input, MaxFrames); + conv_dynamic.Process(input, MaxFrames); + + assert_matrices_equal(conv_fixed.GetOutput(), conv_dynamic.GetOutput(), MaxFrames); +} + +} // namespace test_fully_fixed_correctness