diff --git a/NAM/get_dsp.cpp b/NAM/get_dsp.cpp index 5505cdc..57d0fbd 100644 --- a/NAM/get_dsp.cpp +++ b/NAM/get_dsp.cpp @@ -133,7 +133,7 @@ std::unique_ptr get_dsp(const nlohmann::json& config, dspData& returnedConf returnedConfig.version = config["version"].get(); returnedConfig.architecture = config["architecture"].get(); returnedConfig.config = config_json; - returnedConfig.metadata = config["metadata"]; + returnedConfig.metadata = config.value("metadata", nlohmann::json()); returnedConfig.weights = weights; returnedConfig.expected_sample_rate = nam::get_sample_rate_from_nam_file(config); diff --git a/NAM/wavenet.cpp b/NAM/wavenet.cpp index 9717df9..6af3d85 100644 --- a/NAM/wavenet.cpp +++ b/NAM/wavenet.cpp @@ -214,22 +214,40 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma nam::wavenet::_LayerArray::_LayerArray( const int input_size, const int condition_size, const int head_size, const int channels, const int bottleneck, - const int kernel_size, const std::vector& dilations, const activations::ActivationConfig& activation_config, - const GatingMode gating_mode, const bool head_bias, const int groups_input, const int groups_input_mixin, - const int groups_1x1, const Head1x1Params& head1x1_params, - const activations::ActivationConfig& secondary_activation_config, const _FiLMParams& conv_pre_film_params, - const _FiLMParams& conv_post_film_params, const _FiLMParams& input_mixin_pre_film_params, - const _FiLMParams& input_mixin_post_film_params, const _FiLMParams& activation_pre_film_params, - const _FiLMParams& activation_post_film_params, const _FiLMParams& _1x1_post_film_params, - const _FiLMParams& head1x1_post_film_params) + const int kernel_size, const std::vector& dilations, + const std::vector& activation_configs, const std::vector& gating_modes, + const bool head_bias, const int groups_input, const int groups_input_mixin, const int groups_1x1, + const Head1x1Params& head1x1_params, const std::vector& secondary_activation_configs, + const _FiLMParams& conv_pre_film_params, const _FiLMParams& conv_post_film_params, + const _FiLMParams& input_mixin_pre_film_params, const _FiLMParams& input_mixin_post_film_params, + const _FiLMParams& activation_pre_film_params, const _FiLMParams& activation_post_film_params, + const _FiLMParams& _1x1_post_film_params, const _FiLMParams& head1x1_post_film_params) : _rechannel(input_size, channels, false) , _head_rechannel(head1x1_params.active ? head1x1_params.out_channels : bottleneck, head_size, head_bias) , _head_output_size(head1x1_params.active ? head1x1_params.out_channels : bottleneck) { + const size_t num_layers = dilations.size(); + if (activation_configs.size() != num_layers) + { + throw std::invalid_argument("_LayerArray: dilations size (" + std::to_string(num_layers) + + ") must match activation_configs size (" + std::to_string(activation_configs.size()) + + ")"); + } + if (gating_modes.size() != num_layers) + { + throw std::invalid_argument("_LayerArray: dilations size (" + std::to_string(num_layers) + + ") must match gating_modes size (" + std::to_string(gating_modes.size()) + ")"); + } + if (secondary_activation_configs.size() != num_layers) + { + throw std::invalid_argument("_LayerArray: dilations size (" + std::to_string(num_layers) + + ") must match secondary_activation_configs size (" + + std::to_string(secondary_activation_configs.size()) + ")"); + } for (size_t i = 0; i < dilations.size(); i++) this->_layers.push_back( - _Layer(condition_size, channels, bottleneck, kernel_size, dilations[i], activation_config, gating_mode, - groups_input, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_config, + _Layer(condition_size, channels, bottleneck, kernel_size, dilations[i], activation_configs[i], gating_modes[i], + groups_input, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_configs[i], conv_pre_film_params, conv_post_film_params, input_mixin_pre_film_params, input_mixin_post_film_params, activation_pre_film_params, activation_post_film_params, _1x1_post_film_params, head1x1_post_film_params)); } @@ -382,10 +400,10 @@ nam::wavenet::WaveNet::WaveNet(const int in_channels, this->_layer_arrays.push_back(nam::wavenet::_LayerArray( layer_array_params[i].input_size, layer_array_params[i].condition_size, layer_array_params[i].head_size, layer_array_params[i].channels, layer_array_params[i].bottleneck, layer_array_params[i].kernel_size, - layer_array_params[i].dilations, layer_array_params[i].activation_config, layer_array_params[i].gating_mode, + layer_array_params[i].dilations, layer_array_params[i].activation_configs, layer_array_params[i].gating_modes, layer_array_params[i].head_bias, layer_array_params[i].groups_input, layer_array_params[i].groups_input_mixin, layer_array_params[i].groups_1x1, layer_array_params[i].head1x1_params, - layer_array_params[i].secondary_activation_config, layer_array_params[i].conv_pre_film_params, + layer_array_params[i].secondary_activation_configs, layer_array_params[i].conv_pre_film_params, layer_array_params[i].conv_post_film_params, layer_array_params[i].input_mixin_pre_film_params, layer_array_params[i].input_mixin_post_film_params, layer_array_params[i].activation_pre_film_params, layer_array_params[i].activation_post_film_params, layer_array_params[i]._1x1_post_film_params, @@ -592,48 +610,160 @@ std::unique_ptr nam::wavenet::Factory(const nlohmann::json& config, st const int head_size = layer_config["head_size"]; const int kernel_size = layer_config["kernel_size"]; const auto dilations = layer_config["dilations"]; - // Parse JSON into typed ActivationConfig at model loading boundary - const activations::ActivationConfig activation_config = - activations::ActivationConfig::from_json(layer_config["activation"]); - // Parse gating mode - support both old "gated" boolean and new "gating_mode" string - GatingMode gating_mode = GatingMode::NONE; - activations::ActivationConfig secondary_activation_config; + const size_t num_layers = dilations.size(); - if (layer_config.find("gating_mode") != layer_config.end()) + // Parse activation config(s) - support both single config and array + std::vector activation_configs; + if (layer_config["activation"].is_array()) { - std::string gating_mode_str = layer_config["gating_mode"].get(); - if (gating_mode_str == "gated") + // Array of activation configs + for (const auto& activation_json : layer_config["activation"]) { - gating_mode = GatingMode::GATED; - secondary_activation_config = activations::ActivationConfig::from_json(layer_config["secondary_activation"]); + activation_configs.push_back(activations::ActivationConfig::from_json(activation_json)); } - else if (gating_mode_str == "blended") + if (activation_configs.size() != num_layers) { - gating_mode = GatingMode::BLENDED; - secondary_activation_config = activations::ActivationConfig::from_json(layer_config["secondary_activation"]); + throw std::runtime_error("Layer array " + std::to_string(i) + ": activation array size (" + + std::to_string(activation_configs.size()) + ") must match dilations size (" + + std::to_string(num_layers) + ")"); } + } + else + { + // Single activation config - duplicate it for all layers + const activations::ActivationConfig activation_config = + activations::ActivationConfig::from_json(layer_config["activation"]); + activation_configs.resize(num_layers, activation_config); + } + // Parse gating mode(s) - support both single value and array, and old "gated" boolean + std::vector gating_modes; + std::vector secondary_activation_configs; + + auto parse_gating_mode_str = [](const std::string& gating_mode_str) -> GatingMode { + if (gating_mode_str == "gated") + return GatingMode::GATED; + else if (gating_mode_str == "blended") + return GatingMode::BLENDED; else if (gating_mode_str == "none") + return GatingMode::NONE; + else + throw std::runtime_error("Invalid gating_mode: " + gating_mode_str); + }; + + if (layer_config.find("gating_mode") != layer_config.end()) + { + if (layer_config["gating_mode"].is_array()) { - gating_mode = GatingMode::NONE; - // Leave secondary_activation_config with empty type + // Array of gating modes + for (const auto& gating_mode_json : layer_config["gating_mode"]) + { + std::string gating_mode_str = gating_mode_json.get(); + GatingMode mode = parse_gating_mode_str(gating_mode_str); + gating_modes.push_back(mode); + + // Parse corresponding secondary activation if gating is enabled + if (mode != GatingMode::NONE) + { + if (layer_config.find("secondary_activation") != layer_config.end()) + { + if (layer_config["secondary_activation"].is_array()) + { + // Array of secondary activations - use corresponding index + if (gating_modes.size() > layer_config["secondary_activation"].size()) + { + throw std::runtime_error("Layer array " + std::to_string(i) + + ": secondary_activation array size must be at least " + + std::to_string(gating_modes.size())); + } + secondary_activation_configs.push_back(activations::ActivationConfig::from_json( + layer_config["secondary_activation"][gating_modes.size() - 1])); + } + else + { + // Single secondary activation - use for all gated layers + secondary_activation_configs.push_back( + activations::ActivationConfig::from_json(layer_config["secondary_activation"])); + } + } + else + { + // Default to Sigmoid for backward compatibility + secondary_activation_configs.push_back( + activations::ActivationConfig::simple(activations::ActivationType::Sigmoid)); + } + } + else + { + // NONE mode - use empty config + secondary_activation_configs.push_back(activations::ActivationConfig{}); + } + } + if (gating_modes.size() != num_layers) + { + throw std::runtime_error("Layer array " + std::to_string(i) + ": gating_mode array size (" + + std::to_string(gating_modes.size()) + ") must match dilations size (" + + std::to_string(num_layers) + ")"); + } + // Validate secondary_activation array size if it's an array + if (layer_config.find("secondary_activation") != layer_config.end() + && layer_config["secondary_activation"].is_array()) + { + if (layer_config["secondary_activation"].size() != num_layers) + { + throw std::runtime_error("Layer array " + std::to_string(i) + ": secondary_activation array size (" + + std::to_string(layer_config["secondary_activation"].size()) + + ") must match dilations size (" + std::to_string(num_layers) + ")"); + } + } } else - throw std::runtime_error("Invalid gating_mode: " + gating_mode_str); + { + // Single gating mode - duplicate for all layers + std::string gating_mode_str = layer_config["gating_mode"].get(); + GatingMode gating_mode = parse_gating_mode_str(gating_mode_str); + gating_modes.resize(num_layers, gating_mode); + + // Parse secondary activation + activations::ActivationConfig secondary_activation_config; + if (gating_mode != GatingMode::NONE) + { + if (layer_config.find("secondary_activation") != layer_config.end()) + { + secondary_activation_config = + activations::ActivationConfig::from_json(layer_config["secondary_activation"]); + } + else + { + // Default to Sigmoid for backward compatibility + secondary_activation_config = activations::ActivationConfig::simple(activations::ActivationType::Sigmoid); + } + } + secondary_activation_configs.resize(num_layers, secondary_activation_config); + } } else if (layer_config.find("gated") != layer_config.end()) { // Backward compatibility: convert old "gated" boolean to new enum bool gated = layer_config["gated"]; - gating_mode = gated ? GatingMode::GATED : GatingMode::NONE; + GatingMode gating_mode = gated ? GatingMode::GATED : GatingMode::NONE; + gating_modes.resize(num_layers, gating_mode); + if (gated) { - secondary_activation_config = activations::ActivationConfig::simple(activations::ActivationType::Sigmoid); + activations::ActivationConfig secondary_config = + activations::ActivationConfig::simple(activations::ActivationType::Sigmoid); + secondary_activation_configs.resize(num_layers, secondary_config); + } + else + { + secondary_activation_configs.resize(num_layers, activations::ActivationConfig{}); } - // else: leave secondary_activation_config uninitialized } else { - throw std::invalid_argument("No information on gating mode found for layer array " + std::to_string(i)); + // Default to NONE for all layers + gating_modes.resize(num_layers, GatingMode::NONE); + secondary_activation_configs.resize(num_layers, activations::ActivationConfig{}); } const bool head_bias = layer_config["head_bias"]; @@ -674,10 +804,11 @@ std::unique_ptr nam::wavenet::Factory(const nlohmann::json& config, st nam::wavenet::_FiLMParams head1x1_post_film_params = parse_film_params("head1x1_post_film"); layer_array_params.push_back(nam::wavenet::LayerArrayParams( - input_size, condition_size, head_size, channels, bottleneck, kernel_size, dilations, activation_config, - gating_mode, head_bias, groups, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_config, - conv_pre_film_params, conv_post_film_params, input_mixin_pre_film_params, input_mixin_post_film_params, - activation_pre_film_params, activation_post_film_params, _1x1_post_film_params, head1x1_post_film_params)); + input_size, condition_size, head_size, channels, bottleneck, kernel_size, dilations, + std::move(activation_configs), std::move(gating_modes), head_bias, groups, groups_input_mixin, groups_1x1, + head1x1_params, std::move(secondary_activation_configs), conv_pre_film_params, conv_post_film_params, + input_mixin_pre_film_params, input_mixin_post_film_params, activation_pre_film_params, + activation_post_film_params, _1x1_post_film_params, head1x1_post_film_params)); } const bool with_head = !config["head"].is_null(); const float head_scale = config["head_scale"]; diff --git a/NAM/wavenet.h b/NAM/wavenet.h index e290ef4..a2ce917 100644 --- a/NAM/wavenet.h +++ b/NAM/wavenet.h @@ -302,7 +302,7 @@ class _Layer /// \brief Parameters for constructing a LayerArray /// /// Contains all configuration needed to construct a _LayerArray with multiple layers -/// sharing the same channel count, kernel size, and activation configuration. +/// sharing the same channel count and kernel size. Each layer can have its own activation configuration. class LayerArrayParams { public: @@ -314,14 +314,14 @@ class LayerArrayParams /// \param bottleneck_ Bottleneck size (internal channel count) /// \param kernel_size_ Kernel size for dilated convolutions /// \param dilations_ Vector of dilation factors, one per layer - /// \param activation_ Primary activation configuration - /// \param gating_mode_ Gating mode for all layers + /// \param activation_configs_ Vector of primary activation configurations, one per layer + /// \param gating_modes_ Vector of gating modes, one per layer /// \param head_bias_ Whether to use bias in the head rechannel /// \param groups_input Number of groups for input convolutions /// \param groups_input_mixin_ Number of groups for input mixin convolutions /// \param groups_1x1_ Number of groups for 1x1 convolutions /// \param head1x1_params_ Parameters for optional head1x1 convolutions - /// \param secondary_activation_config_ Secondary activation for gating/blending + /// \param secondary_activation_configs_ Vector of secondary activation configs for gating/blending, one per layer /// \param conv_pre_film_params_ FiLM parameters before input convolutions /// \param conv_post_film_params_ FiLM parameters after input convolutions /// \param input_mixin_pre_film_params_ FiLM parameters before input mixin @@ -330,12 +330,14 @@ class LayerArrayParams /// \param activation_post_film_params_ FiLM parameters after activation /// \param _1x1_post_film_params_ FiLM parameters after 1x1 convolutions /// \param head1x1_post_film_params_ FiLM parameters after head1x1 convolutions + /// \throws std::invalid_argument If dilations, activation_configs, gating_modes, or secondary_activation_configs + /// sizes don't match LayerArrayParams(const int input_size_, const int condition_size_, const int head_size_, const int channels_, const int bottleneck_, const int kernel_size_, const std::vector&& dilations_, - const activations::ActivationConfig& activation_, const GatingMode gating_mode_, - const bool head_bias_, const int groups_input, const int groups_input_mixin_, const int groups_1x1_, - const Head1x1Params& head1x1_params_, - const activations::ActivationConfig& secondary_activation_config_, + const std::vector&& activation_configs_, + const std::vector&& gating_modes_, const bool head_bias_, const int groups_input, + const int groups_input_mixin_, const int groups_1x1_, const Head1x1Params& head1x1_params_, + const std::vector&& secondary_activation_configs_, const _FiLMParams& conv_pre_film_params_, const _FiLMParams& conv_post_film_params_, const _FiLMParams& input_mixin_pre_film_params_, const _FiLMParams& input_mixin_post_film_params_, const _FiLMParams& activation_pre_film_params_, const _FiLMParams& activation_post_film_params_, @@ -347,14 +349,14 @@ class LayerArrayParams , bottleneck(bottleneck_) , kernel_size(kernel_size_) , dilations(std::move(dilations_)) - , activation_config(activation_) - , gating_mode(gating_mode_) + , activation_configs(std::move(activation_configs_)) + , gating_modes(std::move(gating_modes_)) , head_bias(head_bias_) , groups_input(groups_input) , groups_input_mixin(groups_input_mixin_) , groups_1x1(groups_1x1_) , head1x1_params(head1x1_params_) - , secondary_activation_config(secondary_activation_config_) + , secondary_activation_configs(std::move(secondary_activation_configs_)) , conv_pre_film_params(conv_pre_film_params_) , conv_post_film_params(conv_post_film_params_) , input_mixin_pre_film_params(input_mixin_pre_film_params_) @@ -364,6 +366,24 @@ class LayerArrayParams , _1x1_post_film_params(_1x1_post_film_params_) , head1x1_post_film_params(head1x1_post_film_params_) { + const size_t num_layers = dilations.size(); + if (activation_configs.size() != num_layers) + { + throw std::invalid_argument("LayerArrayParams: dilations size (" + std::to_string(num_layers) + + ") must match activation_configs size (" + std::to_string(activation_configs.size()) + + ")"); + } + if (gating_modes.size() != num_layers) + { + throw std::invalid_argument("LayerArrayParams: dilations size (" + std::to_string(num_layers) + + ") must match gating_modes size (" + std::to_string(gating_modes.size()) + ")"); + } + if (secondary_activation_configs.size() != num_layers) + { + throw std::invalid_argument("LayerArrayParams: dilations size (" + std::to_string(num_layers) + + ") must match secondary_activation_configs size (" + + std::to_string(secondary_activation_configs.size()) + ")"); + } } const int input_size; ///< Input size (number of channels) @@ -373,14 +393,15 @@ class LayerArrayParams const int bottleneck; ///< Bottleneck size (internal channel count) const int kernel_size; ///< Kernel size for dilated convolutions std::vector dilations; ///< Dilation factors, one per layer - const activations::ActivationConfig activation_config; ///< Primary activation configuration - const GatingMode gating_mode; ///< Gating mode for all layers + std::vector activation_configs; ///< Primary activation configurations, one per layer + std::vector gating_modes; ///< Gating modes, one per layer const bool head_bias; ///< Whether to use bias in head rechannel const int groups_input; ///< Number of groups for input convolutions const int groups_input_mixin; ///< Number of groups for input mixin const int groups_1x1; ///< Number of groups for 1x1 convolutions const Head1x1Params head1x1_params; ///< Parameters for optional head1x1 - const activations::ActivationConfig secondary_activation_config; ///< Secondary activation for gating/blending + std::vector + secondary_activation_configs; ///< Secondary activation configs for gating/blending, one per layer const _FiLMParams conv_pre_film_params; ///< FiLM params before input conv const _FiLMParams conv_post_film_params; ///< FiLM params after input conv const _FiLMParams input_mixin_pre_film_params; ///< FiLM params before input mixin @@ -414,14 +435,14 @@ class _LayerArray /// \param bottleneck Bottleneck size (internal channel count) /// \param kernel_size Kernel size for dilated convolutions /// \param dilations Vector of dilation factors, one per layer - /// \param activation_config Primary activation configuration - /// \param gating_mode Gating mode for all layers + /// \param activation_configs Vector of primary activation configurations, one per layer + /// \param gating_modes Vector of gating modes, one per layer /// \param head_bias Whether to use bias in the head rechannel /// \param groups_input Number of groups for input convolutions /// \param groups_input_mixin Number of groups for input mixin /// \param groups_1x1 Number of groups for 1x1 convolutions /// \param head1x1_params Parameters for optional head1x1 convolutions - /// \param secondary_activation_config Secondary activation for gating/blending + /// \param secondary_activation_configs Vector of secondary activation configs for gating/blending, one per layer /// \param conv_pre_film_params FiLM parameters before input convolutions /// \param conv_post_film_params FiLM parameters after input convolutions /// \param input_mixin_pre_film_params FiLM parameters before input mixin @@ -432,9 +453,10 @@ class _LayerArray /// \param head1x1_post_film_params FiLM parameters after head1x1 convolutions _LayerArray(const int input_size, const int condition_size, const int head_size, const int channels, const int bottleneck, const int kernel_size, const std::vector& dilations, - const activations::ActivationConfig& activation_config, const GatingMode gating_mode, - const bool head_bias, const int groups_input, const int groups_input_mixin, const int groups_1x1, - const Head1x1Params& head1x1_params, const activations::ActivationConfig& secondary_activation_config, + const std::vector& activation_configs, + const std::vector& gating_modes, const bool head_bias, const int groups_input, + const int groups_input_mixin, const int groups_1x1, const Head1x1Params& head1x1_params, + const std::vector& secondary_activation_configs, const _FiLMParams& conv_pre_film_params, const _FiLMParams& conv_post_film_params, const _FiLMParams& input_mixin_pre_film_params, const _FiLMParams& input_mixin_post_film_params, const _FiLMParams& activation_pre_film_params, const _FiLMParams& activation_post_film_params, diff --git a/tools/run_tests.cpp b/tools/run_tests.cpp index 56abfec..0e5ad65 100644 --- a/tools/run_tests.cpp +++ b/tools/run_tests.cpp @@ -147,6 +147,7 @@ int main() test_wavenet::test_layer_array::test_layer_array_basic(); test_wavenet::test_layer_array::test_layer_array_receptive_field(); test_wavenet::test_layer_array::test_layer_array_with_head_input(); + test_wavenet::test_layer_array::test_layer_array_different_activations(); test_wavenet::test_full::test_wavenet_model(); test_wavenet::test_full::test_wavenet_multiple_arrays(); test_wavenet::test_full::test_wavenet_zero_input(); diff --git a/tools/test/test_wavenet/test_condition_processing.cpp b/tools/test/test_wavenet/test_condition_processing.cpp index afc16f3..250f943 100644 --- a/tools/test/test_wavenet/test_condition_processing.cpp +++ b/tools/test/test_wavenet/test_condition_processing.cpp @@ -28,10 +28,16 @@ static nam::wavenet::LayerArrayParams make_layer_array_params( const nam::activations::ActivationConfig& secondary_activation_config) { auto film_params = make_default_film_params(); - return nam::wavenet::LayerArrayParams( - input_size, condition_size, head_size, channels, bottleneck, kernel_size, std::move(dilations), activation_config, - gating_mode, head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_config, - film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); + // Duplicate activation_config, gating_mode, and secondary_activation_config for each layer (based on dilations size) + std::vector activation_configs(dilations.size(), activation_config); + std::vector gating_modes(dilations.size(), gating_mode); + std::vector secondary_activation_configs( + dilations.size(), secondary_activation_config); + return nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels, bottleneck, kernel_size, + std::move(dilations), std::move(activation_configs), std::move(gating_modes), + head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, + std::move(secondary_activation_configs), film_params, film_params, film_params, + film_params, film_params, film_params, film_params, film_params); } // Helper function to create a simple WaveNet with specified input and output channels diff --git a/tools/test/test_wavenet/test_full.cpp b/tools/test/test_wavenet/test_full.cpp index 20a7af1..9ac7372 100644 --- a/tools/test/test_wavenet/test_full.cpp +++ b/tools/test/test_wavenet/test_full.cpp @@ -27,10 +27,16 @@ static nam::wavenet::LayerArrayParams make_layer_array_params( const nam::activations::ActivationConfig& secondary_activation_config) { auto film_params = make_default_film_params(); - return nam::wavenet::LayerArrayParams( - input_size, condition_size, head_size, channels, bottleneck, kernel_size, std::move(dilations), activation_config, - gating_mode, head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_config, - film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); + // Duplicate activation_config, gating_mode, and secondary_activation_config for each layer (based on dilations size) + std::vector activation_configs(dilations.size(), activation_config); + std::vector gating_modes(dilations.size(), gating_mode); + std::vector secondary_activation_configs( + dilations.size(), secondary_activation_config); + return nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels, bottleneck, kernel_size, + std::move(dilations), std::move(activation_configs), std::move(gating_modes), + head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, + std::move(secondary_activation_configs), film_params, film_params, film_params, + film_params, film_params, film_params, film_params, film_params); } // Test full WaveNet model void test_wavenet_model() diff --git a/tools/test/test_wavenet/test_head1x1.cpp b/tools/test/test_wavenet/test_head1x1.cpp index be55431..42aea33 100644 --- a/tools/test/test_wavenet/test_head1x1.cpp +++ b/tools/test/test_wavenet/test_head1x1.cpp @@ -205,27 +205,46 @@ void test_head1x1_gated() // Input mixin: (conditionSize, 2*bottleneck) = (1, 4) = 4 weights // 1x1: (bottleneck, channels) + bias = (2, 2) + 2 = 4 + 2 = 6 weights // head1x1: (bottleneck, head1x1_out_channels) + bias = (2, 2) + 2 = 4 + 2 = 6 weights - std::vector weights{// Conv: (channels, 2*bottleneck, kernelSize=1) weights + (2*bottleneck,) bias - // Weight layout: for each kernel position, for each output channel, for each input channel - // For kernel position 0: - // Output channel 0: connects to input channels 0 and 1 - 1.0f, 0.0f, // output channel 0 - // Output channel 1: connects to input channels 0 and 1 - 0.0f, 1.0f, // output channel 1 - // Output channel 2: connects to input channels 0 and 1 - 1.0f, 0.0f, // output channel 2 - // Output channel 3: connects to input channels 0 and 1 - 0.0f, 1.0f, // output channel 3 - // Bias: 2*bottleneck values - 0.0f, 0.0f, 0.0f, 0.0f, - // Input mixin: (conditionSize, 2*bottleneck) weights (all 1.0 for simplicity) - 1.0f, 1.0f, 1.0f, 1.0f, - // 1x1: (bottleneck, channels) weights + (channels,) bias (identity) - 1.0f, 0.0f, 0.0f, 1.0f, // weights (identity) - 0.0f, 0.0f, // bias - // head1x1: (bottleneck, head1x1_out_channels) weights + (head1x1_out_channels,) bias - 0.5f, 0.0f, 0.0f, 0.5f, // weights - 0.1f, 0.1f}; + std::vector weights; + // Conv: (channels, 2*bottleneck, kernelSize=1) weights + (2*bottleneck,) bias + // Weight layout: for each kernel position, for each output channel, for each input channel + // For kernel position 0: + // Output channel 0: connects to input channels 0 and 1 + weights.push_back(1.0f); // output channel 0 + weights.push_back(0.0f); + // Output channel 1: connects to input channels 0 and 1 + weights.push_back(0.0f); // output channel 1 + weights.push_back(1.0f); + // Output channel 2: connects to input channels 0 and 1 + weights.push_back(1.0f); // output channel 2 + weights.push_back(0.0f); + // Output channel 3: connects to input channels 0 and 1 + weights.push_back(0.0f); // output channel 3 + weights.push_back(1.0f); + // Bias: 2*bottleneck values + weights.push_back(0.0f); + weights.push_back(0.0f); + weights.push_back(0.0f); + weights.push_back(0.0f); + // Input mixin: (conditionSize, 2*bottleneck) weights (all 1.0 for simplicity) + weights.push_back(1.0f); + weights.push_back(1.0f); + weights.push_back(1.0f); + weights.push_back(1.0f); + // 1x1: (bottleneck, channels) weights + (channels,) bias (identity) + weights.push_back(1.0f); // weights (identity) + weights.push_back(0.0f); + weights.push_back(0.0f); + weights.push_back(1.0f); + weights.push_back(0.0f); // bias + weights.push_back(0.0f); + // head1x1: (bottleneck, head1x1_out_channels) weights + (head1x1_out_channels,) bias + weights.push_back(0.5f); // weights + weights.push_back(0.0f); + weights.push_back(0.0f); + weights.push_back(0.5f); + weights.push_back(0.1f); + weights.push_back(0.1f); auto it = weights.begin(); layer.set_weights_(it); diff --git a/tools/test/test_wavenet/test_layer_array.cpp b/tools/test/test_wavenet/test_layer_array.cpp index cd1f762..fc81896 100644 --- a/tools/test/test_wavenet/test_layer_array.cpp +++ b/tools/test/test_wavenet/test_layer_array.cpp @@ -27,9 +27,14 @@ static nam::wavenet::_LayerArray make_layer_array( const nam::activations::ActivationConfig& secondary_activation_config) { auto film_params = make_default_film_params(); + // Duplicate activation_config, gating_mode, and secondary_activation_config for each layer (based on dilations size) + std::vector activation_configs(dilations.size(), activation_config); + std::vector gating_modes(dilations.size(), gating_mode); + std::vector secondary_activation_configs( + dilations.size(), secondary_activation_config); return nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, bottleneck, kernel_size, dilations, - activation_config, gating_mode, head_bias, groups_input, groups_input_mixin, - groups_1x1, head1x1_params, secondary_activation_config, film_params, film_params, + activation_configs, gating_modes, head_bias, groups_input, groups_input_mixin, + groups_1x1, head1x1_params, secondary_activation_configs, film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); } // Test layer array construction and basic processing @@ -166,6 +171,162 @@ void test_layer_array_with_head_input() assert(head_outputs.rows() == head_size); assert(head_outputs.cols() == numFrames); } + +// Test layer array with different activation configs, gating modes, and secondary activations for each layer +void test_layer_array_different_activations() +{ + const int input_size = 1; + const int condition_size = 1; + const int head_size = 1; + const int channels = 1; + const int bottleneck = channels; + const int kernel_size = 1; + std::vector dilations{1, 2, 3}; + const bool head_bias = false; + const int groups = 1; + const int groups_input_mixin = 1; + const int groups_1x1 = 1; + nam::wavenet::Head1x1Params head1x1_params(false, channels, 1); + + // Create different activation configs for each layer + std::vector activation_configs; + activation_configs.push_back(nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU)); + activation_configs.push_back(nam::activations::ActivationConfig::simple(nam::activations::ActivationType::Tanh)); + activation_configs.push_back(nam::activations::ActivationConfig::simple(nam::activations::ActivationType::Sigmoid)); + + // Create different gating modes for each layer: NONE, GATED, BLENDED + std::vector gating_modes; + gating_modes.push_back(nam::wavenet::GatingMode::NONE); + gating_modes.push_back(nam::wavenet::GatingMode::GATED); + gating_modes.push_back(nam::wavenet::GatingMode::BLENDED); + + // Create different secondary activation configs for gated/blended layers + std::vector secondary_activation_configs; + secondary_activation_configs.push_back(nam::activations::ActivationConfig{}); // NONE mode - empty config + secondary_activation_configs.push_back( + nam::activations::ActivationConfig::simple(nam::activations::ActivationType::Sigmoid)); // GATED mode - Sigmoid + secondary_activation_configs.push_back( + nam::activations::ActivationConfig::simple(nam::activations::ActivationType::Tanh)); // BLENDED mode - Tanh + + // Verify we have the right number of configs + assert(activation_configs.size() == dilations.size()); + assert(gating_modes.size() == dilations.size()); + assert(secondary_activation_configs.size() == dilations.size()); + + auto film_params = make_default_film_params(); + nam::wavenet::_LayerArray layer_array( + input_size, condition_size, head_size, channels, bottleneck, kernel_size, dilations, activation_configs, + gating_modes, head_bias, groups, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_configs, + film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); + + const int numFrames = 4; + layer_array.SetMaxBufferSize(numFrames); + + // Set weights: all weights = 1.0, biases = 0.0 + // Rechannel: (1,1) weight (no bias) + // Layer 0 (NONE): conv (1->1, weight=1, bias=0), input_mixin (1->1, weight=1), 1x1 (1->1, weight=1, bias=0) + // Layer 1 (GATED): conv (1->2, weight=1, bias=0), input_mixin (1->2, weight=1), 1x1 (1->1, weight=1, bias=0) + // Note: GATED doubles bottleneck channels, so conv outputs 2 channels, but 1x1 takes 1 channel input + // Layer 2 (BLENDED): conv (1->2, weight=1, bias=0), input_mixin (1->2, weight=1), 1x1 (1->1, weight=1, bias=0) + // Head rechannel: (1,1) weight (no bias) + std::vector weights; + // Rechannel + weights.push_back(1.0f); + // Layer 0 (NONE): conv(1->1) + bias, input_mixin(1->1), 1x1(1->1) + bias + weights.insert(weights.end(), {1.0f, 0.0f, 1.0f, 1.0f, 0.0f}); + // Layer 1 (GATED): conv(1->2) + bias, input_mixin(1->2), 1x1(1->1) + bias + // conv: 1 input * 2 output = 2 weights, 2 biases + // input_mixin: 1 input * 2 output = 2 weights + // 1x1: 1 input * 1 output = 1 weight, 1 bias + weights.insert(weights.end(), {1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f}); + // Layer 2 (BLENDED): same as GATED + weights.insert(weights.end(), {1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f}); + // Head rechannel + weights.push_back(1.0f); + + auto it = weights.begin(); + layer_array.set_weights_(it); + assert(it == weights.end()); + + // Test with positive input values to verify all activations work + Eigen::MatrixXf layer_inputs(input_size, numFrames); + Eigen::MatrixXf condition(condition_size, numFrames); + layer_inputs.fill(2.0f); // Use larger value to make differences more pronounced + condition.fill(1.0f); + + layer_array.Process(layer_inputs, condition, numFrames); + + auto layer_outputs = layer_array.GetLayerOutputs().leftCols(numFrames); + auto head_outputs = layer_array.GetHeadOutputs().leftCols(numFrames); + + assert(layer_outputs.rows() == channels); + assert(layer_outputs.cols() == numFrames); + assert(head_outputs.rows() == head_size); + assert(head_outputs.cols() == numFrames); + + // Verify output is reasonable (not NaN, not infinite) + for (int i = 0; i < numFrames; i++) + { + assert(std::isfinite(head_outputs(0, i))); + assert(std::isfinite(layer_outputs(0, i))); + } + + // Verify that the different configurations produce valid outputs + // The key is that: + // - Layer 0 uses ReLU with NONE gating (standard activation) + // - Layer 1 uses Tanh with GATED gating (Tanh primary, Sigmoid secondary) + // - Layer 2 uses Sigmoid with BLENDED gating (Sigmoid primary, Tanh secondary) + // Each should produce different behavior + + // Verify all outputs are finite and reasonable + for (int i = 0; i < numFrames; i++) + { + assert(std::isfinite(head_outputs(0, i))); + assert(std::isfinite(layer_outputs(0, i))); + } + + // Now create a comparison LayerArray with all ReLU activations and NONE gating + // This should produce different outputs since it doesn't have gating or saturating activations + std::vector all_relu_configs( + dilations.size(), nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU)); + std::vector all_none_gating_modes(dilations.size(), nam::wavenet::GatingMode::NONE); + std::vector all_empty_secondary_configs( + dilations.size(), nam::activations::ActivationConfig{}); + nam::wavenet::_LayerArray layer_array_all_relu(input_size, condition_size, head_size, channels, bottleneck, + kernel_size, dilations, all_relu_configs, all_none_gating_modes, + head_bias, groups, groups_input_mixin, groups_1x1, head1x1_params, + all_empty_secondary_configs, film_params, film_params, film_params, + film_params, film_params, film_params, film_params, film_params); + layer_array_all_relu.SetMaxBufferSize(numFrames); + + // Create weights for all-NONE version (simpler, no gating) + std::vector weights_all_none; + weights_all_none.push_back(1.0f); // Rechannel + weights_all_none.insert(weights_all_none.end(), {1.0f, 0.0f, 1.0f, 1.0f, 0.0f}); // Layer 0 + weights_all_none.insert(weights_all_none.end(), {1.0f, 0.0f, 1.0f, 1.0f, 0.0f}); // Layer 1 + weights_all_none.insert(weights_all_none.end(), {1.0f, 0.0f, 1.0f, 1.0f, 0.0f}); // Layer 2 + weights_all_none.push_back(1.0f); // Head rechannel + + auto it_all_none = weights_all_none.begin(); + layer_array_all_relu.set_weights_(it_all_none); + + // Process with same positive input + layer_array_all_relu.Process(layer_inputs, condition, numFrames); + auto head_outputs_all_relu = layer_array_all_relu.GetHeadOutputs().leftCols(numFrames); + + // Verify outputs are different - the mixed configuration (with gating and saturating activations) + // should produce different values than all-ReLU with no gating + bool outputs_differ_from_all_relu = false; + for (int i = 0; i < numFrames; i++) + { + if (std::abs(head_outputs(0, i) - head_outputs_all_relu(0, i)) > 0.1f) + { + outputs_differ_from_all_relu = true; + break; + } + } + assert(outputs_differ_from_all_relu); // Mixed config should produce different outputs than all-ReLU+NONE +} }; // namespace test_layer_array } // namespace test_wavenet diff --git a/tools/test/test_wavenet/test_real_time_safe.cpp b/tools/test/test_wavenet/test_real_time_safe.cpp index 00cfb5b..0561b17 100644 --- a/tools/test/test_wavenet/test_real_time_safe.cpp +++ b/tools/test/test_wavenet/test_real_time_safe.cpp @@ -47,9 +47,14 @@ static nam::wavenet::_LayerArray make_layer_array( const nam::activations::ActivationConfig& secondary_activation_config) { auto film_params = make_default_film_params(); + // Duplicate activation_config, gating_mode, and secondary_activation_config for each layer (based on dilations size) + std::vector activation_configs(dilations.size(), activation_config); + std::vector gating_modes(dilations.size(), gating_mode); + std::vector secondary_activation_configs( + dilations.size(), secondary_activation_config); return nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, bottleneck, kernel_size, dilations, - activation_config, gating_mode, head_bias, groups_input, groups_input_mixin, - groups_1x1, head1x1_params, secondary_activation_config, film_params, film_params, + activation_configs, gating_modes, head_bias, groups_input, groups_input_mixin, + groups_1x1, head1x1_params, secondary_activation_configs, film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); } @@ -62,10 +67,16 @@ static nam::wavenet::LayerArrayParams make_layer_array_params( const nam::activations::ActivationConfig& secondary_activation_config) { auto film_params = make_default_film_params(); - return nam::wavenet::LayerArrayParams( - input_size, condition_size, head_size, channels, bottleneck, kernel_size, std::move(dilations), activation_config, - gating_mode, head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_config, - film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); + // Duplicate activation_config, gating_mode, and secondary_activation_config for each layer (based on dilations size) + std::vector activation_configs(dilations.size(), activation_config); + std::vector gating_modes(dilations.size(), gating_mode); + std::vector secondary_activation_configs( + dilations.size(), secondary_activation_config); + return nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels, bottleneck, kernel_size, + std::move(dilations), std::move(activation_configs), std::move(gating_modes), + head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, + std::move(secondary_activation_configs), film_params, film_params, film_params, + film_params, film_params, film_params, film_params, film_params); } // Helper function to create a Layer with all FiLMs active diff --git a/tools/test/test_wavenet_configurable_gating.cpp b/tools/test/test_wavenet_configurable_gating.cpp index 6dbd18d..b5b8e24 100644 --- a/tools/test/test_wavenet_configurable_gating.cpp +++ b/tools/test/test_wavenet_configurable_gating.cpp @@ -40,10 +40,16 @@ static nam::wavenet::LayerArrayParams make_layer_array_params( const nam::activations::ActivationConfig& secondary_activation_config) { auto film_params = make_default_film_params(); - return nam::wavenet::LayerArrayParams( - input_size, condition_size, head_size, channels, bottleneck, kernel_size, std::move(dilations), activation_config, - gating_mode, head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, secondary_activation_config, - film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); + // Duplicate activation_config, gating_mode, and secondary_activation_config for each layer (based on dilations size) + std::vector activation_configs(dilations.size(), activation_config); + std::vector gating_modes(dilations.size(), gating_mode); + std::vector secondary_activation_configs( + dilations.size(), secondary_activation_config); + return nam::wavenet::LayerArrayParams(input_size, condition_size, head_size, channels, bottleneck, kernel_size, + std::move(dilations), std::move(activation_configs), std::move(gating_modes), + head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, + std::move(secondary_activation_configs), film_params, film_params, film_params, + film_params, film_params, film_params, film_params, film_params); } // Helper function to create a LayerArray with default FiLM parameters @@ -55,9 +61,14 @@ static nam::wavenet::_LayerArray make_layer_array( const nam::activations::ActivationConfig& secondary_activation_config) { auto film_params = make_default_film_params(); + // Duplicate activation_config, gating_mode, and secondary_activation_config for each layer (based on dilations size) + std::vector activation_configs(dilations.size(), activation_config); + std::vector gating_modes(dilations.size(), gating_mode); + std::vector secondary_activation_configs( + dilations.size(), secondary_activation_config); return nam::wavenet::_LayerArray(input_size, condition_size, head_size, channels, bottleneck, kernel_size, dilations, - activation_config, gating_mode, head_bias, groups_input, groups_input_mixin, - groups_1x1, head1x1_params, secondary_activation_config, film_params, film_params, + activation_configs, gating_modes, head_bias, groups_input, groups_input_mixin, + groups_1x1, head1x1_params, secondary_activation_configs, film_params, film_params, film_params, film_params, film_params, film_params, film_params, film_params); } @@ -151,8 +162,12 @@ class TestConfigurableGating std::vector{1, 2}, activation, nam::wavenet::GatingMode::GATED, head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, tanh_config); - assert(params_gated.gating_mode == nam::wavenet::GatingMode::GATED); - assert(params_gated.secondary_activation_config.type == nam::activations::ActivationType::Tanh); + assert(params_gated.gating_modes.size() == 2); // Two layers (dilations = {1, 2}) + assert(params_gated.gating_modes[0] == nam::wavenet::GatingMode::GATED); + assert(params_gated.gating_modes[1] == nam::wavenet::GatingMode::GATED); + assert(params_gated.secondary_activation_configs.size() == 2); + assert(params_gated.secondary_activation_configs[0].type == nam::activations::ActivationType::Tanh); + assert(params_gated.secondary_activation_configs[1].type == nam::activations::ActivationType::Tanh); // Test with different blending activations auto relu_config = nam::activations::ActivationConfig::simple(nam::activations::ActivationType::ReLU); @@ -161,8 +176,12 @@ class TestConfigurableGating std::vector{1, 2}, activation, nam::wavenet::GatingMode::BLENDED, head_bias, groups_input, groups_input_mixin, groups_1x1, head1x1_params, relu_config); - assert(params_blended.gating_mode == nam::wavenet::GatingMode::BLENDED); - assert(params_blended.secondary_activation_config.type == nam::activations::ActivationType::ReLU); + assert(params_blended.gating_modes.size() == 2); // Two layers (dilations = {1, 2}) + assert(params_blended.gating_modes[0] == nam::wavenet::GatingMode::BLENDED); + assert(params_blended.gating_modes[1] == nam::wavenet::GatingMode::BLENDED); + assert(params_blended.secondary_activation_configs.size() == 2); + assert(params_blended.secondary_activation_configs[0].type == nam::activations::ActivationType::ReLU); + assert(params_blended.secondary_activation_configs[1].type == nam::activations::ActivationType::ReLU); } static void test_layer_array_construction()