From e4a2648f110ccc28c33dd20f6024b2dbf29fd4ec Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 26 Nov 2025 12:35:07 -0800 Subject: [PATCH 1/7] add SIMD vectorization for Eidos math functions Adds compile-time SIMD detection (AVX2/SSE4.2/FMA) and vectorized implementations for sqrt, abs, floor, ceil, round, trunc, sum, and product. Benchmarks show 1.4-5.7x speedups on large float arrays. - CMakeLists.txt: add USE_SIMD option and compiler flag detection - eidos/eidos_simd.h: new header with SIMD intrinsic implementations - eidos/eidos_functions_math.cpp: use SIMD paths when available - eidos/eidos_test_*.{h,cpp}: use tolerance for float comparisons --- CMakeLists.txt | 38 +++ eidos/eidos_functions_math.cpp | 100 +++++--- eidos/eidos_simd.h | 352 +++++++++++++++++++++++++++ eidos/eidos_test_builtins.h | 6 +- eidos/eidos_test_functions_other.cpp | 8 +- 5 files changed, 461 insertions(+), 43 deletions(-) create mode 100644 eidos/eidos_simd.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e94f8db..b85c4742 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -286,6 +286,44 @@ if(BUILD_LTO) endif() endif() +# +# SIMD SUPPORT (independent of OpenMP) +# + +# Option to disable SIMD entirely +option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2)" ON) + +if(USE_SIMD AND NOT WIN32) + include(CheckCXXCompilerFlag) + + # Check for AVX2 support + check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2) + check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42) + check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA) + + if(COMPILER_SUPPORTS_AVX2) + message(STATUS "SIMD: AVX2 support detected") + add_compile_definitions(EIDOS_HAS_AVX2=1) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") + if(COMPILER_SUPPORTS_FMA) + message(STATUS "SIMD: FMA support detected") + add_compile_definitions(EIDOS_HAS_FMA=1) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma") + endif() + elseif(COMPILER_SUPPORTS_SSE42) + message(STATUS "SIMD: SSE4.2 support detected (no AVX2)") + add_compile_definitions(EIDOS_HAS_SSE42=1) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") + else() + message(STATUS "SIMD: No SIMD support detected, using scalar fallback") + endif() +elseif(USE_SIMD AND WIN32) + # Windows/MSVC detection not yet implemented + message(STATUS "SIMD: Windows SIMD detection not yet implemented, using scalar fallback") +else() + message(STATUS "SIMD: Disabled by user") +endif() + # GSL - adding /usr/local/include so all targets that use GSL_INCLUDES get omp.h set(TARGET_NAME_GSL gsl) file(GLOB_RECURSE GSL_SOURCES ${PROJECT_SOURCE_DIR}/gsl/*.c ${PROJECT_SOURCE_DIR}/gsl/*/*.c) diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp index d1335257..2a97dedb 100644 --- a/eidos/eidos_functions_math.cpp +++ b/eidos/eidos_functions_math.cpp @@ -19,6 +19,7 @@ #include "eidos_functions.h" +#include "eidos_simd.h" #include #include @@ -87,15 +88,19 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector &p_argu EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT); -#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count) + #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) float_result_data[value_index] = fabs(float_data[value_index]); +#else + Eidos_SIMD::abs_float64(float_data, float_result_data, x_count); +#endif } - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -190,21 +195,25 @@ EidosValue_SP Eidos_ExecuteFunction_atan2(const std::vector &p_ar EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL); -#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count) + #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) float_result_data[value_index] = ceil(float_data[value_index]); - +#else + Eidos_SIMD::ceil_float64(float_data, float_result_data, x_count); +#endif + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -359,21 +368,25 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector &p_argu EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR); -#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count) + #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) float_result_data[value_index] = floor(float_data[value_index]); - +#else + Eidos_SIMD::floor_float64(float_data, float_result_data, x_count); +#endif + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -788,14 +801,11 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector &p_ else if (x_type == EidosValueType::kValueFloat) { const double *float_data = x_value->FloatData(); - double product = 1; - - for (int value_index = 0; value_index < x_count; ++value_index) - product *= float_data[value_index]; - + double product = Eidos_SIMD::product_float64(float_data, x_count); + result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(product)); } - + return result_SP; } @@ -803,21 +813,25 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector &p_ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND); -#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count) + #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) float_result_data[value_index] = round(float_data[value_index]); - +#else + Eidos_SIMD::round_float64(float_data, float_result_data, x_count); +#endif + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -2426,15 +2440,19 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector &p_arg EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT); -#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count) + #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) float_result_data[value_index] = sqrt(float_data[value_index]); +#else + Eidos_SIMD::sqrt_float64(float_data, float_result_data, x_count); +#endif } - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -2517,12 +2535,16 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector &p_argu { const double *float_data = x_value->FloatData(); double sum = 0; - + +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT); -#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count) + #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) sum += float_data[value_index]; - +#else + sum = Eidos_SIMD::sum_float64(float_data, x_count); +#endif + result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(sum)); } else if (x_type == EidosValueType::kValueLogical) @@ -2587,21 +2609,25 @@ EidosValue_SP Eidos_ExecuteFunction_tan(const std::vector &p_argu EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC); -#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count) + #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) float_result_data[value_index] = trunc(float_data[value_index]); - +#else + Eidos_SIMD::trunc_float64(float_data, float_result_data, x_count); +#endif + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h new file mode 100644 index 00000000..63efbb1f --- /dev/null +++ b/eidos/eidos_simd.h @@ -0,0 +1,352 @@ +// +// eidos_simd.h +// Eidos +// +// Created by Ben Haller on 11/26/2024. +// Copyright (c) 2024-2025 Philipp Messer. All rights reserved. +// A product of the Messer Lab, http://messerlab.org/slim/ +// + +// This file is part of Eidos. +// +// Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or (at your option) any later version. +// +// Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License along with Eidos. If not, see . + +/* + + SIMD acceleration for Eidos math operations, independent of OpenMP. + + This header provides vectorized implementations of common math operations + using SSE4.2 or AVX2 intrinsics when available, with scalar fallbacks. + + */ + +#ifndef eidos_simd_h +#define eidos_simd_h + +#include +#include + +// Determine SIMD capability level +#if defined(EIDOS_HAS_AVX2) + #include + #define EIDOS_SIMD_WIDTH 4 // 4 doubles per AVX register + #define EIDOS_SIMD_FLOAT_WIDTH 8 // 8 floats per AVX register +#elif defined(EIDOS_HAS_SSE42) + #include + #include + #define EIDOS_SIMD_WIDTH 2 // 2 doubles per SSE register + #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per SSE register +#else + #define EIDOS_SIMD_WIDTH 1 // Scalar fallback + #define EIDOS_SIMD_FLOAT_WIDTH 1 +#endif + +// ================================ +// SIMD Vector Math Operations +// ================================ +// These functions apply an operation to arrays of doubles. +// They handle the loop, SIMD processing, and scalar remainder. + +namespace Eidos_SIMD { + +// --------------------- +// Square Root: sqrt(x) +// --------------------- +inline void sqrt_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + // Process 4 doubles at a time + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_sqrt_pd(v); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + // Process 2 doubles at a time + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_sqrt_pd(v); + _mm_storeu_pd(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::sqrt(input[i]); +} + +// --------------------- +// Absolute Value: abs(x) +// --------------------- +inline void abs_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + // Create sign mask (all bits except sign bit) + __m256d sign_mask = _mm256_set1_pd(-0.0); + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_andnot_pd(sign_mask, v); // Clear sign bit + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + __m128d sign_mask = _mm_set1_pd(-0.0); + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_andnot_pd(sign_mask, v); + _mm_storeu_pd(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::fabs(input[i]); +} + +// --------------------- +// Floor: floor(x) +// --------------------- +inline void floor_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_floor_pd(v); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_floor_pd(v); + _mm_storeu_pd(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::floor(input[i]); +} + +// --------------------- +// Ceil: ceil(x) +// --------------------- +inline void ceil_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_ceil_pd(v); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_ceil_pd(v); + _mm_storeu_pd(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::ceil(input[i]); +} + +// --------------------- +// Truncate: trunc(x) +// --------------------- +inline void trunc_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + _mm_storeu_pd(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::trunc(input[i]); +} + +// --------------------- +// Round: round(x) +// --------------------- +inline void round_float64(const double *input, double *output, int64_t count) +{ + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + _mm256_storeu_pd(&output[i], r); + } +#elif defined(EIDOS_HAS_SSE42) + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + _mm_storeu_pd(&output[i], r); + } +#endif + + for (; i < count; i++) + output[i] = std::round(input[i]); +} + +// --------------------- +// Exponential: exp(x) +// --------------------- +// Note: There's no hardware exp instruction, but we structure the loop +// for cache-friendly access. For true SIMD exp, we'd need a vectorized math library. +inline void exp_float64(const double *input, double *output, int64_t count) +{ + for (int64_t i = 0; i < count; i++) + output[i] = std::exp(input[i]); +} + +// --------------------- +// Natural Log: log(x) +// --------------------- +inline void log_float64(const double *input, double *output, int64_t count) +{ + for (int64_t i = 0; i < count; i++) + output[i] = std::log(input[i]); +} + +// --------------------- +// Log base 10: log10(x) +// --------------------- +inline void log10_float64(const double *input, double *output, int64_t count) +{ + for (int64_t i = 0; i < count; i++) + output[i] = std::log10(input[i]); +} + +// --------------------- +// Log base 2: log2(x) +// --------------------- +inline void log2_float64(const double *input, double *output, int64_t count) +{ + for (int64_t i = 0; i < count; i++) + output[i] = std::log2(input[i]); +} + +// ================================ +// Reductions +// ================================ + +// --------------------- +// Sum: sum(x) +// --------------------- +inline double sum_float64(const double *input, int64_t count) +{ + double sum = 0.0; + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + __m256d vsum = _mm256_setzero_pd(); + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + vsum = _mm256_add_pd(vsum, v); + } + // Horizontal sum of 4 doubles + __m128d vlow = _mm256_castpd256_pd128(vsum); + __m128d vhigh = _mm256_extractf128_pd(vsum, 1); + vlow = _mm_add_pd(vlow, vhigh); // 2 doubles + __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1); + vlow = _mm_add_sd(vlow, shuf); // 1 double + sum = _mm_cvtsd_f64(vlow); +#elif defined(EIDOS_HAS_SSE42) + __m128d vsum = _mm_setzero_pd(); + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + vsum = _mm_add_pd(vsum, v); + } + // Horizontal sum of 2 doubles + __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1); + vsum = _mm_add_sd(vsum, shuf); + sum = _mm_cvtsd_f64(vsum); +#endif + + // Scalar remainder + for (; i < count; i++) + sum += input[i]; + + return sum; +} + +// --------------------- +// Product: product(x) +// --------------------- +inline double product_float64(const double *input, int64_t count) +{ + double prod = 1.0; + int64_t i = 0; + +#if defined(EIDOS_HAS_AVX2) + __m256d vprod = _mm256_set1_pd(1.0); + for (; i + 4 <= count; i += 4) + { + __m256d v = _mm256_loadu_pd(&input[i]); + vprod = _mm256_mul_pd(vprod, v); + } + // Horizontal product of 4 doubles + __m128d vlow = _mm256_castpd256_pd128(vprod); + __m128d vhigh = _mm256_extractf128_pd(vprod, 1); + vlow = _mm_mul_pd(vlow, vhigh); + __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1); + vlow = _mm_mul_sd(vlow, shuf); + prod = _mm_cvtsd_f64(vlow); +#elif defined(EIDOS_HAS_SSE42) + __m128d vprod = _mm_set1_pd(1.0); + for (; i + 2 <= count; i += 2) + { + __m128d v = _mm_loadu_pd(&input[i]); + vprod = _mm_mul_pd(vprod, v); + } + __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1); + vprod = _mm_mul_sd(vprod, shuf); + prod = _mm_cvtsd_f64(vprod); +#endif + + for (; i < count; i++) + prod *= input[i]; + + return prod; +} + +} // namespace Eidos_SIMD + +#endif /* eidos_simd_h */ diff --git a/eidos/eidos_test_builtins.h b/eidos/eidos_test_builtins.h index 4b31cfaf..2611199d 100644 --- a/eidos/eidos_test_builtins.h +++ b/eidos/eidos_test_builtins.h @@ -110,7 +110,8 @@ for (iter in 1:100) x = rnorm(10); // float xbuiltin = cumProduct(x); xuserdef = cumProduct_func(x); - if (!identical(xbuiltin, xuserdef)) stop('Mismatch in test of cumProduct(f)'); + // tolerance because product() can get a little roundoff error due to SIMD + if (!all(abs(xbuiltin - xuserdef) < abs(xuserdef) * 1e-10 + 1e-15)) stop('Mismatch in test of cumProduct(f)'); } // *********************************************************************************************** @@ -418,7 +419,8 @@ for (iter in 1:100) x = rnorm(10); // float xbuiltin = product(x); xuserdef = product_func(x); - if (!identical(xbuiltin, xuserdef)) stop('Mismatch in test of product(f)'); + // tolerance because product() can get a little roundoff error due to SIMD + if (abs(xbuiltin - xuserdef) > abs(xuserdef) * 1e-10 + 1e-15) stop('Mismatch in test of product(f)'); } // *********************************************************************************************** diff --git a/eidos/eidos_test_functions_other.cpp b/eidos/eidos_test_functions_other.cpp index 81a31f9f..3d3844bd 100644 --- a/eidos/eidos_test_functions_other.cpp +++ b/eidos/eidos_test_functions_other.cpp @@ -350,7 +350,7 @@ void _RunFunctionMatrixArrayTests(void) EidosAssertScriptSuccess_I("tr(matrix(1:9, ncol=3));", 1 + 5 + 9); EidosAssertScriptSuccess_F("tr(matrix(1.0:9, ncol=3));", 1 + 5 + 9); EidosAssertScriptSuccess_F("tr(matrix(c(1.25, -7.8, 3.4, 6.1, 4.75, 8.2, -0.3, 8.6, -1.5), ncol=3));", 1.25 + 4.75 + -1.5); - EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); identical(tr(x), sum(diag(x)));", true); + EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); abs(tr(x) - sum(diag(x))) < 1e-10;", true); // tolerance for SIMD EidosAssertScriptSuccess_L("x = matrix(rdunif(100, -1000, 1000), ncol=10); identical(tr(x), sum(diag(x)));", true); // upperTri() @@ -377,15 +377,15 @@ void _RunFunctionMatrixArrayTests(void) EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(rowSums(y), c(22.0, 26, 30));", true); EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); - EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); - + EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(rowSums(y) - apply(y, 0, 'sum(applyValue);')) < 1e-10);", true); // tolerance for SIMD + // colSums() EidosAssertScriptSuccess_L("x = c(T,T,F,F,T,F,F,T,T,F,F,T); y = matrix(x, nrow=3); identical(colSums(y), c(2, 1, 2, 1));", true); EidosAssertScriptSuccess_L("x = 1:12; y = matrix(x, nrow=3); identical(colSums(y), c(6, 15, 24, 33));", true); EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(colSums(y), c(6.0, 15, 24, 33));", true); EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true); EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true); - EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true); + EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(colSums(y) - apply(y, 1, 'sum(applyValue);')) < 1e-10);", true); // tolerance for SIMD } #pragma mark filesystem access From 05a1d8a319cda00e2bbbb9634d3e50957af83c2f Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 26 Nov 2025 13:17:51 -0800 Subject: [PATCH 2/7] add SIMD benchmark scripts Includes Eidos math function benchmark, SLiM simulation benchmark, and a runner script that builds both SIMD and scalar versions and compares performance. --- simd_benchmarks/run_benchmarks.sh | 103 +++++++++++++++++++++++++++ simd_benchmarks/simd_benchmark.eidos | 65 +++++++++++++++++ simd_benchmarks/slim_benchmark.slim | 34 +++++++++ 3 files changed, 202 insertions(+) create mode 100755 simd_benchmarks/run_benchmarks.sh create mode 100644 simd_benchmarks/simd_benchmark.eidos create mode 100644 simd_benchmarks/slim_benchmark.slim diff --git a/simd_benchmarks/run_benchmarks.sh b/simd_benchmarks/run_benchmarks.sh new file mode 100755 index 00000000..05c0782f --- /dev/null +++ b/simd_benchmarks/run_benchmarks.sh @@ -0,0 +1,103 @@ +#!/bin/bash +# SIMD Benchmark Runner +# Builds SLiM with and without SIMD, runs benchmarks, compares results + +set -e + +SLIM_ROOT="$(cd "$(dirname "$0")/.." && pwd)" +BENCHMARK_DIR="$SLIM_ROOT/simd_benchmarks" +BUILD_SIMD="$SLIM_ROOT/build_simd" +BUILD_SCALAR="$SLIM_ROOT/build_scalar" + +NUM_RUNS=${1:-3} # Default to 3 runs, or use first argument + +echo "============================================" +echo "SIMD Benchmark Runner" +echo "============================================" +echo "SLiM root: $SLIM_ROOT" +echo "Runs per benchmark: $NUM_RUNS" +echo "" + +# Build with SIMD +echo "Building with SIMD enabled..." +rm -rf "$BUILD_SIMD" +mkdir -p "$BUILD_SIMD" +cd "$BUILD_SIMD" +cmake -DUSE_SIMD=ON -DCMAKE_BUILD_TYPE=Release .. > /dev/null +make -j10 > /dev/null 2>&1 +echo " Done." + +# Build without SIMD +echo "Building with SIMD disabled..." +rm -rf "$BUILD_SCALAR" +mkdir -p "$BUILD_SCALAR" +cd "$BUILD_SCALAR" +cmake -DUSE_SIMD=OFF -DCMAKE_BUILD_TYPE=Release .. > /dev/null +make -j10 > /dev/null 2>&1 +echo " Done." +echo "" + +# Function to run eidos benchmark and extract times +run_eidos_benchmark() { + local binary="$1" + local label="$2" + + echo " Running Eidos benchmark ($label)..." + "$binary" "$BENCHMARK_DIR/simd_benchmark.eidos" 2>/dev/null | grep -E "^\w+\(\):" | while read line; do + echo " $line" + done +} + +# Function to run slim benchmark and get average time +run_slim_benchmark() { + local binary="$1" + local runs="$2" + local total=0 + + for ((i=1; i<=runs; i++)); do + time=$("$binary" "$BENCHMARK_DIR/slim_benchmark.slim" 2>/dev/null | grep "Elapsed time" | grep -oE '[0-9]+\.[0-9]+') + total=$(echo "$total + $time" | bc) + done + + avg=$(echo "scale=3; $total / $runs" | bc) + echo "$avg" +} + +echo "============================================" +echo "Eidos Math Function Benchmarks" +echo "============================================" +echo "" + +echo "SIMD Build:" +run_eidos_benchmark "$BUILD_SIMD/eidos" "SIMD" +echo "" + +echo "Scalar Build:" +run_eidos_benchmark "$BUILD_SCALAR/eidos" "Scalar" +echo "" + +echo "============================================" +echo "SLiM Simulation Benchmark" +echo "(N=5000, 5000 generations, selection)" +echo "============================================" +echo "" + +echo "Running $NUM_RUNS iterations each..." +echo "" + +simd_time=$(run_slim_benchmark "$BUILD_SIMD/slim" "$NUM_RUNS") +echo "SIMD Build: ${simd_time}s (avg)" + +scalar_time=$(run_slim_benchmark "$BUILD_SCALAR/slim" "$NUM_RUNS") +echo "Scalar Build: ${scalar_time}s (avg)" + +if [ "$(echo "$scalar_time > 0" | bc)" -eq 1 ]; then + speedup=$(echo "scale=2; $scalar_time / $simd_time" | bc) + echo "" + echo "Speedup: ${speedup}x" +fi + +echo "" +echo "============================================" +echo "Benchmark complete" +echo "============================================" diff --git a/simd_benchmarks/simd_benchmark.eidos b/simd_benchmarks/simd_benchmark.eidos new file mode 100644 index 00000000..c49f4407 --- /dev/null +++ b/simd_benchmarks/simd_benchmark.eidos @@ -0,0 +1,65 @@ +// SIMD Benchmark for Eidos math functions +// Tests performance with large float arrays + +defineGlobal("ARRAY_SIZE", 1000000); +defineGlobal("ITERATIONS", 100); + +catn("SIMD Benchmark"); +catn("Array size: " + ARRAY_SIZE); +catn("Iterations: " + ITERATIONS); +catn("----------------------------------------"); + +// Generate test data +x = runif(ARRAY_SIZE); + +// Benchmark sqrt +t0 = clock(); +for (i in 1:ITERATIONS) { y = sqrt(x); } +t1 = clock(); +catn("sqrt(): " + format("%.3f", (t1-t0)) + " sec"); + +// Benchmark abs +t0 = clock(); +for (i in 1:ITERATIONS) { y = abs(x - 0.5); } +t1 = clock(); +catn("abs(): " + format("%.3f", (t1-t0)) + " sec"); + +// Benchmark floor +t0 = clock(); +for (i in 1:ITERATIONS) { y = floor(x * 100); } +t1 = clock(); +catn("floor(): " + format("%.3f", (t1-t0)) + " sec"); + +// Benchmark ceil +t0 = clock(); +for (i in 1:ITERATIONS) { y = ceil(x * 100); } +t1 = clock(); +catn("ceil(): " + format("%.3f", (t1-t0)) + " sec"); + +// Benchmark round +t0 = clock(); +for (i in 1:ITERATIONS) { y = round(x * 100); } +t1 = clock(); +catn("round(): " + format("%.3f", (t1-t0)) + " sec"); + +// Benchmark trunc +t0 = clock(); +for (i in 1:ITERATIONS) { y = trunc(x * 100); } +t1 = clock(); +catn("trunc(): " + format("%.3f", (t1-t0)) + " sec"); + +// Benchmark sum +t0 = clock(); +for (i in 1:ITERATIONS) { y = sum(x); } +t1 = clock(); +catn("sum(): " + format("%.3f", (t1-t0)) + " sec"); + +// Benchmark product (smaller array to avoid underflow) +x_small = runif(1000, 0.99, 1.01); +t0 = clock(); +for (i in 1:(ITERATIONS*100)) { y = product(x_small); } +t1 = clock(); +catn("product(): " + format("%.3f", (t1-t0)) + " sec (1000 elements, 10000 iters)"); + +catn("----------------------------------------"); +catn("Done."); diff --git a/simd_benchmarks/slim_benchmark.slim b/simd_benchmarks/slim_benchmark.slim new file mode 100644 index 00000000..bfd80697 --- /dev/null +++ b/simd_benchmarks/slim_benchmark.slim @@ -0,0 +1,34 @@ +// SLiM Benchmark: Simple simulation with recombination and selection +// Tests whether SIMD optimizations provide any incidental speedup + +initialize() { + initializeMutationRate(1e-7); + initializeRecombinationRate(1e-8); + + // Neutral and selected mutations + initializeMutationType("m1", 0.5, "f", 0.0); // neutral + initializeMutationType("m2", 0.5, "g", -0.03, 0.2); // deleterious (gamma) + initializeMutationType("m3", 0.5, "e", 0.1); // beneficial (exponential) + + initializeGenomicElementType("g1", c(m1, m2, m3), c(0.7, 0.25, 0.05)); + initializeGenomicElement(g1, 0, 999999); // 1 Mb chromosome +} + +1 early() { + sim.addSubpop("p1", 5000); + catn("Starting simulation: N=5000, 1Mb chromosome, 500 generations"); + catn("Mutation rate: 1e-7, Recombination rate: 1e-8"); + defineGlobal("start_time", clock()); +} + +5000 late() { + end_time = clock(); + elapsed = end_time - start_time; + + catn("\n----------------------------------------"); + catn("Simulation complete"); + catn("Elapsed time: " + format("%.2f", elapsed) + " seconds"); + catn("Final mutation count: " + size(sim.mutations)); + catn("Population size: " + p1.individualCount); + catn("----------------------------------------"); +} From 1d912dc5a6010df98b2f93e3a7eacda7e53d4cb3 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 10 Dec 2025 17:57:47 -0800 Subject: [PATCH 3/7] added credit; added readme to bench; tried to clean whitespace --- eidos/eidos_functions_math.cpp | 52 ++++++++--------- eidos/eidos_simd.h | 2 +- eidos/eidos_test_functions_other.cpp | 2 +- simd_benchmarks/README.md | 87 ++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 28 deletions(-) create mode 100644 simd_benchmarks/README.md diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp index 2a97dedb..7a67160f 100644 --- a/eidos/eidos_functions_math.cpp +++ b/eidos/eidos_functions_math.cpp @@ -88,7 +88,7 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector &p_argu EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + #ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count) @@ -98,9 +98,9 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector &p_argu Eidos_SIMD::abs_float64(float_data, float_result_data, x_count); #endif } - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -195,14 +195,14 @@ EidosValue_SP Eidos_ExecuteFunction_atan2(const std::vector &p_ar EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + #ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count) @@ -211,9 +211,9 @@ EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector &p_arg #else Eidos_SIMD::ceil_float64(float_data, float_result_data, x_count); #endif - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -368,14 +368,14 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector &p_argu EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + #ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count) @@ -384,9 +384,9 @@ EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector &p_ar #else Eidos_SIMD::floor_float64(float_data, float_result_data, x_count); #endif - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -802,10 +802,10 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector &p_ { const double *float_data = x_value->FloatData(); double product = Eidos_SIMD::product_float64(float_data, x_count); - + result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(product)); } - + return result_SP; } @@ -813,14 +813,14 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector &p_ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + #ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count) @@ -829,9 +829,9 @@ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector &p_ar #else Eidos_SIMD::round_float64(float_data, float_result_data, x_count); #endif - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -2440,7 +2440,7 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector &p_arg EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + #ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count) @@ -2450,9 +2450,9 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector &p_arg Eidos_SIMD::sqrt_float64(float_data, float_result_data, x_count); #endif } - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } @@ -2514,12 +2514,12 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector &p_argu // case across multiple threads seems excessively complex; instead we look for an overflow afterwards const int64_t *int_data = x_value->IntData(); double sum_d = 0; - + EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_INTEGER); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(int_data) reduction(+: sum_d) if(parallel:x_count >= EIDOS_OMPMIN_SUM_INTEGER) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) sum_d += int_data[value_index]; - + // 2^53 is the largest integer such that it and all smaller integers can be represented in double losslessly int64_t sum = (int64_t)sum_d; bool fits_in_integer = (((double)sum == sum_d) && (sum < 9007199254740992L) && (sum > -9007199254740992L)); @@ -2609,14 +2609,14 @@ EidosValue_SP Eidos_ExecuteFunction_tan(const std::vector &p_argu EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter) { EidosValue_SP result_SP(nullptr); - + EidosValue *x_value = p_arguments[0].get(); int x_count = x_value->Count(); const double *float_data = x_value->FloatData(); EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count); double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); - + #ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count) @@ -2625,9 +2625,9 @@ EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector &p_ar #else Eidos_SIMD::trunc_float64(float_data, float_result_data, x_count); #endif - + result_SP->CopyDimensionsFromValue(x_value); - + return result_SP; } diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index 63efbb1f..c14bdf37 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -2,7 +2,7 @@ // eidos_simd.h // Eidos // -// Created by Ben Haller on 11/26/2024. +// Created by Andrew Kern on 11/26/2025. // Copyright (c) 2024-2025 Philipp Messer. All rights reserved. // A product of the Messer Lab, http://messerlab.org/slim/ // diff --git a/eidos/eidos_test_functions_other.cpp b/eidos/eidos_test_functions_other.cpp index 3d3844bd..ba882bf8 100644 --- a/eidos/eidos_test_functions_other.cpp +++ b/eidos/eidos_test_functions_other.cpp @@ -378,7 +378,7 @@ void _RunFunctionMatrixArrayTests(void) EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(rowSums(y) - apply(y, 0, 'sum(applyValue);')) < 1e-10);", true); // tolerance for SIMD - + // colSums() EidosAssertScriptSuccess_L("x = c(T,T,F,F,T,F,F,T,T,F,F,T); y = matrix(x, nrow=3); identical(colSums(y), c(2, 1, 2, 1));", true); EidosAssertScriptSuccess_L("x = 1:12; y = matrix(x, nrow=3); identical(colSums(y), c(6, 15, 24, 33));", true); diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md new file mode 100644 index 00000000..1afb6ebf --- /dev/null +++ b/simd_benchmarks/README.md @@ -0,0 +1,87 @@ +# SIMD Benchmarks + +This directory contains benchmark scripts used during the development of SIMD optimizations for SLiM. These files are provided for internal development use and are **not used in the build of SLiM**. + +## Contents + +- **`run_benchmarks.sh`** - Shell script that builds SLiM with and without SIMD, runs both benchmark scripts, and reports speedup comparisons. + +- **`simd_benchmark.eidos`** - Eidos script that benchmarks SIMD-optimized math functions (`sqrt`, `abs`, `floor`, `ceil`, `round`, `trunc`, `sum`, `product`) on large arrays. + +- **`slim_benchmark.slim`** - SLiM simulation benchmark (N=5000, 1Mb chromosome, 5000 generations with selection) for measuring overall simulation performance. + +## Author + +These benchmarks were developed by Andrew Kern as part of SIMD optimization work for SLiM. + +## Usage + +These files are not part of the SLiM build system. To run the benchmarks: + +```bash +cd simd_benchmarks +./run_benchmarks.sh [num_runs] +``` + +This will build both SIMD-enabled and scalar versions of SLiM, run the benchmarks, and report the speedup. + +## Results + +Benchmark results look like the following (example output): + +``` +$ simd_benchmarks/run_benchmarks.sh +============================================ +SIMD Benchmark Runner +============================================ +SLiM root: /home/adkern/SLiM +Runs per benchmark: 3 + +Building with SIMD enabled... + Done. +Building with SIMD disabled... + Done. + +============================================ +Eidos Math Function Benchmarks +============================================ + +SIMD Build: + Running Eidos benchmark (SIMD)... + sqrt(): 0.105 sec + abs(): 0.171 sec + floor(): 0.164 sec + ceil(): 0.166 sec + round(): 0.164 sec + trunc(): 0.165 sec + sum(): 0.032 sec + product(): 0.003 sec (1000 elements, 10000 iters) + +Scalar Build: + Running Eidos benchmark (Scalar)... + sqrt(): 0.108 sec + abs(): 0.166 sec + floor(): 0.231 sec + ceil(): 0.246 sec + round(): 0.473 sec + trunc(): 0.246 sec + sum(): 0.166 sec + product(): 0.017 sec (1000 elements, 10000 iters) + +============================================ +SLiM Simulation Benchmark +(N=5000, 5000 generations, selection) +============================================ + +Running 3 iterations each... + +SIMD Build: 12.756s (avg) +Scalar Build: 12.316s (avg) + +Speedup: .96x + +============================================ +Benchmark complete +============================================ +``` +so the takeaway is that SIMD provided significant speedups for eidos math functions, while the overall SLiM simulation speedup was minimal in this specific benchmark scenario. \ No newline at end of file From f6f2412f73747605a7403e9df054a610bca6997a Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 10 Dec 2025 18:04:58 -0800 Subject: [PATCH 4/7] moar whitespace woes --- eidos/eidos_functions_math.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp index 7a67160f..ebc741b0 100644 --- a/eidos/eidos_functions_math.cpp +++ b/eidos/eidos_functions_math.cpp @@ -2514,12 +2514,12 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector &p_argu // case across multiple threads seems excessively complex; instead we look for an overflow afterwards const int64_t *int_data = x_value->IntData(); double sum_d = 0; - + EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_INTEGER); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(int_data) reduction(+: sum_d) if(parallel:x_count >= EIDOS_OMPMIN_SUM_INTEGER) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) sum_d += int_data[value_index]; - + // 2^53 is the largest integer such that it and all smaller integers can be represented in double losslessly int64_t sum = (int64_t)sum_d; bool fits_in_integer = (((double)sum == sum_d) && (sum < 9007199254740992L) && (sum > -9007199254740992L)); @@ -2535,7 +2535,7 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector &p_argu { const double *float_data = x_value->FloatData(); double sum = 0; - + #ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count) @@ -2544,7 +2544,7 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector &p_argu #else sum = Eidos_SIMD::sum_float64(float_data, x_count); #endif - + result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(sum)); } else if (x_type == EidosValueType::kValueLogical) From 08e7aa7c7f72621777d7c87b618fd0b271be505e Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 10 Dec 2025 18:53:02 -0800 Subject: [PATCH 5/7] NEON SIMD for ARM --- CMakeLists.txt | 22 +++++++++++++-- eidos/eidos_simd.h | 70 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 88 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b85c4742..c8080018 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -291,9 +291,19 @@ endif() # # Option to disable SIMD entirely -option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2)" ON) +option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON) + +# Check architecture +# CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and Linux x86_64, "arm64"/"aarch64" on ARM +set(IS_X86_64 FALSE) +set(IS_ARM64 FALSE) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386") + set(IS_X86_64 TRUE) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64") + set(IS_ARM64 TRUE) +endif() -if(USE_SIMD AND NOT WIN32) +if(USE_SIMD AND NOT WIN32 AND IS_X86_64) include(CheckCXXCompilerFlag) # Check for AVX2 support @@ -315,8 +325,14 @@ if(USE_SIMD AND NOT WIN32) add_compile_definitions(EIDOS_HAS_SSE42=1) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2") else() - message(STATUS "SIMD: No SIMD support detected, using scalar fallback") + message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback") endif() +elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64) + # ARM64 NEON is always available on ARM64, no compiler flag needed + message(STATUS "SIMD: ARM64 NEON support enabled") + add_compile_definitions(EIDOS_HAS_NEON=1) +elseif(USE_SIMD AND NOT WIN32) + message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback") elseif(USE_SIMD AND WIN32) # Windows/MSVC detection not yet implemented message(STATUS "SIMD: Windows SIMD detection not yet implemented, using scalar fallback") diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index c14bdf37..d344db89 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -22,7 +22,10 @@ SIMD acceleration for Eidos math operations, independent of OpenMP. This header provides vectorized implementations of common math operations - using SSE4.2 or AVX2 intrinsics when available, with scalar fallbacks. + using platform-specific SIMD intrinsics when available: + - x86_64: SSE4.2 or AVX2 via + - ARM64: NEON via + Falls back to scalar code when no SIMD is available. */ @@ -42,6 +45,10 @@ #include #define EIDOS_SIMD_WIDTH 2 // 2 doubles per SSE register #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per SSE register +#elif defined(EIDOS_HAS_NEON) + #include + #define EIDOS_SIMD_WIDTH 2 // 2 doubles per NEON register + #define EIDOS_SIMD_FLOAT_WIDTH 4 // 4 floats per NEON register #else #define EIDOS_SIMD_WIDTH 1 // Scalar fallback #define EIDOS_SIMD_FLOAT_WIDTH 1 @@ -78,6 +85,14 @@ inline void sqrt_float64(const double *input, double *output, int64_t count) __m128d r = _mm_sqrt_pd(v); _mm_storeu_pd(&output[i], r); } +#elif defined(EIDOS_HAS_NEON) + // Process 2 doubles at a time + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vsqrtq_f64(v); + vst1q_f64(&output[i], r); + } #endif // Scalar remainder @@ -109,6 +124,13 @@ inline void abs_float64(const double *input, double *output, int64_t count) __m128d r = _mm_andnot_pd(sign_mask, v); _mm_storeu_pd(&output[i], r); } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vabsq_f64(v); + vst1q_f64(&output[i], r); + } #endif for (; i < count; i++) @@ -136,6 +158,13 @@ inline void floor_float64(const double *input, double *output, int64_t count) __m128d r = _mm_floor_pd(v); _mm_storeu_pd(&output[i], r); } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndmq_f64(v); // Round toward minus infinity (floor) + vst1q_f64(&output[i], r); + } #endif for (; i < count; i++) @@ -163,6 +192,13 @@ inline void ceil_float64(const double *input, double *output, int64_t count) __m128d r = _mm_ceil_pd(v); _mm_storeu_pd(&output[i], r); } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndpq_f64(v); // Round toward plus infinity (ceil) + vst1q_f64(&output[i], r); + } #endif for (; i < count; i++) @@ -190,6 +226,13 @@ inline void trunc_float64(const double *input, double *output, int64_t count) __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); _mm_storeu_pd(&output[i], r); } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndq_f64(v); // Round toward zero (truncate) + vst1q_f64(&output[i], r); + } #endif for (; i < count; i++) @@ -217,6 +260,13 @@ inline void round_float64(const double *input, double *output, int64_t count) __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); _mm_storeu_pd(&output[i], r); } +#elif defined(EIDOS_HAS_NEON) + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + float64x2_t r = vrndaq_f64(v); // Round to nearest, ties away from zero + vst1q_f64(&output[i], r); + } #endif for (; i < count; i++) @@ -298,6 +348,15 @@ inline double sum_float64(const double *input, int64_t count) __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1); vsum = _mm_add_sd(vsum, shuf); sum = _mm_cvtsd_f64(vsum); +#elif defined(EIDOS_HAS_NEON) + float64x2_t vsum = vdupq_n_f64(0.0); + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + vsum = vaddq_f64(vsum, v); + } + // Horizontal sum of 2 doubles + sum = vaddvq_f64(vsum); #endif // Scalar remainder @@ -339,6 +398,15 @@ inline double product_float64(const double *input, int64_t count) __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1); vprod = _mm_mul_sd(vprod, shuf); prod = _mm_cvtsd_f64(vprod); +#elif defined(EIDOS_HAS_NEON) + float64x2_t vprod = vdupq_n_f64(1.0); + for (; i + 2 <= count; i += 2) + { + float64x2_t v = vld1q_f64(&input[i]); + vprod = vmulq_f64(vprod, v); + } + // Horizontal product of 2 doubles + prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1); #endif for (; i < count; i++) From c98008d950a31f62a7046ab13c5af4413fd03f8d Mon Sep 17 00:00:00 2001 From: andrewkern Date: Wed, 10 Dec 2025 22:16:28 -0800 Subject: [PATCH 6/7] swap in new allClose function --- eidos/eidos_test_builtins.h | 10 +++++----- eidos/eidos_test_functions_other.cpp | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/eidos/eidos_test_builtins.h b/eidos/eidos_test_builtins.h index 2611199d..8707e2f3 100644 --- a/eidos/eidos_test_builtins.h +++ b/eidos/eidos_test_builtins.h @@ -111,7 +111,7 @@ for (iter in 1:100) xbuiltin = cumProduct(x); xuserdef = cumProduct_func(x); // tolerance because product() can get a little roundoff error due to SIMD - if (!all(abs(xbuiltin - xuserdef) < abs(xuserdef) * 1e-10 + 1e-15)) stop('Mismatch in test of cumProduct(f)'); + if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of cumProduct(f)'); } // *********************************************************************************************** @@ -136,7 +136,7 @@ for (iter in 1:100) xbuiltin = cumSum(x); xuserdef = cumSum_func(x); // tolerance because sum() can get a little roundoff error due to SIMD - if (!all(abs(xbuiltin - xuserdef) < 1e-10)) stop('Mismatch in test of cumSum(f)'); + if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of cumSum(f)'); } // *********************************************************************************************** @@ -272,7 +272,7 @@ for (iter in 1:100) xbuiltin = mean(x); xuserdef = mean_func(x); // tolerance because sum() can get a little roundoff error due to SIMD - if (!all(abs(xbuiltin - xuserdef) < 1e-10)) stop('Mismatch in test of mean(f)'); + if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of mean(f)'); } // *********************************************************************************************** @@ -420,7 +420,7 @@ for (iter in 1:100) xbuiltin = product(x); xuserdef = product_func(x); // tolerance because product() can get a little roundoff error due to SIMD - if (abs(xbuiltin - xuserdef) > abs(xuserdef) * 1e-10 + 1e-15) stop('Mismatch in test of product(f)'); + if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of product(f)'); } // *********************************************************************************************** @@ -795,7 +795,7 @@ for (iter in 1:100) xbuiltin = sum(x); xuserdef = sum_func(x); // tolerance because sum() can get a little roundoff error due to SIMD - if (!all(abs(xbuiltin - xuserdef) < 1e-10)) stop('Mismatch in test of sum(f)'); + if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of sum(f)'); } // *********************************************************************************************** diff --git a/eidos/eidos_test_functions_other.cpp b/eidos/eidos_test_functions_other.cpp index ba882bf8..87779bf1 100644 --- a/eidos/eidos_test_functions_other.cpp +++ b/eidos/eidos_test_functions_other.cpp @@ -350,7 +350,7 @@ void _RunFunctionMatrixArrayTests(void) EidosAssertScriptSuccess_I("tr(matrix(1:9, ncol=3));", 1 + 5 + 9); EidosAssertScriptSuccess_F("tr(matrix(1.0:9, ncol=3));", 1 + 5 + 9); EidosAssertScriptSuccess_F("tr(matrix(c(1.25, -7.8, 3.4, 6.1, 4.75, 8.2, -0.3, 8.6, -1.5), ncol=3));", 1.25 + 4.75 + -1.5); - EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); abs(tr(x) - sum(diag(x))) < 1e-10;", true); // tolerance for SIMD + EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); allClose(tr(x), sum(diag(x)));", true); // tolerance for SIMD EidosAssertScriptSuccess_L("x = matrix(rdunif(100, -1000, 1000), ncol=10); identical(tr(x), sum(diag(x)));", true); // upperTri() @@ -377,7 +377,7 @@ void _RunFunctionMatrixArrayTests(void) EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(rowSums(y), c(22.0, 26, 30));", true); EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); - EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(rowSums(y) - apply(y, 0, 'sum(applyValue);')) < 1e-10);", true); // tolerance for SIMD + EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); allClose(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true); // tolerance for SIMD // colSums() EidosAssertScriptSuccess_L("x = c(T,T,F,F,T,F,F,T,T,F,F,T); y = matrix(x, nrow=3); identical(colSums(y), c(2, 1, 2, 1));", true); @@ -385,7 +385,7 @@ void _RunFunctionMatrixArrayTests(void) EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(colSums(y), c(6.0, 15, 24, 33));", true); EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true); EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true); - EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(colSums(y) - apply(y, 1, 'sum(applyValue);')) < 1e-10);", true); // tolerance for SIMD + EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); allClose(colSums(y), apply(y, 1, 'sum(applyValue);'));", true); // tolerance for SIMD } #pragma mark filesystem access From a9c364f50bf43ad4e16bb94488bb6935a0e47382 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Fri, 12 Dec 2025 17:01:44 -0800 Subject: [PATCH 7/7] adding FIXME comments --- eidos/eidos_functions_math.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp index ebc741b0..0733a357 100644 --- a/eidos/eidos_functions_math.cpp +++ b/eidos/eidos_functions_math.cpp @@ -90,6 +90,7 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector &p_argu result_SP = EidosValue_SP(float_result); #ifdef _OPENMP + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -204,6 +205,7 @@ EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector &p_arg result_SP = EidosValue_SP(float_result); #ifdef _OPENMP + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -353,6 +355,7 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector &p_argu double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_EXP_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_EXP_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -377,6 +380,7 @@ EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector &p_ar result_SP = EidosValue_SP(float_result); #ifdef _OPENMP + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -674,6 +678,7 @@ EidosValue_SP Eidos_ExecuteFunction_log(const std::vector &p_argu double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -709,6 +714,7 @@ EidosValue_SP Eidos_ExecuteFunction_log10(const std::vector &p_ar double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG10_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG10_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -744,6 +750,7 @@ EidosValue_SP Eidos_ExecuteFunction_log2(const std::vector &p_arg double *float_result_data = float_result->data_mutable(); result_SP = EidosValue_SP(float_result); + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG2_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG2_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -822,6 +829,7 @@ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector &p_ar result_SP = EidosValue_SP(float_result); #ifdef _OPENMP + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -2442,6 +2450,7 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector &p_arg result_SP = EidosValue_SP(float_result); #ifdef _OPENMP + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -2537,6 +2546,7 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector &p_argu double sum = 0; #ifdef _OPENMP + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index) @@ -2618,6 +2628,7 @@ EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector &p_ar result_SP = EidosValue_SP(float_result); #ifdef _OPENMP + // FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984 EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC); #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count) for (int value_index = 0; value_index < x_count; ++value_index)