From 862fa037b3d0c1f566c94c8c64a3cb66692a7a16 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Mon, 15 Dec 2025 09:47:43 -0800 Subject: [PATCH 1/2] dnorm added; benchmark added --- eidos/eidos_functions_distributions.cpp | 49 ++++++++++++++++++++++- simd_benchmarks/README.md | 28 ++++++++++++- simd_benchmarks/dnorm_benchmark.eidos | 53 +++++++++++++++++++++++++ 3 files changed, 128 insertions(+), 2 deletions(-) create mode 100644 simd_benchmarks/dnorm_benchmark.eidos diff --git a/eidos/eidos_functions_distributions.cpp b/eidos/eidos_functions_distributions.cpp index d83a2d37..0e67de3c 100644 --- a/eidos/eidos_functions_distributions.cpp +++ b/eidos/eidos_functions_distributions.cpp @@ -26,6 +26,7 @@ #include "gsl_linalg.h" #include "gsl_errno.h" #include "gsl_cdf.h" +#include "eidos_simd.h" // BCH 20 October 2016: continuing to try to fix problems with gcc 5.4.0 on Linux without breaking other @@ -484,10 +485,26 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector &p_ar EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(num_quantiles); result_SP = EidosValue_SP(float_result); +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_DNORM_1); #pragma omp parallel for schedule(static) default(none) shared(num_quantiles) firstprivate(float_data, float_result, mu0, sigma0) if(num_quantiles >= EIDOS_OMPMIN_DNORM_1) num_threads(thread_count) for (int value_index = 0; value_index < num_quantiles; ++value_index) float_result->set_float_no_check(gsl_ran_gaussian_pdf(float_data[value_index] - mu0, sigma0), value_index); +#else + // SIMD-optimized version: batch the exp() calls + double norm = 1.0 / (std::sqrt(2.0 * M_PI) * sigma0); + double inv_2var = -1.0 / (2.0 * sigma0 * sigma0); + double *result_data = float_result->data_mutable(); + + for (int value_index = 0; value_index < num_quantiles; ++value_index) + { + double diff = float_data[value_index] - mu0; + result_data[value_index] = diff * diff * inv_2var; + } + Eidos_SIMD::exp_float64(result_data, result_data, num_quantiles); + for (int value_index = 0; value_index < num_quantiles; ++value_index) + result_data[value_index] *= norm; +#endif } else { @@ -497,13 +514,14 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector &p_ar bool saw_error = false; +#ifdef _OPENMP EIDOS_THREAD_COUNT(gEidos_OMP_threads_DNORM_2); #pragma omp parallel for schedule(static) default(none) shared(num_quantiles) firstprivate(float_data, float_result, mu_singleton, sigma_singleton, mu0, sigma0, arg_mu, arg_sigma) reduction(||: saw_error) if(num_quantiles >= EIDOS_OMPMIN_DNORM_2) num_threads(thread_count) for (int value_index = 0; value_index < num_quantiles; ++value_index) { double mu = (mu_singleton ? mu0 : arg_mu->NumericAtIndex_NOCAST(value_index, nullptr)); double sigma = (sigma_singleton ? sigma0 : arg_sigma->NumericAtIndex_NOCAST(value_index, nullptr)); - + if (sigma <= 0.0) { saw_error = true; @@ -512,6 +530,35 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector &p_ar float_result->set_float_no_check(gsl_ran_gaussian_pdf(float_data[value_index] - mu, sigma), value_index); } +#else + // SIMD-optimized version: batch the exp() calls + double *result_data = float_result->data_mutable(); + std::vector norms(num_quantiles); + + // Pass 1: compute exponents and norms, check for errors + for (int value_index = 0; value_index < num_quantiles; ++value_index) + { + double mu = (mu_singleton ? mu0 : arg_mu->NumericAtIndex_NOCAST(value_index, nullptr)); + double sigma = (sigma_singleton ? sigma0 : arg_sigma->NumericAtIndex_NOCAST(value_index, nullptr)); + + if (sigma <= 0.0) + { + saw_error = true; + continue; + } + + double diff = float_data[value_index] - mu; + result_data[value_index] = -diff * diff / (2.0 * sigma * sigma); + norms[value_index] = 1.0 / (std::sqrt(2.0 * M_PI) * sigma); + } + + // Pass 2: batch exp() - SIMD accelerated + Eidos_SIMD::exp_float64(result_data, result_data, num_quantiles); + + // Pass 3: scale by per-element norms + for (int value_index = 0; value_index < num_quantiles; ++value_index) + result_data[value_index] *= norms[value_index]; +#endif if (saw_error) EIDOS_TERMINATION << "ERROR (Eidos_ExecuteFunction_dnorm): function dnorm() requires sd > 0.0." << EidosTerminate(nullptr); diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md index 8cedc4f8..3430d201 100644 --- a/simd_benchmarks/README.md +++ b/simd_benchmarks/README.md @@ -10,6 +10,8 @@ This directory contains benchmark scripts used during the development of SIMD op - **`slim_benchmark.slim`** - SLiM simulation benchmark (N=5000, 1Mb chromosome, 5000 generations with selection) for measuring overall simulation performance. +- **`dnorm_benchmark.eidos`** - Eidos script that benchmarks the SIMD-optimized `dnorm()` function with singleton and vector mu/sigma parameters. + - **`SIMD_BUILD_FLAGS.md`** - Documentation on how SIMD and SLEEF build flags are set and interact. For SLEEF header generation scripts and documentation, see `eidos/sleef/`. @@ -88,4 +90,28 @@ Speedup: .96x Benchmark complete ============================================ ``` -so the takeaway is that SIMD provided significant speedups for eidos math functions, while the overall SLiM simulation speedup was minimal in this specific benchmark scenario. \ No newline at end of file +so the takeaway is that SIMD provided significant speedups for eidos math functions, while the overall SLiM simulation speedup was minimal in this specific benchmark scenario. + +## dnorm() Benchmark Results + +The `dnorm()` function was optimized to batch `exp()` calls using SLEEF SIMD vectorization. Results on x86_64 with AVX2 (N=1,000,000 elements, 10 iterations): + +| Case | SIMD (M elem/s) | Scalar (M elem/s) | Speedup | +|------|-----------------|-------------------|---------| +| `dnorm(x, scalar, scalar)` | 119.9 | 43.7 | **2.74x** | +| `dnorm(x, scalar, vector)` | 52.5 | 33.5 | **1.57x** | +| `dnorm(x, vector, scalar)` | 56.8 | 34.3 | **1.66x** | +| `dnorm(x, vector, vector)` | 41.7 | 28.2 | **1.48x** | + +The single mu/sigma case shows the best speedup at 2.74x. Variable parameter cases have additional overhead from per-element lookups but still benefit from batched SIMD exp(). + +To run this benchmark: +```bash +# Build with SIMD +mkdir build && cd build && cmake .. && make eidos +./eidos ../simd_benchmarks/dnorm_benchmark.eidos + +# Build without SIMD for comparison +mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make eidos +./eidos ../simd_benchmarks/dnorm_benchmark.eidos +``` \ No newline at end of file diff --git a/simd_benchmarks/dnorm_benchmark.eidos b/simd_benchmarks/dnorm_benchmark.eidos new file mode 100644 index 00000000..f5931f52 --- /dev/null +++ b/simd_benchmarks/dnorm_benchmark.eidos @@ -0,0 +1,53 @@ +// Benchmark dnorm() SIMD optimization +// Tests both singleton and variable mu/sigma cases + +defineGlobal("N", 1000000); +defineGlobal("ITERS", 10); + +// Generate test data +x = rnorm(N, 0, 1); + +catn("dnorm() SIMD Benchmark"); +catn(" N = " + N + ", iterations = " + ITERS); +catn(""); + +// Path 1: singleton mu and sigma (most common case) +start = clock(); +for (i in 1:ITERS) + result = dnorm(x, 0.0, 1.0); +elapsed = clock() - start; +catn("Path 1 - dnorm(x, scalar, scalar):"); +catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations"); +catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec"); +catn(""); + +// Path 2a: variable sigma only +sigmas = rep(1.0, N) + runif(N, -0.1, 0.1); +start = clock(); +for (i in 1:ITERS) + result = dnorm(x, 0.0, sigmas); +elapsed = clock() - start; +catn("Path 2a - dnorm(x, scalar, vector):"); +catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations"); +catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec"); +catn(""); + +// Path 2b: variable mu only +mus = runif(N, -0.1, 0.1); +start = clock(); +for (i in 1:ITERS) + result = dnorm(x, mus, 1.0); +elapsed = clock() - start; +catn("Path 2b - dnorm(x, vector, scalar):"); +catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations"); +catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec"); +catn(""); + +// Path 2c: variable mu and sigma +start = clock(); +for (i in 1:ITERS) + result = dnorm(x, mus, sigmas); +elapsed = clock() - start; +catn("Path 2c - dnorm(x, vector, vector):"); +catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations"); +catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec"); From f16e880e39703c0f970c80584556cc0f5606c611 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Mon, 15 Dec 2025 10:31:42 -0800 Subject: [PATCH 2/2] whitespace cleanup --- eidos/eidos_functions_distributions.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eidos/eidos_functions_distributions.cpp b/eidos/eidos_functions_distributions.cpp index 0e67de3c..38751baf 100644 --- a/eidos/eidos_functions_distributions.cpp +++ b/eidos/eidos_functions_distributions.cpp @@ -521,7 +521,7 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector &p_ar { double mu = (mu_singleton ? mu0 : arg_mu->NumericAtIndex_NOCAST(value_index, nullptr)); double sigma = (sigma_singleton ? sigma0 : arg_sigma->NumericAtIndex_NOCAST(value_index, nullptr)); - + if (sigma <= 0.0) { saw_error = true;