From 862fa037b3d0c1f566c94c8c64a3cb66692a7a16 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Mon, 15 Dec 2025 09:47:43 -0800
Subject: [PATCH 1/2] dnorm added; benchmark added

---
 eidos/eidos_functions_distributions.cpp | 49 ++++++++++++++++++++++-
 simd_benchmarks/README.md               | 28 ++++++++++++-
 simd_benchmarks/dnorm_benchmark.eidos   | 53 +++++++++++++++++++++++++
 3 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 simd_benchmarks/dnorm_benchmark.eidos
diff --git a/eidos/eidos_functions_distributions.cpp b/eidos/eidos_functions_distributions.cpp
index d83a2d37..0e67de3c 100644
--- a/eidos/eidos_functions_distributions.cpp
+++ b/eidos/eidos_functions_distributions.cpp
@@ -26,6 +26,7 @@
 #include "gsl_linalg.h"
 #include "gsl_errno.h"
 #include "gsl_cdf.h"
+#include "eidos_simd.h"
 
 
 // BCH 20 October 2016: continuing to try to fix problems with gcc 5.4.0 on Linux without breaking other
@@ -484,10 +485,26 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector<EidosValue_SP> &p_ar
 		EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(num_quantiles);
 		result_SP = EidosValue_SP(float_result);
 		
+#ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_DNORM_1);
 #pragma omp parallel for schedule(static) default(none) shared(num_quantiles) firstprivate(float_data, float_result, mu0, sigma0) if(num_quantiles >= EIDOS_OMPMIN_DNORM_1) num_threads(thread_count)
 		for (int value_index = 0; value_index < num_quantiles; ++value_index)
 			float_result->set_float_no_check(gsl_ran_gaussian_pdf(float_data[value_index] - mu0, sigma0), value_index);
+#else
+		// SIMD-optimized version: batch the exp() calls
+		double norm = 1.0 / (std::sqrt(2.0 * M_PI) * sigma0);
+		double inv_2var = -1.0 / (2.0 * sigma0 * sigma0);
+		double *result_data = float_result->data_mutable();
+
+		for (int value_index = 0; value_index < num_quantiles; ++value_index)
+		{
+			double diff = float_data[value_index] - mu0;
+			result_data[value_index] = diff * diff * inv_2var;
+		}
+		Eidos_SIMD::exp_float64(result_data, result_data, num_quantiles);
+		for (int value_index = 0; value_index < num_quantiles; ++value_index)
+			result_data[value_index] *= norm;
+#endif
 	}
 	else
 	{
@@ -497,13 +514,14 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector<EidosValue_SP> &p_ar
 		
 		bool saw_error = false;
 		
+#ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_DNORM_2);
 #pragma omp parallel for schedule(static) default(none) shared(num_quantiles) firstprivate(float_data, float_result, mu_singleton, sigma_singleton, mu0, sigma0, arg_mu, arg_sigma) reduction(||: saw_error) if(num_quantiles >= EIDOS_OMPMIN_DNORM_2) num_threads(thread_count)
 		for (int value_index = 0; value_index < num_quantiles; ++value_index)
 		{
 			double mu = (mu_singleton ? mu0 : arg_mu->NumericAtIndex_NOCAST(value_index, nullptr));
 			double sigma = (sigma_singleton ? sigma0 : arg_sigma->NumericAtIndex_NOCAST(value_index, nullptr));
-			
+
 			if (sigma <= 0.0)
 			{
 				saw_error = true;
@@ -512,6 +530,35 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector<EidosValue_SP> &p_ar
 			
 			float_result->set_float_no_check(gsl_ran_gaussian_pdf(float_data[value_index] - mu, sigma), value_index);
 		}
+#else
+		// SIMD-optimized version: batch the exp() calls
+		double *result_data = float_result->data_mutable();
+		std::vector<double> norms(num_quantiles);
+
+		// Pass 1: compute exponents and norms, check for errors
+		for (int value_index = 0; value_index < num_quantiles; ++value_index)
+		{
+			double mu = (mu_singleton ? mu0 : arg_mu->NumericAtIndex_NOCAST(value_index, nullptr));
+			double sigma = (sigma_singleton ? sigma0 : arg_sigma->NumericAtIndex_NOCAST(value_index, nullptr));
+
+			if (sigma <= 0.0)
+			{
+				saw_error = true;
+				continue;
+			}
+
+			double diff = float_data[value_index] - mu;
+			result_data[value_index] = -diff * diff / (2.0 * sigma * sigma);
+			norms[value_index] = 1.0 / (std::sqrt(2.0 * M_PI) * sigma);
+		}
+
+		// Pass 2: batch exp() - SIMD accelerated
+		Eidos_SIMD::exp_float64(result_data, result_data, num_quantiles);
+
+		// Pass 3: scale by per-element norms
+		for (int value_index = 0; value_index < num_quantiles; ++value_index)
+			result_data[value_index] *= norms[value_index];
+#endif
 		
 		if (saw_error)
 			EIDOS_TERMINATION << "ERROR (Eidos_ExecuteFunction_dnorm): function dnorm() requires sd > 0.0." << EidosTerminate(nullptr);
diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md
index 8cedc4f8..3430d201 100644
--- a/simd_benchmarks/README.md
+++ b/simd_benchmarks/README.md
@@ -10,6 +10,8 @@ This directory contains benchmark scripts used during the development of SIMD op
 
 - **`slim_benchmark.slim`** - SLiM simulation benchmark (N=5000, 1Mb chromosome, 5000 generations with selection) for measuring overall simulation performance.
 
+- **`dnorm_benchmark.eidos`** - Eidos script that benchmarks the SIMD-optimized `dnorm()` function with singleton and vector mu/sigma parameters.
+
 - **`SIMD_BUILD_FLAGS.md`** - Documentation on how SIMD and SLEEF build flags are set and interact.
 
 For SLEEF header generation scripts and documentation, see `eidos/sleef/`.
@@ -88,4 +90,28 @@ Speedup: .96x
 Benchmark complete
 ============================================
 ```
-so the takeaway is that SIMD provided significant speedups for eidos math functions, while the overall SLiM simulation speedup was minimal in this specific benchmark scenario.
\ No newline at end of file
+so the takeaway is that SIMD provided significant speedups for eidos math functions, while the overall SLiM simulation speedup was minimal in this specific benchmark scenario.
+
+## dnorm() Benchmark Results
+
+The `dnorm()` function was optimized to batch `exp()` calls using SLEEF SIMD vectorization. Results on x86_64 with AVX2 (N=1,000,000 elements, 10 iterations):
+
+| Case | SIMD (M elem/s) | Scalar (M elem/s) | Speedup |
+|------|-----------------|-------------------|---------|
+| `dnorm(x, scalar, scalar)` | 119.9 | 43.7 | **2.74x** |
+| `dnorm(x, scalar, vector)` | 52.5 | 33.5 | **1.57x** |
+| `dnorm(x, vector, scalar)` | 56.8 | 34.3 | **1.66x** |
+| `dnorm(x, vector, vector)` | 41.7 | 28.2 | **1.48x** |
+
+The single mu/sigma case shows the best speedup at 2.74x. Variable parameter cases have additional overhead from per-element lookups but still benefit from batched SIMD exp().
+
+To run this benchmark:
+```bash
+# Build with SIMD
+mkdir build && cd build && cmake .. && make eidos
+./eidos ../simd_benchmarks/dnorm_benchmark.eidos
+
+# Build without SIMD for comparison
+mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make eidos
+./eidos ../simd_benchmarks/dnorm_benchmark.eidos
+```
\ No newline at end of file
diff --git a/simd_benchmarks/dnorm_benchmark.eidos b/simd_benchmarks/dnorm_benchmark.eidos
new file mode 100644
index 00000000..f5931f52
--- /dev/null
+++ b/simd_benchmarks/dnorm_benchmark.eidos
@@ -0,0 +1,53 @@
+// Benchmark dnorm() SIMD optimization
+// Tests both singleton and variable mu/sigma cases
+
+defineGlobal("N", 1000000);
+defineGlobal("ITERS", 10);
+
+// Generate test data
+x = rnorm(N, 0, 1);
+
+catn("dnorm() SIMD Benchmark");
+catn("  N = " + N + ", iterations = " + ITERS);
+catn("");
+
+// Path 1: singleton mu and sigma (most common case)
+start = clock();
+for (i in 1:ITERS)
+	result = dnorm(x, 0.0, 1.0);
+elapsed = clock() - start;
+catn("Path 1 - dnorm(x, scalar, scalar):");
+catn("  Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
+catn("  Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");
+catn("");
+
+// Path 2a: variable sigma only
+sigmas = rep(1.0, N) + runif(N, -0.1, 0.1);
+start = clock();
+for (i in 1:ITERS)
+	result = dnorm(x, 0.0, sigmas);
+elapsed = clock() - start;
+catn("Path 2a - dnorm(x, scalar, vector):");
+catn("  Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
+catn("  Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");
+catn("");
+
+// Path 2b: variable mu only
+mus = runif(N, -0.1, 0.1);
+start = clock();
+for (i in 1:ITERS)
+	result = dnorm(x, mus, 1.0);
+elapsed = clock() - start;
+catn("Path 2b - dnorm(x, vector, scalar):");
+catn("  Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
+catn("  Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");
+catn("");
+
+// Path 2c: variable mu and sigma
+start = clock();
+for (i in 1:ITERS)
+	result = dnorm(x, mus, sigmas);
+elapsed = clock() - start;
+catn("Path 2c - dnorm(x, vector, vector):");
+catn("  Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
+catn("  Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");

From f16e880e39703c0f970c80584556cc0f5606c611 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Mon, 15 Dec 2025 10:31:42 -0800
Subject: [PATCH 2/2] whitespace cleanup

---
 eidos/eidos_functions_distributions.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/eidos/eidos_functions_distributions.cpp b/eidos/eidos_functions_distributions.cpp
index 0e67de3c..38751baf 100644
--- a/eidos/eidos_functions_distributions.cpp
+++ b/eidos/eidos_functions_distributions.cpp
@@ -521,7 +521,7 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector<EidosValue_SP> &p_ar
 		{
 			double mu = (mu_singleton ? mu0 : arg_mu->NumericAtIndex_NOCAST(value_index, nullptr));
 			double sigma = (sigma_singleton ? sigma0 : arg_sigma->NumericAtIndex_NOCAST(value_index, nullptr));
-
+			
 			if (sigma <= 0.0)
 			{
 				saw_error = true;