Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions eidos/eidos_functions_distributions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "gsl_linalg.h"
#include "gsl_errno.h"
#include "gsl_cdf.h"
#include "eidos_simd.h"


// BCH 20 October 2016: continuing to try to fix problems with gcc 5.4.0 on Linux without breaking other
Expand Down Expand Up @@ -484,10 +485,26 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector<EidosValue_SP> &p_ar
EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(num_quantiles);
result_SP = EidosValue_SP(float_result);

#ifdef _OPENMP
EIDOS_THREAD_COUNT(gEidos_OMP_threads_DNORM_1);
#pragma omp parallel for schedule(static) default(none) shared(num_quantiles) firstprivate(float_data, float_result, mu0, sigma0) if(num_quantiles >= EIDOS_OMPMIN_DNORM_1) num_threads(thread_count)
for (int value_index = 0; value_index < num_quantiles; ++value_index)
float_result->set_float_no_check(gsl_ran_gaussian_pdf(float_data[value_index] - mu0, sigma0), value_index);
#else
// SIMD-optimized version: batch the exp() calls
double norm = 1.0 / (std::sqrt(2.0 * M_PI) * sigma0);
double inv_2var = -1.0 / (2.0 * sigma0 * sigma0);
double *result_data = float_result->data_mutable();

for (int value_index = 0; value_index < num_quantiles; ++value_index)
{
double diff = float_data[value_index] - mu0;
result_data[value_index] = diff * diff * inv_2var;
}
Eidos_SIMD::exp_float64(result_data, result_data, num_quantiles);
for (int value_index = 0; value_index < num_quantiles; ++value_index)
result_data[value_index] *= norm;
#endif
}
else
{
Expand All @@ -497,6 +514,7 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector<EidosValue_SP> &p_ar

bool saw_error = false;

#ifdef _OPENMP
EIDOS_THREAD_COUNT(gEidos_OMP_threads_DNORM_2);
#pragma omp parallel for schedule(static) default(none) shared(num_quantiles) firstprivate(float_data, float_result, mu_singleton, sigma_singleton, mu0, sigma0, arg_mu, arg_sigma) reduction(||: saw_error) if(num_quantiles >= EIDOS_OMPMIN_DNORM_2) num_threads(thread_count)
for (int value_index = 0; value_index < num_quantiles; ++value_index)
Expand All @@ -512,6 +530,35 @@ EidosValue_SP Eidos_ExecuteFunction_dnorm(const std::vector<EidosValue_SP> &p_ar

float_result->set_float_no_check(gsl_ran_gaussian_pdf(float_data[value_index] - mu, sigma), value_index);
}
#else
// SIMD-optimized version: batch the exp() calls
double *result_data = float_result->data_mutable();
std::vector<double> norms(num_quantiles);

// Pass 1: compute exponents and norms, check for errors
for (int value_index = 0; value_index < num_quantiles; ++value_index)
{
double mu = (mu_singleton ? mu0 : arg_mu->NumericAtIndex_NOCAST(value_index, nullptr));
double sigma = (sigma_singleton ? sigma0 : arg_sigma->NumericAtIndex_NOCAST(value_index, nullptr));

if (sigma <= 0.0)
{
saw_error = true;
continue;
}

double diff = float_data[value_index] - mu;
result_data[value_index] = -diff * diff / (2.0 * sigma * sigma);
norms[value_index] = 1.0 / (std::sqrt(2.0 * M_PI) * sigma);
}

// Pass 2: batch exp() - SIMD accelerated
Eidos_SIMD::exp_float64(result_data, result_data, num_quantiles);

// Pass 3: scale by per-element norms
for (int value_index = 0; value_index < num_quantiles; ++value_index)
result_data[value_index] *= norms[value_index];
#endif

if (saw_error)
EIDOS_TERMINATION << "ERROR (Eidos_ExecuteFunction_dnorm): function dnorm() requires sd > 0.0." << EidosTerminate(nullptr);
Expand Down
28 changes: 27 additions & 1 deletion simd_benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ This directory contains benchmark scripts used during the development of SIMD op

- **`slim_benchmark.slim`** - SLiM simulation benchmark (N=5000, 1Mb chromosome, 5000 generations with selection) for measuring overall simulation performance.

- **`dnorm_benchmark.eidos`** - Eidos script that benchmarks the SIMD-optimized `dnorm()` function with singleton and vector mu/sigma parameters.

- **`SIMD_BUILD_FLAGS.md`** - Documentation on how SIMD and SLEEF build flags are set and interact.

For SLEEF header generation scripts and documentation, see `eidos/sleef/`.
Expand Down Expand Up @@ -88,4 +90,28 @@ Speedup: .96x
Benchmark complete
============================================
```
so the takeaway is that SIMD provided significant speedups for eidos math functions, while the overall SLiM simulation speedup was minimal in this specific benchmark scenario.
so the takeaway is that SIMD provided significant speedups for eidos math functions, while the overall SLiM simulation speedup was minimal in this specific benchmark scenario.

## dnorm() Benchmark Results

The `dnorm()` function was optimized to batch `exp()` calls using SLEEF SIMD vectorization. Results on x86_64 with AVX2 (N=1,000,000 elements, 10 iterations):

| Case | SIMD (M elem/s) | Scalar (M elem/s) | Speedup |
|------|-----------------|-------------------|---------|
| `dnorm(x, scalar, scalar)` | 119.9 | 43.7 | **2.74x** |
| `dnorm(x, scalar, vector)` | 52.5 | 33.5 | **1.57x** |
| `dnorm(x, vector, scalar)` | 56.8 | 34.3 | **1.66x** |
| `dnorm(x, vector, vector)` | 41.7 | 28.2 | **1.48x** |

The single mu/sigma case shows the best speedup at 2.74x. Variable parameter cases have additional overhead from per-element lookups but still benefit from batched SIMD exp().

To run this benchmark:
```bash
# Build with SIMD
mkdir build && cd build && cmake .. && make eidos
./eidos ../simd_benchmarks/dnorm_benchmark.eidos

# Build without SIMD for comparison
mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make eidos
./eidos ../simd_benchmarks/dnorm_benchmark.eidos
```
53 changes: 53 additions & 0 deletions simd_benchmarks/dnorm_benchmark.eidos
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
// Benchmark dnorm() SIMD optimization
// Tests both singleton and variable mu/sigma cases

defineGlobal("N", 1000000);
defineGlobal("ITERS", 10);

// Generate test data
x = rnorm(N, 0, 1);

catn("dnorm() SIMD Benchmark");
catn(" N = " + N + ", iterations = " + ITERS);
catn("");

// Path 1: singleton mu and sigma (most common case)
start = clock();
for (i in 1:ITERS)
result = dnorm(x, 0.0, 1.0);
elapsed = clock() - start;
catn("Path 1 - dnorm(x, scalar, scalar):");
catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");
catn("");

// Path 2a: variable sigma only
sigmas = rep(1.0, N) + runif(N, -0.1, 0.1);
start = clock();
for (i in 1:ITERS)
result = dnorm(x, 0.0, sigmas);
elapsed = clock() - start;
catn("Path 2a - dnorm(x, scalar, vector):");
catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");
catn("");

// Path 2b: variable mu only
mus = runif(N, -0.1, 0.1);
start = clock();
for (i in 1:ITERS)
result = dnorm(x, mus, 1.0);
elapsed = clock() - start;
catn("Path 2b - dnorm(x, vector, scalar):");
catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");
catn("");

// Path 2c: variable mu and sigma
start = clock();
for (i in 1:ITERS)
result = dnorm(x, mus, sigmas);
elapsed = clock() - start;
catn("Path 2c - dnorm(x, vector, vector):");
catn(" Time: " + format("%.3f", elapsed) + "s for " + ITERS + " iterations");
catn(" Throughput: " + format("%.1f", N * ITERS / elapsed / 1e6) + " M elements/sec");
Loading