From 30faddacb35eef9cedf1693a83aedf3848c31e14 Mon Sep 17 00:00:00 2001 From: andrewkern Date: Tue, 16 Dec 2025 11:35:40 -0800 Subject: [PATCH 1/7] Add SIMD acceleration for spatial interaction kernels (Gaussian/Exponential) - Add float SLEEF macros to sleef_config.h (AVX2: 8 floats, NEON: 4 floats) - Add exp_kernel_float32() and normal_kernel_float32() to eidos_simd.h - Use SIMD kernels in FillSparseVectorForReceiverStrengths() - Add benchmark script for spatial interaction kernels --- core/interaction_type.cpp | 17 +-- eidos/eidos_simd.h | 134 ++++++++++++++++++ eidos/sleef/sleef_config.h | 18 +++ .../sleef_benchmark_spatial_interaction.slim | 95 +++++++++++++ 4 files changed, 252 insertions(+), 12 deletions(-) create mode 100644 simd_benchmarks/sleef_benchmark_spatial_interaction.slim diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp index a3e77c3c..1433e12c 100755 --- a/core/interaction_type.cpp +++ b/core/interaction_type.cpp @@ -27,6 +27,7 @@ #include "species.h" #include "slim_globals.h" #include "sparse_vector.h" +#include "eidos_simd.h" #include #include @@ -2911,22 +2912,14 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind } case SpatialKernelType::kExponential: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)(if_param1_ * exp(-if_param2_ * distance)); - } + // SIMD-accelerated exponential kernel: strength = fmax * exp(-lambda * distance) + Eidos_SIMD::exp_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_); break; } case SpatialKernelType::kNormal: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)(if_param1_ * exp(-(distance * distance) / n_2param2sq_)); - } + // SIMD-accelerated Gaussian kernel: strength = fmax * exp(-d^2 / 2sigma^2) + Eidos_SIMD::normal_kernel_float32(values, nnz, (float)if_param1_, (float)n_2param2sq_); break; } case SpatialKernelType::kCauchy: diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index 208c0992..048864f1 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -467,6 +467,140 @@ inline double product_float64(const double *input, int64_t count) return prod; } +// ================================ +// Float (Single-Precision) SIMD Operations +// ================================ +// These functions operate on arrays of floats, used by spatial interaction kernels. + +// --------------------- +// Exponential: exp(x) for floats +// --------------------- +inline void exp_float32(const float *input, float *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v = EIDOS_SLEEF_LOAD_F(&input[i]); + EIDOS_SLEEF_TYPE_F r = EIDOS_SLEEF_EXP_F(v); + EIDOS_SLEEF_STORE_F(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::exp(input[i]); +} + +// --------------------- +// Exponential Kernel: strength = fmax * exp(-lambda * distance) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// This is optimized for the spatial interaction kernel calculation. +inline void exp_kernel_float32(float *distances, int64_t count, float fmax, float lambda) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + // We need to compute: fmax * exp(-lambda * distance) + // First, compute -lambda * distance for all values, then exp, then multiply by fmax + EIDOS_SLEEF_TYPE_F v_fmax = +#if defined(EIDOS_HAS_AVX2) + _mm256_set1_ps(fmax); + EIDOS_SLEEF_TYPE_F v_neg_lambda = _mm256_set1_ps(-lambda); +#elif defined(EIDOS_HAS_NEON) + vdupq_n_f32(fmax); + EIDOS_SLEEF_TYPE_F v_neg_lambda = vdupq_n_f32(-lambda); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute -lambda * distance +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_neg_lambda, v_dist); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_neg_lambda, v_dist); +#endif + + // Compute exp(-lambda * distance) + EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); + + // Compute fmax * exp(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + distances[i] = fmax * std::exp(-lambda * distances[i]); +} + +// --------------------- +// Normal (Gaussian) Kernel: strength = fmax * exp(-distance^2 / 2sigma^2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// The two_sigma_sq parameter is pre-computed as 2 * sigma^2 for efficiency. +inline void normal_kernel_float32(float *distances, int64_t count, float fmax, float two_sigma_sq) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + // We need to compute: fmax * exp(-distance^2 / two_sigma_sq) + EIDOS_SLEEF_TYPE_F v_fmax = +#if defined(EIDOS_HAS_AVX2) + _mm256_set1_ps(fmax); + EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = _mm256_set1_ps(-1.0f / two_sigma_sq); +#elif defined(EIDOS_HAS_NEON) + vdupq_n_f32(fmax); + EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = vdupq_n_f32(-1.0f / two_sigma_sq); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute distance^2 +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_dist_sq = _mm256_mul_ps(v_dist, v_dist); + // Compute -distance^2 / 2sigma^2 + EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_dist_sq, v_neg_inv_2sigsq); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_dist_sq = vmulq_f32(v_dist, v_dist); + // Compute -distance^2 / 2sigma^2 + EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_dist_sq, v_neg_inv_2sigsq); +#endif + + // Compute exp(-distance^2 / 2sigma^2) + EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); + + // Compute fmax * exp(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + distances[i] = fmax * std::exp(-(d * d) / two_sigma_sq); + } +} + } // namespace Eidos_SIMD #endif /* eidos_simd_h */ diff --git a/eidos/sleef/sleef_config.h b/eidos/sleef/sleef_config.h index 230e34a1..5643cf7f 100644 --- a/eidos/sleef/sleef_config.h +++ b/eidos/sleef/sleef_config.h @@ -62,6 +62,14 @@ #define EIDOS_SLEEF_LOG10_D(v) Sleef_log10d4_u10avx2(v) #define EIDOS_SLEEF_LOG2_D(v) Sleef_log2d4_u10avx2(v) + // Float (single-precision) support - 8 floats per AVX2 register + #define EIDOS_SLEEF_FLOAT_AVAILABLE 1 + #define EIDOS_SLEEF_VEC_SIZE_F 8 + #define EIDOS_SLEEF_TYPE_F __m256 + #define EIDOS_SLEEF_LOAD_F(ptr) _mm256_loadu_ps(ptr) + #define EIDOS_SLEEF_STORE_F(ptr, v) _mm256_storeu_ps(ptr, v) + #define EIDOS_SLEEF_EXP_F(v) Sleef_expf8_u10avx2(v) + // ================================ // ARM NEON Configuration (ARM64) // ================================ @@ -85,12 +93,22 @@ #define EIDOS_SLEEF_LOG10_D(v) Sleef_log10d2_u10advsimd(v) #define EIDOS_SLEEF_LOG2_D(v) Sleef_log2d2_u10advsimd(v) + // Float (single-precision) support - 4 floats per NEON register + #define EIDOS_SLEEF_FLOAT_AVAILABLE 1 + #define EIDOS_SLEEF_VEC_SIZE_F 4 + #define EIDOS_SLEEF_TYPE_F float32x4_t + #define EIDOS_SLEEF_LOAD_F(ptr) vld1q_f32(ptr) + #define EIDOS_SLEEF_STORE_F(ptr, v) vst1q_f32(ptr, v) + #define EIDOS_SLEEF_EXP_F(v) Sleef_expf4_u10advsimd(v) + // ================================ // Scalar Fallback (SSE4.2-only or no SIMD) // ================================ #else #define EIDOS_SLEEF_AVAILABLE 0 #define EIDOS_SLEEF_VEC_SIZE 1 + #define EIDOS_SLEEF_FLOAT_AVAILABLE 0 + #define EIDOS_SLEEF_VEC_SIZE_F 1 #endif #else // EIDOS_SLEEF_AVAILABLE was defined externally diff --git a/simd_benchmarks/sleef_benchmark_spatial_interaction.slim b/simd_benchmarks/sleef_benchmark_spatial_interaction.slim new file mode 100644 index 00000000..f5942e05 --- /dev/null +++ b/simd_benchmarks/sleef_benchmark_spatial_interaction.slim @@ -0,0 +1,95 @@ +// Benchmark for SIMD-accelerated spatial interaction kernels +// Tests Gaussian (normal) and Exponential kernel performance +// These kernels use exp() internally, now SIMD-accelerated via SLEEF +// +// Run with SIMD: ./build/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim +// Run without SIMD: ./build_no_simd/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim + +initialize() { + initializeSLiMModelType("WF"); + initializeSLiMOptions(dimensionality="xy"); + + // Large population for intensive benchmarking + defineConstant("POP_SIZE", 5000); + defineConstant("W", 10.0); + defineConstant("SI", 1.0); // larger interaction distance = more neighbors + defineConstant("GENS", 30); + + initializeMutationRate(0); + initializeRecombinationRate(0); + initializeMutationType("m1", 0.5, "f", 0.0); + initializeGenomicElementType("g1", m1, 1.0); + initializeGenomicElement(g1, 0, 99); + + // Gaussian kernel: strength = fmax * exp(-distance^2 / 2*sigma^2) + initializeInteractionType(1, "xy", reciprocal=T, maxDistance=SI * 3); + i1.setInteractionFunction("n", 1.0, SI); + + // Exponential kernel: strength = fmax * exp(-lambda * distance) + initializeInteractionType(2, "xy", reciprocal=T, maxDistance=SI * 3); + i2.setInteractionFunction("e", 1.0, 1.0/SI); +} + +1 early() { + sim.addSubpop("p1", POP_SIZE); + p1.setSpatialBounds(c(0, 0, W, W)); + + for (ind in p1.individuals) { + ind.setSpatialPosition(p1.pointUniform()); + } + + catn("========================================"); + catn("Spatial Interaction Kernel Benchmark"); + catn("========================================"); + catn("Population: " + POP_SIZE); + catn("World: " + W + "x" + W); + catn("Interaction distance: " + SI); + catn("Generations: " + GENS); + + // Estimate neighbors + density = POP_SIZE / (W * W); + avg_neighbors = density * PI * (SI*3)^2; + catn("Est. neighbors/ind: ~" + format("%.0f", avg_neighbors)); + catn("----------------------------------------"); + + defineGlobal("start_time", clock()); + defineGlobal("gaussian_time", 0.0); + defineGlobal("exp_time", 0.0); +} + +early() { + // Gaussian kernel + t0 = clock(); + i1.evaluate(p1); + s1 = i1.totalOfNeighborStrengths(p1.individuals); + defineGlobal("gaussian_time", gaussian_time + (clock() - t0)); + + // Exponential kernel + t1 = clock(); + i2.evaluate(p1); + s2 = i2.totalOfNeighborStrengths(p1.individuals); + defineGlobal("exp_time", exp_time + (clock() - t1)); + + // Use strengths for fitness + p1.individuals.fitnessScaling = 1.0 / (1 + 0.001 * (s1 + s2)); +} + +late() { + // Reshuffle positions each generation + for (ind in p1.individuals) { + ind.setSpatialPosition(p1.pointUniform()); + } +} + +30 late() { + elapsed = clock() - start_time; + + catn("\n========================================"); + catn("RESULTS (SIMD spatial interaction kernels)"); + catn("========================================"); + catn("Total time: " + format("%.3f", elapsed) + " s"); + catn("Gaussian kernel: " + format("%.3f", gaussian_time) + " s"); + catn("Exponential kernel: " + format("%.3f", exp_time) + " s"); + catn("========================================"); + sim.simulationFinished(); +} From f62d7f3e857a63e38976aef7135e6cae825d2401 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 16 Dec 2025 20:06:31 -0800 Subject: [PATCH 2/7] Enable SIMD path for 2D spatial interaction kernels (except Fixed) Modify FillSparseVectorForReceiverStrengths() to use the two-pass distance-then-transform path for most kernel types in 2D, enabling SIMD optimizations for Exponential and Normal kernels. The Fixed kernel retains the original single-pass special-case path since it doesn't benefit from SIMD (just assigns a constant). Benchmarks show 22% overall speedup at high neighbor counts (~2200), with Exponential and Normal kernels seeing 38-42% improvement. --- core/interaction_type.cpp | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp index 1433e12c..8215c700 100755 --- a/core/interaction_type.cpp +++ b/core/interaction_type.cpp @@ -2847,12 +2847,26 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind // Figure out what index in the exerter subpopulation, if any, needs to be excluded so self-interaction is zero slim_popsize_t excluded_index = (exerter_subpop == receiver->subpopulation_) ? receiver->index_ : -1; - // We special-case some builds directly to strength values here, for efficiency, - // with no callbacks and spatiality "xy". + // We special-case Fixed kernel builds directly to strength values here, for efficiency, + // with no callbacks and spatiality "xy". Other kernels use the two-pass path below + // which enables SIMD optimizations for Exponential and Normal kernels. + // ADK 12/16/2025: changed to only use special-case path for Fixed kernel + if ((interaction_callbacks.size() == 0) && (spatiality_ == 2) && (if_type_ == SpatialKernelType::kFixed)) + { + sv->SetDataType(SparseVectorDataType::kStrengths); + BuildSV_Strengths_f_2(kd_root, receiver_position, excluded_index, sv, 0); + sv->Finished(); + return; + } + + // ADK 12/16/2025: The original switch below handled all kernel types in the special-case + // single-pass path. Now only Fixed uses the special path above; other kernels fall + // through to the two-pass distance-then-transform path, enabling SIMD optimizations. +#if 0 if ((interaction_callbacks.size() == 0) && (spatiality_ == 2)) { sv->SetDataType(SparseVectorDataType::kStrengths); - + switch (if_type_) { case SpatialKernelType::kFixed: BuildSV_Strengths_f_2(kd_root, receiver_position, excluded_index, sv, 0); break; @@ -2864,11 +2878,12 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind default: EIDOS_TERMINATION << "ERROR (InteractionType::FillSparseVectorForReceiverStrengths): (internal error) unoptimized SpatialKernelType value." << EidosTerminate(); } - + sv->Finished(); return; } - +#endif + // Set up to build distances first; this is an internal implementation detail, so we require the sparse vector set up for strengths above sv->SetDataType(SparseVectorDataType::kDistances); From 9bbd5fad5ece9bfe8a598eb7ca567e1fb89f45d1 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 16 Dec 2025 20:21:16 -0800 Subject: [PATCH 3/7] Add SIMD acceleration for Student's T spatial interaction kernel Add SLEEF pow() function support (Sleef_powf8_u10avx2 for AVX2, Sleef_powf4_u10advsimd for NEON) and implement tdist_kernel_float32() to vectorize the Student's T distribution kernel calculation. The kernel computes: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2) Benchmarks show 62% speedup for Student's T kernel (130s -> 49s), contributing to 38% overall speedup for spatial interaction benchmarks. --- core/interaction_type.cpp | 8 ++--- eidos/eidos_simd.h | 73 ++++++++++++++++++++++++++++++++++++++ eidos/sleef/sleef_config.h | 2 ++ 3 files changed, 77 insertions(+), 6 deletions(-) diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp index 8215c700..a2fad103 100755 --- a/core/interaction_type.cpp +++ b/core/interaction_type.cpp @@ -2950,12 +2950,8 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind } case SpatialKernelType::kStudentsT: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)SpatialKernel::tdist(distance, if_param1_, if_param2_, if_param3_); - } + // Use SIMD-optimized kernel: fmax = if_param1_, nu = if_param2_, tau = if_param3_ + Eidos_SIMD::tdist_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_, (float)if_param3_); break; } default: diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index 048864f1..b9e73e23 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -601,6 +601,79 @@ inline void normal_kernel_float32(float *distances, int64_t count, float fmax, f } } +// --------------------- +// Student's T Kernel: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, nu = degrees of freedom, tau = scale +inline void tdist_kernel_float32(float *distances, int64_t count, float fmax, float nu, float tau) +{ + int64_t i = 0; + + // Pre-compute constants + float inv_tau = 1.0f / tau; + float inv_nu = 1.0f / nu; + float exponent = (nu + 1.0f) / 2.0f; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + EIDOS_SLEEF_TYPE_F v_fmax, v_inv_tau, v_inv_nu, v_exponent, v_one; +#if defined(EIDOS_HAS_AVX2) + v_fmax = _mm256_set1_ps(fmax); + v_inv_tau = _mm256_set1_ps(inv_tau); + v_inv_nu = _mm256_set1_ps(inv_nu); + v_exponent = _mm256_set1_ps(-exponent); + v_one = _mm256_set1_ps(1.0f); +#elif defined(EIDOS_HAS_NEON) + v_fmax = vdupq_n_f32(fmax); + v_inv_tau = vdupq_n_f32(inv_tau); + v_inv_nu = vdupq_n_f32(inv_nu); + v_exponent = vdupq_n_f32(-exponent); + v_one = vdupq_n_f32(1.0f); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute (d / tau) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_d_over_tau = _mm256_mul_ps(v_dist, v_inv_tau); + // Compute (d/tau)^2 + EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = _mm256_mul_ps(v_d_over_tau, v_d_over_tau); + // Compute (d/tau)^2 / nu + EIDOS_SLEEF_TYPE_F v_term = _mm256_mul_ps(v_d_over_tau_sq, v_inv_nu); + // Compute 1 + (d/tau)^2 / nu + EIDOS_SLEEF_TYPE_F v_base = _mm256_add_ps(v_one, v_term); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_d_over_tau = vmulq_f32(v_dist, v_inv_tau); + EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = vmulq_f32(v_d_over_tau, v_d_over_tau); + EIDOS_SLEEF_TYPE_F v_term = vmulq_f32(v_d_over_tau_sq, v_inv_nu); + EIDOS_SLEEF_TYPE_F v_base = vaddq_f32(v_one, v_term); +#endif + + // Compute pow(base, -exponent) = 1 / pow(base, exponent) + EIDOS_SLEEF_TYPE_F v_pow = EIDOS_SLEEF_POW_F(v_base, v_exponent); + + // Compute fmax * pow(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_pow); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_pow); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + float d_over_tau = d * inv_tau; + distances[i] = fmax * std::pow(1.0f + d_over_tau * d_over_tau * inv_nu, -exponent); + } +} + } // namespace Eidos_SIMD #endif /* eidos_simd_h */ diff --git a/eidos/sleef/sleef_config.h b/eidos/sleef/sleef_config.h index 5643cf7f..6844afbc 100644 --- a/eidos/sleef/sleef_config.h +++ b/eidos/sleef/sleef_config.h @@ -69,6 +69,7 @@ #define EIDOS_SLEEF_LOAD_F(ptr) _mm256_loadu_ps(ptr) #define EIDOS_SLEEF_STORE_F(ptr, v) _mm256_storeu_ps(ptr, v) #define EIDOS_SLEEF_EXP_F(v) Sleef_expf8_u10avx2(v) + #define EIDOS_SLEEF_POW_F(x, y) Sleef_powf8_u10avx2(x, y) // ================================ // ARM NEON Configuration (ARM64) @@ -100,6 +101,7 @@ #define EIDOS_SLEEF_LOAD_F(ptr) vld1q_f32(ptr) #define EIDOS_SLEEF_STORE_F(ptr, v) vst1q_f32(ptr, v) #define EIDOS_SLEEF_EXP_F(v) Sleef_expf4_u10advsimd(v) + #define EIDOS_SLEEF_POW_F(x, y) Sleef_powf4_u10advsimd(x, y) // ================================ // Scalar Fallback (SSE4.2-only or no SIMD) From e1eefd1f405a7d9cd643f39a3fd996d9c71ec897 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 16 Dec 2025 20:29:45 -0800 Subject: [PATCH 4/7] Add SIMD acceleration for Cauchy spatial interaction kernel Implement cauchy_kernel_float32() using AVX2/NEON intrinsics for the Cauchy kernel calculation: strength = fmax / (1 + (d/lambda)^2) Unlike exp/normal/tdist kernels, Cauchy uses only basic arithmetic operations (multiply, divide, add) so no SLEEF functions are needed. Benchmarks show 18% speedup vs original (40s -> 33s). --- core/interaction_type.cpp | 9 ++----- eidos/eidos_simd.h | 49 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 7 deletions(-) diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp index a2fad103..a010b32d 100755 --- a/core/interaction_type.cpp +++ b/core/interaction_type.cpp @@ -2939,13 +2939,8 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind } case SpatialKernelType::kCauchy: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - double temp = distance / if_param2_; - - values[col_iter] = (sv_value_t)(if_param1_ / (1.0 + temp * temp)); - } + // Use SIMD-optimized kernel: fmax = if_param1_, lambda = if_param2_ + Eidos_SIMD::cauchy_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_); break; } case SpatialKernelType::kStudentsT: diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index b9e73e23..f3625ed5 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -674,6 +674,55 @@ inline void tdist_kernel_float32(float *distances, int64_t count, float fmax, fl } } +// --------------------- +// Cauchy Kernel: strength = fmax / (1 + (d/lambda)^2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, lambda = scale parameter +inline void cauchy_kernel_float32(float *distances, int64_t count, float fmax, float lambda) +{ + int64_t i = 0; + float inv_lambda = 1.0f / lambda; + +#if defined(EIDOS_HAS_AVX2) + __m256 v_fmax = _mm256_set1_ps(fmax); + __m256 v_inv_lambda = _mm256_set1_ps(inv_lambda); + __m256 v_one = _mm256_set1_ps(1.0f); + + for (; i + 8 <= count; i += 8) + { + __m256 v_dist = _mm256_loadu_ps(&distances[i]); + __m256 v_temp = _mm256_mul_ps(v_dist, v_inv_lambda); // d/lambda + __m256 v_temp_sq = _mm256_mul_ps(v_temp, v_temp); // (d/lambda)^2 + __m256 v_denom = _mm256_add_ps(v_one, v_temp_sq); // 1 + (d/lambda)^2 + __m256 v_result = _mm256_div_ps(v_fmax, v_denom); // fmax / denom + _mm256_storeu_ps(&distances[i], v_result); + } +#elif defined(EIDOS_HAS_NEON) + float32x4_t v_fmax = vdupq_n_f32(fmax); + float32x4_t v_inv_lambda = vdupq_n_f32(inv_lambda); + float32x4_t v_one = vdupq_n_f32(1.0f); + + for (; i + 4 <= count; i += 4) + { + float32x4_t v_dist = vld1q_f32(&distances[i]); + float32x4_t v_temp = vmulq_f32(v_dist, v_inv_lambda); + float32x4_t v_temp_sq = vmulq_f32(v_temp, v_temp); + float32x4_t v_denom = vaddq_f32(v_one, v_temp_sq); + float32x4_t v_result = vdivq_f32(v_fmax, v_denom); + vst1q_f32(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + float temp = d * inv_lambda; + distances[i] = fmax / (1.0f + temp * temp); + } +} + } // namespace Eidos_SIMD #endif /* eidos_simd_h */ From 7e1c3f0a20a5fa1cc10d4924503deeada362b6a6 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 16 Dec 2025 20:36:54 -0800 Subject: [PATCH 5/7] Add SIMD acceleration for Linear spatial interaction kernel Implement linear_kernel_float32() using AVX2/NEON intrinsics for the Linear kernel calculation: strength = fmax * (1 - d / max_distance) Rewritten as: strength = fmax - d * (fmax / max_distance) Simple arithmetic (multiply + subtract) so gains are modest (~2%), but provides consistency with other SIMD-optimized kernels. --- core/interaction_type.cpp | 8 ++------ eidos/eidos_simd.h | 42 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp index a010b32d..8515767c 100755 --- a/core/interaction_type.cpp +++ b/core/interaction_type.cpp @@ -2917,12 +2917,8 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind } case SpatialKernelType::kLinear: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)(if_param1_ * (1.0 - distance / max_distance_)); - } + // Use SIMD-optimized kernel: fmax = if_param1_, max_distance = max_distance_ + Eidos_SIMD::linear_kernel_float32(values, nnz, (float)if_param1_, (float)max_distance_); break; } case SpatialKernelType::kExponential: diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index f3625ed5..4c142dd3 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -723,6 +723,48 @@ inline void cauchy_kernel_float32(float *distances, int64_t count, float fmax, f } } +// --------------------- +// Linear Kernel: strength = fmax * (1 - d / max_distance) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, max_distance = interaction max distance +inline void linear_kernel_float32(float *distances, int64_t count, float fmax, float max_distance) +{ + int64_t i = 0; + float fmax_over_maxdist = fmax / max_distance; + +#if defined(EIDOS_HAS_AVX2) + __m256 v_fmax = _mm256_set1_ps(fmax); + __m256 v_fmax_over_maxdist = _mm256_set1_ps(fmax_over_maxdist); + + for (; i + 8 <= count; i += 8) + { + __m256 v_dist = _mm256_loadu_ps(&distances[i]); + // fmax - d * (fmax / max_distance) = fmax * (1 - d/max_distance) + __m256 v_term = _mm256_mul_ps(v_dist, v_fmax_over_maxdist); + __m256 v_result = _mm256_sub_ps(v_fmax, v_term); + _mm256_storeu_ps(&distances[i], v_result); + } +#elif defined(EIDOS_HAS_NEON) + float32x4_t v_fmax = vdupq_n_f32(fmax); + float32x4_t v_fmax_over_maxdist = vdupq_n_f32(fmax_over_maxdist); + + for (; i + 4 <= count; i += 4) + { + float32x4_t v_dist = vld1q_f32(&distances[i]); + float32x4_t v_term = vmulq_f32(v_dist, v_fmax_over_maxdist); + float32x4_t v_result = vsubq_f32(v_fmax, v_term); + vst1q_f32(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + distances[i] = fmax - distances[i] * fmax_over_maxdist; + } +} + } // namespace Eidos_SIMD #endif /* eidos_simd_h */ From e4962d7c4a4c6ff9c645aa656c83451ca406aa9a Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 16 Dec 2025 20:44:21 -0800 Subject: [PATCH 6/7] Replace benchmark with comprehensive all-kernels version Remove sleef_benchmark_spatial_interaction.slim (only tested Gaussian/Exponential) and add benchmark_all_kernels.slim which tests all 6 kernel types: Fixed, Linear, Exponential, Normal, Cauchy, and Student's T. --- simd_benchmarks/benchmark_all_kernels.slim | 131 ++++++++++++++++++ .../sleef_benchmark_spatial_interaction.slim | 95 ------------- 2 files changed, 131 insertions(+), 95 deletions(-) create mode 100644 simd_benchmarks/benchmark_all_kernels.slim delete mode 100644 simd_benchmarks/sleef_benchmark_spatial_interaction.slim diff --git a/simd_benchmarks/benchmark_all_kernels.slim b/simd_benchmarks/benchmark_all_kernels.slim new file mode 100644 index 00000000..613c69ce --- /dev/null +++ b/simd_benchmarks/benchmark_all_kernels.slim @@ -0,0 +1,131 @@ +// Comprehensive benchmark for ALL spatial interaction kernel types +// Tests: Fixed, Linear, Exponential, Normal, Cauchy, Student's T + +initialize() { + initializeSLiMModelType("WF"); + initializeSLiMOptions(dimensionality="xy"); + + defineConstant("POP_SIZE", 50000); + defineConstant("W", 25.0); // Medium density (~2262 neighbors with 50k) + defineConstant("SI", 1.0); + defineConstant("GENS", 30); + + initializeMutationRate(0); + initializeRecombinationRate(0); + initializeMutationType("m1", 0.5, "f", 0.0); + initializeGenomicElementType("g1", m1, 1.0); + initializeGenomicElement(g1, 0, 99); + + // 1. Fixed kernel: strength = fmax (constant) + initializeInteractionType(1, "xy", reciprocal=T, maxDistance=SI * 3); + i1.setInteractionFunction("f", 1.0); + + // 2. Linear kernel: strength = fmax * (1 - d/maxDist) + initializeInteractionType(2, "xy", reciprocal=T, maxDistance=SI * 3); + i2.setInteractionFunction("l", 1.0); + + // 3. Exponential kernel: strength = fmax * exp(-lambda * d) + initializeInteractionType(3, "xy", reciprocal=T, maxDistance=SI * 3); + i3.setInteractionFunction("e", 1.0, 1.0/SI); + + // 4. Normal/Gaussian kernel: strength = fmax * exp(-d^2 / 2*sigma^2) + initializeInteractionType(4, "xy", reciprocal=T, maxDistance=SI * 3); + i4.setInteractionFunction("n", 1.0, SI); + + // 5. Cauchy kernel: strength = fmax / (1 + (d/scale)^2) + initializeInteractionType(5, "xy", reciprocal=T, maxDistance=SI * 3); + i5.setInteractionFunction("c", 1.0, SI); + + // 6. Student's T kernel + initializeInteractionType(6, "xy", reciprocal=T, maxDistance=SI * 3); + i6.setInteractionFunction("t", 1.0, 5.0, SI); // fmax, nu, scale +} + +1 early() { + sim.addSubpop("p1", POP_SIZE); + p1.setSpatialBounds(c(0, 0, W, W)); + + for (ind in p1.individuals) { + ind.setSpatialPosition(p1.pointUniform()); + } + + catn("================================================"); + catn("ALL KERNELS BENCHMARK (2D)"); + catn("================================================"); + catn("Population: " + POP_SIZE + " | World: " + W + "x" + W); + density = POP_SIZE / (W^2); + avg_neighbors = density * PI * (SI*3)^2; + catn("Est. neighbors/ind: ~" + format("%.0f", avg_neighbors)); + catn("Generations: " + GENS); + catn("------------------------------------------------"); + + defineGlobal("start_time", clock()); + defineGlobal("fixed_time", 0.0); + defineGlobal("linear_time", 0.0); + defineGlobal("exp_time", 0.0); + defineGlobal("normal_time", 0.0); + defineGlobal("cauchy_time", 0.0); + defineGlobal("studentst_time", 0.0); +} + +early() { + // Fixed + t0 = clock(); + i1.evaluate(p1); + i1.totalOfNeighborStrengths(p1.individuals); + defineGlobal("fixed_time", fixed_time + (clock() - t0)); + + // Linear + t0 = clock(); + i2.evaluate(p1); + i2.totalOfNeighborStrengths(p1.individuals); + defineGlobal("linear_time", linear_time + (clock() - t0)); + + // Exponential + t0 = clock(); + i3.evaluate(p1); + i3.totalOfNeighborStrengths(p1.individuals); + defineGlobal("exp_time", exp_time + (clock() - t0)); + + // Normal + t0 = clock(); + i4.evaluate(p1); + i4.totalOfNeighborStrengths(p1.individuals); + defineGlobal("normal_time", normal_time + (clock() - t0)); + + // Cauchy + t0 = clock(); + i5.evaluate(p1); + i5.totalOfNeighborStrengths(p1.individuals); + defineGlobal("cauchy_time", cauchy_time + (clock() - t0)); + + // Student's T + t0 = clock(); + i6.evaluate(p1); + i6.totalOfNeighborStrengths(p1.individuals); + defineGlobal("studentst_time", studentst_time + (clock() - t0)); +} + +late() { + for (ind in p1.individuals) { + ind.setSpatialPosition(p1.pointUniform()); + } +} + +30 late() { + elapsed = clock() - start_time; + + catn("\n================================================"); + catn("RESULTS"); + catn("================================================"); + catn("Fixed: " + format("%6.3f", fixed_time) + " s"); + catn("Linear: " + format("%6.3f", linear_time) + " s"); + catn("Exponential:" + format("%6.3f", exp_time) + " s"); + catn("Normal: " + format("%6.3f", normal_time) + " s"); + catn("Cauchy: " + format("%6.3f", cauchy_time) + " s"); + catn("Student's T:" + format("%6.3f", studentst_time) + " s"); + catn("------------------------------------------------"); + catn("TOTAL: " + format("%6.3f", elapsed) + " s"); + catn("================================================"); + sim.simulationFinished(); +} diff --git a/simd_benchmarks/sleef_benchmark_spatial_interaction.slim b/simd_benchmarks/sleef_benchmark_spatial_interaction.slim deleted file mode 100644 index f5942e05..00000000 --- a/simd_benchmarks/sleef_benchmark_spatial_interaction.slim +++ /dev/null @@ -1,95 +0,0 @@ -// Benchmark for SIMD-accelerated spatial interaction kernels -// Tests Gaussian (normal) and Exponential kernel performance -// These kernels use exp() internally, now SIMD-accelerated via SLEEF -// -// Run with SIMD: ./build/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim -// Run without SIMD: ./build_no_simd/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim - -initialize() { - initializeSLiMModelType("WF"); - initializeSLiMOptions(dimensionality="xy"); - - // Large population for intensive benchmarking - defineConstant("POP_SIZE", 5000); - defineConstant("W", 10.0); - defineConstant("SI", 1.0); // larger interaction distance = more neighbors - defineConstant("GENS", 30); - - initializeMutationRate(0); - initializeRecombinationRate(0); - initializeMutationType("m1", 0.5, "f", 0.0); - initializeGenomicElementType("g1", m1, 1.0); - initializeGenomicElement(g1, 0, 99); - - // Gaussian kernel: strength = fmax * exp(-distance^2 / 2*sigma^2) - initializeInteractionType(1, "xy", reciprocal=T, maxDistance=SI * 3); - i1.setInteractionFunction("n", 1.0, SI); - - // Exponential kernel: strength = fmax * exp(-lambda * distance) - initializeInteractionType(2, "xy", reciprocal=T, maxDistance=SI * 3); - i2.setInteractionFunction("e", 1.0, 1.0/SI); -} - -1 early() { - sim.addSubpop("p1", POP_SIZE); - p1.setSpatialBounds(c(0, 0, W, W)); - - for (ind in p1.individuals) { - ind.setSpatialPosition(p1.pointUniform()); - } - - catn("========================================"); - catn("Spatial Interaction Kernel Benchmark"); - catn("========================================"); - catn("Population: " + POP_SIZE); - catn("World: " + W + "x" + W); - catn("Interaction distance: " + SI); - catn("Generations: " + GENS); - - // Estimate neighbors - density = POP_SIZE / (W * W); - avg_neighbors = density * PI * (SI*3)^2; - catn("Est. neighbors/ind: ~" + format("%.0f", avg_neighbors)); - catn("----------------------------------------"); - - defineGlobal("start_time", clock()); - defineGlobal("gaussian_time", 0.0); - defineGlobal("exp_time", 0.0); -} - -early() { - // Gaussian kernel - t0 = clock(); - i1.evaluate(p1); - s1 = i1.totalOfNeighborStrengths(p1.individuals); - defineGlobal("gaussian_time", gaussian_time + (clock() - t0)); - - // Exponential kernel - t1 = clock(); - i2.evaluate(p1); - s2 = i2.totalOfNeighborStrengths(p1.individuals); - defineGlobal("exp_time", exp_time + (clock() - t1)); - - // Use strengths for fitness - p1.individuals.fitnessScaling = 1.0 / (1 + 0.001 * (s1 + s2)); -} - -late() { - // Reshuffle positions each generation - for (ind in p1.individuals) { - ind.setSpatialPosition(p1.pointUniform()); - } -} - -30 late() { - elapsed = clock() - start_time; - - catn("\n========================================"); - catn("RESULTS (SIMD spatial interaction kernels)"); - catn("========================================"); - catn("Total time: " + format("%.3f", elapsed) + " s"); - catn("Gaussian kernel: " + format("%.3f", gaussian_time) + " s"); - catn("Exponential kernel: " + format("%.3f", exp_time) + " s"); - catn("========================================"); - sim.simulationFinished(); -} From 5baabe7192d970da6b0e310c24465f9b6ec19bdd Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Tue, 16 Dec 2025 20:47:07 -0800 Subject: [PATCH 7/7] Update README with spatial interaction kernel benchmark documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add documentation for benchmark_all_kernels.slim script including: - Entry in Contents section describing the 6 kernel types tested - Performance results table showing SIMD speedups on AVX2 - Usage instructions for running with SIMD vs scalar builds - Notes on adjusting neighbor density via W parameter 🤖 Generated with [Claude Code](https://claude.com/claude-code) --- simd_benchmarks/README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md index 3430d201..1e37f423 100644 --- a/simd_benchmarks/README.md +++ b/simd_benchmarks/README.md @@ -12,6 +12,8 @@ This directory contains benchmark scripts used during the development of SIMD op - **`dnorm_benchmark.eidos`** - Eidos script that benchmarks the SIMD-optimized `dnorm()` function with singleton and vector mu/sigma parameters. +- **`benchmark_all_kernels.slim`** - SLiM script that benchmarks all 6 SIMD-optimized spatial interaction kernel types (Fixed, Linear, Exponential, Normal, Cauchy, Student's T). + - **`SIMD_BUILD_FLAGS.md`** - Documentation on how SIMD and SLEEF build flags are set and interact. For SLEEF header generation scripts and documentation, see `eidos/sleef/`. @@ -114,4 +116,39 @@ mkdir build && cd build && cmake .. && make eidos # Build without SIMD for comparison mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make eidos ./eidos ../simd_benchmarks/dnorm_benchmark.eidos -``` \ No newline at end of file +``` + +## Spatial Interaction Kernel Benchmark Results + +The `benchmark_all_kernels.slim` script tests SIMD-optimized spatial interaction kernels. All kernel types except Fixed use a two-pass approach (build distances, then batch transform) enabling SIMD vectorization via SLEEF. + +Results on x86_64 with AVX2 (50,000 individuals, ~2,262 neighbors per individual, 30 generations): + +| Kernel | Original | SIMD | Speedup | +|--------|----------|------|---------| +| Fixed | 31.97s | 31.36s | 1.02x | +| Linear | 37.26s | 32.95s | **1.13x** | +| Exponential | 59.58s | 34.88s | **1.71x** | +| Normal | 56.37s | 35.15s | **1.60x** | +| Cauchy | 40.04s | 33.00s | **1.21x** | +| Student's T | 130.10s | 49.76s | **2.61x** | +| **Total** | **356.04s** | **217.80s** | **1.64x** | + +The biggest gains come from kernels using transcendental functions: +- **Student's T** (2.61x): Uses `pow()` via SLEEF +- **Exponential/Normal** (1.6-1.7x): Use `exp()` via SLEEF +- **Linear/Cauchy** (~1.1-1.2x): Simple arithmetic, modest gains from explicit SIMD +- **Fixed** (1.02x): Uses special-case single-pass path (no benefit from two-pass) + +To run this benchmark: +```bash +# Build with SIMD +mkdir build && cd build && cmake .. && make slim +./slim -s 42 ../simd_benchmarks/benchmark_all_kernels.slim + +# Build without SIMD for comparison +mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make slim +./slim -s 42 ../simd_benchmarks/benchmark_all_kernels.slim +``` + +Adjust `W` in the script to change neighbor density (W=25 for ~2200 neighbors, W=266 for ~20 neighbors). \ No newline at end of file