From 30faddacb35eef9cedf1693a83aedf3848c31e14 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Tue, 16 Dec 2025 11:35:40 -0800
Subject: [PATCH 1/7] Add SIMD acceleration for spatial interaction kernels
 (Gaussian/Exponential)

- Add float SLEEF macros to sleef_config.h (AVX2: 8 floats, NEON: 4 floats)
- Add exp_kernel_float32() and normal_kernel_float32() to eidos_simd.h
- Use SIMD kernels in FillSparseVectorForReceiverStrengths()
- Add benchmark script for spatial interaction kernels
---
 core/interaction_type.cpp                     |  17 +--
 eidos/eidos_simd.h                            | 134 ++++++++++++++++++
 eidos/sleef/sleef_config.h                    |  18 +++
 .../sleef_benchmark_spatial_interaction.slim  |  95 +++++++++++++
 4 files changed, 252 insertions(+), 12 deletions(-)
 create mode 100644 simd_benchmarks/sleef_benchmark_spatial_interaction.slim
diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp
index a3e77c3c..1433e12c 100755
--- a/core/interaction_type.cpp
+++ b/core/interaction_type.cpp
@@ -27,6 +27,7 @@
 #include "species.h"
 #include "slim_globals.h"
 #include "sparse_vector.h"
+#include "eidos_simd.h"
 
 #include <utility>
 #include <algorithm>
@@ -2911,22 +2912,14 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind
 			}
 			case SpatialKernelType::kExponential:
 			{
-				for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter)
-				{
-					sv_value_t distance = values[col_iter];
-					
-					values[col_iter] = (sv_value_t)(if_param1_ * exp(-if_param2_ * distance));
-				}
+				// SIMD-accelerated exponential kernel: strength = fmax * exp(-lambda * distance)
+				Eidos_SIMD::exp_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_);
 				break;
 			}
 			case SpatialKernelType::kNormal:
 			{
-				for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter)
-				{
-					sv_value_t distance = values[col_iter];
-					
-					values[col_iter] = (sv_value_t)(if_param1_ * exp(-(distance * distance) / n_2param2sq_));
-				}
+				// SIMD-accelerated Gaussian kernel: strength = fmax * exp(-d^2 / 2sigma^2)
+				Eidos_SIMD::normal_kernel_float32(values, nnz, (float)if_param1_, (float)n_2param2sq_);
 				break;
 			}
 			case SpatialKernelType::kCauchy:
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
index 208c0992..048864f1 100644
--- a/eidos/eidos_simd.h
+++ b/eidos/eidos_simd.h
@@ -467,6 +467,140 @@ inline double product_float64(const double *input, int64_t count)
     return prod;
 }
 
+// ================================
+// Float (Single-Precision) SIMD Operations
+// ================================
+// These functions operate on arrays of floats, used by spatial interaction kernels.
+
+// ---------------------
+// Exponential: exp(x) for floats
+// ---------------------
+inline void exp_float32(const float *input, float *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v = EIDOS_SLEEF_LOAD_F(&input[i]);
+        EIDOS_SLEEF_TYPE_F r = EIDOS_SLEEF_EXP_F(v);
+        EIDOS_SLEEF_STORE_F(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::exp(input[i]);
+}
+
+// ---------------------
+// Exponential Kernel: strength = fmax * exp(-lambda * distance)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// This is optimized for the spatial interaction kernel calculation.
+inline void exp_kernel_float32(float *distances, int64_t count, float fmax, float lambda)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    // We need to compute: fmax * exp(-lambda * distance)
+    // First, compute -lambda * distance for all values, then exp, then multiply by fmax
+    EIDOS_SLEEF_TYPE_F v_fmax =
+#if defined(EIDOS_HAS_AVX2)
+        _mm256_set1_ps(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_lambda = _mm256_set1_ps(-lambda);
+#elif defined(EIDOS_HAS_NEON)
+        vdupq_n_f32(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_lambda = vdupq_n_f32(-lambda);
+#endif
+
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
+
+        // Compute -lambda * distance
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_neg_lambda, v_dist);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_neg_lambda, v_dist);
+#endif
+
+        // Compute exp(-lambda * distance)
+        EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg);
+
+        // Compute fmax * exp(...)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp);
+#endif
+
+        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        distances[i] = fmax * std::exp(-lambda * distances[i]);
+}
+
+// ---------------------
+// Normal (Gaussian) Kernel: strength = fmax * exp(-distance^2 / 2sigma^2)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// The two_sigma_sq parameter is pre-computed as 2 * sigma^2 for efficiency.
+inline void normal_kernel_float32(float *distances, int64_t count, float fmax, float two_sigma_sq)
+{
+    int64_t i = 0;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    // We need to compute: fmax * exp(-distance^2 / two_sigma_sq)
+    EIDOS_SLEEF_TYPE_F v_fmax =
+#if defined(EIDOS_HAS_AVX2)
+        _mm256_set1_ps(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = _mm256_set1_ps(-1.0f / two_sigma_sq);
+#elif defined(EIDOS_HAS_NEON)
+        vdupq_n_f32(fmax);
+    EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = vdupq_n_f32(-1.0f / two_sigma_sq);
+#endif
+
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
+
+        // Compute distance^2
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_dist_sq = _mm256_mul_ps(v_dist, v_dist);
+        // Compute -distance^2 / 2sigma^2
+        EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_dist_sq, v_neg_inv_2sigsq);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_dist_sq = vmulq_f32(v_dist, v_dist);
+        // Compute -distance^2 / 2sigma^2
+        EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_dist_sq, v_neg_inv_2sigsq);
+#endif
+
+        // Compute exp(-distance^2 / 2sigma^2)
+        EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg);
+
+        // Compute fmax * exp(...)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp);
+#endif
+
+        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        float d = distances[i];
+        distances[i] = fmax * std::exp(-(d * d) / two_sigma_sq);
+    }
+}
+
 } // namespace Eidos_SIMD
 
 #endif /* eidos_simd_h */
diff --git a/eidos/sleef/sleef_config.h b/eidos/sleef/sleef_config.h
index 230e34a1..5643cf7f 100644
--- a/eidos/sleef/sleef_config.h
+++ b/eidos/sleef/sleef_config.h
@@ -62,6 +62,14 @@
     #define EIDOS_SLEEF_LOG10_D(v) Sleef_log10d4_u10avx2(v)
     #define EIDOS_SLEEF_LOG2_D(v) Sleef_log2d4_u10avx2(v)
 
+    // Float (single-precision) support - 8 floats per AVX2 register
+    #define EIDOS_SLEEF_FLOAT_AVAILABLE 1
+    #define EIDOS_SLEEF_VEC_SIZE_F 8
+    #define EIDOS_SLEEF_TYPE_F __m256
+    #define EIDOS_SLEEF_LOAD_F(ptr) _mm256_loadu_ps(ptr)
+    #define EIDOS_SLEEF_STORE_F(ptr, v) _mm256_storeu_ps(ptr, v)
+    #define EIDOS_SLEEF_EXP_F(v) Sleef_expf8_u10avx2(v)
+
 // ================================
 // ARM NEON Configuration (ARM64)
 // ================================
@@ -85,12 +93,22 @@
     #define EIDOS_SLEEF_LOG10_D(v) Sleef_log10d2_u10advsimd(v)
     #define EIDOS_SLEEF_LOG2_D(v) Sleef_log2d2_u10advsimd(v)
 
+    // Float (single-precision) support - 4 floats per NEON register
+    #define EIDOS_SLEEF_FLOAT_AVAILABLE 1
+    #define EIDOS_SLEEF_VEC_SIZE_F 4
+    #define EIDOS_SLEEF_TYPE_F float32x4_t
+    #define EIDOS_SLEEF_LOAD_F(ptr) vld1q_f32(ptr)
+    #define EIDOS_SLEEF_STORE_F(ptr, v) vst1q_f32(ptr, v)
+    #define EIDOS_SLEEF_EXP_F(v) Sleef_expf4_u10advsimd(v)
+
 // ================================
 // Scalar Fallback (SSE4.2-only or no SIMD)
 // ================================
 #else
     #define EIDOS_SLEEF_AVAILABLE 0
     #define EIDOS_SLEEF_VEC_SIZE 1
+    #define EIDOS_SLEEF_FLOAT_AVAILABLE 0
+    #define EIDOS_SLEEF_VEC_SIZE_F 1
 #endif
 
 #else // EIDOS_SLEEF_AVAILABLE was defined externally
diff --git a/simd_benchmarks/sleef_benchmark_spatial_interaction.slim b/simd_benchmarks/sleef_benchmark_spatial_interaction.slim
new file mode 100644
index 00000000..f5942e05
--- /dev/null
+++ b/simd_benchmarks/sleef_benchmark_spatial_interaction.slim
@@ -0,0 +1,95 @@
+// Benchmark for SIMD-accelerated spatial interaction kernels
+// Tests Gaussian (normal) and Exponential kernel performance
+// These kernels use exp() internally, now SIMD-accelerated via SLEEF
+//
+// Run with SIMD: ./build/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim
+// Run without SIMD: ./build_no_simd/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim
+
+initialize() {
+    initializeSLiMModelType("WF");
+    initializeSLiMOptions(dimensionality="xy");
+
+    // Large population for intensive benchmarking
+    defineConstant("POP_SIZE", 5000);
+    defineConstant("W", 10.0);
+    defineConstant("SI", 1.0);  // larger interaction distance = more neighbors
+    defineConstant("GENS", 30);
+
+    initializeMutationRate(0);
+    initializeRecombinationRate(0);
+    initializeMutationType("m1", 0.5, "f", 0.0);
+    initializeGenomicElementType("g1", m1, 1.0);
+    initializeGenomicElement(g1, 0, 99);
+
+    // Gaussian kernel: strength = fmax * exp(-distance^2 / 2*sigma^2)
+    initializeInteractionType(1, "xy", reciprocal=T, maxDistance=SI * 3);
+    i1.setInteractionFunction("n", 1.0, SI);
+
+    // Exponential kernel: strength = fmax * exp(-lambda * distance)
+    initializeInteractionType(2, "xy", reciprocal=T, maxDistance=SI * 3);
+    i2.setInteractionFunction("e", 1.0, 1.0/SI);
+}
+
+1 early() {
+    sim.addSubpop("p1", POP_SIZE);
+    p1.setSpatialBounds(c(0, 0, W, W));
+
+    for (ind in p1.individuals) {
+        ind.setSpatialPosition(p1.pointUniform());
+    }
+
+    catn("========================================");
+    catn("Spatial Interaction Kernel Benchmark");
+    catn("========================================");
+    catn("Population: " + POP_SIZE);
+    catn("World: " + W + "x" + W);
+    catn("Interaction distance: " + SI);
+    catn("Generations: " + GENS);
+
+    // Estimate neighbors
+    density = POP_SIZE / (W * W);
+    avg_neighbors = density * PI * (SI*3)^2;
+    catn("Est. neighbors/ind: ~" + format("%.0f", avg_neighbors));
+    catn("----------------------------------------");
+
+    defineGlobal("start_time", clock());
+    defineGlobal("gaussian_time", 0.0);
+    defineGlobal("exp_time", 0.0);
+}
+
+early() {
+    // Gaussian kernel
+    t0 = clock();
+    i1.evaluate(p1);
+    s1 = i1.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("gaussian_time", gaussian_time + (clock() - t0));
+
+    // Exponential kernel
+    t1 = clock();
+    i2.evaluate(p1);
+    s2 = i2.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("exp_time", exp_time + (clock() - t1));
+
+    // Use strengths for fitness
+    p1.individuals.fitnessScaling = 1.0 / (1 + 0.001 * (s1 + s2));
+}
+
+late() {
+    // Reshuffle positions each generation
+    for (ind in p1.individuals) {
+        ind.setSpatialPosition(p1.pointUniform());
+    }
+}
+
+30 late() {
+    elapsed = clock() - start_time;
+
+    catn("\n========================================");
+    catn("RESULTS (SIMD spatial interaction kernels)");
+    catn("========================================");
+    catn("Total time: " + format("%.3f", elapsed) + " s");
+    catn("Gaussian kernel: " + format("%.3f", gaussian_time) + " s");
+    catn("Exponential kernel: " + format("%.3f", exp_time) + " s");
+    catn("========================================");
+    sim.simulationFinished();
+}

From f62d7f3e857a63e38976aef7135e6cae825d2401 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 16 Dec 2025 20:06:31 -0800
Subject: [PATCH 2/7] Enable SIMD path for 2D spatial interaction kernels
 (except Fixed)

Modify FillSparseVectorForReceiverStrengths() to use the two-pass
distance-then-transform path for most kernel types in 2D, enabling
SIMD optimizations for Exponential and Normal kernels. The Fixed
kernel retains the original single-pass special-case path since it
doesn't benefit from SIMD (just assigns a constant).

Benchmarks show 22% overall speedup at high neighbor counts (~2200),
with Exponential and Normal kernels seeing 38-42% improvement.
---
 core/interaction_type.cpp | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp
index 1433e12c..8215c700 100755
--- a/core/interaction_type.cpp
+++ b/core/interaction_type.cpp
@@ -2847,12 +2847,26 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind
 		// Figure out what index in the exerter subpopulation, if any, needs to be excluded so self-interaction is zero
 		slim_popsize_t excluded_index = (exerter_subpop == receiver->subpopulation_) ? receiver->index_ : -1;
 		
-		// We special-case some builds directly to strength values here, for efficiency,
-		// with no callbacks and spatiality "xy".
+		// We special-case Fixed kernel builds directly to strength values here, for efficiency,
+		// with no callbacks and spatiality "xy". Other kernels use the two-pass path below
+		// which enables SIMD optimizations for Exponential and Normal kernels.
+		// ADK 12/16/2025: changed to only use special-case path for Fixed kernel
+		if ((interaction_callbacks.size() == 0) && (spatiality_ == 2) && (if_type_ == SpatialKernelType::kFixed))
+		{
+			sv->SetDataType(SparseVectorDataType::kStrengths);
+			BuildSV_Strengths_f_2(kd_root, receiver_position, excluded_index, sv, 0);
+			sv->Finished();
+			return;
+		}
+
+		// ADK 12/16/2025: The original switch below handled all kernel types in the special-case
+		// single-pass path. Now only Fixed uses the special path above; other kernels fall
+		// through to the two-pass distance-then-transform path, enabling SIMD optimizations.
+#if 0
 		if ((interaction_callbacks.size() == 0) && (spatiality_ == 2))
 		{
 			sv->SetDataType(SparseVectorDataType::kStrengths);
-			
+
 			switch (if_type_)
 			{
 				case SpatialKernelType::kFixed:			BuildSV_Strengths_f_2(kd_root, receiver_position, excluded_index, sv, 0); break;
@@ -2864,11 +2878,12 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind
 				default:
 					EIDOS_TERMINATION << "ERROR (InteractionType::FillSparseVectorForReceiverStrengths): (internal error) unoptimized SpatialKernelType value." << EidosTerminate();
 			}
-			
+
 			sv->Finished();
 			return;
 		}
-		
+#endif
+
 		// Set up to build distances first; this is an internal implementation detail, so we require the sparse vector set up for strengths above
 		sv->SetDataType(SparseVectorDataType::kDistances);
 		

From 9bbd5fad5ece9bfe8a598eb7ca567e1fb89f45d1 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 16 Dec 2025 20:21:16 -0800
Subject: [PATCH 3/7] Add SIMD acceleration for Student's T spatial interaction
 kernel

Add SLEEF pow() function support (Sleef_powf8_u10avx2 for AVX2,
Sleef_powf4_u10advsimd for NEON) and implement tdist_kernel_float32()
to vectorize the Student's T distribution kernel calculation.

The kernel computes: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2)

Benchmarks show 62% speedup for Student's T kernel (130s -> 49s),
contributing to 38% overall speedup for spatial interaction benchmarks.
---
 core/interaction_type.cpp  |  8 ++---
 eidos/eidos_simd.h         | 73 ++++++++++++++++++++++++++++++++++++++
 eidos/sleef/sleef_config.h |  2 ++
 3 files changed, 77 insertions(+), 6 deletions(-)

diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp
index 8215c700..a2fad103 100755
--- a/core/interaction_type.cpp
+++ b/core/interaction_type.cpp
@@ -2950,12 +2950,8 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind
 			}
 			case SpatialKernelType::kStudentsT:
 			{
-				for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter)
-				{
-					sv_value_t distance = values[col_iter];
-					
-					values[col_iter] = (sv_value_t)SpatialKernel::tdist(distance, if_param1_, if_param2_, if_param3_);
-				}
+				// Use SIMD-optimized kernel: fmax = if_param1_, nu = if_param2_, tau = if_param3_
+				Eidos_SIMD::tdist_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_, (float)if_param3_);
 				break;
 			}
 			default:
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
index 048864f1..b9e73e23 100644
--- a/eidos/eidos_simd.h
+++ b/eidos/eidos_simd.h
@@ -601,6 +601,79 @@ inline void normal_kernel_float32(float *distances, int64_t count, float fmax, f
     }
 }
 
+// ---------------------
+// Student's T Kernel: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// Parameters: fmax = maximum strength, nu = degrees of freedom, tau = scale
+inline void tdist_kernel_float32(float *distances, int64_t count, float fmax, float nu, float tau)
+{
+    int64_t i = 0;
+
+    // Pre-compute constants
+    float inv_tau = 1.0f / tau;
+    float inv_nu = 1.0f / nu;
+    float exponent = (nu + 1.0f) / 2.0f;
+
+#if EIDOS_SLEEF_FLOAT_AVAILABLE
+    EIDOS_SLEEF_TYPE_F v_fmax, v_inv_tau, v_inv_nu, v_exponent, v_one;
+#if defined(EIDOS_HAS_AVX2)
+    v_fmax = _mm256_set1_ps(fmax);
+    v_inv_tau = _mm256_set1_ps(inv_tau);
+    v_inv_nu = _mm256_set1_ps(inv_nu);
+    v_exponent = _mm256_set1_ps(-exponent);
+    v_one = _mm256_set1_ps(1.0f);
+#elif defined(EIDOS_HAS_NEON)
+    v_fmax = vdupq_n_f32(fmax);
+    v_inv_tau = vdupq_n_f32(inv_tau);
+    v_inv_nu = vdupq_n_f32(inv_nu);
+    v_exponent = vdupq_n_f32(-exponent);
+    v_one = vdupq_n_f32(1.0f);
+#endif
+
+    for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F)
+    {
+        EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]);
+
+        // Compute (d / tau)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_d_over_tau = _mm256_mul_ps(v_dist, v_inv_tau);
+        // Compute (d/tau)^2
+        EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = _mm256_mul_ps(v_d_over_tau, v_d_over_tau);
+        // Compute (d/tau)^2 / nu
+        EIDOS_SLEEF_TYPE_F v_term = _mm256_mul_ps(v_d_over_tau_sq, v_inv_nu);
+        // Compute 1 + (d/tau)^2 / nu
+        EIDOS_SLEEF_TYPE_F v_base = _mm256_add_ps(v_one, v_term);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_d_over_tau = vmulq_f32(v_dist, v_inv_tau);
+        EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = vmulq_f32(v_d_over_tau, v_d_over_tau);
+        EIDOS_SLEEF_TYPE_F v_term = vmulq_f32(v_d_over_tau_sq, v_inv_nu);
+        EIDOS_SLEEF_TYPE_F v_base = vaddq_f32(v_one, v_term);
+#endif
+
+        // Compute pow(base, -exponent) = 1 / pow(base, exponent)
+        EIDOS_SLEEF_TYPE_F v_pow = EIDOS_SLEEF_POW_F(v_base, v_exponent);
+
+        // Compute fmax * pow(...)
+#if defined(EIDOS_HAS_AVX2)
+        EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_pow);
+#elif defined(EIDOS_HAS_NEON)
+        EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_pow);
+#endif
+
+        EIDOS_SLEEF_STORE_F(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        float d = distances[i];
+        float d_over_tau = d * inv_tau;
+        distances[i] = fmax * std::pow(1.0f + d_over_tau * d_over_tau * inv_nu, -exponent);
+    }
+}
+
 } // namespace Eidos_SIMD
 
 #endif /* eidos_simd_h */
diff --git a/eidos/sleef/sleef_config.h b/eidos/sleef/sleef_config.h
index 5643cf7f..6844afbc 100644
--- a/eidos/sleef/sleef_config.h
+++ b/eidos/sleef/sleef_config.h
@@ -69,6 +69,7 @@
     #define EIDOS_SLEEF_LOAD_F(ptr) _mm256_loadu_ps(ptr)
     #define EIDOS_SLEEF_STORE_F(ptr, v) _mm256_storeu_ps(ptr, v)
     #define EIDOS_SLEEF_EXP_F(v) Sleef_expf8_u10avx2(v)
+    #define EIDOS_SLEEF_POW_F(x, y) Sleef_powf8_u10avx2(x, y)
 
 // ================================
 // ARM NEON Configuration (ARM64)
@@ -100,6 +101,7 @@
     #define EIDOS_SLEEF_LOAD_F(ptr) vld1q_f32(ptr)
     #define EIDOS_SLEEF_STORE_F(ptr, v) vst1q_f32(ptr, v)
     #define EIDOS_SLEEF_EXP_F(v) Sleef_expf4_u10advsimd(v)
+    #define EIDOS_SLEEF_POW_F(x, y) Sleef_powf4_u10advsimd(x, y)
 
 // ================================
 // Scalar Fallback (SSE4.2-only or no SIMD)

From e1eefd1f405a7d9cd643f39a3fd996d9c71ec897 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 16 Dec 2025 20:29:45 -0800
Subject: [PATCH 4/7] Add SIMD acceleration for Cauchy spatial interaction
 kernel

Implement cauchy_kernel_float32() using AVX2/NEON intrinsics for the
Cauchy kernel calculation: strength = fmax / (1 + (d/lambda)^2)

Unlike exp/normal/tdist kernels, Cauchy uses only basic arithmetic
operations (multiply, divide, add) so no SLEEF functions are needed.

Benchmarks show 18% speedup vs original (40s -> 33s).
---
 core/interaction_type.cpp |  9 ++-----
 eidos/eidos_simd.h        | 49 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 7 deletions(-)

diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp
index a2fad103..a010b32d 100755
--- a/core/interaction_type.cpp
+++ b/core/interaction_type.cpp
@@ -2939,13 +2939,8 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind
 			}
 			case SpatialKernelType::kCauchy:
 			{
-				for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter)
-				{
-					sv_value_t distance = values[col_iter];
-					double temp = distance / if_param2_;
-					
-					values[col_iter] = (sv_value_t)(if_param1_ / (1.0 + temp * temp));
-				}
+				// Use SIMD-optimized kernel: fmax = if_param1_, lambda = if_param2_
+				Eidos_SIMD::cauchy_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_);
 				break;
 			}
 			case SpatialKernelType::kStudentsT:
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
index b9e73e23..f3625ed5 100644
--- a/eidos/eidos_simd.h
+++ b/eidos/eidos_simd.h
@@ -674,6 +674,55 @@ inline void tdist_kernel_float32(float *distances, int64_t count, float fmax, fl
     }
 }
 
+// ---------------------
+// Cauchy Kernel: strength = fmax / (1 + (d/lambda)^2)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// Parameters: fmax = maximum strength, lambda = scale parameter
+inline void cauchy_kernel_float32(float *distances, int64_t count, float fmax, float lambda)
+{
+    int64_t i = 0;
+    float inv_lambda = 1.0f / lambda;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256 v_fmax = _mm256_set1_ps(fmax);
+    __m256 v_inv_lambda = _mm256_set1_ps(inv_lambda);
+    __m256 v_one = _mm256_set1_ps(1.0f);
+
+    for (; i + 8 <= count; i += 8)
+    {
+        __m256 v_dist = _mm256_loadu_ps(&distances[i]);
+        __m256 v_temp = _mm256_mul_ps(v_dist, v_inv_lambda);      // d/lambda
+        __m256 v_temp_sq = _mm256_mul_ps(v_temp, v_temp);         // (d/lambda)^2
+        __m256 v_denom = _mm256_add_ps(v_one, v_temp_sq);         // 1 + (d/lambda)^2
+        __m256 v_result = _mm256_div_ps(v_fmax, v_denom);         // fmax / denom
+        _mm256_storeu_ps(&distances[i], v_result);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    float32x4_t v_fmax = vdupq_n_f32(fmax);
+    float32x4_t v_inv_lambda = vdupq_n_f32(inv_lambda);
+    float32x4_t v_one = vdupq_n_f32(1.0f);
+
+    for (; i + 4 <= count; i += 4)
+    {
+        float32x4_t v_dist = vld1q_f32(&distances[i]);
+        float32x4_t v_temp = vmulq_f32(v_dist, v_inv_lambda);
+        float32x4_t v_temp_sq = vmulq_f32(v_temp, v_temp);
+        float32x4_t v_denom = vaddq_f32(v_one, v_temp_sq);
+        float32x4_t v_result = vdivq_f32(v_fmax, v_denom);
+        vst1q_f32(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        float d = distances[i];
+        float temp = d * inv_lambda;
+        distances[i] = fmax / (1.0f + temp * temp);
+    }
+}
+
 } // namespace Eidos_SIMD
 
 #endif /* eidos_simd_h */

From 7e1c3f0a20a5fa1cc10d4924503deeada362b6a6 Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 16 Dec 2025 20:36:54 -0800
Subject: [PATCH 5/7] Add SIMD acceleration for Linear spatial interaction
 kernel

Implement linear_kernel_float32() using AVX2/NEON intrinsics for the
Linear kernel calculation: strength = fmax * (1 - d / max_distance)

Rewritten as: strength = fmax - d * (fmax / max_distance)

Simple arithmetic (multiply + subtract) so gains are modest (~2%),
but provides consistency with other SIMD-optimized kernels.
---
 core/interaction_type.cpp |  8 ++------
 eidos/eidos_simd.h        | 42 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp
index a010b32d..8515767c 100755
--- a/core/interaction_type.cpp
+++ b/core/interaction_type.cpp
@@ -2917,12 +2917,8 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind
 			}
 			case SpatialKernelType::kLinear:
 			{
-				for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter)
-				{
-					sv_value_t distance = values[col_iter];
-					
-					values[col_iter] = (sv_value_t)(if_param1_ * (1.0 - distance / max_distance_));
-				}
+				// Use SIMD-optimized kernel: fmax = if_param1_, max_distance = max_distance_
+				Eidos_SIMD::linear_kernel_float32(values, nnz, (float)if_param1_, (float)max_distance_);
 				break;
 			}
 			case SpatialKernelType::kExponential:
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
index f3625ed5..4c142dd3 100644
--- a/eidos/eidos_simd.h
+++ b/eidos/eidos_simd.h
@@ -723,6 +723,48 @@ inline void cauchy_kernel_float32(float *distances, int64_t count, float fmax, f
     }
 }
 
+// ---------------------
+// Linear Kernel: strength = fmax * (1 - d / max_distance)
+// ---------------------
+// Operates in-place on a distance array, transforming distances to strengths.
+// Parameters: fmax = maximum strength, max_distance = interaction max distance
+inline void linear_kernel_float32(float *distances, int64_t count, float fmax, float max_distance)
+{
+    int64_t i = 0;
+    float fmax_over_maxdist = fmax / max_distance;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256 v_fmax = _mm256_set1_ps(fmax);
+    __m256 v_fmax_over_maxdist = _mm256_set1_ps(fmax_over_maxdist);
+
+    for (; i + 8 <= count; i += 8)
+    {
+        __m256 v_dist = _mm256_loadu_ps(&distances[i]);
+        // fmax - d * (fmax / max_distance) = fmax * (1 - d/max_distance)
+        __m256 v_term = _mm256_mul_ps(v_dist, v_fmax_over_maxdist);
+        __m256 v_result = _mm256_sub_ps(v_fmax, v_term);
+        _mm256_storeu_ps(&distances[i], v_result);
+    }
+#elif defined(EIDOS_HAS_NEON)
+    float32x4_t v_fmax = vdupq_n_f32(fmax);
+    float32x4_t v_fmax_over_maxdist = vdupq_n_f32(fmax_over_maxdist);
+
+    for (; i + 4 <= count; i += 4)
+    {
+        float32x4_t v_dist = vld1q_f32(&distances[i]);
+        float32x4_t v_term = vmulq_f32(v_dist, v_fmax_over_maxdist);
+        float32x4_t v_result = vsubq_f32(v_fmax, v_term);
+        vst1q_f32(&distances[i], v_result);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+    {
+        distances[i] = fmax - distances[i] * fmax_over_maxdist;
+    }
+}
+
 } // namespace Eidos_SIMD
 
 #endif /* eidos_simd_h */

From e4962d7c4a4c6ff9c645aa656c83451ca406aa9a Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 16 Dec 2025 20:44:21 -0800
Subject: [PATCH 6/7] Replace benchmark with comprehensive all-kernels version

Remove sleef_benchmark_spatial_interaction.slim (only tested Gaussian/Exponential)
and add benchmark_all_kernels.slim which tests all 6 kernel types:
Fixed, Linear, Exponential, Normal, Cauchy, and Student's T.
---
 simd_benchmarks/benchmark_all_kernels.slim    | 131 ++++++++++++++++++
 .../sleef_benchmark_spatial_interaction.slim  |  95 -------------
 2 files changed, 131 insertions(+), 95 deletions(-)
 create mode 100644 simd_benchmarks/benchmark_all_kernels.slim
 delete mode 100644 simd_benchmarks/sleef_benchmark_spatial_interaction.slim

diff --git a/simd_benchmarks/benchmark_all_kernels.slim b/simd_benchmarks/benchmark_all_kernels.slim
new file mode 100644
index 00000000..613c69ce
--- /dev/null
+++ b/simd_benchmarks/benchmark_all_kernels.slim
@@ -0,0 +1,131 @@
+// Comprehensive benchmark for ALL spatial interaction kernel types
+// Tests: Fixed, Linear, Exponential, Normal, Cauchy, Student's T
+
+initialize() {
+    initializeSLiMModelType("WF");
+    initializeSLiMOptions(dimensionality="xy");
+
+    defineConstant("POP_SIZE", 50000);
+    defineConstant("W", 25.0);   // Medium density (~2262 neighbors with 50k)
+    defineConstant("SI", 1.0);
+    defineConstant("GENS", 30);
+
+    initializeMutationRate(0);
+    initializeRecombinationRate(0);
+    initializeMutationType("m1", 0.5, "f", 0.0);
+    initializeGenomicElementType("g1", m1, 1.0);
+    initializeGenomicElement(g1, 0, 99);
+
+    // 1. Fixed kernel: strength = fmax (constant)
+    initializeInteractionType(1, "xy", reciprocal=T, maxDistance=SI * 3);
+    i1.setInteractionFunction("f", 1.0);
+
+    // 2. Linear kernel: strength = fmax * (1 - d/maxDist)
+    initializeInteractionType(2, "xy", reciprocal=T, maxDistance=SI * 3);
+    i2.setInteractionFunction("l", 1.0);
+
+    // 3. Exponential kernel: strength = fmax * exp(-lambda * d)
+    initializeInteractionType(3, "xy", reciprocal=T, maxDistance=SI * 3);
+    i3.setInteractionFunction("e", 1.0, 1.0/SI);
+
+    // 4. Normal/Gaussian kernel: strength = fmax * exp(-d^2 / 2*sigma^2)
+    initializeInteractionType(4, "xy", reciprocal=T, maxDistance=SI * 3);
+    i4.setInteractionFunction("n", 1.0, SI);
+
+    // 5. Cauchy kernel: strength = fmax / (1 + (d/scale)^2)
+    initializeInteractionType(5, "xy", reciprocal=T, maxDistance=SI * 3);
+    i5.setInteractionFunction("c", 1.0, SI);
+
+    // 6. Student's T kernel
+    initializeInteractionType(6, "xy", reciprocal=T, maxDistance=SI * 3);
+    i6.setInteractionFunction("t", 1.0, 5.0, SI);  // fmax, nu, scale
+}
+
+1 early() {
+    sim.addSubpop("p1", POP_SIZE);
+    p1.setSpatialBounds(c(0, 0, W, W));
+
+    for (ind in p1.individuals) {
+        ind.setSpatialPosition(p1.pointUniform());
+    }
+
+    catn("================================================");
+    catn("ALL KERNELS BENCHMARK (2D)");
+    catn("================================================");
+    catn("Population: " + POP_SIZE + " | World: " + W + "x" + W);
+    density = POP_SIZE / (W^2);
+    avg_neighbors = density * PI * (SI*3)^2;
+    catn("Est. neighbors/ind: ~" + format("%.0f", avg_neighbors));
+    catn("Generations: " + GENS);
+    catn("------------------------------------------------");
+
+    defineGlobal("start_time", clock());
+    defineGlobal("fixed_time", 0.0);
+    defineGlobal("linear_time", 0.0);
+    defineGlobal("exp_time", 0.0);
+    defineGlobal("normal_time", 0.0);
+    defineGlobal("cauchy_time", 0.0);
+    defineGlobal("studentst_time", 0.0);
+}
+
+early() {
+    // Fixed
+    t0 = clock();
+    i1.evaluate(p1);
+    i1.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("fixed_time", fixed_time + (clock() - t0));
+
+    // Linear
+    t0 = clock();
+    i2.evaluate(p1);
+    i2.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("linear_time", linear_time + (clock() - t0));
+
+    // Exponential
+    t0 = clock();
+    i3.evaluate(p1);
+    i3.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("exp_time", exp_time + (clock() - t0));
+
+    // Normal
+    t0 = clock();
+    i4.evaluate(p1);
+    i4.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("normal_time", normal_time + (clock() - t0));
+
+    // Cauchy
+    t0 = clock();
+    i5.evaluate(p1);
+    i5.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("cauchy_time", cauchy_time + (clock() - t0));
+
+    // Student's T
+    t0 = clock();
+    i6.evaluate(p1);
+    i6.totalOfNeighborStrengths(p1.individuals);
+    defineGlobal("studentst_time", studentst_time + (clock() - t0));
+}
+
+late() {
+    for (ind in p1.individuals) {
+        ind.setSpatialPosition(p1.pointUniform());
+    }
+}
+
+30 late() {
+    elapsed = clock() - start_time;
+
+    catn("\n================================================");
+    catn("RESULTS");
+    catn("================================================");
+    catn("Fixed:      " + format("%6.3f", fixed_time) + " s");
+    catn("Linear:     " + format("%6.3f", linear_time) + " s");
+    catn("Exponential:" + format("%6.3f", exp_time) + " s");
+    catn("Normal:     " + format("%6.3f", normal_time) + " s");
+    catn("Cauchy:     " + format("%6.3f", cauchy_time) + " s");
+    catn("Student's T:" + format("%6.3f", studentst_time) + " s");
+    catn("------------------------------------------------");
+    catn("TOTAL:      " + format("%6.3f", elapsed) + " s");
+    catn("================================================");
+    sim.simulationFinished();
+}
diff --git a/simd_benchmarks/sleef_benchmark_spatial_interaction.slim b/simd_benchmarks/sleef_benchmark_spatial_interaction.slim
deleted file mode 100644
index f5942e05..00000000
--- a/simd_benchmarks/sleef_benchmark_spatial_interaction.slim
+++ /dev/null
@@ -1,95 +0,0 @@
-// Benchmark for SIMD-accelerated spatial interaction kernels
-// Tests Gaussian (normal) and Exponential kernel performance
-// These kernels use exp() internally, now SIMD-accelerated via SLEEF
-//
-// Run with SIMD: ./build/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim
-// Run without SIMD: ./build_no_simd/slim simd_benchmarks/sleef_benchmark_spatial_interaction.slim
-
-initialize() {
-    initializeSLiMModelType("WF");
-    initializeSLiMOptions(dimensionality="xy");
-
-    // Large population for intensive benchmarking
-    defineConstant("POP_SIZE", 5000);
-    defineConstant("W", 10.0);
-    defineConstant("SI", 1.0);  // larger interaction distance = more neighbors
-    defineConstant("GENS", 30);
-
-    initializeMutationRate(0);
-    initializeRecombinationRate(0);
-    initializeMutationType("m1", 0.5, "f", 0.0);
-    initializeGenomicElementType("g1", m1, 1.0);
-    initializeGenomicElement(g1, 0, 99);
-
-    // Gaussian kernel: strength = fmax * exp(-distance^2 / 2*sigma^2)
-    initializeInteractionType(1, "xy", reciprocal=T, maxDistance=SI * 3);
-    i1.setInteractionFunction("n", 1.0, SI);
-
-    // Exponential kernel: strength = fmax * exp(-lambda * distance)
-    initializeInteractionType(2, "xy", reciprocal=T, maxDistance=SI * 3);
-    i2.setInteractionFunction("e", 1.0, 1.0/SI);
-}
-
-1 early() {
-    sim.addSubpop("p1", POP_SIZE);
-    p1.setSpatialBounds(c(0, 0, W, W));
-
-    for (ind in p1.individuals) {
-        ind.setSpatialPosition(p1.pointUniform());
-    }
-
-    catn("========================================");
-    catn("Spatial Interaction Kernel Benchmark");
-    catn("========================================");
-    catn("Population: " + POP_SIZE);
-    catn("World: " + W + "x" + W);
-    catn("Interaction distance: " + SI);
-    catn("Generations: " + GENS);
-
-    // Estimate neighbors
-    density = POP_SIZE / (W * W);
-    avg_neighbors = density * PI * (SI*3)^2;
-    catn("Est. neighbors/ind: ~" + format("%.0f", avg_neighbors));
-    catn("----------------------------------------");
-
-    defineGlobal("start_time", clock());
-    defineGlobal("gaussian_time", 0.0);
-    defineGlobal("exp_time", 0.0);
-}
-
-early() {
-    // Gaussian kernel
-    t0 = clock();
-    i1.evaluate(p1);
-    s1 = i1.totalOfNeighborStrengths(p1.individuals);
-    defineGlobal("gaussian_time", gaussian_time + (clock() - t0));
-
-    // Exponential kernel
-    t1 = clock();
-    i2.evaluate(p1);
-    s2 = i2.totalOfNeighborStrengths(p1.individuals);
-    defineGlobal("exp_time", exp_time + (clock() - t1));
-
-    // Use strengths for fitness
-    p1.individuals.fitnessScaling = 1.0 / (1 + 0.001 * (s1 + s2));
-}
-
-late() {
-    // Reshuffle positions each generation
-    for (ind in p1.individuals) {
-        ind.setSpatialPosition(p1.pointUniform());
-    }
-}
-
-30 late() {
-    elapsed = clock() - start_time;
-
-    catn("\n========================================");
-    catn("RESULTS (SIMD spatial interaction kernels)");
-    catn("========================================");
-    catn("Total time: " + format("%.3f", elapsed) + " s");
-    catn("Gaussian kernel: " + format("%.3f", gaussian_time) + " s");
-    catn("Exponential kernel: " + format("%.3f", exp_time) + " s");
-    catn("========================================");
-    sim.simulationFinished();
-}

From 5baabe7192d970da6b0e310c24465f9b6ec19bdd Mon Sep 17 00:00:00 2001
From: Andrew Kern <adkern@uoregon.edu>
Date: Tue, 16 Dec 2025 20:47:07 -0800
Subject: [PATCH 7/7] Update README with spatial interaction kernel benchmark
 documentation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add documentation for benchmark_all_kernels.slim script including:
- Entry in Contents section describing the 6 kernel types tested
- Performance results table showing SIMD speedups on AVX2
- Usage instructions for running with SIMD vs scalar builds
- Notes on adjusting neighbor density via W parameter

🤖 Generated with [Claude Code](https://claude.com/claude-code)
---
 simd_benchmarks/README.md | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md
index 3430d201..1e37f423 100644
--- a/simd_benchmarks/README.md
+++ b/simd_benchmarks/README.md
@@ -12,6 +12,8 @@ This directory contains benchmark scripts used during the development of SIMD op
 
 - **`dnorm_benchmark.eidos`** - Eidos script that benchmarks the SIMD-optimized `dnorm()` function with singleton and vector mu/sigma parameters.
 
+- **`benchmark_all_kernels.slim`** - SLiM script that benchmarks all 6 SIMD-optimized spatial interaction kernel types (Fixed, Linear, Exponential, Normal, Cauchy, Student's T).
+
 - **`SIMD_BUILD_FLAGS.md`** - Documentation on how SIMD and SLEEF build flags are set and interact.
 
 For SLEEF header generation scripts and documentation, see `eidos/sleef/`.
@@ -114,4 +116,39 @@ mkdir build && cd build && cmake .. && make eidos
 # Build without SIMD for comparison
 mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make eidos
 ./eidos ../simd_benchmarks/dnorm_benchmark.eidos
-```
\ No newline at end of file
+```
+
+## Spatial Interaction Kernel Benchmark Results
+
+The `benchmark_all_kernels.slim` script tests SIMD-optimized spatial interaction kernels. All kernel types except Fixed use a two-pass approach (build distances, then batch transform) enabling SIMD vectorization via SLEEF.
+
+Results on x86_64 with AVX2 (50,000 individuals, ~2,262 neighbors per individual, 30 generations):
+
+| Kernel | Original | SIMD | Speedup |
+|--------|----------|------|---------|
+| Fixed | 31.97s | 31.36s | 1.02x |
+| Linear | 37.26s | 32.95s | **1.13x** |
+| Exponential | 59.58s | 34.88s | **1.71x** |
+| Normal | 56.37s | 35.15s | **1.60x** |
+| Cauchy | 40.04s | 33.00s | **1.21x** |
+| Student's T | 130.10s | 49.76s | **2.61x** |
+| **Total** | **356.04s** | **217.80s** | **1.64x** |
+
+The biggest gains come from kernels using transcendental functions:
+- **Student's T** (2.61x): Uses `pow()` via SLEEF
+- **Exponential/Normal** (1.6-1.7x): Use `exp()` via SLEEF
+- **Linear/Cauchy** (~1.1-1.2x): Simple arithmetic, modest gains from explicit SIMD
+- **Fixed** (1.02x): Uses special-case single-pass path (no benefit from two-pass)
+
+To run this benchmark:
+```bash
+# Build with SIMD
+mkdir build && cd build && cmake .. && make slim
+./slim -s 42 ../simd_benchmarks/benchmark_all_kernels.slim
+
+# Build without SIMD for comparison
+mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make slim
+./slim -s 42 ../simd_benchmarks/benchmark_all_kernels.slim
+```
+
+Adjust `W` in the script to change neighbor density (W=25 for ~2200 neighbors, W=266 for ~20 neighbors).
\ No newline at end of file