diff --git a/core/interaction_type.cpp b/core/interaction_type.cpp index a3e77c3c..8515767c 100755 --- a/core/interaction_type.cpp +++ b/core/interaction_type.cpp @@ -27,6 +27,7 @@ #include "species.h" #include "slim_globals.h" #include "sparse_vector.h" +#include "eidos_simd.h" #include #include @@ -2846,12 +2847,26 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind // Figure out what index in the exerter subpopulation, if any, needs to be excluded so self-interaction is zero slim_popsize_t excluded_index = (exerter_subpop == receiver->subpopulation_) ? receiver->index_ : -1; - // We special-case some builds directly to strength values here, for efficiency, - // with no callbacks and spatiality "xy". + // We special-case Fixed kernel builds directly to strength values here, for efficiency, + // with no callbacks and spatiality "xy". Other kernels use the two-pass path below + // which enables SIMD optimizations for Exponential and Normal kernels. + // ADK 12/16/2025: changed to only use special-case path for Fixed kernel + if ((interaction_callbacks.size() == 0) && (spatiality_ == 2) && (if_type_ == SpatialKernelType::kFixed)) + { + sv->SetDataType(SparseVectorDataType::kStrengths); + BuildSV_Strengths_f_2(kd_root, receiver_position, excluded_index, sv, 0); + sv->Finished(); + return; + } + + // ADK 12/16/2025: The original switch below handled all kernel types in the special-case + // single-pass path. Now only Fixed uses the special path above; other kernels fall + // through to the two-pass distance-then-transform path, enabling SIMD optimizations. +#if 0 if ((interaction_callbacks.size() == 0) && (spatiality_ == 2)) { sv->SetDataType(SparseVectorDataType::kStrengths); - + switch (if_type_) { case SpatialKernelType::kFixed: BuildSV_Strengths_f_2(kd_root, receiver_position, excluded_index, sv, 0); break; @@ -2863,11 +2878,12 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind default: EIDOS_TERMINATION << "ERROR (InteractionType::FillSparseVectorForReceiverStrengths): (internal error) unoptimized SpatialKernelType value." << EidosTerminate(); } - + sv->Finished(); return; } - +#endif + // Set up to build distances first; this is an internal implementation detail, so we require the sparse vector set up for strengths above sv->SetDataType(SparseVectorDataType::kDistances); @@ -2901,53 +2917,32 @@ void InteractionType::FillSparseVectorForReceiverStrengths(SparseVector *sv, Ind } case SpatialKernelType::kLinear: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)(if_param1_ * (1.0 - distance / max_distance_)); - } + // Use SIMD-optimized kernel: fmax = if_param1_, max_distance = max_distance_ + Eidos_SIMD::linear_kernel_float32(values, nnz, (float)if_param1_, (float)max_distance_); break; } case SpatialKernelType::kExponential: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)(if_param1_ * exp(-if_param2_ * distance)); - } + // SIMD-accelerated exponential kernel: strength = fmax * exp(-lambda * distance) + Eidos_SIMD::exp_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_); break; } case SpatialKernelType::kNormal: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)(if_param1_ * exp(-(distance * distance) / n_2param2sq_)); - } + // SIMD-accelerated Gaussian kernel: strength = fmax * exp(-d^2 / 2sigma^2) + Eidos_SIMD::normal_kernel_float32(values, nnz, (float)if_param1_, (float)n_2param2sq_); break; } case SpatialKernelType::kCauchy: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - double temp = distance / if_param2_; - - values[col_iter] = (sv_value_t)(if_param1_ / (1.0 + temp * temp)); - } + // Use SIMD-optimized kernel: fmax = if_param1_, lambda = if_param2_ + Eidos_SIMD::cauchy_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_); break; } case SpatialKernelType::kStudentsT: { - for (uint32_t col_iter = 0; col_iter < nnz; ++col_iter) - { - sv_value_t distance = values[col_iter]; - - values[col_iter] = (sv_value_t)SpatialKernel::tdist(distance, if_param1_, if_param2_, if_param3_); - } + // Use SIMD-optimized kernel: fmax = if_param1_, nu = if_param2_, tau = if_param3_ + Eidos_SIMD::tdist_kernel_float32(values, nnz, (float)if_param1_, (float)if_param2_, (float)if_param3_); break; } default: diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h index 208c0992..4c142dd3 100644 --- a/eidos/eidos_simd.h +++ b/eidos/eidos_simd.h @@ -467,6 +467,304 @@ inline double product_float64(const double *input, int64_t count) return prod; } +// ================================ +// Float (Single-Precision) SIMD Operations +// ================================ +// These functions operate on arrays of floats, used by spatial interaction kernels. + +// --------------------- +// Exponential: exp(x) for floats +// --------------------- +inline void exp_float32(const float *input, float *output, int64_t count) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v = EIDOS_SLEEF_LOAD_F(&input[i]); + EIDOS_SLEEF_TYPE_F r = EIDOS_SLEEF_EXP_F(v); + EIDOS_SLEEF_STORE_F(&output[i], r); + } +#endif + + // Scalar remainder + for (; i < count; i++) + output[i] = std::exp(input[i]); +} + +// --------------------- +// Exponential Kernel: strength = fmax * exp(-lambda * distance) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// This is optimized for the spatial interaction kernel calculation. +inline void exp_kernel_float32(float *distances, int64_t count, float fmax, float lambda) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + // We need to compute: fmax * exp(-lambda * distance) + // First, compute -lambda * distance for all values, then exp, then multiply by fmax + EIDOS_SLEEF_TYPE_F v_fmax = +#if defined(EIDOS_HAS_AVX2) + _mm256_set1_ps(fmax); + EIDOS_SLEEF_TYPE_F v_neg_lambda = _mm256_set1_ps(-lambda); +#elif defined(EIDOS_HAS_NEON) + vdupq_n_f32(fmax); + EIDOS_SLEEF_TYPE_F v_neg_lambda = vdupq_n_f32(-lambda); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute -lambda * distance +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_neg_lambda, v_dist); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_neg_lambda, v_dist); +#endif + + // Compute exp(-lambda * distance) + EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); + + // Compute fmax * exp(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + distances[i] = fmax * std::exp(-lambda * distances[i]); +} + +// --------------------- +// Normal (Gaussian) Kernel: strength = fmax * exp(-distance^2 / 2sigma^2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// The two_sigma_sq parameter is pre-computed as 2 * sigma^2 for efficiency. +inline void normal_kernel_float32(float *distances, int64_t count, float fmax, float two_sigma_sq) +{ + int64_t i = 0; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + // We need to compute: fmax * exp(-distance^2 / two_sigma_sq) + EIDOS_SLEEF_TYPE_F v_fmax = +#if defined(EIDOS_HAS_AVX2) + _mm256_set1_ps(fmax); + EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = _mm256_set1_ps(-1.0f / two_sigma_sq); +#elif defined(EIDOS_HAS_NEON) + vdupq_n_f32(fmax); + EIDOS_SLEEF_TYPE_F v_neg_inv_2sigsq = vdupq_n_f32(-1.0f / two_sigma_sq); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute distance^2 +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_dist_sq = _mm256_mul_ps(v_dist, v_dist); + // Compute -distance^2 / 2sigma^2 + EIDOS_SLEEF_TYPE_F v_arg = _mm256_mul_ps(v_dist_sq, v_neg_inv_2sigsq); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_dist_sq = vmulq_f32(v_dist, v_dist); + // Compute -distance^2 / 2sigma^2 + EIDOS_SLEEF_TYPE_F v_arg = vmulq_f32(v_dist_sq, v_neg_inv_2sigsq); +#endif + + // Compute exp(-distance^2 / 2sigma^2) + EIDOS_SLEEF_TYPE_F v_exp = EIDOS_SLEEF_EXP_F(v_arg); + + // Compute fmax * exp(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_exp); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_exp); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + distances[i] = fmax * std::exp(-(d * d) / two_sigma_sq); + } +} + +// --------------------- +// Student's T Kernel: strength = fmax / pow(1 + (d/tau)^2 / nu, (nu+1)/2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, nu = degrees of freedom, tau = scale +inline void tdist_kernel_float32(float *distances, int64_t count, float fmax, float nu, float tau) +{ + int64_t i = 0; + + // Pre-compute constants + float inv_tau = 1.0f / tau; + float inv_nu = 1.0f / nu; + float exponent = (nu + 1.0f) / 2.0f; + +#if EIDOS_SLEEF_FLOAT_AVAILABLE + EIDOS_SLEEF_TYPE_F v_fmax, v_inv_tau, v_inv_nu, v_exponent, v_one; +#if defined(EIDOS_HAS_AVX2) + v_fmax = _mm256_set1_ps(fmax); + v_inv_tau = _mm256_set1_ps(inv_tau); + v_inv_nu = _mm256_set1_ps(inv_nu); + v_exponent = _mm256_set1_ps(-exponent); + v_one = _mm256_set1_ps(1.0f); +#elif defined(EIDOS_HAS_NEON) + v_fmax = vdupq_n_f32(fmax); + v_inv_tau = vdupq_n_f32(inv_tau); + v_inv_nu = vdupq_n_f32(inv_nu); + v_exponent = vdupq_n_f32(-exponent); + v_one = vdupq_n_f32(1.0f); +#endif + + for (; i + EIDOS_SLEEF_VEC_SIZE_F <= count; i += EIDOS_SLEEF_VEC_SIZE_F) + { + EIDOS_SLEEF_TYPE_F v_dist = EIDOS_SLEEF_LOAD_F(&distances[i]); + + // Compute (d / tau) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_d_over_tau = _mm256_mul_ps(v_dist, v_inv_tau); + // Compute (d/tau)^2 + EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = _mm256_mul_ps(v_d_over_tau, v_d_over_tau); + // Compute (d/tau)^2 / nu + EIDOS_SLEEF_TYPE_F v_term = _mm256_mul_ps(v_d_over_tau_sq, v_inv_nu); + // Compute 1 + (d/tau)^2 / nu + EIDOS_SLEEF_TYPE_F v_base = _mm256_add_ps(v_one, v_term); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_d_over_tau = vmulq_f32(v_dist, v_inv_tau); + EIDOS_SLEEF_TYPE_F v_d_over_tau_sq = vmulq_f32(v_d_over_tau, v_d_over_tau); + EIDOS_SLEEF_TYPE_F v_term = vmulq_f32(v_d_over_tau_sq, v_inv_nu); + EIDOS_SLEEF_TYPE_F v_base = vaddq_f32(v_one, v_term); +#endif + + // Compute pow(base, -exponent) = 1 / pow(base, exponent) + EIDOS_SLEEF_TYPE_F v_pow = EIDOS_SLEEF_POW_F(v_base, v_exponent); + + // Compute fmax * pow(...) +#if defined(EIDOS_HAS_AVX2) + EIDOS_SLEEF_TYPE_F v_result = _mm256_mul_ps(v_fmax, v_pow); +#elif defined(EIDOS_HAS_NEON) + EIDOS_SLEEF_TYPE_F v_result = vmulq_f32(v_fmax, v_pow); +#endif + + EIDOS_SLEEF_STORE_F(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + float d_over_tau = d * inv_tau; + distances[i] = fmax * std::pow(1.0f + d_over_tau * d_over_tau * inv_nu, -exponent); + } +} + +// --------------------- +// Cauchy Kernel: strength = fmax / (1 + (d/lambda)^2) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, lambda = scale parameter +inline void cauchy_kernel_float32(float *distances, int64_t count, float fmax, float lambda) +{ + int64_t i = 0; + float inv_lambda = 1.0f / lambda; + +#if defined(EIDOS_HAS_AVX2) + __m256 v_fmax = _mm256_set1_ps(fmax); + __m256 v_inv_lambda = _mm256_set1_ps(inv_lambda); + __m256 v_one = _mm256_set1_ps(1.0f); + + for (; i + 8 <= count; i += 8) + { + __m256 v_dist = _mm256_loadu_ps(&distances[i]); + __m256 v_temp = _mm256_mul_ps(v_dist, v_inv_lambda); // d/lambda + __m256 v_temp_sq = _mm256_mul_ps(v_temp, v_temp); // (d/lambda)^2 + __m256 v_denom = _mm256_add_ps(v_one, v_temp_sq); // 1 + (d/lambda)^2 + __m256 v_result = _mm256_div_ps(v_fmax, v_denom); // fmax / denom + _mm256_storeu_ps(&distances[i], v_result); + } +#elif defined(EIDOS_HAS_NEON) + float32x4_t v_fmax = vdupq_n_f32(fmax); + float32x4_t v_inv_lambda = vdupq_n_f32(inv_lambda); + float32x4_t v_one = vdupq_n_f32(1.0f); + + for (; i + 4 <= count; i += 4) + { + float32x4_t v_dist = vld1q_f32(&distances[i]); + float32x4_t v_temp = vmulq_f32(v_dist, v_inv_lambda); + float32x4_t v_temp_sq = vmulq_f32(v_temp, v_temp); + float32x4_t v_denom = vaddq_f32(v_one, v_temp_sq); + float32x4_t v_result = vdivq_f32(v_fmax, v_denom); + vst1q_f32(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + float d = distances[i]; + float temp = d * inv_lambda; + distances[i] = fmax / (1.0f + temp * temp); + } +} + +// --------------------- +// Linear Kernel: strength = fmax * (1 - d / max_distance) +// --------------------- +// Operates in-place on a distance array, transforming distances to strengths. +// Parameters: fmax = maximum strength, max_distance = interaction max distance +inline void linear_kernel_float32(float *distances, int64_t count, float fmax, float max_distance) +{ + int64_t i = 0; + float fmax_over_maxdist = fmax / max_distance; + +#if defined(EIDOS_HAS_AVX2) + __m256 v_fmax = _mm256_set1_ps(fmax); + __m256 v_fmax_over_maxdist = _mm256_set1_ps(fmax_over_maxdist); + + for (; i + 8 <= count; i += 8) + { + __m256 v_dist = _mm256_loadu_ps(&distances[i]); + // fmax - d * (fmax / max_distance) = fmax * (1 - d/max_distance) + __m256 v_term = _mm256_mul_ps(v_dist, v_fmax_over_maxdist); + __m256 v_result = _mm256_sub_ps(v_fmax, v_term); + _mm256_storeu_ps(&distances[i], v_result); + } +#elif defined(EIDOS_HAS_NEON) + float32x4_t v_fmax = vdupq_n_f32(fmax); + float32x4_t v_fmax_over_maxdist = vdupq_n_f32(fmax_over_maxdist); + + for (; i + 4 <= count; i += 4) + { + float32x4_t v_dist = vld1q_f32(&distances[i]); + float32x4_t v_term = vmulq_f32(v_dist, v_fmax_over_maxdist); + float32x4_t v_result = vsubq_f32(v_fmax, v_term); + vst1q_f32(&distances[i], v_result); + } +#endif + + // Scalar remainder + for (; i < count; i++) + { + distances[i] = fmax - distances[i] * fmax_over_maxdist; + } +} + } // namespace Eidos_SIMD #endif /* eidos_simd_h */ diff --git a/eidos/sleef/sleef_config.h b/eidos/sleef/sleef_config.h index 230e34a1..6844afbc 100644 --- a/eidos/sleef/sleef_config.h +++ b/eidos/sleef/sleef_config.h @@ -62,6 +62,15 @@ #define EIDOS_SLEEF_LOG10_D(v) Sleef_log10d4_u10avx2(v) #define EIDOS_SLEEF_LOG2_D(v) Sleef_log2d4_u10avx2(v) + // Float (single-precision) support - 8 floats per AVX2 register + #define EIDOS_SLEEF_FLOAT_AVAILABLE 1 + #define EIDOS_SLEEF_VEC_SIZE_F 8 + #define EIDOS_SLEEF_TYPE_F __m256 + #define EIDOS_SLEEF_LOAD_F(ptr) _mm256_loadu_ps(ptr) + #define EIDOS_SLEEF_STORE_F(ptr, v) _mm256_storeu_ps(ptr, v) + #define EIDOS_SLEEF_EXP_F(v) Sleef_expf8_u10avx2(v) + #define EIDOS_SLEEF_POW_F(x, y) Sleef_powf8_u10avx2(x, y) + // ================================ // ARM NEON Configuration (ARM64) // ================================ @@ -85,12 +94,23 @@ #define EIDOS_SLEEF_LOG10_D(v) Sleef_log10d2_u10advsimd(v) #define EIDOS_SLEEF_LOG2_D(v) Sleef_log2d2_u10advsimd(v) + // Float (single-precision) support - 4 floats per NEON register + #define EIDOS_SLEEF_FLOAT_AVAILABLE 1 + #define EIDOS_SLEEF_VEC_SIZE_F 4 + #define EIDOS_SLEEF_TYPE_F float32x4_t + #define EIDOS_SLEEF_LOAD_F(ptr) vld1q_f32(ptr) + #define EIDOS_SLEEF_STORE_F(ptr, v) vst1q_f32(ptr, v) + #define EIDOS_SLEEF_EXP_F(v) Sleef_expf4_u10advsimd(v) + #define EIDOS_SLEEF_POW_F(x, y) Sleef_powf4_u10advsimd(x, y) + // ================================ // Scalar Fallback (SSE4.2-only or no SIMD) // ================================ #else #define EIDOS_SLEEF_AVAILABLE 0 #define EIDOS_SLEEF_VEC_SIZE 1 + #define EIDOS_SLEEF_FLOAT_AVAILABLE 0 + #define EIDOS_SLEEF_VEC_SIZE_F 1 #endif #else // EIDOS_SLEEF_AVAILABLE was defined externally diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md index 3430d201..1e37f423 100644 --- a/simd_benchmarks/README.md +++ b/simd_benchmarks/README.md @@ -12,6 +12,8 @@ This directory contains benchmark scripts used during the development of SIMD op - **`dnorm_benchmark.eidos`** - Eidos script that benchmarks the SIMD-optimized `dnorm()` function with singleton and vector mu/sigma parameters. +- **`benchmark_all_kernels.slim`** - SLiM script that benchmarks all 6 SIMD-optimized spatial interaction kernel types (Fixed, Linear, Exponential, Normal, Cauchy, Student's T). + - **`SIMD_BUILD_FLAGS.md`** - Documentation on how SIMD and SLEEF build flags are set and interact. For SLEEF header generation scripts and documentation, see `eidos/sleef/`. @@ -114,4 +116,39 @@ mkdir build && cd build && cmake .. && make eidos # Build without SIMD for comparison mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make eidos ./eidos ../simd_benchmarks/dnorm_benchmark.eidos -``` \ No newline at end of file +``` + +## Spatial Interaction Kernel Benchmark Results + +The `benchmark_all_kernels.slim` script tests SIMD-optimized spatial interaction kernels. All kernel types except Fixed use a two-pass approach (build distances, then batch transform) enabling SIMD vectorization via SLEEF. + +Results on x86_64 with AVX2 (50,000 individuals, ~2,262 neighbors per individual, 30 generations): + +| Kernel | Original | SIMD | Speedup | +|--------|----------|------|---------| +| Fixed | 31.97s | 31.36s | 1.02x | +| Linear | 37.26s | 32.95s | **1.13x** | +| Exponential | 59.58s | 34.88s | **1.71x** | +| Normal | 56.37s | 35.15s | **1.60x** | +| Cauchy | 40.04s | 33.00s | **1.21x** | +| Student's T | 130.10s | 49.76s | **2.61x** | +| **Total** | **356.04s** | **217.80s** | **1.64x** | + +The biggest gains come from kernels using transcendental functions: +- **Student's T** (2.61x): Uses `pow()` via SLEEF +- **Exponential/Normal** (1.6-1.7x): Use `exp()` via SLEEF +- **Linear/Cauchy** (~1.1-1.2x): Simple arithmetic, modest gains from explicit SIMD +- **Fixed** (1.02x): Uses special-case single-pass path (no benefit from two-pass) + +To run this benchmark: +```bash +# Build with SIMD +mkdir build && cd build && cmake .. && make slim +./slim -s 42 ../simd_benchmarks/benchmark_all_kernels.slim + +# Build without SIMD for comparison +mkdir build_nosimd && cd build_nosimd && cmake .. -DUSE_SIMD=OFF && make slim +./slim -s 42 ../simd_benchmarks/benchmark_all_kernels.slim +``` + +Adjust `W` in the script to change neighbor density (W=25 for ~2200 neighbors, W=266 for ~20 neighbors). \ No newline at end of file diff --git a/simd_benchmarks/benchmark_all_kernels.slim b/simd_benchmarks/benchmark_all_kernels.slim new file mode 100644 index 00000000..613c69ce --- /dev/null +++ b/simd_benchmarks/benchmark_all_kernels.slim @@ -0,0 +1,131 @@ +// Comprehensive benchmark for ALL spatial interaction kernel types +// Tests: Fixed, Linear, Exponential, Normal, Cauchy, Student's T + +initialize() { + initializeSLiMModelType("WF"); + initializeSLiMOptions(dimensionality="xy"); + + defineConstant("POP_SIZE", 50000); + defineConstant("W", 25.0); // Medium density (~2262 neighbors with 50k) + defineConstant("SI", 1.0); + defineConstant("GENS", 30); + + initializeMutationRate(0); + initializeRecombinationRate(0); + initializeMutationType("m1", 0.5, "f", 0.0); + initializeGenomicElementType("g1", m1, 1.0); + initializeGenomicElement(g1, 0, 99); + + // 1. Fixed kernel: strength = fmax (constant) + initializeInteractionType(1, "xy", reciprocal=T, maxDistance=SI * 3); + i1.setInteractionFunction("f", 1.0); + + // 2. Linear kernel: strength = fmax * (1 - d/maxDist) + initializeInteractionType(2, "xy", reciprocal=T, maxDistance=SI * 3); + i2.setInteractionFunction("l", 1.0); + + // 3. Exponential kernel: strength = fmax * exp(-lambda * d) + initializeInteractionType(3, "xy", reciprocal=T, maxDistance=SI * 3); + i3.setInteractionFunction("e", 1.0, 1.0/SI); + + // 4. Normal/Gaussian kernel: strength = fmax * exp(-d^2 / 2*sigma^2) + initializeInteractionType(4, "xy", reciprocal=T, maxDistance=SI * 3); + i4.setInteractionFunction("n", 1.0, SI); + + // 5. Cauchy kernel: strength = fmax / (1 + (d/scale)^2) + initializeInteractionType(5, "xy", reciprocal=T, maxDistance=SI * 3); + i5.setInteractionFunction("c", 1.0, SI); + + // 6. Student's T kernel + initializeInteractionType(6, "xy", reciprocal=T, maxDistance=SI * 3); + i6.setInteractionFunction("t", 1.0, 5.0, SI); // fmax, nu, scale +} + +1 early() { + sim.addSubpop("p1", POP_SIZE); + p1.setSpatialBounds(c(0, 0, W, W)); + + for (ind in p1.individuals) { + ind.setSpatialPosition(p1.pointUniform()); + } + + catn("================================================"); + catn("ALL KERNELS BENCHMARK (2D)"); + catn("================================================"); + catn("Population: " + POP_SIZE + " | World: " + W + "x" + W); + density = POP_SIZE / (W^2); + avg_neighbors = density * PI * (SI*3)^2; + catn("Est. neighbors/ind: ~" + format("%.0f", avg_neighbors)); + catn("Generations: " + GENS); + catn("------------------------------------------------"); + + defineGlobal("start_time", clock()); + defineGlobal("fixed_time", 0.0); + defineGlobal("linear_time", 0.0); + defineGlobal("exp_time", 0.0); + defineGlobal("normal_time", 0.0); + defineGlobal("cauchy_time", 0.0); + defineGlobal("studentst_time", 0.0); +} + +early() { + // Fixed + t0 = clock(); + i1.evaluate(p1); + i1.totalOfNeighborStrengths(p1.individuals); + defineGlobal("fixed_time", fixed_time + (clock() - t0)); + + // Linear + t0 = clock(); + i2.evaluate(p1); + i2.totalOfNeighborStrengths(p1.individuals); + defineGlobal("linear_time", linear_time + (clock() - t0)); + + // Exponential + t0 = clock(); + i3.evaluate(p1); + i3.totalOfNeighborStrengths(p1.individuals); + defineGlobal("exp_time", exp_time + (clock() - t0)); + + // Normal + t0 = clock(); + i4.evaluate(p1); + i4.totalOfNeighborStrengths(p1.individuals); + defineGlobal("normal_time", normal_time + (clock() - t0)); + + // Cauchy + t0 = clock(); + i5.evaluate(p1); + i5.totalOfNeighborStrengths(p1.individuals); + defineGlobal("cauchy_time", cauchy_time + (clock() - t0)); + + // Student's T + t0 = clock(); + i6.evaluate(p1); + i6.totalOfNeighborStrengths(p1.individuals); + defineGlobal("studentst_time", studentst_time + (clock() - t0)); +} + +late() { + for (ind in p1.individuals) { + ind.setSpatialPosition(p1.pointUniform()); + } +} + +30 late() { + elapsed = clock() - start_time; + + catn("\n================================================"); + catn("RESULTS"); + catn("================================================"); + catn("Fixed: " + format("%6.3f", fixed_time) + " s"); + catn("Linear: " + format("%6.3f", linear_time) + " s"); + catn("Exponential:" + format("%6.3f", exp_time) + " s"); + catn("Normal: " + format("%6.3f", normal_time) + " s"); + catn("Cauchy: " + format("%6.3f", cauchy_time) + " s"); + catn("Student's T:" + format("%6.3f", studentst_time) + " s"); + catn("------------------------------------------------"); + catn("TOTAL: " + format("%6.3f", elapsed) + " s"); + catn("================================================"); + sim.simulationFinished(); +}