From e4a2648f110ccc28c33dd20f6024b2dbf29fd4ec Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Wed, 26 Nov 2025 12:35:07 -0800
Subject: [PATCH 1/7] add SIMD vectorization for Eidos math functions

Adds compile-time SIMD detection (AVX2/SSE4.2/FMA) and vectorized
implementations for sqrt, abs, floor, ceil, round, trunc, sum, and
product. Benchmarks show 1.4-5.7x speedups on large float arrays.

- CMakeLists.txt: add USE_SIMD option and compiler flag detection
- eidos/eidos_simd.h: new header with SIMD intrinsic implementations
- eidos/eidos_functions_math.cpp: use SIMD paths when available
- eidos/eidos_test_*.{h,cpp}: use tolerance for float comparisons
---
 CMakeLists.txt                       |  38 +++
 eidos/eidos_functions_math.cpp       | 100 +++++---
 eidos/eidos_simd.h                   | 352 +++++++++++++++++++++++++++
 eidos/eidos_test_builtins.h          |   6 +-
 eidos/eidos_test_functions_other.cpp |   8 +-
 5 files changed, 461 insertions(+), 43 deletions(-)
 create mode 100644 eidos/eidos_simd.h
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9e94f8db..b85c4742 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -286,6 +286,44 @@ if(BUILD_LTO)
     endif()
 endif()
 
+#
+# SIMD SUPPORT (independent of OpenMP): defines EIDOS_HAS_AVX2 / EIDOS_HAS_FMA / EIDOS_HAS_SSE42 and appends the matching -m flags to CMAKE_CXX_FLAGS
+#
+
+# Option to disable SIMD entirely; with -DUSE_SIMD=OFF this block defines none of the EIDOS_HAS_* macros
+option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2)" ON)
+
+if(USE_SIMD AND NOT WIN32)
+    include(CheckCXXCompilerFlag)
+
+    # Probe which flags the compiler accepts (a compile-only test; nothing here checks the CPU that will run the binary)
+    check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
+    check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42)
+    check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA)
+
+    if(COMPILER_SUPPORTS_AVX2)  # AVX2 takes precedence; the SSE4.2 branch is used only when -mavx2 is rejected
+        message(STATUS "SIMD: AVX2 support detected")
+        add_compile_definitions(EIDOS_HAS_AVX2=1)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
+        if(COMPILER_SUPPORTS_FMA)  # FMA is only ever enabled together with AVX2
+            message(STATUS "SIMD: FMA support detected")
+            add_compile_definitions(EIDOS_HAS_FMA=1)
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
+        endif()
+    elseif(COMPILER_SUPPORTS_SSE42)
+        message(STATUS "SIMD: SSE4.2 support detected (no AVX2)")
+        add_compile_definitions(EIDOS_HAS_SSE42=1)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
+    else()
+        message(STATUS "SIMD: No SIMD support detected, using scalar fallback")
+    endif()
+elseif(USE_SIMD AND WIN32)
+    # Windows/MSVC detection not yet implemented; no EIDOS_HAS_* macro is defined on this path
+    message(STATUS "SIMD: Windows SIMD detection not yet implemented, using scalar fallback")
+else()
+    message(STATUS "SIMD: Disabled by user")
+endif()
+
 # GSL - adding /usr/local/include so all targets that use GSL_INCLUDES get omp.h
 set(TARGET_NAME_GSL gsl)
 file(GLOB_RECURSE GSL_SOURCES ${PROJECT_SOURCE_DIR}/gsl/*.c ${PROJECT_SOURCE_DIR}/gsl/*/*.c)
diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp
index d1335257..2a97dedb 100644
--- a/eidos/eidos_functions_math.cpp
+++ b/eidos/eidos_functions_math.cpp
@@ -19,6 +19,7 @@
 
 
 #include "eidos_functions.h"
+#include "eidos_simd.h"
 
 #include <utility>
 #include <string>
@@ -87,15 +88,19 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
 		EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
-		
+
+#ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT);
-#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
+		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
 			float_result_data[value_index] = fabs(float_data[value_index]);
+#else
+		Eidos_SIMD::abs_float64(float_data, float_result_data, x_count);
+#endif
 	}
-	
+
 	result_SP->CopyDimensionsFromValue(x_value);
-	
+
 	return result_SP;
 }
 
@@ -190,21 +195,25 @@ EidosValue_SP Eidos_ExecuteFunction_atan2(const std::vector<EidosValue_SP> &p_ar
 EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-	
+
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-	
+
+#ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL);
-#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
+	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)
 		float_result_data[value_index] = ceil(float_data[value_index]);
-	
+#else
+	Eidos_SIMD::ceil_float64(float_data, float_result_data, x_count);
+#endif
+
 	result_SP->CopyDimensionsFromValue(x_value);
-	
+
 	return result_SP;
 }
 
@@ -359,21 +368,25 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector<EidosValue_SP> &p_argu
 EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-	
+
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-	
+
+#ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR);
-#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
+	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)
 		float_result_data[value_index] = floor(float_data[value_index]);
-	
+#else
+	Eidos_SIMD::floor_float64(float_data, float_result_data, x_count);
+#endif
+
 	result_SP->CopyDimensionsFromValue(x_value);
-	
+
 	return result_SP;
 }
 
@@ -788,14 +801,11 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector<EidosValue_SP> &p_
 	else if (x_type == EidosValueType::kValueFloat)
 	{
 		const double *float_data = x_value->FloatData();
-		double product = 1;
-		
-		for (int value_index = 0; value_index < x_count; ++value_index)
-			product *= float_data[value_index];
-		
+		double product = Eidos_SIMD::product_float64(float_data, x_count);
+
 		result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(product));
 	}
-	
+
 	return result_SP;
 }
 
@@ -803,21 +813,25 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector<EidosValue_SP> &p_
 EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-	
+
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-	
+
+#ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND);
-#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
+	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)
 		float_result_data[value_index] = round(float_data[value_index]);
-	
+#else
+	Eidos_SIMD::round_float64(float_data, float_result_data, x_count);
+#endif
+
 	result_SP->CopyDimensionsFromValue(x_value);
-	
+
 	return result_SP;
 }
 
@@ -2426,15 +2440,19 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
 		EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
-		
+
+#ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT);
-#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
+		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
 			float_result_data[value_index] = sqrt(float_data[value_index]);
+#else
+		Eidos_SIMD::sqrt_float64(float_data, float_result_data, x_count);
+#endif
 	}
-	
+
 	result_SP->CopyDimensionsFromValue(x_value);
-	
+
 	return result_SP;
 }
 
@@ -2517,12 +2535,16 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
 	{
 		const double *float_data = x_value->FloatData();
 		double sum = 0;
-		
+
+#ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT);
-#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
+		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
 			sum += float_data[value_index];
-		
+#else
+		sum = Eidos_SIMD::sum_float64(float_data, x_count);
+#endif
+
 		result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(sum));
 	}
 	else if (x_type == EidosValueType::kValueLogical)
@@ -2587,21 +2609,25 @@ EidosValue_SP Eidos_ExecuteFunction_tan(const std::vector<EidosValue_SP> &p_argu
 EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-	
+
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-	
+
+#ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC);
-#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
+	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)
 		float_result_data[value_index] = trunc(float_data[value_index]);
-	
+#else
+	Eidos_SIMD::trunc_float64(float_data, float_result_data, x_count);
+#endif
+
 	result_SP->CopyDimensionsFromValue(x_value);
-	
+
 	return result_SP;
 }
 
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
new file mode 100644
index 00000000..63efbb1f
--- /dev/null
+++ b/eidos/eidos_simd.h
@@ -0,0 +1,352 @@
+//
+//  eidos_simd.h
+//  Eidos
+//
+//  Created by Ben Haller on 11/26/2024.
+//  Copyright (c) 2024-2025 Philipp Messer.  All rights reserved.
+//	A product of the Messer Lab, http://messerlab.org/slim/
+//
+
+//	This file is part of Eidos.
+//
+//	Eidos is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by
+//	the Free Software Foundation, either version 3 of the License, or (at your option) any later version.
+//
+//	Eidos is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
+//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+//
+//	You should have received a copy of the GNU General Public License along with Eidos.  If not, see <http://www.gnu.org/licenses/>.
+
+/*
+
+ SIMD acceleration for Eidos math operations, independent of OpenMP.
+
+ This header provides vectorized implementations of common math operations
+ using SSE4.2 or AVX2 intrinsics when available, with scalar fallbacks.
+
+ */
+
+#ifndef eidos_simd_h
+#define eidos_simd_h
+
+#include <cstdint>
+#include <cmath>
+
+// Determine SIMD capability level
+#if defined(EIDOS_HAS_AVX2)
+    #include <immintrin.h>
+    #define EIDOS_SIMD_WIDTH 4          // 4 doubles per AVX register
+    #define EIDOS_SIMD_FLOAT_WIDTH 8    // 8 floats per AVX register
+#elif defined(EIDOS_HAS_SSE42)
+    #include <emmintrin.h>
+    #include <smmintrin.h>
+    #define EIDOS_SIMD_WIDTH 2          // 2 doubles per SSE register
+    #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per SSE register
+#else
+    #define EIDOS_SIMD_WIDTH 1          // Scalar fallback
+    #define EIDOS_SIMD_FLOAT_WIDTH 1
+#endif
+
+// ================================
+// SIMD Vector Math Operations
+// ================================
+// These functions apply an operation to arrays of doubles.
+// They handle the loop, SIMD processing, and scalar remainder.
+
+namespace Eidos_SIMD {
+
+// ---------------------
+// Square Root: sqrt(x)
+// ---------------------
+inline void sqrt_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    // Process 4 doubles at a time
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_sqrt_pd(v);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    // Process 2 doubles at a time
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_sqrt_pd(v);
+        _mm_storeu_pd(&output[i], r);
+    }
+#endif
+
+    // Scalar remainder
+    for (; i < count; i++)
+        output[i] = std::sqrt(input[i]);
+}
+
+// ---------------------
+// Absolute Value: abs(x) -- writes |input[i]| to output[i] for i in [0, count)
+// ---------------------
+inline void abs_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    // Mask with ONLY the sign bit set in each lane (the bit pattern of -0.0)
+    __m256d sign_mask = _mm256_set1_pd(-0.0);
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_andnot_pd(sign_mask, v);  // (~sign_mask) & v: clear sign bit, keep all other bits
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d sign_mask = _mm_set1_pd(-0.0);  // only the sign bit set, as in the AVX2 branch
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_andnot_pd(sign_mask, v);  // clear sign bit
+        _mm_storeu_pd(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)  // scalar remainder (or the whole array when no SIMD macro is defined)
+        output[i] = std::fabs(input[i]);
+}
+
+// ---------------------
+// Floor: floor(x)
+// ---------------------
+inline void floor_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_floor_pd(v);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_floor_pd(v);
+        _mm_storeu_pd(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::floor(input[i]);
+}
+
+// ---------------------
+// Ceil: ceil(x)
+// ---------------------
+inline void ceil_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_ceil_pd(v);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_ceil_pd(v);
+        _mm_storeu_pd(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::ceil(input[i]);
+}
+
+// ---------------------
+// Truncate: trunc(x)
+// ---------------------
+inline void trunc_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d r = _mm256_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        _mm256_storeu_pd(&output[i], r);
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+        _mm_storeu_pd(&output[i], r);
+    }
+#endif
+
+    for (; i < count; i++)
+        output[i] = std::trunc(input[i]);
+}
+
+// ---------------------
+// Round: round(x) -- halfway cases round away from zero, exactly as std::round() does in the scalar tail below
+// ---------------------
+inline void round_float64(const double *input, double *output, int64_t count)
+{
+    int64_t i = 0;  // the SIMD lanes compute trunc(v + copysign(h, v)), where h = 0.49999999999999994 is the largest double below 0.5
+
+#if defined(EIDOS_HAS_AVX2)
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        __m256d bias = _mm256_or_pd(_mm256_and_pd(v, _mm256_set1_pd(-0.0)), _mm256_set1_pd(0.49999999999999994));  // copysign(h, v)
+        _mm256_storeu_pd(&output[i], _mm256_round_pd(_mm256_add_pd(v, bias), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));  // not TO_NEAREST_INT: that rounds ties to even
+    }
+#elif defined(EIDOS_HAS_SSE42)
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        __m128d bias = _mm_or_pd(_mm_and_pd(v, _mm_set1_pd(-0.0)), _mm_set1_pd(0.49999999999999994));  // copysign(h, v)
+        _mm_storeu_pd(&output[i], _mm_round_pd(_mm_add_pd(v, bias), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));  // not TO_NEAREST_INT: that rounds ties to even
+    }
+#endif
+
+    for (; i < count; i++)  // scalar remainder (or the whole array when no SIMD macro is defined)
+        output[i] = std::round(input[i]);
+}
+
+// ---------------------
+// Exponential: exp(x)
+// ---------------------
+// Note: There's no hardware exp instruction, so this is a plain scalar loop over std::exp() that keeps
+// the same (input, output, count) signature as the functions above. For true SIMD exp, we'd need a vectorized math library.
+inline void exp_float64(const double *input, double *output, int64_t count)
+{
+    for (int64_t i = 0; i < count; i++)
+        output[i] = std::exp(input[i]);
+}
+
+// ---------------------
+// Natural Log: log(x)
+// ---------------------
+inline void log_float64(const double *input, double *output, int64_t count)
+{
+    for (int64_t i = 0; i < count; i++)
+        output[i] = std::log(input[i]);
+}
+
+// ---------------------
+// Log base 10: log10(x)
+// ---------------------
+inline void log10_float64(const double *input, double *output, int64_t count)
+{
+    for (int64_t i = 0; i < count; i++)
+        output[i] = std::log10(input[i]);
+}
+
+// ---------------------
+// Log base 2: log2(x)
+// ---------------------
+inline void log2_float64(const double *input, double *output, int64_t count)
+{
+    for (int64_t i = 0; i < count; i++)
+        output[i] = std::log2(input[i]);
+}
+
+// ================================
+// Reductions
+// ================================
+
+// ---------------------
+// Sum: sum(x) -- returns input[0] + ... + input[count-1]; yields 0.0 when count <= 0
+// ---------------------
+inline double sum_float64(const double *input, int64_t count)
+{
+    double sum = 0.0;
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256d vsum = _mm256_setzero_pd();  // four independent partial sums, one per lane
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        vsum = _mm256_add_pd(vsum, v);
+    }
+    // Horizontal sum of 4 doubles (additions happen in a different order than a left-to-right scalar loop, so roundoff can differ)
+    __m128d vlow  = _mm256_castpd256_pd128(vsum);
+    __m128d vhigh = _mm256_extractf128_pd(vsum, 1);
+    vlow  = _mm_add_pd(vlow, vhigh);     // 2 doubles
+    __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1);
+    vlow = _mm_add_sd(vlow, shuf);       // 1 double
+    sum = _mm_cvtsd_f64(vlow);
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d vsum = _mm_setzero_pd();  // two independent partial sums, one per lane
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        vsum = _mm_add_pd(vsum, v);
+    }
+    // Horizontal sum of 2 doubles
+    __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1);
+    vsum = _mm_add_sd(vsum, shuf);
+    sum = _mm_cvtsd_f64(vsum);
+#endif
+
+    // Scalar remainder, added on top of the combined lane sums
+    for (; i < count; i++)
+        sum += input[i];
+
+    return sum;
+}
+
+// ---------------------
+// Product: product(x) -- returns input[0] * ... * input[count-1]; yields 1.0 when count <= 0
+// ---------------------
+inline double product_float64(const double *input, int64_t count)
+{
+    double prod = 1.0;
+    int64_t i = 0;
+
+#if defined(EIDOS_HAS_AVX2)
+    __m256d vprod = _mm256_set1_pd(1.0);  // four independent partial products, one per lane
+    for (; i + 4 <= count; i += 4)
+    {
+        __m256d v = _mm256_loadu_pd(&input[i]);
+        vprod = _mm256_mul_pd(vprod, v);
+    }
+    // Horizontal product of 4 doubles (multiplications happen in a different order than a left-to-right scalar loop, so roundoff can differ)
+    __m128d vlow  = _mm256_castpd256_pd128(vprod);
+    __m128d vhigh = _mm256_extractf128_pd(vprod, 1);
+    vlow  = _mm_mul_pd(vlow, vhigh);
+    __m128d shuf = _mm_shuffle_pd(vlow, vlow, 1);
+    vlow = _mm_mul_sd(vlow, shuf);
+    prod = _mm_cvtsd_f64(vlow);
+#elif defined(EIDOS_HAS_SSE42)
+    __m128d vprod = _mm_set1_pd(1.0);  // two independent partial products, one per lane
+    for (; i + 2 <= count; i += 2)
+    {
+        __m128d v = _mm_loadu_pd(&input[i]);
+        vprod = _mm_mul_pd(vprod, v);
+    }
+    __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1);  // horizontal product of 2 doubles
+    vprod = _mm_mul_sd(vprod, shuf);
+    prod = _mm_cvtsd_f64(vprod);
+#endif
+
+    for (; i < count; i++)  // scalar remainder, multiplied onto the combined lane products
+        prod *= input[i];
+
+    return prod;
+}
+
+} // namespace Eidos_SIMD
+
+#endif /* eidos_simd_h */
diff --git a/eidos/eidos_test_builtins.h b/eidos/eidos_test_builtins.h
index 4b31cfaf..2611199d 100644
--- a/eidos/eidos_test_builtins.h
+++ b/eidos/eidos_test_builtins.h
@@ -110,7 +110,8 @@ for (iter in 1:100)
 	x = rnorm(10);		// float
 	xbuiltin = cumProduct(x);
 	xuserdef = cumProduct_func(x);
-	if (!identical(xbuiltin, xuserdef)) stop('Mismatch in test of cumProduct(f)');
+	// tolerance because product() can get a little roundoff error due to SIMD
+	if (!all(abs(xbuiltin - xuserdef) < abs(xuserdef) * 1e-10 + 1e-15)) stop('Mismatch in test of cumProduct(f)');
 }
 
 // ***********************************************************************************************
@@ -418,7 +419,8 @@ for (iter in 1:100)
 	x = rnorm(10);		// float
 	xbuiltin = product(x);
 	xuserdef = product_func(x);
-	if (!identical(xbuiltin, xuserdef)) stop('Mismatch in test of product(f)');
+	// tolerance because product() can get a little roundoff error due to SIMD
+	if (abs(xbuiltin - xuserdef) > abs(xuserdef) * 1e-10 + 1e-15) stop('Mismatch in test of product(f)');
 }
 
 // ***********************************************************************************************
diff --git a/eidos/eidos_test_functions_other.cpp b/eidos/eidos_test_functions_other.cpp
index 81a31f9f..3d3844bd 100644
--- a/eidos/eidos_test_functions_other.cpp
+++ b/eidos/eidos_test_functions_other.cpp
@@ -350,7 +350,7 @@ void _RunFunctionMatrixArrayTests(void)
 	EidosAssertScriptSuccess_I("tr(matrix(1:9, ncol=3));", 1 + 5 + 9);
 	EidosAssertScriptSuccess_F("tr(matrix(1.0:9, ncol=3));", 1 + 5 + 9);
 	EidosAssertScriptSuccess_F("tr(matrix(c(1.25, -7.8, 3.4, 6.1, 4.75, 8.2, -0.3, 8.6, -1.5), ncol=3));", 1.25 + 4.75 + -1.5);
-	EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); identical(tr(x), sum(diag(x)));", true);
+	EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); abs(tr(x) - sum(diag(x))) < 1e-10;", true);  // tolerance for SIMD
 	EidosAssertScriptSuccess_L("x = matrix(rdunif(100, -1000, 1000), ncol=10); identical(tr(x), sum(diag(x)));", true);
 	
 	// upperTri()
@@ -377,15 +377,15 @@ void _RunFunctionMatrixArrayTests(void)
 	EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(rowSums(y), c(22.0, 26, 30));", true);
 	EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
-	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
-	
+	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(rowSums(y) - apply(y, 0, 'sum(applyValue);')) < 1e-10);", true);  // tolerance for SIMD
+
 	// colSums()
 	EidosAssertScriptSuccess_L("x = c(T,T,F,F,T,F,F,T,T,F,F,T); y = matrix(x, nrow=3); identical(colSums(y), c(2, 1, 2, 1));", true);
 	EidosAssertScriptSuccess_L("x = 1:12; y = matrix(x, nrow=3); identical(colSums(y), c(6, 15, 24, 33));", true);
 	EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(colSums(y), c(6.0, 15, 24, 33));", true);
 	EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true);
-	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true);
+	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(colSums(y) - apply(y, 1, 'sum(applyValue);')) < 1e-10);", true);  // tolerance for SIMD
 }
 
 #pragma mark filesystem access

From 05a1d8a319cda00e2bbbb9634d3e50957af83c2f Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Wed, 26 Nov 2025 13:17:51 -0800
Subject: [PATCH 2/7] add SIMD benchmark scripts

Includes Eidos math function benchmark, SLiM simulation benchmark,
and a runner script that builds both SIMD and scalar versions and
compares performance.
---
 simd_benchmarks/run_benchmarks.sh    | 103 +++++++++++++++++++++++++++
 simd_benchmarks/simd_benchmark.eidos |  65 +++++++++++++++++
 simd_benchmarks/slim_benchmark.slim  |  34 +++++++++
 3 files changed, 202 insertions(+)
 create mode 100755 simd_benchmarks/run_benchmarks.sh
 create mode 100644 simd_benchmarks/simd_benchmark.eidos
 create mode 100644 simd_benchmarks/slim_benchmark.slim

diff --git a/simd_benchmarks/run_benchmarks.sh b/simd_benchmarks/run_benchmarks.sh
new file mode 100755
index 00000000..05c0782f
--- /dev/null
+++ b/simd_benchmarks/run_benchmarks.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+# SIMD Benchmark Runner
+# Builds SLiM with and without SIMD, runs benchmarks, compares results
+
+set -e
+
+SLIM_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+BENCHMARK_DIR="$SLIM_ROOT/simd_benchmarks"
+BUILD_SIMD="$SLIM_ROOT/build_simd"
+BUILD_SCALAR="$SLIM_ROOT/build_scalar"
+
+NUM_RUNS=${1:-3}  # Default to 3 runs, or use first argument
+
+echo "============================================"
+echo "SIMD Benchmark Runner"
+echo "============================================"
+echo "SLiM root: $SLIM_ROOT"
+echo "Runs per benchmark: $NUM_RUNS"
+echo ""
+
+# Build with SIMD
+echo "Building with SIMD enabled..."
+rm -rf "$BUILD_SIMD"
+mkdir -p "$BUILD_SIMD"
+cd "$BUILD_SIMD"
+cmake -DUSE_SIMD=ON -DCMAKE_BUILD_TYPE=Release .. > /dev/null
+make -j10 > /dev/null 2>&1
+echo "  Done."
+
+# Build without SIMD
+echo "Building with SIMD disabled..."
+rm -rf "$BUILD_SCALAR"
+mkdir -p "$BUILD_SCALAR"
+cd "$BUILD_SCALAR"
+cmake -DUSE_SIMD=OFF -DCMAKE_BUILD_TYPE=Release .. > /dev/null
+make -j10 > /dev/null 2>&1
+echo "  Done."
+echo ""
+
+# Function to run eidos benchmark and extract times
+run_eidos_benchmark() {
+    local binary="$1"
+    local label="$2"
+
+    echo "  Running Eidos benchmark ($label)..."
+    "$binary" "$BENCHMARK_DIR/simd_benchmark.eidos" 2>/dev/null | grep -E "^\w+\(\):" | while read line; do
+        echo "    $line"
+    done
+}
+
+# Run slim_benchmark.slim with binary $1, $2 times over, and echo the mean of the reported "Elapsed time" values (3 decimals)
+run_slim_benchmark() {
+    local binary="$1"
+    local runs="$2"
+    local total=0
+
+    for ((i=1; i<=runs; i++)); do
+        time=$("$binary" "$BENCHMARK_DIR/slim_benchmark.slim" 2>/dev/null | grep "Elapsed time" | grep -oE '[0-9]+\.[0-9]+')  # number printed by the script's "Elapsed time:" line
+        total=$(echo "$total + $time" | bc)
+    done
+
+    avg=$(echo "scale=3; $total / $runs" | bc)
+    echo "$avg"  # the only stdout of this function; callers capture it with $(...)
+}
+
+echo "============================================"
+echo "Eidos Math Function Benchmarks"
+echo "============================================"
+echo ""
+
+echo "SIMD Build:"
+run_eidos_benchmark "$BUILD_SIMD/eidos" "SIMD"
+echo ""
+
+echo "Scalar Build:"
+run_eidos_benchmark "$BUILD_SCALAR/eidos" "Scalar"
+echo ""
+
+echo "============================================"
+echo "SLiM Simulation Benchmark"
+echo "(N=5000, 5000 generations, selection)"
+echo "============================================"
+echo ""
+
+echo "Running $NUM_RUNS iterations each..."
+echo ""
+
+simd_time=$(run_slim_benchmark "$BUILD_SIMD/slim" "$NUM_RUNS")
+echo "SIMD Build:   ${simd_time}s (avg)"
+
+scalar_time=$(run_slim_benchmark "$BUILD_SCALAR/slim" "$NUM_RUNS")
+echo "Scalar Build: ${scalar_time}s (avg)"
+
+if [ "$(echo "$scalar_time > 0" | bc)" -eq 1 ] && [ "$(echo "$simd_time > 0" | bc)" -eq 1 ]; then
+    speedup=$(echo "scale=2; $scalar_time / $simd_time" | bc)  # simd_time is the divisor, so it is checked for > 0 as well
+    echo ""
+    echo "Speedup: ${speedup}x"
+fi
+
+echo ""
+echo "============================================"
+echo "Benchmark complete"
+echo "============================================"
diff --git a/simd_benchmarks/simd_benchmark.eidos b/simd_benchmarks/simd_benchmark.eidos
new file mode 100644
index 00000000..c49f4407
--- /dev/null
+++ b/simd_benchmarks/simd_benchmark.eidos
@@ -0,0 +1,65 @@
+// SIMD Benchmark for Eidos math functions
+// Tests performance with large float arrays
+
+defineGlobal("ARRAY_SIZE", 1000000);
+defineGlobal("ITERATIONS", 100);
+
+catn("SIMD Benchmark");
+catn("Array size: " + ARRAY_SIZE);
+catn("Iterations: " + ITERATIONS);
+catn("----------------------------------------");
+
+// Generate test data
+x = runif(ARRAY_SIZE);
+
+// Benchmark sqrt
+t0 = clock();
+for (i in 1:ITERATIONS) { y = sqrt(x); }
+t1 = clock();
+catn("sqrt():    " + format("%.3f", (t1-t0)) + " sec");
+
+// Benchmark abs
+t0 = clock();
+for (i in 1:ITERATIONS) { y = abs(x - 0.5); }
+t1 = clock();
+catn("abs():     " + format("%.3f", (t1-t0)) + " sec");
+
+// Benchmark floor
+t0 = clock();
+for (i in 1:ITERATIONS) { y = floor(x * 100); }
+t1 = clock();
+catn("floor():   " + format("%.3f", (t1-t0)) + " sec");
+
+// Benchmark ceil
+t0 = clock();
+for (i in 1:ITERATIONS) { y = ceil(x * 100); }
+t1 = clock();
+catn("ceil():    " + format("%.3f", (t1-t0)) + " sec");
+
+// Benchmark round
+t0 = clock();
+for (i in 1:ITERATIONS) { y = round(x * 100); }
+t1 = clock();
+catn("round():   " + format("%.3f", (t1-t0)) + " sec");
+
+// Benchmark trunc
+t0 = clock();
+for (i in 1:ITERATIONS) { y = trunc(x * 100); }
+t1 = clock();
+catn("trunc():   " + format("%.3f", (t1-t0)) + " sec");
+
+// Benchmark sum
+t0 = clock();
+for (i in 1:ITERATIONS) { y = sum(x); }
+t1 = clock();
+catn("sum():     " + format("%.3f", (t1-t0)) + " sec");
+
+// Benchmark product (smaller array to avoid underflow)
+x_small = runif(1000, 0.99, 1.01);
+t0 = clock();
+for (i in 1:(ITERATIONS*100)) { y = product(x_small); }
+t1 = clock();
+catn("product(): " + format("%.3f", (t1-t0)) + " sec (1000 elements, 10000 iters)");
+
+catn("----------------------------------------");
+catn("Done.");
diff --git a/simd_benchmarks/slim_benchmark.slim b/simd_benchmarks/slim_benchmark.slim
new file mode 100644
index 00000000..bfd80697
--- /dev/null
+++ b/simd_benchmarks/slim_benchmark.slim
@@ -0,0 +1,34 @@
+// SLiM Benchmark: Simple simulation with recombination and selection
+// Tests whether SIMD optimizations provide any incidental speedup
+
+initialize() {
+	initializeMutationRate(1e-7);
+	initializeRecombinationRate(1e-8);
+
+	// Three mutation types, each with dominance coefficient 0.5
+	initializeMutationType("m1", 0.5, "f", 0.0);         // neutral: fixed selection coefficient of 0
+	initializeMutationType("m2", 0.5, "g", -0.03, 0.2);  // deleterious: gamma DFE (mean -0.03, shape 0.2)
+	initializeMutationType("m3", 0.5, "e", 0.1);         // beneficial: exponential DFE (mean 0.1)
+
+	initializeGenomicElementType("g1", c(m1, m2, m3), c(0.7, 0.25, 0.05));  // m1:m2:m3 proportions
+	initializeGenomicElement(g1, 0, 999999);  // positions 0..999999, i.e. a 1 Mb chromosome
+}
+
+1 early() {
+	sim.addSubpop("p1", 5000);
+	catn("Starting simulation: N=5000, 1Mb chromosome, 5000 generations");  // matches the final 5000 late() event
+	catn("Mutation rate: 1e-7, Recombination rate: 1e-8");
+	defineGlobal("start_time", clock());  // timing baseline, read back by the final event
+}
+
+5000 late() {
+	// Time elapsed since start_time was recorded by the 1 early() event
+	runtime = clock() - start_time;
+
+	catn("\n----------------------------------------");
+	catn("Simulation complete");
+	catn("Elapsed time: " + format("%.2f", runtime) + " seconds");
+	catn("Final mutation count: " + size(sim.mutations));
+	catn("Population size: " + p1.individualCount);
+	catn("----------------------------------------");
+}

From 1d912dc5a6010df98b2f93e3a7eacda7e53d4cb3 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Wed, 10 Dec 2025 17:57:47 -0800
Subject: [PATCH 3/7] added credit; added readme to bench; tried to clean
 whitespace

---
 eidos/eidos_functions_math.cpp       | 52 ++++++++---------
 eidos/eidos_simd.h                   |  2 +-
 eidos/eidos_test_functions_other.cpp |  2 +-
 simd_benchmarks/README.md            | 87 ++++++++++++++++++++++++++++
 4 files changed, 115 insertions(+), 28 deletions(-)
 create mode 100644 simd_benchmarks/README.md

diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp
index 2a97dedb..7a67160f 100644
--- a/eidos/eidos_functions_math.cpp
+++ b/eidos/eidos_functions_math.cpp
@@ -88,7 +88,7 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
 		EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
-
+		
 #ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
@@ -98,9 +98,9 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
 		Eidos_SIMD::abs_float64(float_data, float_result_data, x_count);
 #endif
 	}
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -195,14 +195,14 @@ EidosValue_SP Eidos_ExecuteFunction_atan2(const std::vector<EidosValue_SP> &p_ar
 EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
@@ -211,9 +211,9 @@ EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arg
 #else
 	Eidos_SIMD::ceil_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -368,14 +368,14 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector<EidosValue_SP> &p_argu
 EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
@@ -384,9 +384,9 @@ EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_ar
 #else
 	Eidos_SIMD::floor_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -802,10 +802,10 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector<EidosValue_SP> &p_
 	{
 		const double *float_data = x_value->FloatData();
 		double product = Eidos_SIMD::product_float64(float_data, x_count);
-
+		
 		result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(product));
 	}
-
+	
 	return result_SP;
 }
 
@@ -813,14 +813,14 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector<EidosValue_SP> &p_
 EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
@@ -829,9 +829,9 @@ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_ar
 #else
 	Eidos_SIMD::round_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -2440,7 +2440,7 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
 		EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
-
+		
 #ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
@@ -2450,9 +2450,9 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
 		Eidos_SIMD::sqrt_float64(float_data, float_result_data, x_count);
 #endif
 	}
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
@@ -2514,12 +2514,12 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
 			// case across multiple threads seems excessively complex; instead we look for an overflow afterwards
 			const int64_t *int_data = x_value->IntData();
 			double sum_d = 0;
-
+			
 			EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_INTEGER);
 #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(int_data) reduction(+: sum_d) if(parallel:x_count >= EIDOS_OMPMIN_SUM_INTEGER) num_threads(thread_count)
 			for (int value_index = 0; value_index < x_count; ++value_index)
 				sum_d += int_data[value_index];
-
+				
 			// 2^53 is the largest integer such that it and all smaller integers can be represented in double losslessly
 			int64_t sum = (int64_t)sum_d;
 			bool fits_in_integer = (((double)sum == sum_d) && (sum < 9007199254740992L) && (sum > -9007199254740992L));
@@ -2609,14 +2609,14 @@ EidosValue_SP Eidos_ExecuteFunction_tan(const std::vector<EidosValue_SP> &p_argu
 EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_arguments, __attribute__((unused)) EidosInterpreter &p_interpreter)
 {
 	EidosValue_SP result_SP(nullptr);
-
+	
 	EidosValue *x_value = p_arguments[0].get();
 	int x_count = x_value->Count();
 	const double *float_data = x_value->FloatData();
 	EidosValue_Float *float_result = (new (gEidosValuePool->AllocateChunk()) EidosValue_Float())->resize_no_initialize(x_count);
 	double *float_result_data = float_result->data_mutable();
 	result_SP = EidosValue_SP(float_result);
-
+	
 #ifdef _OPENMP
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
@@ -2625,9 +2625,9 @@ EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_ar
 #else
 	Eidos_SIMD::trunc_float64(float_data, float_result_data, x_count);
 #endif
-
+	
 	result_SP->CopyDimensionsFromValue(x_value);
-
+	
 	return result_SP;
 }
 
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
index 63efbb1f..c14bdf37 100644
--- a/eidos/eidos_simd.h
+++ b/eidos/eidos_simd.h
@@ -2,7 +2,7 @@
 //  eidos_simd.h
 //  Eidos
 //
-//  Created by Ben Haller on 11/26/2024.
+//  Created by Andrew Kern on 11/26/2025.
 //  Copyright (c) 2024-2025 Philipp Messer.  All rights reserved.
 //	A product of the Messer Lab, http://messerlab.org/slim/
 //
diff --git a/eidos/eidos_test_functions_other.cpp b/eidos/eidos_test_functions_other.cpp
index 3d3844bd..ba882bf8 100644
--- a/eidos/eidos_test_functions_other.cpp
+++ b/eidos/eidos_test_functions_other.cpp
@@ -378,7 +378,7 @@ void _RunFunctionMatrixArrayTests(void)
 	EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(rowSums(y) - apply(y, 0, 'sum(applyValue);')) < 1e-10);", true);  // tolerance for SIMD
-
+	
 	// colSums()
 	EidosAssertScriptSuccess_L("x = c(T,T,F,F,T,F,F,T,T,F,F,T); y = matrix(x, nrow=3); identical(colSums(y), c(2, 1, 2, 1));", true);
 	EidosAssertScriptSuccess_L("x = 1:12; y = matrix(x, nrow=3); identical(colSums(y), c(6, 15, 24, 33));", true);
diff --git a/simd_benchmarks/README.md b/simd_benchmarks/README.md
new file mode 100644
index 00000000..1afb6ebf
--- /dev/null
+++ b/simd_benchmarks/README.md
@@ -0,0 +1,87 @@
+# SIMD Benchmarks
+
+This directory contains benchmark scripts used during the development of SIMD optimizations for SLiM. These files are provided for internal development use and are **not used in the build of SLiM**.
+
+## Contents
+
+- **`run_benchmarks.sh`** - Shell script that builds SLiM with and without SIMD, runs both benchmark scripts, and reports speedup comparisons.
+
+- **`simd_benchmark.eidos`** - Eidos script that benchmarks SIMD-optimized math functions (`sqrt`, `abs`, `floor`, `ceil`, `round`, `trunc`, `sum`, `product`) on large arrays.
+
+- **`slim_benchmark.slim`** - SLiM simulation benchmark (N=5000, 1Mb chromosome, 5000 generations with selection) for measuring overall simulation performance.
+
+## Author
+
+These benchmarks were developed by Andrew Kern as part of SIMD optimization work for SLiM.
+
+## Usage
+
+These files are not part of the SLiM build system. To run the benchmarks:
+
+```bash
+cd simd_benchmarks
+./run_benchmarks.sh [num_runs]
+```
+
+This will build both SIMD-enabled and scalar versions of SLiM, run the benchmarks, and report the speedup.
+
+## Results
+
+Benchmark results look like the following (example output):
+
+```
+$ simd_benchmarks/run_benchmarks.sh 
+============================================
+SIMD Benchmark Runner
+============================================
+SLiM root: /home/adkern/SLiM
+Runs per benchmark: 3
+
+Building with SIMD enabled...
+  Done.
+Building with SIMD disabled...
+  Done.
+
+============================================
+Eidos Math Function Benchmarks
+============================================
+
+SIMD Build:
+  Running Eidos benchmark (SIMD)...
+    sqrt():    0.105 sec
+    abs():     0.171 sec
+    floor():   0.164 sec
+    ceil():    0.166 sec
+    round():   0.164 sec
+    trunc():   0.165 sec
+    sum():     0.032 sec
+    product(): 0.003 sec (1000 elements, 10000 iters)
+
+Scalar Build:
+  Running Eidos benchmark (Scalar)...
+    sqrt():    0.108 sec
+    abs():     0.166 sec
+    floor():   0.231 sec
+    ceil():    0.246 sec
+    round():   0.473 sec
+    trunc():   0.246 sec
+    sum():     0.166 sec
+    product(): 0.017 sec (1000 elements, 10000 iters)
+
+============================================
+SLiM Simulation Benchmark
+(N=5000, 5000 generations, selection)
+============================================
+
+Running 3 iterations each...
+
+SIMD Build:   12.756s (avg)
+Scalar Build: 12.316s (avg)
+
+Speedup: .96x
+
+============================================
+Benchmark complete
+============================================
+```
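+The takeaway is that SIMD provided significant speedups for most Eidos math functions (`floor`, `ceil`, `round`, `trunc`, `sum`, `product`), while `sqrt` and `abs` were essentially unchanged and the overall SLiM simulation showed no measurable speedup (0.96x) in this specific benchmark scenario.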
\ No newline at end of file

From f6f2412f73747605a7403e9df054a610bca6997a Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Wed, 10 Dec 2025 18:04:58 -0800
Subject: [PATCH 4/7] moar whitespace woes

---
 eidos/eidos_functions_math.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp
index 7a67160f..ebc741b0 100644
--- a/eidos/eidos_functions_math.cpp
+++ b/eidos/eidos_functions_math.cpp
@@ -2514,12 +2514,12 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
 			// case across multiple threads seems excessively complex; instead we look for an overflow afterwards
 			const int64_t *int_data = x_value->IntData();
 			double sum_d = 0;
-			
+
 			EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_INTEGER);
 #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(int_data) reduction(+: sum_d) if(parallel:x_count >= EIDOS_OMPMIN_SUM_INTEGER) num_threads(thread_count)
 			for (int value_index = 0; value_index < x_count; ++value_index)
 				sum_d += int_data[value_index];
-				
+
 			// 2^53 is the largest integer such that it and all smaller integers can be represented in double losslessly
 			int64_t sum = (int64_t)sum_d;
 			bool fits_in_integer = (((double)sum == sum_d) && (sum < 9007199254740992L) && (sum > -9007199254740992L));
@@ -2535,7 +2535,7 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
 	{
 		const double *float_data = x_value->FloatData();
 		double sum = 0;
-
+		
 #ifdef _OPENMP
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
@@ -2544,7 +2544,7 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
 #else
 		sum = Eidos_SIMD::sum_float64(float_data, x_count);
 #endif
-
+		
 		result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(sum));
 	}
 	else if (x_type == EidosValueType::kValueLogical)

From 08e7aa7c7f72621777d7c87b618fd0b271be505e Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Wed, 10 Dec 2025 18:53:02 -0800
Subject: [PATCH 5/7] NEON SIMD for ARM

---
 CMakeLists.txt     | 22 +++++++++++++--
 eidos/eidos_simd.h | 70 +++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b85c4742..c8080018 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -291,9 +291,19 @@ endif()
 #
 
 # Option to disable SIMD entirely
-option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2)" ON)
+option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON)
+
+# Classify the target CPU: CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and Linux x86_64, "arm64"/"aarch64" on ARM.
+# NOTE(review): IS_X86_64 is also set for 32-bit i686/i386, and CMAKE_OSX_ARCHITECTURES is not consulted here -- confirm intended.
+set(IS_X86_64 FALSE)
+set(IS_ARM64 FALSE)
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386")
+    set(IS_X86_64 TRUE)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64")
+    set(IS_ARM64 TRUE)
+endif()
 
-if(USE_SIMD AND NOT WIN32)
+if(USE_SIMD AND NOT WIN32 AND IS_X86_64)
     include(CheckCXXCompilerFlag)
 
     # Check for AVX2 support
@@ -315,8 +325,14 @@ if(USE_SIMD AND NOT WIN32)
         add_compile_definitions(EIDOS_HAS_SSE42=1)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
     else()
-        message(STATUS "SIMD: No SIMD support detected, using scalar fallback")
+        message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback")
     endif()
+elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64)
+    # ARM64 NEON is always available on ARM64, no compiler flag needed
+    message(STATUS "SIMD: ARM64 NEON support enabled")
+    add_compile_definitions(EIDOS_HAS_NEON=1)
+elseif(USE_SIMD AND NOT WIN32)
+    message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback")
 elseif(USE_SIMD AND WIN32)
     # Windows/MSVC detection not yet implemented
     message(STATUS "SIMD: Windows SIMD detection not yet implemented, using scalar fallback")
diff --git a/eidos/eidos_simd.h b/eidos/eidos_simd.h
index c14bdf37..d344db89 100644
--- a/eidos/eidos_simd.h
+++ b/eidos/eidos_simd.h
@@ -22,7 +22,10 @@
  SIMD acceleration for Eidos math operations, independent of OpenMP.
 
  This header provides vectorized implementations of common math operations
- using SSE4.2 or AVX2 intrinsics when available, with scalar fallbacks.
+ using platform-specific SIMD intrinsics when available:
+   - x86_64: SSE4.2 or AVX2 via <immintrin.h>
+   - ARM64: NEON via <arm_neon.h>
+ Falls back to scalar code when no SIMD is available.
 
  */
 
@@ -42,6 +45,10 @@
     #include <smmintrin.h>
     #define EIDOS_SIMD_WIDTH 2          // 2 doubles per SSE register
     #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per SSE register
+#elif defined(EIDOS_HAS_NEON)
+    #include <arm_neon.h>
+    #define EIDOS_SIMD_WIDTH 2          // 2 doubles per NEON register
+    #define EIDOS_SIMD_FLOAT_WIDTH 4    // 4 floats per NEON register
 #else
     #define EIDOS_SIMD_WIDTH 1          // Scalar fallback
     #define EIDOS_SIMD_FLOAT_WIDTH 1
@@ -78,6 +85,14 @@ inline void sqrt_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_sqrt_pd(v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    // Process 2 doubles at a time
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vsqrtq_f64(v);
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     // Scalar remainder
@@ -109,6 +124,13 @@ inline void abs_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_andnot_pd(sign_mask, v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vabsq_f64(v);
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -136,6 +158,13 @@ inline void floor_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_floor_pd(v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndmq_f64(v);  // Round toward minus infinity (floor)
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -163,6 +192,13 @@ inline void ceil_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_ceil_pd(v);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndpq_f64(v);  // Round toward plus infinity (ceil)
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -190,6 +226,13 @@ inline void trunc_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_round_pd(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndq_f64(v);  // Round toward zero (truncate)
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -217,6 +260,13 @@ inline void round_float64(const double *input, double *output, int64_t count)
         __m128d r = _mm_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
         _mm_storeu_pd(&output[i], r);
     }
+#elif defined(EIDOS_HAS_NEON)
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        float64x2_t r = vrndaq_f64(v);  // Round to nearest, ties away from zero. NOTE(review): the SSE branch above uses _MM_FROUND_TO_NEAREST_INT (ties to even) -- confirm platforms agree on x.5 inputs
+        vst1q_f64(&output[i], r);
+    }
 #endif
 
     for (; i < count; i++)
@@ -298,6 +348,15 @@ inline double sum_float64(const double *input, int64_t count)
     __m128d shuf = _mm_shuffle_pd(vsum, vsum, 1);
     vsum = _mm_add_sd(vsum, shuf);
     sum = _mm_cvtsd_f64(vsum);
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t vsum = vdupq_n_f64(0.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        vsum = vaddq_f64(vsum, v);
+    }
+    // Horizontal sum of 2 doubles
+    sum = vaddvq_f64(vsum);
 #endif
 
     // Scalar remainder
@@ -339,6 +398,15 @@ inline double product_float64(const double *input, int64_t count)
     __m128d shuf = _mm_shuffle_pd(vprod, vprod, 1);
     vprod = _mm_mul_sd(vprod, shuf);
     prod = _mm_cvtsd_f64(vprod);
+#elif defined(EIDOS_HAS_NEON)
+    float64x2_t vprod = vdupq_n_f64(1.0);
+    for (; i + 2 <= count; i += 2)
+    {
+        float64x2_t v = vld1q_f64(&input[i]);
+        vprod = vmulq_f64(vprod, v);
+    }
+    // Horizontal product of 2 doubles
+    prod = vgetq_lane_f64(vprod, 0) * vgetq_lane_f64(vprod, 1);
 #endif
 
     for (; i < count; i++)

From c98008d950a31f62a7046ab13c5af4413fd03f8d Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Wed, 10 Dec 2025 22:16:28 -0800
Subject: [PATCH 6/7] swap in new allClose function

---
 eidos/eidos_test_builtins.h          | 10 +++++-----
 eidos/eidos_test_functions_other.cpp |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/eidos/eidos_test_builtins.h b/eidos/eidos_test_builtins.h
index 2611199d..8707e2f3 100644
--- a/eidos/eidos_test_builtins.h
+++ b/eidos/eidos_test_builtins.h
@@ -111,7 +111,7 @@ for (iter in 1:100)
 	xbuiltin = cumProduct(x);
 	xuserdef = cumProduct_func(x);
 	// tolerance because product() can get a little roundoff error due to SIMD
-	if (!all(abs(xbuiltin - xuserdef) < abs(xuserdef) * 1e-10 + 1e-15)) stop('Mismatch in test of cumProduct(f)');
+	if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of cumProduct(f)');
 }
 
 // ***********************************************************************************************
@@ -136,7 +136,7 @@ for (iter in 1:100)
 	xbuiltin = cumSum(x);
 	xuserdef = cumSum_func(x);
 	// tolerance because sum() can get a little roundoff error due to SIMD
-	if (!all(abs(xbuiltin - xuserdef) < 1e-10)) stop('Mismatch in test of cumSum(f)');
+	if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of cumSum(f)');
 }
 
 // ***********************************************************************************************
@@ -272,7 +272,7 @@ for (iter in 1:100)
 	xbuiltin = mean(x);
 	xuserdef = mean_func(x);
 	// tolerance because sum() can get a little roundoff error due to SIMD
-	if (!all(abs(xbuiltin - xuserdef) < 1e-10)) stop('Mismatch in test of mean(f)');
+	if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of mean(f)');
 }
 
 // ***********************************************************************************************
@@ -420,7 +420,7 @@ for (iter in 1:100)
 	xbuiltin = product(x);
 	xuserdef = product_func(x);
 	// tolerance because product() can get a little roundoff error due to SIMD
-	if (abs(xbuiltin - xuserdef) > abs(xuserdef) * 1e-10 + 1e-15) stop('Mismatch in test of product(f)');
+	if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of product(f)');
 }
 
 // ***********************************************************************************************
@@ -795,7 +795,7 @@ for (iter in 1:100)
 	xbuiltin = sum(x);
 	xuserdef = sum_func(x);
 	// tolerance because sum() can get a little roundoff error due to SIMD
-	if (!all(abs(xbuiltin - xuserdef) < 1e-10)) stop('Mismatch in test of sum(f)');
+	if (!allClose(xbuiltin, xuserdef)) stop('Mismatch in test of sum(f)');
 }
 
 // ***********************************************************************************************
diff --git a/eidos/eidos_test_functions_other.cpp b/eidos/eidos_test_functions_other.cpp
index ba882bf8..87779bf1 100644
--- a/eidos/eidos_test_functions_other.cpp
+++ b/eidos/eidos_test_functions_other.cpp
@@ -350,7 +350,7 @@ void _RunFunctionMatrixArrayTests(void)
 	EidosAssertScriptSuccess_I("tr(matrix(1:9, ncol=3));", 1 + 5 + 9);
 	EidosAssertScriptSuccess_F("tr(matrix(1.0:9, ncol=3));", 1 + 5 + 9);
 	EidosAssertScriptSuccess_F("tr(matrix(c(1.25, -7.8, 3.4, 6.1, 4.75, 8.2, -0.3, 8.6, -1.5), ncol=3));", 1.25 + 4.75 + -1.5);
-	EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); abs(tr(x) - sum(diag(x))) < 1e-10;", true);  // tolerance for SIMD
+	EidosAssertScriptSuccess_L("x = matrix(runif(100), ncol=10); allClose(tr(x), sum(diag(x)));", true);  // tolerance for SIMD
 	EidosAssertScriptSuccess_L("x = matrix(rdunif(100, -1000, 1000), ncol=10); identical(tr(x), sum(diag(x)));", true);
 	
 	// upperTri()
@@ -377,7 +377,7 @@ void _RunFunctionMatrixArrayTests(void)
 	EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(rowSums(y), c(22.0, 26, 30));", true);
 	EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);
-	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(rowSums(y) - apply(y, 0, 'sum(applyValue);')) < 1e-10);", true);  // tolerance for SIMD
+	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); allClose(rowSums(y), apply(y, 0, 'sum(applyValue);'));", true);  // tolerance for SIMD
 	
 	// colSums()
 	EidosAssertScriptSuccess_L("x = c(T,T,F,F,T,F,F,T,T,F,F,T); y = matrix(x, nrow=3); identical(colSums(y), c(2, 1, 2, 1));", true);
@@ -385,7 +385,7 @@ void _RunFunctionMatrixArrayTests(void)
 	EidosAssertScriptSuccess_L("x = 1.0:12; y = matrix(x, nrow=3); identical(colSums(y), c(6.0, 15, 24, 33));", true);
 	EidosAssertScriptSuccess_L("x = (rbinom(100, 1, 0.4) == 1); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true);
 	EidosAssertScriptSuccess_L("x = rdunif(100, -1000, 1000); y = matrix(x, nrow=10); identical(colSums(y), apply(y, 1, 'sum(applyValue);'));", true);
-	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); all(abs(colSums(y) - apply(y, 1, 'sum(applyValue);')) < 1e-10);", true);  // tolerance for SIMD
+	EidosAssertScriptSuccess_L("x = runif(100); y = matrix(x, nrow=10); allClose(colSums(y), apply(y, 1, 'sum(applyValue);'));", true);  // tolerance for SIMD
 }
 
 #pragma mark filesystem access

From a9c364f50bf43ad4e16bb94488bb6935a0e47382 Mon Sep 17 00:00:00 2001
From: andrewkern <adkern@uoregon.edu>
Date: Fri, 12 Dec 2025 17:01:44 -0800
Subject: [PATCH 7/7] adding FIXME comments

---
 eidos/eidos_functions_math.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/eidos/eidos_functions_math.cpp b/eidos/eidos_functions_math.cpp
index ebc741b0..0733a357 100644
--- a/eidos/eidos_functions_math.cpp
+++ b/eidos/eidos_functions_math.cpp
@@ -90,6 +90,7 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
 		result_SP = EidosValue_SP(float_result);
 		
 #ifdef _OPENMP
+		// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
@@ -204,6 +205,7 @@ EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arg
 	result_SP = EidosValue_SP(float_result);
 	
 #ifdef _OPENMP
+	// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)
@@ -353,6 +355,7 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector<EidosValue_SP> &p_argu
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
 		
+		// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_EXP_FLOAT);
 #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_EXP_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
@@ -377,6 +380,7 @@ EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_ar
 	result_SP = EidosValue_SP(float_result);
 	
 #ifdef _OPENMP
+	// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)
@@ -674,6 +678,7 @@ EidosValue_SP Eidos_ExecuteFunction_log(const std::vector<EidosValue_SP> &p_argu
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
 		
+		// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG_FLOAT);
 #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
@@ -709,6 +714,7 @@ EidosValue_SP Eidos_ExecuteFunction_log10(const std::vector<EidosValue_SP> &p_ar
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
 		
+		// FIXME: refactor this OpenMP-parallel loop to go through the Eidos_SIMD code path, in chunks; see https://github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG10_FLOAT);
 #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG10_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
@@ -744,6 +750,7 @@ EidosValue_SP Eidos_ExecuteFunction_log2(const std::vector<EidosValue_SP> &p_arg
 		double *float_result_data = float_result->data_mutable();
 		result_SP = EidosValue_SP(float_result);
 		
+		// FIXME: refactor this OpenMP-parallel loop to go through the Eidos_SIMD code path, in chunks; see https://github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG2_FLOAT);
 #pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG2_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
@@ -822,6 +829,7 @@ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_ar
 	result_SP = EidosValue_SP(float_result);
 	
 #ifdef _OPENMP
+	// FIXME: refactor this OpenMP-parallel loop to go through the Eidos_SIMD code path, in chunks; see https://github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)
@@ -2442,6 +2450,7 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
 		result_SP = EidosValue_SP(float_result);
 		
 #ifdef _OPENMP
+		// FIXME: refactor this OpenMP-parallel loop to go through the Eidos_SIMD code path, in chunks; see https://github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
@@ -2537,6 +2546,7 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
 		double sum = 0;
 		
 #ifdef _OPENMP
+		// FIXME: refactor this OpenMP-parallel reduction to go through the Eidos_SIMD code path, in chunks; see https://github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 		EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT);
 		#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
 		for (int value_index = 0; value_index < x_count; ++value_index)
@@ -2618,6 +2628,7 @@ EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_ar
 	result_SP = EidosValue_SP(float_result);
 	
 #ifdef _OPENMP
+	// FIXME: refactor this OpenMP-parallel loop to go through the Eidos_SIMD code path, in chunks; see https://github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
 	EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC);
 	#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
 	for (int value_index = 0; value_index < x_count; ++value_index)