Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,60 @@ if(BUILD_LTO)
endif()
endif()

#
# SIMD SUPPORT (independent of OpenMP)
#

# Option to disable SIMD entirely
option(USE_SIMD "Enable SIMD optimizations (SSE4.2/AVX2 on x86_64, NEON on ARM64)" ON)

# Check architecture
# CMAKE_SYSTEM_PROCESSOR is "x86_64" on Intel Macs and Linux x86_64, "arm64"/"aarch64" on ARM
set(IS_X86_64 FALSE)
set(IS_ARM64 FALSE)
if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|AMD64|amd64|i686|i386")
set(IS_X86_64 TRUE)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64|ARM64")
set(IS_ARM64 TRUE)
endif()

if(USE_SIMD AND NOT WIN32 AND IS_X86_64)
include(CheckCXXCompilerFlag)

# Check for AVX2 support
check_cxx_compiler_flag("-mavx2" COMPILER_SUPPORTS_AVX2)
check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42)
check_cxx_compiler_flag("-mfma" COMPILER_SUPPORTS_FMA)

if(COMPILER_SUPPORTS_AVX2)
message(STATUS "SIMD: AVX2 support detected")
add_compile_definitions(EIDOS_HAS_AVX2=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2")
if(COMPILER_SUPPORTS_FMA)
message(STATUS "SIMD: FMA support detected")
add_compile_definitions(EIDOS_HAS_FMA=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfma")
endif()
elseif(COMPILER_SUPPORTS_SSE42)
message(STATUS "SIMD: SSE4.2 support detected (no AVX2)")
add_compile_definitions(EIDOS_HAS_SSE42=1)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse4.2")
else()
message(STATUS "SIMD: No x86 SIMD support detected, using scalar fallback")
endif()
elseif(USE_SIMD AND NOT WIN32 AND IS_ARM64)
# ARM64 NEON is always available on ARM64, no compiler flag needed
message(STATUS "SIMD: ARM64 NEON support enabled")
add_compile_definitions(EIDOS_HAS_NEON=1)
elseif(USE_SIMD AND NOT WIN32)
message(STATUS "SIMD: Unknown architecture (${CMAKE_SYSTEM_PROCESSOR}), using scalar fallback")
elseif(USE_SIMD AND WIN32)
# Windows/MSVC detection not yet implemented
message(STATUS "SIMD: Windows SIMD detection not yet implemented, using scalar fallback")
else()
message(STATUS "SIMD: Disabled by user")
endif()

# GSL - adding /usr/local/include so all targets that use GSL_INCLUDES get omp.h
set(TARGET_NAME_GSL gsl)
file(GLOB_RECURSE GSL_SOURCES ${PROJECT_SOURCE_DIR}/gsl/*.c ${PROJECT_SOURCE_DIR}/gsl/*/*.c)
Expand Down
59 changes: 48 additions & 11 deletions eidos/eidos_functions_math.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@


#include "eidos_functions.h"
#include "eidos_simd.h"

#include <utility>
#include <string>
Expand Down Expand Up @@ -88,10 +89,15 @@ EidosValue_SP Eidos_ExecuteFunction_abs(const std::vector<EidosValue_SP> &p_argu
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

#ifdef _OPENMP
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_ABS_FLOAT);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ABS_FLOAT) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
float_result_data[value_index] = fabs(float_data[value_index]);
#else
Eidos_SIMD::abs_float64(float_data, float_result_data, x_count);
#endif
}

result_SP->CopyDimensionsFromValue(x_value);
Expand Down Expand Up @@ -198,10 +204,15 @@ EidosValue_SP Eidos_ExecuteFunction_ceil(const std::vector<EidosValue_SP> &p_arg
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

#ifdef _OPENMP
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_CEIL);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_CEIL) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
float_result_data[value_index] = ceil(float_data[value_index]);
#else
Eidos_SIMD::ceil_float64(float_data, float_result_data, x_count);
#endif

result_SP->CopyDimensionsFromValue(x_value);

Expand Down Expand Up @@ -344,6 +355,7 @@ EidosValue_SP Eidos_ExecuteFunction_exp(const std::vector<EidosValue_SP> &p_argu
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_EXP_FLOAT);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_EXP_FLOAT) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
Expand All @@ -367,10 +379,15 @@ EidosValue_SP Eidos_ExecuteFunction_floor(const std::vector<EidosValue_SP> &p_ar
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

#ifdef _OPENMP
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_FLOOR);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_FLOOR) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
float_result_data[value_index] = floor(float_data[value_index]);
#else
Eidos_SIMD::floor_float64(float_data, float_result_data, x_count);
#endif

result_SP->CopyDimensionsFromValue(x_value);

Expand Down Expand Up @@ -661,6 +678,7 @@ EidosValue_SP Eidos_ExecuteFunction_log(const std::vector<EidosValue_SP> &p_argu
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG_FLOAT);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG_FLOAT) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
Expand Down Expand Up @@ -696,6 +714,7 @@ EidosValue_SP Eidos_ExecuteFunction_log10(const std::vector<EidosValue_SP> &p_ar
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG10_FLOAT);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG10_FLOAT) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
Expand Down Expand Up @@ -731,6 +750,7 @@ EidosValue_SP Eidos_ExecuteFunction_log2(const std::vector<EidosValue_SP> &p_arg
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_LOG2_FLOAT);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_LOG2_FLOAT) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
Expand Down Expand Up @@ -788,10 +808,7 @@ EidosValue_SP Eidos_ExecuteFunction_product(const std::vector<EidosValue_SP> &p_
else if (x_type == EidosValueType::kValueFloat)
{
const double *float_data = x_value->FloatData();
double product = 1;

for (int value_index = 0; value_index < x_count; ++value_index)
product *= float_data[value_index];
double product = Eidos_SIMD::product_float64(float_data, x_count);

result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(product));
}
Expand All @@ -811,10 +828,15 @@ EidosValue_SP Eidos_ExecuteFunction_round(const std::vector<EidosValue_SP> &p_ar
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

#ifdef _OPENMP
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_ROUND);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_ROUND) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
float_result_data[value_index] = round(float_data[value_index]);
#else
Eidos_SIMD::round_float64(float_data, float_result_data, x_count);
#endif

result_SP->CopyDimensionsFromValue(x_value);

Expand Down Expand Up @@ -2427,10 +2449,15 @@ EidosValue_SP Eidos_ExecuteFunction_sqrt(const std::vector<EidosValue_SP> &p_arg
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

#ifdef _OPENMP
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_SQRT_FLOAT);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_SQRT_FLOAT) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
float_result_data[value_index] = sqrt(float_data[value_index]);
#else
Eidos_SIMD::sqrt_float64(float_data, float_result_data, x_count);
#endif
}

result_SP->CopyDimensionsFromValue(x_value);
Expand Down Expand Up @@ -2518,10 +2545,15 @@ EidosValue_SP Eidos_ExecuteFunction_sum(const std::vector<EidosValue_SP> &p_argu
const double *float_data = x_value->FloatData();
double sum = 0;

#ifdef _OPENMP
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_SUM_FLOAT);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data) reduction(+: sum) if(parallel:x_count >= EIDOS_OMPMIN_SUM_FLOAT) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
sum += float_data[value_index];
#else
sum = Eidos_SIMD::sum_float64(float_data, x_count);
#endif

result_SP = EidosValue_SP(new (gEidosValuePool->AllocateChunk()) EidosValue_Float(sum));
}
Expand Down Expand Up @@ -2595,10 +2627,15 @@ EidosValue_SP Eidos_ExecuteFunction_trunc(const std::vector<EidosValue_SP> &p_ar
double *float_result_data = float_result->data_mutable();
result_SP = EidosValue_SP(float_result);

#ifdef _OPENMP
// FIXME: refactor this parallel code to use the Eidos_SIMD code path, chunked; see github.com/MesserLab/SLiM/pull/578#issuecomment-3640288984
EIDOS_THREAD_COUNT(gEidos_OMP_threads_TRUNC);
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
#pragma omp parallel for simd schedule(simd:static) default(none) shared(x_count) firstprivate(float_data, float_result_data) if(parallel:x_count >= EIDOS_OMPMIN_TRUNC) num_threads(thread_count)
for (int value_index = 0; value_index < x_count; ++value_index)
float_result_data[value_index] = trunc(float_data[value_index]);
#else
Eidos_SIMD::trunc_float64(float_data, float_result_data, x_count);
#endif

result_SP->CopyDimensionsFromValue(x_value);

Expand Down
Loading
Loading