From 1b345277facc9a45d20a307bca3e911d571a1c76 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Wed, 17 Dec 2025 19:07:11 -0800 Subject: [PATCH 1/6] Replace manual loop unrolling with compiler pragma in mutation tallying Replace the 16x manually unrolled loop in _TallyMutationReferences_FAST_FromMutationRunUsage() with compiler-directed unrolling via EIDOS_PRAGMA_UNROLL_16. The new implementation: - Adds EIDOS_PRAGMA_UNROLL_16 macro to eidos_globals.h (GCC 8+/Clang) - Uses __restrict__ qualifiers to indicate no pointer aliasing - Uses index-based loop with explicit count for clearer loop bounds - Reduces code from 30+ lines to 11 lines Verified that all three target compilers generate 16x unrolled assembly: - Linux GCC 11 (x86_64) - MinGW GCC 10 (Windows x86_64 cross-compile) - Apple Clang 17 (ARM64) Benchmarks show equivalent performance to manual unrolling (~1.1s for 500 generations with 10K individuals and ~22K mutations). --- core/population.cpp | 41 +++++++++++------------------------------ eidos/eidos_globals.h | 8 ++++++++ 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/core/population.cpp b/core/population.cpp index 63814943..0abd4055 100644 --- a/core/population.cpp +++ b/core/population.cpp @@ -7189,36 +7189,17 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock // to put the refcounts for different mutations into different memory blocks // according to the thread that manages each mutation. - const MutationIndex *mutrun_iter = mutrun->begin_pointer_const(); - const MutationIndex *mutrun_end_iter = mutrun->end_pointer_const(); - - // I've gone back and forth on unrolling this loop. This ought to be done - // by the compiler, and the best unrolling strategy depends on the platform. - // But the compiler doesn't seem to do it, for my macOS system at least, or - // doesn't do it well; this increases speed by ~5% here. I'm not sure if - // clang is being dumb, or what, but it seems worthwhile. 
- while (mutrun_iter + 16 < mutrun_end_iter) - { - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; - } - - while (mutrun_iter != mutrun_end_iter) - *(refcount_block_ptr + (*mutrun_iter++)) += use_count; + // Loop unrolling is handled by the compiler via EIDOS_PRAGMA_UNROLL_16. + // The __restrict__ qualifiers indicate no pointer aliasing, and the + // index-based loop with explicit count helps the compiler reason about + // loop bounds. This replaces previous manual 16x unrolling. 
+ const MutationIndex * __restrict__ indices = mutrun->begin_pointer_const(); + const int32_t mutrun_count = mutrun->size(); + slim_refcount_t * __restrict__ refcounts = refcount_block_ptr; + + EIDOS_PRAGMA_UNROLL_16 + for (int32_t i = 0; i < mutrun_count; ++i) + refcounts[indices[i]] += use_count; } } diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h index 0d99d5d1..51869f01 100644 --- a/eidos/eidos_globals.h +++ b/eidos/eidos_globals.h @@ -51,6 +51,14 @@ #include "eidos_openmp.h" #include "eidos_intrusive_ptr.h" +// Loop unrolling hints for compiler optimization +// GCC 8+ and Clang both support #pragma GCC unroll +#if defined(__GNUC__) && (__GNUC__ >= 8 || defined(__clang__)) + #define EIDOS_PRAGMA_UNROLL_16 _Pragma("GCC unroll 16") +#else + #define EIDOS_PRAGMA_UNROLL_16 +#endif + class EidosScript; class EidosToken; From 570ca91f510eb958a34c5ae67acfbc8d984ca39a Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Wed, 17 Dec 2025 19:56:09 -0800 Subject: [PATCH 2/6] Use compiler-specific unroll pragmas for better cross-platform optimization - Rename EIDOS_PRAGMA_UNROLL_16 to EIDOS_UNROLL_AUTO - Clang: use #pragma clang loop unroll(enable) to auto-choose factor - GCC/MinGW: fall back to explicit #pragma GCC unroll 16 - Clang auto-chooses 8x; GCC/MinGW use 16x (both valid for this memory-bound loop) --- core/population.cpp | 5 +++-- eidos/eidos_globals.h | 11 +++++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/core/population.cpp b/core/population.cpp index 0abd4055..bb732621 100644 --- a/core/population.cpp +++ b/core/population.cpp @@ -7189,7 +7189,8 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock // to put the refcounts for different mutations into different memory blocks // according to the thread that manages each mutation. - // Loop unrolling is handled by the compiler via EIDOS_PRAGMA_UNROLL_16. + // Loop unrolling is handled by the compiler via EIDOS_UNROLL_AUTO. 
+ // Clang auto-chooses the unroll factor; GCC/MinGW use 16x. // The __restrict__ qualifiers indicate no pointer aliasing, and the // index-based loop with explicit count helps the compiler reason about // loop bounds. This replaces previous manual 16x unrolling. @@ -7197,7 +7198,7 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock const int32_t mutrun_count = mutrun->size(); slim_refcount_t * __restrict__ refcounts = refcount_block_ptr; - EIDOS_PRAGMA_UNROLL_16 + EIDOS_UNROLL_AUTO for (int32_t i = 0; i < mutrun_count; ++i) refcounts[indices[i]] += use_count; } diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h index 51869f01..b25d5210 100644 --- a/eidos/eidos_globals.h +++ b/eidos/eidos_globals.h @@ -52,11 +52,14 @@ #include "eidos_intrusive_ptr.h" // Loop unrolling hints for compiler optimization -// GCC 8+ and Clang both support #pragma GCC unroll -#if defined(__GNUC__) && (__GNUC__ >= 8 || defined(__clang__)) - #define EIDOS_PRAGMA_UNROLL_16 _Pragma("GCC unroll 16") +// Clang supports auto-choosing via #pragma clang loop unroll(enable) +// GCC 8+ requires an explicit count via #pragma GCC unroll N +#if defined(__clang__) + #define EIDOS_UNROLL_AUTO _Pragma("clang loop unroll(enable)") +#elif defined(__GNUC__) && (__GNUC__ >= 8) + #define EIDOS_UNROLL_AUTO _Pragma("GCC unroll 16") #else - #define EIDOS_PRAGMA_UNROLL_16 + #define EIDOS_UNROLL_AUTO #endif class EidosScript; From b9341750e069d8e6ba5a7b8f43b35d346d42ad86 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Wed, 17 Dec 2025 20:53:21 -0800 Subject: [PATCH 3/6] Replace pragma-based loop unrolling with compiler flags - Add -funroll-loops for GCC/MinGW in CMakeLists.txt - Add -mllvm -unroll-runtime for Clang in CMakeLists.txt - Remove EIDOS_UNROLL_AUTO macro from eidos_globals.h - Remove pragma from _TallyMutationReferences_FAST loop Both compilers now auto-detect optimal unroll factor (~8x). Benchmarks confirm no performance regression. 
--- CMakeLists.txt | 7 +++++++ core/population.cpp | 4 +--- eidos/eidos_globals.h | 11 ----------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 82f5beec..da05d7cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,6 +159,13 @@ get_git_head_revision(GIT_REFSPEC GIT_SHA1) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -Wno-attributes -Wunused-label -Wimplicit -Wunused-variable -Wunused-value -Wno-pragmas -Wempty-body -Wshadow -Wparentheses -Wmissing-prototypes -Wswitch -Wpointer-sign -Wsign-compare -Wstrict-prototypes -Wno-sign-conversion -Wuninitialized") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-attributes -Wunused-label -Wunused-variable -Wunused-value -Wno-pragmas -Wempty-body -Wshadow -Wparentheses -Wswitch -Wsign-compare -Wno-sign-conversion -Wuninitialized -fno-math-errno") +# Enable loop unrolling (GCC and Clang have different flags) +if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mllvm -unroll-runtime") +else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funroll-loops") +endif() + # Add -march=native if requested if(BUILD_NATIVE) message(STATUS "BUILD_NATIVE is ${BUILD_NATIVE}; building native (for this machine only)") diff --git a/core/population.cpp b/core/population.cpp index bb732621..07a7f018 100644 --- a/core/population.cpp +++ b/core/population.cpp @@ -7189,8 +7189,7 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock // to put the refcounts for different mutations into different memory blocks // according to the thread that manages each mutation. - // Loop unrolling is handled by the compiler via EIDOS_UNROLL_AUTO. - // Clang auto-chooses the unroll factor; GCC/MinGW use 16x. + // Loop unrolling is enabled globally via -funroll-loops in CMakeLists.txt. // The __restrict__ qualifiers indicate no pointer aliasing, and the // index-based loop with explicit count helps the compiler reason about // loop bounds. 
This replaces previous manual 16x unrolling. @@ -7198,7 +7197,6 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock const int32_t mutrun_count = mutrun->size(); slim_refcount_t * __restrict__ refcounts = refcount_block_ptr; - EIDOS_UNROLL_AUTO for (int32_t i = 0; i < mutrun_count; ++i) refcounts[indices[i]] += use_count; } diff --git a/eidos/eidos_globals.h b/eidos/eidos_globals.h index b25d5210..0d99d5d1 100644 --- a/eidos/eidos_globals.h +++ b/eidos/eidos_globals.h @@ -51,17 +51,6 @@ #include "eidos_openmp.h" #include "eidos_intrusive_ptr.h" -// Loop unrolling hints for compiler optimization -// Clang supports auto-choosing via #pragma clang loop unroll(enable) -// GCC 8+ requires an explicit count via #pragma GCC unroll N -#if defined(__clang__) - #define EIDOS_UNROLL_AUTO _Pragma("clang loop unroll(enable)") -#elif defined(__GNUC__) && (__GNUC__ >= 8) - #define EIDOS_UNROLL_AUTO _Pragma("GCC unroll 16") -#else - #define EIDOS_UNROLL_AUTO -#endif - class EidosScript; class EidosToken; From be645989cb903853a4444ddd2ad8371936d3e45f Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Thu, 18 Dec 2025 09:46:29 -0800 Subject: [PATCH 4/6] Use targeted function attribute and loop pragma for unrolling - Remove global -funroll-loops/-mllvm flags from CMakeLists.txt - Add __attribute__((optimize("unroll-loops"))) for GCC/MinGW - Add #pragma clang loop unroll(enable) for Clang - Only affects _TallyMutationReferences_FAST, not all loops Both compilers auto-detect optimal unroll factor (~8x). No performance regression. 
--- CMakeLists.txt | 7 ------- core/population.cpp | 8 +++++++- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index da05d7cd..82f5beec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,13 +159,6 @@ get_git_head_revision(GIT_REFSPEC GIT_SHA1) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -Wno-attributes -Wunused-label -Wimplicit -Wunused-variable -Wunused-value -Wno-pragmas -Wempty-body -Wshadow -Wparentheses -Wmissing-prototypes -Wswitch -Wpointer-sign -Wsign-compare -Wstrict-prototypes -Wno-sign-conversion -Wuninitialized") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-attributes -Wunused-label -Wunused-variable -Wunused-value -Wno-pragmas -Wempty-body -Wshadow -Wparentheses -Wswitch -Wsign-compare -Wno-sign-conversion -Wuninitialized -fno-math-errno") -# Enable loop unrolling (GCC and Clang have different flags) -if(CMAKE_CXX_COMPILER_ID MATCHES "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mllvm -unroll-runtime") -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -funroll-loops") -endif() - # Add -march=native if requested if(BUILD_NATIVE) message(STATUS "BUILD_NATIVE is ${BUILD_NATIVE}; building native (for this machine only)") diff --git a/core/population.cpp b/core/population.cpp index 07a7f018..27b71baf 100644 --- a/core/population.cpp +++ b/core/population.cpp @@ -7155,6 +7155,9 @@ void Population::TallyMutationReferencesAcrossHaplosomes(const Haplosome * const // the mutation run tallying itself, however; instead, the caller can tally mutation runs // across whatever set of subpops/haplosomes they wish, and then this method will provide // mutation tallies given that choice. 
+#if defined(__GNUC__) && !defined(__clang__) +__attribute__((optimize("unroll-loops"))) +#endif void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock_for_mutrun_experiments) { // first zero out the refcounts in all registered Mutation objects @@ -7189,7 +7192,7 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock // to put the refcounts for different mutations into different memory blocks // according to the thread that manages each mutation. - // Loop unrolling is enabled globally via -funroll-loops in CMakeLists.txt. + // Loop unrolling is enabled via function attribute (GCC) or pragma (Clang). // The __restrict__ qualifiers indicate no pointer aliasing, and the // index-based loop with explicit count helps the compiler reason about // loop bounds. This replaces previous manual 16x unrolling. @@ -7197,6 +7200,9 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock const int32_t mutrun_count = mutrun->size(); slim_refcount_t * __restrict__ refcounts = refcount_block_ptr; +#if defined(__clang__) +#pragma clang loop unroll(enable) +#endif for (int32_t i = 0; i < mutrun_count; ++i) refcounts[indices[i]] += use_count; } From aa95990ef40b840acb6965530cd9f2ad86de3126 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Thu, 18 Dec 2025 11:55:55 -0800 Subject: [PATCH 5/6] Address PR review: restore pointer-range loop style, clarify scope - Restore the original pointer-range while loop instead of index-based for loop - Update comment to clarify that the function attribute (GCC) only affects this function, and the pragma (Clang) only affects the immediately following loop - Both loop styles produce identical 8x unrolled assembly with GCC; pointer-range preferred for consistency with STL iterator patterns --- core/population.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/core/population.cpp b/core/population.cpp index 27b71baf..f0311799 100644 --- 
a/core/population.cpp +++ b/core/population.cpp @@ -7192,19 +7192,21 @@ void Population::_TallyMutationReferences_FAST_FromMutationRunUsage(bool p_clock // to put the refcounts for different mutations into different memory blocks // according to the thread that manages each mutation. - // Loop unrolling is enabled via function attribute (GCC) or pragma (Clang). - // The __restrict__ qualifiers indicate no pointer aliasing, and the - // index-based loop with explicit count helps the compiler reason about - // loop bounds. This replaces previous manual 16x unrolling. - const MutationIndex * __restrict__ indices = mutrun->begin_pointer_const(); - const int32_t mutrun_count = mutrun->size(); + // Loop unrolling is enabled via the function attribute above (GCC) or + // pragma below (Clang). These are both scoped: the attribute applies only + // to this function, and the pragma applies only to the immediately + // following loop. The __restrict__ qualifiers indicate no pointer + // aliasing, helping the compiler optimize. This replaces previous manual + // 16x unrolling; the compiler now chooses the optimal unroll factor. 
+ const MutationIndex * __restrict__ mutrun_iter = mutrun->begin_pointer_const(); + const MutationIndex * __restrict__ mutrun_end_iter = mutrun->end_pointer_const(); slim_refcount_t * __restrict__ refcounts = refcount_block_ptr; #if defined(__clang__) #pragma clang loop unroll(enable) #endif - for (int32_t i = 0; i < mutrun_count; ++i) - refcounts[indices[i]] += use_count; + while (mutrun_iter != mutrun_end_iter) + *(refcounts + (*mutrun_iter++)) += use_count; } } From 8198ecc6d9178cb2311c8ca728ee1cc1cae55d02 Mon Sep 17 00:00:00 2001 From: Andrew Kern Date: Thu, 18 Dec 2025 11:58:54 -0800 Subject: [PATCH 6/6] update VERSIONS --- VERSIONS | 1 + 1 file changed, 1 insertion(+) diff --git a/VERSIONS b/VERSIONS index a61461ca..19dc143d 100644 --- a/VERSIONS +++ b/VERSIONS @@ -28,6 +28,7 @@ development head (in the master branch): SIMD optimizations for spatial interaction strength calculations, thanks to Andy Kern, https://github.com/MesserLab/SLiM/pull/590 fix #593, Student's T distribution had a sign error in tdist(), thanks to Andy Kern, https://github.com/MesserLab/SLiM/issues/591 add comprehensive tests for spatial interaction kernel calculations; includes SIMD vs scalar consistency tests, C++ level tests, and script-level tests, thanks to Andy Kern, https://github.com/MesserLab/SLiM/pull/592 + replaced manual unrolling of loop in _TallyMutationReferences_FAST_FromMutationRunUsage() with compiler-directed unrolling, thanks to Andy Kern, https://github.com/MesserLab/SLiM/pull/596 version 5.1 (Eidos version 4.1):