From a535ff10ba1da8fea809fcb5a77b2179bb8c90f9 Mon Sep 17 00:00:00 2001 From: Cory Fields Date: Fri, 12 Dec 2025 21:12:40 +0000 Subject: [PATCH 01/12] chacha20: move single-block crypt to inline helper function Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 2.20 [2.20-2.21], 256B 2.35 [2.35-2.35], 64B 2.59 [2.57-2.62] - gcc 14: 1MB 2.45 [2.44-2.45], 256B 2.51 [2.51-2.51], 64B 2.69 [2.68-2.70] CHACHA20_64BYTES is the single-block path, so it's a good sanity-check for noise. Assembly (scalar path): both compilers lower `std::rotl` to rotates and keep the round math in scalar registers. Example (gcc, quarterround fragment): eor w3, w3, w7 ror w3, w3, #16 add w5, w5, w2 Delta vs base: no measurable change (this is a refactor to simplify later vector work). --- src/crypto/chacha20.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/crypto/chacha20.cpp b/src/crypto/chacha20.cpp index 6bdffe691a6b..e06e79527dc9 100644 --- a/src/crypto/chacha20.cpp +++ b/src/crypto/chacha20.cpp @@ -157,13 +157,14 @@ inline void ChaCha20Aligned::Keystream(std::span output) noexcept } } -inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::span out_bytes) noexcept +static inline void chacha20_crypt(std::span in_bytes, std::span out_bytes, uint32_t input[12]) noexcept { assert(in_bytes.size() == out_bytes.size()); const std::byte* m = in_bytes.data(); std::byte* c = out_bytes.data(); - size_t blocks = out_bytes.size() / BLOCKLEN; - assert(blocks * BLOCKLEN == out_bytes.size()); + size_t blocks = out_bytes.size() / ChaCha20Aligned::BLOCKLEN; + assert(blocks * ChaCha20Aligned::BLOCKLEN == out_bytes.size()); + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; uint32_t j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; @@ -273,11 +274,17 @@ inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::spa return; } 
blocks -= 1; - c += BLOCKLEN; - m += BLOCKLEN; + c += ChaCha20Aligned::BLOCKLEN; + m += ChaCha20Aligned::BLOCKLEN; } } + +inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::span out_bytes) noexcept +{ + chacha20_crypt(in_bytes, out_bytes, input); +} + void ChaCha20::Keystream(std::span out) noexcept { if (out.empty()) return; From 3c4a209fd34fe07d6a2a5827bc9e95677ece3d1d Mon Sep 17 00:00:00 2001 From: Cory Fields Date: Fri, 12 Dec 2025 21:37:52 +0000 Subject: [PATCH 02/12] chacha20: Add generic vectorized chacha20 implementation Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.80 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.57-2.60] - gcc 14: 1MB 5.37 [5.37-5.38], 256B 5.14 [5.13-5.15], 64B 2.70 [2.70-2.75] The speedup/slowdown only shows up once we hit the multi-block path (1MB/256B). Single-block (64B) remains scalar and stays ~unchanged. Assembly highlights (AArch64): - clang emits NEON-friendly rotates/shuffles (`shl`+`usra` and `ext`) with a small stack frame. - gcc emits a very large stack frame and scalar pack/unpack sequences around shuffles. Example prologue (gcc): mov x13, #0x9160 sub sp, sp, x13 Example inner-sequence (gcc): fmov x18, d18 bfxil x10, x18, #0, #32 Example inner-sequence (clang): usra v25.4s, v16.4s, #25 ext v22.16b, v10.16b, v10.16b, #4 Delta vs previous commit: - clang: ~18% faster at 1MB (2.20 -> 1.80 ns/B) - gcc: ~2.2x slower at 1MB (2.45 -> 5.37 ns/B) due to poor multi-state codegen. 
--- src/crypto/CMakeLists.txt | 1 + src/crypto/chacha20.cpp | 21 +- src/crypto/chacha20_vec.h | 30 +++ src/crypto/chacha20_vec.ipp | 342 +++++++++++++++++++++++++++++++ src/crypto/chacha20_vec_base.cpp | 26 +++ 5 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 src/crypto/chacha20_vec.h create mode 100644 src/crypto/chacha20_vec.ipp create mode 100644 src/crypto/chacha20_vec_base.cpp diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt index 92653ade5a7a..a247516e4a95 100644 --- a/src/crypto/CMakeLists.txt +++ b/src/crypto/CMakeLists.txt @@ -5,6 +5,7 @@ add_library(bitcoin_crypto STATIC EXCLUDE_FROM_ALL aes.cpp chacha20.cpp + chacha20_vec_base.cpp chacha20poly1305.cpp hex_base.cpp hkdf_sha256_32.cpp diff --git a/src/crypto/chacha20.cpp b/src/crypto/chacha20.cpp index e06e79527dc9..53d5e0857549 100644 --- a/src/crypto/chacha20.cpp +++ b/src/crypto/chacha20.cpp @@ -7,11 +7,15 @@ #include #include +#include #include #include #include #include +#include + +static_assert(ChaCha20Aligned::BLOCKLEN == CHACHA20_VEC_BLOCKLEN); #define QUARTERROUND(a,b,c,d) \ a += b; d = std::rotl(d ^ a, 16); \ @@ -282,7 +286,22 @@ static inline void chacha20_crypt(std::span in_bytes, std::span inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::span out_bytes) noexcept { - chacha20_crypt(in_bytes, out_bytes, input); + assert(in_bytes.size() == out_bytes.size()); + size_t blocks = out_bytes.size() / ChaCha20Aligned::BLOCKLEN; + assert(blocks * ChaCha20Aligned::BLOCKLEN == out_bytes.size()); +#ifdef ENABLE_CHACHA20_VEC + // Only use the vectorized implementations if the counter will not overflow. 
+ const bool overflow = static_cast(input[8]) + blocks > std::numeric_limits::max(); + if (blocks > 1 && !overflow) { + const auto state = std::to_array(input); + chacha20_vec_base::chacha20_crypt_vectorized(in_bytes, out_bytes, state); + const size_t blocks_written = blocks - (out_bytes.size() / ChaCha20Aligned::BLOCKLEN); + input[8] += blocks_written; + } +#endif + if (in_bytes.size()) { + chacha20_crypt(in_bytes, out_bytes, input); + } } void ChaCha20::Keystream(std::span out) noexcept diff --git a/src/crypto/chacha20_vec.h b/src/crypto/chacha20_vec.h new file mode 100644 index 000000000000..b1176d2b8dbf --- /dev/null +++ b/src/crypto/chacha20_vec.h @@ -0,0 +1,30 @@ +// Copyright (c) 2025-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_CHACHA20_VEC_H +#define BITCOIN_CRYPTO_CHACHA20_VEC_H + +#include +#include +#include +#include + +static constexpr size_t CHACHA20_VEC_BLOCKLEN = 64; + +#ifdef __has_builtin + #if __has_builtin(__builtin_shufflevector) + #define ENABLE_CHACHA20_VEC 1 + #endif +#endif + +#ifdef ENABLE_CHACHA20_VEC + +namespace chacha20_vec_base +{ + void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept; +} + +#endif // ENABLE_CHACHA20_VEC + +#endif // BITCOIN_CRYPTO_CHACHA20_VEC_H diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp new file mode 100644 index 000000000000..46a159ce01c6 --- /dev/null +++ b/src/crypto/chacha20_vec.ipp @@ -0,0 +1,342 @@ +// Copyright (c) 2025-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+ +#include + +#include +#include +#include +#include + +#if defined(ENABLE_CHACHA20_VEC) + +#if defined(CHACHA20_VEC_DISABLE_STATES_16) && \ + defined(CHACHA20_VEC_DISABLE_STATES_8) && \ + defined(CHACHA20_VEC_DISABLE_STATES_6) && \ + defined(CHACHA20_VEC_DISABLE_STATES_4) && \ + defined(CHACHA20_VEC_DISABLE_STATES_2) +#define CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +#endif + + +#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) + +#if defined(__has_attribute) +# if __has_attribute(always_inline) +# define ALWAYS_INLINE __attribute__ ((always_inline)) inline +# endif +#endif + +#if !defined(ALWAYS_INLINE) +# define ALWAYS_INLINE inline +#endif + + +namespace { + +using vec256 = uint32_t __attribute__((__vector_size__(32))); + +/** Endian-conversion for big-endian */ +ALWAYS_INLINE void vec_byteswap(vec256& vec) +{ + if constexpr (std::endian::native == std::endian::big) + { + vec256 ret; + ret[0] = __builtin_bswap32(vec[0]); + ret[1] = __builtin_bswap32(vec[1]); + ret[2] = __builtin_bswap32(vec[2]); + ret[3] = __builtin_bswap32(vec[3]); + ret[4] = __builtin_bswap32(vec[4]); + ret[5] = __builtin_bswap32(vec[5]); + ret[6] = __builtin_bswap32(vec[6]); + ret[7] = __builtin_bswap32(vec[7]); + vec = ret; + } +} + +/** Left-rotate vector */ +template +ALWAYS_INLINE void vec_rotl(vec256& vec) +{ + vec = (vec << BITS) | (vec >> (32 - BITS)); +} + +/** Store a vector in all array elements */ +template +ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) +{ + std::get(arr) = vec; + if constexpr(ITER + 1 < I ) arr_set_vec256(arr, vec); +} + +/** Add a vector to all array elements */ +template +ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) +{ + std::get(arr) += vec; + if constexpr(ITER + 1 < I ) arr_add_vec256(arr, vec); +} + +/** Add corresponding vectors in arr1 to arr0 */ +template +ALWAYS_INLINE void arr_add_arr(std::array& arr0, const std::array& arr1) +{ + std::get(arr0) += std::get(arr1); + if constexpr(ITER + 1 < I ) 
arr_add_arr(arr0, arr1); +} + +/** Perform add/xor/rotate for the round function */ +template +ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) +{ + vec256& x = std::get(arr0); + const vec256& y = std::get(arr1); + vec256& z = std::get(arr2); + + x += y; + z ^= x; + vec_rotl(z); + + if constexpr(ITER + 1 < I ) arr_add_xor_rot(arr0, arr1, arr2); +} + +/* +The first round: + QUARTERROUND( x0, x4, x8,x12); + QUARTERROUND( x1, x5, x9,x13); + QUARTERROUND( x2, x6,x10,x14); + QUARTERROUND( x3, x7,x11,x15); + +The second round: + QUARTERROUND( x0, x5,x10,x15); + QUARTERROUND( x1, x6,x11,x12); + QUARTERROUND( x2, x7, x8,x13); + QUARTERROUND( x3, x4, x9,x14); + +After the first round, arr_shuf0, arr_shuf1, and arr_shuf2 are used to shuffle +the layout to prepare for the second round. + +After the second round, they are used (in reverse) to restore the original +layout. + +*/ +template +ALWAYS_INLINE void arr_shuf0(std::array& arr) +{ + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + if constexpr(ITER + 1 < I ) arr_shuf0(arr); +} + +template +ALWAYS_INLINE void arr_shuf1(std::array& arr) +{ + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + if constexpr(ITER + 1 < I ) arr_shuf1(arr); +} + +template +ALWAYS_INLINE void arr_shuf2(std::array& arr) +{ + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + if constexpr(ITER + 1 < I ) arr_shuf2(arr); +} + +/* Main round function. 
*/ +template +ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array&arr2, std::array&arr3) +{ + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf0(arr1); + arr_shuf1(arr2); + arr_shuf2(arr3); + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf2(arr1); + arr_shuf1(arr2); + arr_shuf0(arr3); + if constexpr (ITER + 1 < 10) doubleround(arr0, arr1, arr2, arr3); +} + +/* Read 32bytes of input, xor with calculated state, write to output. Assumes + that input and output are unaligned, and makes no assumptions about the + internal layout of vec256; +*/ +ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256& vec) +{ + std::array temparr; + memcpy(temparr.data(), in_bytes.data(), in_bytes.size()); + vec256 tempvec = vec ^ (vec256){temparr[0], temparr[1], temparr[2], temparr[3], temparr[4], temparr[5], temparr[6], temparr[7]}; + vec_byteswap(tempvec); + temparr = {tempvec[0], tempvec[1], tempvec[2], tempvec[3], tempvec[4], tempvec[5], tempvec[6], tempvec[7]}; + memcpy(out_bytes.data(), temparr.data(), out_bytes.size()); +} + +/* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ +template +ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) +{ + const vec256& w = std::get(arr0); + const vec256& x = std::get(arr1); + const vec256& y = std::get(arr2); + const vec256& z = std::get(arr3); + + vec_read_xor_write(in_bytes.first<32>(), out_bytes.first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_bytes.subspan<32, 32>(), out_bytes.subspan<32, 32>(), __builtin_shufflevector(y, 
z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_bytes.subspan<64, 32>(), out_bytes.subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_bytes.subspan<96, 32>(), out_bytes.subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + + if constexpr(ITER + 1 < I ) arr_read_xor_write(in_bytes.subspan<128>(), out_bytes.subspan<128>(), arr0, arr1, arr2, arr3); +} + +/* Compile-time helper to create addend vectors which used to increment the states + + Generates vectors of the pattern: + 1 0 0 0 0 0 0 0 + 3 0 0 0 2 0 0 0 + 5 0 0 0 4 0 0 0 + ... +*/ +template +consteval std::array generate_increments() +{ + std::array rows; + for (uint32_t i = 0; i < SIZE; i ++) + { + rows[i] = (i * (vec256){2, 0, 0, 0, 2, 0, 0, 0}) + (vec256){1, 0, 0, 0, 0, 0, 0, 0}; + } + return rows; +} + +/* Main crypt function. Calculates up to 16 states. + + Each array contains one or more vectors, with each array representing a + quarter of a state. Initially, the high and low parts of each vector are + duplicated. They each contain a portion of the current and next state. + + arr0[0] arr1[0] arr2[0] arr3[0] increment + ----------|---------|----------|----------|--------- + 0x61707865 input[0] input[4] input[8] [1] + 0x3320646e input[1] input[5] input[9] [0] + 0x79622d32 input[2] input[6] input[10] [0] + 0x6b206574 input[3] input[7] input[11] [0] + + 0x61707865 input[0] input[4] input[8] [0] + 0x3320646e input[1] input[5] input[9] [0] + 0x79622d32 input[2] input[6] input[10] [0] + 0x6b206574 input[3] input[7] input[11] [0] + + After loading the states, arr3's vectors are incremented as-necessary to + contain the correct counter values. + + This way, operations like "arr0[0] += arr1[0]" can perform all 8 operations + in parallel, taking advantage of 256bit registers where available. + + arrX[0] represents states 0 and 1. + arrX[1] represents states 2 and 3 (if present) + etc. 
+ + After the doublerounds have been run and the initial state has been mixed + back in, the high and low portions of the vectors in each array are + shuffled in order to prepare them for mixing with the input bytes. Finally, + each state is xor'd with its corresponding input, byteswapped if necessary, + and written to its output. +*/ +template +ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) +{ + static constexpr size_t HALF_STATES = STATES / 2; + static constexpr vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; + static constinit std::array increments = generate_increments(); + + std::array arr0, arr1, arr2, arr3; + + arr_set_vec256(arr0, nums256); + arr_set_vec256(arr1, state0); + arr_set_vec256(arr2, state1); + arr_set_vec256(arr3, state2); + + arr_add_arr(arr3, increments); + + doubleround(arr0, arr1, arr2, arr3); + + arr_add_vec256(arr0, nums256); + arr_add_vec256(arr1, state0); + arr_add_vec256(arr2, state1); + arr_add_vec256(arr3, state2); + + arr_add_arr(arr3, increments); + + arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3); +} + +} // anonymous namespace +#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED + +#if defined(CHACHA20_NAMESPACE) +namespace CHACHA20_NAMESPACE { +#endif + +void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept +{ +#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) + assert(in_bytes.size() == out_bytes.size()); + const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; + const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; + vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; +#if !defined(CHACHA20_VEC_DISABLE_STATES_16) + 
while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 16) { + multi_block_crypt<16>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){16, 0, 0, 0, 16, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_8) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 8) { + multi_block_crypt<8>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){8, 0, 0, 0, 8, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_6) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 6) { + multi_block_crypt<6>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){6, 0, 0, 0, 6, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_4) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 4) { + multi_block_crypt<4>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){4, 0, 0, 0, 4, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_2) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 2) { + multi_block_crypt<2>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){2, 0, 0, 0, 2, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); + } +#endif +#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +} + +#if defined(CHACHA20_NAMESPACE) +} +#endif + +#endif // ENABLE_CHACHA20_VEC diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp new file mode 100644 index 000000000000..9fda9452a1c8 --- /dev/null +++ 
b/src/crypto/chacha20_vec_base.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2025-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#define CHACHA20_NAMESPACE chacha20_vec_base + +// This file should define which states should be en/disabled for all +// supported architectures. For some, like x86-64 and armv8, simd features +// (sse2 and neon respectively) are safe to use without runtime detection. + +#if defined(__x86_64__) || defined(__amd64__) +# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_8 +# define CHACHA20_VEC_DISABLE_STATES_6 +#elif defined(__ARM_NEON) +# define CHACHA20_VEC_DISABLE_STATES_2 +#else +// Be conservative and require platforms to opt-in +# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_8 +# define CHACHA20_VEC_DISABLE_STATES_6 +# define CHACHA20_VEC_DISABLE_STATES_4 +# define CHACHA20_VEC_DISABLE_STATES_2 +#endif + +#include From 0d8d4400a737d9f2dec664c9a1a48619a5eb98d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 00:06:15 +0100 Subject: [PATCH 03/12] refactor: replace recursive templates in ChaCha20 implementation with `static_for` loops Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.80 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 6.66 [6.64-6.79], 256B 5.02 [5.02-5.03], 64B 2.70 [2.68-2.72] This refactor keeps clang flat, but makes gcc's 1MB case substantially worse. Assembly highlights (gcc): instruction count explodes (CHACHA20_1MB `ins/byte` ~43.7) with many vector loads/stores and branches (lambda clones / `ld1`/`st1` heavy). 
Example (from one of the inlined helper clones): st1 {v26.16b-v27.16b}, [x4] ldp q26, q27, [x2, #64] Delta vs previous commit: - gcc: 1MB 5.37 -> 6.66 ns/B (regression) - clang: essentially unchanged. --- src/crypto/chacha20_vec.ipp | 118 ++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 47 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 46a159ce01c6..236ab84c0ae8 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #if defined(ENABLE_CHACHA20_VEC) @@ -62,43 +64,52 @@ ALWAYS_INLINE void vec_rotl(vec256& vec) vec = (vec << BITS) | (vec >> (32 - BITS)); } +template +ALWAYS_INLINE void static_for_impl(Fn&& fn, std::index_sequence) +{ + (fn(std::integral_constant{}), ...); +} + +template +ALWAYS_INLINE void static_for(Fn&& fn) +{ + static_for_impl(std::forward(fn), std::make_index_sequence{}); +} + /** Store a vector in all array elements */ -template +template ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) { - std::get(arr) = vec; - if constexpr(ITER + 1 < I ) arr_set_vec256(arr, vec); + static_for([&](auto idx) { std::get(arr) = vec; }); } /** Add a vector to all array elements */ -template +template ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) { - std::get(arr) += vec; - if constexpr(ITER + 1 < I ) arr_add_vec256(arr, vec); + static_for([&](auto idx) { std::get(arr) += vec; }); } /** Add corresponding vectors in arr1 to arr0 */ -template +template ALWAYS_INLINE void arr_add_arr(std::array& arr0, const std::array& arr1) { - std::get(arr0) += std::get(arr1); - if constexpr(ITER + 1 < I ) arr_add_arr(arr0, arr1); + static_for([&](auto idx) { std::get(arr0) += std::get(arr1); }); } /** Perform add/xor/rotate for the round function */ -template +template ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) { - vec256& x = std::get(arr0); 
- const vec256& y = std::get(arr1); - vec256& z = std::get(arr2); - - x += y; - z ^= x; - vec_rotl(z); - - if constexpr(ITER + 1 < I ) arr_add_xor_rot(arr0, arr1, arr2); + static_for([&](auto idx) { + vec256& x = std::get(arr0); + const vec256& y = std::get(arr1); + vec256& z = std::get(arr2); + + x += y; + z ^= x; + vec_rotl(z); + }); } /* @@ -121,33 +132,36 @@ After the second round, they are used (in reverse) to restore the original layout. */ -template +template ALWAYS_INLINE void arr_shuf0(std::array& arr) { - vec256& x = std::get(arr); - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); - if constexpr(ITER + 1 < I ) arr_shuf0(arr); + static_for([&](auto idx) { + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + }); } -template +template ALWAYS_INLINE void arr_shuf1(std::array& arr) { - vec256& x = std::get(arr); - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); - if constexpr(ITER + 1 < I ) arr_shuf1(arr); + static_for([&](auto idx) { + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + }); } -template +template ALWAYS_INLINE void arr_shuf2(std::array& arr) { - vec256& x = std::get(arr); - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); - if constexpr(ITER + 1 < I ) arr_shuf2(arr); + static_for([&](auto idx) { + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + }); } -/* Main round function. */ -template -ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array&arr2, std::array&arr3) +/* Run a single ChaCha20 double-round. */ +template +ALWAYS_INLINE void doubleround_once(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) { arr_add_xor_rot<16>(arr0, arr1, arr3); arr_add_xor_rot<12>(arr2, arr3, arr1); @@ -163,7 +177,13 @@ ALWAYS_INLINE void doubleround(std::array& arr0, std::array(arr0, arr1, arr2, arr3); +} + +/* Main round function. 
*/ +template +ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) +{ + static_for<10>([&](auto) { doubleround_once(arr0, arr1, arr2, arr3); }); } /* Read 32bytes of input, xor with calculated state, write to output. Assumes @@ -181,20 +201,24 @@ ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, s } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ -template +template ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { - const vec256& w = std::get(arr0); - const vec256& x = std::get(arr1); - const vec256& y = std::get(arr2); - const vec256& z = std::get(arr3); - - vec_read_xor_write(in_bytes.first<32>(), out_bytes.first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_bytes.subspan<32, 32>(), out_bytes.subspan<32, 32>(), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_bytes.subspan<64, 32>(), out_bytes.subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_bytes.subspan<96, 32>(), out_bytes.subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); - - if constexpr(ITER + 1 < I ) arr_read_xor_write(in_bytes.subspan<128>(), out_bytes.subspan<128>(), arr0, arr1, arr2, arr3); + static_for([&](auto idx) { + constexpr size_t offset = idx.value * 128; + const vec256& w = std::get(arr0); + const vec256& x = std::get(arr1); + const vec256& y = std::get(arr2); + const vec256& z = std::get(arr3); + + auto in_slice = in_bytes.template subspan(); + auto out_slice = out_bytes.template subspan(); + + vec_read_xor_write(in_slice.template first<32>(), out_slice.template first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.template subspan<32, 32>(), 
out_slice.template subspan<32, 32>(), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.template subspan<64, 32>(), out_slice.template subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice.template subspan<96, 32>(), out_slice.template subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + }); } /* Compile-time helper to create addend vectors which used to increment the states From 9d0a168f5085818b3d1992667899f6102900f9c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 00:26:57 +0100 Subject: [PATCH 04/12] refactor: replace template-based static_for use in ChaCha20 with runtime iteration Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.85 [1.85-1.89], 256B 1.72 [1.72-1.73], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 4.51 [4.50-4.51], 256B 4.59 [4.58-4.59], 64B 2.72 [2.70-2.72] This is the first refactor that materially helps gcc again: the multi-state path shrinks substantially (much less codegen bloat), reducing `ins/byte` (43.7 -> 25.5) for CHACHA20_1MB. Assembly highlight (gcc): far less scalar shuffling glue and reduced stack pressure (stack allocation drops from ~0x16c0 to ~0x1530, and objdump size shrinks sharply). Delta vs previous commit: - gcc: 1MB 6.66 -> 4.51 ns/B (still slower than scalar baseline, but improved) - clang: slight regression (1.80 -> 1.85 ns/B), consistent with less aggressive unrolling. 
--- src/crypto/chacha20_vec.ipp | 367 ++++++++++++++++++++---------------- 1 file changed, 200 insertions(+), 167 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 236ab84c0ae8..06b8512839a2 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -8,8 +8,6 @@ #include #include #include -#include -#include #if defined(ENABLE_CHACHA20_VEC) @@ -39,6 +37,8 @@ namespace { using vec256 = uint32_t __attribute__((__vector_size__(32))); +// Like Bitcoin Core's `ALWAYS_INLINE` in other files, but kept local to avoid touching shared headers. + /** Endian-conversion for big-endian */ ALWAYS_INLINE void vec_byteswap(vec256& vec) { @@ -58,58 +58,140 @@ ALWAYS_INLINE void vec_byteswap(vec256& vec) } /** Left-rotate vector */ -template -ALWAYS_INLINE void vec_rotl(vec256& vec) +ALWAYS_INLINE void vec_rotl16(vec256& vec) +{ + vec = (vec << 16) | (vec >> 16); +} + +ALWAYS_INLINE void vec_rotl12(vec256& vec) { - vec = (vec << BITS) | (vec >> (32 - BITS)); + vec = (vec << 12) | (vec >> 20); } -template -ALWAYS_INLINE void static_for_impl(Fn&& fn, std::index_sequence) +ALWAYS_INLINE void vec_rotl8(vec256& vec) { - (fn(std::integral_constant{}), ...); + vec = (vec << 8) | (vec >> 24); } -template -ALWAYS_INLINE void static_for(Fn&& fn) +ALWAYS_INLINE void vec_rotl7(vec256& vec) { - static_for_impl(std::forward(fn), std::make_index_sequence{}); + vec = (vec << 7) | (vec >> 25); +} + +static const vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; + +static const vec256 increments_1[1] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, +}; + +static const vec256 increments_2[2] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, +}; + +static const vec256 increments_3[3] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, + (vec256){5, 0, 0, 0, 4, 0, 0, 0}, +}; + +static const vec256 increments_4[4] = { + (vec256){1, 0, 
0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, + (vec256){5, 0, 0, 0, 4, 0, 0, 0}, + (vec256){7, 0, 0, 0, 6, 0, 0, 0}, +}; + +static const vec256 increments_8[8] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, + (vec256){5, 0, 0, 0, 4, 0, 0, 0}, + (vec256){7, 0, 0, 0, 6, 0, 0, 0}, + (vec256){9, 0, 0, 0, 8, 0, 0, 0}, + (vec256){11, 0, 0, 0, 10, 0, 0, 0}, + (vec256){13, 0, 0, 0, 12, 0, 0, 0}, + (vec256){15, 0, 0, 0, 14, 0, 0, 0}, +}; + +#define CHACHA20_VEC_PRAGMA(x) _Pragma(#x) +#if defined(__clang__) +#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(clang loop unroll_count(N)) +#elif defined(__GNUC__) +#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(GCC unroll N) +#else +#define CHACHA20_VEC_UNROLL(N) +#endif + +ALWAYS_INLINE const vec256* increments_for_half_states(size_t half_states) +{ + switch (half_states) { + case 1: return increments_1; + case 2: return increments_2; + case 3: return increments_3; + case 4: return increments_4; + case 8: return increments_8; + default: return nullptr; + } } /** Store a vector in all array elements */ -template -ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) +ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) { - static_for([&](auto idx) { std::get(arr) = vec; }); + for (size_t i = 0; i < half_states; ++i) { + arr[i] = vec; + } } /** Add a vector to all array elements */ -template -ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) +ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) { - static_for([&](auto idx) { std::get(arr) += vec; }); + for (size_t i = 0; i < half_states; ++i) { + arr[i] += vec; + } } /** Add corresponding vectors in arr1 to arr0 */ -template -ALWAYS_INLINE void arr_add_arr(std::array& arr0, const std::array& arr1) +ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) { - static_for([&](auto idx) { std::get(arr0) += 
std::get(arr1); }); + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + } } -/** Perform add/xor/rotate for the round function */ -template -ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) +ALWAYS_INLINE void arr_add_xor_rot16(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr0); - const vec256& y = std::get(arr1); - vec256& z = std::get(arr2); - - x += y; - z ^= x; - vec_rotl(z); - }); + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl16(arr2[i]); + } +} + +ALWAYS_INLINE void arr_add_xor_rot12(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +{ + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl12(arr2[i]); + } +} + +ALWAYS_INLINE void arr_add_xor_rot8(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +{ + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl8(arr2[i]); + } +} + +ALWAYS_INLINE void arr_add_xor_rot7(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +{ + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl7(arr2[i]); + } } /* @@ -132,176 +214,127 @@ After the second round, they are used (in reverse) to restore the original layout. 
*/ -template -ALWAYS_INLINE void arr_shuf0(std::array& arr) +ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr); + for (size_t i = 0; i < half_states; ++i) { + vec256& x = arr[i]; x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); - }); + } } -template -ALWAYS_INLINE void arr_shuf1(std::array& arr) +ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr); + for (size_t i = 0; i < half_states; ++i) { + vec256& x = arr[i]; x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); - }); + } } -template -ALWAYS_INLINE void arr_shuf2(std::array& arr) +ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr); + for (size_t i = 0; i < half_states; ++i) { + vec256& x = arr[i]; x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); - }); -} - -/* Run a single ChaCha20 double-round. */ -template -ALWAYS_INLINE void doubleround_once(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) -{ - arr_add_xor_rot<16>(arr0, arr1, arr3); - arr_add_xor_rot<12>(arr2, arr3, arr1); - arr_add_xor_rot<8>(arr0, arr1, arr3); - arr_add_xor_rot<7>(arr2, arr3, arr1); - arr_shuf0(arr1); - arr_shuf1(arr2); - arr_shuf2(arr3); - arr_add_xor_rot<16>(arr0, arr1, arr3); - arr_add_xor_rot<12>(arr2, arr3, arr1); - arr_add_xor_rot<8>(arr0, arr1, arr3); - arr_add_xor_rot<7>(arr2, arr3, arr1); - arr_shuf2(arr1); - arr_shuf1(arr2); - arr_shuf0(arr3); + } } /* Main round function. 
*/ -template -ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) +ALWAYS_INLINE void doubleround(vec256* arr0, vec256* arr1, vec256* arr2, vec256* arr3, size_t half_states) { - static_for<10>([&](auto) { doubleround_once(arr0, arr1, arr2, arr3); }); + CHACHA20_VEC_UNROLL(10) + for (size_t i = 0; i < 10; ++i) { + arr_add_xor_rot16(arr0, arr1, arr3, half_states); + arr_add_xor_rot12(arr2, arr3, arr1, half_states); + arr_add_xor_rot8(arr0, arr1, arr3, half_states); + arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_shuf0(arr1, half_states); + arr_shuf1(arr2, half_states); + arr_shuf2(arr3, half_states); + arr_add_xor_rot16(arr0, arr1, arr3, half_states); + arr_add_xor_rot12(arr2, arr3, arr1, half_states); + arr_add_xor_rot8(arr0, arr1, arr3, half_states); + arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_shuf2(arr1, half_states); + arr_shuf1(arr2, half_states); + arr_shuf0(arr3, half_states); + } } /* Read 32bytes of input, xor with calculated state, write to output. 
Assumes that input and output are unaligned, and makes no assumptions about the internal layout of vec256; */ -ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256& vec) +ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256& vec) { - std::array temparr; - memcpy(temparr.data(), in_bytes.data(), in_bytes.size()); + assert(in_bytes.size() == 32); + assert(out_bytes.size() == 32); + + uint32_t temparr[8]; + memcpy(temparr, in_bytes.data(), in_bytes.size()); vec256 tempvec = vec ^ (vec256){temparr[0], temparr[1], temparr[2], temparr[3], temparr[4], temparr[5], temparr[6], temparr[7]}; vec_byteswap(tempvec); - temparr = {tempvec[0], tempvec[1], tempvec[2], tempvec[3], tempvec[4], tempvec[5], tempvec[6], tempvec[7]}; - memcpy(out_bytes.data(), temparr.data(), out_bytes.size()); + temparr[0] = tempvec[0]; + temparr[1] = tempvec[1]; + temparr[2] = tempvec[2]; + temparr[3] = tempvec[3]; + temparr[4] = tempvec[4]; + temparr[5] = tempvec[5]; + temparr[6] = tempvec[6]; + temparr[7] = tempvec[7]; + memcpy(out_bytes.data(), temparr, out_bytes.size()); } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ -template -ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) -{ - static_for([&](auto idx) { - constexpr size_t offset = idx.value * 128; - const vec256& w = std::get(arr0); - const vec256& x = std::get(arr1); - const vec256& y = std::get(arr2); - const vec256& z = std::get(arr3); - - auto in_slice = in_bytes.template subspan(); - auto out_slice = out_bytes.template subspan(); - - vec_read_xor_write(in_slice.template first<32>(), out_slice.template first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.template subspan<32, 32>(), out_slice.template subspan<32, 32>(), 
__builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.template subspan<64, 32>(), out_slice.template subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice.template subspan<96, 32>(), out_slice.template subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); - }); -} - -/* Compile-time helper to create addend vectors which used to increment the states - - Generates vectors of the pattern: - 1 0 0 0 0 0 0 0 - 3 0 0 0 2 0 0 0 - 5 0 0 0 4 0 0 0 - ... -*/ -template -consteval std::array generate_increments() +ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) { - std::array rows; - for (uint32_t i = 0; i < SIZE; i ++) - { - rows[i] = (i * (vec256){2, 0, 0, 0, 2, 0, 0, 0}) + (vec256){1, 0, 0, 0, 0, 0, 0, 0}; + for (size_t i = 0; i < half_states; ++i) { + const vec256& w = arr0[i]; + const vec256& x = arr1[i]; + const vec256& y = arr2[i]; + const vec256& z = arr3[i]; + + const size_t offset = i * 128; + auto in_slice = in_bytes.subspan(offset, 128); + auto out_slice = out_bytes.subspan(offset, 128); + + vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); } - return rows; } -/* Main crypt function. Calculates up to 16 states. - - Each array contains one or more vectors, with each array representing a - quarter of a state. 
Initially, the high and low parts of each vector are - duplicated. They each contain a portion of the current and next state. - - arr0[0] arr1[0] arr2[0] arr3[0] increment - ----------|---------|----------|----------|--------- - 0x61707865 input[0] input[4] input[8] [1] - 0x3320646e input[1] input[5] input[9] [0] - 0x79622d32 input[2] input[6] input[10] [0] - 0x6b206574 input[3] input[7] input[11] [0] - - 0x61707865 input[0] input[4] input[8] [0] - 0x3320646e input[1] input[5] input[9] [0] - 0x79622d32 input[2] input[6] input[10] [0] - 0x6b206574 input[3] input[7] input[11] [0] - - After loading the states, arr3's vectors are incremented as-necessary to - contain the correct counter values. - - This way, operations like "arr0[0] += arr1[0]" can perform all 8 operations - in parallel, taking advantage of 256bit registers where available. - - arrX[0] represents states 0 and 1. - arrX[1] represents states 2 and 3 (if present) - etc. - - After the doublerounds have been run and the initial state has been mixed - back in, the high and low portions of the vectors in each array are - shuffled in order to prepare them for mixing with the input bytes. Finally, - each state is xor'd with its corresponding input, byteswapped if necessary, - and written to its output. -*/ -template -ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) +/* Main crypt function. Calculates up to 16 states. 
*/ +ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) { - static constexpr size_t HALF_STATES = STATES / 2; - static constexpr vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; - static constinit std::array increments = generate_increments(); + const size_t half_states = states / 2; + const vec256* increments = increments_for_half_states(half_states); + assert(increments != nullptr); - std::array arr0, arr1, arr2, arr3; + vec256 arr0[8], arr1[8], arr2[8], arr3[8]; - arr_set_vec256(arr0, nums256); - arr_set_vec256(arr1, state0); - arr_set_vec256(arr2, state1); - arr_set_vec256(arr3, state2); + arr_set_vec256(arr0, half_states, nums256); + arr_set_vec256(arr1, half_states, state0); + arr_set_vec256(arr2, half_states, state1); + arr_set_vec256(arr3, half_states, state2); - arr_add_arr(arr3, increments); + arr_add_arr(arr3, increments, half_states); - doubleround(arr0, arr1, arr2, arr3); + doubleround(arr0, arr1, arr2, arr3, half_states); - arr_add_vec256(arr0, nums256); - arr_add_vec256(arr1, state0); - arr_add_vec256(arr2, state1); - arr_add_vec256(arr3, state2); + arr_add_vec256(arr0, half_states, nums256); + arr_add_vec256(arr1, half_states, state0); + arr_add_vec256(arr2, half_states, state1); + arr_add_vec256(arr3, half_states, state2); - arr_add_arr(arr3, increments); + arr_add_arr(arr3, increments, half_states); - arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3); + arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3, half_states); } +#undef CHACHA20_VEC_UNROLL +#undef CHACHA20_VEC_PRAGMA + } // anonymous namespace #endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED @@ -318,7 +351,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 16) { - multi_block_crypt<16>(in_bytes, out_bytes, state0, state1, state2); + 
multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 16); state2 += (vec256){16, 0, 0, 0, 16, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); @@ -326,7 +359,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 8) { - multi_block_crypt<8>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 8); state2 += (vec256){8, 0, 0, 0, 8, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); @@ -334,7 +367,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 6) { - multi_block_crypt<6>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 6); state2 += (vec256){6, 0, 0, 0, 6, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); @@ -342,7 +375,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 4) { - multi_block_crypt<4>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 4); state2 += (vec256){4, 0, 0, 0, 4, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); @@ -350,7 +383,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 2) { - multi_block_crypt<2>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 2); state2 += (vec256){2, 0, 0, 0, 2, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); From 2b44fd0deade1981935d6c5776e11be2191e59eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 01:06:04 +0100 
Subject: [PATCH 05/12] refactor: unroll ChaCha20 vector operations for improved clarity and efficiency Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.79 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 5.36 [5.35-5.36], 256B 5.16 [5.15-5.16], 64B 2.72 [2.69-2.73] The additional unrolling helps clang but hurts gcc again. On gcc the multi-state function grows and spills more (large stack frame), pushing 1MB back near the original regression. Delta vs previous commit: - gcc: 1MB 4.51 -> 5.36 ns/B (regression) - clang: 1MB 1.85 -> 1.79 ns/B (improvement) --- src/crypto/chacha20_vec.ipp | 117 ++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 06b8512839a2..e414cff085e5 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -137,60 +137,75 @@ ALWAYS_INLINE const vec256* increments_for_half_states(size_t half_states) /** Store a vector in all array elements */ ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) { - for (size_t i = 0; i < half_states; ++i) { - arr[i] = vec; + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) arr[i] = vec; } } /** Add a vector to all array elements */ ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) { - for (size_t i = 0; i < half_states; ++i) { - arr[i] += vec; + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) arr[i] += vec; } } /** Add corresponding vectors in arr1 to arr0 */ ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) arr0[i] += arr1[i]; } } ALWAYS_INLINE 
void arr_add_xor_rot16(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl16(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl16(arr2[i]); + } } } ALWAYS_INLINE void arr_add_xor_rot12(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl12(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl12(arr2[i]); + } } } ALWAYS_INLINE void arr_add_xor_rot8(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl8(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl8(arr2[i]); + } } } ALWAYS_INLINE void arr_add_xor_rot7(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl7(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl7(arr2[i]); + } } } @@ -216,25 +231,34 @@ layout. 
*/ ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + vec256& x = arr[i]; + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + } } } ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + vec256& x = arr[i]; + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + } } } ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + vec256& x = arr[i]; + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + } } } @@ -287,20 +311,23 @@ ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std:: /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - const vec256& w = arr0[i]; - const vec256& x = arr1[i]; - const vec256& y = arr2[i]; - const vec256& z = arr3[i]; - - const size_t offset = i * 128; - auto in_slice = in_bytes.subspan(offset, 128); - auto out_slice = out_bytes.subspan(offset, 128); - - vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 
5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + const vec256& w = arr0[i]; + const vec256& x = arr1[i]; + const vec256& y = arr2[i]; + const vec256& z = arr3[i]; + + const size_t offset = i * 128; + auto in_slice = in_bytes.subspan(offset, 128); + auto out_slice = out_bytes.subspan(offset, 128); + + vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + } } } From b40a32718d3e7927512caeee066f70a7ad06804d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 10:29:45 +0100 Subject: [PATCH 06/12] refactor: consolidate shuffle operations and loop handling in ChaCha20 vector implementation Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.86-1.87], 256B 1.73 [1.72-1.73], 64B 2.60 [2.58-2.60] - gcc 14: 1MB 5.74 [5.73-5.74], 256B 5.29 [5.29-5.30], 64B 2.71 [2.69-2.73] This reshuffle/loop consolidation ends up worsening both compilers slightly, but the impact is far larger on gcc. The gcc variant again has a huge stack frame and many extra instructions in the multi-state path (`ins/byte` ~35.7 for CHACHA20_1MB). 
Assembly contrast (AArch64): - clang: still uses `ext` for lane shuffles and keeps stack relatively small. - gcc: spills and uses scalar pack/unpack sequences; stack allocation is ~0x60a0. Delta vs previous commit: - clang: 1MB 1.79 -> 1.86 ns/B - gcc: 1MB 5.36 -> 5.74 ns/B --- src/attributes.h | 10 + src/crypto/chacha20_vec.ipp | 477 ++++++++++++++----------------- src/crypto/chacha20_vec_base.cpp | 8 + 3 files changed, 234 insertions(+), 261 deletions(-) diff --git a/src/attributes.h b/src/attributes.h index 275dad9f8ede..b3686b6c1772 100644 --- a/src/attributes.h +++ b/src/attributes.h @@ -24,4 +24,14 @@ # error No known always_inline attribute for this platform. #endif +#define PRAGMA(x) _Pragma(#x) + +#if defined(__clang__) +# define UNROLL_LOOP(N) PRAGMA(clang loop unroll_count(N)) +#elif defined(__GNUC__) +# define UNROLL_LOOP(N) PRAGMA(GCC unroll N) +#else +# define UNROLL_LOOP(N) +#endif + #endif // BITCOIN_ATTRIBUTES_H diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index e414cff085e5..3a92649c90d8 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -2,48 +2,71 @@ // Distributed under the MIT software license, see the accompanying // file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+#include #include #include #include +#include #include #include +#include +#include #if defined(ENABLE_CHACHA20_VEC) -#if defined(CHACHA20_VEC_DISABLE_STATES_16) && \ - defined(CHACHA20_VEC_DISABLE_STATES_8) && \ - defined(CHACHA20_VEC_DISABLE_STATES_6) && \ - defined(CHACHA20_VEC_DISABLE_STATES_4) && \ - defined(CHACHA20_VEC_DISABLE_STATES_2) -#define CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +// Convert preprocessor flags to constexpr booleans for use with if constexpr +#ifdef CHACHA20_VEC_DISABLE_STATES_16 +inline constexpr bool kEnableStates16 = false; +#else +inline constexpr bool kEnableStates16 = true; #endif +#ifdef CHACHA20_VEC_DISABLE_STATES_8 +inline constexpr bool kEnableStates8 = false; +#else +inline constexpr bool kEnableStates8 = true; +#endif -#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) - -#if defined(__has_attribute) -# if __has_attribute(always_inline) -# define ALWAYS_INLINE __attribute__ ((always_inline)) inline -# endif +#ifdef CHACHA20_VEC_DISABLE_STATES_6 +inline constexpr bool kEnableStates6 = false; +#else +inline constexpr bool kEnableStates6 = true; #endif -#if !defined(ALWAYS_INLINE) -# define ALWAYS_INLINE inline +#ifdef CHACHA20_VEC_DISABLE_STATES_4 +inline constexpr bool kEnableStates4 = false; +#else +inline constexpr bool kEnableStates4 = true; #endif +#ifdef CHACHA20_VEC_DISABLE_STATES_2 +inline constexpr bool kEnableStates2 = false; +#else +inline constexpr bool kEnableStates2 = true; +#endif -namespace { +inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 || kEnableStates6 || kEnableStates4 || kEnableStates2; +// vec256 type must be visible for if constexpr branches even when they're not taken using vec256 = uint32_t __attribute__((__vector_size__(32))); -// Like Bitcoin Core's `ALWAYS_INLINE` in other files, but kept local to avoid touching shared headers. 
+// Preprocessor check for conditional compilation of the anonymous namespace +#if !defined(CHACHA20_VEC_DISABLE_STATES_16) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_8) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_6) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_4) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_2) + +namespace { + +// Used for an optional aligned I/O fast-path. +static constexpr size_t CHACHA20_VEC_MEM_ALIGN{16}; /** Endian-conversion for big-endian */ ALWAYS_INLINE void vec_byteswap(vec256& vec) { - if constexpr (std::endian::native == std::endian::big) - { + if constexpr (std::endian::native == std::endian::big) { vec256 ret; ret[0] = __builtin_bswap32(vec[0]); ret[1] = __builtin_bswap32(vec[1]); @@ -57,156 +80,78 @@ ALWAYS_INLINE void vec_byteswap(vec256& vec) } } -/** Left-rotate vector */ -ALWAYS_INLINE void vec_rotl16(vec256& vec) +/** Left-rotate all elements in a vector by N bits */ +template +ALWAYS_INLINE void vec_rotl(vec256& vec) { - vec = (vec << 16) | (vec >> 16); + static_assert(N > 0 && N < 32, "Rotation must be between 1 and 31 bits"); + vec = (vec << N) | (vec >> (32 - N)); } -ALWAYS_INLINE void vec_rotl12(vec256& vec) -{ - vec = (vec << 12) | (vec >> 20); -} - -ALWAYS_INLINE void vec_rotl8(vec256& vec) -{ - vec = (vec << 8) | (vec >> 24); -} +static constexpr vec256 nums256 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; + +// Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} +// All smaller state counts use a prefix of this array. 
+static constexpr vec256 increments[8] = { + {1, 0, 0, 0, 0, 0, 0, 0}, + {3, 0, 0, 0, 2, 0, 0, 0}, + {5, 0, 0, 0, 4, 0, 0, 0}, + {7, 0, 0, 0, 6, 0, 0, 0}, + {9, 0, 0, 0, 8, 0, 0, 0}, + {11, 0, 0, 0, 10, 0, 0, 0}, + {13, 0, 0, 0, 12, 0, 0, 0}, + {15, 0, 0, 0, 14, 0, 0, 0}, +}; -ALWAYS_INLINE void vec_rotl7(vec256& vec) +template +ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn, std::index_sequence) { - vec = (vec << 7) | (vec >> 25); + ((I < half_states ? (fn(std::integral_constant{}), 0) : 0), ...); } -static const vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; - -static const vec256 increments_1[1] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, -}; - -static const vec256 increments_2[2] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, -}; - -static const vec256 increments_3[3] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, - (vec256){5, 0, 0, 0, 4, 0, 0, 0}, -}; - -static const vec256 increments_4[4] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, - (vec256){5, 0, 0, 0, 4, 0, 0, 0}, - (vec256){7, 0, 0, 0, 6, 0, 0, 0}, -}; - -static const vec256 increments_8[8] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, - (vec256){5, 0, 0, 0, 4, 0, 0, 0}, - (vec256){7, 0, 0, 0, 6, 0, 0, 0}, - (vec256){9, 0, 0, 0, 8, 0, 0, 0}, - (vec256){11, 0, 0, 0, 10, 0, 0, 0}, - (vec256){13, 0, 0, 0, 12, 0, 0, 0}, - (vec256){15, 0, 0, 0, 14, 0, 0, 0}, -}; - -#define CHACHA20_VEC_PRAGMA(x) _Pragma(#x) -#if defined(__clang__) -#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(clang loop unroll_count(N)) -#elif defined(__GNUC__) -#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(GCC unroll N) -#else -#define CHACHA20_VEC_UNROLL(N) -#endif - -ALWAYS_INLINE const vec256* increments_for_half_states(size_t half_states) +template +ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn) { 
- switch (half_states) { - case 1: return increments_1; - case 2: return increments_2; - case 3: return increments_3; - case 4: return increments_4; - case 8: return increments_8; - default: return nullptr; - } + for_each_half_state(half_states, std::forward(fn), std::make_index_sequence<8>{}); } /** Store a vector in all array elements */ ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) arr[i] = vec; - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr[i] = vec; + }); } /** Add a vector to all array elements */ ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) arr[i] += vec; - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr[i] += vec; + }); } /** Add corresponding vectors in arr1 to arr0 */ ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) arr0[i] += arr1[i]; - } -} - -ALWAYS_INLINE void arr_add_xor_rot16(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) -{ - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl16(arr2[i]); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr0[i] += arr1[i]; + }); } -ALWAYS_INLINE void arr_add_xor_rot12(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +/** Add arr1 to arr0, XOR result into arr2, rotate arr2 left by N bits */ +template +ALWAYS_INLINE void arr_add_xor_rot(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; 
++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl12(arr2[i]); - } - } -} - -ALWAYS_INLINE void arr_add_xor_rot8(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) -{ - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl8(arr2[i]); - } - } -} - -ALWAYS_INLINE void arr_add_xor_rot7(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) -{ - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl7(arr2[i]); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl(arr2[i]); + }); } /* @@ -229,115 +174,145 @@ After the second round, they are used (in reverse) to restore the original layout. */ +#if defined(__GNUC__) && !defined(__clang__) +template +ALWAYS_INLINE vec256 vec_shuffle(const vec256& v) +{ + static_assert(sizeof...(I) == 8); + using mask_t = int __attribute__((__vector_size__(32))); + constexpr mask_t mask{I...}; + return __builtin_shuffle(v, mask); +} + +template +ALWAYS_INLINE vec256 vec_shuffle(const vec256& a, const vec256& b) +{ + static_assert(sizeof...(I) == 8); + using mask_t = int __attribute__((__vector_size__(32))); + constexpr mask_t mask{I...}; + return __builtin_shuffle(a, b, mask); +} +#endif // defined(__GNUC__) && !defined(__clang__) + +#if defined(__GNUC__) && !defined(__clang__) +#define VEC_SHUF_SELF(x, ...) vec_shuffle<__VA_ARGS__>(x) +#define VEC_SHUF2(a, b, ...) vec_shuffle<__VA_ARGS__>(a, b) +#else +#define VEC_SHUF_SELF(x, ...) __builtin_shufflevector(x, x, __VA_ARGS__) +#define VEC_SHUF2(a, b, ...) 
__builtin_shufflevector(a, b, __VA_ARGS__) +#endif + ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + vec256& x = arr[i]; + x = VEC_SHUF_SELF(x, 1, 2, 3, 0, 5, 6, 7, 4); + }); } ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + vec256& x = arr[i]; + x = VEC_SHUF_SELF(x, 2, 3, 0, 1, 6, 7, 4, 5); + }); } ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + vec256& x = arr[i]; + x = VEC_SHUF_SELF(x, 3, 0, 1, 2, 7, 4, 5, 6); + }); } /* Main round function. 
*/ ALWAYS_INLINE void doubleround(vec256* arr0, vec256* arr1, vec256* arr2, vec256* arr3, size_t half_states) { - CHACHA20_VEC_UNROLL(10) + UNROLL_LOOP(10) for (size_t i = 0; i < 10; ++i) { - arr_add_xor_rot16(arr0, arr1, arr3, half_states); - arr_add_xor_rot12(arr2, arr3, arr1, half_states); - arr_add_xor_rot8(arr0, arr1, arr3, half_states); - arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); + arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); arr_shuf0(arr1, half_states); arr_shuf1(arr2, half_states); arr_shuf2(arr3, half_states); - arr_add_xor_rot16(arr0, arr1, arr3, half_states); - arr_add_xor_rot12(arr2, arr3, arr1, half_states); - arr_add_xor_rot8(arr0, arr1, arr3, half_states); - arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); + arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); arr_shuf2(arr1, half_states); arr_shuf1(arr2, half_states); arr_shuf0(arr3, half_states); } } -/* Read 32bytes of input, xor with calculated state, write to output. Assumes - that input and output are unaligned, and makes no assumptions about the - internal layout of vec256; +/* Read 32 bytes of input, xor with calculated state, write to output. + Supports unaligned input/output, with an optional aligned fast-path. 
*/ -ALWAYS_INLINE void vec_read_xor_write(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const vec256& vec) +template <bool AssumeAligned> +ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_bytes, const vec256& vec) { - assert(in_bytes.size() == 32); - assert(out_bytes.size() == 32); - - uint32_t temparr[8]; - memcpy(temparr, in_bytes.data(), in_bytes.size()); - vec256 tempvec = vec ^ (vec256){temparr[0], temparr[1], temparr[2], temparr[3], temparr[4], temparr[5], temparr[6], temparr[7]}; - vec_byteswap(tempvec); - temparr[0] = tempvec[0]; - temparr[1] = tempvec[1]; - temparr[2] = tempvec[2]; - temparr[3] = tempvec[3]; - temparr[4] = tempvec[4]; - temparr[5] = tempvec[5]; - temparr[6] = tempvec[6]; - temparr[7] = tempvec[7]; - memcpy(out_bytes.data(), temparr, out_bytes.size()); + if constexpr (AssumeAligned) { + in_bytes = std::assume_aligned<CHACHA20_VEC_MEM_ALIGN>(in_bytes); + out_bytes = std::assume_aligned<CHACHA20_VEC_MEM_ALIGN>(out_bytes); + } + + uint32_t tmp_arr[8]; + memcpy(tmp_arr, in_bytes, sizeof(tmp_arr)); + vec256 tmp_vec; + memcpy(&tmp_vec, tmp_arr, sizeof(tmp_vec)); + vec_byteswap(tmp_vec); + + tmp_vec ^= vec; + vec_byteswap(tmp_vec); + + memcpy(out_bytes, &tmp_vec, sizeof(tmp_vec)); +} + +template <bool AssumeAligned> +ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) +{ + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + + const vec256& w = arr0[i]; + const vec256& x = arr1[i]; + const vec256& y = arr2[i]; + const vec256& z = arr3[i]; + + const size_t offset = i * 128; + const std::byte* in_slice = in_bytes + offset; + std::byte* out_slice = out_bytes + offset; + + vec_read_xor_write<AssumeAligned>(in_slice + 0, out_slice + 0, VEC_SHUF2(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write<AssumeAligned>(in_slice + 32, out_slice + 32, VEC_SHUF2(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write<AssumeAligned>(in_slice + 64, out_slice + 64, VEC_SHUF2(w, x, 0, 1,
2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice + 96, out_slice + 96, VEC_SHUF2(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + }); } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - const vec256& w = arr0[i]; - const vec256& x = arr1[i]; - const vec256& y = arr2[i]; - const vec256& z = arr3[i]; - - const size_t offset = i * 128; - auto in_slice = in_bytes.subspan(offset, 128); - auto out_slice = out_bytes.subspan(offset, 128); - - vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); - } + constexpr std::uintptr_t mask{CHACHA20_VEC_MEM_ALIGN - 1}; + const bool aligned = ((reinterpret_cast(in_bytes.data()) | reinterpret_cast(out_bytes.data())) & mask) == 0; + + if (aligned) [[likely]] { + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); + } else { + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); } } -/* Main crypt function. Calculates up to 16 states. */ +/* Main crypt function. Calculates up to 16 states (8 half_states). 
*/ ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) { const size_t half_states = states / 2; - const vec256* increments = increments_for_half_states(half_states); - assert(increments != nullptr); - vec256 arr0[8], arr1[8], arr2[8], arr3[8]; arr_set_vec256(arr0, half_states, nums256); @@ -359,11 +334,25 @@ ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::s arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3, half_states); } -#undef CHACHA20_VEC_UNROLL -#undef CHACHA20_VEC_PRAGMA +template +ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::span& out_bytes, const vec256& state0, const vec256& state1, vec256& state2) +{ + constexpr size_t block_size = CHACHA20_VEC_BLOCKLEN * States; + constexpr vec256 increment = (vec256){static_cast(States), 0, 0, 0, static_cast(States), 0, 0, 0}; + + while (in_bytes.size() >= block_size) { + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, States); + state2 += increment; + in_bytes = in_bytes.subspan(block_size); + out_bytes = out_bytes.subspan(block_size); + } +} + +#undef VEC_SHUF_SELF +#undef VEC_SHUF2 } // anonymous namespace -#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +#endif // any multi-state enabled #if defined(CHACHA20_NAMESPACE) namespace CHACHA20_NAMESPACE { @@ -371,52 +360,18 @@ namespace CHACHA20_NAMESPACE { void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept { -#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) - assert(in_bytes.size() == out_bytes.size()); - const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; - const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; - vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; -#if 
!defined(CHACHA20_VEC_DISABLE_STATES_16) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 16) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 16); - state2 += (vec256){16, 0, 0, 0, 16, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); - } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_8) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 8) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 8); - state2 += (vec256){8, 0, 0, 0, 8, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); - } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_6) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 6) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 6); - state2 += (vec256){6, 0, 0, 0, 6, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); + if constexpr (kEnableAnyMultiState) { + assert(in_bytes.size() == out_bytes.size()); + const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; + const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; + vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; + + if constexpr (kEnableStates16) process_blocks<16>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates6) process_blocks<6>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates4) process_blocks<4>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates2) process_blocks<2>(in_bytes, out_bytes, state0, state1, state2); } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_4) - 
while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 4) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 4); - state2 += (vec256){4, 0, 0, 0, 4, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); - } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_2) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 2) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 2); - state2 += (vec256){2, 0, 0, 0, 2, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); - } -#endif -#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED } #if defined(CHACHA20_NAMESPACE) diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index 9fda9452a1c8..d282d7009ffc 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -12,6 +12,14 @@ # define CHACHA20_VEC_DISABLE_STATES_16 # define CHACHA20_VEC_DISABLE_STATES_8 # define CHACHA20_VEC_DISABLE_STATES_6 +# if defined(__GNUC__) && !defined(__clang__) +// GCC currently generates slower code for the generic vectorized implementation +// on x86_64. Disable the 4-state path for now to avoid a regression. +# define CHACHA20_VEC_DISABLE_STATES_4 +// Disable the 2-state path as well (fallback to scalar) until a faster GCC x86 +// implementation exists (e.g. via AVX2/AVX512 runtime dispatch). 
+# define CHACHA20_VEC_DISABLE_STATES_2 +# endif #elif defined(__ARM_NEON) # define CHACHA20_VEC_DISABLE_STATES_2 #else From 6a1636918f205f821578fea062572eb264eae78c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 18:39:54 +0100 Subject: [PATCH 07/12] refactor: extend NEON/AArch64 support and optimize multi-block ChaCha20 handling Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.85-1.86], 256B 1.73 [1.72-1.73], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 5.74 [5.73-5.75], 256B 5.29 [5.28-5.29], 64B 2.71 [2.69-2.73] On this Cortex-A76 benchmark, results are unchanged vs the prior commit (within measurement noise). The changes here primarily prepare/extend the generic logic for a broader set of targets. --- src/crypto/chacha20_vec.ipp | 69 +++++++++++++++++--------------- src/crypto/chacha20_vec_base.cpp | 2 +- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 3a92649c90d8..36dfdee7fa29 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -174,33 +174,8 @@ After the second round, they are used (in reverse) to restore the original layout. */ -#if defined(__GNUC__) && !defined(__clang__) -template -ALWAYS_INLINE vec256 vec_shuffle(const vec256& v) -{ - static_assert(sizeof...(I) == 8); - using mask_t = int __attribute__((__vector_size__(32))); - constexpr mask_t mask{I...}; - return __builtin_shuffle(v, mask); -} - -template -ALWAYS_INLINE vec256 vec_shuffle(const vec256& a, const vec256& b) -{ - static_assert(sizeof...(I) == 8); - using mask_t = int __attribute__((__vector_size__(32))); - constexpr mask_t mask{I...}; - return __builtin_shuffle(a, b, mask); -} -#endif // defined(__GNUC__) && !defined(__clang__) - -#if defined(__GNUC__) && !defined(__clang__) -#define VEC_SHUF_SELF(x, ...) 
vec_shuffle<__VA_ARGS__>(x) -#define VEC_SHUF2(a, b, ...) vec_shuffle<__VA_ARGS__>(a, b) -#else #define VEC_SHUF_SELF(x, ...) __builtin_shufflevector(x, x, __VA_ARGS__) #define VEC_SHUF2(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) -#endif ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { @@ -310,6 +285,35 @@ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std:: } /* Main crypt function. Calculates up to 16 states (8 half_states). */ +#if defined(__GNUC__) && !defined(__clang__) +template <size_t States> +ALWAYS_INLINE void multi_block_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) +{ + static_assert(States == 16 || States == 8 || States == 6 || States == 4 || States == 2); + constexpr size_t half_states = States / 2; + + std::array<vec256, half_states> arr0, arr1, arr2, arr3; + + arr_set_vec256(arr0.data(), half_states, nums256); + arr_set_vec256(arr1.data(), half_states, state0); + arr_set_vec256(arr2.data(), half_states, state1); + arr_set_vec256(arr3.data(), half_states, state2); + + arr_add_arr(arr3.data(), increments, half_states); + + doubleround(arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); + + arr_add_vec256(arr0.data(), half_states, nums256); + arr_add_vec256(arr1.data(), half_states, state0); + arr_add_vec256(arr2.data(), half_states, state1); + arr_add_vec256(arr3.data(), half_states, state2); + + arr_add_arr(arr3.data(), increments, half_states); + + arr_read_xor_write(in_bytes, out_bytes, arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); +} +#endif + ALWAYS_INLINE void multi_block_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) { const size_t half_states = states / 2; @@ -337,14 +341,15 @@ template ALWAYS_INLINE void process_blocks(std::span<const std::byte>& in_bytes, std::span<std::byte>& out_bytes, const vec256&
state1, vec256& state2) { - constexpr size_t block_size = CHACHA20_VEC_BLOCKLEN * States; - constexpr vec256 increment = (vec256){static_cast<uint32_t>(States), 0, 0, 0, static_cast<uint32_t>(States), 0, 0, 0}; - - while (in_bytes.size() >= block_size) { + while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { +#if defined(__GNUC__) && !defined(__clang__) + multi_block_crypt<States>(in_bytes, out_bytes, state0, state1, state2); +#else multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, States); +#endif + state2 += (vec256){static_cast<uint32_t>(States), 0, 0, 0, static_cast<uint32_t>(States), 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); } } diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index d282d7009ffc..e865d7f8fd94 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -20,7 +20,7 @@ // implementation exists (e.g. via AVX2/AVX512 runtime dispatch). # define CHACHA20_VEC_DISABLE_STATES_2 # endif -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) # define CHACHA20_VEC_DISABLE_STATES_2 #else // Be conservative and require platforms to opt-in From 3b26d2e480f972c0e528d4f21d28077428242d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 19:37:00 +0100 Subject: [PATCH 08/12] refactor: refine GCC-specific handling in ChaCha20 vectorized paths Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.85-1.86], 256B 1.72 [1.72-1.73], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 5.79 [5.78-5.81], 256B 5.29 [5.28-5.29], 64B 2.71 [2.69-2.72] This change is mostly about refining GCC gating on other architectures (e.g. x86 with/without AVX2).
On AArch64 it doesn't improve GCC's multi-state codegen yet; GCC still emits a very large vectorized function (stack allocation ~0x5920) and high instruction counts. --- src/crypto/chacha20_vec.ipp | 33 +------------------------------- src/crypto/chacha20_vec_base.cpp | 5 +++-- 2 files changed, 4 insertions(+), 34 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 36dfdee7fa29..b419111f6fb7 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -285,39 +285,12 @@ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std:: } /* Main crypt function. Calculates up to 16 states (8 half_states). */ -#if defined(__GNUC__) && !defined(__clang__) template ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) { static_assert(States == 16 || States == 8 || States == 6 || States == 4 || States == 2); constexpr size_t half_states = States / 2; - - std::array arr0, arr1, arr2, arr3; - - arr_set_vec256(arr0.data(), half_states, nums256); - arr_set_vec256(arr1.data(), half_states, state0); - arr_set_vec256(arr2.data(), half_states, state1); - arr_set_vec256(arr3.data(), half_states, state2); - - arr_add_arr(arr3.data(), increments, half_states); - - doubleround(arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); - - arr_add_vec256(arr0.data(), half_states, nums256); - arr_add_vec256(arr1.data(), half_states, state0); - arr_add_vec256(arr2.data(), half_states, state1); - arr_add_vec256(arr3.data(), half_states, state2); - - arr_add_arr(arr3.data(), increments, half_states); - - arr_read_xor_write(in_bytes, out_bytes, arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); -} -#endif - -ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) -{ - const size_t half_states = states / 2; - vec256 arr0[8], 
arr1[8], arr2[8], arr3[8]; + vec256 arr0[half_states], arr1[half_states], arr2[half_states], arr3[half_states]; arr_set_vec256(arr0, half_states, nums256); arr_set_vec256(arr1, half_states, state0); @@ -342,11 +315,7 @@ template ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::span& out_bytes, const vec256& state0, const vec256& state1, vec256& state2) { while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { -#if defined(__GNUC__) && !defined(__clang__) multi_block_crypt(in_bytes, out_bytes, state0, state1, state2); -#else - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, States); -#endif state2 += (vec256){static_cast(States), 0, 0, 0, static_cast(States), 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index e865d7f8fd94..0b3a34563ab6 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -12,9 +12,10 @@ # define CHACHA20_VEC_DISABLE_STATES_16 # define CHACHA20_VEC_DISABLE_STATES_8 # define CHACHA20_VEC_DISABLE_STATES_6 -# if defined(__GNUC__) && !defined(__clang__) +# if defined(__GNUC__) && !defined(__clang__) && !defined(__AVX2__) // GCC currently generates slower code for the generic vectorized implementation -// on x86_64. Disable the 4-state path for now to avoid a regression. +// on x86_64 unless AVX2 is enabled. Disable the 4-state path for now to avoid a +// regression. # define CHACHA20_VEC_DISABLE_STATES_4 // Disable the 2-state path as well (fallback to scalar) until a faster GCC x86 // implementation exists (e.g. via AVX2/AVX512 runtime dispatch). 
From 842a88e1ff6ef68fb8bd9ced018af3ccdcd93f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 20:07:19 +0100 Subject: [PATCH 09/12] refactor: improve GCC handling for NEON/AArch64 in ChaCha20 vectorized paths Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.86-1.86], 256B 1.73 [1.72-1.73], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 2.45 [2.44-2.45], 256B 2.53 [2.52-2.53], 64B 2.71 [2.69-2.72] Key point: gcc's multi-state vectorized path was a regression on AArch64 (5.7 ns/B class). This commit avoids that by disabling all multi-state variants for gcc on AArch64, effectively falling back to the scalar implementation for multi-block inputs (bringing gcc back near baseline). Also fix the build when all multi-state paths are disabled: avoid referencing `process_blocks` from code that is preprocessor-disabled, so GCC can compile cleanly with a complete disable set. --- src/crypto/chacha20_vec.ipp | 40 +++++++++++++++++++++----------- src/crypto/chacha20_vec_base.cpp | 13 ++++++++++- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index b419111f6fb7..e7a556d078ee 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -51,12 +51,20 @@ inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 | // vec256 type must be visible for if constexpr branches even when they're not taken using vec256 = uint32_t __attribute__((__vector_size__(32))); -// Preprocessor check for conditional compilation of the anonymous namespace +// Preprocessor check for conditional compilation of the anonymous namespace and +// the multi-state code paths. When all states are disabled, avoid referencing +// templates/functions that are not available. 
#if !defined(CHACHA20_VEC_DISABLE_STATES_16) || \ !defined(CHACHA20_VEC_DISABLE_STATES_8) || \ !defined(CHACHA20_VEC_DISABLE_STATES_6) || \ !defined(CHACHA20_VEC_DISABLE_STATES_4) || \ !defined(CHACHA20_VEC_DISABLE_STATES_2) +# define CHACHA20_VEC_ENABLE_ANY_MULTI_STATE 1 +#else +# define CHACHA20_VEC_ENABLE_ANY_MULTI_STATE 0 +#endif + +#if CHACHA20_VEC_ENABLE_ANY_MULTI_STATE namespace { @@ -326,7 +334,7 @@ ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::spa #undef VEC_SHUF2 } // anonymous namespace -#endif // any multi-state enabled +#endif // CHACHA20_VEC_ENABLE_ANY_MULTI_STATE #if defined(CHACHA20_NAMESPACE) namespace CHACHA20_NAMESPACE { @@ -334,18 +342,22 @@ namespace CHACHA20_NAMESPACE { void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept { - if constexpr (kEnableAnyMultiState) { - assert(in_bytes.size() == out_bytes.size()); - const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; - const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; - vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; - - if constexpr (kEnableStates16) process_blocks<16>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates6) process_blocks<6>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates4) process_blocks<4>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates2) process_blocks<2>(in_bytes, out_bytes, state0, state1, state2); - } +#if CHACHA20_VEC_ENABLE_ANY_MULTI_STATE + assert(in_bytes.size() == out_bytes.size()); + const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; + const vec256 state1 = (vec256){input[4], input[5], 
input[6], input[7], input[4], input[5], input[6], input[7]}; + vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; + + if constexpr (kEnableStates16) process_blocks<16>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates6) process_blocks<6>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates4) process_blocks<4>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates2) process_blocks<2>(in_bytes, out_bytes, state0, state1, state2); +#else + (void)in_bytes; + (void)out_bytes; + (void)input; +#endif } #if defined(CHACHA20_NAMESPACE) diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index 0b3a34563ab6..a2db4b34f520 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -22,7 +22,18 @@ # define CHACHA20_VEC_DISABLE_STATES_2 # endif #elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) -# define CHACHA20_VEC_DISABLE_STATES_2 +# if defined(__GNUC__) && !defined(__clang__) +// Similar to x86_64, GCC currently generates slower code for the generic +// vectorized implementation on AArch64/NEON. Disable all multi-state paths for +// now to avoid a regression. 
+# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_8 +# define CHACHA20_VEC_DISABLE_STATES_6 +# define CHACHA20_VEC_DISABLE_STATES_4 +# define CHACHA20_VEC_DISABLE_STATES_2 +# else +# define CHACHA20_VEC_DISABLE_STATES_2 +# endif #else // Be conservative and require platforms to opt-in # define CHACHA20_VEC_DISABLE_STATES_16 From 0182ceb344f953edbf83609698be689faea4722e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Fri, 9 Jan 2026 20:39:12 +0100 Subject: [PATCH 10/12] chacha20: make GCC competitive with Clang in vectorized ChaCha20 On AArch64/NEON, GCC's codegen for 256-bit `__builtin_shufflevector` patterns was the root cause of the large perf gap (scalar spills + `fmov`/`bfi`/`bfxil` sequences). Keep Clang on the existing 256-bit vector path, but use a GCC-specific split-lane `vec256` representation (two 128-bit lanes) so GCC can use native NEON shuffles and keep the state in registers. This also enables a multi-state path for GCC again on AArch64 (use 8/4-state; keep 16/6 disabled to limit register pressure). 
Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median ns/byte): - GCC 14.2: 1MB 1.85, 256B 2.17, 64B 2.71 - Clang 22: 1MB 1.87, 256B 1.73, 64B 2.59 --- src/crypto/chacha20_vec.ipp | 136 +++++++++++++++++++++++++++---- src/crypto/chacha20_vec_base.cpp | 7 +- 2 files changed, 120 insertions(+), 23 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index e7a556d078ee..dacd2522f530 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -48,8 +48,40 @@ inline constexpr bool kEnableStates2 = true; inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 || kEnableStates6 || kEnableStates4 || kEnableStates2; -// vec256 type must be visible for if constexpr branches even when they're not taken +using vec128 = uint32_t __attribute__((__vector_size__(16))); + +#if defined(__GNUC__) && !defined(__clang__) +# define CHACHA20_VEC_USE_SPLIT_LANES 1 +#else +# define CHACHA20_VEC_USE_SPLIT_LANES 0 +#endif + +#if CHACHA20_VEC_USE_SPLIT_LANES +// Represent two 128-bit lanes explicitly. This avoids GCC generating expensive +// scalar sequences for 256-bit shuffles on targets without native 256-bit SIMD +// registers (e.g. AArch64/NEON, x86/SSE2). +struct vec256 { + vec128 lo; + vec128 hi; +}; +static_assert(sizeof(vec256) == 32); + +ALWAYS_INLINE vec256& operator+=(vec256& a, const vec256& b) +{ + a.lo += b.lo; + a.hi += b.hi; + return a; +} + +ALWAYS_INLINE vec256& operator^=(vec256& a, const vec256& b) +{ + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +} +#else using vec256 = uint32_t __attribute__((__vector_size__(32))); +#endif // Preprocessor check for conditional compilation of the anonymous namespace and // the multi-state code paths. 
When all states are disabled, avoid referencing @@ -75,6 +107,20 @@ static constexpr size_t CHACHA20_VEC_MEM_ALIGN{16}; ALWAYS_INLINE void vec_byteswap(vec256& vec) { if constexpr (std::endian::native == std::endian::big) { +#if CHACHA20_VEC_USE_SPLIT_LANES + vec128 lo; + lo[0] = __builtin_bswap32(vec.lo[0]); + lo[1] = __builtin_bswap32(vec.lo[1]); + lo[2] = __builtin_bswap32(vec.lo[2]); + lo[3] = __builtin_bswap32(vec.lo[3]); + vec128 hi; + hi[0] = __builtin_bswap32(vec.hi[0]); + hi[1] = __builtin_bswap32(vec.hi[1]); + hi[2] = __builtin_bswap32(vec.hi[2]); + hi[3] = __builtin_bswap32(vec.hi[3]); + vec.lo = lo; + vec.hi = hi; +#else vec256 ret; ret[0] = __builtin_bswap32(vec[0]); ret[1] = __builtin_bswap32(vec[1]); @@ -85,6 +131,7 @@ ALWAYS_INLINE void vec_byteswap(vec256& vec) ret[6] = __builtin_bswap32(vec[6]); ret[7] = __builtin_bswap32(vec[7]); vec = ret; +#endif } } @@ -93,14 +140,34 @@ template ALWAYS_INLINE void vec_rotl(vec256& vec) { static_assert(N > 0 && N < 32, "Rotation must be between 1 and 31 bits"); +#if CHACHA20_VEC_USE_SPLIT_LANES + vec.lo = (vec.lo << N) | (vec.lo >> (32 - N)); + vec.hi = (vec.hi << N) | (vec.hi >> (32 - N)); +#else vec = (vec << N) | (vec >> (32 - N)); +#endif } +#if CHACHA20_VEC_USE_SPLIT_LANES +static constexpr vec128 nums128 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +static constexpr vec256 nums256 = {nums128, nums128}; +#else static constexpr vec256 nums256 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +#endif // Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} // All smaller state counts use a prefix of this array. 
static constexpr vec256 increments[8] = { +#if CHACHA20_VEC_USE_SPLIT_LANES + {{1, 0, 0, 0}, {0, 0, 0, 0}}, + {{3, 0, 0, 0}, {2, 0, 0, 0}}, + {{5, 0, 0, 0}, {4, 0, 0, 0}}, + {{7, 0, 0, 0}, {6, 0, 0, 0}}, + {{9, 0, 0, 0}, {8, 0, 0, 0}}, + {{11, 0, 0, 0}, {10, 0, 0, 0}}, + {{13, 0, 0, 0}, {12, 0, 0, 0}}, + {{15, 0, 0, 0}, {14, 0, 0, 0}}, +#else {1, 0, 0, 0, 0, 0, 0, 0}, {3, 0, 0, 0, 2, 0, 0, 0}, {5, 0, 0, 0, 4, 0, 0, 0}, @@ -109,6 +176,7 @@ static constexpr vec256 increments[8] = { {11, 0, 0, 0, 10, 0, 0, 0}, {13, 0, 0, 0, 12, 0, 0, 0}, {15, 0, 0, 0, 14, 0, 0, 0}, +#endif }; template @@ -182,15 +250,17 @@ After the second round, they are used (in reverse) to restore the original layout. */ -#define VEC_SHUF_SELF(x, ...) __builtin_shufflevector(x, x, __VA_ARGS__) -#define VEC_SHUF2(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) - ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { for_each_half_state(half_states, [&](auto idx) { constexpr size_t i = decltype(idx)::value; vec256& x = arr[i]; - x = VEC_SHUF_SELF(x, 1, 2, 3, 0, 5, 6, 7, 4); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); + x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); +#else + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); +#endif }); } @@ -199,7 +269,12 @@ ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) for_each_half_state(half_states, [&](auto idx) { constexpr size_t i = decltype(idx)::value; vec256& x = arr[i]; - x = VEC_SHUF_SELF(x, 2, 3, 0, 1, 6, 7, 4, 5); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); + x.hi = __builtin_shufflevector(x.hi, x.hi, 2, 3, 0, 1); +#else + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); +#endif }); } @@ -208,7 +283,12 @@ ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) for_each_half_state(half_states, [&](auto idx) { constexpr size_t i = decltype(idx)::value; vec256& x = arr[i]; - x = VEC_SHUF_SELF(x, 3, 
0, 1, 2, 7, 4, 5, 6); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); + x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); +#else + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); +#endif }); } @@ -245,10 +325,8 @@ ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_ out_bytes = std::assume_aligned(out_bytes); } - uint32_t tmp_arr[8]; - memcpy(tmp_arr, in_bytes, sizeof(tmp_arr)); vec256 tmp_vec; - memcpy(&tmp_vec, tmp_arr, sizeof(tmp_vec)); + memcpy(&tmp_vec, in_bytes, sizeof(tmp_vec)); vec_byteswap(tmp_vec); tmp_vec ^= vec; @@ -272,10 +350,17 @@ ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* const std::byte* in_slice = in_bytes + offset; std::byte* out_slice = out_bytes + offset; - vec_read_xor_write(in_slice + 0, out_slice + 0, VEC_SHUF2(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 32, out_slice + 32, VEC_SHUF2(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 64, out_slice + 64, VEC_SHUF2(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice + 96, out_slice + 96, VEC_SHUF2(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); +#if CHACHA20_VEC_USE_SPLIT_LANES + vec_read_xor_write(in_slice + 0, out_slice + 0, vec256{w.hi, x.hi}); + vec_read_xor_write(in_slice + 32, out_slice + 32, vec256{y.hi, z.hi}); + vec_read_xor_write(in_slice + 64, out_slice + 64, vec256{w.lo, x.lo}); + vec_read_xor_write(in_slice + 96, out_slice + 96, vec256{y.lo, z.lo}); +#else + vec_read_xor_write(in_slice + 0, out_slice + 0, __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice + 32, out_slice + 32, __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice + 64, out_slice + 64, __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice + 96, out_slice + 96, __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); +#endif }); } @@ -324,15 
+409,17 @@ ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::spa { while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { multi_block_crypt(in_bytes, out_bytes, state0, state1, state2); - state2 += (vec256){static_cast(States), 0, 0, 0, static_cast(States), 0, 0, 0}; + const uint32_t inc = static_cast(States); +#if CHACHA20_VEC_USE_SPLIT_LANES + state2 += vec256{{inc, 0, 0, 0}, {inc, 0, 0, 0}}; +#else + state2 += (vec256){inc, 0, 0, 0, inc, 0, 0, 0}; +#endif in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); } } -#undef VEC_SHUF_SELF -#undef VEC_SHUF2 - } // anonymous namespace #endif // CHACHA20_VEC_ENABLE_ANY_MULTI_STATE @@ -344,9 +431,19 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span(in_bytes, out_bytes, state0, state1, state2); if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); @@ -364,4 +461,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span Date: Fri, 9 Jan 2026 20:39:24 +0100 Subject: [PATCH 11/12] chacha20: AArch64: avoid 16-state spills; tighten half-state loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On AArch64/NEON there are 32 128-bit vector registers. The “16-state” variant (8 half-states) needs ~64 128-bit lanes worth of live state (because `vec256` lowers to two 128-bit lanes on NEON), so it spills heavily (notably on clang). Disable `STATES_16` on AArch64 to force the 8-state path, which fits in registers and is substantially faster. Also disable `STATES_6` on AArch64: it increases code size and hurts the common 8/4-state path on this target. Make the per-half-state helpers compile-time sized (no runtime `half_states` argument). This lets compilers fully specialize the inner loops; GCC in particular stops generating extra control-flow and spill glue around the multi-state path. 
Finally, on AArch64/NEON clang's codegen for the aligned I/O fast-path (`std::assume_aligned` + 32-byte memcpy) is slower than the plain unaligned variant. Prefer the unaligned path for clang. Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=10000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.47 [1.46-1.48], 256B 1.64 [1.64-1.65], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 1.71 [1.71-1.71], 256B 1.95 [1.95-1.97], 64B 2.70 [2.69-2.72] Delta vs previous commit (CHACHA20_1MB, -min-time=10000): - clang: 1.86 -> 1.47 ns/B (avoid 16-state spills; avoid aligned fast-path) - gcc: 1.85 -> 1.71 ns/B (tightened half-state loops) --- src/crypto/chacha20_vec.ipp | 173 +++++++++++++++++-------------- src/crypto/chacha20_vec_base.cpp | 7 +- 2 files changed, 100 insertions(+), 80 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index dacd2522f530..468204ceb99f 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -46,8 +46,6 @@ inline constexpr bool kEnableStates2 = false; inline constexpr bool kEnableStates2 = true; #endif -inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 || kEnableStates6 || kEnableStates4 || kEnableStates2; - using vec128 = uint32_t __attribute__((__vector_size__(16))); #if defined(__GNUC__) && !defined(__clang__) @@ -179,54 +177,62 @@ static constexpr vec256 increments[8] = { #endif }; -template -ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn, std::index_sequence) +/** Store a vector in all array elements */ +template +ALWAYS_INLINE void for_each_half_state(Fn&& fn, std::index_sequence) { - ((I < half_states ? 
(fn(std::integral_constant{}), 0) : 0), ...); + (fn(std::integral_constant{}), ...); } -template -ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn) +template +ALWAYS_INLINE void for_each_half_state(Fn&& fn) { - for_each_half_state(half_states, std::forward(fn), std::make_index_sequence<8>{}); + for_each_half_state(std::forward(fn), std::make_index_sequence{}); } /** Store a vector in all array elements */ -ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) +template +ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr[i] = vec; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + std::get(arr) = vec; }); } /** Add a vector to all array elements */ -ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) +template +ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr[i] += vec; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + std::get(arr) += vec; }); } /** Add corresponding vectors in arr1 to arr0 */ -ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) +template +ALWAYS_INLINE void arr_add_arr(std::array& arr0, const vec256* arr1) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr0[i] += arr1[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + std::get(arr0) += arr1[i]; }); } /** Add arr1 to arr0, XOR result into arr2, rotate arr2 left by N bits */ -template -ALWAYS_INLINE void arr_add_xor_rot(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +template +ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& 
arr2) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl(arr2[i]); + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + vec256& x = std::get(arr0); + const vec256& y = std::get(arr1); + vec256& z = std::get(arr2); + + x += y; + z ^= x; + vec_rotl(z); }); } @@ -250,11 +256,12 @@ After the second round, they are used (in reverse) to restore the original layout. */ -ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) +template +ALWAYS_INLINE void arr_shuf0(std::array& arr) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - vec256& x = arr[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + vec256& x = std::get(arr); #if CHACHA20_VEC_USE_SPLIT_LANES x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); @@ -264,11 +271,12 @@ ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) }); } -ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) +template +ALWAYS_INLINE void arr_shuf1(std::array& arr) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - vec256& x = arr[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + vec256& x = std::get(arr); #if CHACHA20_VEC_USE_SPLIT_LANES x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); x.hi = __builtin_shufflevector(x.hi, x.hi, 2, 3, 0, 1); @@ -278,11 +286,12 @@ ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) }); } -ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) +template +ALWAYS_INLINE void arr_shuf2(std::array& arr) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - vec256& x = arr[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + 
vec256& x = std::get(arr); #if CHACHA20_VEC_USE_SPLIT_LANES x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); @@ -293,24 +302,25 @@ ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) } /* Main round function. */ -ALWAYS_INLINE void doubleround(vec256* arr0, vec256* arr1, vec256* arr2, vec256* arr3, size_t half_states) +template +ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) { UNROLL_LOOP(10) for (size_t i = 0; i < 10; ++i) { - arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); - arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); - arr_shuf0(arr1, half_states); - arr_shuf1(arr2, half_states); - arr_shuf2(arr3, half_states); - arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); - arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); - arr_shuf2(arr1, half_states); - arr_shuf1(arr2, half_states); - arr_shuf0(arr3, half_states); + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf0(arr1); + arr_shuf1(arr2); + arr_shuf2(arr3); + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf2(arr1); + arr_shuf1(arr2); + arr_shuf0(arr3); } } @@ -335,16 +345,16 @@ ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_ memcpy(out_bytes, &tmp_vec, sizeof(tmp_vec)); } -template -ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) +template +ALWAYS_INLINE 
void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; - const vec256& w = arr0[i]; - const vec256& x = arr1[i]; - const vec256& y = arr2[i]; - const vec256& z = arr3[i]; + const vec256& w = std::get(arr0); + const vec256& x = std::get(arr1); + const vec256& y = std::get(arr2); + const vec256& z = std::get(arr3); const size_t offset = i * 128; const std::byte* in_slice = in_bytes + offset; @@ -365,16 +375,23 @@ ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ -ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) +template +ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { +#if defined(__clang__) && (defined(__aarch64__) || defined(__ARM_NEON) || defined(__ARM_NEON__)) + // On AArch64/NEON, clang's codegen for `std::assume_aligned` + 32-byte memcpy + // can be slower than the unaligned path. Prefer the single unaligned variant. 
+ arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); +#else constexpr std::uintptr_t mask{CHACHA20_VEC_MEM_ALIGN - 1}; const bool aligned = ((reinterpret_cast(in_bytes.data()) | reinterpret_cast(out_bytes.data())) & mask) == 0; if (aligned) [[likely]] { - arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); } else { - arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); } +#endif } /* Main crypt function. Calculates up to 16 states (8 half_states). */ @@ -383,25 +400,25 @@ ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::s { static_assert(States == 16 || States == 8 || States == 6 || States == 4 || States == 2); constexpr size_t half_states = States / 2; - vec256 arr0[half_states], arr1[half_states], arr2[half_states], arr3[half_states]; + std::array arr0, arr1, arr2, arr3; - arr_set_vec256(arr0, half_states, nums256); - arr_set_vec256(arr1, half_states, state0); - arr_set_vec256(arr2, half_states, state1); - arr_set_vec256(arr3, half_states, state2); + arr_set_vec256(arr0, nums256); + arr_set_vec256(arr1, state0); + arr_set_vec256(arr2, state1); + arr_set_vec256(arr3, state2); - arr_add_arr(arr3, increments, half_states); + arr_add_arr(arr3, increments); - doubleround(arr0, arr1, arr2, arr3, half_states); + doubleround(arr0, arr1, arr2, arr3); - arr_add_vec256(arr0, half_states, nums256); - arr_add_vec256(arr1, half_states, state0); - arr_add_vec256(arr2, half_states, state1); - arr_add_vec256(arr3, half_states, state2); + arr_add_vec256(arr0, nums256); + arr_add_vec256(arr1, state0); + arr_add_vec256(arr2, state1); + arr_add_vec256(arr3, state2); - arr_add_arr(arr3, increments, half_states); + arr_add_arr(arr3, increments); - arr_read_xor_write(in_bytes, 
out_bytes, arr0, arr1, arr2, arr3, half_states); + arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3); } template diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index d23be59aaa1e..4d7ab6839406 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -23,12 +23,15 @@ # endif #elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) # if defined(__GNUC__) && !defined(__clang__) -// GCC tends to spill heavily in the widest multi-state configuration on -// AArch64/NEON. Prefer smaller multi-state levels that fit in registers. +// The widest multi-state configuration (16) tends to spill on AArch64/NEON. +// Also disable the 6-state variant: it increases code size and hurts the +// common 8/4-state path on this target. # define CHACHA20_VEC_DISABLE_STATES_16 # define CHACHA20_VEC_DISABLE_STATES_6 # define CHACHA20_VEC_DISABLE_STATES_2 # else +# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_6 # define CHACHA20_VEC_DISABLE_STATES_2 # endif #else From 12b92cff62d70757e4dfe8888655db4a8f031fd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sat, 10 Jan 2026 22:35:32 +0100 Subject: [PATCH 12/12] chacha20: refactor vector operations with utility-based loops and helper functions --- src/crypto/chacha20_vec.ipp | 219 ++++++++++++++++++------------------ src/util/for_each_index.h | 30 +++++ 2 files changed, 137 insertions(+), 112 deletions(-) create mode 100644 src/util/for_each_index.h diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 468204ceb99f..02de40183343 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -146,6 +147,15 @@ ALWAYS_INLINE void vec_rotl(vec256& vec) #endif } +ALWAYS_INLINE void vec_add_counter(vec256& vec, uint32_t inc) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + vec += vec256{{inc, 0, 0, 0}, {inc, 0, 0, 0}}; 
+#else + vec += (vec256){inc, 0, 0, 0, inc, 0, 0, 0}; +#endif +} + #if CHACHA20_VEC_USE_SPLIT_LANES static constexpr vec128 nums128 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; static constexpr vec256 nums256 = {nums128, nums128}; @@ -153,82 +163,118 @@ static constexpr vec256 nums256 = {nums128, nums128}; static constexpr vec256 nums256 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; #endif -// Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} -// All smaller state counts use a prefix of this array. -static constexpr vec256 increments[8] = { +ALWAYS_INLINE vec256 vec_broadcast4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) +{ #if CHACHA20_VEC_USE_SPLIT_LANES - {{1, 0, 0, 0}, {0, 0, 0, 0}}, - {{3, 0, 0, 0}, {2, 0, 0, 0}}, - {{5, 0, 0, 0}, {4, 0, 0, 0}}, - {{7, 0, 0, 0}, {6, 0, 0, 0}}, - {{9, 0, 0, 0}, {8, 0, 0, 0}}, - {{11, 0, 0, 0}, {10, 0, 0, 0}}, - {{13, 0, 0, 0}, {12, 0, 0, 0}}, - {{15, 0, 0, 0}, {14, 0, 0, 0}}, + const vec128 lane = {a, b, c, d}; + return vec256{lane, lane}; #else - {1, 0, 0, 0, 0, 0, 0, 0}, - {3, 0, 0, 0, 2, 0, 0, 0}, - {5, 0, 0, 0, 4, 0, 0, 0}, - {7, 0, 0, 0, 6, 0, 0, 0}, - {9, 0, 0, 0, 8, 0, 0, 0}, - {11, 0, 0, 0, 10, 0, 0, 0}, - {13, 0, 0, 0, 12, 0, 0, 0}, - {15, 0, 0, 0, 14, 0, 0, 0}, + return (vec256){a, b, c, d, a, b, c, d}; #endif -}; +} -/** Store a vector in all array elements */ -template -ALWAYS_INLINE void for_each_half_state(Fn&& fn, std::index_sequence) +ALWAYS_INLINE void vec_shuf0(vec256& x) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); + x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); +#else + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); +#endif +} + +ALWAYS_INLINE void vec_shuf1(vec256& x) { - (fn(std::integral_constant{}), ...); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); + x.hi = __builtin_shufflevector(x.hi, 
x.hi, 2, 3, 0, 1); +#else + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); +#endif +} + +ALWAYS_INLINE void vec_shuf2(vec256& x) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); + x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); +#else + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); +#endif +} + +ALWAYS_INLINE vec256 vec_pack_hi(const vec256& a, const vec256& b) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + return vec256{a.hi, b.hi}; +#else + return __builtin_shufflevector(a, b, 4, 5, 6, 7, 12, 13, 14, 15); +#endif +} + +ALWAYS_INLINE vec256 vec_pack_lo(const vec256& a, const vec256& b) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + return vec256{a.lo, b.lo}; +#else + return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8, 9, 10, 11); +#endif } -template -ALWAYS_INLINE void for_each_half_state(Fn&& fn) +#if CHACHA20_VEC_USE_SPLIT_LANES +static constexpr vec256 make_increment(uint32_t odd, uint32_t even) +{ + return vec256{{odd, 0, 0, 0}, {even, 0, 0, 0}}; +} +#else +static constexpr vec256 make_increment(uint32_t odd, uint32_t even) { - for_each_half_state(std::forward(fn), std::make_index_sequence{}); + return (vec256){odd, 0, 0, 0, even, 0, 0, 0}; } +#endif + +// Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} +// All smaller state counts use a prefix of this array. 
+static constexpr vec256 increments[8] = { + make_increment(1, 0), + make_increment(3, 2), + make_increment(5, 4), + make_increment(7, 6), + make_increment(9, 8), + make_increment(11, 10), + make_increment(13, 12), + make_increment(15, 14), +}; /** Store a vector in all array elements */ template ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - std::get(arr) = vec; - }); + util::ForEachIndex([&]() { arr[I] = vec; }); } /** Add a vector to all array elements */ template ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - std::get(arr) += vec; - }); + util::ForEachIndex([&]() { arr[I] += vec; }); } /** Add corresponding vectors in arr1 to arr0 */ template ALWAYS_INLINE void arr_add_arr(std::array& arr0, const vec256* arr1) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - std::get(arr0) += arr1[i]; - }); + util::ForEachIndex([&]() { arr0[I] += arr1[I]; }); } /** Add arr1 to arr0, XOR result into arr2, rotate arr2 left by N bits */ template ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr0); - const vec256& y = std::get(arr1); - vec256& z = std::get(arr2); + util::ForEachIndex([&]() { + vec256& x = arr0[I]; + const vec256& y = arr1[I]; + vec256& z = arr2[I]; x += y; z ^= x; @@ -259,46 +305,19 @@ layout. 
template ALWAYS_INLINE void arr_shuf0(std::array& arr) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr); -#if CHACHA20_VEC_USE_SPLIT_LANES - x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); - x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); -#else - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); -#endif - }); + util::ForEachIndex([&]() { vec_shuf0(arr[I]); }); } template ALWAYS_INLINE void arr_shuf1(std::array& arr) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr); -#if CHACHA20_VEC_USE_SPLIT_LANES - x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); - x.hi = __builtin_shufflevector(x.hi, x.hi, 2, 3, 0, 1); -#else - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); -#endif - }); + util::ForEachIndex([&]() { vec_shuf1(arr[I]); }); } template ALWAYS_INLINE void arr_shuf2(std::array& arr) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr); -#if CHACHA20_VEC_USE_SPLIT_LANES - x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); - x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); -#else - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); -#endif - }); + util::ForEachIndex([&]() { vec_shuf2(arr[I]); }); } /* Main round function. 
*/ @@ -348,29 +367,20 @@ ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_ template ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; + util::ForEachIndex([&]() { + const vec256& w = arr0[I]; + const vec256& x = arr1[I]; + const vec256& y = arr2[I]; + const vec256& z = arr3[I]; - const vec256& w = std::get(arr0); - const vec256& x = std::get(arr1); - const vec256& y = std::get(arr2); - const vec256& z = std::get(arr3); - - const size_t offset = i * 128; + constexpr size_t offset = I * 128; const std::byte* in_slice = in_bytes + offset; std::byte* out_slice = out_bytes + offset; -#if CHACHA20_VEC_USE_SPLIT_LANES - vec_read_xor_write(in_slice + 0, out_slice + 0, vec256{w.hi, x.hi}); - vec_read_xor_write(in_slice + 32, out_slice + 32, vec256{y.hi, z.hi}); - vec_read_xor_write(in_slice + 64, out_slice + 64, vec256{w.lo, x.lo}); - vec_read_xor_write(in_slice + 96, out_slice + 96, vec256{y.lo, z.lo}); -#else - vec_read_xor_write(in_slice + 0, out_slice + 0, __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 32, out_slice + 32, __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 64, out_slice + 64, __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice + 96, out_slice + 96, __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); -#endif + vec_read_xor_write(in_slice + 0, out_slice + 0, vec_pack_hi(w, x)); + vec_read_xor_write(in_slice + 32, out_slice + 32, vec_pack_hi(y, z)); + vec_read_xor_write(in_slice + 64, out_slice + 64, vec_pack_lo(w, x)); + vec_read_xor_write(in_slice + 96, out_slice + 96, vec_pack_lo(y, z)); }); } @@ -426,12 +436,7 @@ ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::spa { 
while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { multi_block_crypt(in_bytes, out_bytes, state0, state1, state2); - const uint32_t inc = static_cast(States); -#if CHACHA20_VEC_USE_SPLIT_LANES - state2 += vec256{{inc, 0, 0, 0}, {inc, 0, 0, 0}}; -#else - state2 += (vec256){inc, 0, 0, 0, inc, 0, 0, 0}; -#endif + vec_add_counter(state2, static_cast(States)); in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); } @@ -448,19 +453,9 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span(in_bytes, out_bytes, state0, state1, state2); if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); diff --git a/src/util/for_each_index.h b/src/util/for_each_index.h new file mode 100644 index 000000000000..9bbd21c56479 --- /dev/null +++ b/src/util/for_each_index.h @@ -0,0 +1,30 @@ +// Copyright (c) 2026-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_UTIL_FOR_EACH_INDEX_H +#define BITCOIN_UTIL_FOR_EACH_INDEX_H + +#include + +#include +#include + +namespace util { + +/** Invoke `fn.template operator()()` for each `I` in `[0, N)`. */ +template +ALWAYS_INLINE void ForEachIndex(Fn&& fn, std::index_sequence) +{ + (fn.template operator()(), ...); +} + +template +ALWAYS_INLINE void ForEachIndex(Fn&& fn) +{ + ForEachIndex(std::forward(fn), std::make_index_sequence{}); +} + +} // namespace util + +#endif // BITCOIN_UTIL_FOR_EACH_INDEX_H