From a535ff10ba1da8fea809fcb5a77b2179bb8c90f9 Mon Sep 17 00:00:00 2001 From: Cory Fields Date: Fri, 12 Dec 2025 21:12:40 +0000 Subject: [PATCH 01/12] chacha20: move single-block crypt to inline helper function Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 2.20 [2.20-2.21], 256B 2.35 [2.35-2.35], 64B 2.59 [2.57-2.62] - gcc 14: 1MB 2.45 [2.44-2.45], 256B 2.51 [2.51-2.51], 64B 2.69 [2.68-2.70] CHACHA20_64BYTES is the single-block path, so it's a good sanity-check for noise. Assembly (scalar path): both compilers lower `std::rotl` to rotates and keep the round math in scalar registers. Example (gcc, quarterround fragment): eor w3, w3, w7 ror w3, w3, #16 add w5, w5, w2 Delta vs base: no measurable change (this is a refactor to simplify later vector work). --- src/crypto/chacha20.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/crypto/chacha20.cpp b/src/crypto/chacha20.cpp index 6bdffe691a6b..e06e79527dc9 100644 --- a/src/crypto/chacha20.cpp +++ b/src/crypto/chacha20.cpp @@ -157,13 +157,14 @@ inline void ChaCha20Aligned::Keystream(std::span output) noexcept } } -inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::span out_bytes) noexcept +static inline void chacha20_crypt(std::span in_bytes, std::span out_bytes, uint32_t input[12]) noexcept { assert(in_bytes.size() == out_bytes.size()); const std::byte* m = in_bytes.data(); std::byte* c = out_bytes.data(); - size_t blocks = out_bytes.size() / BLOCKLEN; - assert(blocks * BLOCKLEN == out_bytes.size()); + size_t blocks = out_bytes.size() / ChaCha20Aligned::BLOCKLEN; + assert(blocks * ChaCha20Aligned::BLOCKLEN == out_bytes.size()); + uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; uint32_t j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; @@ -273,11 +274,17 @@ inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::spa return; } 
blocks -= 1; - c += BLOCKLEN; - m += BLOCKLEN; + c += ChaCha20Aligned::BLOCKLEN; + m += ChaCha20Aligned::BLOCKLEN; } } + +inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::span out_bytes) noexcept +{ + chacha20_crypt(in_bytes, out_bytes, input); +} + void ChaCha20::Keystream(std::span out) noexcept { if (out.empty()) return; From 3c4a209fd34fe07d6a2a5827bc9e95677ece3d1d Mon Sep 17 00:00:00 2001 From: Cory Fields Date: Fri, 12 Dec 2025 21:37:52 +0000 Subject: [PATCH 02/12] chacha20: Add generic vectorized chacha20 implementation Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.80 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.57-2.60] - gcc 14: 1MB 5.37 [5.37-5.38], 256B 5.14 [5.13-5.15], 64B 2.70 [2.70-2.75] The speedup/slowdown only shows up once we hit the multi-block path (1MB/256B). Single-block (64B) remains scalar and stays ~unchanged. Assembly highlights (AArch64): - clang emits NEON-friendly rotates/shuffles (`shl`+`usra` and `ext`) with a small stack frame. - gcc emits a very large stack frame and scalar pack/unpack sequences around shuffles. Example prologue (gcc): mov x13, #0x9160 sub sp, sp, x13 Example inner-sequence (gcc): fmov x18, d18 bfxil x10, x18, #0, #32 Example inner-sequence (clang): usra v25.4s, v16.4s, #25 ext v22.16b, v10.16b, v10.16b, #4 Delta vs previous commit: - clang: ~18% faster at 1MB (2.20 -> 1.80 ns/B) - gcc: ~2.2x slower at 1MB (2.45 -> 5.37 ns/B) due to poor multi-state codegen. 
--- src/crypto/CMakeLists.txt | 1 + src/crypto/chacha20.cpp | 21 +- src/crypto/chacha20_vec.h | 30 +++ src/crypto/chacha20_vec.ipp | 342 +++++++++++++++++++++++++++++++ src/crypto/chacha20_vec_base.cpp | 26 +++ 5 files changed, 419 insertions(+), 1 deletion(-) create mode 100644 src/crypto/chacha20_vec.h create mode 100644 src/crypto/chacha20_vec.ipp create mode 100644 src/crypto/chacha20_vec_base.cpp diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt index 92653ade5a7a..a247516e4a95 100644 --- a/src/crypto/CMakeLists.txt +++ b/src/crypto/CMakeLists.txt @@ -5,6 +5,7 @@ add_library(bitcoin_crypto STATIC EXCLUDE_FROM_ALL aes.cpp chacha20.cpp + chacha20_vec_base.cpp chacha20poly1305.cpp hex_base.cpp hkdf_sha256_32.cpp diff --git a/src/crypto/chacha20.cpp b/src/crypto/chacha20.cpp index e06e79527dc9..53d5e0857549 100644 --- a/src/crypto/chacha20.cpp +++ b/src/crypto/chacha20.cpp @@ -7,11 +7,15 @@ #include #include +#include #include #include #include #include +#include + +static_assert(ChaCha20Aligned::BLOCKLEN == CHACHA20_VEC_BLOCKLEN); #define QUARTERROUND(a,b,c,d) \ a += b; d = std::rotl(d ^ a, 16); \ @@ -282,7 +286,22 @@ static inline void chacha20_crypt(std::span in_bytes, std::span inline void ChaCha20Aligned::Crypt(std::span in_bytes, std::span out_bytes) noexcept { - chacha20_crypt(in_bytes, out_bytes, input); + assert(in_bytes.size() == out_bytes.size()); + size_t blocks = out_bytes.size() / ChaCha20Aligned::BLOCKLEN; + assert(blocks * ChaCha20Aligned::BLOCKLEN == out_bytes.size()); +#ifdef ENABLE_CHACHA20_VEC + // Only use the vectorized implementations if the counter will not overflow. 
+ const bool overflow = static_cast(input[8]) + blocks > std::numeric_limits::max(); + if (blocks > 1 && !overflow) { + const auto state = std::to_array(input); + chacha20_vec_base::chacha20_crypt_vectorized(in_bytes, out_bytes, state); + const size_t blocks_written = blocks - (out_bytes.size() / ChaCha20Aligned::BLOCKLEN); + input[8] += blocks_written; + } +#endif + if (in_bytes.size()) { + chacha20_crypt(in_bytes, out_bytes, input); + } } void ChaCha20::Keystream(std::span out) noexcept diff --git a/src/crypto/chacha20_vec.h b/src/crypto/chacha20_vec.h new file mode 100644 index 000000000000..b1176d2b8dbf --- /dev/null +++ b/src/crypto/chacha20_vec.h @@ -0,0 +1,30 @@ +// Copyright (c) 2025-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_CRYPTO_CHACHA20_VEC_H +#define BITCOIN_CRYPTO_CHACHA20_VEC_H + +#include +#include +#include +#include + +static constexpr size_t CHACHA20_VEC_BLOCKLEN = 64; + +#ifdef __has_builtin + #if __has_builtin(__builtin_shufflevector) + #define ENABLE_CHACHA20_VEC 1 + #endif +#endif + +#ifdef ENABLE_CHACHA20_VEC + +namespace chacha20_vec_base +{ + void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept; +} + +#endif // ENABLE_CHACHA20_VEC + +#endif // BITCOIN_CRYPTO_CHACHA20_VEC_H diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp new file mode 100644 index 000000000000..46a159ce01c6 --- /dev/null +++ b/src/crypto/chacha20_vec.ipp @@ -0,0 +1,342 @@ +// Copyright (c) 2025-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+ +#include + +#include +#include +#include +#include + +#if defined(ENABLE_CHACHA20_VEC) + +#if defined(CHACHA20_VEC_DISABLE_STATES_16) && \ + defined(CHACHA20_VEC_DISABLE_STATES_8) && \ + defined(CHACHA20_VEC_DISABLE_STATES_6) && \ + defined(CHACHA20_VEC_DISABLE_STATES_4) && \ + defined(CHACHA20_VEC_DISABLE_STATES_2) +#define CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +#endif + + +#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) + +#if defined(__has_attribute) +# if __has_attribute(always_inline) +# define ALWAYS_INLINE __attribute__ ((always_inline)) inline +# endif +#endif + +#if !defined(ALWAYS_INLINE) +# define ALWAYS_INLINE inline +#endif + + +namespace { + +using vec256 = uint32_t __attribute__((__vector_size__(32))); + +/** Endian-conversion for big-endian */ +ALWAYS_INLINE void vec_byteswap(vec256& vec) +{ + if constexpr (std::endian::native == std::endian::big) + { + vec256 ret; + ret[0] = __builtin_bswap32(vec[0]); + ret[1] = __builtin_bswap32(vec[1]); + ret[2] = __builtin_bswap32(vec[2]); + ret[3] = __builtin_bswap32(vec[3]); + ret[4] = __builtin_bswap32(vec[4]); + ret[5] = __builtin_bswap32(vec[5]); + ret[6] = __builtin_bswap32(vec[6]); + ret[7] = __builtin_bswap32(vec[7]); + vec = ret; + } +} + +/** Left-rotate vector */ +template +ALWAYS_INLINE void vec_rotl(vec256& vec) +{ + vec = (vec << BITS) | (vec >> (32 - BITS)); +} + +/** Store a vector in all array elements */ +template +ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) +{ + std::get(arr) = vec; + if constexpr(ITER + 1 < I ) arr_set_vec256(arr, vec); +} + +/** Add a vector to all array elements */ +template +ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) +{ + std::get(arr) += vec; + if constexpr(ITER + 1 < I ) arr_add_vec256(arr, vec); +} + +/** Add corresponding vectors in arr1 to arr0 */ +template +ALWAYS_INLINE void arr_add_arr(std::array& arr0, const std::array& arr1) +{ + std::get(arr0) += std::get(arr1); + if constexpr(ITER + 1 < I ) 
arr_add_arr(arr0, arr1); +} + +/** Perform add/xor/rotate for the round function */ +template +ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) +{ + vec256& x = std::get(arr0); + const vec256& y = std::get(arr1); + vec256& z = std::get(arr2); + + x += y; + z ^= x; + vec_rotl(z); + + if constexpr(ITER + 1 < I ) arr_add_xor_rot(arr0, arr1, arr2); +} + +/* +The first round: + QUARTERROUND( x0, x4, x8,x12); + QUARTERROUND( x1, x5, x9,x13); + QUARTERROUND( x2, x6,x10,x14); + QUARTERROUND( x3, x7,x11,x15); + +The second round: + QUARTERROUND( x0, x5,x10,x15); + QUARTERROUND( x1, x6,x11,x12); + QUARTERROUND( x2, x7, x8,x13); + QUARTERROUND( x3, x4, x9,x14); + +After the first round, arr_shuf0, arr_shuf1, and arr_shuf2 are used to shuffle +the layout to prepare for the second round. + +After the second round, they are used (in reverse) to restore the original +layout. + +*/ +template +ALWAYS_INLINE void arr_shuf0(std::array& arr) +{ + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + if constexpr(ITER + 1 < I ) arr_shuf0(arr); +} + +template +ALWAYS_INLINE void arr_shuf1(std::array& arr) +{ + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + if constexpr(ITER + 1 < I ) arr_shuf1(arr); +} + +template +ALWAYS_INLINE void arr_shuf2(std::array& arr) +{ + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + if constexpr(ITER + 1 < I ) arr_shuf2(arr); +} + +/* Main round function. 
*/ +template +ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array&arr2, std::array&arr3) +{ + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf0(arr1); + arr_shuf1(arr2); + arr_shuf2(arr3); + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf2(arr1); + arr_shuf1(arr2); + arr_shuf0(arr3); + if constexpr (ITER + 1 < 10) doubleround(arr0, arr1, arr2, arr3); +} + +/* Read 32bytes of input, xor with calculated state, write to output. Assumes + that input and output are unaligned, and makes no assumptions about the + internal layout of vec256; +*/ +ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256& vec) +{ + std::array temparr; + memcpy(temparr.data(), in_bytes.data(), in_bytes.size()); + vec256 tempvec = vec ^ (vec256){temparr[0], temparr[1], temparr[2], temparr[3], temparr[4], temparr[5], temparr[6], temparr[7]}; + vec_byteswap(tempvec); + temparr = {tempvec[0], tempvec[1], tempvec[2], tempvec[3], tempvec[4], tempvec[5], tempvec[6], tempvec[7]}; + memcpy(out_bytes.data(), temparr.data(), out_bytes.size()); +} + +/* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ +template +ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) +{ + const vec256& w = std::get(arr0); + const vec256& x = std::get(arr1); + const vec256& y = std::get(arr2); + const vec256& z = std::get(arr3); + + vec_read_xor_write(in_bytes.first<32>(), out_bytes.first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_bytes.subspan<32, 32>(), out_bytes.subspan<32, 32>(), __builtin_shufflevector(y, 
z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_bytes.subspan<64, 32>(), out_bytes.subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_bytes.subspan<96, 32>(), out_bytes.subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + + if constexpr(ITER + 1 < I ) arr_read_xor_write(in_bytes.subspan<128>(), out_bytes.subspan<128>(), arr0, arr1, arr2, arr3); +} + +/* Compile-time helper to create addend vectors which used to increment the states + + Generates vectors of the pattern: + 1 0 0 0 0 0 0 0 + 3 0 0 0 2 0 0 0 + 5 0 0 0 4 0 0 0 + ... +*/ +template +consteval std::array generate_increments() +{ + std::array rows; + for (uint32_t i = 0; i < SIZE; i ++) + { + rows[i] = (i * (vec256){2, 0, 0, 0, 2, 0, 0, 0}) + (vec256){1, 0, 0, 0, 0, 0, 0, 0}; + } + return rows; +} + +/* Main crypt function. Calculates up to 16 states. + + Each array contains one or more vectors, with each array representing a + quarter of a state. Initially, the high and low parts of each vector are + duplicated. They each contain a portion of the current and next state. + + arr0[0] arr1[0] arr2[0] arr3[0] increment + ----------|---------|----------|----------|--------- + 0x61707865 input[0] input[4] input[8] [1] + 0x3320646e input[1] input[5] input[9] [0] + 0x79622d32 input[2] input[6] input[10] [0] + 0x6b206574 input[3] input[7] input[11] [0] + + 0x61707865 input[0] input[4] input[8] [0] + 0x3320646e input[1] input[5] input[9] [0] + 0x79622d32 input[2] input[6] input[10] [0] + 0x6b206574 input[3] input[7] input[11] [0] + + After loading the states, arr3's vectors are incremented as-necessary to + contain the correct counter values. + + This way, operations like "arr0[0] += arr1[0]" can perform all 8 operations + in parallel, taking advantage of 256bit registers where available. + + arrX[0] represents states 0 and 1. + arrX[1] represents states 2 and 3 (if present) + etc. 
+ + After the doublerounds have been run and the initial state has been mixed + back in, the high and low portions of the vectors in each array are + shuffled in order to prepare them for mixing with the input bytes. Finally, + each state is xor'd with its corresponding input, byteswapped if necessary, + and written to its output. +*/ +template +ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) +{ + static constexpr size_t HALF_STATES = STATES / 2; + static constexpr vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; + static constinit std::array increments = generate_increments(); + + std::array arr0, arr1, arr2, arr3; + + arr_set_vec256(arr0, nums256); + arr_set_vec256(arr1, state0); + arr_set_vec256(arr2, state1); + arr_set_vec256(arr3, state2); + + arr_add_arr(arr3, increments); + + doubleround(arr0, arr1, arr2, arr3); + + arr_add_vec256(arr0, nums256); + arr_add_vec256(arr1, state0); + arr_add_vec256(arr2, state1); + arr_add_vec256(arr3, state2); + + arr_add_arr(arr3, increments); + + arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3); +} + +} // anonymous namespace +#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED + +#if defined(CHACHA20_NAMESPACE) +namespace CHACHA20_NAMESPACE { +#endif + +void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept +{ +#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) + assert(in_bytes.size() == out_bytes.size()); + const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; + const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; + vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; +#if !defined(CHACHA20_VEC_DISABLE_STATES_16) + 
while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 16) { + multi_block_crypt<16>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){16, 0, 0, 0, 16, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_8) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 8) { + multi_block_crypt<8>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){8, 0, 0, 0, 8, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_6) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 6) { + multi_block_crypt<6>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){6, 0, 0, 0, 6, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_4) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 4) { + multi_block_crypt<4>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){4, 0, 0, 0, 4, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); + } +#endif +#if !defined(CHACHA20_VEC_DISABLE_STATES_2) + while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 2) { + multi_block_crypt<2>(in_bytes, out_bytes, state0, state1, state2); + state2 += (vec256){2, 0, 0, 0, 2, 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); + } +#endif +#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +} + +#if defined(CHACHA20_NAMESPACE) +} +#endif + +#endif // ENABLE_CHACHA20_VEC diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp new file mode 100644 index 000000000000..9fda9452a1c8 --- /dev/null +++ 
b/src/crypto/chacha20_vec_base.cpp @@ -0,0 +1,26 @@ +// Copyright (c) 2025-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#define CHACHA20_NAMESPACE chacha20_vec_base + +// This file should define which states should be en/disabled for all +// supported architectures. For some, like x86-64 and armv8, simd features +// (sse2 and neon respectively) are safe to use without runtime detection. + +#if defined(__x86_64__) || defined(__amd64__) +# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_8 +# define CHACHA20_VEC_DISABLE_STATES_6 +#elif defined(__ARM_NEON) +# define CHACHA20_VEC_DISABLE_STATES_2 +#else +// Be conservative and require platforms to opt-in +# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_8 +# define CHACHA20_VEC_DISABLE_STATES_6 +# define CHACHA20_VEC_DISABLE_STATES_4 +# define CHACHA20_VEC_DISABLE_STATES_2 +#endif + +#include From 0d8d4400a737d9f2dec664c9a1a48619a5eb98d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 00:06:15 +0100 Subject: [PATCH 03/12] refactor: replace recursive templates in ChaCha20 implementation with `static_for` loops Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.80 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 6.66 [6.64-6.79], 256B 5.02 [5.02-5.03], 64B 2.70 [2.68-2.72] This refactor keeps clang flat, but makes gcc's 1MB case substantially worse. Assembly highlights (gcc): instruction count explodes (CHACHA20_1MB `ins/byte` ~43.7) with many vector loads/stores and branches (lambda clones / `ld1`/`st1` heavy). 
Example (from one of the inlined helper clones): st1 {v26.16b-v27.16b}, [x4] ldp q26, q27, [x2, #64] Delta vs previous commit: - gcc: 1MB 5.37 -> 6.66 ns/B (regression) - clang: essentially unchanged. --- src/crypto/chacha20_vec.ipp | 118 ++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 47 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 46a159ce01c6..236ab84c0ae8 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #if defined(ENABLE_CHACHA20_VEC) @@ -62,43 +64,52 @@ ALWAYS_INLINE void vec_rotl(vec256& vec) vec = (vec << BITS) | (vec >> (32 - BITS)); } +template +ALWAYS_INLINE void static_for_impl(Fn&& fn, std::index_sequence) +{ + (fn(std::integral_constant{}), ...); +} + +template +ALWAYS_INLINE void static_for(Fn&& fn) +{ + static_for_impl(std::forward(fn), std::make_index_sequence{}); +} + /** Store a vector in all array elements */ -template +template ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) { - std::get(arr) = vec; - if constexpr(ITER + 1 < I ) arr_set_vec256(arr, vec); + static_for([&](auto idx) { std::get(arr) = vec; }); } /** Add a vector to all array elements */ -template +template ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) { - std::get(arr) += vec; - if constexpr(ITER + 1 < I ) arr_add_vec256(arr, vec); + static_for([&](auto idx) { std::get(arr) += vec; }); } /** Add corresponding vectors in arr1 to arr0 */ -template +template ALWAYS_INLINE void arr_add_arr(std::array& arr0, const std::array& arr1) { - std::get(arr0) += std::get(arr1); - if constexpr(ITER + 1 < I ) arr_add_arr(arr0, arr1); + static_for([&](auto idx) { std::get(arr0) += std::get(arr1); }); } /** Perform add/xor/rotate for the round function */ -template +template ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) { - vec256& x = std::get(arr0); 
- const vec256& y = std::get(arr1); - vec256& z = std::get(arr2); - - x += y; - z ^= x; - vec_rotl(z); - - if constexpr(ITER + 1 < I ) arr_add_xor_rot(arr0, arr1, arr2); + static_for([&](auto idx) { + vec256& x = std::get(arr0); + const vec256& y = std::get(arr1); + vec256& z = std::get(arr2); + + x += y; + z ^= x; + vec_rotl(z); + }); } /* @@ -121,33 +132,36 @@ After the second round, they are used (in reverse) to restore the original layout. */ -template +template ALWAYS_INLINE void arr_shuf0(std::array& arr) { - vec256& x = std::get(arr); - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); - if constexpr(ITER + 1 < I ) arr_shuf0(arr); + static_for([&](auto idx) { + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + }); } -template +template ALWAYS_INLINE void arr_shuf1(std::array& arr) { - vec256& x = std::get(arr); - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); - if constexpr(ITER + 1 < I ) arr_shuf1(arr); + static_for([&](auto idx) { + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + }); } -template +template ALWAYS_INLINE void arr_shuf2(std::array& arr) { - vec256& x = std::get(arr); - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); - if constexpr(ITER + 1 < I ) arr_shuf2(arr); + static_for([&](auto idx) { + vec256& x = std::get(arr); + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + }); } -/* Main round function. */ -template -ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array&arr2, std::array&arr3) +/* Run a single ChaCha20 double-round. */ +template +ALWAYS_INLINE void doubleround_once(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) { arr_add_xor_rot<16>(arr0, arr1, arr3); arr_add_xor_rot<12>(arr2, arr3, arr1); @@ -163,7 +177,13 @@ ALWAYS_INLINE void doubleround(std::array& arr0, std::array(arr0, arr1, arr2, arr3); +} + +/* Main round function. 
*/ +template +ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) +{ + static_for<10>([&](auto) { doubleround_once(arr0, arr1, arr2, arr3); }); } /* Read 32bytes of input, xor with calculated state, write to output. Assumes @@ -181,20 +201,24 @@ ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, s } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ -template +template ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { - const vec256& w = std::get(arr0); - const vec256& x = std::get(arr1); - const vec256& y = std::get(arr2); - const vec256& z = std::get(arr3); - - vec_read_xor_write(in_bytes.first<32>(), out_bytes.first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_bytes.subspan<32, 32>(), out_bytes.subspan<32, 32>(), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_bytes.subspan<64, 32>(), out_bytes.subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_bytes.subspan<96, 32>(), out_bytes.subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); - - if constexpr(ITER + 1 < I ) arr_read_xor_write(in_bytes.subspan<128>(), out_bytes.subspan<128>(), arr0, arr1, arr2, arr3); + static_for([&](auto idx) { + constexpr size_t offset = idx.value * 128; + const vec256& w = std::get(arr0); + const vec256& x = std::get(arr1); + const vec256& y = std::get(arr2); + const vec256& z = std::get(arr3); + + auto in_slice = in_bytes.template subspan(); + auto out_slice = out_bytes.template subspan(); + + vec_read_xor_write(in_slice.template first<32>(), out_slice.template first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.template subspan<32, 32>(), 
out_slice.template subspan<32, 32>(), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.template subspan<64, 32>(), out_slice.template subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice.template subspan<96, 32>(), out_slice.template subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + }); } /* Compile-time helper to create addend vectors which used to increment the states From 9d0a168f5085818b3d1992667899f6102900f9c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 00:26:57 +0100 Subject: [PATCH 04/12] refactor: replace template-based static_for use in ChaCha20 with runtime iteration Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.85 [1.85-1.89], 256B 1.72 [1.72-1.73], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 4.51 [4.50-4.51], 256B 4.59 [4.58-4.59], 64B 2.72 [2.70-2.72] This is the first refactor that materially helps gcc again: the multi-state path shrinks substantially (much less codegen bloat), reducing `ins/byte` (43.7 -> 25.5) for CHACHA20_1MB. Assembly highlight (gcc): far less scalar shuffling glue and reduced stack pressure (stack allocation drops from ~0x16c0 to ~0x1530, and objdump size shrinks sharply). Delta vs previous commit: - gcc: 1MB 6.66 -> 4.51 ns/B (still slower than scalar baseline, but improved) - clang: slight regression (1.80 -> 1.85 ns/B), consistent with less aggressive unrolling. 
--- src/crypto/chacha20_vec.ipp | 367 ++++++++++++++++++++---------------- 1 file changed, 200 insertions(+), 167 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 236ab84c0ae8..06b8512839a2 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -8,8 +8,6 @@ #include #include #include -#include -#include #if defined(ENABLE_CHACHA20_VEC) @@ -39,6 +37,8 @@ namespace { using vec256 = uint32_t __attribute__((__vector_size__(32))); +// Like Bitcoin Core's `ALWAYS_INLINE` in other files, but kept local to avoid touching shared headers. + /** Endian-conversion for big-endian */ ALWAYS_INLINE void vec_byteswap(vec256& vec) { @@ -58,58 +58,140 @@ ALWAYS_INLINE void vec_byteswap(vec256& vec) } /** Left-rotate vector */ -template -ALWAYS_INLINE void vec_rotl(vec256& vec) +ALWAYS_INLINE void vec_rotl16(vec256& vec) +{ + vec = (vec << 16) | (vec >> 16); +} + +ALWAYS_INLINE void vec_rotl12(vec256& vec) { - vec = (vec << BITS) | (vec >> (32 - BITS)); + vec = (vec << 12) | (vec >> 20); } -template -ALWAYS_INLINE void static_for_impl(Fn&& fn, std::index_sequence) +ALWAYS_INLINE void vec_rotl8(vec256& vec) { - (fn(std::integral_constant{}), ...); + vec = (vec << 8) | (vec >> 24); } -template -ALWAYS_INLINE void static_for(Fn&& fn) +ALWAYS_INLINE void vec_rotl7(vec256& vec) { - static_for_impl(std::forward(fn), std::make_index_sequence{}); + vec = (vec << 7) | (vec >> 25); +} + +static const vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; + +static const vec256 increments_1[1] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, +}; + +static const vec256 increments_2[2] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, +}; + +static const vec256 increments_3[3] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, + (vec256){5, 0, 0, 0, 4, 0, 0, 0}, +}; + +static const vec256 increments_4[4] = { + (vec256){1, 0, 
0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, + (vec256){5, 0, 0, 0, 4, 0, 0, 0}, + (vec256){7, 0, 0, 0, 6, 0, 0, 0}, +}; + +static const vec256 increments_8[8] = { + (vec256){1, 0, 0, 0, 0, 0, 0, 0}, + (vec256){3, 0, 0, 0, 2, 0, 0, 0}, + (vec256){5, 0, 0, 0, 4, 0, 0, 0}, + (vec256){7, 0, 0, 0, 6, 0, 0, 0}, + (vec256){9, 0, 0, 0, 8, 0, 0, 0}, + (vec256){11, 0, 0, 0, 10, 0, 0, 0}, + (vec256){13, 0, 0, 0, 12, 0, 0, 0}, + (vec256){15, 0, 0, 0, 14, 0, 0, 0}, +}; + +#define CHACHA20_VEC_PRAGMA(x) _Pragma(#x) +#if defined(__clang__) +#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(clang loop unroll_count(N)) +#elif defined(__GNUC__) +#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(GCC unroll N) +#else +#define CHACHA20_VEC_UNROLL(N) +#endif + +ALWAYS_INLINE const vec256* increments_for_half_states(size_t half_states) +{ + switch (half_states) { + case 1: return increments_1; + case 2: return increments_2; + case 3: return increments_3; + case 4: return increments_4; + case 8: return increments_8; + default: return nullptr; + } } /** Store a vector in all array elements */ -template -ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) +ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) { - static_for([&](auto idx) { std::get(arr) = vec; }); + for (size_t i = 0; i < half_states; ++i) { + arr[i] = vec; + } } /** Add a vector to all array elements */ -template -ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) +ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) { - static_for([&](auto idx) { std::get(arr) += vec; }); + for (size_t i = 0; i < half_states; ++i) { + arr[i] += vec; + } } /** Add corresponding vectors in arr1 to arr0 */ -template -ALWAYS_INLINE void arr_add_arr(std::array& arr0, const std::array& arr1) +ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) { - static_for([&](auto idx) { std::get(arr0) += 
std::get(arr1); }); + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + } } -/** Perform add/xor/rotate for the round function */ -template -ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) +ALWAYS_INLINE void arr_add_xor_rot16(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr0); - const vec256& y = std::get(arr1); - vec256& z = std::get(arr2); - - x += y; - z ^= x; - vec_rotl(z); - }); + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl16(arr2[i]); + } +} + +ALWAYS_INLINE void arr_add_xor_rot12(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +{ + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl12(arr2[i]); + } +} + +ALWAYS_INLINE void arr_add_xor_rot8(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +{ + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl8(arr2[i]); + } +} + +ALWAYS_INLINE void arr_add_xor_rot7(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +{ + for (size_t i = 0; i < half_states; ++i) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl7(arr2[i]); + } } /* @@ -132,176 +214,127 @@ After the second round, they are used (in reverse) to restore the original layout. 
*/ -template -ALWAYS_INLINE void arr_shuf0(std::array& arr) +ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr); + for (size_t i = 0; i < half_states; ++i) { + vec256& x = arr[i]; x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); - }); + } } -template -ALWAYS_INLINE void arr_shuf1(std::array& arr) +ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr); + for (size_t i = 0; i < half_states; ++i) { + vec256& x = arr[i]; x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); - }); + } } -template -ALWAYS_INLINE void arr_shuf2(std::array& arr) +ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) { - static_for([&](auto idx) { - vec256& x = std::get(arr); + for (size_t i = 0; i < half_states; ++i) { + vec256& x = arr[i]; x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); - }); -} - -/* Run a single ChaCha20 double-round. */ -template -ALWAYS_INLINE void doubleround_once(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) -{ - arr_add_xor_rot<16>(arr0, arr1, arr3); - arr_add_xor_rot<12>(arr2, arr3, arr1); - arr_add_xor_rot<8>(arr0, arr1, arr3); - arr_add_xor_rot<7>(arr2, arr3, arr1); - arr_shuf0(arr1); - arr_shuf1(arr2); - arr_shuf2(arr3); - arr_add_xor_rot<16>(arr0, arr1, arr3); - arr_add_xor_rot<12>(arr2, arr3, arr1); - arr_add_xor_rot<8>(arr0, arr1, arr3); - arr_add_xor_rot<7>(arr2, arr3, arr1); - arr_shuf2(arr1); - arr_shuf1(arr2); - arr_shuf0(arr3); + } } /* Main round function. 
*/ -template -ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) +ALWAYS_INLINE void doubleround(vec256* arr0, vec256* arr1, vec256* arr2, vec256* arr3, size_t half_states) { - static_for<10>([&](auto) { doubleround_once(arr0, arr1, arr2, arr3); }); + CHACHA20_VEC_UNROLL(10) + for (size_t i = 0; i < 10; ++i) { + arr_add_xor_rot16(arr0, arr1, arr3, half_states); + arr_add_xor_rot12(arr2, arr3, arr1, half_states); + arr_add_xor_rot8(arr0, arr1, arr3, half_states); + arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_shuf0(arr1, half_states); + arr_shuf1(arr2, half_states); + arr_shuf2(arr3, half_states); + arr_add_xor_rot16(arr0, arr1, arr3, half_states); + arr_add_xor_rot12(arr2, arr3, arr1, half_states); + arr_add_xor_rot8(arr0, arr1, arr3, half_states); + arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_shuf2(arr1, half_states); + arr_shuf1(arr2, half_states); + arr_shuf0(arr3, half_states); + } } /* Read 32bytes of input, xor with calculated state, write to output. 
Assumes that input and output are unaligned, and makes no assumptions about the internal layout of vec256; */ -ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256& vec) +ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256& vec) { - std::array temparr; - memcpy(temparr.data(), in_bytes.data(), in_bytes.size()); + assert(in_bytes.size() == 32); + assert(out_bytes.size() == 32); + + uint32_t temparr[8]; + memcpy(temparr, in_bytes.data(), in_bytes.size()); vec256 tempvec = vec ^ (vec256){temparr[0], temparr[1], temparr[2], temparr[3], temparr[4], temparr[5], temparr[6], temparr[7]}; vec_byteswap(tempvec); - temparr = {tempvec[0], tempvec[1], tempvec[2], tempvec[3], tempvec[4], tempvec[5], tempvec[6], tempvec[7]}; - memcpy(out_bytes.data(), temparr.data(), out_bytes.size()); + temparr[0] = tempvec[0]; + temparr[1] = tempvec[1]; + temparr[2] = tempvec[2]; + temparr[3] = tempvec[3]; + temparr[4] = tempvec[4]; + temparr[5] = tempvec[5]; + temparr[6] = tempvec[6]; + temparr[7] = tempvec[7]; + memcpy(out_bytes.data(), temparr, out_bytes.size()); } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ -template -ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) -{ - static_for([&](auto idx) { - constexpr size_t offset = idx.value * 128; - const vec256& w = std::get(arr0); - const vec256& x = std::get(arr1); - const vec256& y = std::get(arr2); - const vec256& z = std::get(arr3); - - auto in_slice = in_bytes.template subspan(); - auto out_slice = out_bytes.template subspan(); - - vec_read_xor_write(in_slice.template first<32>(), out_slice.template first<32>(), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.template subspan<32, 32>(), out_slice.template subspan<32, 32>(), 
__builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.template subspan<64, 32>(), out_slice.template subspan<64, 32>(), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice.template subspan<96, 32>(), out_slice.template subspan<96, 32>(), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); - }); -} - -/* Compile-time helper to create addend vectors which used to increment the states - - Generates vectors of the pattern: - 1 0 0 0 0 0 0 0 - 3 0 0 0 2 0 0 0 - 5 0 0 0 4 0 0 0 - ... -*/ -template -consteval std::array generate_increments() +ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) { - std::array rows; - for (uint32_t i = 0; i < SIZE; i ++) - { - rows[i] = (i * (vec256){2, 0, 0, 0, 2, 0, 0, 0}) + (vec256){1, 0, 0, 0, 0, 0, 0, 0}; + for (size_t i = 0; i < half_states; ++i) { + const vec256& w = arr0[i]; + const vec256& x = arr1[i]; + const vec256& y = arr2[i]; + const vec256& z = arr3[i]; + + const size_t offset = i * 128; + auto in_slice = in_bytes.subspan(offset, 128); + auto out_slice = out_bytes.subspan(offset, 128); + + vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); } - return rows; } -/* Main crypt function. Calculates up to 16 states. - - Each array contains one or more vectors, with each array representing a - quarter of a state. 
Initially, the high and low parts of each vector are - duplicated. They each contain a portion of the current and next state. - - arr0[0] arr1[0] arr2[0] arr3[0] increment - ----------|---------|----------|----------|--------- - 0x61707865 input[0] input[4] input[8] [1] - 0x3320646e input[1] input[5] input[9] [0] - 0x79622d32 input[2] input[6] input[10] [0] - 0x6b206574 input[3] input[7] input[11] [0] - - 0x61707865 input[0] input[4] input[8] [0] - 0x3320646e input[1] input[5] input[9] [0] - 0x79622d32 input[2] input[6] input[10] [0] - 0x6b206574 input[3] input[7] input[11] [0] - - After loading the states, arr3's vectors are incremented as-necessary to - contain the correct counter values. - - This way, operations like "arr0[0] += arr1[0]" can perform all 8 operations - in parallel, taking advantage of 256bit registers where available. - - arrX[0] represents states 0 and 1. - arrX[1] represents states 2 and 3 (if present) - etc. - - After the doublerounds have been run and the initial state has been mixed - back in, the high and low portions of the vectors in each array are - shuffled in order to prepare them for mixing with the input bytes. Finally, - each state is xor'd with its corresponding input, byteswapped if necessary, - and written to its output. -*/ -template -ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) +/* Main crypt function. Calculates up to 16 states. 
*/ +ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) { - static constexpr size_t HALF_STATES = STATES / 2; - static constexpr vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; - static constinit std::array increments = generate_increments(); + const size_t half_states = states / 2; + const vec256* increments = increments_for_half_states(half_states); + assert(increments != nullptr); - std::array arr0, arr1, arr2, arr3; + vec256 arr0[8], arr1[8], arr2[8], arr3[8]; - arr_set_vec256(arr0, nums256); - arr_set_vec256(arr1, state0); - arr_set_vec256(arr2, state1); - arr_set_vec256(arr3, state2); + arr_set_vec256(arr0, half_states, nums256); + arr_set_vec256(arr1, half_states, state0); + arr_set_vec256(arr2, half_states, state1); + arr_set_vec256(arr3, half_states, state2); - arr_add_arr(arr3, increments); + arr_add_arr(arr3, increments, half_states); - doubleround(arr0, arr1, arr2, arr3); + doubleround(arr0, arr1, arr2, arr3, half_states); - arr_add_vec256(arr0, nums256); - arr_add_vec256(arr1, state0); - arr_add_vec256(arr2, state1); - arr_add_vec256(arr3, state2); + arr_add_vec256(arr0, half_states, nums256); + arr_add_vec256(arr1, half_states, state0); + arr_add_vec256(arr2, half_states, state1); + arr_add_vec256(arr3, half_states, state2); - arr_add_arr(arr3, increments); + arr_add_arr(arr3, increments, half_states); - arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3); + arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3, half_states); } +#undef CHACHA20_VEC_UNROLL +#undef CHACHA20_VEC_PRAGMA + } // anonymous namespace #endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED @@ -318,7 +351,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 16) { - multi_block_crypt<16>(in_bytes, out_bytes, state0, state1, state2); + 
multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 16); state2 += (vec256){16, 0, 0, 0, 16, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); @@ -326,7 +359,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 8) { - multi_block_crypt<8>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 8); state2 += (vec256){8, 0, 0, 0, 8, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); @@ -334,7 +367,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 6) { - multi_block_crypt<6>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 6); state2 += (vec256){6, 0, 0, 0, 6, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); @@ -342,7 +375,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 4) { - multi_block_crypt<4>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 4); state2 += (vec256){4, 0, 0, 0, 4, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); @@ -350,7 +383,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span= CHACHA20_VEC_BLOCKLEN * 2) { - multi_block_crypt<2>(in_bytes, out_bytes, state0, state1, state2); + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 2); state2 += (vec256){2, 0, 0, 0, 2, 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); From 2b44fd0deade1981935d6c5776e11be2191e59eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 01:06:04 +0100 
Subject: [PATCH 05/12] refactor: unroll ChaCha20 vector operations for improved clarity and efficiency Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.79 [1.79-1.80], 256B 1.63 [1.63-1.64], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 5.36 [5.35-5.36], 256B 5.16 [5.15-5.16], 64B 2.72 [2.69-2.73] The additional unrolling helps clang but hurts gcc again. On gcc the multi-state function grows and spills more (large stack frame), pushing 1MB back near the original regression. Delta vs previous commit: - gcc: 1MB 4.51 -> 5.36 ns/B (regression) - clang: 1MB 1.85 -> 1.79 ns/B (improvement) --- src/crypto/chacha20_vec.ipp | 117 ++++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 06b8512839a2..e414cff085e5 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -137,60 +137,75 @@ ALWAYS_INLINE const vec256* increments_for_half_states(size_t half_states) /** Store a vector in all array elements */ ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) { - for (size_t i = 0; i < half_states; ++i) { - arr[i] = vec; + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) arr[i] = vec; } } /** Add a vector to all array elements */ ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) { - for (size_t i = 0; i < half_states; ++i) { - arr[i] += vec; + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) arr[i] += vec; } } /** Add corresponding vectors in arr1 to arr0 */ ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) arr0[i] += arr1[i]; } } ALWAYS_INLINE 
void arr_add_xor_rot16(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl16(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl16(arr2[i]); + } } } ALWAYS_INLINE void arr_add_xor_rot12(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl12(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl12(arr2[i]); + } } } ALWAYS_INLINE void arr_add_xor_rot8(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl8(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl8(arr2[i]); + } } } ALWAYS_INLINE void arr_add_xor_rot7(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl7(arr2[i]); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl7(arr2[i]); + } } } @@ -216,25 +231,34 @@ layout. 
*/ ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + vec256& x = arr[i]; + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); + } } } ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + vec256& x = arr[i]; + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); + } } } ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + vec256& x = arr[i]; + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); + } } } @@ -287,20 +311,23 @@ ALWAYS_INLINE void vec_read_xor_write(std::span in_bytes, std:: /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) { - for (size_t i = 0; i < half_states; ++i) { - const vec256& w = arr0[i]; - const vec256& x = arr1[i]; - const vec256& y = arr2[i]; - const vec256& z = arr3[i]; - - const size_t offset = i * 128; - auto in_slice = in_bytes.subspan(offset, 128); - auto out_slice = out_bytes.subspan(offset, 128); - - vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 
5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + CHACHA20_VEC_UNROLL(8) + for (size_t i = 0; i < 8; ++i) { + if (i < half_states) { + const vec256& w = arr0[i]; + const vec256& x = arr1[i]; + const vec256& y = arr2[i]; + const vec256& z = arr3[i]; + + const size_t offset = i * 128; + auto in_slice = in_bytes.subspan(offset, 128); + auto out_slice = out_bytes.subspan(offset, 128); + + vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + } } } From b40a32718d3e7927512caeee066f70a7ad06804d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 10:29:45 +0100 Subject: [PATCH 06/12] refactor: consolidate shuffle operations and loop handling in ChaCha20 vector implementation Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.86-1.87], 256B 1.73 [1.72-1.73], 64B 2.60 [2.58-2.60] - gcc 14: 1MB 5.74 [5.73-5.74], 256B 5.29 [5.29-5.30], 64B 2.71 [2.69-2.73] This reshuffle/loop consolidation ends up worsening both compilers slightly, but the impact is far larger on gcc. The gcc variant again has a huge stack frame and many extra instructions in the multi-state path (`ins/byte` ~35.7 for CHACHA20_1MB). 
Assembly contrast (AArch64): - clang: still uses `ext` for lane shuffles and keeps stack relatively small. - gcc: spills and uses scalar pack/unpack sequences; stack allocation is ~0x60a0. Delta vs previous commit: - clang: 1MB 1.79 -> 1.86 ns/B - gcc: 1MB 5.36 -> 5.74 ns/B --- src/attributes.h | 10 + src/crypto/chacha20_vec.ipp | 477 ++++++++++++++----------------- src/crypto/chacha20_vec_base.cpp | 8 + 3 files changed, 234 insertions(+), 261 deletions(-) diff --git a/src/attributes.h b/src/attributes.h index 275dad9f8ede..b3686b6c1772 100644 --- a/src/attributes.h +++ b/src/attributes.h @@ -24,4 +24,14 @@ # error No known always_inline attribute for this platform. #endif +#define PRAGMA(x) _Pragma(#x) + +#if defined(__clang__) +# define UNROLL_LOOP(N) PRAGMA(clang loop unroll_count(N)) +#elif defined(__GNUC__) +# define UNROLL_LOOP(N) PRAGMA(GCC unroll N) +#else +# define UNROLL_LOOP(N) +#endif + #endif // BITCOIN_ATTRIBUTES_H diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index e414cff085e5..3a92649c90d8 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -2,48 +2,71 @@ // Distributed under the MIT software license, see the accompanying // file COPYING or http://www.opensource.org/licenses/mit-license.php. 
+#include #include #include #include +#include #include #include +#include +#include #if defined(ENABLE_CHACHA20_VEC) -#if defined(CHACHA20_VEC_DISABLE_STATES_16) && \ - defined(CHACHA20_VEC_DISABLE_STATES_8) && \ - defined(CHACHA20_VEC_DISABLE_STATES_6) && \ - defined(CHACHA20_VEC_DISABLE_STATES_4) && \ - defined(CHACHA20_VEC_DISABLE_STATES_2) -#define CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +// Convert preprocessor flags to constexpr booleans for use with if constexpr +#ifdef CHACHA20_VEC_DISABLE_STATES_16 +inline constexpr bool kEnableStates16 = false; +#else +inline constexpr bool kEnableStates16 = true; #endif +#ifdef CHACHA20_VEC_DISABLE_STATES_8 +inline constexpr bool kEnableStates8 = false; +#else +inline constexpr bool kEnableStates8 = true; +#endif -#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) - -#if defined(__has_attribute) -# if __has_attribute(always_inline) -# define ALWAYS_INLINE __attribute__ ((always_inline)) inline -# endif +#ifdef CHACHA20_VEC_DISABLE_STATES_6 +inline constexpr bool kEnableStates6 = false; +#else +inline constexpr bool kEnableStates6 = true; #endif -#if !defined(ALWAYS_INLINE) -# define ALWAYS_INLINE inline +#ifdef CHACHA20_VEC_DISABLE_STATES_4 +inline constexpr bool kEnableStates4 = false; +#else +inline constexpr bool kEnableStates4 = true; #endif +#ifdef CHACHA20_VEC_DISABLE_STATES_2 +inline constexpr bool kEnableStates2 = false; +#else +inline constexpr bool kEnableStates2 = true; +#endif -namespace { +inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 || kEnableStates6 || kEnableStates4 || kEnableStates2; +// vec256 type must be visible for if constexpr branches even when they're not taken using vec256 = uint32_t __attribute__((__vector_size__(32))); -// Like Bitcoin Core's `ALWAYS_INLINE` in other files, but kept local to avoid touching shared headers. 
+// Preprocessor check for conditional compilation of the anonymous namespace +#if !defined(CHACHA20_VEC_DISABLE_STATES_16) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_8) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_6) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_4) || \ + !defined(CHACHA20_VEC_DISABLE_STATES_2) + +namespace { + +// Used for an optional aligned I/O fast-path. +static constexpr size_t CHACHA20_VEC_MEM_ALIGN{16}; /** Endian-conversion for big-endian */ ALWAYS_INLINE void vec_byteswap(vec256& vec) { - if constexpr (std::endian::native == std::endian::big) - { + if constexpr (std::endian::native == std::endian::big) { vec256 ret; ret[0] = __builtin_bswap32(vec[0]); ret[1] = __builtin_bswap32(vec[1]); @@ -57,156 +80,78 @@ ALWAYS_INLINE void vec_byteswap(vec256& vec) } } -/** Left-rotate vector */ -ALWAYS_INLINE void vec_rotl16(vec256& vec) +/** Left-rotate all elements in a vector by N bits */ +template +ALWAYS_INLINE void vec_rotl(vec256& vec) { - vec = (vec << 16) | (vec >> 16); + static_assert(N > 0 && N < 32, "Rotation must be between 1 and 31 bits"); + vec = (vec << N) | (vec >> (32 - N)); } -ALWAYS_INLINE void vec_rotl12(vec256& vec) -{ - vec = (vec << 12) | (vec >> 20); -} - -ALWAYS_INLINE void vec_rotl8(vec256& vec) -{ - vec = (vec << 8) | (vec >> 24); -} +static constexpr vec256 nums256 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; + +// Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} +// All smaller state counts use a prefix of this array. 
+static constexpr vec256 increments[8] = { + {1, 0, 0, 0, 0, 0, 0, 0}, + {3, 0, 0, 0, 2, 0, 0, 0}, + {5, 0, 0, 0, 4, 0, 0, 0}, + {7, 0, 0, 0, 6, 0, 0, 0}, + {9, 0, 0, 0, 8, 0, 0, 0}, + {11, 0, 0, 0, 10, 0, 0, 0}, + {13, 0, 0, 0, 12, 0, 0, 0}, + {15, 0, 0, 0, 14, 0, 0, 0}, +}; -ALWAYS_INLINE void vec_rotl7(vec256& vec) +template +ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn, std::index_sequence) { - vec = (vec << 7) | (vec >> 25); + ((I < half_states ? (fn(std::integral_constant{}), 0) : 0), ...); } -static const vec256 nums256 = (vec256){0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; - -static const vec256 increments_1[1] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, -}; - -static const vec256 increments_2[2] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, -}; - -static const vec256 increments_3[3] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, - (vec256){5, 0, 0, 0, 4, 0, 0, 0}, -}; - -static const vec256 increments_4[4] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, - (vec256){5, 0, 0, 0, 4, 0, 0, 0}, - (vec256){7, 0, 0, 0, 6, 0, 0, 0}, -}; - -static const vec256 increments_8[8] = { - (vec256){1, 0, 0, 0, 0, 0, 0, 0}, - (vec256){3, 0, 0, 0, 2, 0, 0, 0}, - (vec256){5, 0, 0, 0, 4, 0, 0, 0}, - (vec256){7, 0, 0, 0, 6, 0, 0, 0}, - (vec256){9, 0, 0, 0, 8, 0, 0, 0}, - (vec256){11, 0, 0, 0, 10, 0, 0, 0}, - (vec256){13, 0, 0, 0, 12, 0, 0, 0}, - (vec256){15, 0, 0, 0, 14, 0, 0, 0}, -}; - -#define CHACHA20_VEC_PRAGMA(x) _Pragma(#x) -#if defined(__clang__) -#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(clang loop unroll_count(N)) -#elif defined(__GNUC__) -#define CHACHA20_VEC_UNROLL(N) CHACHA20_VEC_PRAGMA(GCC unroll N) -#else -#define CHACHA20_VEC_UNROLL(N) -#endif - -ALWAYS_INLINE const vec256* increments_for_half_states(size_t half_states) +template +ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn) { 
- switch (half_states) { - case 1: return increments_1; - case 2: return increments_2; - case 3: return increments_3; - case 4: return increments_4; - case 8: return increments_8; - default: return nullptr; - } + for_each_half_state(half_states, std::forward(fn), std::make_index_sequence<8>{}); } /** Store a vector in all array elements */ ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) arr[i] = vec; - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr[i] = vec; + }); } /** Add a vector to all array elements */ ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) arr[i] += vec; - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr[i] += vec; + }); } /** Add corresponding vectors in arr1 to arr0 */ ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) arr0[i] += arr1[i]; - } -} - -ALWAYS_INLINE void arr_add_xor_rot16(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) -{ - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl16(arr2[i]); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr0[i] += arr1[i]; + }); } -ALWAYS_INLINE void arr_add_xor_rot12(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +/** Add arr1 to arr0, XOR result into arr2, rotate arr2 left by N bits */ +template +ALWAYS_INLINE void arr_add_xor_rot(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; 
++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl12(arr2[i]); - } - } -} - -ALWAYS_INLINE void arr_add_xor_rot8(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) -{ - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl8(arr2[i]); - } - } -} - -ALWAYS_INLINE void arr_add_xor_rot7(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) -{ - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl7(arr2[i]); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + arr0[i] += arr1[i]; + arr2[i] ^= arr0[i]; + vec_rotl(arr2[i]); + }); } /* @@ -229,115 +174,145 @@ After the second round, they are used (in reverse) to restore the original layout. */ +#if defined(__GNUC__) && !defined(__clang__) +template +ALWAYS_INLINE vec256 vec_shuffle(const vec256& v) +{ + static_assert(sizeof...(I) == 8); + using mask_t = int __attribute__((__vector_size__(32))); + constexpr mask_t mask{I...}; + return __builtin_shuffle(v, mask); +} + +template +ALWAYS_INLINE vec256 vec_shuffle(const vec256& a, const vec256& b) +{ + static_assert(sizeof...(I) == 8); + using mask_t = int __attribute__((__vector_size__(32))); + constexpr mask_t mask{I...}; + return __builtin_shuffle(a, b, mask); +} +#endif // defined(__GNUC__) && !defined(__clang__) + +#if defined(__GNUC__) && !defined(__clang__) +#define VEC_SHUF_SELF(x, ...) vec_shuffle<__VA_ARGS__>(x) +#define VEC_SHUF2(a, b, ...) vec_shuffle<__VA_ARGS__>(a, b) +#else +#define VEC_SHUF_SELF(x, ...) __builtin_shufflevector(x, x, __VA_ARGS__) +#define VEC_SHUF2(a, b, ...) 
__builtin_shufflevector(a, b, __VA_ARGS__) +#endif + ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + vec256& x = arr[i]; + x = VEC_SHUF_SELF(x, 1, 2, 3, 0, 5, 6, 7, 4); + }); } ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + vec256& x = arr[i]; + x = VEC_SHUF_SELF(x, 2, 3, 0, 1, 6, 7, 4, 5); + }); } ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - vec256& x = arr[i]; - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); - } - } + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + vec256& x = arr[i]; + x = VEC_SHUF_SELF(x, 3, 0, 1, 2, 7, 4, 5, 6); + }); } /* Main round function. 
*/ ALWAYS_INLINE void doubleround(vec256* arr0, vec256* arr1, vec256* arr2, vec256* arr3, size_t half_states) { - CHACHA20_VEC_UNROLL(10) + UNROLL_LOOP(10) for (size_t i = 0; i < 10; ++i) { - arr_add_xor_rot16(arr0, arr1, arr3, half_states); - arr_add_xor_rot12(arr2, arr3, arr1, half_states); - arr_add_xor_rot8(arr0, arr1, arr3, half_states); - arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); + arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); arr_shuf0(arr1, half_states); arr_shuf1(arr2, half_states); arr_shuf2(arr3, half_states); - arr_add_xor_rot16(arr0, arr1, arr3, half_states); - arr_add_xor_rot12(arr2, arr3, arr1, half_states); - arr_add_xor_rot8(arr0, arr1, arr3, half_states); - arr_add_xor_rot7(arr2, arr3, arr1, half_states); + arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); + arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); + arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); arr_shuf2(arr1, half_states); arr_shuf1(arr2, half_states); arr_shuf0(arr3, half_states); } } -/* Read 32bytes of input, xor with calculated state, write to output. Assumes - that input and output are unaligned, and makes no assumptions about the - internal layout of vec256; +/* Read 32 bytes of input, xor with calculated state, write to output. + Supports unaligned input/output, with an optional aligned fast-path. 
*/ -ALWAYS_INLINE void vec_read_xor_write(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const vec256& vec) +template <bool AssumeAligned> +ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_bytes, const vec256& vec) { - assert(in_bytes.size() == 32); - assert(out_bytes.size() == 32); - - uint32_t temparr[8]; - memcpy(temparr, in_bytes.data(), in_bytes.size()); - vec256 tempvec = vec ^ (vec256){temparr[0], temparr[1], temparr[2], temparr[3], temparr[4], temparr[5], temparr[6], temparr[7]}; - vec_byteswap(tempvec); - temparr[0] = tempvec[0]; - temparr[1] = tempvec[1]; - temparr[2] = tempvec[2]; - temparr[3] = tempvec[3]; - temparr[4] = tempvec[4]; - temparr[5] = tempvec[5]; - temparr[6] = tempvec[6]; - temparr[7] = tempvec[7]; - memcpy(out_bytes.data(), temparr, out_bytes.size()); + if constexpr (AssumeAligned) { + in_bytes = std::assume_aligned<CHACHA20_VEC_MEM_ALIGN>(in_bytes); + out_bytes = std::assume_aligned<CHACHA20_VEC_MEM_ALIGN>(out_bytes); + } + + uint32_t tmp_arr[8]; + memcpy(tmp_arr, in_bytes, sizeof(tmp_arr)); + vec256 tmp_vec; + memcpy(&tmp_vec, tmp_arr, sizeof(tmp_vec)); + vec_byteswap(tmp_vec); + + tmp_vec ^= vec; + vec_byteswap(tmp_vec); + + memcpy(out_bytes, &tmp_vec, sizeof(tmp_vec)); +} + +template <bool AssumeAligned> +ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) +{ + for_each_half_state(half_states, [&](auto idx) { + constexpr size_t i = decltype(idx)::value; + + const vec256& w = arr0[i]; + const vec256& x = arr1[i]; + const vec256& y = arr2[i]; + const vec256& z = arr3[i]; + + const size_t offset = i * 128; + const std::byte* in_slice = in_bytes + offset; + std::byte* out_slice = out_bytes + offset; + + vec_read_xor_write<AssumeAligned>(in_slice + 0, out_slice + 0, VEC_SHUF2(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write<AssumeAligned>(in_slice + 32, out_slice + 32, VEC_SHUF2(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write<AssumeAligned>(in_slice + 64, out_slice + 64, VEC_SHUF2(w, x, 0, 1,
2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice + 96, out_slice + 96, VEC_SHUF2(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); + }); } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) { - CHACHA20_VEC_UNROLL(8) - for (size_t i = 0; i < 8; ++i) { - if (i < half_states) { - const vec256& w = arr0[i]; - const vec256& x = arr1[i]; - const vec256& y = arr2[i]; - const vec256& z = arr3[i]; - - const size_t offset = i * 128; - auto in_slice = in_bytes.subspan(offset, 128); - auto out_slice = out_bytes.subspan(offset, 128); - - vec_read_xor_write(in_slice.first(32), out_slice.first(32), __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(32, 32), out_slice.subspan(32, 32), __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice.subspan(64, 32), out_slice.subspan(64, 32), __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice.subspan(96, 32), out_slice.subspan(96, 32), __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); - } + constexpr std::uintptr_t mask{CHACHA20_VEC_MEM_ALIGN - 1}; + const bool aligned = ((reinterpret_cast(in_bytes.data()) | reinterpret_cast(out_bytes.data())) & mask) == 0; + + if (aligned) [[likely]] { + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); + } else { + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); } } -/* Main crypt function. Calculates up to 16 states. */ +/* Main crypt function. Calculates up to 16 states (8 half_states). 
*/ ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) { const size_t half_states = states / 2; - const vec256* increments = increments_for_half_states(half_states); - assert(increments != nullptr); - vec256 arr0[8], arr1[8], arr2[8], arr3[8]; arr_set_vec256(arr0, half_states, nums256); @@ -359,11 +334,25 @@ ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::s arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3, half_states); } -#undef CHACHA20_VEC_UNROLL -#undef CHACHA20_VEC_PRAGMA +template +ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::span& out_bytes, const vec256& state0, const vec256& state1, vec256& state2) +{ + constexpr size_t block_size = CHACHA20_VEC_BLOCKLEN * States; + constexpr vec256 increment = (vec256){static_cast(States), 0, 0, 0, static_cast(States), 0, 0, 0}; + + while (in_bytes.size() >= block_size) { + multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, States); + state2 += increment; + in_bytes = in_bytes.subspan(block_size); + out_bytes = out_bytes.subspan(block_size); + } +} + +#undef VEC_SHUF_SELF +#undef VEC_SHUF2 } // anonymous namespace -#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED +#endif // any multi-state enabled #if defined(CHACHA20_NAMESPACE) namespace CHACHA20_NAMESPACE { @@ -371,52 +360,18 @@ namespace CHACHA20_NAMESPACE { void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept { -#if !defined(CHACHA20_VEC_ALL_MULTI_STATES_DISABLED) - assert(in_bytes.size() == out_bytes.size()); - const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; - const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; - vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; -#if 
!defined(CHACHA20_VEC_DISABLE_STATES_16) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 16) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 16); - state2 += (vec256){16, 0, 0, 0, 16, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 16); - } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_8) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 8) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 8); - state2 += (vec256){8, 0, 0, 0, 8, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 8); - } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_6) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 6) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 6); - state2 += (vec256){6, 0, 0, 0, 6, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 6); + if constexpr (kEnableAnyMultiState) { + assert(in_bytes.size() == out_bytes.size()); + const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; + const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; + vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; + + if constexpr (kEnableStates16) process_blocks<16>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates6) process_blocks<6>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates4) process_blocks<4>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates2) process_blocks<2>(in_bytes, out_bytes, state0, state1, state2); } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_4) - 
while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 4) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 4); - state2 += (vec256){4, 0, 0, 0, 4, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 4); - } -#endif -#if !defined(CHACHA20_VEC_DISABLE_STATES_2) - while(in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * 2) { - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, 2); - state2 += (vec256){2, 0, 0, 0, 2, 0, 0, 0}; - in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); - out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * 2); - } -#endif -#endif // CHACHA20_VEC_ALL_MULTI_STATES_DISABLED } #if defined(CHACHA20_NAMESPACE) diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index 9fda9452a1c8..d282d7009ffc 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -12,6 +12,14 @@ # define CHACHA20_VEC_DISABLE_STATES_16 # define CHACHA20_VEC_DISABLE_STATES_8 # define CHACHA20_VEC_DISABLE_STATES_6 +# if defined(__GNUC__) && !defined(__clang__) +// GCC currently generates slower code for the generic vectorized implementation +// on x86_64. Disable the 4-state path for now to avoid a regression. +# define CHACHA20_VEC_DISABLE_STATES_4 +// Disable the 2-state path as well (fallback to scalar) until a faster GCC x86 +// implementation exists (e.g. via AVX2/AVX512 runtime dispatch). 
+# define CHACHA20_VEC_DISABLE_STATES_2 +# endif #elif defined(__ARM_NEON) # define CHACHA20_VEC_DISABLE_STATES_2 #else From 6a1636918f205f821578fea062572eb264eae78c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 18:39:54 +0100 Subject: [PATCH 07/12] refactor: extend NEON/AArch64 support and optimize multi-block ChaCha20 handling Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.85-1.86], 256B 1.73 [1.72-1.73], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 5.74 [5.73-5.75], 256B 5.29 [5.28-5.29], 64B 2.71 [2.69-2.73] On this Cortex-A76 benchmark, results are unchanged vs the prior commit (within measurement noise). The changes here primarily prepare/extend the generic logic for a broader set of targets. --- src/crypto/chacha20_vec.ipp | 69 +++++++++++++++++--------------- src/crypto/chacha20_vec_base.cpp | 2 +- 2 files changed, 38 insertions(+), 33 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 3a92649c90d8..36dfdee7fa29 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -174,33 +174,8 @@ After the second round, they are used (in reverse) to restore the original layout. */ -#if defined(__GNUC__) && !defined(__clang__) -template -ALWAYS_INLINE vec256 vec_shuffle(const vec256& v) -{ - static_assert(sizeof...(I) == 8); - using mask_t = int __attribute__((__vector_size__(32))); - constexpr mask_t mask{I...}; - return __builtin_shuffle(v, mask); -} - -template -ALWAYS_INLINE vec256 vec_shuffle(const vec256& a, const vec256& b) -{ - static_assert(sizeof...(I) == 8); - using mask_t = int __attribute__((__vector_size__(32))); - constexpr mask_t mask{I...}; - return __builtin_shuffle(a, b, mask); -} -#endif // defined(__GNUC__) && !defined(__clang__) - -#if defined(__GNUC__) && !defined(__clang__) -#define VEC_SHUF_SELF(x, ...) 
vec_shuffle<__VA_ARGS__>(x) -#define VEC_SHUF2(a, b, ...) vec_shuffle<__VA_ARGS__>(a, b) -#else #define VEC_SHUF_SELF(x, ...) __builtin_shufflevector(x, x, __VA_ARGS__) #define VEC_SHUF2(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) -#endif ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { @@ -310,6 +285,35 @@ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std:: } /* Main crypt function. Calculates up to 16 states (8 half_states). */ +#if defined(__GNUC__) && !defined(__clang__) +template <size_t States> +ALWAYS_INLINE void multi_block_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) +{ + static_assert(States == 16 || States == 8 || States == 6 || States == 4 || States == 2); + constexpr size_t half_states = States / 2; + + std::array<vec256, half_states> arr0, arr1, arr2, arr3; + + arr_set_vec256(arr0.data(), half_states, nums256); + arr_set_vec256(arr1.data(), half_states, state0); + arr_set_vec256(arr2.data(), half_states, state1); + arr_set_vec256(arr3.data(), half_states, state2); + + arr_add_arr(arr3.data(), increments, half_states); + + doubleround(arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); + + arr_add_vec256(arr0.data(), half_states, nums256); + arr_add_vec256(arr1.data(), half_states, state0); + arr_add_vec256(arr2.data(), half_states, state1); + arr_add_vec256(arr3.data(), half_states, state2); + + arr_add_arr(arr3.data(), increments, half_states); + + arr_read_xor_write(in_bytes, out_bytes, arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); +} +#endif + ALWAYS_INLINE void multi_block_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) { const size_t half_states = states / 2; @@ -337,14 +341,15 @@ template ALWAYS_INLINE void process_blocks(std::span<const std::byte>& in_bytes, std::span<std::byte>& out_bytes, const vec256&
state1, vec256& state2) { - constexpr size_t block_size = CHACHA20_VEC_BLOCKLEN * States; - constexpr vec256 increment = (vec256){static_cast<uint32_t>(States), 0, 0, 0, static_cast<uint32_t>(States), 0, 0, 0}; - - while (in_bytes.size() >= block_size) { + while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { +#if defined(__GNUC__) && !defined(__clang__) + multi_block_crypt<States>(in_bytes, out_bytes, state0, state1, state2); +#else multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, States); +#endif + state2 += (vec256){static_cast<uint32_t>(States), 0, 0, 0, static_cast<uint32_t>(States), 0, 0, 0}; + in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); + out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); } } diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index d282d7009ffc..e865d7f8fd94 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -20,7 +20,7 @@ // implementation exists (e.g. via AVX2/AVX512 runtime dispatch). # define CHACHA20_VEC_DISABLE_STATES_2 # endif -#elif defined(__ARM_NEON) +#elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) # define CHACHA20_VEC_DISABLE_STATES_2 #else // Be conservative and require platforms to opt-in From 3b26d2e480f972c0e528d4f21d28077428242d10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 19:37:00 +0100 Subject: [PATCH 08/12] refactor: refine GCC-specific handling in ChaCha20 vectorized paths Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.85-1.86], 256B 1.72 [1.72-1.73], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 5.79 [5.78-5.81], 256B 5.29 [5.28-5.29], 64B 2.71 [2.69-2.72] This change is mostly about refining GCC gating on other architectures (e.g. x86 with/without AVX2).
On AArch64 it doesn't improve GCC's multi-state codegen yet; GCC still emits a very large vectorized function (stack allocation ~0x5920) and high instruction counts. --- src/crypto/chacha20_vec.ipp | 33 +------------------------------- src/crypto/chacha20_vec_base.cpp | 5 +++-- 2 files changed, 4 insertions(+), 34 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 36dfdee7fa29..b419111f6fb7 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -285,39 +285,12 @@ ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std:: } /* Main crypt function. Calculates up to 16 states (8 half_states). */ -#if defined(__GNUC__) && !defined(__clang__) template ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2) { static_assert(States == 16 || States == 8 || States == 6 || States == 4 || States == 2); constexpr size_t half_states = States / 2; - - std::array arr0, arr1, arr2, arr3; - - arr_set_vec256(arr0.data(), half_states, nums256); - arr_set_vec256(arr1.data(), half_states, state0); - arr_set_vec256(arr2.data(), half_states, state1); - arr_set_vec256(arr3.data(), half_states, state2); - - arr_add_arr(arr3.data(), increments, half_states); - - doubleround(arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); - - arr_add_vec256(arr0.data(), half_states, nums256); - arr_add_vec256(arr1.data(), half_states, state0); - arr_add_vec256(arr2.data(), half_states, state1); - arr_add_vec256(arr3.data(), half_states, state2); - - arr_add_arr(arr3.data(), increments, half_states); - - arr_read_xor_write(in_bytes, out_bytes, arr0.data(), arr1.data(), arr2.data(), arr3.data(), half_states); -} -#endif - -ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::span out_bytes, const vec256& state0, const vec256& state1, const vec256& state2, size_t states) -{ - const size_t half_states = states / 2; - vec256 arr0[8], 
arr1[8], arr2[8], arr3[8]; + vec256 arr0[half_states], arr1[half_states], arr2[half_states], arr3[half_states]; arr_set_vec256(arr0, half_states, nums256); arr_set_vec256(arr1, half_states, state0); @@ -342,11 +315,7 @@ template ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::span& out_bytes, const vec256& state0, const vec256& state1, vec256& state2) { while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { -#if defined(__GNUC__) && !defined(__clang__) multi_block_crypt(in_bytes, out_bytes, state0, state1, state2); -#else - multi_block_crypt(in_bytes, out_bytes, state0, state1, state2, States); -#endif state2 += (vec256){static_cast(States), 0, 0, 0, static_cast(States), 0, 0, 0}; in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index e865d7f8fd94..0b3a34563ab6 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -12,9 +12,10 @@ # define CHACHA20_VEC_DISABLE_STATES_16 # define CHACHA20_VEC_DISABLE_STATES_8 # define CHACHA20_VEC_DISABLE_STATES_6 -# if defined(__GNUC__) && !defined(__clang__) +# if defined(__GNUC__) && !defined(__clang__) && !defined(__AVX2__) // GCC currently generates slower code for the generic vectorized implementation -// on x86_64. Disable the 4-state path for now to avoid a regression. +// on x86_64 unless AVX2 is enabled. Disable the 4-state path for now to avoid a +// regression. # define CHACHA20_VEC_DISABLE_STATES_4 // Disable the 2-state path as well (fallback to scalar) until a faster GCC x86 // implementation exists (e.g. via AVX2/AVX512 runtime dispatch). 
From 842a88e1ff6ef68fb8bd9ced018af3ccdcd93f1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sun, 4 Jan 2026 20:07:19 +0100 Subject: [PATCH 09/12] refactor: improve GCC handling for NEON/AArch64 in ChaCha20 vectorized paths Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.86 [1.86-1.86], 256B 1.73 [1.72-1.73], 64B 2.59 [2.58-2.60] - gcc 14: 1MB 2.45 [2.44-2.45], 256B 2.53 [2.52-2.53], 64B 2.71 [2.69-2.72] Key point: gcc's multi-state vectorized path was a regression on AArch64 (5.7 ns/B class). This commit avoids that by disabling all multi-state variants for gcc on AArch64, effectively falling back to the scalar implementation for multi-block inputs (bringing gcc back near baseline). Also fix the build when all multi-state paths are disabled: avoid referencing `process_blocks` from code that is preprocessor-disabled, so GCC can compile cleanly with a complete disable set. --- src/crypto/chacha20_vec.ipp | 40 +++++++++++++++++++++----------- src/crypto/chacha20_vec_base.cpp | 13 ++++++++++- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index b419111f6fb7..e7a556d078ee 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -51,12 +51,20 @@ inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 | // vec256 type must be visible for if constexpr branches even when they're not taken using vec256 = uint32_t __attribute__((__vector_size__(32))); -// Preprocessor check for conditional compilation of the anonymous namespace +// Preprocessor check for conditional compilation of the anonymous namespace and +// the multi-state code paths. When all states are disabled, avoid referencing +// templates/functions that are not available. 
#if !defined(CHACHA20_VEC_DISABLE_STATES_16) || \ !defined(CHACHA20_VEC_DISABLE_STATES_8) || \ !defined(CHACHA20_VEC_DISABLE_STATES_6) || \ !defined(CHACHA20_VEC_DISABLE_STATES_4) || \ !defined(CHACHA20_VEC_DISABLE_STATES_2) +# define CHACHA20_VEC_ENABLE_ANY_MULTI_STATE 1 +#else +# define CHACHA20_VEC_ENABLE_ANY_MULTI_STATE 0 +#endif + +#if CHACHA20_VEC_ENABLE_ANY_MULTI_STATE namespace { @@ -326,7 +334,7 @@ ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::spa #undef VEC_SHUF2 } // anonymous namespace -#endif // any multi-state enabled +#endif // CHACHA20_VEC_ENABLE_ANY_MULTI_STATE #if defined(CHACHA20_NAMESPACE) namespace CHACHA20_NAMESPACE { @@ -334,18 +342,22 @@ namespace CHACHA20_NAMESPACE { void chacha20_crypt_vectorized(std::span& in_bytes, std::span& out_bytes, const std::array& input) noexcept { - if constexpr (kEnableAnyMultiState) { - assert(in_bytes.size() == out_bytes.size()); - const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; - const vec256 state1 = (vec256){input[4], input[5], input[6], input[7], input[4], input[5], input[6], input[7]}; - vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; - - if constexpr (kEnableStates16) process_blocks<16>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates6) process_blocks<6>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates4) process_blocks<4>(in_bytes, out_bytes, state0, state1, state2); - if constexpr (kEnableStates2) process_blocks<2>(in_bytes, out_bytes, state0, state1, state2); - } +#if CHACHA20_VEC_ENABLE_ANY_MULTI_STATE + assert(in_bytes.size() == out_bytes.size()); + const vec256 state0 = (vec256){input[0], input[1], input[2], input[3], input[0], input[1], input[2], input[3]}; + const vec256 state1 = (vec256){input[4], input[5], 
input[6], input[7], input[4], input[5], input[6], input[7]}; + vec256 state2 = (vec256){input[8], input[9], input[10], input[11], input[8], input[9], input[10], input[11]}; + + if constexpr (kEnableStates16) process_blocks<16>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates6) process_blocks<6>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates4) process_blocks<4>(in_bytes, out_bytes, state0, state1, state2); + if constexpr (kEnableStates2) process_blocks<2>(in_bytes, out_bytes, state0, state1, state2); +#else + (void)in_bytes; + (void)out_bytes; + (void)input; +#endif } #if defined(CHACHA20_NAMESPACE) diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index 0b3a34563ab6..a2db4b34f520 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -22,7 +22,18 @@ # define CHACHA20_VEC_DISABLE_STATES_2 # endif #elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) -# define CHACHA20_VEC_DISABLE_STATES_2 +# if defined(__GNUC__) && !defined(__clang__) +// Similar to x86_64, GCC currently generates slower code for the generic +// vectorized implementation on AArch64/NEON. Disable all multi-state paths for +// now to avoid a regression. 
+# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_8 +# define CHACHA20_VEC_DISABLE_STATES_6 +# define CHACHA20_VEC_DISABLE_STATES_4 +# define CHACHA20_VEC_DISABLE_STATES_2 +# else +# define CHACHA20_VEC_DISABLE_STATES_2 +# endif #else // Be conservative and require platforms to opt-in # define CHACHA20_VEC_DISABLE_STATES_16 From 0182ceb344f953edbf83609698be689faea4722e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Fri, 9 Jan 2026 20:39:12 +0100 Subject: [PATCH 10/12] chacha20: make GCC competitive with Clang in vectorized ChaCha20 On AArch64/NEON, GCC's codegen for 256-bit `__builtin_shufflevector` patterns was the root cause of the large perf gap (scalar spills + `fmov`/`bfi`/`bfxil` sequences). Keep Clang on the existing 256-bit vector path, but use a GCC-specific split-lane `vec256` representation (two 128-bit lanes) so GCC can use native NEON shuffles and keep the state in registers. This also enables a multi-state path for GCC again on AArch64 (use 8/4-state; keep 16/6 disabled to limit register pressure). 
Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=1000`, 5 runs; median ns/byte): - GCC 14.2: 1MB 1.85, 256B 2.17, 64B 2.71 - Clang 22: 1MB 1.87, 256B 1.73, 64B 2.59 --- src/crypto/chacha20_vec.ipp | 136 +++++++++++++++++++++++++++---- src/crypto/chacha20_vec_base.cpp | 7 +- 2 files changed, 120 insertions(+), 23 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index e7a556d078ee..dacd2522f530 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -48,8 +48,40 @@ inline constexpr bool kEnableStates2 = true; inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 || kEnableStates6 || kEnableStates4 || kEnableStates2; -// vec256 type must be visible for if constexpr branches even when they're not taken +using vec128 = uint32_t __attribute__((__vector_size__(16))); + +#if defined(__GNUC__) && !defined(__clang__) +# define CHACHA20_VEC_USE_SPLIT_LANES 1 +#else +# define CHACHA20_VEC_USE_SPLIT_LANES 0 +#endif + +#if CHACHA20_VEC_USE_SPLIT_LANES +// Represent two 128-bit lanes explicitly. This avoids GCC generating expensive +// scalar sequences for 256-bit shuffles on targets without native 256-bit SIMD +// registers (e.g. AArch64/NEON, x86/SSE2). +struct vec256 { + vec128 lo; + vec128 hi; +}; +static_assert(sizeof(vec256) == 32); + +ALWAYS_INLINE vec256& operator+=(vec256& a, const vec256& b) +{ + a.lo += b.lo; + a.hi += b.hi; + return a; +} + +ALWAYS_INLINE vec256& operator^=(vec256& a, const vec256& b) +{ + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +} +#else using vec256 = uint32_t __attribute__((__vector_size__(32))); +#endif // Preprocessor check for conditional compilation of the anonymous namespace and // the multi-state code paths. 
When all states are disabled, avoid referencing @@ -75,6 +107,20 @@ static constexpr size_t CHACHA20_VEC_MEM_ALIGN{16}; ALWAYS_INLINE void vec_byteswap(vec256& vec) { if constexpr (std::endian::native == std::endian::big) { +#if CHACHA20_VEC_USE_SPLIT_LANES + vec128 lo; + lo[0] = __builtin_bswap32(vec.lo[0]); + lo[1] = __builtin_bswap32(vec.lo[1]); + lo[2] = __builtin_bswap32(vec.lo[2]); + lo[3] = __builtin_bswap32(vec.lo[3]); + vec128 hi; + hi[0] = __builtin_bswap32(vec.hi[0]); + hi[1] = __builtin_bswap32(vec.hi[1]); + hi[2] = __builtin_bswap32(vec.hi[2]); + hi[3] = __builtin_bswap32(vec.hi[3]); + vec.lo = lo; + vec.hi = hi; +#else vec256 ret; ret[0] = __builtin_bswap32(vec[0]); ret[1] = __builtin_bswap32(vec[1]); @@ -85,6 +131,7 @@ ALWAYS_INLINE void vec_byteswap(vec256& vec) ret[6] = __builtin_bswap32(vec[6]); ret[7] = __builtin_bswap32(vec[7]); vec = ret; +#endif } } @@ -93,14 +140,34 @@ template ALWAYS_INLINE void vec_rotl(vec256& vec) { static_assert(N > 0 && N < 32, "Rotation must be between 1 and 31 bits"); +#if CHACHA20_VEC_USE_SPLIT_LANES + vec.lo = (vec.lo << N) | (vec.lo >> (32 - N)); + vec.hi = (vec.hi << N) | (vec.hi >> (32 - N)); +#else vec = (vec << N) | (vec >> (32 - N)); +#endif } +#if CHACHA20_VEC_USE_SPLIT_LANES +static constexpr vec128 nums128 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +static constexpr vec256 nums256 = {nums128, nums128}; +#else static constexpr vec256 nums256 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; +#endif // Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} // All smaller state counts use a prefix of this array. 
static constexpr vec256 increments[8] = { +#if CHACHA20_VEC_USE_SPLIT_LANES + {{1, 0, 0, 0}, {0, 0, 0, 0}}, + {{3, 0, 0, 0}, {2, 0, 0, 0}}, + {{5, 0, 0, 0}, {4, 0, 0, 0}}, + {{7, 0, 0, 0}, {6, 0, 0, 0}}, + {{9, 0, 0, 0}, {8, 0, 0, 0}}, + {{11, 0, 0, 0}, {10, 0, 0, 0}}, + {{13, 0, 0, 0}, {12, 0, 0, 0}}, + {{15, 0, 0, 0}, {14, 0, 0, 0}}, +#else {1, 0, 0, 0, 0, 0, 0, 0}, {3, 0, 0, 0, 2, 0, 0, 0}, {5, 0, 0, 0, 4, 0, 0, 0}, @@ -109,6 +176,7 @@ static constexpr vec256 increments[8] = { {11, 0, 0, 0, 10, 0, 0, 0}, {13, 0, 0, 0, 12, 0, 0, 0}, {15, 0, 0, 0, 14, 0, 0, 0}, +#endif }; template @@ -182,15 +250,17 @@ After the second round, they are used (in reverse) to restore the original layout. */ -#define VEC_SHUF_SELF(x, ...) __builtin_shufflevector(x, x, __VA_ARGS__) -#define VEC_SHUF2(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) - ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) { for_each_half_state(half_states, [&](auto idx) { constexpr size_t i = decltype(idx)::value; vec256& x = arr[i]; - x = VEC_SHUF_SELF(x, 1, 2, 3, 0, 5, 6, 7, 4); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); + x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); +#else + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); +#endif }); } @@ -199,7 +269,12 @@ ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) for_each_half_state(half_states, [&](auto idx) { constexpr size_t i = decltype(idx)::value; vec256& x = arr[i]; - x = VEC_SHUF_SELF(x, 2, 3, 0, 1, 6, 7, 4, 5); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); + x.hi = __builtin_shufflevector(x.hi, x.hi, 2, 3, 0, 1); +#else + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); +#endif }); } @@ -208,7 +283,12 @@ ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) for_each_half_state(half_states, [&](auto idx) { constexpr size_t i = decltype(idx)::value; vec256& x = arr[i]; - x = VEC_SHUF_SELF(x, 3, 
0, 1, 2, 7, 4, 5, 6); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); + x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); +#else + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); +#endif }); } @@ -245,10 +325,8 @@ ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_ out_bytes = std::assume_aligned(out_bytes); } - uint32_t tmp_arr[8]; - memcpy(tmp_arr, in_bytes, sizeof(tmp_arr)); vec256 tmp_vec; - memcpy(&tmp_vec, tmp_arr, sizeof(tmp_vec)); + memcpy(&tmp_vec, in_bytes, sizeof(tmp_vec)); vec_byteswap(tmp_vec); tmp_vec ^= vec; @@ -272,10 +350,17 @@ ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* const std::byte* in_slice = in_bytes + offset; std::byte* out_slice = out_bytes + offset; - vec_read_xor_write(in_slice + 0, out_slice + 0, VEC_SHUF2(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 32, out_slice + 32, VEC_SHUF2(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 64, out_slice + 64, VEC_SHUF2(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice + 96, out_slice + 96, VEC_SHUF2(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); +#if CHACHA20_VEC_USE_SPLIT_LANES + vec_read_xor_write(in_slice + 0, out_slice + 0, vec256{w.hi, x.hi}); + vec_read_xor_write(in_slice + 32, out_slice + 32, vec256{y.hi, z.hi}); + vec_read_xor_write(in_slice + 64, out_slice + 64, vec256{w.lo, x.lo}); + vec_read_xor_write(in_slice + 96, out_slice + 96, vec256{y.lo, z.lo}); +#else + vec_read_xor_write(in_slice + 0, out_slice + 0, __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice + 32, out_slice + 32, __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); + vec_read_xor_write(in_slice + 64, out_slice + 64, __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); + vec_read_xor_write(in_slice + 96, out_slice + 96, __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); +#endif }); } @@ -324,15 
+409,17 @@ ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::spa { while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { multi_block_crypt(in_bytes, out_bytes, state0, state1, state2); - state2 += (vec256){static_cast(States), 0, 0, 0, static_cast(States), 0, 0, 0}; + const uint32_t inc = static_cast(States); +#if CHACHA20_VEC_USE_SPLIT_LANES + state2 += vec256{{inc, 0, 0, 0}, {inc, 0, 0, 0}}; +#else + state2 += (vec256){inc, 0, 0, 0, inc, 0, 0, 0}; +#endif in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); } } -#undef VEC_SHUF_SELF -#undef VEC_SHUF2 - } // anonymous namespace #endif // CHACHA20_VEC_ENABLE_ANY_MULTI_STATE @@ -344,9 +431,19 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span(in_bytes, out_bytes, state0, state1, state2); if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); @@ -364,4 +461,7 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span Date: Fri, 9 Jan 2026 20:39:24 +0100 Subject: [PATCH 11/12] chacha20: AArch64: avoid 16-state spills; tighten half-state loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On AArch64/NEON there are 32 128-bit vector registers. The “16-state” variant (8 half-states) needs ~64 128-bit lanes worth of live state (because `vec256` lowers to two 128-bit lanes on NEON), so it spills heavily (notably on clang). Disable `STATES_16` on AArch64 to force the 8-state path, which fits in registers and is substantially faster. Also disable `STATES_6` on AArch64: it increases code size and hurts the common 8/4-state path on this target. Make the per-half-state helpers compile-time sized (no runtime `half_states` argument). This lets compilers fully specialize the inner loops; GCC in particular stops generating extra control-flow and spill glue around the multi-state path. 
Finally, on AArch64/NEON clang's codegen for the aligned I/O fast-path (`std::assume_aligned` + 32-byte memcpy) is slower than the plain unaligned variant. Prefer the unaligned path for clang. Bench (AArch64 Cortex-A76, -O2, taskset core 2, `bench_bitcoin -filter='CHACHA20_.*' -min-time=10000`, 5 runs; median [min-max] ns/byte): - clang 22: 1MB 1.47 [1.46-1.48], 256B 1.64 [1.64-1.65], 64B 2.59 [2.59-2.60] - gcc 14: 1MB 1.71 [1.71-1.71], 256B 1.95 [1.95-1.97], 64B 2.70 [2.69-2.72] Delta vs previous commit (CHACHA20_1MB, -min-time=10000): - clang: 1.86 -> 1.47 ns/B (avoid 16-state spills; avoid aligned fast-path) - gcc: 1.85 -> 1.71 ns/B (tightened half-state loops) --- src/crypto/chacha20_vec.ipp | 173 +++++++++++++++++-------------- src/crypto/chacha20_vec_base.cpp | 7 +- 2 files changed, 100 insertions(+), 80 deletions(-) diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index dacd2522f530..468204ceb99f 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -46,8 +46,6 @@ inline constexpr bool kEnableStates2 = false; inline constexpr bool kEnableStates2 = true; #endif -inline constexpr bool kEnableAnyMultiState = kEnableStates16 || kEnableStates8 || kEnableStates6 || kEnableStates4 || kEnableStates2; - using vec128 = uint32_t __attribute__((__vector_size__(16))); #if defined(__GNUC__) && !defined(__clang__) @@ -179,54 +177,62 @@ static constexpr vec256 increments[8] = { #endif }; -template -ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn, std::index_sequence) +/** Store a vector in all array elements */ +template +ALWAYS_INLINE void for_each_half_state(Fn&& fn, std::index_sequence) { - ((I < half_states ? 
(fn(std::integral_constant{}), 0) : 0), ...); + (fn(std::integral_constant{}), ...); } -template -ALWAYS_INLINE void for_each_half_state(size_t half_states, Fn&& fn) +template +ALWAYS_INLINE void for_each_half_state(Fn&& fn) { - for_each_half_state(half_states, std::forward(fn), std::make_index_sequence<8>{}); + for_each_half_state(std::forward(fn), std::make_index_sequence{}); } /** Store a vector in all array elements */ -ALWAYS_INLINE void arr_set_vec256(vec256* arr, size_t half_states, const vec256& vec) +template +ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr[i] = vec; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + std::get(arr) = vec; }); } /** Add a vector to all array elements */ -ALWAYS_INLINE void arr_add_vec256(vec256* arr, size_t half_states, const vec256& vec) +template +ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr[i] += vec; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + std::get(arr) += vec; }); } /** Add corresponding vectors in arr1 to arr0 */ -ALWAYS_INLINE void arr_add_arr(vec256* arr0, const vec256* arr1, size_t half_states) +template +ALWAYS_INLINE void arr_add_arr(std::array& arr0, const vec256* arr1) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr0[i] += arr1[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + std::get(arr0) += arr1[i]; }); } /** Add arr1 to arr0, XOR result into arr2, rotate arr2 left by N bits */ -template -ALWAYS_INLINE void arr_add_xor_rot(vec256* arr0, const vec256* arr1, vec256* arr2, size_t half_states) +template +ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& 
arr2) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - arr0[i] += arr1[i]; - arr2[i] ^= arr0[i]; - vec_rotl(arr2[i]); + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + vec256& x = std::get(arr0); + const vec256& y = std::get(arr1); + vec256& z = std::get(arr2); + + x += y; + z ^= x; + vec_rotl(z); }); } @@ -250,11 +256,12 @@ After the second round, they are used (in reverse) to restore the original layout. */ -ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) +template +ALWAYS_INLINE void arr_shuf0(std::array& arr) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - vec256& x = arr[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + vec256& x = std::get(arr); #if CHACHA20_VEC_USE_SPLIT_LANES x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); @@ -264,11 +271,12 @@ ALWAYS_INLINE void arr_shuf0(vec256* arr, size_t half_states) }); } -ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) +template +ALWAYS_INLINE void arr_shuf1(std::array& arr) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - vec256& x = arr[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + vec256& x = std::get(arr); #if CHACHA20_VEC_USE_SPLIT_LANES x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); x.hi = __builtin_shufflevector(x.hi, x.hi, 2, 3, 0, 1); @@ -278,11 +286,12 @@ ALWAYS_INLINE void arr_shuf1(vec256* arr, size_t half_states) }); } -ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) +template +ALWAYS_INLINE void arr_shuf2(std::array& arr) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; - vec256& x = arr[i]; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; + 
vec256& x = std::get(arr); #if CHACHA20_VEC_USE_SPLIT_LANES x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); @@ -293,24 +302,25 @@ ALWAYS_INLINE void arr_shuf2(vec256* arr, size_t half_states) } /* Main round function. */ -ALWAYS_INLINE void doubleround(vec256* arr0, vec256* arr1, vec256* arr2, vec256* arr3, size_t half_states) +template +ALWAYS_INLINE void doubleround(std::array& arr0, std::array& arr1, std::array& arr2, std::array& arr3) { UNROLL_LOOP(10) for (size_t i = 0; i < 10; ++i) { - arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); - arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); - arr_shuf0(arr1, half_states); - arr_shuf1(arr2, half_states); - arr_shuf2(arr3, half_states); - arr_add_xor_rot<16>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<12>(arr2, arr3, arr1, half_states); - arr_add_xor_rot<8>(arr0, arr1, arr3, half_states); - arr_add_xor_rot<7>(arr2, arr3, arr1, half_states); - arr_shuf2(arr1, half_states); - arr_shuf1(arr2, half_states); - arr_shuf0(arr3, half_states); + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf0(arr1); + arr_shuf1(arr2); + arr_shuf2(arr3); + arr_add_xor_rot<16>(arr0, arr1, arr3); + arr_add_xor_rot<12>(arr2, arr3, arr1); + arr_add_xor_rot<8>(arr0, arr1, arr3); + arr_add_xor_rot<7>(arr2, arr3, arr1); + arr_shuf2(arr1); + arr_shuf1(arr2); + arr_shuf0(arr3); } } @@ -335,16 +345,16 @@ ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_ memcpy(out_bytes, &tmp_vec, sizeof(tmp_vec)); } -template -ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) +template +ALWAYS_INLINE 
void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { - for_each_half_state(half_states, [&](auto idx) { - constexpr size_t i = decltype(idx)::value; + for_each_half_state([&](auto idx) { + constexpr size_t i{decltype(idx)::value}; - const vec256& w = arr0[i]; - const vec256& x = arr1[i]; - const vec256& y = arr2[i]; - const vec256& z = arr3[i]; + const vec256& w = std::get(arr0); + const vec256& x = std::get(arr1); + const vec256& y = std::get(arr2); + const vec256& z = std::get(arr3); const size_t offset = i * 128; const std::byte* in_slice = in_bytes + offset; @@ -365,16 +375,23 @@ ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* } /* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ -ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const vec256* arr0, const vec256* arr1, const vec256* arr2, const vec256* arr3, size_t half_states) +template +ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { +#if defined(__clang__) && (defined(__aarch64__) || defined(__ARM_NEON) || defined(__ARM_NEON__)) + // On AArch64/NEON, clang's codegen for `std::assume_aligned` + 32-byte memcpy + // can be slower than the unaligned path. Prefer the single unaligned variant. 
+ arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); +#else constexpr std::uintptr_t mask{CHACHA20_VEC_MEM_ALIGN - 1}; const bool aligned = ((reinterpret_cast(in_bytes.data()) | reinterpret_cast(out_bytes.data())) & mask) == 0; if (aligned) [[likely]] { - arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); } else { - arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3, half_states); + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); } +#endif } /* Main crypt function. Calculates up to 16 states (8 half_states). */ @@ -383,25 +400,25 @@ ALWAYS_INLINE void multi_block_crypt(std::span in_bytes, std::s { static_assert(States == 16 || States == 8 || States == 6 || States == 4 || States == 2); constexpr size_t half_states = States / 2; - vec256 arr0[half_states], arr1[half_states], arr2[half_states], arr3[half_states]; + std::array arr0, arr1, arr2, arr3; - arr_set_vec256(arr0, half_states, nums256); - arr_set_vec256(arr1, half_states, state0); - arr_set_vec256(arr2, half_states, state1); - arr_set_vec256(arr3, half_states, state2); + arr_set_vec256(arr0, nums256); + arr_set_vec256(arr1, state0); + arr_set_vec256(arr2, state1); + arr_set_vec256(arr3, state2); - arr_add_arr(arr3, increments, half_states); + arr_add_arr(arr3, increments); - doubleround(arr0, arr1, arr2, arr3, half_states); + doubleround(arr0, arr1, arr2, arr3); - arr_add_vec256(arr0, half_states, nums256); - arr_add_vec256(arr1, half_states, state0); - arr_add_vec256(arr2, half_states, state1); - arr_add_vec256(arr3, half_states, state2); + arr_add_vec256(arr0, nums256); + arr_add_vec256(arr1, state0); + arr_add_vec256(arr2, state1); + arr_add_vec256(arr3, state2); - arr_add_arr(arr3, increments, half_states); + arr_add_arr(arr3, increments); - arr_read_xor_write(in_bytes, 
out_bytes, arr0, arr1, arr2, arr3, half_states); + arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3); } template diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp index d23be59aaa1e..4d7ab6839406 100644 --- a/src/crypto/chacha20_vec_base.cpp +++ b/src/crypto/chacha20_vec_base.cpp @@ -23,12 +23,15 @@ # endif #elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) # if defined(__GNUC__) && !defined(__clang__) -// GCC tends to spill heavily in the widest multi-state configuration on -// AArch64/NEON. Prefer smaller multi-state levels that fit in registers. +// The widest multi-state configuration (16) tends to spill on AArch64/NEON. +// Also disable the 6-state variant: it increases code size and hurts the +// common 8/4-state path on this target. # define CHACHA20_VEC_DISABLE_STATES_16 # define CHACHA20_VEC_DISABLE_STATES_6 # define CHACHA20_VEC_DISABLE_STATES_2 # else +# define CHACHA20_VEC_DISABLE_STATES_16 +# define CHACHA20_VEC_DISABLE_STATES_6 # define CHACHA20_VEC_DISABLE_STATES_2 # endif #else From 12b92cff62d70757e4dfe8888655db4a8f031fd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C5=91rinc?= Date: Sat, 10 Jan 2026 22:35:32 +0100 Subject: [PATCH 12/12] chacha20: refactor vector operations with utility-based loops and helper functions --- src/crypto/chacha20_vec.ipp | 219 ++++++++++++++++++------------------ src/util/for_each_index.h | 30 +++++ 2 files changed, 137 insertions(+), 112 deletions(-) create mode 100644 src/util/for_each_index.h diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp index 468204ceb99f..02de40183343 100644 --- a/src/crypto/chacha20_vec.ipp +++ b/src/crypto/chacha20_vec.ipp @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -146,6 +147,15 @@ ALWAYS_INLINE void vec_rotl(vec256& vec) #endif } +ALWAYS_INLINE void vec_add_counter(vec256& vec, uint32_t inc) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + vec += vec256{{inc, 0, 0, 0}, {inc, 0, 0, 0}}; 
+#else + vec += (vec256){inc, 0, 0, 0, inc, 0, 0, 0}; +#endif +} + #if CHACHA20_VEC_USE_SPLIT_LANES static constexpr vec128 nums128 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; static constexpr vec256 nums256 = {nums128, nums128}; @@ -153,82 +163,118 @@ static constexpr vec256 nums256 = {nums128, nums128}; static constexpr vec256 nums256 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}; #endif -// Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} -// All smaller state counts use a prefix of this array. -static constexpr vec256 increments[8] = { +ALWAYS_INLINE vec256 vec_broadcast4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) +{ #if CHACHA20_VEC_USE_SPLIT_LANES - {{1, 0, 0, 0}, {0, 0, 0, 0}}, - {{3, 0, 0, 0}, {2, 0, 0, 0}}, - {{5, 0, 0, 0}, {4, 0, 0, 0}}, - {{7, 0, 0, 0}, {6, 0, 0, 0}}, - {{9, 0, 0, 0}, {8, 0, 0, 0}}, - {{11, 0, 0, 0}, {10, 0, 0, 0}}, - {{13, 0, 0, 0}, {12, 0, 0, 0}}, - {{15, 0, 0, 0}, {14, 0, 0, 0}}, + const vec128 lane = {a, b, c, d}; + return vec256{lane, lane}; #else - {1, 0, 0, 0, 0, 0, 0, 0}, - {3, 0, 0, 0, 2, 0, 0, 0}, - {5, 0, 0, 0, 4, 0, 0, 0}, - {7, 0, 0, 0, 6, 0, 0, 0}, - {9, 0, 0, 0, 8, 0, 0, 0}, - {11, 0, 0, 0, 10, 0, 0, 0}, - {13, 0, 0, 0, 12, 0, 0, 0}, - {15, 0, 0, 0, 14, 0, 0, 0}, + return (vec256){a, b, c, d, a, b, c, d}; #endif -}; +} -/** Store a vector in all array elements */ -template -ALWAYS_INLINE void for_each_half_state(Fn&& fn, std::index_sequence) +ALWAYS_INLINE void vec_shuf0(vec256& x) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); + x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); +#else + x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); +#endif +} + +ALWAYS_INLINE void vec_shuf1(vec256& x) { - (fn(std::integral_constant{}), ...); +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); + x.hi = __builtin_shufflevector(x.hi, 
x.hi, 2, 3, 0, 1); +#else + x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); +#endif +} + +ALWAYS_INLINE void vec_shuf2(vec256& x) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); + x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); +#else + x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); +#endif +} + +ALWAYS_INLINE vec256 vec_pack_hi(const vec256& a, const vec256& b) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + return vec256{a.hi, b.hi}; +#else + return __builtin_shufflevector(a, b, 4, 5, 6, 7, 12, 13, 14, 15); +#endif +} + +ALWAYS_INLINE vec256 vec_pack_lo(const vec256& a, const vec256& b) +{ +#if CHACHA20_VEC_USE_SPLIT_LANES + return vec256{a.lo, b.lo}; +#else + return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8, 9, 10, 11); +#endif } -template -ALWAYS_INLINE void for_each_half_state(Fn&& fn) +#if CHACHA20_VEC_USE_SPLIT_LANES +static constexpr vec256 make_increment(uint32_t odd, uint32_t even) +{ + return vec256{{odd, 0, 0, 0}, {even, 0, 0, 0}}; +} +#else +static constexpr vec256 make_increment(uint32_t odd, uint32_t even) { - for_each_half_state(std::forward(fn), std::make_index_sequence{}); + return (vec256){odd, 0, 0, 0, even, 0, 0, 0}; } +#endif + +// Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0} +// All smaller state counts use a prefix of this array. 
+static constexpr vec256 increments[8] = { + make_increment(1, 0), + make_increment(3, 2), + make_increment(5, 4), + make_increment(7, 6), + make_increment(9, 8), + make_increment(11, 10), + make_increment(13, 12), + make_increment(15, 14), +}; /** Store a vector in all array elements */ template ALWAYS_INLINE void arr_set_vec256(std::array& arr, const vec256& vec) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - std::get(arr) = vec; - }); + util::ForEachIndex([&]() { arr[I] = vec; }); } /** Add a vector to all array elements */ template ALWAYS_INLINE void arr_add_vec256(std::array& arr, const vec256& vec) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - std::get(arr) += vec; - }); + util::ForEachIndex([&]() { arr[I] += vec; }); } /** Add corresponding vectors in arr1 to arr0 */ template ALWAYS_INLINE void arr_add_arr(std::array& arr0, const vec256* arr1) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - std::get(arr0) += arr1[i]; - }); + util::ForEachIndex([&]() { arr0[I] += arr1[I]; }); } /** Add arr1 to arr0, XOR result into arr2, rotate arr2 left by N bits */ template ALWAYS_INLINE void arr_add_xor_rot(std::array& arr0, const std::array& arr1, std::array& arr2) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr0); - const vec256& y = std::get(arr1); - vec256& z = std::get(arr2); + util::ForEachIndex([&]() { + vec256& x = arr0[I]; + const vec256& y = arr1[I]; + vec256& z = arr2[I]; x += y; z ^= x; @@ -259,46 +305,19 @@ layout. 
template ALWAYS_INLINE void arr_shuf0(std::array& arr) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr); -#if CHACHA20_VEC_USE_SPLIT_LANES - x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0); - x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0); -#else - x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4); -#endif - }); + util::ForEachIndex([&]() { vec_shuf0(arr[I]); }); } template ALWAYS_INLINE void arr_shuf1(std::array& arr) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr); -#if CHACHA20_VEC_USE_SPLIT_LANES - x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1); - x.hi = __builtin_shufflevector(x.hi, x.hi, 2, 3, 0, 1); -#else - x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5); -#endif - }); + util::ForEachIndex([&]() { vec_shuf1(arr[I]); }); } template ALWAYS_INLINE void arr_shuf2(std::array& arr) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; - vec256& x = std::get(arr); -#if CHACHA20_VEC_USE_SPLIT_LANES - x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2); - x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2); -#else - x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6); -#endif - }); + util::ForEachIndex([&]() { vec_shuf2(arr[I]); }); } /* Main round function. 
*/ @@ -348,29 +367,20 @@ ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_ template ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) { - for_each_half_state([&](auto idx) { - constexpr size_t i{decltype(idx)::value}; + util::ForEachIndex([&]() { + const vec256& w = arr0[I]; + const vec256& x = arr1[I]; + const vec256& y = arr2[I]; + const vec256& z = arr3[I]; - const vec256& w = std::get(arr0); - const vec256& x = std::get(arr1); - const vec256& y = std::get(arr2); - const vec256& z = std::get(arr3); - - const size_t offset = i * 128; + constexpr size_t offset = I * 128; const std::byte* in_slice = in_bytes + offset; std::byte* out_slice = out_bytes + offset; -#if CHACHA20_VEC_USE_SPLIT_LANES - vec_read_xor_write(in_slice + 0, out_slice + 0, vec256{w.hi, x.hi}); - vec_read_xor_write(in_slice + 32, out_slice + 32, vec256{y.hi, z.hi}); - vec_read_xor_write(in_slice + 64, out_slice + 64, vec256{w.lo, x.lo}); - vec_read_xor_write(in_slice + 96, out_slice + 96, vec256{y.lo, z.lo}); -#else - vec_read_xor_write(in_slice + 0, out_slice + 0, __builtin_shufflevector(w, x, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 32, out_slice + 32, __builtin_shufflevector(y, z, 4, 5, 6, 7, 12, 13, 14, 15)); - vec_read_xor_write(in_slice + 64, out_slice + 64, __builtin_shufflevector(w, x, 0, 1, 2, 3, 8, 9, 10, 11)); - vec_read_xor_write(in_slice + 96, out_slice + 96, __builtin_shufflevector(y, z, 0, 1, 2, 3, 8, 9, 10, 11)); -#endif + vec_read_xor_write(in_slice + 0, out_slice + 0, vec_pack_hi(w, x)); + vec_read_xor_write(in_slice + 32, out_slice + 32, vec_pack_hi(y, z)); + vec_read_xor_write(in_slice + 64, out_slice + 64, vec_pack_lo(w, x)); + vec_read_xor_write(in_slice + 96, out_slice + 96, vec_pack_lo(y, z)); }); } @@ -426,12 +436,7 @@ ALWAYS_INLINE void process_blocks(std::span& in_bytes, std::spa { 
while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) { multi_block_crypt(in_bytes, out_bytes, state0, state1, state2); - const uint32_t inc = static_cast(States); -#if CHACHA20_VEC_USE_SPLIT_LANES - state2 += vec256{{inc, 0, 0, 0}, {inc, 0, 0, 0}}; -#else - state2 += (vec256){inc, 0, 0, 0, inc, 0, 0, 0}; -#endif + vec_add_counter(state2, static_cast(States)); in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States); } @@ -448,19 +453,9 @@ void chacha20_crypt_vectorized(std::span& in_bytes, std::span(in_bytes, out_bytes, state0, state1, state2); if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2); diff --git a/src/util/for_each_index.h b/src/util/for_each_index.h new file mode 100644 index 000000000000..9bbd21c56479 --- /dev/null +++ b/src/util/for_each_index.h @@ -0,0 +1,30 @@ +// Copyright (c) 2026-present The Bitcoin Core developers +// Distributed under the MIT software license, see the accompanying +// file COPYING or http://www.opensource.org/licenses/mit-license.php. + +#ifndef BITCOIN_UTIL_FOR_EACH_INDEX_H +#define BITCOIN_UTIL_FOR_EACH_INDEX_H + +#include + +#include +#include + +namespace util { + +/** Invoke `fn.template operator()()` for each `I` in `[0, N)`. */ +template +ALWAYS_INLINE void ForEachIndex(Fn&& fn, std::index_sequence) +{ + (fn.template operator()(), ...); +} + +template +ALWAYS_INLINE void ForEachIndex(Fn&& fn) +{ + ForEachIndex(std::forward(fn), std::make_index_sequence{}); +} + +} // namespace util + +#endif // BITCOIN_UTIL_FOR_EACH_INDEX_H