diff --git a/src/attributes.h b/src/attributes.h
index 275dad9f8ede..b3686b6c1772 100644
--- a/src/attributes.h
+++ b/src/attributes.h
@@ -24,4 +24,14 @@
 # error No known always_inline attribute for this platform.
 #endif
 
+#define PRAGMA(x) _Pragma(#x)
+
+#if defined(__clang__)
+# define UNROLL_LOOP(N) PRAGMA(clang loop unroll_count(N))
+#elif defined(__GNUC__)
+# define UNROLL_LOOP(N) PRAGMA(GCC unroll N)
+#else
+# define UNROLL_LOOP(N)
+#endif
+
 #endif // BITCOIN_ATTRIBUTES_H
diff --git a/src/crypto/CMakeLists.txt b/src/crypto/CMakeLists.txt
index 92653ade5a7a..a247516e4a95 100644
--- a/src/crypto/CMakeLists.txt
+++ b/src/crypto/CMakeLists.txt
@@ -5,6 +5,7 @@ add_library(bitcoin_crypto STATIC EXCLUDE_FROM_ALL
   aes.cpp
   chacha20.cpp
+  chacha20_vec_base.cpp
   chacha20poly1305.cpp
   hex_base.cpp
   hkdf_sha256_32.cpp
diff --git a/src/crypto/chacha20.cpp b/src/crypto/chacha20.cpp
index 6bdffe691a6b..53d5e0857549 100644
--- a/src/crypto/chacha20.cpp
+++ b/src/crypto/chacha20.cpp
@@ -7,11 +7,15 @@
 #include <crypto/chacha20.h>
 #include <crypto/common.h>
+#include <crypto/chacha20_vec.h>
 #include <support/cleanse.h>
 
 #include <algorithm>
 #include <bit>
 #include <cassert>
+#include <limits>
+
+static_assert(ChaCha20Aligned::BLOCKLEN == CHACHA20_VEC_BLOCKLEN);
 
 #define QUARTERROUND(a,b,c,d) \
   a += b; d = std::rotl(d ^ a, 16); \
@@ -157,13 +161,14 @@ inline void ChaCha20Aligned::Keystream(std::span<std::byte> output) noexcept
     }
 }
 
-inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
+static inline void chacha20_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, uint32_t input[12]) noexcept
 {
     assert(in_bytes.size() == out_bytes.size());
     const std::byte* m = in_bytes.data();
     std::byte* c = out_bytes.data();
-    size_t blocks = out_bytes.size() / BLOCKLEN;
-    assert(blocks * BLOCKLEN == out_bytes.size());
+    size_t blocks = out_bytes.size() / ChaCha20Aligned::BLOCKLEN;
+    assert(blocks * ChaCha20Aligned::BLOCKLEN == out_bytes.size());
+
     uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
     uint32_t j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
@@ -273,8 +278,29 @@ inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
             return;
         }
         blocks -= 1;
-        c += BLOCKLEN;
-        m += BLOCKLEN;
+        c += ChaCha20Aligned::BLOCKLEN;
+        m += ChaCha20Aligned::BLOCKLEN;
     }
 }
+
+
+inline void ChaCha20Aligned::Crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes) noexcept
+{
+    assert(in_bytes.size() == out_bytes.size());
+    size_t blocks = out_bytes.size() / ChaCha20Aligned::BLOCKLEN;
+    assert(blocks * ChaCha20Aligned::BLOCKLEN == out_bytes.size());
+#ifdef ENABLE_CHACHA20_VEC
+    // Only use the vectorized implementations if the counter will not overflow.
+    const bool overflow = static_cast<uint64_t>(input[8]) + blocks > std::numeric_limits<uint32_t>::max();
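+    // Illustration (hypothetical numbers): with input[8] == 0xFFFFFF00 and
+    // blocks == 0x200, the 64-bit sum is 0x1'00000100 > 0xFFFFFFFF, so the
+    // whole input stays on the scalar path below.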
+    if (blocks > 1 && !overflow) {
+        const auto state = std::to_array(input);
+        chacha20_vec_base::chacha20_crypt_vectorized(in_bytes, out_bytes, state);
+        const size_t blocks_written = blocks - (out_bytes.size() / ChaCha20Aligned::BLOCKLEN);
+        input[8] += blocks_written;
+    }
+#endif
+    if (in_bytes.size()) {
+        chacha20_crypt(in_bytes, out_bytes, input);
+    }
+}
diff --git a/src/crypto/chacha20_vec.h b/src/crypto/chacha20_vec.h
new file mode 100644
index 000000000000..b1176d2b8dbf
--- /dev/null
+++ b/src/crypto/chacha20_vec.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2025-present The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_CRYPTO_CHACHA20_VEC_H
+#define BITCOIN_CRYPTO_CHACHA20_VEC_H
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <span>
+
+static constexpr size_t CHACHA20_VEC_BLOCKLEN = 64;
+
+#ifdef __has_builtin
+    #if __has_builtin(__builtin_shufflevector)
+        #define ENABLE_CHACHA20_VEC 1
+    #endif
+#endif
+
+#ifdef ENABLE_CHACHA20_VEC
+
+namespace chacha20_vec_base
+{
+void chacha20_crypt_vectorized(std::span<const std::byte>& in_bytes, std::span<std::byte>& out_bytes, const std::array<uint32_t, 12>& input) noexcept;
+} // namespace chacha20_vec_base
+
+#endif // ENABLE_CHACHA20_VEC
+
+#endif // BITCOIN_CRYPTO_CHACHA20_VEC_H
diff --git a/src/crypto/chacha20_vec.ipp b/src/crypto/chacha20_vec.ipp
new file mode 100644
index 000000000000..02de40183343
--- /dev/null
+++ b/src/crypto/chacha20_vec.ipp
@@ -0,0 +1,479 @@
+// Copyright (c) 2025-present The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#include <attributes.h>
+#include <crypto/chacha20_vec.h>
+#include <util/for_each_index.h>
+
+#include <array>
+#include <bit>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <span>
+
+#if defined(ENABLE_CHACHA20_VEC)
+
+// Convert preprocessor flags to constexpr booleans for use with if constexpr
+#ifdef CHACHA20_VEC_DISABLE_STATES_16
+inline constexpr bool kEnableStates16 = false;
+#else
+inline constexpr bool kEnableStates16 = true;
+#endif
+
+#ifdef CHACHA20_VEC_DISABLE_STATES_8
+inline constexpr bool kEnableStates8 = false;
+#else
+inline constexpr bool kEnableStates8 = true;
+#endif
+
+#ifdef CHACHA20_VEC_DISABLE_STATES_6
+inline constexpr bool kEnableStates6 = false;
+#else
+inline constexpr bool kEnableStates6 = true;
+#endif
+
+#ifdef CHACHA20_VEC_DISABLE_STATES_4
+inline constexpr bool kEnableStates4 = false;
+#else
+inline constexpr bool kEnableStates4 = true;
+#endif
+
+#ifdef CHACHA20_VEC_DISABLE_STATES_2
+inline constexpr bool kEnableStates2 = false;
+#else
+inline constexpr bool kEnableStates2 = true;
+#endif
+
+using vec128 = uint32_t __attribute__((__vector_size__(16)));
+
+#if defined(__GNUC__) && !defined(__clang__)
+# define CHACHA20_VEC_USE_SPLIT_LANES 1
+#else
+# define CHACHA20_VEC_USE_SPLIT_LANES 0
+#endif
+
+#if CHACHA20_VEC_USE_SPLIT_LANES
+// Represent two 128-bit lanes explicitly. This avoids GCC generating expensive
+// scalar sequences for 256-bit shuffles on targets without native 256-bit SIMD
+// registers (e.g. AArch64/NEON, x86/SSE2).
+struct vec256 {
+    vec128 lo;
+    vec128 hi;
+};
+static_assert(sizeof(vec256) == 32);
+
+ALWAYS_INLINE vec256& operator+=(vec256& a, const vec256& b)
+{
+    a.lo += b.lo;
+    a.hi += b.hi;
+    return a;
+}
+
+ALWAYS_INLINE vec256& operator^=(vec256& a, const vec256& b)
+{
+    a.lo ^= b.lo;
+    a.hi ^= b.hi;
+    return a;
+}
+#else
+using vec256 = uint32_t __attribute__((__vector_size__(32)));
+#endif
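+
+// Illustration only: with these GCC/Clang vector extensions, arithmetic,
+// shifts, and bitwise operators act lane-wise. For example:
+//   vec128 a{1, 2, 3, 4};
+//   vec128 b{10, 20, 30, 40};
+//   vec128 s = a + b;                 // {11, 22, 33, 44}
+//   vec128 r = (a << 8) | (a >> 24);  // each 32-bit lane rotated left by 8
+// vec256 supports the same operations, natively or per 128-bit lane.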
+
+// Preprocessor check for conditional compilation of the anonymous namespace and
+// the multi-state code paths. When all states are disabled, avoid referencing
+// templates/functions that are not available.
+#if !defined(CHACHA20_VEC_DISABLE_STATES_16) || \
+    !defined(CHACHA20_VEC_DISABLE_STATES_8) || \
+    !defined(CHACHA20_VEC_DISABLE_STATES_6) || \
+    !defined(CHACHA20_VEC_DISABLE_STATES_4) || \
+    !defined(CHACHA20_VEC_DISABLE_STATES_2)
+# define CHACHA20_VEC_ENABLE_ANY_MULTI_STATE 1
+#else
+# define CHACHA20_VEC_ENABLE_ANY_MULTI_STATE 0
+#endif
+
+#if CHACHA20_VEC_ENABLE_ANY_MULTI_STATE
+
+namespace {
+
+// Used for an optional aligned I/O fast-path.
+static constexpr size_t CHACHA20_VEC_MEM_ALIGN{16};
+
+/** Endian conversion for big-endian targets (a no-op on little-endian) */
+ALWAYS_INLINE void vec_byteswap(vec256& vec)
+{
+    if constexpr (std::endian::native == std::endian::big) {
+#if CHACHA20_VEC_USE_SPLIT_LANES
+        vec128 lo;
+        lo[0] = __builtin_bswap32(vec.lo[0]);
+        lo[1] = __builtin_bswap32(vec.lo[1]);
+        lo[2] = __builtin_bswap32(vec.lo[2]);
+        lo[3] = __builtin_bswap32(vec.lo[3]);
+        vec128 hi;
+        hi[0] = __builtin_bswap32(vec.hi[0]);
+        hi[1] = __builtin_bswap32(vec.hi[1]);
+        hi[2] = __builtin_bswap32(vec.hi[2]);
+        hi[3] = __builtin_bswap32(vec.hi[3]);
+        vec.lo = lo;
+        vec.hi = hi;
+#else
+        vec256 ret;
+        ret[0] = __builtin_bswap32(vec[0]);
+        ret[1] = __builtin_bswap32(vec[1]);
+        ret[2] = __builtin_bswap32(vec[2]);
+        ret[3] = __builtin_bswap32(vec[3]);
+        ret[4] = __builtin_bswap32(vec[4]);
+        ret[5] = __builtin_bswap32(vec[5]);
+        ret[6] = __builtin_bswap32(vec[6]);
+        ret[7] = __builtin_bswap32(vec[7]);
+        vec = ret;
+#endif
+    }
+}
+
+/** Left-rotate all elements in a vector by N bits */
+template <unsigned N>
+ALWAYS_INLINE void vec_rotl(vec256& vec)
+{
+    static_assert(N > 0 && N < 32, "Rotation must be between 1 and 31 bits");
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    vec.lo = (vec.lo << N) | (vec.lo >> (32 - N));
+    vec.hi = (vec.hi << N) | (vec.hi >> (32 - N));
+#else
+    vec = (vec << N) | (vec >> (32 - N));
+#endif
+}
+
+ALWAYS_INLINE void vec_add_counter(vec256& vec, uint32_t inc)
+{
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    vec += vec256{{inc, 0, 0, 0}, {inc, 0, 0, 0}};
+#else
+    vec += (vec256){inc, 0, 0, 0, inc, 0, 0, 0};
+#endif
+}
+
+#if CHACHA20_VEC_USE_SPLIT_LANES
+static constexpr vec128 nums128 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
+static constexpr vec256 nums256 = {nums128, nums128};
+#else
+static constexpr vec256 nums256 = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
+#endif
+
+ALWAYS_INLINE vec256 vec_broadcast4(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    const vec128 lane = {a, b, c, d};
+    return vec256{lane, lane};
+#else
+    return (vec256){a, b, c, d, a, b, c, d};
+#endif
+}
+
+ALWAYS_INLINE void vec_shuf0(vec256& x)
+{
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    x.lo = __builtin_shufflevector(x.lo, x.lo, 1, 2, 3, 0);
+    x.hi = __builtin_shufflevector(x.hi, x.hi, 1, 2, 3, 0);
+#else
+    x = __builtin_shufflevector(x, x, 1, 2, 3, 0, 5, 6, 7, 4);
+#endif
+}
+
+ALWAYS_INLINE void vec_shuf1(vec256& x)
+{
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    x.lo = __builtin_shufflevector(x.lo, x.lo, 2, 3, 0, 1);
+    x.hi = __builtin_shufflevector(x.hi, x.hi, 2, 3, 0, 1);
+#else
+    x = __builtin_shufflevector(x, x, 2, 3, 0, 1, 6, 7, 4, 5);
+#endif
+}
+
+ALWAYS_INLINE void vec_shuf2(vec256& x)
+{
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    x.lo = __builtin_shufflevector(x.lo, x.lo, 3, 0, 1, 2);
+    x.hi = __builtin_shufflevector(x.hi, x.hi, 3, 0, 1, 2);
+#else
+    x = __builtin_shufflevector(x, x, 3, 0, 1, 2, 7, 4, 5, 6);
+#endif
+}
+
+ALWAYS_INLINE vec256 vec_pack_hi(const vec256& a, const vec256& b)
+{
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    return vec256{a.hi, b.hi};
+#else
+    return __builtin_shufflevector(a, b, 4, 5, 6, 7, 12, 13, 14, 15);
+#endif
+}
+
+ALWAYS_INLINE vec256 vec_pack_lo(const vec256& a, const vec256& b)
+{
+#if CHACHA20_VEC_USE_SPLIT_LANES
+    return vec256{a.lo, b.lo};
+#else
+    return __builtin_shufflevector(a, b, 0, 1, 2, 3, 8, 9, 10, 11);
+#endif
+}
+
+#if CHACHA20_VEC_USE_SPLIT_LANES
+static constexpr vec256 make_increment(uint32_t odd, uint32_t even)
+{
+    return vec256{{odd, 0, 0, 0}, {even, 0, 0, 0}};
+}
+#else
+static constexpr vec256 make_increment(uint32_t odd, uint32_t even)
+{
+    return (vec256){odd, 0, 0, 0, even, 0, 0, 0};
+}
+#endif
+
+// Counter increments for each half-state pair. Pattern: {2*i+1, 0, 0, 0, 2*i, 0, 0, 0}.
+// All smaller state counts use a prefix of this array.
+static constexpr vec256 increments[8] = {
+    make_increment(1, 0),
+    make_increment(3, 2),
+    make_increment(5, 4),
+    make_increment(7, 6),
+    make_increment(9, 8),
+    make_increment(11, 10),
+    make_increment(13, 12),
+    make_increment(15, 14),
+};
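+
+// Note the lane order: each increments[i] places the even (lower) counter in
+// the hi lane and the odd counter in the lo lane; arr_read_xor_write() below
+// stores the vec_pack_hi() halves before the vec_pack_lo() halves, so blocks
+// are emitted in counter order.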
+
+/** Store a vector in all array elements */
+template <size_t N>
+ALWAYS_INLINE void arr_set_vec256(std::array<vec256, N>& arr, const vec256& vec)
+{
+    util::ForEachIndex<N>([&]<size_t I>() { arr[I] = vec; });
+}
+
+/** Add a vector to all array elements */
+template <size_t N>
+ALWAYS_INLINE void arr_add_vec256(std::array<vec256, N>& arr, const vec256& vec)
+{
+    util::ForEachIndex<N>([&]<size_t I>() { arr[I] += vec; });
+}
+
+/** Add corresponding vectors in arr1 to arr0 */
+template <size_t N>
+ALWAYS_INLINE void arr_add_arr(std::array<vec256, N>& arr0, const vec256* arr1)
+{
+    util::ForEachIndex<N>([&]<size_t I>() { arr0[I] += arr1[I]; });
+}
+
+/** Add arr1 to arr0, XOR the result into arr2, then rotate arr2 left by N bits */
+template <unsigned N, size_t LEN>
+ALWAYS_INLINE void arr_add_xor_rot(std::array<vec256, LEN>& arr0, const std::array<vec256, LEN>& arr1, std::array<vec256, LEN>& arr2)
+{
+    util::ForEachIndex<LEN>([&]<size_t I>() {
+        vec256& x = arr0[I];
+        const vec256& y = arr1[I];
+        vec256& z = arr2[I];
+
+        x += y;
+        z ^= x;
+        vec_rotl<N>(z);
+    });
+}
+
+/*
+The first round operates on columns:
+    QUARTERROUND( x0, x4, x8,x12);
+    QUARTERROUND( x1, x5, x9,x13);
+    QUARTERROUND( x2, x6,x10,x14);
+    QUARTERROUND( x3, x7,x11,x15);
+
+The second round operates on diagonals:
+    QUARTERROUND( x0, x5,x10,x15);
+    QUARTERROUND( x1, x6,x11,x12);
+    QUARTERROUND( x2, x7, x8,x13);
+    QUARTERROUND( x3, x4, x9,x14);
+
+After the first round, arr_shuf0, arr_shuf1, and arr_shuf2 shuffle the layout
+to prepare for the second round. After the second round, they are applied in
+reverse to restore the original layout.
+*/
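+// For example, vec_shuf0 rotates each four-lane row left by one element:
+//   {x4, x5, x6, x7} -> {x5, x6, x7, x4}
+// so that lane 0 across the four rows becomes (x0, x5, x10, x15), the first
+// diagonal quarterround.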
+*/ +template +ALWAYS_INLINE void vec_read_xor_write(const std::byte* in_bytes, std::byte* out_bytes, const vec256& vec) +{ + if constexpr (AssumeAligned) { + in_bytes = std::assume_aligned(in_bytes); + out_bytes = std::assume_aligned(out_bytes); + } + + vec256 tmp_vec; + memcpy(&tmp_vec, in_bytes, sizeof(tmp_vec)); + vec_byteswap(tmp_vec); + + tmp_vec ^= vec; + vec_byteswap(tmp_vec); + + memcpy(out_bytes, &tmp_vec, sizeof(tmp_vec)); +} + +template +ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) +{ + util::ForEachIndex([&]() { + const vec256& w = arr0[I]; + const vec256& x = arr1[I]; + const vec256& y = arr2[I]; + const vec256& z = arr3[I]; + + constexpr size_t offset = I * 128; + const std::byte* in_slice = in_bytes + offset; + std::byte* out_slice = out_bytes + offset; + + vec_read_xor_write(in_slice + 0, out_slice + 0, vec_pack_hi(w, x)); + vec_read_xor_write(in_slice + 32, out_slice + 32, vec_pack_hi(y, z)); + vec_read_xor_write(in_slice + 64, out_slice + 64, vec_pack_lo(w, x)); + vec_read_xor_write(in_slice + 96, out_slice + 96, vec_pack_lo(y, z)); + }); +} + +/* Merge the 128 bit lanes from 2 states to the proper order, then pass each vec_read_xor_write */ +template +ALWAYS_INLINE void arr_read_xor_write(std::span in_bytes, std::span out_bytes, const std::array& arr0, const std::array& arr1, const std::array& arr2, const std::array& arr3) +{ +#if defined(__clang__) && (defined(__aarch64__) || defined(__ARM_NEON) || defined(__ARM_NEON__)) + // On AArch64/NEON, clang's codegen for `std::assume_aligned` + 32-byte memcpy + // can be slower than the unaligned path. Prefer the single unaligned variant. + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); +#else + constexpr std::uintptr_t mask{CHACHA20_VEC_MEM_ALIGN - 1}; + const bool aligned = ((reinterpret_cast(in_bytes.data()) | reinterpret_cast(out_bytes.data())) & mask) == 0; + + if (aligned) [[likely]] { + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); + } else { + arr_read_xor_write_impl(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3); + } +#endif +} + +/* Main crypt function. Calculates up to 16 states (8 half_states). 
+
+template <bool AssumeAligned, size_t N>
+ALWAYS_INLINE void arr_read_xor_write_impl(const std::byte* in_bytes, std::byte* out_bytes, const std::array<vec256, N>& arr0, const std::array<vec256, N>& arr1, const std::array<vec256, N>& arr2, const std::array<vec256, N>& arr3)
+{
+    util::ForEachIndex<N>([&]<size_t I>() {
+        const vec256& w = arr0[I];
+        const vec256& x = arr1[I];
+        const vec256& y = arr2[I];
+        const vec256& z = arr3[I];
+
+        constexpr size_t offset = I * 128;
+        const std::byte* in_slice = in_bytes + offset;
+        std::byte* out_slice = out_bytes + offset;
+
+        vec_read_xor_write<AssumeAligned>(in_slice + 0, out_slice + 0, vec_pack_hi(w, x));
+        vec_read_xor_write<AssumeAligned>(in_slice + 32, out_slice + 32, vec_pack_hi(y, z));
+        vec_read_xor_write<AssumeAligned>(in_slice + 64, out_slice + 64, vec_pack_lo(w, x));
+        vec_read_xor_write<AssumeAligned>(in_slice + 96, out_slice + 96, vec_pack_lo(y, z));
+    });
+}
+
+/* Merge the 128-bit lanes from two states into the proper order, then pass each pair to vec_read_xor_write */
+template <size_t N>
+ALWAYS_INLINE void arr_read_xor_write(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const std::array<vec256, N>& arr0, const std::array<vec256, N>& arr1, const std::array<vec256, N>& arr2, const std::array<vec256, N>& arr3)
+{
+#if defined(__clang__) && (defined(__aarch64__) || defined(__ARM_NEON) || defined(__ARM_NEON__))
+    // On AArch64/NEON, clang's codegen for `std::assume_aligned` + 32-byte memcpy
+    // can be slower than the unaligned path. Prefer the single unaligned variant.
+    arr_read_xor_write_impl<false>(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3);
+#else
+    constexpr std::uintptr_t mask{CHACHA20_VEC_MEM_ALIGN - 1};
+    const bool aligned = ((reinterpret_cast<std::uintptr_t>(in_bytes.data()) | reinterpret_cast<std::uintptr_t>(out_bytes.data())) & mask) == 0;
+
+    if (aligned) [[likely]] {
+        arr_read_xor_write_impl<true>(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3);
+    } else {
+        arr_read_xor_write_impl<false>(in_bytes.data(), out_bytes.data(), arr0, arr1, arr2, arr3);
+    }
+#endif
+}
+
+/* Main crypt function. Calculates up to 16 states (8 half-states). */
+template <size_t States>
+ALWAYS_INLINE void multi_block_crypt(std::span<const std::byte> in_bytes, std::span<std::byte> out_bytes, const vec256& state0, const vec256& state1, const vec256& state2)
+{
+    static_assert(States == 16 || States == 8 || States == 6 || States == 4 || States == 2);
+    constexpr size_t half_states = States / 2;
+    std::array<vec256, half_states> arr0, arr1, arr2, arr3;
+
+    arr_set_vec256(arr0, nums256);
+    arr_set_vec256(arr1, state0);
+    arr_set_vec256(arr2, state1);
+    arr_set_vec256(arr3, state2);
+
+    arr_add_arr(arr3, increments);
+
+    doubleround(arr0, arr1, arr2, arr3);
+
+    arr_add_vec256(arr0, nums256);
+    arr_add_vec256(arr1, state0);
+    arr_add_vec256(arr2, state1);
+    arr_add_vec256(arr3, state2);
+
+    arr_add_arr(arr3, increments);
+
+    arr_read_xor_write(in_bytes, out_bytes, arr0, arr1, arr2, arr3);
+}
+
+template <size_t States>
+ALWAYS_INLINE void process_blocks(std::span<const std::byte>& in_bytes, std::span<std::byte>& out_bytes, const vec256& state0, const vec256& state1, vec256& state2)
+{
+    while (in_bytes.size() >= CHACHA20_VEC_BLOCKLEN * States) {
+        multi_block_crypt<States>(in_bytes, out_bytes, state0, state1, state2);
+        vec_add_counter(state2, static_cast<uint32_t>(States));
+        in_bytes = in_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States);
+        out_bytes = out_bytes.subspan(CHACHA20_VEC_BLOCKLEN * States);
+    }
+}
+
+} // anonymous namespace
+#endif // CHACHA20_VEC_ENABLE_ANY_MULTI_STATE
+
+#if defined(CHACHA20_NAMESPACE)
+namespace CHACHA20_NAMESPACE {
+#endif
+
+void chacha20_crypt_vectorized(std::span<const std::byte>& in_bytes, std::span<std::byte>& out_bytes, const std::array<uint32_t, 12>& input) noexcept
+{
+#if CHACHA20_VEC_ENABLE_ANY_MULTI_STATE
+    assert(in_bytes.size() == out_bytes.size());
+    const vec256 state0 = vec_broadcast4(input[0], input[1], input[2], input[3]);
+    const vec256 state1 = vec_broadcast4(input[4], input[5], input[6], input[7]);
+    vec256 state2 = vec_broadcast4(input[8], input[9], input[10], input[11]);
+
+    if constexpr (kEnableStates16) process_blocks<16>(in_bytes, out_bytes, state0, state1, state2);
+    if constexpr (kEnableStates8) process_blocks<8>(in_bytes, out_bytes, state0, state1, state2);
+    if constexpr (kEnableStates6) process_blocks<6>(in_bytes, out_bytes, state0, state1, state2);
+    if constexpr (kEnableStates4) process_blocks<4>(in_bytes, out_bytes, state0, state1, state2);
+    if constexpr (kEnableStates2) process_blocks<2>(in_bytes, out_bytes, state0, state1, state2);
+#else
+    (void)in_bytes;
+    (void)out_bytes;
+    (void)input;
+#endif
+}
+
+#if defined(CHACHA20_NAMESPACE)
+} // namespace CHACHA20_NAMESPACE
+#endif
+
+#undef CHACHA20_VEC_ENABLE_ANY_MULTI_STATE
+#undef CHACHA20_VEC_USE_SPLIT_LANES
+
+#endif // ENABLE_CHACHA20_VEC
diff --git a/src/crypto/chacha20_vec_base.cpp b/src/crypto/chacha20_vec_base.cpp
new file mode 100644
index 000000000000..4d7ab6839406
--- /dev/null
+++ b/src/crypto/chacha20_vec_base.cpp
@@ -0,0 +1,46 @@
+// Copyright (c) 2025-present The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#define CHACHA20_NAMESPACE chacha20_vec_base
+
+// This file defines which state counts are enabled or disabled for each
+// supported architecture. On some, such as x86-64 and armv8, the baseline
+// SIMD features (SSE2 and NEON respectively) are safe to use without runtime
+// detection.
+
+#if defined(__x86_64__) || defined(__amd64__)
+# define CHACHA20_VEC_DISABLE_STATES_16
+# define CHACHA20_VEC_DISABLE_STATES_8
+# define CHACHA20_VEC_DISABLE_STATES_6
+# if defined(__GNUC__) && !defined(__clang__) && !defined(__AVX2__)
+// GCC currently generates slower code for the generic vectorized implementation
+// on x86_64 unless AVX2 is enabled. Disable the 4-state path for now to avoid a
+// regression.
+# define CHACHA20_VEC_DISABLE_STATES_4
+// Disable the 2-state path as well (falling back to scalar) until a faster GCC
+// x86 implementation exists (e.g. via AVX2/AVX512 runtime dispatch).
+# define CHACHA20_VEC_DISABLE_STATES_2
+# endif
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__)
+// The widest multi-state configuration (16) tends to spill on AArch64/NEON,
+// and the 6-state variant increases code size and hurts the common 8/4-state
+// path on this target. GCC and clang currently share the same configuration
+// here.
+# define CHACHA20_VEC_DISABLE_STATES_16
+# define CHACHA20_VEC_DISABLE_STATES_6
+# define CHACHA20_VEC_DISABLE_STATES_2
+#else
+// Be conservative and require platforms to opt in.
+# define CHACHA20_VEC_DISABLE_STATES_16
+# define CHACHA20_VEC_DISABLE_STATES_8
+# define CHACHA20_VEC_DISABLE_STATES_6
+# define CHACHA20_VEC_DISABLE_STATES_4
+# define CHACHA20_VEC_DISABLE_STATES_2
+#endif
+
+#include <crypto/chacha20_vec.ipp>
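+
+// Illustration only (not part of this change): the CHACHA20_NAMESPACE / .ipp
+// split would let a hypothetical architecture-specific translation unit reuse
+// the same implementation with its own tuning, e.g.:
+//
+//   // chacha20_vec_avx2.cpp, compiled with -mavx2 and selected at runtime:
+//   #define CHACHA20_NAMESPACE chacha20_vec_avx2
+//   #define CHACHA20_VEC_DISABLE_STATES_6
+//   #define CHACHA20_VEC_DISABLE_STATES_2
+//   #include <crypto/chacha20_vec.ipp>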
diff --git a/src/util/for_each_index.h b/src/util/for_each_index.h
new file mode 100644
index 000000000000..9bbd21c56479
--- /dev/null
+++ b/src/util/for_each_index.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2026-present The Bitcoin Core developers
+// Distributed under the MIT software license, see the accompanying
+// file COPYING or http://www.opensource.org/licenses/mit-license.php.
+
+#ifndef BITCOIN_UTIL_FOR_EACH_INDEX_H
+#define BITCOIN_UTIL_FOR_EACH_INDEX_H
+
+#include <attributes.h>
+
+#include <cstddef>
+#include <utility>
+
+namespace util {
+
+/** Invoke `fn.template operator()<I>()` for each `I` in `[0, N)`. */
+template <typename Fn, std::size_t... Is>
+ALWAYS_INLINE void ForEachIndex(Fn&& fn, std::index_sequence<Is...>)
+{
+    (fn.template operator()<Is>(), ...);
+}
+
+template <std::size_t N, typename Fn>
+ALWAYS_INLINE void ForEachIndex(Fn&& fn)
+{
+    ForEachIndex(std::forward<Fn>(fn), std::make_index_sequence<N>{});
+}
+
+} // namespace util
+
+#endif // BITCOIN_UTIL_FOR_EACH_INDEX_H
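For reference, a minimal sketch of the fold expansion above, using hypothetical arrays `in` and `out`:

    util::ForEachIndex<3>([&]<std::size_t I>() { out[I] = in[I] * 2; });
    // behaves as:
    // out[0] = in[0] * 2; out[1] = in[1] * 2; out[2] = in[2] * 2;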