From eee08fc1d6b74adcb3c47c6ad59863067e5fd5dd Mon Sep 17 00:00:00 2001
From: Joe Rowell
Date: Sun, 17 Jul 2022 15:37:25 +0100
Subject: [PATCH 1/4] Stability changes to SIMD code.

---
 kernel/bdgl_sieve.cpp |   3 +-
 kernel/fht_lsh.cpp    |   5 +-
 kernel/simd.h         |  49 ++++-----
 kernel/simd.inl       | 245 ++++++++++++++++++++++++++----------------
 4 files changed, 184 insertions(+), 118 deletions(-)

diff --git a/kernel/bdgl_sieve.cpp b/kernel/bdgl_sieve.cpp
index c04ccae3..d183329c 100644
--- a/kernel/bdgl_sieve.cpp
+++ b/kernel/bdgl_sieve.cpp
@@ -20,7 +20,6 @@
 
 #include "siever.h"
 #include "fht_lsh.h"
-#include
 #include
 #include
 #include
@@ -308,7 +307,7 @@ void Siever::bdgl_queue_create_task( const size_t t_id, const std::vector

diff --git a/kernel/fht_lsh.cpp b/kernel/fht_lsh.cpp
--- a/kernel/fht_lsh.cpp
+++ b/kernel/fht_lsh.cpp
@@ ... @@ template<> void ProductLSH::hash_templated<2>(const float * const vv, int32_t *
 {
   int32_t h0[multi_hash_block], h1[multi_hash_block];
   float c0[multi_hash_block], c1[multi_hash_block];
-  float c[multi_hash] = {0};
-
+  float c[multi_hash];
+  memset(&c, 0, sizeof(float) * multi_hash);
+
   // Now hash against the two subcode blocks.
   lshs[0].hash(&(vv[0]), c0, h0);
   lshs[1].hash(&(vv[is[1]]), c1, h1);

diff --git a/kernel/simd.h b/kernel/simd.h
index 82948d12..60fb7573 100644
--- a/kernel/simd.h
+++ b/kernel/simd.h
@@ -4,8 +4,9 @@
 
 #include "g6k_config.h"
 #include
+#ifdef HAVE_AVX2
 #include <immintrin.h>
-#include
+#endif
 
 /**
   Simd. This namespace provides access to a variety of low-level SIMD routines
@@ -208,63 +209,63 @@ inline VecType build_vec_type(const int16_t in) {
 // Masks for various operations.
 // We only compile the ones we'll use.
-#ifdef COMPILE_AVX2
-constexpr __m256i mixmask_threshold = _mm256_set_epi16(
+#ifdef HAVE_AVX2
+static const __m256i mixmask_threshold = _mm256_set_epi16(
     0x5555, 0x5555, 0x5555, 0x5555, 0x5555, 0x5555, 0x5555, 0x5555,
     0xAAAA, 0xAAAA, 0xAAAA, 0xAAAA, 0xAAAA, 0xAAAA, 0xAAAA, 0xAAAA);
 
-constexpr __m256i _7FFF_epi16 = _mm256_set_epi16(
+static const __m256i _7FFF_epi16 = _mm256_set_epi16(
     0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF,
     0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF, 0x7FFF);
 
-constexpr __m256i sign_mask_2 = _mm256_set_epi16(
+static const __m256i sign_mask_2 = _mm256_set_epi16(
    0xFFFF, 0x0001, 0xFFFF, 0x0001, 0xFFFF, 0x0001, 0xFFFF, 0x0001,
    0xFFFF, 0x0001, 0xFFFF, 0x0001, 0xFFFF, 0x0001, 0xFFFF, 0x0001);
 
-constexpr __m256i mask_even_epi16 = _mm256_set_epi16(
+static const __m256i mask_even_epi16 = _mm256_set_epi16(
     0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000,
     0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000);
 
-constexpr __m256i mask_odd_epi16 = _mm256_set_epi16(
+static const __m256i mask_odd_epi16 = _mm256_set_epi16(
     0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF,
     0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF, 0x0000, 0xFFFF);
 
-constexpr __m256i regroup_for_max = _mm256_set_epi8(
+static const __m256i regroup_for_max = _mm256_set_epi8(
     0x0F, 0x0E, 0x07, 0x06, 0x0D, 0x0C, 0x05, 0x04,
     0x0B, 0x0A, 0x03, 0x02, 0x09, 0x08, 0x01, 0x00,
     0x1F, 0x1E, 0x17, 0x16, 0x1D, 0x1C, 0x15, 0x14,
     0x1B, 0x1A, 0x13, 0x12, 0x19, 0x18, 0x11, 0x10);
 
-constexpr __m256i sign_mask_8 = _mm256_set_epi16(
+static const __m256i sign_mask_8 = _mm256_set_epi16(
     0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
     0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001);
 
-constexpr __m256i sign_shuffle = _mm256_set_epi16(
+static const __m256i sign_shuffle = _mm256_set_epi16(
     0xFFFF, 0xFFFF, 0xFFFF, 0x0001, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
     0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0xFFFF, 0xFFFF);
 
-constexpr __m256i indices_epi8 = _mm256_set_epi8(
+static const __m256i indices_epi8 = _mm256_set_epi8(
     0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
     0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
     0x1F, 0x1E, 0x1D, 0x1C, 0x1B, 0x1A, 0x19, 0x18,
     0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10);
 
-constexpr __m256i indices_epi16 = _mm256_set_epi16(
+static const __m256i indices_epi16 = _mm256_set_epi16(
     0x000F, 0x000E, 0x000D, 0x000C, 0x000B, 0x000A, 0x0009, 0x0008,
     0x0007, 0x0006, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001, 0x0000);
 
-constexpr __m256i indices_sa1_epi16 = _mm256_set_epi16(
+static const __m256i indices_sa1_epi16 = _mm256_set_epi16(
     0x0010, 0x000F, 0x000E, 0x000D, 0x000C, 0x000B, 0x000A, 0x0009,
     0x0008, 0x0007, 0x0006, 0x0005, 0x0004, 0x0003, 0x0002, 0x0001);
 
-constexpr __m256i _0010_epi16 = _mm256_set_epi16(
+static const __m256i _0010_epi16 = _mm256_set_epi16(
     0x0010, 0x0010, 0x0010, 0x0010, 0x0010, 0x0010, 0x0010, 0x0010,
     0x0010, 0x0010, 0x0010, 0x0010, 0x0010, 0x0010, 0x0010, 0x0010);
 
-constexpr __m256i rnd_mult_epi32 =
+static const __m256i rnd_mult_epi32 =
     _mm256_set_epi32(0xF010A011, 0x70160011, 0x70162011, 0x00160411,
                      0x0410F011, 0x02100011, 0xF0160011, 0x00107010);
 
 // 0xFFFF = -1, 0x0001 = 1
-constexpr __m256i negation_masks_epi16[2] = {
+static const __m256i negation_masks_epi16[2] = {
     _mm256_set_epi16(0xFFFF, 0x0001, 0xFFFF, 0xFFFF, 0xFFFF, 0x0001, 0x0001,
                      0xFFFF, 0xFFFF, 0x0001, 0xFFFF, 0x0001, 0xFFFF, 0xFFFF,
                      0x0001, 0xFFFF),
@@ -272,7 +273,7 @@ constexpr __m256i negation_masks_epi16[2] = {
                      0xFFFF, 0xFFFF, 0x0001, 0xFFFF, 0x0001, 0xFFFF, 0xFFFF,
                      0x0001, 0xFFFF)};
 
-constexpr __m256i permutations_epi16[4] = {
+static const __m256i permutations_epi16[4] = {
     _mm256_set_epi16(0x0F0E, 0x0706, 0x0100, 0x0908, 0x0B0A, 0x0D0C, 0x0504,
                      0x0302, 0x0706, 0x0F0E, 0x0504, 0x0302, 0x0B0A, 0x0908,
                      0x0D0C, 0x0100),
@@ -286,7 +287,7 @@ constexpr __m256i permutations_epi16[4] = {
                      0x0B0A, 0x0302, 0x0100, 0x0504, 0x0B0A, 0x0908, 0x0706,
                      0x0F0E, 0x0D0C)};
 
-constexpr __m256i tailmasks[16] = {
+static const __m256i tailmasks[16] = {
     _mm256_set_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
                      0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
                      0xFFFF, 0xFFFF),
@@ -555,10 +556,10 @@ inline VecType m256_cmpgt_epi16(const VecType a, const VecType b);
 
 /**
  * m256_slli_epi16. This function accepts a vector `a` and shifts each word in
  * `a` left by `count` many bits. This function mimics exactly the behaviour of
- * _mm256_slli_epi16. \param[in] a: the vector to shift. \param[in] count: the
- * amount to shift by. \return a << count.
+ * _mm256_slli_epi16. \tparam pos: the number of positions to shift by. \param[in] a: the vector to shift. \return a << pos.
  */
-inline VecType m256_slli_epi16(const VecType a, const int count);
+template <int pos>
+inline VecType m256_slli_epi16(const VecType a);
 
 /**
   m256_hadd_epi16. Accepts two vectors `a` and `b` and emulates the
@@ -620,10 +621,10 @@ inline SmallVecType m128_xor_si128(const SmallVecType a, const SmallVecType b);
 
 /**
  * m128_slli_epi64. This function accepts a vector `a` and shifts each quadword
  * in `a` left by `count` many bits. This function mimics exactly the behaviour
- * of _mm_slli_epi64. \param[in] a: the vector to shift. \param[in] count: the
- * amount to shift by. \return a << count.
+ * of _mm_slli_epi64. \tparam pos: the amount to shift by. \param[in] a: the vector to shift. \return a << pos.
  */
-inline SmallVecType m128_slli_epi64(const SmallVecType a, const int pos);
+template <int pos>
+inline SmallVecType m128_slli_epi64(const SmallVecType a);
 
 /**
  * m128_srli_epi64. This function accepts a vector `a` and shifts each quadword
  * in `a` right by `count` many bits. This function mimics exactly the behaviour

diff --git a/kernel/simd.inl b/kernel/simd.inl
index fe5898c8..23406191 100644
--- a/kernel/simd.inl
+++ b/kernel/simd.inl
@@ -5,12 +5,27 @@
 
 #include <cstring>
 // Needed for memcpy. If (for some unknown reason) this is prohibitive you can instead
 // use __builtin_memcpy.
 
-inline Simd::SmallVecType Simd::m128_slli_epi64(const SmallVecType a,
-                                                const int mask) {
+
+// Clang and GCC don't have a shared syntax for built-in shuffling.
+// This was fixed in GCC-12 and later, but GCC's shuffle expects a vector mask,
+// whereas Clang needs a compile-time known list of indices.
+#ifdef __clang__
+#define SHUFFLE16(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__)
+// We just re-use SHUFFLE16 here, since __builtin_shufflevector is variadic.
+#define SHUFFLE8(a, b, ...) SHUFFLE16(a, b, __VA_ARGS__)
+#define SHUFFLE4(a, b, ...) SHUFFLE16(a, b, __VA_ARGS__)
+#elif defined(__GNUG__)
+#define SHUFFLE16(a, b, ...) __builtin_shuffle(a, b, Vec16s{__VA_ARGS__})
+#define SHUFFLE8(a, b, ...) __builtin_shuffle(a, b, Vec8s{__VA_ARGS__})
+#define SHUFFLE4(a, b, ...) __builtin_shuffle(a, b, Vec4q{__VA_ARGS__})
+#endif
+
+template <int pos>
+inline Simd::SmallVecType Simd::m128_slli_epi64(const SmallVecType a) {
 #ifdef HAVE_AVX2
-  return _mm_slli_epi64(a, mask);
+  return _mm_slli_epi64(a, pos);
 #else
-  return a << mask;
+  return a << pos;
 #endif
 }
 
@@ -40,7 +55,10 @@ inline Simd::SmallVecType Simd::m128_set_epi64x(const int64_t e1,
   // NOTE the swap: this is for endianness.
   // All else acts exactly the same, this is just the one weird bit of
   // inconsistency.
-  return (SmallVecType)(Vec2q{e0, e1});
+  int64_t arr[]{e0, e1};
+  SmallVecType vec;
+  memcpy(&vec, &arr, sizeof(arr));
+  return vec;
 #endif
 }
 
@@ -52,7 +70,10 @@ inline Simd::SmallVecType Simd::m128_set_epi64x(const uint64_t e1,
   // NOTE the swap: this is for endianness.
   // All else acts exactly the same, this is just the one weird bit of
   // inconsistency.
-  return (SmallVecType)(Vec2uq{e0, e1});
+  uint64_t arr[]{e0, e1};
+  SmallVecType vec;
+  memcpy(&vec, &arr, sizeof(arr));
+  reutrn vec;
 #endif
 }
 
@@ -69,13 +90,12 @@ inline Simd::VecType Simd::m256_hadd_epi16(const Simd::VecType a,
   // do another shuffle at the end to recombine these results into something
   // useful.
 
-  static constexpr Vec16s hadd_shift_mask_epi16 = {
-      0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30};
-  static constexpr Vec16s shift_right_1_epi16 = {1, 2, 3, 4, 5, 6, 7, 8,
-                                                 9, 10, 11, 12, 13, 14, 15, 0};
+#define SHIFT_RIGHT_1_MASK 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0
+#define HADD_SHIFT_MASK 0,2,4,6,16,18,20,22,8,10,12,14,24,26,28,30
 
-  const auto a1 = __builtin_shuffle(a, shift_right_1_epi16);
-  const auto b1 = __builtin_shuffle(b, shift_right_1_epi16);
+  const auto a1 = SHUFFLE16(a, a, SHIFT_RIGHT_1_MASK);
+  const auto b1 = SHUFFLE16(b, b, SHIFT_RIGHT_1_MASK);
+
   // a2 = (a[0] + a[1], a[1] + a[2] , a[2] + a[3], a[3] + a[4],
   //       a[4] + a[5], a[5] + a[6] , a[6] + a[7], a[7] + a[8],
@@ -89,7 +109,9 @@ inline Simd::VecType Simd::m256_hadd_epi16(const Simd::VecType a,
   // The mask works by shuffling mod the length of the vector.
   // This means that (for example) a value of `18` refers to b2[2], whereas `2`
   // refers to a2[2].
-  return __builtin_shuffle(a2, b2, hadd_shift_mask_epi16);
+  return SHUFFLE16(a2, b2, HADD_SHIFT_MASK);
+#undef SHIFT_RIGHT_1_MASK
+#undef HADD_SHIFT_MASK
 #endif
 }
 
@@ -98,7 +120,14 @@ inline Simd::SmallVecType Simd::m128_add_epi64(const SmallVecType a,
 #ifdef HAVE_AVX2
   return _mm_add_epi64(a, b);
 #else
-  return (SmallVecType)((Vec2uq)(a) + (Vec2uq)(b));
+  Vec2q a_as_q, b_as_q;
+  memcpy(&a_as_q, &a, sizeof(a));
+  memcpy(&b_as_q, &b, sizeof(b));
+  const auto c = a_as_q + b_as_q;
+  SmallVecType result;
+  static_assert(sizeof(result) == sizeof(c), "Error: sizeof(SmallVecType) != sizeof(Vec2q)");
+  memcpy(&result, &c, sizeof(result));
+  return result;
 #endif
 }
 
@@ -127,12 +156,12 @@ inline Simd::SmallVecType Simd::m128_xor_si128(const SmallVecType a,
 #endif
 }
 
-inline Simd::SmallVecType Simd::m128_srli_epi64(const SmallVecType a,
-                                                const int pos) {
+template <int pos>
+inline Simd::SmallVecType Simd::m128_srli_epi64(const SmallVecType a) {
 #ifdef HAVE_AVX2
   return _mm_srli_si128(a, pos);
 #else
-  return (SmallVecType)(((Vec2q)a) >> pos);
+  return a >> pos;
 #endif
 }
 
@@ -164,10 +193,18 @@ inline Simd::VecType Simd::m256_permute4x64_epi64(const VecType a) {
   // You could do this with a lookup table (it would only require a bit of
   // storage) but it's probably not worth it: this is just a general function.
 
-  constexpr Vec4q temp_mask{mask & 3, (mask & 12) >> 2, (mask & 48) >> 4,
-                            (mask & 192) >> 6};
-  return reinterpret_cast<VecType>(
-      __builtin_shuffle(reinterpret_cast<Vec4q>(a), temp_mask));
+  // NOTE: the memcpying here is a type-punning guard. Technically reinterpret casting
+  // here would be UB (GCC has an extension for this, but Clang doesn't).
+  // The compiler knows that this isn't actually an operation that it needs to do:
+  // the backend optimises out the memcpys (see https://godbolt.org/z/zxKvnKdeP: the code
+  // for this function compiles down to a vectorised permutation).
+  Vec4q tmp_a;
+  memcpy(&tmp_a, &a, sizeof(tmp_a));
+  const auto res = SHUFFLE4(tmp_a, tmp_a, mask & 3, (mask & 12) >> 2,
+                            (mask & 48) >> 4, (mask & 192) >> 6);
+  VecType out;
+  memcpy(&out, &res, sizeof(out));
+  return out;
 #endif
 }
 
@@ -176,15 +213,7 @@ inline Simd::VecType
 Simd::m256_permute4x64_epi64_for_hadamard(const VecType a) {
 #ifdef HAVE_AVX2
   return _mm256_permute4x64_epi64(a, 0b01001110);
 #else
-  // The shuffle vector should be built at compile-time.
-  static constexpr int64_t arr[4] = {1, 0, 3, 2};
-  // NOTE: for some unknown reason *this* needs to be backwards, even if the
-  // version for the general shuffle doesn't need to be backwards.
-  // My guess is that it's to do with the endianness of `a` but I've
-  // got no idea beyond that.
-  constexpr Vec4q mask{arr[3], arr[2], arr[1], arr[0]};
-  return reinterpret_cast<VecType>(
-      __builtin_shuffle(reinterpret_cast<Vec4q>(a), mask));
+  return m256_permute4x64_epi64<0b01001110>(a);
 #endif
 }
 
@@ -194,18 +223,14 @@ inline int Simd::m256_testz_si256(const VecType a, const VecType b) {
 #else
   // This doesn't have a neat implementation.
   // Basically, GCC's == operator produces a vector as a result, which is really
-  // useful in most cases (but not here). To hack around this we need to cast
-  // each part to a __int128_t and compare against zero.
+  // useful in most cases (but not here).
+  // To get around this, we just copy into a fixed-size array and compare against the all-zero array.
+  // Again, the copy here is a type-punning guard.
   const auto res = a & b;
-
-  Vec8s p1, p2;
-  memcpy(&p1, &res, sizeof(p1));
-  memcpy(&p2, &res[8], sizeof(p2));
-
-  __int128_t lhs = reinterpret_cast<__int128_t>(p1);
-  __int128_t rhs = reinterpret_cast<__int128_t>(p2);
-
-  return (lhs == 0) & (rhs == 0);
+  constexpr static std::array<int16_t, 16> zero{0};
+  std::array<int16_t, 16> res_as_16;
+  memcpy(&res_as_16, &res, sizeof(res_as_16));
+  return memcmp(&res_as_16, &zero, sizeof(zero)) == 0;
 #endif
 }
 
@@ -227,7 +252,9 @@ template <int pos>
 inline int64_t Simd::m256_extract_epi64(const VecType a) {
 #ifdef HAVE_AVX2
   return _mm256_extract_epi64(a, pos);
 #else
-  return ((Vec4q)a)[pos];
+  int64_t arr[4];
+  memcpy(&arr, &a, sizeof(a));
+  return arr[pos];
 #endif
 }
 
@@ -237,7 +264,10 @@ inline int64_t Simd::m128_extract_epi64(const SmallVecType a) {
 #ifdef HAVE_AVX2
   return _mm_extract_epi64(a, pos);
 #else
-  return ((Vec2q)a)[pos];
+  int64_t arr[2];
+  static_assert(sizeof(arr) == sizeof(a), "Error: sizeof(arr) != sizeof(a)");
+  memcpy(&arr, &a, sizeof(a));
+  return arr[pos];
 #endif
 }
 
@@ -288,11 +318,12 @@ inline Simd::VecType Simd::m256_sign_epi16(const VecType a,
 #endif
 }
 
-inline Simd::VecType Simd::m256_slli_epi16(const VecType a, const int count) {
+template <int pos>
+inline Simd::VecType Simd::m256_slli_epi16(const VecType a) {
 #ifdef HAVE_AVX2
-  return _mm256_slli_epi16(a, count);
+  return _mm256_slli_epi16(a, pos);
 #else
-  return a << count;
+  return a << pos;
 #endif
 }
 
@@ -308,13 +339,13 @@ inline Simd::VecType Simd::m256_broadcastsi128_si256(const SmallVecType in) {
 #ifdef HAVE_AVX2
   return _mm256_broadcastsi128_si256(in);
 #else
-  // The simple solution here is to copy all of the elements of `in` in order
-  // into a new vector, but that's really slow and GCC produces _awful_ object
-  // code. A better solution (although still slower than the ideal case) is to
-  // use memcpy, since GCC seems to do better there: I have no idea why.
+  // N.B. we need to copy into an array here, since Clang doesn't let us do
+  // element-wise access into vectors.
+  std::array<int16_t, 16> out_as_arr;
+  memcpy(&out_as_arr, &in, sizeof(in));
+  memcpy(&out_as_arr[8], &in, sizeof(in));
   Vec16s out;
-  memcpy(&out[0], &in[0], sizeof(Vec8s));
-  memcpy(&out[8], &in[0], sizeof(Vec8s));
+  memcpy(&out, &out_as_arr, sizeof(out));
   return out;
 #endif
 }
 
@@ -323,26 +354,58 @@ inline Simd::SmallVecType Simd::m128_shuffle_epi8(const SmallVecType in,
                                                   const SmallVecType mask) {
 #ifdef HAVE_AVX2
   return _mm_shuffle_epi8(in, mask);
-#else
+#elif defined(__GNUG__) && !defined(__clang__)
+  // We have separate code here for GCC and Clang. The reason why
+  // is because Clang's shuffle doesn't support variable inputs,
+  // and GCC's shuffle would be pessimised by just using the generic code.
+
   // The mm_shuffle_epi8 intrinsic is a bit weird.
   // First of all, we need to extract the lowest 4 bits of each word (since
   // there's only 16 options this is all we're allowed). We then shuffle
   // according to that.
-  const auto shuffle_mask = reinterpret_cast<Vec16c>(mask) & 15;
-  // So now we've gotten that match, we'll want to make the shuffle. Sounds
-  // easy, right?
-  const auto intermediate =
-      __builtin_shuffle(reinterpret_cast<Vec16c>(in), shuffle_mask);
+  Vec16c shuffle_mask, in_as_16c;
+  memcpy(&shuffle_mask, &mask, sizeof(shuffle_mask));
+  shuffle_mask = shuffle_mask & 15;
+  memcpy(&in_as_16c, &in, sizeof(in));
+
+  // We'll use GCC's shuffle here since we're inside the GNUG block.
+  const auto intermediate = __builtin_shuffle(in_as_16c, shuffle_mask);
 
-  // Aha! Gotcha.
   // It turns out the mm_shuffle_epi8 intrinsic is a bit weird.
   // Essentially, if the top-most bit of `mask[i]` is set then `out[i] == 0`.
+  Vec16uc mask_as_uc;
+  memcpy(&mask_as_uc, &mask, sizeof(mask_as_uc));
+  const auto gt_64 = mask_as_uc & 0x80;
   const auto gt_64 = reinterpret_cast<Vec16uc>(mask) & 0x80;
 
   // And now if the element is > 64 we choose 0, otherwise we choose the
   // shuffled version
   const auto result = gt_64 ? 0 : intermediate;
-  return reinterpret_cast<SmallVecType>(result);
+  SmallVecType res;
+  memcpy(&res, &result, sizeof(result));
+  return res;
+#else
+  // We'll shuffle by hand.
+  std::array in_arr, mask_arr, out_arr;
+  static_assert(sizeof(in_arr) == sizeof(SmallVecType), "Error: wrong array size for copy");
+  memcpy(&in_arr, &in, sizeof(in_arr));
+  memcpy(&mask_arr, &mask, sizeof(mask_arr));
+  for(unsigned i = 0; i < 16; i++) {
+    out_arr[i] = in_arr[mask_arr[i] & 15];
+  }
+
+  // Now we've shuffled we'll go back to explicitly vectorised instructions.
+  Vec16uc intermediate;
+  memcpy(&intermediate, &out_arr, sizeof(intermediate));
+
+  // We'll also now do the same masking as in the GNUG block.
+  Vec16uc mask_as_uc;
+  memcpy(&mask_as_uc, &mask, sizeof(mask));
+  const auto gt_64 = mask_as_uc & 0x80;
+  const auto result = gt_64 ? 0 : intermediate;
+  SmallVecType res;
+  memcpy(&res, &result, sizeof(result));
+  return res;
 #endif
 }
 
@@ -351,31 +414,36 @@ inline Simd::VecType Simd::m256_shuffle_epi8(const VecType in,
                                              const VecType mask) {
 #ifdef HAVE_AVX2
   return _mm256_shuffle_epi8(in, mask);
 #else
-  // WARNING: you cannot use the native __builtin_shuffle here.
-  // As tempting as it might seem, the reason why is that __builtin_shuffle
+  // WARNING: you cannot use the native shuffle here.
+  // As tempting as it might seem, the reason why is that shuffle
   // lets you do cross-lane shuffles, whereas the _mm256_shuffle_epi8 intrinsic
   // does not. To fix this problem, we sub-divide: we deal with each 128-bit
   // segment separately and then re-combine at the end.
-  Vec16s result;
+  std::array<int16_t, 16> mask_as_arr, in_as_arr;
+  static_assert(sizeof(mask_as_arr) == sizeof(in), "Error: wrong vector size for copy");
+  memcpy(&mask_as_arr, &mask, sizeof(mask_as_arr));
+  memcpy(&in_as_arr, &in, sizeof(in_as_arr));
+
+  SmallVecType first_mask, last_mask;
   Vec8s first, last;
-  Vec16c first_mask, last_mask;
   // NOTE: the compiler is likely to turn these into moves, since these
   // variables are most likely in registers.
-  memcpy(&first, &in, sizeof(Vec8s));
-  memcpy(&last, &in[8], sizeof(Vec8s));
-  memcpy(&first_mask, &mask, sizeof(Vec8s));
-  memcpy(&last_mask, &mask[8], sizeof(Vec8s));
-
-  // Delegate to the 128-bit version.
-  auto res_1 = Simd::m128_shuffle_epi8(
-      first, reinterpret_cast<SmallVecType>(first_mask));
-  auto res_2 =
-      Simd::m128_shuffle_epi8(last, reinterpret_cast<SmallVecType>(last_mask));
-
-  // Same caveat as above.
-  memcpy(&result, &res_1, sizeof(Vec8s));
-  memcpy(&result[8], &res_2, sizeof(Vec8s));
+  memcpy(&first, &in_as_arr, sizeof(Vec8s));
+  memcpy(&last, &in_as_arr[8], sizeof(Vec8s));
+  memcpy(&first_mask, &mask_as_arr, sizeof(Vec8s));
+  memcpy(&last_mask, &mask_as_arr[8], sizeof(Vec8s));
+
+  const auto res_1 = Simd::m128_shuffle_epi8(first, first_mask);
+  const auto res_2 = Simd::m128_shuffle_epi8(last, last_mask);
+
+  std::array<int16_t, 16> out_as_arr;
+  memcpy(&out_as_arr, &res_1, sizeof(res_1));
+  memcpy(&out_as_arr[8], &res_2, sizeof(res_2));
+  VecType result;
+  memcpy(&result, &out_as_arr, sizeof(result));
   return result;
 #endif
 }
 
@@ -447,15 +515,12 @@ inline Simd::SmallVecType Simd::m128_random_state(SmallVecType prg_state,
 #else
   // Silence the fact it isn't used.
   (void)key;
+  (void)prg_state;
 
-  SmallVecType s1 = prg_state;
-  const SmallVecType s0 = *extra_state;
-
-  s1 = m128_xor_si128(s1, m128_slli_epi64(s1, 23));
-  *extra_state = m128_xor_si128(
-      m128_xor_si128(m128_xor_si128(s1, s0), m128_srli_epi64(s1, 5)),
-      m128_srli_epi64(s0, 5));
-  return m128_add_epi64(*extra_state, s0);
+  // Rand should be fine here: it stops us falling into random cycles.
+  *extra_state = m128_set_epi64x(static_cast<int64_t>(rand()),
+                                 static_cast<int64_t>(rand()));
+  return *extra_state;
 #endif
 }
 
@@ -543,21 +608,21 @@ inline void Simd::m256_permute_epi16(VecType *const v, SmallVecType &prg_state,
   m256_mix(v[0], v[1], tmp);
   // Shift the randomness around before extracting more (somewhat independent)
   // mixing bits
-  rnd = m256_slli_epi16(rnd, 1);
+  rnd = m256_slli_epi16<1>(rnd);
   // Now do random swaps between v[0] and v[last-1]
   m256_mix(v[0], v[regs_ - 2], tmp);
-  rnd = m256_slli_epi16(rnd, 1);
+  rnd = m256_slli_epi16<1>(rnd);
   // Now do swaps between v[1] and v[last], avoiding padding data
   m256_mix(v[1], v[regs_ - 1], tailmask);
 
   // More permuting
   for (int i = 2; i + 2 < regs_; i += 2) {
-    rnd = m256_slli_epi16(rnd, 1);
+    rnd = m256_slli_epi16<1>(rnd);
     tmp = m256_cmpgt_epi16(rnd, reinterpret_cast<VecType>(mixmask_threshold));
     m256_mix(v[0], v[i], tmp);
-    rnd = m256_slli_epi16(rnd, 1);
+    rnd = m256_slli_epi16<1>(rnd);
     tmp = m256_cmpgt_epi16(rnd, reinterpret_cast<VecType>(mixmask_threshold));
     m256_mix(v[1], v[i + 1], tmp);
   }
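
Note on the idiom above (illustration only; not part of the series): every reinterpret_cast that patch 1 replaces becomes a memcpy between a vector and plain storage, which is the well-defined way to type-pun in C++ and compiles down to a register move. A minimal self-contained sketch, assuming GCC/Clang vector extensions and typedefs shaped like the kernel's Vec16s/Vec4q:

    #include <cstdint>
    #include <cstring>

    typedef int16_t Vec16s __attribute__((vector_size(32)));
    typedef int64_t Vec4q __attribute__((vector_size(32)));

    // Reinterpret the bits of a Vec16s as a Vec4q without UB: memcpy is the
    // sanctioned pun, and both GCC and Clang lower it to a plain move.
    inline Vec4q pun_to_q(const Vec16s v) {
      static_assert(sizeof(Vec4q) == sizeof(Vec16s), "pun requires equal sizes");
      Vec4q out;
      std::memcpy(&out, &v, sizeof(out));
      return out;
    }
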
From 52b9a6d73189ac5d2bd62f53e8f515a1aa18aea3 Mon Sep 17 00:00:00 2001
From: Joe Rowell
Date: Sun, 17 Jul 2022 15:45:36 +0100
Subject: [PATCH 2/4] Small changes for building on ARM.

---
 kernel/bdgl_sieve.cpp | 3 ++-
 kernel/simd.h         | 5 +++--
 kernel/simd.inl       | 4 ++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/bdgl_sieve.cpp b/kernel/bdgl_sieve.cpp
index d183329c..e4c8304b 100644
--- a/kernel/bdgl_sieve.cpp
+++ b/kernel/bdgl_sieve.cpp
@@ -306,8 +306,9 @@ void Siever::bdgl_queue_create_task( const size_t t_id, const std::vector(insert_after+params.threads*write_index))].len / REDUCE_LEN_MARGIN, transaction_db, write_index, queue[index].len, queue[index].sign);
         if( write_index < 0 ){
             std::cerr << "Spilling full transaction db" << t_id << " " << Q-index << std::endl;

diff --git a/kernel/simd.h b/kernel/simd.h
index 60fb7573..826c9b43 100644
--- a/kernel/simd.h
+++ b/kernel/simd.h
@@ -628,10 +628,11 @@
 /**
  * m128_srli_epi64. This function accepts a vector `a` and shifts each quadword
  * in `a` right by `count` many bits. This function mimics exactly the behaviour
- * of _mm_srli_epi64. \param[in] a: the vector to shift. \param[in] count: the
- * amount to shift by. \return a >> count.
+ * of _mm_srli_epi64. \tparam pos: the number of positions to shift by. \param[in] a: the vector to shift. \return a >> pos.
  */
-inline SmallVecType m128_srli_epi64(const SmallVecType a, const int pos);
+template <int pos>
+inline SmallVecType m128_srli_epi64(const SmallVecType a);
 
 /**
   m256_and_si256. This function accepts two vectors `a` and `b` and returns a

diff --git a/kernel/simd.inl b/kernel/simd.inl
index 23406191..9ccc13c1 100644
--- a/kernel/simd.inl
+++ b/kernel/simd.inl
@@ -73,7 +73,7 @@ inline Simd::SmallVecType Simd::m128_set_epi64x(const uint64_t e1,
   uint64_t arr[]{e0, e1};
   SmallVecType vec;
   memcpy(&vec, &arr, sizeof(arr));
-  reutrn vec;
+  return vec;
 #endif
 }
 
@@ -386,7 +386,7 @@ inline Simd::SmallVecType Simd::m128_shuffle_epi8(const SmallVecType in,
   return res;
 #else
   // We'll shuffle by hand.
-  std::array in_arr, mask_arr, out_arr;
+  std::array in_arr, mask_arr, out_arr;
   static_assert(sizeof(in_arr) == sizeof(SmallVecType), "Error: wrong array size for copy");
   memcpy(&in_arr, &in, sizeof(in_arr));
   memcpy(&mask_arr, &mask, sizeof(mask_arr));
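
Aside (illustration only; not part of the series): the SHUFFLE16 macro that patch 1 introduces papers over the two compilers' builtins — Clang's variadic __builtin_shufflevector versus GCC's __builtin_shuffle, which takes a vector of indices. A minimal sketch of the same trick, using the rotate-by-one index list that m256_hadd_epi16 names SHIFT_RIGHT_1_MASK:

    #include <cstdint>

    typedef int16_t Vec16s __attribute__((vector_size(32)));

    #ifdef __clang__
    #define SHUFFLE16(a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__)
    #else
    #define SHUFFLE16(a, b, ...) __builtin_shuffle(a, b, Vec16s{__VA_ARGS__})
    #endif

    // Rotate the sixteen words of `a` left by one position. Indices 0..15
    // select from the first argument, 16..31 from the second.
    inline Vec16s rotate_left_1(const Vec16s a) {
      return SHUFFLE16(a, a, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0);
    }
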
From 15efe9d9aab377ae39b0ce82c16546c6441b0791 Mon Sep 17 00:00:00 2001
From: Joe Rowell
Date: Mon, 10 Oct 2022 10:56:19 +0100
Subject: [PATCH 3/4] Remove spurious initialisation.

---
 kernel/simd.inl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/simd.inl b/kernel/simd.inl
index 9ccc13c1..d66f3168 100644
--- a/kernel/simd.inl
+++ b/kernel/simd.inl
@@ -376,7 +376,6 @@ inline Simd::SmallVecType Simd::m128_shuffle_epi8(const SmallVecType in,
   Vec16uc mask_as_uc;
   memcpy(&mask_as_uc, &mask, sizeof(mask_as_uc));
   const auto gt_64 = mask_as_uc & 0x80;
-  const auto gt_64 = reinterpret_cast<Vec16uc>(mask) & 0x80;
 
   // And now if the element is > 64 we choose 0, otherwise we choose the
   // shuffled version

From e848580bec1c0b2e4d6e38d7340653ad12a1f15d Mon Sep 17 00:00:00 2001
From: Joe Rowell
Date: Sun, 17 Jul 2022 15:37:25 +0100
Subject: [PATCH 4/4] Stability changes to SIMD code.

---
 kernel/fht_lsh.cpp | 1 +
 kernel/simd.inl    | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/fht_lsh.cpp b/kernel/fht_lsh.cpp
index 1f40224c..9a723b4d 100644
--- a/kernel/fht_lsh.cpp
+++ b/kernel/fht_lsh.cpp
@@ -19,6 +19,7 @@
 #include "g6k_config.h"
 #include "fht_lsh.h"
+#include <cstring>
 
 // Please note that this file originally came from:
 // https://github.com/lducas/AVX2-BDGL-bucketer commit 630c2286a440fae1eddd9f90341ff2020f18b614

diff --git a/kernel/simd.inl b/kernel/simd.inl
index d66f3168..be40c6b0 100644
--- a/kernel/simd.inl
+++ b/kernel/simd.inl
@@ -375,7 +375,7 @@ inline Simd::SmallVecType Simd::m128_shuffle_epi8(const SmallVecType in,
   // Essentially, if the top-most bit of `mask[i]` is set then `out[i] == 0`.
   Vec16uc mask_as_uc;
   memcpy(&mask_as_uc, &mask, sizeof(mask_as_uc));
-  const auto gt_64 = mask_as_uc & 0x80;
+  const auto gt_64 = reinterpret_cast<Vec16uc>(mask) & 0x80;
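
For reference (illustration only; not from the patches): the scalar contract that the non-AVX2 m128_shuffle_epi8 fallback must honour across all four patches is the documented _mm_shuffle_epi8 behaviour, i.e. out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 15]. A sketch of that reference semantics, usable as a test oracle for the fallback:

    #include <array>
    #include <cstdint>

    // Reference semantics of _mm_shuffle_epi8 on plain bytes: a set top bit
    // zeroes the lane, otherwise the low four mask bits pick the source byte.
    std::array<uint8_t, 16> shuffle_epi8_ref(const std::array<uint8_t, 16> &in,
                                             const std::array<uint8_t, 16> &mask) {
      std::array<uint8_t, 16> out{};
      for (unsigned i = 0; i < 16; i++) {
        out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 15];
      }
      return out;
    }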