From bbe4f3bfd5ac543ed805c58cacac922f0ccf70f2 Mon Sep 17 00:00:00 2001
From: "zhuna.1024"
Date: Thu, 27 Oct 2022 22:08:07 +0800
Subject: [PATCH] rvv simd acceleration

---
 CMakeLists.txt    | 10 ++++++++++
 cmake/config.h.in |  3 +++
 snappy-internal.h | 34 +++++++++++++++++++++++++++++++---
 snappy.cc         | 44 ++++++++++++++++++++++++++++++++++++++------
 4 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6eef485..0c6a0dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,6 +187,16 @@ int main() {
   return 0;
 }" SNAPPY_HAVE_NEON)
 
+check_cxx_source_compiles("
+#include <riscv_vector.h>
+int main() {
+  uint8_t val = 3, dup[8];
+  vuint8m1_t v = vmv_v_x_u8m1(val, 128);
+  vuint64m1_t vv = vreinterpret_v_u8m1_u64m1(v);
+  vse64_v_u64m1(reinterpret_cast<uint64_t*>(dup), vv, 128);
+  return 0;
+}" SNAPPY_HAVE_RVV)
+
 include(CheckSymbolExists)
 check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP)
 check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 5ea2b5a..4f0b87e 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -56,4 +56,7 @@
    first (like Motorola and SPARC, unlike Intel and VAX). */
 #cmakedefine01 SNAPPY_IS_BIG_ENDIAN
 
+/* Define to 1 if you target processors with RVV and have <riscv_vector.h>. */
+#cmakedefine01 SNAPPY_HAVE_RVV
+
 #endif  // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
diff --git a/snappy-internal.h b/snappy-internal.h
index e552ea0..017bcaa 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -44,7 +44,11 @@
 #include <arm_neon.h>
 #endif
 
-#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
+#if SNAPPY_HAVE_RVV
+#include <riscv_vector.h>
+#endif
+
+#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON || SNAPPY_HAVE_RVV
 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
 #else
 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
@@ -58,6 +62,8 @@ namespace internal {
 using V128 = __m128i;
 #elif SNAPPY_HAVE_NEON
 using V128 = uint8x16_t;
+#elif SNAPPY_HAVE_RVV
+using V128 = vuint8m1_t;
 #endif
 
 // Load 128 bits of integer data. `src` must be 16-byte aligned.
@@ -108,7 +114,29 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
 }
 
 inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
-#endif
+
+#elif SNAPPY_HAVE_RVV
+inline V128 V128_Load(const V128* src) {
+  return vle8_v_u8m1(reinterpret_cast<const uint8_t*>(src), 128);
+}
+
+inline V128 V128_LoadU(const V128* src) {
+  return vle8_v_u8m1(reinterpret_cast<const uint8_t*>(src), 128);
+}
+
+inline void V128_StoreU(V128* dst, V128 val) {
+  vse8_v_u8m1(reinterpret_cast<uint8_t*>(dst), val, 128);
+}
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  return vrgather_vv_u8m1(input, shuffle_mask, 128);
+}
+
+inline V128 V128_DupChar(char c) {
+  return vmv_v_x_u8m1(c, 128);
+}
+
+#endif  // SNAPPY_HAVE_RVV
 #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
 // Working memory performs a single allocation to hold all scratch space
@@ -172,7 +200,7 @@ char* CompressFragment(const char* input,
 // Separate implementation for 64-bit, little-endian cpus.
 #if !SNAPPY_IS_BIG_ENDIAN && \
     (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
-     defined(ARCH_ARM))
+     defined(ARCH_ARM) || defined(__riscv))
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                       const char* s2,
                                                       const char* s2_limit,
diff --git a/snappy.cc b/snappy.cc
index b072e5d..a241821 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -232,22 +232,32 @@ inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
   return {static_cast<char>((index_offset + indexes) % pattern_size)...};
 }
 
+template <typename V>
+struct SizeOfV128 {
+  static constexpr uint64_t size = sizeof(V);
+};
+
+template <>
+struct SizeOfV128<V128> {
+  static constexpr uint64_t size = 128;
+};
+
 // Computes the shuffle control mask bytes array for given pattern-sizes and
 // returns an array.
 template <size_t... pattern_sizes_minus_one>
-inline constexpr std::array<std::array<char, sizeof(V128)>,
+inline constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
                             sizeof...(pattern_sizes_minus_one)>
 MakePatternMaskBytesTable(int index_offset,
                           index_sequence<pattern_sizes_minus_one...>) {
   return {
       MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
-                           make_index_sequence<sizeof(V128)>())...};
+                           make_index_sequence<SizeOfV128<V128>::size>())...};
 }
 
 // This is an array of shuffle control masks that can be used as the source
 // operand for PSHUFB to permute the contents of the destination XMM register
 // into a repeating byte pattern.
-alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
+alignas(16) constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
                                  16> pattern_generation_masks =
     MakePatternMaskBytesTable(
         /*index_offset=*/0,
@@ -258,7 +268,7 @@ alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
 // Basically, pattern_reshuffle_masks is a continuation of
 // pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
 // pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
-alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
+alignas(16) constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
                                  16> pattern_reshuffle_masks =
     MakePatternMaskBytesTable(
         /*index_offset=*/16,
@@ -275,6 +285,15 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
       generation_mask);
 }
 
+// RVV vector types are sizeless, so they cannot be returned in a std::pair.
+#if SNAPPY_HAVE_RVV
+#define LoadPatternAndReshuffleMask(src, pattern_size) \
+  V128 pattern = LoadPattern(src, pattern_size); \
+  V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>( \
+      pattern_reshuffle_masks[pattern_size - 1].data()));
+
+#else
+
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
 static inline std::pair<V128, V128>
 LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
@@ -290,6 +309,7 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
       pattern_reshuffle_masks[pattern_size - 1].data()));
   return {pattern, reshuffle_mask};
 }
+#endif
 
 #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
@@ -324,10 +344,14 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
       return true;
     }
     default: {
+      #if SNAPPY_HAVE_RVV
+      LoadPatternAndReshuffleMask(dst - offset, offset)
+      #else
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(dst - offset, offset);
       V128 pattern = pattern_and_reshuffle_mask.first;
       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
+      #endif
       for (int i = 0; i < 4; i++) {
         V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
         pattern = V128_Shuffle(pattern, reshuffle_mask);
@@ -435,10 +459,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     // Typically, the op_limit is the gating factor so try to simplify the loop
     // based on that.
     if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
+      #if SNAPPY_HAVE_RVV
+      LoadPatternAndReshuffleMask(src, pattern_size)
+      #else
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(src, pattern_size);
       V128 pattern = pattern_and_reshuffle_mask.first;
       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
+      #endif
 
       // There is at least one, and at most four 16-byte blocks. Writing four
       // conditionals instead of a loop allows FDO to layout the code with
@@ -462,10 +490,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     }
     char* const op_end = buf_limit - 15;
     if (SNAPPY_PREDICT_TRUE(op < op_end)) {
+      #if SNAPPY_HAVE_RVV
+      LoadPatternAndReshuffleMask(src, pattern_size)
+      #else
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(src, pattern_size);
       V128 pattern = pattern_and_reshuffle_mask.first;
       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
+      #endif
 
       // This code path is relatively cold however so we save code size
       // by avoiding unrolling and vectorizing.
@@ -1099,7 +1131,7 @@ inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
          reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
          sizeof(result));
   return val & result;
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(__riscv)
   constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
   return val & static_cast<uint32_t>(
       (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
@@ -1149,7 +1181,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
       // For literals tag_type = 0, hence we will always obtain 0 from
       // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
       ptrdiff_t len_min_offset = kLengthMinusOffset[tag];
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__riscv)
       size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);
 #else
       size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);
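
For reviewers, the following is a minimal, standalone scalar model (not part
of the patch) of the shuffle-based pattern expansion that the RVV path
implements with vrgather_vv_u8m1 inside V128_Shuffle. The 16-byte block width,
the "abc" input, and the MakeMask/Shuffle helper names are illustrative
assumptions; in snappy.cc the corresponding tables are built by
MakePatternMaskBytes and MakePatternMaskBytesTable.

#include <array>
#include <cstdio>
#include <cstring>

constexpr int kBlockBytes = 16;  // one V128 register worth of bytes

// Scalar equivalent of MakePatternMaskBytes: byte i of the mask selects
// source byte (index_offset + i) % pattern_size.
std::array<unsigned char, kBlockBytes> MakeMask(int index_offset,
                                                int pattern_size) {
  std::array<unsigned char, kBlockBytes> mask{};
  for (int i = 0; i < kBlockBytes; ++i)
    mask[i] = static_cast<unsigned char>((index_offset + i) % pattern_size);
  return mask;
}

// Scalar stand-in for V128_Shuffle / vrgather_vv_u8m1: out[i] = in[mask[i]].
std::array<unsigned char, kBlockBytes> Shuffle(
    const std::array<unsigned char, kBlockBytes>& in,
    const std::array<unsigned char, kBlockBytes>& mask) {
  std::array<unsigned char, kBlockBytes> out{};
  for (int i = 0; i < kBlockBytes; ++i) out[i] = in[mask[i]];
  return out;
}

int main() {
  const char* src = "abc";     // the pattern that precedes dst in the output
  const int pattern_size = 3;

  // LoadPattern: expand the first pattern_size bytes into one full block.
  std::array<unsigned char, kBlockBytes> bytes{};
  std::memcpy(bytes.data(), src, pattern_size);
  auto pattern = Shuffle(bytes, MakeMask(/*index_offset=*/0, pattern_size));

  // Copy64BytesWithPatternExtension: store a block, then advance the pattern
  // by 16 bytes with the reshuffle mask (index_offset = 16), four times.
  auto reshuffle = MakeMask(/*index_offset=*/16, pattern_size);
  char dst[4 * kBlockBytes + 1] = {0};
  for (int i = 0; i < 4; ++i) {
    std::memcpy(dst + kBlockBytes * i, pattern.data(), kBlockBytes);
    pattern = Shuffle(pattern, reshuffle);
  }
  std::printf("%s\n", dst);  // "abc" repeated seamlessly across all 64 bytes
  return 0;
}

The patched RVV path performs the same two steps with vle8_v_u8m1 and
vrgather_vv_u8m1, with pattern and reshuffle_mask declared as locals by the
LoadPatternAndReshuffleMask macro instead of being returned in a std::pair.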