From bbe4f3bfd5ac543ed805c58cacac922f0ccf70f2 Mon Sep 17 00:00:00 2001
From: "zhuna.1024"
Date: Thu, 27 Oct 2022 22:08:07 +0800
Subject: [PATCH] rvv simd acceleration

---
 CMakeLists.txt    | 10 ++++++++++
 cmake/config.h.in |  3 +++
 snappy-internal.h | 34 +++++++++++++++++++++++++++++++---
 snappy.cc         | 44 ++++++++++++++++++++++++++++++++++++++------
 4 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6eef485..0c6a0dd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,6 +187,16 @@ int main() {
   return 0;
 }" SNAPPY_HAVE_NEON)
 
+check_cxx_source_compiles("
+#include <riscv_vector.h>
+int main() {
+  uint8_t val = 3, dup[8];
+  vuint8m1_t v = vmv_v_x_u8m1(val, 128);
+  vuint64m1_t vv = vreinterpret_v_u8m1_u64m1(v);
+  vse64_v_u64m1(reinterpret_cast<uint64_t*>(dup), vv, 128);
+  return 0;
+}" SNAPPY_HAVE_RVV)
+
 include(CheckSymbolExists)
 check_symbol_exists("mmap" "sys/mman.h" HAVE_FUNC_MMAP)
 check_symbol_exists("sysconf" "unistd.h" HAVE_FUNC_SYSCONF)
diff --git a/cmake/config.h.in b/cmake/config.h.in
index 5ea2b5a..4f0b87e 100644
--- a/cmake/config.h.in
+++ b/cmake/config.h.in
@@ -56,4 +56,7 @@
    first (like Motorola and SPARC, unlike Intel and VAX). */
 #cmakedefine01 SNAPPY_IS_BIG_ENDIAN
 
+/* Define to 1 if you target processors with RVV and have <riscv_vector.h>. */
+#cmakedefine01 SNAPPY_HAVE_RVV
+
 #endif  // THIRD_PARTY_SNAPPY_OPENSOURCE_CMAKE_CONFIG_H_
diff --git a/snappy-internal.h b/snappy-internal.h
index e552ea0..017bcaa 100644
--- a/snappy-internal.h
+++ b/snappy-internal.h
@@ -44,7 +44,11 @@
 #include <arm_neon.h>
 #endif
 
-#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON
+#if SNAPPY_HAVE_RVV
+#include <riscv_vector.h>
+#endif
+
+#if SNAPPY_HAVE_SSSE3 || SNAPPY_HAVE_NEON || SNAPPY_HAVE_RVV
 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 1
 #else
 #define SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE 0
@@ -58,6 +62,8 @@ namespace internal {
 using V128 = __m128i;
 #elif SNAPPY_HAVE_NEON
 using V128 = uint8x16_t;
+#elif SNAPPY_HAVE_RVV
+using V128 = vuint8m1_t;
 #endif
 
 // Load 128 bits of integer data. `src` must be 16-byte aligned.
@@ -108,7 +114,29 @@ inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
 }
 
 inline V128 V128_DupChar(char c) { return vdupq_n_u8(c); }
-#endif
+
+#elif SNAPPY_HAVE_RVV
+inline V128 V128_Load(const V128* src) {
+  return vle8_v_u8m1(reinterpret_cast<const uint8_t*>(src), 128);
+}
+
+inline V128 V128_LoadU(const V128* src) {
+  return vle8_v_u8m1(reinterpret_cast<const uint8_t*>(src), 128);
+}
+
+inline void V128_StoreU(V128* dst, V128 val) {
+  vse8_v_u8m1(reinterpret_cast<uint8_t*>(dst), val, 128);
+}
+
+inline V128 V128_Shuffle(V128 input, V128 shuffle_mask) {
+  return vrgather_vv_u8m1(input, shuffle_mask, 128);
+}
+
+inline V128 V128_DupChar(char c) {
+  return vmv_v_x_u8m1(c, 128);
+}
+
+#endif  // SNAPPY_HAVE_RVV
 #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
 // Working memory performs a single allocation to hold all scratch space
@@ -172,7 +200,7 @@ char* CompressFragment(const char* input,
 // Separate implementation for 64-bit, little-endian cpus.
 #if !SNAPPY_IS_BIG_ENDIAN && \
     (defined(__x86_64__) || defined(_M_X64) || defined(ARCH_PPC) || \
-     defined(ARCH_ARM))
+     defined(ARCH_ARM) || defined(__riscv))
 static inline std::pair<size_t, bool> FindMatchLength(const char* s1,
                                                       const char* s2,
                                                       const char* s2_limit,
diff --git a/snappy.cc b/snappy.cc
index b072e5d..a241821 100644
--- a/snappy.cc
+++ b/snappy.cc
@@ -232,22 +232,32 @@ inline constexpr std::array<char, sizeof...(indexes)> MakePatternMaskBytes(
   return {static_cast<char>((index_offset + indexes) % pattern_size)...};
 }
 
+template <typename V>
+struct SizeOfV128 {
+  static constexpr uint64_t size = sizeof(V);
+};
+
+template <>
+struct SizeOfV128<V128> {
+  static constexpr uint64_t size = 128;
+};
+
 // Computes the shuffle control mask bytes array for given pattern-sizes and
 // returns an array.
 template <size_t... pattern_sizes_minus_one>
-inline constexpr std::array<std::array<char, sizeof(V128)>,
+inline constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
                             sizeof...(pattern_sizes_minus_one)>
 MakePatternMaskBytesTable(int index_offset,
                           index_sequence<pattern_sizes_minus_one...>) {
   return {
       MakePatternMaskBytes(index_offset, pattern_sizes_minus_one + 1,
-                           make_index_sequence<sizeof(V128)>())...};
+                           make_index_sequence<SizeOfV128<V128>::size>())...};
 }
 
 // This is an array of shuffle control masks that can be used as the source
 // operand for PSHUFB to permute the contents of the destination XMM register
 // into a repeating byte pattern.
-alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
+alignas(16) constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
                                  16> pattern_generation_masks =
     MakePatternMaskBytesTable(
         /*index_offset=*/0,
@@ -258,7 +268,7 @@ alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
 // Basically, pattern_reshuffle_masks is a continuation of
 // pattern_generation_masks. It follows that, pattern_reshuffle_masks is same as
 // pattern_generation_masks for offsets 1, 2, 4, 8 and 16.
-alignas(16) constexpr std::array<std::array<char, sizeof(V128)>,
+alignas(16) constexpr std::array<std::array<char, SizeOfV128<V128>::size>,
                                  16> pattern_reshuffle_masks =
     MakePatternMaskBytesTable(
         /*index_offset=*/16,
@@ -275,6 +285,15 @@ static inline V128 LoadPattern(const char* src, const size_t pattern_size) {
       generation_mask);
 }
 
+// RVV vector types are sizeless, so they cannot be returned in a std::pair.
+#if SNAPPY_HAVE_RVV
+#define LoadPatternAndReshuffleMask(src, pattern_size) \
+  V128 pattern = LoadPattern(src, pattern_size); \
+  V128 reshuffle_mask = V128_Load(reinterpret_cast<const V128*>( \
+      pattern_reshuffle_masks[pattern_size - 1].data()));
+
+#else
+
 SNAPPY_ATTRIBUTE_ALWAYS_INLINE
 static inline std::pair<V128, V128>
 LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
@@ -290,6 +309,7 @@ LoadPatternAndReshuffleMask(const char* src, const size_t pattern_size) {
       pattern_reshuffle_masks[pattern_size - 1].data()));
   return {pattern, reshuffle_mask};
 }
+#endif
 
 #endif  // SNAPPY_HAVE_VECTOR_BYTE_SHUFFLE
 
@@ -324,10 +344,14 @@ static inline bool Copy64BytesWithPatternExtension(char* dst, size_t offset) {
       return true;
     }
     default: {
+      #if SNAPPY_HAVE_RVV
+      LoadPatternAndReshuffleMask(dst - offset, offset)
+      #else
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(dst - offset, offset);
       V128 pattern = pattern_and_reshuffle_mask.first;
       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
+      #endif
       for (int i = 0; i < 4; i++) {
         V128_StoreU(reinterpret_cast<V128*>(dst + 16 * i), pattern);
         pattern = V128_Shuffle(pattern, reshuffle_mask);
@@ -435,10 +459,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     // Typically, the op_limit is the gating factor so try to simplify the loop
     // based on that.
     if (SNAPPY_PREDICT_TRUE(op_limit <= buf_limit - 15)) {
+      #if SNAPPY_HAVE_RVV
+      LoadPatternAndReshuffleMask(src, pattern_size)
+      #else
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(src, pattern_size);
       V128 pattern = pattern_and_reshuffle_mask.first;
       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
+      #endif
 
       // There is at least one, and at most four 16-byte blocks. Writing four
       // conditionals instead of a loop allows FDO to layout the code with
@@ -462,10 +490,14 @@ inline char* IncrementalCopy(const char* src, char* op, char* const op_limit,
     }
     char* const op_end = buf_limit - 15;
     if (SNAPPY_PREDICT_TRUE(op < op_end)) {
+      #if SNAPPY_HAVE_RVV
+      LoadPatternAndReshuffleMask(src, pattern_size)
+      #else
       auto pattern_and_reshuffle_mask =
           LoadPatternAndReshuffleMask(src, pattern_size);
       V128 pattern = pattern_and_reshuffle_mask.first;
       V128 reshuffle_mask = pattern_and_reshuffle_mask.second;
+      #endif
 
       // This code path is relatively cold however so we save code size
       // by avoiding unrolling and vectorizing.
@@ -1099,7 +1131,7 @@ inline uint32_t ExtractOffset(uint32_t val, size_t tag_type) {
          reinterpret_cast<const char*>(&kExtractMasksCombined) + 2 * tag_type,
          sizeof(result));
   return val & result;
-#elif defined(__aarch64__)
+#elif defined(__aarch64__) || defined(__riscv)
   constexpr uint64_t kExtractMasksCombined = 0x0000FFFF00FF0000ull;
   return val & static_cast<uint32_t>(
       (kExtractMasksCombined >> (tag_type * 16)) & 0xFFFF);
@@ -1149,7 +1181,7 @@ std::pair<const uint8_t*, ptrdiff_t> DecompressBranchless(
       // For literals tag_type = 0, hence we will always obtain 0 from
       // ExtractLowBytes. For literals offset will thus be kLiteralOffset.
       ptrdiff_t len_min_offset = kLengthMinusOffset[tag];
-#if defined(__aarch64__)
+#if defined(__aarch64__) || defined(__riscv)
       size_t tag_type = AdvanceToNextTagARMOptimized(&ip, &tag);
 #else
       size_t tag_type = AdvanceToNextTagX86Optimized(&ip, &tag);
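
For reviewers, the following is a minimal, standalone scalar model (not part
of the patch) of the shuffle-based pattern expansion that the RVV path
implements with vrgather_vv_u8m1 inside V128_Shuffle. The 16-byte block width,
the "abc" input, and the MakeMask/Shuffle helper names are illustrative
assumptions; in snappy.cc the corresponding tables are built by
MakePatternMaskBytes and MakePatternMaskBytesTable.

#include <array>
#include <cstdio>
#include <cstring>

constexpr int kBlockBytes = 16;  // one V128 register worth of bytes

// Scalar equivalent of MakePatternMaskBytes: byte i of the mask selects
// source byte (index_offset + i) % pattern_size.
std::array<unsigned char, kBlockBytes> MakeMask(int index_offset,
                                                int pattern_size) {
  std::array<unsigned char, kBlockBytes> mask{};
  for (int i = 0; i < kBlockBytes; ++i)
    mask[i] = static_cast<unsigned char>((index_offset + i) % pattern_size);
  return mask;
}

// Scalar stand-in for V128_Shuffle / vrgather_vv_u8m1: out[i] = in[mask[i]].
std::array<unsigned char, kBlockBytes> Shuffle(
    const std::array<unsigned char, kBlockBytes>& in,
    const std::array<unsigned char, kBlockBytes>& mask) {
  std::array<unsigned char, kBlockBytes> out{};
  for (int i = 0; i < kBlockBytes; ++i) out[i] = in[mask[i]];
  return out;
}

int main() {
  const char* src = "abc";     // the pattern that precedes dst in the output
  const int pattern_size = 3;

  // LoadPattern: expand the first pattern_size bytes into one full block.
  std::array<unsigned char, kBlockBytes> bytes{};
  std::memcpy(bytes.data(), src, pattern_size);
  auto pattern = Shuffle(bytes, MakeMask(/*index_offset=*/0, pattern_size));

  // Copy64BytesWithPatternExtension: store a block, then advance the pattern
  // by 16 bytes with the reshuffle mask (index_offset = 16), four times.
  auto reshuffle = MakeMask(/*index_offset=*/16, pattern_size);
  char dst[4 * kBlockBytes + 1] = {0};
  for (int i = 0; i < 4; ++i) {
    std::memcpy(dst + kBlockBytes * i, pattern.data(), kBlockBytes);
    pattern = Shuffle(pattern, reshuffle);
  }
  std::printf("%s\n", dst);  // "abc" repeated seamlessly across all 64 bytes
  return 0;
}

The patched RVV path performs the same two steps with vle8_v_u8m1 and
vrgather_vv_u8m1, with pattern and reshuffle_mask declared as locals by the
LoadPatternAndReshuffleMask macro instead of being returned in a std::pair.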