howjmay · howjmay · Apr 14, 2026 · Jul 31, 2024
diff --git a/.github/workflows/github_actions.yml b/.github/workflows/github_actions.yml
@@ -13,6 +13,8 @@ jobs:
     steps:
       - name: checkout code
         uses: actions/checkout@v3.2.0
+        with:
+          submodules: recursive
       - name: setup riscv toolchain
         run: |
           mkdir /opt/riscv
@@ -29,30 +31,22 @@ jobs:
 
   # for validate test cases only
   check_test_cases:
-    runs-on: ubuntu-22.04
+    runs-on: ubuntu-22.04-arm
     strategy:
       matrix:
-        arch: [aarch64]
-        cxx_compiler: [g++-10, clang++-11]
+        cxx_compiler: [g++-12, clang++-14]
     steps:
       - name: checkout code
         uses: actions/checkout@v3.2.0
+      - name: install dependencies
+        run: |
+          sudo apt-get update -q -y
+          sudo apt-get install -q -y "${{ matrix.cxx_compiler }}" make gcc
       - name: build artifact
-        # The Github Action for non-x86 CPU
-        # https://github.com/uraimo/run-on-arch-action
-        uses: uraimo/run-on-arch-action@v2.5.0
-        with:
-          arch: ${{ matrix.arch }}
-          distro: ubuntu20.04
-          env: |
-            CXX: ${{ matrix.cxx_compiler }}
-          install: |
-            apt-get update -q -y
-            apt-get install -q -y "${{ matrix.cxx_compiler }}" make
-            apt-get install -q -y gcc
-          run: |
-            export ENABLE_TEST_ALL=true
-            make test
+        env:
+          CXX: ${{ matrix.cxx_compiler }}
+          ENABLE_TEST_ALL: "true"
+        run: make test
 
   coding_style:
     runs-on: ubuntu-22.04
@@ -63,5 +57,5 @@ jobs:
         # clang-format version should be set
         run: |
             sudo apt-get install -q -y clang-format
-            sh scripts/check-format.sh
+            bash scripts/check-format.sh
         shell: bash
diff --git a/neon2rvv.h b/neon2rvv.h
@@ -237,6 +237,11 @@ FORCE_INLINE uint64x2_t vdupq_n_u64(uint64_t a);
 FORCE_INLINE int8x8_t vcnt_s8(int8x8_t a);
 FORCE_INLINE uint8x8_t vcnt_u8(uint8x8_t a);
 
+FORCE_INLINE int64_t vget_lane_s64(int64x1_t a, const int b);
+FORCE_INLINE uint64_t vget_lane_u64(uint64x1_t a, const int b);
+FORCE_INLINE int64_t vqrshld_s64(int64_t a, int64_t b);
+FORCE_INLINE uint64_t vqrshld_u64(uint64_t a, int64_t b);
+
 /* vadd */
 FORCE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b) { return __riscv_vadd_vv_i8m1(a, b, 8); }
 
@@ -5422,24 +5427,56 @@ FORCE_INLINE int16_t vqshlh_s16(int16_t a, int16_t b) {
 
 FORCE_INLINE int32_t vqshls_s32(int32_t a, int32_t b) {
   if (b < 0) {
-    return a >> -b;
+    int32_t rshift = -b;
+    if (rshift >= 32) {
+      return (a < 0) ? -1 : 0;
+    }
+    return a >> rshift;
   }
-  if ((INT32_MAX >> b) < a) {
+  if (b >= 32) {
+    if (a > 0) {
+      return INT32_MAX;
+    }
+    if (a < 0) {
+      return INT32_MIN;
+    }
+    return 0;
+  }
+  int64_t tmp = ((int64_t)a) << b;
+  if (tmp > INT32_MAX) {
     return INT32_MAX;
-  } else {
-    return a << b;
   }
+  if (tmp < INT32_MIN) {
+    return INT32_MIN;
+  }
+  return (int32_t)tmp;
 }
 
 FORCE_INLINE int64_t vqshld_s64(int64_t a, int64_t b) {
   if (b < 0) {
-    return a >> -b;
+    int64_t rshift = -b;
+    if (rshift >= 64) {
+      return (a < 0) ? -1 : 0;
+    }
+    return a >> rshift;
+  }
+  if (b >= 64) {
+    if (a > 0) {
+      return INT64_MAX;
+    }
+    if (a < 0) {
+      return INT64_MIN;
+    }
+    return 0;
   }
-  if ((INT64_MAX >> b) < a) {
+  __int128 tmp = ((__int128)a) << b;
+  if (tmp > INT64_MAX) {
     return INT64_MAX;
-  } else {
-    return (int64_t)a << b;
   }
+  if (tmp < INT64_MIN) {
+    return INT64_MIN;
+  }
+  return (int64_t)tmp;
 }
 
 FORCE_INLINE uint8_t vqshlb_u8(uint8_t a, int8_t b) {
@@ -5525,7 +5562,11 @@ FORCE_INLINE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b) {
   return __riscv_vmerge_vvm_i32m1(shr, shl, positive_mask, 2);
 }
 
-// FORCE_INLINE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b);
+FORCE_INLINE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b) {
+  int64_t a0 = vget_lane_s64(a, 0);
+  int64_t b0 = vget_lane_s64(b, 0);
+  return vdup_n_s64(vqrshld_s64(a0, b0));
+}
 
 FORCE_INLINE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b) {
   vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(b, 0, 8);
@@ -5566,7 +5607,11 @@ FORCE_INLINE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b) {
   return __riscv_vmerge_vvm_u32m1(shr, shl, positive_mask, 2);
 }
 
-// FORCE_INLINE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b);
+FORCE_INLINE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b) {
+  uint64_t a0 = vget_lane_u64(a, 0);
+  int64_t b0 = vget_lane_s64(b, 0);
+  return vdup_n_u64(vqrshld_u64(a0, b0));
+}
 
 FORCE_INLINE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b) {
   vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(b, 0, 16);
@@ -5650,21 +5695,135 @@ FORCE_INLINE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b) {
 
 // FORCE_INLINE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b);
 
-// FORCE_INLINE int8_t vqrshlb_s8(int8_t a, int8_t b);
+FORCE_INLINE int8_t vqrshlb_s8(int8_t a, int8_t b) {
+  if (b < 0) {
+    return ((int16_t)a + (1 << (-b - 1))) >> (-b);
+  } else {
+    return vqshlb_s8(a, b);
+  }
+}
 
-// FORCE_INLINE int16_t vqrshlh_s16(int16_t a, int16_t b);
+FORCE_INLINE int16_t vqrshlh_s16(int16_t a, int16_t b) {
+  if (b < 0) {
+    return ((int32_t)a + (1 << (-b - 1))) >> (-b);
+  } else {
+    return vqshlh_s16(a, b);
+  }
+}
 
-// FORCE_INLINE int32_t vqrshls_s32(int32_t a, int32_t b);
+FORCE_INLINE int32_t vqrshls_s32(int32_t a, int32_t b) {
+  if (b < 0) {
+    return ((int64_t)a + (1 << (-b - 1))) >> (-b);
+  } else {
+    return vqshls_s32(a, b);
+  }
+}
 
-// FORCE_INLINE int64_t vqrshld_s64(int64_t a, int64_t b);
+FORCE_INLINE int64_t vqrshld_s64(int64_t a, int64_t b) {
+  if (b < 0) {
+    if (b <= -64) {
+      return 0;
+    }
+    uint64_t b_neg = (uint64_t)(-b);
+    // Rounded arithmetic right shift without __int128:
+    // a = q * 2^n + r, where q = a >> n and r are low n bits of a.
+    // Add round-to-nearest increment (2^(n-1)) to r and carry into q.
+    int64_t q = a >> b_neg;
+    uint64_t mask = (UINT64_C(1) << b_neg) - 1;
+    uint64_t r = ((uint64_t)a) & mask;
+    uint64_t carry = (r + (UINT64_C(1) << (b_neg - 1))) >> b_neg;
+    return q + (int64_t)carry;
+  } else {
+    if (b >= 64) {
+      if (a > 0) {
+        return INT64_MAX;
+      }
+      if (a < 0) {
+        return INT64_MIN;
+      }
+      return 0;
+    }
+    return vqshld_s64(a, b);
+  }
+}
 
-// FORCE_INLINE uint8_t vqrshlb_u8(uint8_t a, int8_t b);
+FORCE_INLINE uint8_t vqrshlb_u8(uint8_t a, int8_t b) {
+  if (b < 0) {
+    if (b <= -8) {
+      return 0;
+    }
+    uint32_t b_neg = (uint32_t)(-b);
+    uint32_t q = (uint32_t)a >> b_neg;
+    uint32_t mask = (UINT32_C(1) << b_neg) - 1;
+    uint32_t r = (uint32_t)a & mask;
+    uint32_t carry = (r + (UINT32_C(1) << (b_neg - 1))) >> b_neg;
+    return (uint8_t)(q + carry);
+  } else {
+    if (b >= 8) {
+      return a == 0 ? 0 : UINT8_MAX;
+    }
+    return vqshlb_u8(a, b);
+  }
+}
 
-// FORCE_INLINE uint16_t vqrshlh_u16(uint16_t a, int16_t b);
+FORCE_INLINE uint16_t vqrshlh_u16(uint16_t a, int16_t b) {
+  if (b < 0) {
+    if (b <= -16) {
+      return 0;
+    }
+    uint32_t b_neg = (uint32_t)(-b);
+    uint32_t q = (uint32_t)a >> b_neg;
+    uint32_t mask = (UINT32_C(1) << b_neg) - 1;
+    uint32_t r = (uint32_t)a & mask;
+    uint32_t carry = (r + (UINT32_C(1) << (b_neg - 1))) >> b_neg;
+    return (uint16_t)(q + carry);
+  } else {
+    if (b >= 16) {
+      return a == 0 ? 0 : UINT16_MAX;
+    }
+    return vqshlh_u16(a, b);
+  }
+}
 
-// FORCE_INLINE uint32_t vqrshls_u32(uint32_t a, int32_t b);
+FORCE_INLINE uint32_t vqrshls_u32(uint32_t a, int32_t b) {
+  if (b < 0) {
+    if (b <= -32) {
+      return 0;
+    }
+    uint64_t b_neg = (uint64_t)(-b);
+    uint64_t q = (uint64_t)a >> b_neg;
+    uint64_t mask = (UINT64_C(1) << b_neg) - 1;
+    uint64_t r = (uint64_t)a & mask;
+    uint64_t carry = (r + (UINT64_C(1) << (b_neg - 1))) >> b_neg;
+    return (uint32_t)(q + carry);
+  } else {
+    if (b >= 32) {
+      return a == 0 ? 0 : UINT32_MAX;
+    }
+    return vqshls_u32(a, b);
+  }
+}
 
-// FORCE_INLINE uint64_t vqrshld_u64(uint64_t a, int64_t b);
+FORCE_INLINE uint64_t vqrshld_u64(uint64_t a, int64_t b) {
+  if (b < 0) {
+    if (b <= -64) {
+      return 0;
+    }
+    uint64_t b_neg = (uint64_t)(-b);
+    // Rounded logical right shift without __uint128_t:
+    // split into quotient/remainder at 2^n and propagate rounding carry.
+    uint64_t q = a >> b_neg;
+    uint64_t mask = (UINT64_C(1) << b_neg) - 1;
+    uint64_t r = a & mask;
+    uint64_t carry = (r + (UINT64_C(1) << (b_neg - 1))) >> b_neg;
+    return q + carry;
+  } else {
+    if (b >= 64) {
+      return a == 0 ? 0 : UINT64_MAX;
+    }
+    return vqshld_u64(a, b);
+  }
+}
 
 FORCE_INLINE int8x8_t vshr_n_s8(int8x8_t a, const int b) {
   const int imm = b - (b >> 3);

diff --git a/tests/common.cpp b/tests/common.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
-#include <cmath>
+#include <math.h>
+#include <stdlib.h>
 
 namespace NEON2RVV {
 int32_t NaN = ~0;
@@ -596,7 +597,7 @@ result_t validate_double_pair(double a, double b) {
   // We do an integer (binary) compare rather than a
   // floating point compare to take NaNs and infinities
   // into account as well.
-  if (std::isnan(a) && std::isnan(b)) {
+  if (isnan(a) && isnan(b)) {
     return TEST_SUCCESS;
   }
   return (*ua) == (*ub) ? TEST_SUCCESS : TEST_FAIL;
@@ -663,16 +664,16 @@ result_t validate_float_error(float32x4_t a, float f0, float f1, float f2, float
   float df1 = fabsf((t[1] - f1) / f1);
   float df2 = fabsf((t[2] - f2) / f2);
   float df3 = fabsf((t[3] - f3) / f3);
-  if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) || (std::isinf(t[0]) && std::isinf(f0))) {
+  if ((isnan(t[0]) && isnan(f0)) || (t[0] == 0 && f0 == 0) || (isinf(t[0]) && isinf(f0))) {
     df0 = 0;
   }
-  if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) || (std::isinf(t[1]) && std::isinf(f1))) {
+  if ((isnan(t[1]) && isnan(f1)) || (t[1] == 0 && f1 == 0) || (isinf(t[1]) && isinf(f1))) {
     df1 = 0;
   }
-  if ((std::isnan(t[2]) && std::isnan(f2)) || (t[2] == 0 && f2 == 0) || (std::isinf(t[2]) && std::isinf(f2))) {
+  if ((isnan(t[2]) && isnan(f2)) || (t[2] == 0 && f2 == 0) || (isinf(t[2]) && isinf(f2))) {
     df2 = 0;
   }
-  if ((std::isnan(t[3]) && std::isnan(f3)) || (t[3] == 0 && f3 == 0) || (std::isinf(t[3]) && std::isinf(f3))) {
+  if ((isnan(t[3]) && isnan(f3)) || (t[3] == 0 && f3 == 0) || (isinf(t[3]) && isinf(f3))) {
     df3 = 0;
   }
   ASSERT_RETURN(df0 < err);
@@ -686,10 +687,10 @@ result_t validate_float_error(float32x2_t a, float f0, float f1, float err) {
   const float *t = (const float *)&a;
   float df0 = fabsf((t[0] - f0) / f0);
   float df1 = fabsf((t[1] - f1) / f1);
-  if ((std::isnan(t[0]) && std::isnan(f0)) || (t[0] == 0 && f0 == 0) || (std::isinf(t[0]) && std::isinf(f0))) {
+  if ((isnan(t[0]) && isnan(f0)) || (t[0] == 0 && f0 == 0) || (isinf(t[0]) && isinf(f0))) {
     df0 = 0;
   }
-  if ((std::isnan(t[1]) && std::isnan(f1)) || (t[1] == 0 && f1 == 0) || (std::isinf(t[1]) && std::isinf(f1))) {
+  if ((isnan(t[1]) && isnan(f1)) || (t[1] == 0 && f1 == 0) || (isinf(t[1]) && isinf(f1))) {
     df1 = 0;
   }
   ASSERT_RETURN(df0 < err);
@@ -699,7 +700,7 @@ result_t validate_float_error(float32x2_t a, float f0, float f1, float err) {
 
 result_t validate_float_error(float32_t a, float f0, float err) {
   float df0 = fabsf((a - f0) / f0);
-  if ((std::isnan(a) && std::isnan(f0)) || (a == 0 && f0 == 0) || (std::isinf(a) && std::isinf(f0))) {
+  if ((isnan(a) && isnan(f0)) || (a == 0 && f0 == 0) || (isinf(a) && isinf(f0))) {
     df0 = 0;
   }
   ASSERT_RETURN(df0 < err);
@@ -755,10 +756,10 @@ result_t validate_double_error(float64x2_t a, double d0, double d1, double err)
   const double *t = (const double *)&a;
   double td0 = fabs((t[0] - d0) / d0);
   double td1 = fabs((t[1] - d1) / d1);
-  if (std::isnan(t[0]) && std::isnan(d0)) {
+  if (isnan(t[0]) && isnan(d0)) {
     td0 = 0;
   }
-  if (std::isnan(t[1]) && std::isnan(d1)) {
+  if (isnan(t[1]) && isnan(d1)) {
     td1 = 0;
   }
   ASSERT_RETURN(td0 < err);
@@ -769,16 +770,16 @@ result_t validate_double_error(float64x2_t a, double d0, double d1, double err)
 result_t validate_double_error(float64x1_t a, double d0, double err) {
   const double *t = (const double *)&a;
   double td0 = fabs((t[0] - d0) / d0);
-  if (std::isnan(t[0]) && std::isnan(d0)) {
+  if (isnan(t[0]) && isnan(d0)) {
     td0 = 0;
   }
   ASSERT_RETURN(td0 < err);
   return TEST_SUCCESS;
 }
 
 result_t validate_double_error(double a, double d0, double err) {
-  double df0 = abs((a - d0) / d0);
-  if ((std::isnan(a) && std::isnan(d0)) || (a == 0 && d0 == 0) || (std::isinf(a) && std::isinf(d0))) {
+  double df0 = fabs((a - d0) / d0);
+  if ((isnan(a) && isnan(d0)) || (a == 0 && d0 == 0) || (isinf(a) && isinf(d0))) {
     df0 = 0;
   }
   ASSERT_RETURN(df0 < err);