Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 10 additions & 18 deletions .github/workflows/github_actions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,30 +29,22 @@ jobs:

# for validate test cases only
check_test_cases:
runs-on: ubuntu-22.04
runs-on: ubuntu-24.04-arm
strategy:
matrix:
arch: [aarch64]
cxx_compiler: [g++-10, clang++-11]
cxx_compiler: [g++-12, clang++-14]
steps:
- name: checkout code
uses: actions/checkout@v3.2.0
- name: install dependencies
run: |
sudo apt-get update -q -y
sudo apt-get install -q -y ${{ matrix.cxx_compiler }} make gcc
- name: build artifact
# The Github Action for non-x86 CPU
# https://github.com/uraimo/run-on-arch-action
uses: uraimo/run-on-arch-action@v2.5.0
with:
arch: ${{ matrix.arch }}
distro: ubuntu20.04
env: |
CXX: ${{ matrix.cxx_compiler }}
install: |
apt-get update -q -y
apt-get install -q -y "${{ matrix.cxx_compiler }}" make
apt-get install -q -y gcc
run: |
export ENABLE_TEST_ALL=true
make test
env:
CXX: ${{ matrix.cxx_compiler }}
ENABLE_TEST_ALL: true
run: make test

coding_style:
runs-on: ubuntu-22.04
Expand Down
108 changes: 83 additions & 25 deletions neon2rvv.h
Original file line number Diff line number Diff line change
Expand Up @@ -9844,69 +9844,127 @@ FORCE_INLINE int32x2_t vqrdmulh_lane_s32(int32x2_t a, int32x2_t b, const int c)
return __riscv_vnclip_wx_i32m1(ab_mulx2, 32, __RISCV_VXRM_RNU, 2);
}

FORCE_INLINE int16x8_t vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int __d) {
vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 8);
// vqrdmlahq_lane_s16: saturating rounding doubling multiply-accumulate,
// 8 x i16, multiplier taken from lane `lane` of the 4-element vector c.
FORCE_INLINE int16x8_t vqrdmlahq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
  // Broadcast the selected lane of c across all elements.
  vint16m1_t mul = __riscv_vrgather_vx_i16m1(c, lane, 8);
  // Widening multiply, double, then narrow back to i16 with saturation and
  // round-to-nearest-up (RNU adds 1 << 15 before the >> 16).
  vint32m2_t prod2 = __riscv_vsll_vx_i32m2(__riscv_vwmul_vv_i32m2(b, mul, 8), 1, 8);
  vint16m1_t high = __riscv_vnclip_wx_i16m1(prod2, 16, __RISCV_VXRM_RNU, 8);
  // Saturating accumulate into a.
  return __riscv_vsadd_vv_i16m1(a, high, 8);
}

// FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
// vqrdmlah_laneq_s16: saturating rounding doubling multiply-accumulate,
// 4 x i16, multiplier taken from lane `lane` of the 8-element vector c.
FORCE_INLINE int16x4_t vqrdmlah_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
  vint16m1_t mul = __riscv_vrgather_vx_i16m1(c, lane, 4);
  vint32m2_t wide = __riscv_vwmul_vv_i32m2(b, mul, 4);
  // Double, then narrow with round-to-nearest-up (+1 << 15) and saturation.
  vint16m1_t high = __riscv_vnclip_wx_i16m1(__riscv_vsll_vx_i32m2(wide, 1, 4), 16, __RISCV_VXRM_RNU, 4);
  // Saturating accumulate into a.
  return __riscv_vsadd_vv_i16m1(a, high, 4);
}

// FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v, const int lane);
// vqrdmlahq_laneq_s16: saturating rounding doubling multiply-accumulate,
// 8 x i16, multiplier taken from lane `lane` of the 8-element vector c.
FORCE_INLINE int16x8_t vqrdmlahq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
  // Splat c[lane], widen-multiply against b, and double the product.
  vint16m1_t mul = __riscv_vrgather_vx_i16m1(c, lane, 8);
  vint32m2_t prod2 = __riscv_vsll_vx_i32m2(__riscv_vwmul_vv_i32m2(b, mul, 8), 1, 8);
  // Narrow with round-to-nearest-up (+1 << 15) and saturation, then
  // saturating accumulate.
  vint16m1_t high = __riscv_vnclip_wx_i16m1(prod2, 16, __RISCV_VXRM_RNU, 8);
  return __riscv_vsadd_vv_i16m1(a, high, 8);
}

FORCE_INLINE int32x4_t vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 4);
// vqrdmlahq_lane_s32: saturating rounding doubling multiply-accumulate,
// 4 x i32, multiplier taken from lane `lane` of the 2-element vector c.
FORCE_INLINE int32x4_t vqrdmlahq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
  // Broadcast the selected lane of c across all elements.
  vint32m1_t mul = __riscv_vrgather_vx_i32m1(c, lane, 4);
  // Widening multiply, double, then narrow back to i32 with saturation and
  // round-to-nearest-up (RNU adds 1 << 31 before the >> 32).
  vint64m2_t prod2 = __riscv_vsll_vx_i64m2(__riscv_vwmul_vv_i64m2(b, mul, 4), 1, 4);
  vint32m1_t high = __riscv_vnclip_wx_i32m1(prod2, 32, __RISCV_VXRM_RNU, 4);
  // Saturating accumulate into a.
  return __riscv_vsadd_vv_i32m1(a, high, 4);
}

// FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, const int lane);
// vqrdmlah_laneq_s32: saturating rounding doubling multiply-accumulate,
// 2 x i32, multiplier taken from lane `lane` of the 4-element vector c.
FORCE_INLINE int32x2_t vqrdmlah_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t c, const int lane) {
  vint32m1_t mul = __riscv_vrgather_vx_i32m1(c, lane, 2);
  vint64m2_t wide = __riscv_vwmul_vv_i64m2(b, mul, 2);
  // Double, then narrow with round-to-nearest-up (+1 << 31) and saturation.
  vint32m1_t high = __riscv_vnclip_wx_i32m1(__riscv_vsll_vx_i64m2(wide, 1, 2), 32, __RISCV_VXRM_RNU, 2);
  // Saturating accumulate into a.
  return __riscv_vsadd_vv_i32m1(a, high, 2);
}

// FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v, const int lane);
// vqrdmlahq_laneq_s32: saturating rounding doubling multiply-accumulate,
// 4 x i32, multiplier taken from lane `lane` of the 4-element vector c.
FORCE_INLINE int32x4_t vqrdmlahq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t c, const int lane) {
  // Splat c[lane], widen-multiply against b, and double the product.
  vint32m1_t mul = __riscv_vrgather_vx_i32m1(c, lane, 4);
  vint64m2_t prod2 = __riscv_vsll_vx_i64m2(__riscv_vwmul_vv_i64m2(b, mul, 4), 1, 4);
  // Narrow with round-to-nearest-up (+1 << 31) and saturation, then
  // saturating accumulate.
  vint32m1_t high = __riscv_vnclip_wx_i32m1(prod2, 32, __RISCV_VXRM_RNU, 4);
  return __riscv_vsadd_vv_i32m1(a, high, 4);
}

FORCE_INLINE int16x4_t vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int __d) {
vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
// vqrdmlah_lane_s16: saturating rounding doubling multiply-accumulate,
// 4 x i16, multiplier taken from lane `lane` of the 4-element vector c.
FORCE_INLINE int16x4_t vqrdmlah_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c, const int lane) {
  // Broadcast the selected lane of c across all elements.
  vint16m1_t mul = __riscv_vrgather_vx_i16m1(c, lane, 4);
  vint32m2_t wide = __riscv_vwmul_vv_i32m2(b, mul, 4);
  // Double, then narrow with round-to-nearest-up (+1 << 15) and saturation.
  vint16m1_t high = __riscv_vnclip_wx_i16m1(__riscv_vsll_vx_i32m2(wide, 1, 4), 16, __RISCV_VXRM_RNU, 4);
  // Saturating accumulate into a.
  return __riscv_vsadd_vv_i16m1(a, high, 4);
}

FORCE_INLINE int32x2_t vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int __d) {
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
// vqrdmlah_lane_s32: saturating rounding doubling multiply-accumulate,
// 2 x i32, multiplier taken from lane `lane` of the 2-element vector c.
FORCE_INLINE int32x2_t vqrdmlah_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c, const int lane) {
  // Splat c[lane], widen-multiply against b, and double the product.
  vint32m1_t mul = __riscv_vrgather_vx_i32m1(c, lane, 2);
  vint64m2_t prod2 = __riscv_vsll_vx_i64m2(__riscv_vwmul_vv_i64m2(b, mul, 2), 1, 2);
  // Narrow with round-to-nearest-up (+1 << 31) and saturation, then
  // saturating accumulate.
  vint32m1_t high = __riscv_vnclip_wx_i32m1(prod2, 32, __RISCV_VXRM_RNU, 2);
  return __riscv_vsadd_vv_i32m1(a, high, 2);
}

FORCE_INLINE int16x8_t vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int __d) {
vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, __d, 4);
vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 4);
vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 4);
vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_mulx2, 16, __RISCV_VXRM_RNU, 4);
return __riscv_vssub_vv_i16m1(a, bc_s, 4);
// vqrdmlshq_lane_s16: saturating rounding doubling multiply-subtract,
// 8 x i16, multiplier taken from lane `lane` of the 4-element vector c.
// Arm SQRDMLSH computes sat16(((a << 16) - 2*b*c + (1 << 15)) >> 16), which
// rewritten per element is a -s ((2*b*c + (1 << 15) - 1) >> 16) with a
// truncating shift. The vqrdmlah family's RNU narrowing (+1 << 15) is the
// wrong constant for the subtracting form: it is off by one whenever the low
// 16 bits of 2*b*c equal 0x8000.
FORCE_INLINE int16x8_t vqrdmlshq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c, const int lane) {
  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
  // Add the rdmlsh rounding constant explicitly, then narrow with truncation
  // (RDN) and saturation.
  vint32m2_t bc_round = __riscv_vadd_vx_i32m2(bc_mulx2, (1 << 15) - 1, 8);
  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_round, 16, __RISCV_VXRM_RDN, 8);
  // NOTE(review): 2*b*c wraps in i32 when b == c == INT16_MIN (vsll does not
  // saturate); pre-existing corner shared with the vqrdmlah family -- TODO confirm.
  return __riscv_vssub_vv_i16m1(a, bc_s, 8);
}

// FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v, const int lane);
// vqrdmlsh_laneq_s16: saturating rounding doubling multiply-subtract,
// 4 x i16, multiplier taken from lane `lane` of the 8-element vector c.
// Implemented with scalar arithmetic so the exact Arm rounding/saturation
// sequence can be expressed: res = sat16(a - ((sat32(2*b*c) + 0x7fff) >> 16)),
// which is equivalent to sat16(((a << 16) - 2*b*c + (1 << 15)) >> 16).
FORCE_INLINE int16x4_t vqrdmlsh_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t c, const int lane) {
  int16_t _a[4], _b[4], _d[4];
  int16_t c_lane = vgetq_lane_s16(c, lane);
  // Spill the accumulator and multiplicand to scalar memory.
  __riscv_vse16_v_i16m1(_a, a, 4);
  __riscv_vse16_v_i16m1(_b, b, 4);

  for (int i = 0; i < 4; i++) {
    // Doubling product in 64 bits, clamped to the int32 range.
    int64_t tmp = (int64_t)_b[i] * (int64_t)c_lane * 2;
    tmp = neon2rvv_saturate_int32(tmp);
    // The rdmlsh rounding constant is (1 << 15) - 1, not 1 << 15 as in rdmlah:
    // a - ((2bc + 0x7fff) >> 16) == ((a << 16) - 2bc + (1 << 15)) >> 16.
    tmp += ((int64_t)1 << 15) - 1;
    tmp = neon2rvv_saturate_int32(tmp);
    int16_t bc_s = (int16_t)(tmp >> 16);
    // Saturating subtract of the rounded high half from the accumulator.
    _d[i] = neon2rvv_saturate_int16((int32_t)_a[i] - (int32_t)bc_s);
  }

  return __riscv_vle16_v_i16m1(_d, 4);
}

FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int __d) {
vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, __d, 2);
// vqrdmlshq_laneq_s16: saturating rounding doubling multiply-subtract,
// 8 x i16, multiplier taken from lane `lane` of the 8-element vector c.
// Arm SQRDMLSH computes sat16(((a << 16) - 2*b*c + (1 << 15)) >> 16), i.e.
// a -s ((2*b*c + (1 << 15) - 1) >> 16) with a truncating shift. RNU narrowing
// (+1 << 15), as used for vqrdmlah, is off by one whenever the low 16 bits of
// 2*b*c equal 0x8000.
FORCE_INLINE int16x8_t vqrdmlshq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t c, const int lane) {
  vint16m1_t c_dup = __riscv_vrgather_vx_i16m1(c, lane, 8);
  vint32m2_t bc_mul = __riscv_vwmul_vv_i32m2(b, c_dup, 8);
  vint32m2_t bc_mulx2 = __riscv_vsll_vx_i32m2(bc_mul, 1, 8);
  // Add the rdmlsh rounding constant explicitly, then narrow with truncation
  // (RDN) and saturation.
  vint32m2_t bc_round = __riscv_vadd_vx_i32m2(bc_mulx2, (1 << 15) - 1, 8);
  vint16m1_t bc_s = __riscv_vnclip_wx_i16m1(bc_round, 16, __RISCV_VXRM_RDN, 8);
  // NOTE(review): 2*b*c wraps in i32 when b == c == INT16_MIN (vsll does not
  // saturate); pre-existing corner shared with the vqrdmlah family -- TODO confirm.
  return __riscv_vssub_vv_i16m1(a, bc_s, 8);
}

// vqrdmlshq_lane_s32: saturating rounding doubling multiply-subtract,
// 4 x i32, multiplier taken from lane `lane` of the 2-element vector c.
// Arm SQRDMLSH computes sat32(((a << 32) - 2*b*c + (1 << 31)) >> 32), i.e.
// a -s ((2*b*c + (1 << 31) - 1) >> 32) with a truncating shift. RNU narrowing
// (+1 << 31), as used for vqrdmlah, is off by one whenever the low 32 bits of
// 2*b*c equal 0x80000000.
FORCE_INLINE int32x4_t vqrdmlshq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c, const int lane) {
  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(c, lane, 4);
  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
  // Add the rdmlsh rounding constant explicitly, then narrow with truncation
  // (RDN) and saturation.
  vint64m2_t bc_round = __riscv_vadd_vx_i64m2(bc_mulx2, ((int64_t)1 << 31) - 1, 4);
  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_round, 32, __RISCV_VXRM_RDN, 4);
  return __riscv_vssub_vv_i32m1(a, bc_s, 4);
}

// vqrdmlsh_laneq_s32: saturating rounding doubling multiply-subtract,
// 2 x i32, multiplier taken from lane `lane` of the 4-element vector v.
// Arm SQRDMLSH computes sat32(((a << 32) - 2*b*c + (1 << 31)) >> 32), i.e.
// a -s ((2*b*c + (1 << 31) - 1) >> 32) with a truncating shift. RNU narrowing
// (+1 << 31), as used for vqrdmlah, is off by one whenever the low 32 bits of
// 2*b*c equal 0x80000000.
FORCE_INLINE int32x2_t vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, const int lane) {
  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(v, lane, 2);
  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 2);
  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 2);
  // Add the rdmlsh rounding constant explicitly, then narrow with truncation
  // (RDN) and saturation.
  vint64m2_t bc_round = __riscv_vadd_vx_i64m2(bc_mulx2, ((int64_t)1 << 31) - 1, 2);
  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_round, 32, __RISCV_VXRM_RDN, 2);
  return __riscv_vssub_vv_i32m1(a, bc_s, 2);
}

// FORCE_INLINE int32x2_t vqrdmlsh_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v, const int lane);

// FORCE_INLINE int32x4_t vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v, const int lane);
// vqrdmlshq_laneq_s32: saturating rounding doubling multiply-subtract,
// 4 x i32, multiplier taken from lane `lane` of the 4-element vector v.
// Arm SQRDMLSH computes sat32(((a << 32) - 2*b*c + (1 << 31)) >> 32), i.e.
// a -s ((2*b*c + (1 << 31) - 1) >> 32) with a truncating shift. RNU narrowing
// (+1 << 31), as used for vqrdmlah, is off by one whenever the low 32 bits of
// 2*b*c equal 0x80000000.
FORCE_INLINE int32x4_t vqrdmlshq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v, const int lane) {
  vint32m1_t c_dup = __riscv_vrgather_vx_i32m1(v, lane, 4);
  vint64m2_t bc_mul = __riscv_vwmul_vv_i64m2(b, c_dup, 4);
  vint64m2_t bc_mulx2 = __riscv_vsll_vx_i64m2(bc_mul, 1, 4);
  // Add the rdmlsh rounding constant explicitly, then narrow with truncation
  // (RDN) and saturation.
  vint64m2_t bc_round = __riscv_vadd_vx_i64m2(bc_mulx2, ((int64_t)1 << 31) - 1, 4);
  vint32m1_t bc_s = __riscv_vnclip_wx_i32m1(bc_round, 32, __RISCV_VXRM_RDN, 4);
  return __riscv_vssub_vv_i32m1(a, bc_s, 4);
}

// FORCE_INLINE int16_t vqrdmlahh_s16(int16_t a, int16_t b, int16_t c);

Expand Down
4 changes: 2 additions & 2 deletions tests/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -202,12 +202,12 @@ TEST_SATURATE_ADD_SUB(64)
} \
static inline int##CBIT##_t sat_rdmlah(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
int##HBIT##_t tmp = sat_dmull(b, c); \
tmp = sat_add(tmp, (int##HBIT##_t)(1 << (CBIT - 1))); \
tmp = sat_add(tmp, (int##HBIT##_t)((int##HBIT##_t)1 << (CBIT - 1))); \
return sat_add(a, (int##CBIT##_t)(tmp >> CBIT)); \
} \
static inline int##CBIT##_t sat_rdmlsh(int##CBIT##_t a, int##CBIT##_t b, int##CBIT##_t c) { \
int##HBIT##_t tmp = sat_dmull(b, c); \
tmp = sat_sub(tmp, (int##HBIT##_t)(1 << (CBIT - 1))); \
tmp = sat_add(tmp, (int##HBIT##_t)(((int##HBIT##_t)1 << (CBIT - 1)) - 1)); \
return sat_sub(a, (int##CBIT##_t)(tmp >> CBIT)); \
}
TEST_SATURATE_DMUL(8, 16)
Expand Down
Loading
Loading