From 48f3192dc0937e7f1a8010a3355f595320f2fe51 Mon Sep 17 00:00:00 2001 From: yxy Date: Mon, 5 Feb 2024 17:41:37 +0000 Subject: [PATCH 01/17] add float2int8 --- src/layer/riscv/riscv_usability.h | 86 +++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e2824646f871..f63984d21fc9 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -19,6 +19,14 @@ #include #endif // __riscv_vector +static inline signed char float2int8(float v) +{ + int int32 = (int)roundf(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} + #if __riscv_vector static inline int csrr_vl() { @@ -50,6 +58,84 @@ static inline int csrr_vlenb() return a; } +static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); + vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); + + vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); + vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + + // combine _vlow32 and _vhigh32 to a single vint32m2_t + vint32m2_t _v32 = vundefined_i32m2(); + _v32 = vset_v_i32m1_i32m2 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m2 (_v32, 1, _vhigh32); + + vint16m1_t _v16 = vnclip_wx_i16m1(_v32, 0, vl); + vint16m2_t _v16_2 = vundefined_i16m2(); + _v16_2 = vset_v_i16m1_i16m2 (_v16_2, 0, _v16); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int64_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); + vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); + vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); + vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + + vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); + vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); 
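+    // each _p5sN is 0.5 carrying the sign of _vN; adding it here turns the assumed
+    // round-to-zero vfcvt below into round-half-away-from-zero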
+ vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); + vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + return _v8; +} + + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; From b83e64ee04562b2fac34797556e8169f05236b8f Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 02:44:16 +0000 Subject: [PATCH 02/17] pass compile --- src/layer/riscv/riscv_usability.h | 117 ++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 7 deletions(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index f63984d21fc9..803c78756a6b 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -58,6 +58,7 @@ static inline int csrr_vlenb() return a; } + static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); @@ -79,20 +80,46 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t - vint32m2_t _v32 = vundefined_i32m2(); - _v32 = vset_v_i32m1_i32m2 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m2 (_v32, 1, _vhigh32); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); - vint16m1_t _v16 = vnclip_wx_i16m1(_v32, 0, vl); - vint16m2_t _v16_2 = vundefined_i16m2(); - _v16_2 = vset_v_i16m1_i16m2 (_v16_2, 0, _v16); - vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); int64_t _ret; vse8_v_i8m1((int8_t *)&_ret, _v8, vl); return _ret; } +static inline int32_t float2int8(vfloat32m1_t _v) { + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); + + vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int32_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + + static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); @@ -135,6 +162,82 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m return _v8; } +static inline int64_t 
float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); + vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); + + vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); + vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + + // combine _vlow32 and _vhigh32 to a single vint32m2_t + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + + vint16m2_t _v16_2 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); + vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); + vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); + vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + + vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); + vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); + vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); + vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + return _v8; +} + + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { From 648d56645ac621880ed39754a608e129c9e2fde2 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Tue, 6 Feb 2024 05:49:26 +0000 
Subject: [PATCH 03/17] apply code-format changes --- src/layer/riscv/riscv_usability.h | 44 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 803c78756a6b..153299f0a549 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -58,7 +58,6 @@ static inline int csrr_vlenb() return a; } - static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); @@ -78,21 +77,22 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); - + // combine _vlow32 and _vhigh32 to a single vint32m2_t vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); int64_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } -static inline int32_t float2int8(vfloat32m1_t _v) { +static inline int32_t float2int8(vfloat32m1_t _v) +{ int vl = vsetvlmax_e32m1(); // _MM_ROUND_NEAREST round to even // simulate round to nearest via +/-0.5 with round to zero @@ -107,19 +107,17 @@ static inline int32_t float2int8(vfloat32m1_t _v) { vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); - vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); int32_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } - static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); @@ -151,10 +149,10 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); @@ -181,17 +179,17 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); - + // combine _vlow32 and _vhigh32 to a single vint32m2_t vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); vint16m2_t _v16_2 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); _v8 = vmax_vx_i8m1(_v8, 0, vl); int64_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } @@ 
-226,10 +224,10 @@ static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); @@ -237,8 +235,6 @@ static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa return _v8; } - - static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; From 3efeab172817e3d6eb3c5915fc549accf11b248a Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 08:34:36 +0000 Subject: [PATCH 04/17] add float2int8leakyrelu --- src/layer/riscv/riscv_usability.h | 176 +++++++++++++++++++++++++++++- 1 file changed, 175 insertions(+), 1 deletion(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 803c78756a6b..49b620325fb4 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -80,6 +80,7 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t + vl = 2 * vsetvlmax_e32m1(); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); @@ -107,7 +108,6 @@ static inline int32_t float2int8(vfloat32m1_t _v) { vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); - vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); @@ -183,6 +183,7 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t + vl = 2 * vsetvlmax_e32m1(); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); @@ -195,6 +196,32 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) return _ret; } +static inline int32_t float2int8relu(vfloat32m1_t _v) { + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); + + vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int32_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); @@ -237,6 +264,153 @@ static inline vint8m1_t 
float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa return _v8; } +static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _vlow_leaky = vsmul_vv_f32m1(_vlow, _slope, vl); + vfloat32m1_t _vhigh_leaky = vsmul_vv_f32m1(_vhigh, _slope, vl); + + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + + vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); + vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); + + vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow_leaky, _p5low, vl); + vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh_leaky, _p5high, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + + vint32m1_t _signlow_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow_leaky), _signmask, vl); + vint32m1_t _signhigh_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh_leaky), _signmask, vl); + + vfloat32m1_t _p5low_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5_leaky = vfadd_vv_f32m1(_vlow_leaky, _p5low_leaky, vl); + vfloat32m1_t _vhigh5_leaky = vfadd_vv_f32m1(_vhigh_leaky, _p5high_leaky, vl); + + vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow5_leaky, vl); + vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh5_leaky, vl); + + vl = 2 * vsetvlmax_e32m1(); + + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _vlow32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _vhigh32_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int64_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) { + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v_leaky = vsmul_vv_f32m1(_v, _slope, vl); + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); + + vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + + vint32m1_t _sign_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v_leaky), _signmask, vl); + vfloat32m1_t _p5s_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5_leaky = vfadd_vv_f32m1(_v_leaky, _p5s_leaky, vl); + vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v5_leaky, vl); + + // vl = 
vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v32m1_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int32_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v0_leaky = vsmul_vv_f32m1(_v0, _slope, vl); + vfloat32m1_t _v1_leaky = vsmul_vv_f32m1(_v1, _slope, vl); + vfloat32m1_t _v2_leaky = vsmul_vv_f32m1(_v2, _slope, vl); + vfloat32m1_t _v3_leaky = vsmul_vv_f32m1(_v3, _slope, vl); + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); + vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); + vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); + vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + + vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0_leaky, _p5s0, vl); + vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1_leaky, _p5s1, vl); + vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2_leaky, _p5s2, vl); + vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3_leaky, _p5s3, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + + vl = vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32); + + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + return _v8; +} static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) From c9c545113a6b0fa0ac8f94dd3d5cb30a0c85f17f Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 09:23:34 +0000 Subject: [PATCH 05/17] add quantize_riscv but has bug --- src/layer/riscv/quantize_riscv.cpp | 566 +++++++++++++++++++++++++++++ src/layer/riscv/quantize_riscv.h | 32 ++ src/layer/riscv/riscv_usability.h | 14 +- 3 files changed, 605 insertions(+), 7 
deletions(-) create mode 100644 src/layer/riscv/quantize_riscv.cpp create mode 100644 src/layer/riscv/quantize_riscv.h diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp new file mode 100644 index 000000000000..df429d9e18e9 --- /dev/null +++ b/src/layer/riscv/quantize_riscv.cpp @@ -0,0 +1,566 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +#include "cpu.h" + +namespace ncnn { + +Quantize_riscv::Quantize_riscv() +{ + fprintf(stderr, "Quantize_riscv init\n"); +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl = vsetvlmax_e32m1(); + int elembits = bottom_blob.elembits(); + + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale_data[0]); + // float32x4_t _scale = vdupq_n_f32(scale_data[0]); + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + // float32x4_t _vlow = vld1q_f32(ptr0); + // float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale); + // _vhigh = vmulq_f32(_vhigh, _scale); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); + // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + // float32x4_t _vlow = vld1q_f32(ptr0); + // float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale0); + // _vhigh = vmulq_f32(_vhigh, _scale1); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float 
s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + // float32x4_t _scale = vdupq_n_f32(scale_data[0]); + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr0 + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr1, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr1 + 4, vl); + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v2, _v1, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + // float32x4_t _v0 = vld1q_f32(ptr0); + // float32x4_t _v1 = vld1q_f32(ptr0 + 4); + // float32x4_t _v2 = vld1q_f32(ptr1); + // float32x4_t _v3 = vld1q_f32(ptr1 + 4); + // _v0 = vmulq_f32(_v0, _scale); + // _v1 = vmulq_f32(_v1, _scale); + // _v2 = vmulq_f32(_v2, _scale); + // _v3 = vmulq_f32(_v3, _scale); + // vst1_s8(outptr, float2int8(_v0, _v2)); + // vst1_s8(outptr + 8, float2int8(_v1, _v3)); + + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + // float32x4_t _vlow = vld1q_f32(ptr0); + // float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale); + // _vhigh = vmulq_f32(_vhigh, _scale); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8); + // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + // float32x4_t _vlow = vld1q_f32(ptr0); + // 
float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale0); + // _vhigh = vmulq_f32(_vhigh, _scale1); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale); + // float32x4_t _scale = vdupq_n_f32(scale); + float _scale = scale; + + for (; i + 15 < size; i += 16) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr + 8, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr + 12, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v1, _v2, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + + // float32x4_t _v0 = vld1q_f32(ptr); + // float32x4_t _v1 = vld1q_f32(ptr + 4); + // float32x4_t _v2 = vld1q_f32(ptr + 8); + // float32x4_t _v3 = vld1q_f32(ptr + 12); + // _v0 = vmulq_f32(_v0, _scale); + // _v1 = vmulq_f32(_v1, _scale); + // _v2 = vmulq_f32(_v2, _scale); + // _v3 = vmulq_f32(_v3, _scale); + // vst1_s8(outptr, float2int8(_v0, _v1)); + // vst1_s8(outptr + 8, float2int8(_v2, _v3)); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + + int64_t _v = float2int8(_v0, _v1); + *(int64_t*)outptr = _v; + // float32x4_t _v0 = vld1q_f32(ptr); + // float32x4_t _v1 = vld1q_f32(ptr + 4); + // _v0 = vmulq_f32(_v0, _scale); + // _v1 = vmulq_f32(_v1, _scale); + // int8x8_t _v = float2int8(_v0, _v1); + // vst1_s8(outptr, _v); + + ptr += 8; + outptr += 8; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h new file mode 100644 index 000000000000..128aca97e640 --- /dev/null +++ b/src/layer/riscv/quantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
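+// Quantize_riscv: RISC-V vector (RVV) implementation of the int8 Quantize layer,
+// built on the float2int8 helpers from riscv_usability.h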
+ +#ifndef LAYER_QUANTIZE_RISCV_H +#define LAYER_QUANTIZE_RISCV_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_riscv : public Quantize +{ +public: + Quantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_RISCV_H diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 8f1fe5aaad88..efe639b2e3fd 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -266,8 +266,8 @@ static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh, vfloat32m1_t _slope) { int vl = vsetvlmax_e32m1(); - vfloat32m1_t _vlow_leaky = vsmul_vv_f32m1(_vlow, _slope, vl); - vfloat32m1_t _vhigh_leaky = vsmul_vv_f32m1(_vhigh, _slope, vl); + vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); + vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); // simulate round to nearest via +/-0.5 with round to zero vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); @@ -322,7 +322,7 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) { int vl = vsetvlmax_e32m1(); - vfloat32m1_t _v_leaky = vsmul_vv_f32m1(_v, _slope, vl); + vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); int32_t _signmask = 1 << 31; @@ -361,10 +361,10 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3, vfloat32m1_t _slope) { int vl = vsetvlmax_e32m1(); - vfloat32m1_t _v0_leaky = vsmul_vv_f32m1(_v0, _slope, vl); - vfloat32m1_t _v1_leaky = vsmul_vv_f32m1(_v1, _slope, vl); - vfloat32m1_t _v2_leaky = vsmul_vv_f32m1(_v2, _slope, vl); - vfloat32m1_t _v3_leaky = vsmul_vv_f32m1(_v3, _slope, vl); + vfloat32m1_t _v0_leaky = vfmul_vv_f32m1(_v0, _slope, vl); + vfloat32m1_t _v1_leaky = vfmul_vv_f32m1(_v1, _slope, vl); + vfloat32m1_t _v2_leaky = vfmul_vv_f32m1(_v2, _slope, vl); + vfloat32m1_t _v3_leaky = vfmul_vv_f32m1(_v3, _slope, vl); vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); int32_t _signmask = 1 << 31; From 88e0910944c54acbb909ce244e4265f7a6844ee4 Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 16:15:00 +0000 Subject: [PATCH 06/17] fix bug --- src/layer/riscv/quantize_riscv.cpp | 67 --------------------- src/layer/riscv/riscv_usability.h | 93 ++++++++++++++---------------- 2 files changed, 42 insertions(+), 118 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index df429d9e18e9..9ea8030dfde0 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -27,7 +27,6 @@ namespace ncnn { Quantize_riscv::Quantize_riscv() { - fprintf(stderr, "Quantize_riscv init\n"); #if __riscv_vector support_packing = true; #endif // __riscv_vector @@ -120,12 +119,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); int64_t _v = float2int8(_vlow, _vhigh); - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale); - // _vhigh = vmulq_f32(_vhigh, _scale); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); *(int64_t*)outptr = _v; ptr0 += 4; @@ 
-146,9 +139,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); - // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - for (int j = 0; j < w; j++) { vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); @@ -157,13 +147,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale0); - // _vhigh = vmulq_f32(_vhigh, _scale1); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); - ptr0 += 4; ptr1 += 4; outptr += 8; @@ -252,7 +235,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { if (scale_data_size == 1) { - // float32x4_t _scale = vdupq_n_f32(scale_data[0]); float _scale = scale_data[0]; #pragma omp parallel for num_threads(opt.num_threads) @@ -276,17 +258,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vint8m1_t _v = float2int8(_v0, _v2, _v1, _v3); vse8_v_i8m1(outptr, _v, 4 * vl); - // float32x4_t _v0 = vld1q_f32(ptr0); - // float32x4_t _v1 = vld1q_f32(ptr0 + 4); - // float32x4_t _v2 = vld1q_f32(ptr1); - // float32x4_t _v3 = vld1q_f32(ptr1 + 4); - // _v0 = vmulq_f32(_v0, _scale); - // _v1 = vmulq_f32(_v1, _scale); - // _v2 = vmulq_f32(_v2, _scale); - // _v3 = vmulq_f32(_v3, _scale); - // vst1_s8(outptr, float2int8(_v0, _v2)); - // vst1_s8(outptr + 8, float2int8(_v1, _v3)); - ptr0 += 8; ptr1 += 8; outptr += 16; @@ -301,13 +272,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; - - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale); - // _vhigh = vmulq_f32(_vhigh, _scale); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); ptr0 += 4; ptr1 += 4; outptr += 8; @@ -326,9 +290,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); - // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8); - // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4); - int i = 0; for (; i < size; i++) { @@ -340,14 +301,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; - - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale0); - // _vhigh = vmulq_f32(_vhigh, _scale1); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); - ptr0 += 4; ptr1 += 4; outptr += 8; @@ -499,8 +452,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int i = 0; #if __riscv_vector - // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale); - // float32x4_t _scale = vdupq_n_f32(scale); float _scale = scale; for (; i + 15 < size; i += 16) @@ -518,17 +469,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, 
const Option& vint8m1_t _v = float2int8(_v0, _v1, _v2, _v3); vse8_v_i8m1(outptr, _v, 4 * vl); - // float32x4_t _v0 = vld1q_f32(ptr); - // float32x4_t _v1 = vld1q_f32(ptr + 4); - // float32x4_t _v2 = vld1q_f32(ptr + 8); - // float32x4_t _v3 = vld1q_f32(ptr + 12); - // _v0 = vmulq_f32(_v0, _scale); - // _v1 = vmulq_f32(_v1, _scale); - // _v2 = vmulq_f32(_v2, _scale); - // _v3 = vmulq_f32(_v3, _scale); - // vst1_s8(outptr, float2int8(_v0, _v1)); - // vst1_s8(outptr + 8, float2int8(_v2, _v3)); - ptr += 16; outptr += 16; } @@ -542,13 +482,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int64_t _v = float2int8(_v0, _v1); *(int64_t*)outptr = _v; - // float32x4_t _v0 = vld1q_f32(ptr); - // float32x4_t _v1 = vld1q_f32(ptr + 4); - // _v0 = vmulq_f32(_v0, _scale); - // _v1 = vmulq_f32(_v1, _scale); - // int8x8_t _v = float2int8(_v0, _v1); - // vst1_s8(outptr, _v); - ptr += 8; outptr += 8; } diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index efe639b2e3fd..9075ba9e667c 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -61,22 +61,8 @@ static inline int csrr_vlenb() static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); - vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); - - vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); - vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); - - vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); - vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t vl = 2 * vsetvlmax_e32m1(); @@ -95,18 +81,7 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) static inline int32_t float2int8(vfloat32m1_t _v) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); - - vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); - vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); @@ -119,33 +94,47 @@ static inline int32_t float2int8(vfloat32m1_t _v) return _ret; } -static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +static void print_vint8m1(vint8m1_t _v) { - int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - 
vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); - vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); - vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); - vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v, 16); + for (int i = 0; i < 16; i++) + { + fprintf(stderr, "i8[%d]: %d\n", i, i8[i]); + } + free(i8); +} - vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); +static void print_vint32m1(vint32m1_t _v) +{ + int32_t *i32 = (int32_t *)malloc(4 * sizeof(int32_t)); + vse32_v_i32m1(i32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} - vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); - vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); - vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); - vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); +static void print_vfloat32m1(vfloat32m1_t _v) +{ + float *f32 = (float *)malloc(4 * sizeof(float)); + vse32_v_f32m1(f32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "f32[%d]: %f\n", i, f32[i]); + } + free(f32); +} - vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); - vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); - vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); - vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); +static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); vl = vsetvlmax_e32m4(); @@ -158,6 +147,8 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); + int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v8, 16); return _v8; } From c3d77e0a89fe3d557d905385407d56524e10d0d0 Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 16:30:46 +0000 Subject: [PATCH 07/17] fix bug of float2int8relu --- src/layer/riscv/riscv_usability.h | 142 ++++++------------------------ 1 file changed, 25 insertions(+), 117 deletions(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 9075ba9e667c..1ae86f767553 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -155,24 +155,10 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = 
vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); - vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); - vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); - vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); - - vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); - vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); - // combine _vlow32 and _vhigh32 to a single vint32m2_t vl = 2 * vsetvlmax_e32m1(); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); @@ -188,18 +174,7 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) static inline int32_t float2int8relu(vfloat32m1_t _v) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); - - vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); - vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); @@ -215,30 +190,10 @@ static inline int32_t float2int8relu(vfloat32m1_t _v) { static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); - vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); - vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); - vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); - - vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); - vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); - vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); - vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); - - vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); - vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); - vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); - vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = 
vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); vl = vsetvlmax_e32m4(); @@ -260,34 +215,11 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - - vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); - vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); - - vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow_leaky, _p5low, vl); - vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh_leaky, _p5high, vl); - - vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); - vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); - - vint32m1_t _signlow_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow_leaky), _signmask, vl); - vint32m1_t _signhigh_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh_leaky), _signmask, vl); - - vfloat32m1_t _p5low_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5_leaky = vfadd_vv_f32m1(_vlow_leaky, _p5low_leaky, vl); - vfloat32m1_t _vhigh5_leaky = vfadd_vv_f32m1(_vhigh_leaky, _p5high_leaky, vl); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); - vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow5_leaky, vl); - vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh5_leaky, vl); + vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl); + vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl); vl = 2 * vsetvlmax_e32m1(); @@ -315,20 +247,8 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) int vl = vsetvlmax_e32m1(); vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); - - vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); - vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); - - vint32m1_t _sign_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v_leaky), _signmask, vl); - vfloat32m1_t _p5s_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5_leaky = vfadd_vv_f32m1(_v_leaky, _p5s_leaky, vl); - vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v5_leaky, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v_leaky, vl); // vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); @@ -357,27 +277,15 @@ static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2_leaky = vfmul_vv_f32m1(_v2, _slope, vl); vfloat32m1_t _v3_leaky = vfmul_vv_f32m1(_v3, _slope, 
vl); - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); - vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); - vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); - vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); - - vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0_leaky, _p5s0, vl); - vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1_leaky, _p5s1, vl); - vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2_leaky, _p5s2, vl); - vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3_leaky, _p5s3, vl); + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); - vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); - vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); - vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); - vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + vint32m1_t _v0_32_leaky = vfcvt_x_f_v_i32m1(_v0_leaky, vl); + vint32m1_t _v1_32_leaky = vfcvt_x_f_v_i32m1(_v1_leaky, vl); + vint32m1_t _v2_32_leaky = vfcvt_x_f_v_i32m1(_v2_leaky, vl); + vint32m1_t _v3_32_leaky = vfcvt_x_f_v_i32m1(_v3_leaky, vl); vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); @@ -390,10 +298,10 @@ static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); vint32m4_t _v32_leaky = vundefined_i32m4(); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32_leaky); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); From 05fdab0add27b5e42fedae7e8290073e2c395792 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Tue, 6 Feb 2024 16:32:08 +0000 Subject: [PATCH 08/17] apply code-format changes --- src/layer/riscv/quantize_riscv.cpp | 2 +- src/layer/riscv/riscv_usability.h | 57 +++++++++++++++--------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 9ea8030dfde0..a459688227b9 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -117,7 +117,7 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); - + int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; diff --git a/src/layer/riscv/riscv_usability.h 
b/src/layer/riscv/riscv_usability.h index 1ae86f767553..cbbfded60a91 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -94,9 +94,9 @@ static inline int32_t float2int8(vfloat32m1_t _v) return _ret; } -static void print_vint8m1(vint8m1_t _v) +static void print_vint8m1(vint8m1_t _v) { - int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); vse8_v_i8m1(i8, _v, 16); for (int i = 0; i < 16; i++) { @@ -107,7 +107,7 @@ static void print_vint8m1(vint8m1_t _v) static void print_vint32m1(vint32m1_t _v) { - int32_t *i32 = (int32_t *)malloc(4 * sizeof(int32_t)); + int32_t* i32 = (int32_t*)malloc(4 * sizeof(int32_t)); vse32_v_i32m1(i32, _v, 4); for (int i = 0; i < 4; i++) { @@ -118,7 +118,7 @@ static void print_vint32m1(vint32m1_t _v) static void print_vfloat32m1(vfloat32m1_t _v) { - float *f32 = (float *)malloc(4 * sizeof(float)); + float* f32 = (float*)malloc(4 * sizeof(float)); vse32_v_f32m1(f32, _v, 4); for (int i = 0; i < 4; i++) { @@ -147,7 +147,7 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); - int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); vse8_v_i8m1(i8, _v8, 16); return _v8; } @@ -172,18 +172,19 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) return _ret; } -static inline int32_t float2int8relu(vfloat32m1_t _v) { +static inline int32_t float2int8relu(vfloat32m1_t _v) +{ int vl = vsetvlmax_e32m1(); vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, 0, vl); int32_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } @@ -225,11 +226,11 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig vint32m4_t _v32 = vundefined_i32m4(); vint32m4_t _v32_leaky = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _vlow32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _vhigh32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _vlow32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _vhigh32_leaky); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); @@ -239,11 +240,12 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); int64_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } -static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) { +static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) +{ int vl = vsetvlmax_e32m1(); vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); @@ -253,9 +255,9 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) // vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); vint32m4_t _v32_leaky = vundefined_i32m4(); - - _v32 = 
vset_v_i32m1_i32m4 (_v32, 0, _v32m1); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v32m1_leaky); + + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v32m1_leaky); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); @@ -265,7 +267,7 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); int32_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } @@ -289,27 +291,26 @@ static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); vint32m4_t _v32_leaky = vundefined_i32m4(); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v0_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _v1_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 2, _v2_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 3, _v3_32_leaky); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); return _v8; -} - +} static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { From 77ebe76285838dec5fca0e9196d8f13ae545e52b Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sat, 10 Feb 2024 13:51:34 +0800 Subject: [PATCH 09/17] Add quantize fp16 (#6) * finish quantize fp16 to int8 * fix quantize bug * apply code-format changes --------- Co-authored-by: Xinyu302 --- src/layer/riscv/quantize_riscv.cpp | 863 ++++++++++++++++++++++++++++- src/layer/riscv/quantize_riscv.h | 5 + src/layer/riscv/riscv_usability.h | 16 + tests/testutil.cpp | 4 +- 4 files changed, 885 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index a459688227b9..91004baebe0a 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -29,14 +29,27 @@ Quantize_riscv::Quantize_riscv() { #if __riscv_vector support_packing = true; + +#if __riscv_zfh + support_fp16_storage = true; +#endif // __riscv_zfh #endif // __riscv_vector } int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int vl = vsetvlmax_e32m1(); int elembits = bottom_blob.elembits(); +#if __riscv_vector && __riscv_zfh + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_fp16sa(bottom_blob, top_blob, opt); + else + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif // __riscv_vector && __riscv_zfh + int vl = vsetvlmax_e32m1(); int dims = bottom_blob.dims; int elempack = bottom_blob.elempack; @@ -496,4 +509,852 @@ int Quantize_riscv::forward(const Mat& 
bottom_blob, Mat& top_blob, const Option& return 0; } +#if __riscv_vector && __riscv_zfh + +int Quantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale); + outptr[1] = float2int8((float)ptr0[1] * scale); + outptr[2] = float2int8((float)ptr0[2] * scale); + outptr[3] = float2int8((float)ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8((float)ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8((float)ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8((float)ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + vl = 4; + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { 
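+                // note: this branch unpacks elempack-4 fp16 rows into four plain int8 rows,
+                // i.e. packed element k of input row i is written to output row i * 4 + k,
+                // each value quantized as float2int8((float)value * its scale)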
+ if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + 
*(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8((float)*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8((float)*ptr++ * scale); + } + } + } + + return 0; +} + +int Quantize_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); + + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); // float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4))); + + for (int j = 0; j < w; j++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = 
bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + q * 8, vl), vl); + + for (int i = 0; i < size; i++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * elempack; + + top_blob.create(outw, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * (__fp16)scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * (__fp16)scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * (__fp16)scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * (__fp16)scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * elempack; + + top_blob.create(w, outh, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const __fp16 s0 = scale_data[i * 4]; + const __fp16 s1 = scale_data[i * 4 + 1]; + const __fp16 s2 = scale_data[i * 4 + 2]; + const __fp16 s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * elempack; + + top_blob.create(w, h, outc, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if 
(scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const __fp16 s0 = scale_data[q * 4]; + const __fp16 s1 = scale_data[q * 4 + 1]; + const __fp16 s2 = scale_data[q * 4 + 2]; + const __fp16 s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * (__fp16)scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const __fp16 scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +#endif // __riscv_vector && __riscv_zfh + } // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h index 128aca97e640..0eb90aed7e5b 100644 --- a/src/layer/riscv/quantize_riscv.h +++ b/src/layer/riscv/quantize_riscv.h @@ -25,6 +25,11 @@ class Quantize_riscv : public Quantize Quantize_riscv(); virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +#if __riscv_vector && __riscv_zfh + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif // __riscv_vector && __riscv_zfh }; } // namespace ncnn diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index cbbfded60a91..ad5fadea5701 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -58,6 +58,22 @@ static inline int csrr_vlenb() return a; } +#if __riscv_zfh +static inline int64_t float2int8(vfloat16m1_t _v) +{ + int vl = vsetvlmax_e16m1(); + vint16m2_t _v16 = vundefined_i16m2(); + _v16 = vset_v_i16m1_i16m2(_v16, 0, vfcvt_x_f_v_i16m1(_v, vl)); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; + // int16x8_t _v16 = vcvtaq_s16_f16(_v); + // int8x8_t _v8 = vqmovn_s16(_v16); + // return vmax_s8(_v8, vdup_n_s8(-127)); +} +#endif // __riscv_zfh + static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); diff --git a/tests/testutil.cpp b/tests/testutil.cpp index f0bf3c51a20d..2f01b32c8a96 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -523,7 +523,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector Date: Sat, 10 Feb 2024 17:09:11 +0800 Subject: [PATCH 10/17] Add dequantize (#3) * add-deq * apply code-format changes * ongoing dequantize * apply code-format changes * finish dequantize_riscv, but has bug * fix vset bug * fix bug * delete debug info * apply code-format changes * refine dequantize --------- Co-authored-by: Xinyu302 --- src/layer/riscv/dequantize_riscv.cpp | 891 +++++++++++++++++++++++++++ src/layer/riscv/dequantize_riscv.h | 32 + 2 files changed, 923 insertions(+) create mode 100644 src/layer/riscv/dequantize_riscv.cpp create mode 100644 src/layer/riscv/dequantize_riscv.h diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp new file mode 100644 index 000000000000..a0ccb758120b --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -0,0 +1,891 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
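A note on the vector code in the new dequantize kernels that follow: they all share one strip-mining idiom. Pick a vector length with vsetvl, convert a chunk of int32 to float, multiply by the scale (or fuse scale and bias with vfmadd), store, then advance by vl until the buffer is exhausted. Below is a minimal sketch of that idiom, not part of the patch, assuming <riscv_vector.h> and the pre-1.0 non-overloaded intrinsic names used throughout this series; the helper name dequantize_flat and the single-scale, no-bias simplification are illustrative only.

#include <riscv_vector.h>

// Illustrative sketch (not from the patch): dequantize n int32 values to float
// with a single scale, processing vl elements per iteration.
static void dequantize_flat(const int* intptr, float* ptr, int n, float scale)
{
    int offset = 0;
    while (n > 0)
    {
        int vl = vsetvl_e32m8(n);                    // elements handled this pass
        vint32m8_t _i = vle32_v_i32m8(intptr + offset, vl);
        vfloat32m8_t _v = vfcvt_f_x_v_f32m8(_i, vl); // int32 -> float32
        _v = vfmul_vf_f32m8(_v, scale, vl);          // apply dequantize scale
        vse32_v_f32m8(ptr + offset, _v, vl);
        offset += vl;
        n -= vl;
    }
}

The per-channel and bias-carrying branches in the patch differ only in loading _scale and _bias as vectors and replacing the vfmul with vfmadd_vv_f32m8.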
+ +#include "dequantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" +#include "cpu.h" + +namespace ncnn { + +Dequantize_riscv::Dequantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vf_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const 
float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + vl = 4; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? 
vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m1(_v2, _scale0, vl); + _v3 = vfmul_vv_f32m1(_v3, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + vl = 4; + + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + _v2 = vfmadd_vv_f32m1(_v2, _scale0, _bias0, vl); + _v3 = vfmadd_vv_f32m1(_v3, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + if (bias_data_size == 0) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + 
int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + int n = w * 4; + int offset = 0; + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + int n = size * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = size * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale; \ +// } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale + bias; \ +// } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/dequantize_riscv.h b/src/layer/riscv/dequantize_riscv.h new file mode 100644 index 000000000000..be7bc6bc90c5 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_RISCV_H +#define LAYER_DEQUANTIZE_RISCV_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_riscv : public Dequantize +{ +public: + Dequantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_RISCV_H From f0779b8ff4662459b2cc4daadbcbafbccf34fb6d Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sat, 10 Feb 2024 17:09:48 +0800 Subject: [PATCH 11/17] Add innerproduct (#5) * add int innerproduct * apply code-format changes * finish innerproduct, but not tested yet * apply code-format changes * bug because no fp16 quantize * change flatten to make it right * pass test * delete useless code * apply code-format changes * delete useless code --------- Co-authored-by: Xinyu302 --- src/layer/riscv/flatten_riscv.cpp | 57 ++- src/layer/riscv/innerproduct_riscv.cpp | 532 ++++++++++++++++++++++++- src/layer/riscv/innerproduct_riscv.h | 9 + src/layer/riscv/riscv_usability.h | 28 +- 4 files changed, 595 insertions(+), 31 deletions(-) diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 491c051c7fea..e10bcb86bd3c 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -348,7 +348,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; #endif int w = bottom_blob.w; @@ -365,7 +365,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (opt.use_packing_layout) { - out_elempack = total % packn == 0 ? packn : 1; + out_elempack = total % 8 == 0 ? 
8 : 1; } #endif size_t out_elemsize = elemsize / elempack * out_elempack; @@ -394,7 +394,29 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 2) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + // if (elempack == packn) // out_elempack == packn + // { + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int i = 0; i < h; i++) + // { + // const signed char* ptr = bottom_blob.row(i); + // signed char* outptr = (signed char*)top_blob + w * i * packn; + + // int n = w * elempack; + // while (n > 0) + // { + // size_t vl = vsetvl_e8m1(n); + + // vint8m1_t _p = vle8_v_i8m1(ptr, vl); + // vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); + + // ptr += vl; + // outptr += 1; + // n -= vl; + // } + // } + // } + if (elempack == 8) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) @@ -405,7 +427,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -422,7 +444,30 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 3 || dims == 4) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + // if (elempack == packn) // out_elempack == packn + // { + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int q = 0; q < channels; q++) + // { + // const signed char* ptr = bottom_blob.channel(q); + // signed char* outptr = (signed char*)top_blob + size * q * packn; + + // int n = size * elempack; + // while (n > 0) + // { + // size_t vl = vsetvl_e8m1(n); + + // vint8m1_t _p = vle8_v_i8m1(ptr, vl); + // vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); + + // ptr += vl; + // outptr += 1; + // n -= vl; + // } + // } + // } + + if (elempack == 8) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -433,7 +478,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index accfc683584f..20213c9951f0 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -53,7 +53,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -128,27 +128,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, 
bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return InnerProduct::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8(bottom_blob, top_blob, opt); } #endif @@ -1090,4 +1070,512 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int InnerProduct_riscv::create_pipeline_int8(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + weight_data.release(); + + return 0; +} + +int InnerProduct_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + +#if __riscv_vector + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + vl = 8; + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum1 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum2 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum3 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val0 = vmv_v_x_i8m1(m0[0], vl); + vint8m1_t _val1 = vmv_v_x_i8m1(m1[0], vl); + vint8m1_t _val2 = vmv_v_x_i8m1(m2[0], vl); + vint8m1_t _val3 = vmv_v_x_i8m1(m3[0], vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val0, _w, vl), 0); + vint16m1_t _s1 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val1, _w, vl), 0); + vint16m1_t _s2 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val2, _w, vl), 0); + vint16m1_t _s3 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val3, _w, vl), 0); + + _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m2(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m2(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m2(_sum3, _s3, vl); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32_0 = vfcvt_f_x_v_f32m2(_sum0, vl); + vfloat32m2_t _sumfp32_1 = vfcvt_f_x_v_f32m2(_sum1, vl); + vfloat32m2_t _sumfp32_2 = vfcvt_f_x_v_f32m2(_sum2, vl); + vfloat32m2_t _sumfp32_3 = vfcvt_f_x_v_f32m2(_sum3, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32_0 = vfmacc_vv_f32m2(_bias, _sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmacc_vv_f32m2(_bias, _sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmacc_vv_f32m2(_bias, _sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmacc_vv_f32m2(_bias, _sumfp32_3, _scale_in, vl); + } + else + { + _sumfp32_0 = vfmul_vv_f32m2(_sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmul_vv_f32m2(_sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmul_vv_f32m2(_sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmul_vv_f32m2(_sumfp32_3, _scale_in, vl); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params, vl); + _sumfp32_2 = activation_ps(_sumfp32_2, activation_type, activation_params, vl); + _sumfp32_3 = activation_ps(_sumfp32_3, activation_type, activation_params, vl); + + vsseg4e32_v_f32m2(outptr, _sumfp32_0, _sumfp32_1, _sumfp32_2, _sumfp32_3, vl); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + 
int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + + int n = num_input; + + vl = vsetvlmax_e32m4(); + vint32m4_t _sum0 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3 = vmv_v_x_i32m4(0, vl); + + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val0 = vle8_v_i8m1(m0, vl); + vint8m1_t _val1 = vle8_v_i8m1(m1, vl); + vint8m1_t _val2 = vle8_v_i8m1(m2, vl); + vint8m1_t _val3 = vle8_v_i8m1(m3, vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s0 = vwmul_vv_i16m2(_val0, _w, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_val1, _w, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_val2, _w, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_val3, _w, vl); + + _sum0 = vwadd_wv_i32m4(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m4(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m4(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m4(_sum3, _s3, vl); + + m0 += vl; + m1 += vl; + m2 += vl; + m3 += vl; + + kptr += vl; + n -= vl; + } + + vint32m1_t _sum0_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum1_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum2_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum3_scala = vmv_v_x_i32m1(0, vl); + + vl = vsetvlmax_e32m4(); + _sum0_scala = vredsum_vs_i32m4_i32m1(_sum0_scala, _sum0, _sum0_scala, vl); + _sum1_scala = vredsum_vs_i32m4_i32m1(_sum1_scala, _sum1, _sum1_scala, vl); + _sum2_scala = vredsum_vs_i32m4_i32m1(_sum2_scala, _sum2, _sum2_scala, vl); + _sum3_scala = vredsum_vs_i32m4_i32m1(_sum3_scala, _sum3, _sum3_scala, vl); + sum0 = vmv_x_s_i32m1_i32(_sum0_scala); + sum1 = vmv_x_s_i32m1_i32(_sum1_scala); + sum2 = vmv_x_s_i32m1_i32(_sum2_scala); + sum3 = vmv_x_s_i32m1_i32(_sum3_scala); + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + vint32m2_t _sum = vmv_v_x_i32m2(0, vl); + + int i = 0; + + vl = 8; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(m[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + // int8x8_t _val = vld1_dup_s8(m); + // int8x8_t _w = vld1_s8(kptr); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(_s, 0); + + _sum = vwadd_wv_i32m2(_sum, _s0, vl); + + m++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + 
} + + // _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m2(outptr, _sumfp32, vl); + outptr += 8; + } + } + } +#endif // __riscv_vector + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; +#if __riscv_vector + + int n = num_input; + vint32m4_t _sum = vmv_v_x_i32m4(0, vsetvlmax_e32m4()); + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val = vle8_v_i8m1(m, vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + _sum = vwadd_wv_i32m4(_sum, _s, vl); + + m += vl; + kptr += vl; + n -= vl; + } + + vint32m1_t _sum_scala = vmv_v_x_i32m1(0, vl); + _sum_scala = vredsum_vs_i32m4_i32m1(_sum_scala, _sum, _sum_scala, vl); + sum = vmv_x_s_i32m1_i32(_sum_scala); + +#endif // __riscv_vector + + // for (; i < num_input; i++) \ +// { \ +// sum += *m++ * *kptr++; \ +// } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_vector + if (out_elempack == 8) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + vl = 8; + + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(sptr[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum0 = vwadd_wv_i32m2(_sum0, _s, vl); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum0, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + } + + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + float* outptr = (float*)top_blob + p * 8; + vse32_v_f32m2(outptr, _sumfp32, vl); + } + } +#endif // __riscv_vector + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index d3056d5801d0..77a3b93de12a 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -36,6 +36,11 @@ class InnerProduct_riscv : public InnerProduct int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if __riscv_vector + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* flatten; @@ -43,6 +48,10 @@ class InnerProduct_riscv : public InnerProduct // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index ad5fadea5701..54d98834fb7c 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -110,11 +110,11 @@ static inline int32_t float2int8(vfloat32m1_t _v) return _ret; } -static void print_vint8m1(vint8m1_t _v) +static void print_vint8m1(vint8m1_t _v, int vl = 16) { int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); - vse8_v_i8m1(i8, _v, 16); - for (int i = 0; i < 16; i++) + vse8_v_i8m1(i8, _v, vl); + for (int i = 0; i < vl; i++) { fprintf(stderr, "i8[%d]: %d\n", i, i8[i]); } @@ -132,6 +132,28 @@ static void print_vint32m1(vint32m1_t _v) free(i32); } +static void print_vint32m2(vint32m2_t _v, 
int vl = 8) +{ + int32_t* i32 = (int32_t*)malloc(8 * sizeof(int32_t)); + vse32_v_i32m2(i32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} + +static void print_vfloat32m2(vfloat32m2_t _v, int vl = 8) +{ + float* f32 = (float*)malloc(8 * sizeof(float)); + vse32_v_f32m2(f32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "f32[%d]: %f\n", i, f32[i]); + } + free(f32); +} + static void print_vfloat32m1(vfloat32m1_t _v) { float* f32 = (float*)malloc(4 * sizeof(float)); From 38d5bb1ad6dd8f5c5c42a0180df4eaac9399e49f Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Wed, 14 Feb 2024 15:37:25 +0800 Subject: [PATCH 12/17] change int8 packn to 8 (#7) * change int8 packn to 8 * test_convert packing right --- src/layer/riscv/flatten_riscv.cpp | 53 +++---------------------------- src/layer/riscv/padding_riscv.cpp | 2 +- tests/testutil.cpp | 4 +-- 3 files changed, 7 insertions(+), 52 deletions(-) diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index e10bcb86bd3c..e6a97798a0fb 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -348,7 +348,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } #if __riscv_vector - const int packn = csrr_vlenb() / 2; + const int packn = csrr_vlenb() / 2; // packn should be 8 #endif int w = bottom_blob.w; @@ -365,7 +365,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (opt.use_packing_layout) { - out_elempack = total % 8 == 0 ? 8 : 1; + out_elempack = total % packn == 0 ? packn : 1; } #endif size_t out_elemsize = elemsize / elempack * out_elempack; @@ -394,29 +394,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 2) { #if __riscv_vector - // if (elempack == packn) // out_elempack == packn - // { - // #pragma omp parallel for num_threads(opt.num_threads) - // for (int i = 0; i < h; i++) - // { - // const signed char* ptr = bottom_blob.row(i); - // signed char* outptr = (signed char*)top_blob + w * i * packn; - - // int n = w * elempack; - // while (n > 0) - // { - // size_t vl = vsetvl_e8m1(n); - - // vint8m1_t _p = vle8_v_i8m1(ptr, vl); - // vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); - - // ptr += vl; - // outptr += 1; - // n -= vl; - // } - // } - // } - if (elempack == 8) // must add, because in innerproduct, elempack is 8 + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) @@ -444,30 +422,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 3 || dims == 4) { #if __riscv_vector - // if (elempack == packn) // out_elempack == packn - // { - // #pragma omp parallel for num_threads(opt.num_threads) - // for (int q = 0; q < channels; q++) - // { - // const signed char* ptr = bottom_blob.channel(q); - // signed char* outptr = (signed char*)top_blob + size * q * packn; - - // int n = size * elempack; - // while (n > 0) - // { - // size_t vl = vsetvl_e8m1(n); - - // vint8m1_t _p = vle8_v_i8m1(ptr, vl); - // vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); - - // ptr += vl; - // outptr += 1; - // n -= vl; - // } - // } - // } - - if (elempack == 8) // must add, because in innerproduct, elempack is 8 + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for 
num_threads(opt.num_threads) for (int q = 0; q < channels; q++) diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index 8f4b54da5904..2e2d7471f477 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -510,7 +510,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); #endif diff --git a/tests/testutil.cpp b/tests/testutil.cpp index 2f01b32c8a96..e271eafb4185 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -533,7 +533,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector Date: Wed, 14 Feb 2024 15:59:57 +0800 Subject: [PATCH 13/17] modify header (#8) --- src/layer/riscv/dequantize_riscv.cpp | 1 + src/layer/riscv/innerproduct_riscv.cpp | 1 + src/layer/riscv/quantize_riscv.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp index a0ccb758120b..4c3900e4ef15 100644 --- a/src/layer/riscv/dequantize_riscv.cpp +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2019 BUG1989. All rights reserved. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. // diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 20213c9951f0..08cddbc151f7 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 91004baebe0a..0430336d3ed6 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2019 BUG1989. All rights reserved. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. 
// From 639cf3d5a9e37b3beec762c702d5d3cc308fcaf8 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Wed, 14 Feb 2024 23:41:08 +0800 Subject: [PATCH 14/17] Fix pack (#10) * modify * debug * fix quantize and innerproduct * apply code-format changes --------- Co-authored-by: Xinyu302 --- src/layer/riscv/quantize_riscv.cpp | 170 ++++++++++++++++++++++++++++- tests/testutil.cpp | 4 +- 2 files changed, 171 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 0430336d3ed6..89d82106aa04 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -517,6 +517,174 @@ int Quantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const O int dims = bottom_blob.dims; int elempack = bottom_blob.elempack; int vl; + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + } + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + } + return 0; + } if (elempack == 4) { @@ -1007,7 +1175,7 @@ int Quantize_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const signed char* outptr0 = top_blob.row(i); vl = 8; - vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); // float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4))); + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); for (int j = 0; j < w; j++) { diff 
--git a/tests/testutil.cpp b/tests/testutil.cpp index e271eafb4185..96a7c903103f 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -523,7 +523,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector Date: Thu, 15 Feb 2024 15:49:00 +0800 Subject: [PATCH 15/17] modify padding using pack8 (#12) --- src/layer/riscv/padding_packn.h | 9 ++++++--- src/mat.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 50f5efe1216d..647ef1bb0b13 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -15,7 +15,8 @@ #define _PADDING_PACKN_RVV(SEW, TSEW, LMUL, T, VT) \ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -64,7 +65,8 @@ \ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -143,7 +145,8 @@ \ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ diff --git a/src/mat.h b/src/mat.h index fdf5cc597c4e..b6849565fe13 100644 --- a/src/mat.h +++ b/src/mat.h @@ -1120,7 +1120,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) { - const int packn = cpu_riscv_vlenb() / 1; + const int packn = cpu_riscv_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); int size = (int)total(); From da87ff652b7328091661fa03ef3f3535fb98a142 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Thu, 15 Feb 2024 07:52:41 +0000 Subject: [PATCH 16/17] apply code-format changes --- src/layer/riscv/padding_packn.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 647ef1bb0b13..60465da64ffc 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -15,7 +15,7 @@ #define _PADDING_PACKN_RVV(SEW, TSEW, LMUL, T, VT) \ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ @@ -65,7 +65,7 @@ \ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ @@ -145,7 +145,7 @@ \ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ From 
9586bc59798ecab0a64c0e53d197859025de1d72 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Fri, 16 Feb 2024 23:43:26 +0800 Subject: [PATCH 17/17] finish requantize but has bug (#13) * finish requantize but has bug * fix bug * delete comment * apply code-format changes --------- Co-authored-by: Xinyu302 --- src/layer/riscv/requantize_leakyrelu_pack4.h | 263 ++++ src/layer/riscv/requantize_leakyrelu_pack8.h | 160 +++ src/layer/riscv/requantize_relu_pack4.h | 259 ++++ src/layer/riscv/requantize_relu_pack8.h | 155 +++ src/layer/riscv/requantize_riscv.cpp | 1240 ++++++++++++++++++ src/layer/riscv/requantize_riscv.h | 34 + src/layer/riscv/riscv_usability.h | 64 + 7 files changed, 2175 insertions(+) create mode 100644 src/layer/riscv/requantize_leakyrelu_pack4.h create mode 100644 src/layer/riscv/requantize_leakyrelu_pack8.h create mode 100644 src/layer/riscv/requantize_relu_pack4.h create mode 100644 src/layer/riscv/requantize_relu_pack8.h create mode 100644 src/layer/riscv/requantize_riscv.cpp create mode 100644 src/layer/riscv/requantize_riscv.h diff --git a/src/layer/riscv/requantize_leakyrelu_pack4.h b/src/layer/riscv/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..fd5c4dfd93d2 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack4.h @@ -0,0 +1,263 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_leakyrelu_pack8.h b/src/layer/riscv/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..87991ad94394 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack8.h @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
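+
+// Reference scalar sketch of the fused requantize + leakyrelu computed by the
+// vector kernel below. Illustrative only (the kernel never calls it); it
+// assumes scale_out > 0, so the activation commutes with the folded scale,
+// and it reuses the saturating scalar float2int8() helper from riscv_usability.h.
+static inline signed char requantize_leakyrelu_ref(int v, float scale_in, float scale_out, float bias, float slope)
+{
+    // dequantize (v * scale_in + bias) and requantize (* scale_out) folded into one multiply-add
+    float f = v * (scale_in * scale_out) + bias * scale_out;
+    // leakyrelu applied to the rescaled value
+    if (f < 0.f)
+        f *= slope;
+    // round and saturate to [-127, 127]
+    return float2int8(f);
+}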
+ +static void requantize_leakyrelu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack4.h b/src/layer/riscv/requantize_relu_pack4.h new file mode 100644 index 000000000000..ca5285de57c5 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack4.h @@ -0,0 +1,259 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
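+
+// Layout note for the pack4 relu requantize below (descriptive sketch only):
+// when out_elempack == 8, two consecutive pack4 int32 input channels (q * 2
+// and q * 2 + 1) are fused into one pack8 int8 output channel, so each
+// float2int8relu() call packs four lanes from each input into a single
+// 8-byte store; the out_elempack == 1 path mirrors the leakyrelu pack4
+// variant and scatters one byte to each of the four scalar output channels.
+// Folding the activation into the scales assumes scale_out > 0 (quantization
+// scales are positive), so relu commutes with the final multiply:
+//   int8(relu(v * scale_in + bias) * scale_out) == int8_relu(v * (scale_in * scale_out) + bias * scale_out)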
+ +static void requantize_relu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 
= bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = 
float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack8.h b/src/layer/riscv/requantize_relu_pack8.h new file mode 100644 index 000000000000..e3f18dbb98a3 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack8.h @@ -0,0 +1,155 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
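+
+// Rough scalar picture of one output lane of float2int8relu() as used by the
+// kernel below (an illustrative sketch, not called anywhere; the vector helper
+// rounds with the active FP rounding mode and saturates through vnclip, so
+// roundf() may differ marginally on ties):
+static inline signed char requantize_relu_lane_ref(float x)
+{
+    int i = (int)roundf(x);              // round to nearest
+    if (i > 127) i = 127;                // saturate like the i32 -> i16 -> i8 vnclip chain
+    return (signed char)(i < 0 ? 0 : i); // the final vmax with 0 implements the ReLU clamp
+}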
+ +static void requantize_relu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp new file mode 100644 index 000000000000..81e0e9aaa8da --- /dev/null +++ b/src/layer/riscv/requantize_riscv.cpp @@ -0,0 +1,1240 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
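+
+// Summary of the dispatch in this file (comment added for readability):
+// - elempack == 8 blobs run on LMUL=2 kernels (vfloat32m2_t, 8 lanes per step).
+// - elempack == 4 blobs run on LMUL=1 kernels (vfloat32m1_t) and may repack the
+//   output to elempack 8 or 1 depending on opt.use_packing_layout and whether
+//   the packed dimension is a multiple of 8.
+// - For dims == 3, activation_type 1 (ReLU) and activation_type 2 with a positive
+//   slope (LeakyReLU) take the fused requantize_*_pack4/pack8 kernels included
+//   below; other activations go through activation_ps() on the dequantized value.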
+ +#include "requantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +#if __riscv_vector +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __riscv_vector + +Requantize_riscv::Requantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + +#if __riscv_vector + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? 
vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + vl = 4; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
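+                        // float2int8() rounds, saturates each lane to [-127, 127] and returns
+                        // the four int8 results packed for a single 32-bit store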
+ *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + // float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + // float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + // float32x4_t _bias = vdupq_n_f32(bias_data[0]); + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
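+                        // note: the activation is evaluated on the dequantized value (after
+                        // scale_in and bias) and the output scale is applied afterwards,
+                        // matching int8(act(v * scale_in + bias) * scale_out)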
+ *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + // float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); + // float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * 
scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? 
+                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];
+
+                for (int i = 0; i < size; i++)
+                {
+                    float v = intptr[i] * scale_in + bias;
+                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/riscv/requantize_riscv.h b/src/layer/riscv/requantize_riscv.h
new file mode 100644
index 000000000000..df12b54ce3a4
--- /dev/null
+++ b/src/layer/riscv/requantize_riscv.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 Xinyu302. All rights reserved.
+// Copyright (C) 2019 BUG1989. All rights reserved.
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_REQUANTIZE_RISCV_H
+#define LAYER_REQUANTIZE_RISCV_H
+
+#include "requantize.h"
+
+namespace ncnn {
+
+class Requantize_riscv : public Requantize
+{
+public:
+    Requantize_riscv();
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_REQUANTIZE_RISCV_H
diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h
index 54d98834fb7c..1802a28336dd 100644
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -110,6 +110,70 @@ static inline int32_t float2int8(vfloat32m1_t _v)
     return _ret;
 }
 
+static inline int64_t float2int8(vfloat32m2_t _v)
+{
+    int vl = vsetvlmax_e32m2();
+    vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl);
+    vint32m4_t _v32 = vundefined_i32m4();
+    _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2);
+    vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl);
+    vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl);
+    _v8 = vmax_vx_i8m1(_v8, -127, vl);
+    int64_t _ret;
+    vse8_v_i8m1((int8_t*)&_ret, _v8, vl);
+    return _ret;
+}
+
+static inline int64_t float2int8relu(vfloat32m2_t _v)
+{
+    int vl = vsetvlmax_e32m2();
+    vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl);
+    vint32m4_t _v32 = vundefined_i32m4();
+    _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2);
+    vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl);
+    vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl);
+    _v8 = vmax_vx_i8m1(_v8, 0, vl);
+    int64_t _ret;
+    vse8_v_i8m1((int8_t*)&_ret, _v8, vl);
+    return _ret;
+}
+
+static inline int64_t float2int8leakyrelu(vfloat32m2_t _v, vfloat32m2_t _slope)
+{
+    int vl = vsetvlmax_e32m2();
+
+    vfloat32m2_t _v_leaky = vfmul_vv_f32m2(_v, _slope, vl);
+    vint32m2_t _v_int32 = vfcvt_x_f_v_i32m2(_v, vl);
+
+    // vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl);
+    // vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl);
+
+    // vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl);
+    // vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl);
+
+    // vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl);
+    // vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl);
+    vint32m2_t _v_int32_leaky = vfcvt_x_f_v_i32m2(_v_leaky, vl);
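+    // both v and slope * v are converted to int32 here; after narrowing to int8 below,
+    // vmax_vv keeps v for positive inputs and slope * v for negative ones,
+    // which is exactly leaky relu for slopes in [0, 1]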
+
+    vl = 2 * vsetvlmax_e32m1();
+
+    vint32m4_t _v32 = vundefined_i32m4();
+    vint32m4_t _v32_leaky = vundefined_i32m4();
+    _v32 = vset_v_i32m2_i32m4(_v32, 0, _v_int32);
+    _v32_leaky = vset_v_i32m2_i32m4(_v32_leaky, 0, _v_int32_leaky);
+
+    vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl);
+    vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl);
+
+    vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl);
+    vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl);
+
+    _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl);
+    int64_t _ret;
+    vse8_v_i8m1((int8_t*)&_ret, _v8, vl);
+    return _ret;
+}
+
 static void print_vint8m1(vint8m1_t _v, int vl = 16)
 {
     int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t));
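+    // note: fixed 16-byte scratch buffer, so this debug helper assumes vl <= 16 (VLEN <= 128)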