From 48f3192dc0937e7f1a8010a3355f595320f2fe51 Mon Sep 17 00:00:00 2001 From: yxy Date: Mon, 5 Feb 2024 17:41:37 +0000 Subject: [PATCH 01/17] add float2int8 --- src/layer/riscv/riscv_usability.h | 86 +++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e2824646f871..f63984d21fc9 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -19,6 +19,14 @@ #include #endif // __riscv_vector +static inline signed char float2int8(float v) +{ + int int32 = (int)roundf(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} + #if __riscv_vector static inline int csrr_vl() { @@ -50,6 +58,84 @@ static inline int csrr_vlenb() return a; } +static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); + vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); + + vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); + vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + + // combine _vlow32 and _vhigh32 to a single vint32m2_t + vint32m2_t _v32 = vundefined_i32m2(); + _v32 = vset_v_i32m1_i32m2 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m2 (_v32, 1, _vhigh32); + + vint16m1_t _v16 = vnclip_wx_i16m1(_v32, 0, vl); + vint16m2_t _v16_2 = vundefined_i16m2(); + _v16_2 = vset_v_i16m1_i16m2 (_v16_2, 0, _v16); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int64_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); + vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); + vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); + vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + + vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); + vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); 
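+    // each _p5sN is 0.5 carrying the sign of _vN; adding it here turns the assumed
+    // round-to-zero vfcvt below into round-half-away-from-zero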
+ vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); + vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + return _v8; +} + + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; From b83e64ee04562b2fac34797556e8169f05236b8f Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 02:44:16 +0000 Subject: [PATCH 02/17] pass compile --- src/layer/riscv/riscv_usability.h | 117 ++++++++++++++++++++++++++++-- 1 file changed, 110 insertions(+), 7 deletions(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index f63984d21fc9..803c78756a6b 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -58,6 +58,7 @@ static inline int csrr_vlenb() return a; } + static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); @@ -79,20 +80,46 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t - vint32m2_t _v32 = vundefined_i32m2(); - _v32 = vset_v_i32m1_i32m2 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m2 (_v32, 1, _vhigh32); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); - vint16m1_t _v16 = vnclip_wx_i16m1(_v32, 0, vl); - vint16m2_t _v16_2 = vundefined_i16m2(); - _v16_2 = vset_v_i16m1_i16m2 (_v16_2, 0, _v16); - vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); int64_t _ret; vse8_v_i8m1((int8_t *)&_ret, _v8, vl); return _ret; } +static inline int32_t float2int8(vfloat32m1_t _v) { + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); + + vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int32_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + + static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); @@ -135,6 +162,82 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m return _v8; } +static inline int64_t 
float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); + vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); + + vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); + vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + + // combine _vlow32 and _vhigh32 to a single vint32m2_t + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + + vint16m2_t _v16_2 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); + vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); + vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); + vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + + vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); + vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); + vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); + vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + return _v8; +} + + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { From 648d56645ac621880ed39754a608e129c9e2fde2 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Tue, 6 Feb 2024 05:49:26 +0000 
Subject: [PATCH 03/17] apply code-format changes --- src/layer/riscv/riscv_usability.h | 44 ++++++++++++++----------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 803c78756a6b..153299f0a549 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -58,7 +58,6 @@ static inline int csrr_vlenb() return a; } - static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); @@ -78,21 +77,22 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); - + // combine _vlow32 and _vhigh32 to a single vint32m2_t vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); int64_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } -static inline int32_t float2int8(vfloat32m1_t _v) { +static inline int32_t float2int8(vfloat32m1_t _v) +{ int vl = vsetvlmax_e32m1(); // _MM_ROUND_NEAREST round to even // simulate round to nearest via +/-0.5 with round to zero @@ -107,19 +107,17 @@ static inline int32_t float2int8(vfloat32m1_t _v) { vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); - vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); int32_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } - static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); @@ -151,10 +149,10 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); @@ -181,17 +179,17 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); - + // combine _vlow32 and _vhigh32 to a single vint32m2_t vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); vint16m2_t _v16_2 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); _v8 = vmax_vx_i8m1(_v8, 0, vl); int64_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } @@ 
-226,10 +224,10 @@ static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); @@ -237,8 +235,6 @@ static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa return _v8; } - - static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; From 3efeab172817e3d6eb3c5915fc549accf11b248a Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 08:34:36 +0000 Subject: [PATCH 04/17] add float2int8leakyrelu --- src/layer/riscv/riscv_usability.h | 176 +++++++++++++++++++++++++++++- 1 file changed, 175 insertions(+), 1 deletion(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 803c78756a6b..49b620325fb4 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -80,6 +80,7 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t + vl = 2 * vsetvlmax_e32m1(); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); @@ -107,7 +108,6 @@ static inline int32_t float2int8(vfloat32m1_t _v) { vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); - vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); @@ -183,6 +183,7 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t + vl = 2 * vsetvlmax_e32m1(); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); @@ -195,6 +196,32 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) return _ret; } +static inline int32_t float2int8relu(vfloat32m1_t _v) { + int vl = vsetvlmax_e32m1(); + // _MM_ROUND_NEAREST round to even + // simulate round to nearest via +/-0.5 with round to zero + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); + + vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int32_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); @@ -237,6 +264,153 @@ static inline vint8m1_t 
float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa return _v8; } +static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _vlow_leaky = vsmul_vv_f32m1(_vlow, _slope, vl); + vfloat32m1_t _vhigh_leaky = vsmul_vv_f32m1(_vhigh, _slope, vl); + + // simulate round to nearest via +/-0.5 with round to zero + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); + int32_t _signmask = 1 << 31; + + vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); + vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); + + vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow_leaky, _p5low, vl); + vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh_leaky, _p5high, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + + vint32m1_t _signlow_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow_leaky), _signmask, vl); + vint32m1_t _signhigh_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh_leaky), _signmask, vl); + + vfloat32m1_t _p5low_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5high_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _vlow5_leaky = vfadd_vv_f32m1(_vlow_leaky, _p5low_leaky, vl); + vfloat32m1_t _vhigh5_leaky = vfadd_vv_f32m1(_vhigh_leaky, _p5high_leaky, vl); + + vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow5_leaky, vl); + vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh5_leaky, vl); + + vl = 2 * vsetvlmax_e32m1(); + + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _vlow32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _vhigh32_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int64_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) { + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v_leaky = vsmul_vv_f32m1(_v, _slope, vl); + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); + + vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + + vint32m1_t _sign_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v_leaky), _signmask, vl); + vfloat32m1_t _p5s_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v5_leaky = vfadd_vv_f32m1(_v_leaky, _p5s_leaky, vl); + vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v5_leaky, vl); + + // vl = 
vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v32m1_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int32_t _ret; + vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v0_leaky = vsmul_vv_f32m1(_v0, _slope, vl); + vfloat32m1_t _v1_leaky = vsmul_vv_f32m1(_v1, _slope, vl); + vfloat32m1_t _v2_leaky = vsmul_vv_f32m1(_v2, _slope, vl); + vfloat32m1_t _v3_leaky = vsmul_vv_f32m1(_v3, _slope, vl); + + vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); + int32_t _signmask = 1 << 31; + vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); + vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); + vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); + vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + + vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); + vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); + + vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0_leaky, _p5s0, vl); + vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1_leaky, _p5s1, vl); + vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2_leaky, _p5s2, vl); + vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3_leaky, _p5s3, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + + vl = vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32); + + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + return _v8; +} static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) From c9c545113a6b0fa0ac8f94dd3d5cb30a0c85f17f Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 09:23:34 +0000 Subject: [PATCH 05/17] add quantize_riscv but has bug --- src/layer/riscv/quantize_riscv.cpp | 566 +++++++++++++++++++++++++++++ src/layer/riscv/quantize_riscv.h | 32 ++ src/layer/riscv/riscv_usability.h | 14 +- 3 files changed, 605 insertions(+), 7 
deletions(-) create mode 100644 src/layer/riscv/quantize_riscv.cpp create mode 100644 src/layer/riscv/quantize_riscv.h diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp new file mode 100644 index 000000000000..df429d9e18e9 --- /dev/null +++ b/src/layer/riscv/quantize_riscv.cpp @@ -0,0 +1,566 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +#include "cpu.h" + +namespace ncnn { + +Quantize_riscv::Quantize_riscv() +{ + fprintf(stderr, "Quantize_riscv init\n"); +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl = vsetvlmax_e32m1(); + int elembits = bottom_blob.elembits(); + + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale_data[0]); + // float32x4_t _scale = vdupq_n_f32(scale_data[0]); + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + // float32x4_t _vlow = vld1q_f32(ptr0); + // float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale); + // _vhigh = vmulq_f32(_vhigh, _scale); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); + // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + // float32x4_t _vlow = vld1q_f32(ptr0); + // float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale0); + // _vhigh = vmulq_f32(_vhigh, _scale1); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float 
s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + // float32x4_t _scale = vdupq_n_f32(scale_data[0]); + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr0 + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr1, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr1 + 4, vl); + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v2, _v1, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + // float32x4_t _v0 = vld1q_f32(ptr0); + // float32x4_t _v1 = vld1q_f32(ptr0 + 4); + // float32x4_t _v2 = vld1q_f32(ptr1); + // float32x4_t _v3 = vld1q_f32(ptr1 + 4); + // _v0 = vmulq_f32(_v0, _scale); + // _v1 = vmulq_f32(_v1, _scale); + // _v2 = vmulq_f32(_v2, _scale); + // _v3 = vmulq_f32(_v3, _scale); + // vst1_s8(outptr, float2int8(_v0, _v2)); + // vst1_s8(outptr + 8, float2int8(_v1, _v3)); + + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + // float32x4_t _vlow = vld1q_f32(ptr0); + // float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale); + // _vhigh = vmulq_f32(_vhigh, _scale); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8); + // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + // float32x4_t _vlow = vld1q_f32(ptr0); + // 
float32x4_t _vhigh = vld1q_f32(ptr1); + // _vlow = vmulq_f32(_vlow, _scale0); + // _vhigh = vmulq_f32(_vhigh, _scale1); + // int8x8_t _v = float2int8(_vlow, _vhigh); + // vst1_s8(outptr, _v); + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale); + // float32x4_t _scale = vdupq_n_f32(scale); + float _scale = scale; + + for (; i + 15 < size; i += 16) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr + 8, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr + 12, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v1, _v2, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + + // float32x4_t _v0 = vld1q_f32(ptr); + // float32x4_t _v1 = vld1q_f32(ptr + 4); + // float32x4_t _v2 = vld1q_f32(ptr + 8); + // float32x4_t _v3 = vld1q_f32(ptr + 12); + // _v0 = vmulq_f32(_v0, _scale); + // _v1 = vmulq_f32(_v1, _scale); + // _v2 = vmulq_f32(_v2, _scale); + // _v3 = vmulq_f32(_v3, _scale); + // vst1_s8(outptr, float2int8(_v0, _v1)); + // vst1_s8(outptr + 8, float2int8(_v2, _v3)); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + + int64_t _v = float2int8(_v0, _v1); + *(int64_t*)outptr = _v; + // float32x4_t _v0 = vld1q_f32(ptr); + // float32x4_t _v1 = vld1q_f32(ptr + 4); + // _v0 = vmulq_f32(_v0, _scale); + // _v1 = vmulq_f32(_v1, _scale); + // int8x8_t _v = float2int8(_v0, _v1); + // vst1_s8(outptr, _v); + + ptr += 8; + outptr += 8; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h new file mode 100644 index 000000000000..128aca97e640 --- /dev/null +++ b/src/layer/riscv/quantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
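+// Quantize_riscv: RISC-V vector (RVV) implementation of the int8 Quantize layer,
+// built on the float2int8 helpers from riscv_usability.h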
+ +#ifndef LAYER_QUANTIZE_RISCV_H +#define LAYER_QUANTIZE_RISCV_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_riscv : public Quantize +{ +public: + Quantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_RISCV_H diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 8f1fe5aaad88..efe639b2e3fd 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -266,8 +266,8 @@ static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloa static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh, vfloat32m1_t _slope) { int vl = vsetvlmax_e32m1(); - vfloat32m1_t _vlow_leaky = vsmul_vv_f32m1(_vlow, _slope, vl); - vfloat32m1_t _vhigh_leaky = vsmul_vv_f32m1(_vhigh, _slope, vl); + vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); + vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); // simulate round to nearest via +/-0.5 with round to zero vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); @@ -322,7 +322,7 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) { int vl = vsetvlmax_e32m1(); - vfloat32m1_t _v_leaky = vsmul_vv_f32m1(_v, _slope, vl); + vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); int32_t _signmask = 1 << 31; @@ -361,10 +361,10 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3, vfloat32m1_t _slope) { int vl = vsetvlmax_e32m1(); - vfloat32m1_t _v0_leaky = vsmul_vv_f32m1(_v0, _slope, vl); - vfloat32m1_t _v1_leaky = vsmul_vv_f32m1(_v1, _slope, vl); - vfloat32m1_t _v2_leaky = vsmul_vv_f32m1(_v2, _slope, vl); - vfloat32m1_t _v3_leaky = vsmul_vv_f32m1(_v3, _slope, vl); + vfloat32m1_t _v0_leaky = vfmul_vv_f32m1(_v0, _slope, vl); + vfloat32m1_t _v1_leaky = vfmul_vv_f32m1(_v1, _slope, vl); + vfloat32m1_t _v2_leaky = vfmul_vv_f32m1(_v2, _slope, vl); + vfloat32m1_t _v3_leaky = vfmul_vv_f32m1(_v3, _slope, vl); vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); int32_t _signmask = 1 << 31; From 88e0910944c54acbb909ce244e4265f7a6844ee4 Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 16:15:00 +0000 Subject: [PATCH 06/17] fix bug --- src/layer/riscv/quantize_riscv.cpp | 67 --------------------- src/layer/riscv/riscv_usability.h | 93 ++++++++++++++---------------- 2 files changed, 42 insertions(+), 118 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index df429d9e18e9..9ea8030dfde0 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -27,7 +27,6 @@ namespace ncnn { Quantize_riscv::Quantize_riscv() { - fprintf(stderr, "Quantize_riscv init\n"); #if __riscv_vector support_packing = true; #endif // __riscv_vector @@ -120,12 +119,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); int64_t _v = float2int8(_vlow, _vhigh); - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale); - // _vhigh = vmulq_f32(_vhigh, _scale); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); *(int64_t*)outptr = _v; ptr0 += 4; @@ 
-146,9 +139,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); - // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + i * 8); - // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + i * 8 + 4); - for (int j = 0; j < w; j++) { vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); @@ -157,13 +147,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale0); - // _vhigh = vmulq_f32(_vhigh, _scale1); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); - ptr0 += 4; ptr1 += 4; outptr += 8; @@ -252,7 +235,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& { if (scale_data_size == 1) { - // float32x4_t _scale = vdupq_n_f32(scale_data[0]); float _scale = scale_data[0]; #pragma omp parallel for num_threads(opt.num_threads) @@ -276,17 +258,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vint8m1_t _v = float2int8(_v0, _v2, _v1, _v3); vse8_v_i8m1(outptr, _v, 4 * vl); - // float32x4_t _v0 = vld1q_f32(ptr0); - // float32x4_t _v1 = vld1q_f32(ptr0 + 4); - // float32x4_t _v2 = vld1q_f32(ptr1); - // float32x4_t _v3 = vld1q_f32(ptr1 + 4); - // _v0 = vmulq_f32(_v0, _scale); - // _v1 = vmulq_f32(_v1, _scale); - // _v2 = vmulq_f32(_v2, _scale); - // _v3 = vmulq_f32(_v3, _scale); - // vst1_s8(outptr, float2int8(_v0, _v2)); - // vst1_s8(outptr + 8, float2int8(_v1, _v3)); - ptr0 += 8; ptr1 += 8; outptr += 16; @@ -301,13 +272,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; - - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale); - // _vhigh = vmulq_f32(_vhigh, _scale); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); ptr0 += 4; ptr1 += 4; outptr += 8; @@ -326,9 +290,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); - // float32x4_t _scale0 = vld1q_f32((const float*)scale_data + q * 8); - // float32x4_t _scale1 = vld1q_f32((const float*)scale_data + q * 8 + 4); - int i = 0; for (; i < size; i++) { @@ -340,14 +301,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; - - // float32x4_t _vlow = vld1q_f32(ptr0); - // float32x4_t _vhigh = vld1q_f32(ptr1); - // _vlow = vmulq_f32(_vlow, _scale0); - // _vhigh = vmulq_f32(_vhigh, _scale1); - // int8x8_t _v = float2int8(_vlow, _vhigh); - // vst1_s8(outptr, _v); - ptr0 += 4; ptr1 += 4; outptr += 8; @@ -499,8 +452,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int i = 0; #if __riscv_vector - // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale); - // float32x4_t _scale = vdupq_n_f32(scale); float _scale = scale; for (; i + 15 < size; i += 16) @@ -518,17 +469,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, 
const Option& vint8m1_t _v = float2int8(_v0, _v1, _v2, _v3); vse8_v_i8m1(outptr, _v, 4 * vl); - // float32x4_t _v0 = vld1q_f32(ptr); - // float32x4_t _v1 = vld1q_f32(ptr + 4); - // float32x4_t _v2 = vld1q_f32(ptr + 8); - // float32x4_t _v3 = vld1q_f32(ptr + 12); - // _v0 = vmulq_f32(_v0, _scale); - // _v1 = vmulq_f32(_v1, _scale); - // _v2 = vmulq_f32(_v2, _scale); - // _v3 = vmulq_f32(_v3, _scale); - // vst1_s8(outptr, float2int8(_v0, _v1)); - // vst1_s8(outptr + 8, float2int8(_v2, _v3)); - ptr += 16; outptr += 16; } @@ -542,13 +482,6 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int64_t _v = float2int8(_v0, _v1); *(int64_t*)outptr = _v; - // float32x4_t _v0 = vld1q_f32(ptr); - // float32x4_t _v1 = vld1q_f32(ptr + 4); - // _v0 = vmulq_f32(_v0, _scale); - // _v1 = vmulq_f32(_v1, _scale); - // int8x8_t _v = float2int8(_v0, _v1); - // vst1_s8(outptr, _v); - ptr += 8; outptr += 8; } diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index efe639b2e3fd..9075ba9e667c 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -61,22 +61,8 @@ static inline int csrr_vlenb() static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); - vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); - - vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); - vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); - - vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); - vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); // combine _vlow32 and _vhigh32 to a single vint32m2_t vl = 2 * vsetvlmax_e32m1(); @@ -95,18 +81,7 @@ static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) static inline int32_t float2int8(vfloat32m1_t _v) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); - - vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); - vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); @@ -119,33 +94,47 @@ static inline int32_t float2int8(vfloat32m1_t _v) return _ret; } -static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +static void print_vint8m1(vint8m1_t _v) { - int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - 
vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); - vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); - vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); - vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); + int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v, 16); + for (int i = 0; i < 16; i++) + { + fprintf(stderr, "i8[%d]: %d\n", i, i8[i]); + } + free(i8); +} - vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); +static void print_vint32m1(vint32m1_t _v) +{ + int32_t *i32 = (int32_t *)malloc(4 * sizeof(int32_t)); + vse32_v_i32m1(i32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} - vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); - vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); - vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); - vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); +static void print_vfloat32m1(vfloat32m1_t _v) +{ + float *f32 = (float *)malloc(4 * sizeof(float)); + vse32_v_f32m1(f32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "f32[%d]: %f\n", i, f32[i]); + } + free(f32); +} - vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); - vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); - vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); - vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); +static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); vl = vsetvlmax_e32m4(); @@ -158,6 +147,8 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); + int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v8, 16); return _v8; } From c3d77e0a89fe3d557d905385407d56524e10d0d0 Mon Sep 17 00:00:00 2001 From: yxy Date: Tue, 6 Feb 2024 16:30:46 +0000 Subject: [PATCH 07/17] fix bug of float2int8relu --- src/layer/riscv/riscv_usability.h | 142 ++++++------------------------ 1 file changed, 25 insertions(+), 117 deletions(-) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 9075ba9e667c..1ae86f767553 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -155,24 +155,10 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = 
vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); - vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); - vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow, _p5low, vl); - vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh, _p5high, vl); - - vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); - vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); - // combine _vlow32 and _vhigh32 to a single vint32m2_t vl = 2 * vsetvlmax_e32m1(); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); @@ -188,18 +174,7 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) static inline int32_t float2int8relu(vfloat32m1_t _v) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); - - vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); - vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); vint32m4_t _v32 = vundefined_i32m4(); _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); @@ -215,30 +190,10 @@ static inline int32_t float2int8relu(vfloat32m1_t _v) { static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) { int vl = vsetvlmax_e32m1(); - // _MM_ROUND_NEAREST round to even - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); - vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); - vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); - vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); - - vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0, _p5s0, vl); - vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1, _p5s1, vl); - vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2, _p5s2, vl); - vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3, _p5s3, vl); - - vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); - vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); - vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); - vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = 
vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); vl = vsetvlmax_e32m4(); @@ -260,34 +215,11 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); - // simulate round to nearest via +/-0.5 with round to zero - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - // vint32m1_t _signmask = vmv_v_f(1 << 31, vl); - int32_t _signmask = 1 << 31; - - vint32m1_t _signlow = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow), _signmask, vl); - vint32m1_t _signhigh = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh), _signmask, vl); - - vfloat32m1_t _p5low = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5 = vfadd_vv_f32m1(_vlow_leaky, _p5low, vl); - vfloat32m1_t _vhigh5 = vfadd_vv_f32m1(_vhigh_leaky, _p5high, vl); - - vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow5, vl); - vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh5, vl); - - vint32m1_t _signlow_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vlow_leaky), _signmask, vl); - vint32m1_t _signhigh_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_vhigh_leaky), _signmask, vl); - - vfloat32m1_t _p5low_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signlow_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5high_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_signhigh_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _vlow5_leaky = vfadd_vv_f32m1(_vlow_leaky, _p5low_leaky, vl); - vfloat32m1_t _vhigh5_leaky = vfadd_vv_f32m1(_vhigh_leaky, _p5high_leaky, vl); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); - vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow5_leaky, vl); - vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh5_leaky, vl); + vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl); + vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl); vl = 2 * vsetvlmax_e32m1(); @@ -315,20 +247,8 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) int vl = vsetvlmax_e32m1(); vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v), _signmask, vl); - - vfloat32m1_t _p5s = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5 = vfadd_vv_f32m1(_v, _p5s, vl); - vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v5, vl); - - vint32m1_t _sign_leaky = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v_leaky), _signmask, vl); - vfloat32m1_t _p5s_leaky = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign_leaky, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v5_leaky = vfadd_vv_f32m1(_v_leaky, _p5s_leaky, vl); - vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v5_leaky, vl); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v_leaky, vl); // vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); @@ -357,27 +277,15 @@ static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2_leaky = vfmul_vv_f32m1(_v2, _slope, vl); vfloat32m1_t _v3_leaky = vfmul_vv_f32m1(_v3, _slope, 
vl); - vfloat32m1_t _p5 = vfmv_v_f_f32m1(0.5f, vl); - int32_t _signmask = 1 << 31; - vint32m1_t _sign0 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v0), _signmask, vl); - vint32m1_t _sign1 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v1), _signmask, vl); - vint32m1_t _sign2 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v2), _signmask, vl); - vint32m1_t _sign3 = vand_vx_i32m1(vreinterpret_v_f32m1_i32m1(_v3), _signmask, vl); - - vfloat32m1_t _p5s0 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign0, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s1 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign1, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s2 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign2, vreinterpret_v_f32m1_i32m1(_p5), vl)); - vfloat32m1_t _p5s3 = vreinterpret_v_i32m1_f32m1(vxor_vv_i32m1(_sign3, vreinterpret_v_f32m1_i32m1(_p5), vl)); - - vfloat32m1_t _v05 = vfadd_vv_f32m1(_v0_leaky, _p5s0, vl); - vfloat32m1_t _v15 = vfadd_vv_f32m1(_v1_leaky, _p5s1, vl); - vfloat32m1_t _v25 = vfadd_vv_f32m1(_v2_leaky, _p5s2, vl); - vfloat32m1_t _v35 = vfadd_vv_f32m1(_v3_leaky, _p5s3, vl); + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); - vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v05, vl); - vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v15, vl); - vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v25, vl); - vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v35, vl); + vint32m1_t _v0_32_leaky = vfcvt_x_f_v_i32m1(_v0_leaky, vl); + vint32m1_t _v1_32_leaky = vfcvt_x_f_v_i32m1(_v1_leaky, vl); + vint32m1_t _v2_32_leaky = vfcvt_x_f_v_i32m1(_v2_leaky, vl); + vint32m1_t _v3_32_leaky = vfcvt_x_f_v_i32m1(_v3_leaky, vl); vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); @@ -390,10 +298,10 @@ static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); vint32m4_t _v32_leaky = vundefined_i32m4(); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32_leaky); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); From 05fdab0add27b5e42fedae7e8290073e2c395792 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Tue, 6 Feb 2024 16:32:08 +0000 Subject: [PATCH 08/17] apply code-format changes --- src/layer/riscv/quantize_riscv.cpp | 2 +- src/layer/riscv/riscv_usability.h | 57 +++++++++++++++--------------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 9ea8030dfde0..a459688227b9 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -117,7 +117,7 @@ int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); - + int64_t _v = float2int8(_vlow, _vhigh); *(int64_t*)outptr = _v; diff --git a/src/layer/riscv/riscv_usability.h 
b/src/layer/riscv/riscv_usability.h index 1ae86f767553..cbbfded60a91 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -94,9 +94,9 @@ static inline int32_t float2int8(vfloat32m1_t _v) return _ret; } -static void print_vint8m1(vint8m1_t _v) +static void print_vint8m1(vint8m1_t _v) { - int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); vse8_v_i8m1(i8, _v, 16); for (int i = 0; i < 16; i++) { @@ -107,7 +107,7 @@ static void print_vint8m1(vint8m1_t _v) static void print_vint32m1(vint32m1_t _v) { - int32_t *i32 = (int32_t *)malloc(4 * sizeof(int32_t)); + int32_t* i32 = (int32_t*)malloc(4 * sizeof(int32_t)); vse32_v_i32m1(i32, _v, 4); for (int i = 0; i < 4; i++) { @@ -118,7 +118,7 @@ static void print_vint32m1(vint32m1_t _v) static void print_vfloat32m1(vfloat32m1_t _v) { - float *f32 = (float *)malloc(4 * sizeof(float)); + float* f32 = (float*)malloc(4 * sizeof(float)); vse32_v_f32m1(f32, _v, 4); for (int i = 0; i < 4; i++) { @@ -147,7 +147,7 @@ static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, -127, vl); - int8_t *i8 = (int8_t *)malloc(16 * sizeof(int8_t)); + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); vse8_v_i8m1(i8, _v8, 16); return _v8; } @@ -172,18 +172,19 @@ static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) return _ret; } -static inline int32_t float2int8relu(vfloat32m1_t _v) { +static inline int32_t float2int8relu(vfloat32m1_t _v) +{ int vl = vsetvlmax_e32m1(); vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v32m1); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); _v8 = vmax_vx_i8m1(_v8, 0, vl); int32_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } @@ -225,11 +226,11 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig vint32m4_t _v32 = vundefined_i32m4(); vint32m4_t _v32_leaky = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _vlow32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _vhigh32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _vlow32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _vhigh32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _vlow32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _vhigh32_leaky); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); @@ -239,11 +240,12 @@ static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhig _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); int64_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } -static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) { +static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) +{ int vl = vsetvlmax_e32m1(); vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); @@ -253,9 +255,9 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) // vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); vint32m4_t _v32_leaky = vundefined_i32m4(); - - _v32 = 
vset_v_i32m1_i32m4 (_v32, 0, _v32m1); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v32m1_leaky); + + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v32m1_leaky); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); @@ -265,7 +267,7 @@ static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); int32_t _ret; - vse8_v_i8m1((int8_t *)&_ret, _v8, vl); + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); return _ret; } @@ -289,27 +291,26 @@ static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vl = vsetvlmax_e32m4(); vint32m4_t _v32 = vundefined_i32m4(); - _v32 = vset_v_i32m1_i32m4 (_v32, 0, _v0_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 1, _v1_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 2, _v2_32); - _v32 = vset_v_i32m1_i32m4 (_v32, 3, _v3_32); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); vint32m4_t _v32_leaky = vundefined_i32m4(); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 0, _v0_32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 1, _v1_32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 2, _v2_32_leaky); - _v32_leaky = vset_v_i32m1_i32m4 (_v32_leaky, 3, _v3_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v0_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _v1_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 2, _v2_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 3, _v3_32_leaky); vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); return _v8; -} - +} static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { From 77ebe76285838dec5fca0e9196d8f13ae545e52b Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sat, 10 Feb 2024 13:51:34 +0800 Subject: [PATCH 09/17] Add quantize fp16 (#6) * finish quantize fp16 to int8 * fix quantize bug * apply code-format changes --------- Co-authored-by: Xinyu302 --- src/layer/riscv/quantize_riscv.cpp | 863 ++++++++++++++++++++++++++++- src/layer/riscv/quantize_riscv.h | 5 + src/layer/riscv/riscv_usability.h | 16 + tests/testutil.cpp | 4 +- 4 files changed, 885 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index a459688227b9..91004baebe0a 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -29,14 +29,27 @@ Quantize_riscv::Quantize_riscv() { #if __riscv_vector support_packing = true; + +#if __riscv_zfh + support_fp16_storage = true; +#endif // __riscv_zfh #endif // __riscv_vector } int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { - int vl = vsetvlmax_e32m1(); int elembits = bottom_blob.elembits(); +#if __riscv_vector && __riscv_zfh + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_fp16sa(bottom_blob, top_blob, opt); + else + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif // __riscv_vector && __riscv_zfh + int vl = vsetvlmax_e32m1(); int dims = bottom_blob.dims; int elempack = bottom_blob.elempack; @@ -496,4 +509,852 @@ int Quantize_riscv::forward(const Mat& 
bottom_blob, Mat& top_blob, const Option& return 0; } +#if __riscv_vector && __riscv_zfh + +int Quantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale); + outptr[1] = float2int8((float)ptr0[1] * scale); + outptr[2] = float2int8((float)ptr0[2] * scale); + outptr[3] = float2int8((float)ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8((float)ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8((float)ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8((float)ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + vl = 4; + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { 
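+                // note: this branch unpacks elempack-4 fp16 rows into four plain int8 rows,
+                // i.e. packed element k of input row i is written to output row i * 4 + k,
+                // each value quantized as float2int8((float)value * its scale)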
+ if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + 
*(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8((float)*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8((float)*ptr++ * scale); + } + } + } + + return 0; +} + +int Quantize_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); + + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); // float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4))); + + for (int j = 0; j < w; j++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = 
bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + q * 8, vl), vl); + + for (int i = 0; i < size; i++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * elempack; + + top_blob.create(outw, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * (__fp16)scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * (__fp16)scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * (__fp16)scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * (__fp16)scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * elempack; + + top_blob.create(w, outh, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const __fp16 s0 = scale_data[i * 4]; + const __fp16 s1 = scale_data[i * 4 + 1]; + const __fp16 s2 = scale_data[i * 4 + 2]; + const __fp16 s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * elempack; + + top_blob.create(w, h, outc, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if 
(scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const __fp16 s0 = scale_data[q * 4]; + const __fp16 s1 = scale_data[q * 4 + 1]; + const __fp16 s2 = scale_data[q * 4 + 2]; + const __fp16 s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * (__fp16)scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const __fp16 scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +#endif // __riscv_vector && __riscv_zfh + } // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h index 128aca97e640..0eb90aed7e5b 100644 --- a/src/layer/riscv/quantize_riscv.h +++ b/src/layer/riscv/quantize_riscv.h @@ -25,6 +25,11 @@ class Quantize_riscv : public Quantize Quantize_riscv(); virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +#if __riscv_vector && __riscv_zfh + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif // __riscv_vector && __riscv_zfh }; } // namespace ncnn diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index cbbfded60a91..ad5fadea5701 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -58,6 +58,22 @@ static inline int csrr_vlenb() return a; } +#if __riscv_zfh +static inline int64_t float2int8(vfloat16m1_t _v) +{ + int vl = vsetvlmax_e16m1(); + vint16m2_t _v16 = vundefined_i16m2(); + _v16 = vset_v_i16m1_i16m2(_v16, 0, vfcvt_x_f_v_i16m1(_v, vl)); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; + // int16x8_t _v16 = vcvtaq_s16_f16(_v); + // int8x8_t _v8 = vqmovn_s16(_v16); + // return vmax_s8(_v8, vdup_n_s8(-127)); +} +#endif // __riscv_zfh + static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) { int vl = vsetvlmax_e32m1(); diff --git a/tests/testutil.cpp b/tests/testutil.cpp index f0bf3c51a20d..2f01b32c8a96 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -523,7 +523,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector Date: Sat, 10 Feb 2024 17:09:11 +0800 Subject: [PATCH 10/17] Add dequantize (#3) * add-deq * apply code-format changes * ongoing dequantize * apply code-format changes * finish dequantize_riscv, but has bug * fix vset bug * fix bug * delete debug info * apply code-format changes * refine dequantize --------- Co-authored-by: Xinyu302 --- src/layer/riscv/dequantize_riscv.cpp | 891 +++++++++++++++++++++++++++ src/layer/riscv/dequantize_riscv.h | 32 + 2 files changed, 923 insertions(+) create mode 100644 src/layer/riscv/dequantize_riscv.cpp create mode 100644 src/layer/riscv/dequantize_riscv.h diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp new file mode 100644 index 000000000000..a0ccb758120b --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -0,0 +1,891 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
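A note on the vector code in the new dequantize kernels that follow: they all share one strip-mining idiom. Pick a vector length with vsetvl, convert a chunk of int32 to float, multiply by the scale (or fuse scale and bias with vfmadd), store, then advance by vl until the buffer is exhausted. Below is a minimal sketch of that idiom, not part of the patch, assuming <riscv_vector.h> and the pre-1.0 non-overloaded intrinsic names used throughout this series; the helper name dequantize_flat and the single-scale, no-bias simplification are illustrative only.

#include <riscv_vector.h>

// Illustrative sketch (not from the patch): dequantize n int32 values to float
// with a single scale, processing vl elements per iteration.
static void dequantize_flat(const int* intptr, float* ptr, int n, float scale)
{
    int offset = 0;
    while (n > 0)
    {
        int vl = vsetvl_e32m8(n);                    // elements handled this pass
        vint32m8_t _i = vle32_v_i32m8(intptr + offset, vl);
        vfloat32m8_t _v = vfcvt_f_x_v_f32m8(_i, vl); // int32 -> float32
        _v = vfmul_vf_f32m8(_v, scale, vl);          // apply dequantize scale
        vse32_v_f32m8(ptr + offset, _v, vl);
        offset += vl;
        n -= vl;
    }
}

The per-channel and bias-carrying branches in the patch differ only in loading _scale and _bias as vectors and replacing the vfmul with vfmadd_vv_f32m8.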
+ +#include "dequantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" +#include "cpu.h" + +namespace ncnn { + +Dequantize_riscv::Dequantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vf_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const 
float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + vl = 4; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? 
vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m1(_v2, _scale0, vl); + _v3 = vfmul_vv_f32m1(_v3, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + vl = 4; + + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + _v2 = vfmadd_vv_f32m1(_v2, _scale0, _bias0, vl); + _v3 = vfmadd_vv_f32m1(_v3, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + if (bias_data_size == 0) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + 
int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + int n = w * 4; + int offset = 0; + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + int n = size * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = size * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale; \ +// } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale + bias; \ +// } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/dequantize_riscv.h b/src/layer/riscv/dequantize_riscv.h new file mode 100644 index 000000000000..be7bc6bc90c5 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_RISCV_H +#define LAYER_DEQUANTIZE_RISCV_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_riscv : public Dequantize +{ +public: + Dequantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_RISCV_H From f0779b8ff4662459b2cc4daadbcbafbccf34fb6d Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sat, 10 Feb 2024 17:09:48 +0800 Subject: [PATCH 11/17] Add innerproduct (#5) * add int innerproduct * apply code-format changes * finish innerproduct, but not tested yet * apply code-format changes * bug because no fp16 quantize * change flatten to make it right * pass test * delete useless code * apply code-format changes * delete useless code --------- Co-authored-by: Xinyu302 --- src/layer/riscv/flatten_riscv.cpp | 57 ++- src/layer/riscv/innerproduct_riscv.cpp | 532 ++++++++++++++++++++++++- src/layer/riscv/innerproduct_riscv.h | 9 + src/layer/riscv/riscv_usability.h | 28 +- 4 files changed, 595 insertions(+), 31 deletions(-) diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 491c051c7fea..e10bcb86bd3c 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -348,7 +348,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; #endif int w = bottom_blob.w; @@ -365,7 +365,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (opt.use_packing_layout) { - out_elempack = total % packn == 0 ? packn : 1; + out_elempack = total % 8 == 0 ? 
8 : 1; } #endif size_t out_elemsize = elemsize / elempack * out_elempack; @@ -394,7 +394,29 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 2) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + // if (elempack == packn) // out_elempack == packn + // { + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int i = 0; i < h; i++) + // { + // const signed char* ptr = bottom_blob.row(i); + // signed char* outptr = (signed char*)top_blob + w * i * packn; + + // int n = w * elempack; + // while (n > 0) + // { + // size_t vl = vsetvl_e8m1(n); + + // vint8m1_t _p = vle8_v_i8m1(ptr, vl); + // vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); + + // ptr += vl; + // outptr += 1; + // n -= vl; + // } + // } + // } + if (elempack == 8) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) @@ -405,7 +427,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -422,7 +444,30 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 3 || dims == 4) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + // if (elempack == packn) // out_elempack == packn + // { + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int q = 0; q < channels; q++) + // { + // const signed char* ptr = bottom_blob.channel(q); + // signed char* outptr = (signed char*)top_blob + size * q * packn; + + // int n = size * elempack; + // while (n > 0) + // { + // size_t vl = vsetvl_e8m1(n); + + // vint8m1_t _p = vle8_v_i8m1(ptr, vl); + // vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); + + // ptr += vl; + // outptr += 1; + // n -= vl; + // } + // } + // } + + if (elempack == 8) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -433,7 +478,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index accfc683584f..20213c9951f0 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -53,7 +53,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -128,27 +128,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, 
bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return InnerProduct::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8(bottom_blob, top_blob, opt); } #endif @@ -1090,4 +1070,512 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int InnerProduct_riscv::create_pipeline_int8(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + weight_data.release(); + + return 0; +} + +int InnerProduct_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + +#if __riscv_vector + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + vl = 8; + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum1 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum2 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum3 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val0 = vmv_v_x_i8m1(m0[0], vl); + vint8m1_t _val1 = vmv_v_x_i8m1(m1[0], vl); + vint8m1_t _val2 = vmv_v_x_i8m1(m2[0], vl); + vint8m1_t _val3 = vmv_v_x_i8m1(m3[0], vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val0, _w, vl), 0); + vint16m1_t _s1 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val1, _w, vl), 0); + vint16m1_t _s2 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val2, _w, vl), 0); + vint16m1_t _s3 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val3, _w, vl), 0); + + _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m2(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m2(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m2(_sum3, _s3, vl); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32_0 = vfcvt_f_x_v_f32m2(_sum0, vl); + vfloat32m2_t _sumfp32_1 = vfcvt_f_x_v_f32m2(_sum1, vl); + vfloat32m2_t _sumfp32_2 = vfcvt_f_x_v_f32m2(_sum2, vl); + vfloat32m2_t _sumfp32_3 = vfcvt_f_x_v_f32m2(_sum3, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32_0 = vfmacc_vv_f32m2(_bias, _sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmacc_vv_f32m2(_bias, _sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmacc_vv_f32m2(_bias, _sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmacc_vv_f32m2(_bias, _sumfp32_3, _scale_in, vl); + } + else + { + _sumfp32_0 = vfmul_vv_f32m2(_sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmul_vv_f32m2(_sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmul_vv_f32m2(_sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmul_vv_f32m2(_sumfp32_3, _scale_in, vl); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params, vl); + _sumfp32_2 = activation_ps(_sumfp32_2, activation_type, activation_params, vl); + _sumfp32_3 = activation_ps(_sumfp32_3, activation_type, activation_params, vl); + + vsseg4e32_v_f32m2(outptr, _sumfp32_0, _sumfp32_1, _sumfp32_2, _sumfp32_3, vl); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + 
int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + + int n = num_input; + + vl = vsetvlmax_e32m4(); + vint32m4_t _sum0 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3 = vmv_v_x_i32m4(0, vl); + + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val0 = vle8_v_i8m1(m0, vl); + vint8m1_t _val1 = vle8_v_i8m1(m1, vl); + vint8m1_t _val2 = vle8_v_i8m1(m2, vl); + vint8m1_t _val3 = vle8_v_i8m1(m3, vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s0 = vwmul_vv_i16m2(_val0, _w, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_val1, _w, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_val2, _w, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_val3, _w, vl); + + _sum0 = vwadd_wv_i32m4(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m4(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m4(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m4(_sum3, _s3, vl); + + m0 += vl; + m1 += vl; + m2 += vl; + m3 += vl; + + kptr += vl; + n -= vl; + } + + vint32m1_t _sum0_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum1_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum2_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum3_scala = vmv_v_x_i32m1(0, vl); + + vl = vsetvlmax_e32m4(); + _sum0_scala = vredsum_vs_i32m4_i32m1(_sum0_scala, _sum0, _sum0_scala, vl); + _sum1_scala = vredsum_vs_i32m4_i32m1(_sum1_scala, _sum1, _sum1_scala, vl); + _sum2_scala = vredsum_vs_i32m4_i32m1(_sum2_scala, _sum2, _sum2_scala, vl); + _sum3_scala = vredsum_vs_i32m4_i32m1(_sum3_scala, _sum3, _sum3_scala, vl); + sum0 = vmv_x_s_i32m1_i32(_sum0_scala); + sum1 = vmv_x_s_i32m1_i32(_sum1_scala); + sum2 = vmv_x_s_i32m1_i32(_sum2_scala); + sum3 = vmv_x_s_i32m1_i32(_sum3_scala); + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + vint32m2_t _sum = vmv_v_x_i32m2(0, vl); + + int i = 0; + + vl = 8; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(m[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + // int8x8_t _val = vld1_dup_s8(m); + // int8x8_t _w = vld1_s8(kptr); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(_s, 0); + + _sum = vwadd_wv_i32m2(_sum, _s0, vl); + + m++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + 
} + + // _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m2(outptr, _sumfp32, vl); + outptr += 8; + } + } + } +#endif // __riscv_vector + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; +#if __riscv_vector + + int n = num_input; + vint32m4_t _sum = vmv_v_x_i32m4(0, vsetvlmax_e32m4()); + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val = vle8_v_i8m1(m, vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + _sum = vwadd_wv_i32m4(_sum, _s, vl); + + m += vl; + kptr += vl; + n -= vl; + } + + vint32m1_t _sum_scala = vmv_v_x_i32m1(0, vl); + _sum_scala = vredsum_vs_i32m4_i32m1(_sum_scala, _sum, _sum_scala, vl); + sum = vmv_x_s_i32m1_i32(_sum_scala); + +#endif // __riscv_vector + + // for (; i < num_input; i++) \ +// { \ +// sum += *m++ * *kptr++; \ +// } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_vector + if (out_elempack == 8) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + vl = 8; + + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(sptr[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum0 = vwadd_wv_i32m2(_sum0, _s, vl); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum0, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + } + + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + float* outptr = (float*)top_blob + p * 8; + vse32_v_f32m2(outptr, _sumfp32, vl); + } + } +#endif // __riscv_vector + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index d3056d5801d0..77a3b93de12a 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -36,6 +36,11 @@ class InnerProduct_riscv : public InnerProduct int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if __riscv_vector + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* flatten; @@ -43,6 +48,10 @@ class InnerProduct_riscv : public InnerProduct // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index ad5fadea5701..54d98834fb7c 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -110,11 +110,11 @@ static inline int32_t float2int8(vfloat32m1_t _v) return _ret; } -static void print_vint8m1(vint8m1_t _v) +static void print_vint8m1(vint8m1_t _v, int vl = 16) { int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); - vse8_v_i8m1(i8, _v, 16); - for (int i = 0; i < 16; i++) + vse8_v_i8m1(i8, _v, vl); + for (int i = 0; i < vl; i++) { fprintf(stderr, "i8[%d]: %d\n", i, i8[i]); } @@ -132,6 +132,28 @@ static void print_vint32m1(vint32m1_t _v) free(i32); } +static void print_vint32m2(vint32m2_t _v, 
int vl = 8) +{ + int32_t* i32 = (int32_t*)malloc(8 * sizeof(int32_t)); + vse32_v_i32m2(i32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} + +static void print_vfloat32m2(vfloat32m2_t _v, int vl = 8) +{ + float* f32 = (float*)malloc(8 * sizeof(float)); + vse32_v_f32m2(f32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "f32[%d]: %f\n", i, f32[i]); + } + free(f32); +} + static void print_vfloat32m1(vfloat32m1_t _v) { float* f32 = (float*)malloc(4 * sizeof(float)); From 38d5bb1ad6dd8f5c5c42a0180df4eaac9399e49f Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Wed, 14 Feb 2024 15:37:25 +0800 Subject: [PATCH 12/17] change int8 packn to 8 (#7) * change int8 packn to 8 * test_convert packing right --- src/layer/riscv/flatten_riscv.cpp | 53 +++---------------------------- src/layer/riscv/padding_riscv.cpp | 2 +- tests/testutil.cpp | 4 +-- 3 files changed, 7 insertions(+), 52 deletions(-) diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index e10bcb86bd3c..e6a97798a0fb 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -348,7 +348,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } #if __riscv_vector - const int packn = csrr_vlenb() / 2; + const int packn = csrr_vlenb() / 2; // packn should be 8 #endif int w = bottom_blob.w; @@ -365,7 +365,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (opt.use_packing_layout) { - out_elempack = total % 8 == 0 ? 8 : 1; + out_elempack = total % packn == 0 ? packn : 1; } #endif size_t out_elemsize = elemsize / elempack * out_elempack; @@ -394,29 +394,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 2) { #if __riscv_vector - // if (elempack == packn) // out_elempack == packn - // { - // #pragma omp parallel for num_threads(opt.num_threads) - // for (int i = 0; i < h; i++) - // { - // const signed char* ptr = bottom_blob.row(i); - // signed char* outptr = (signed char*)top_blob + w * i * packn; - - // int n = w * elempack; - // while (n > 0) - // { - // size_t vl = vsetvl_e8m1(n); - - // vint8m1_t _p = vle8_v_i8m1(ptr, vl); - // vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); - - // ptr += vl; - // outptr += 1; - // n -= vl; - // } - // } - // } - if (elempack == 8) // must add, because in innerproduct, elempack is 8 + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) @@ -444,30 +422,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 3 || dims == 4) { #if __riscv_vector - // if (elempack == packn) // out_elempack == packn - // { - // #pragma omp parallel for num_threads(opt.num_threads) - // for (int q = 0; q < channels; q++) - // { - // const signed char* ptr = bottom_blob.channel(q); - // signed char* outptr = (signed char*)top_blob + size * q * packn; - - // int n = size * elempack; - // while (n > 0) - // { - // size_t vl = vsetvl_e8m1(n); - - // vint8m1_t _p = vle8_v_i8m1(ptr, vl); - // vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); - - // ptr += vl; - // outptr += 1; - // n -= vl; - // } - // } - // } - - if (elempack == 8) // must add, because in innerproduct, elempack is 8 + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for 
num_threads(opt.num_threads) for (int q = 0; q < channels; q++) diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index 8f4b54da5904..2e2d7471f477 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -510,7 +510,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); #endif diff --git a/tests/testutil.cpp b/tests/testutil.cpp index 2f01b32c8a96..e271eafb4185 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -533,7 +533,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector Date: Wed, 14 Feb 2024 15:59:57 +0800 Subject: [PATCH 13/17] modify header (#8) --- src/layer/riscv/dequantize_riscv.cpp | 1 + src/layer/riscv/innerproduct_riscv.cpp | 1 + src/layer/riscv/quantize_riscv.cpp | 1 + 3 files changed, 3 insertions(+) diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp index a0ccb758120b..4c3900e4ef15 100644 --- a/src/layer/riscv/dequantize_riscv.cpp +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2019 BUG1989. All rights reserved. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. // diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 20213c9951f0..08cddbc151f7 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 91004baebe0a..0430336d3ed6 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2019 BUG1989. All rights reserved. // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. 
// From 639cf3d5a9e37b3beec762c702d5d3cc308fcaf8 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Wed, 14 Feb 2024 23:41:08 +0800 Subject: [PATCH 14/17] Fix pack (#10) * modify * debug * fix quantize and innerproduct * apply code-format changes --------- Co-authored-by: Xinyu302 --- src/layer/riscv/quantize_riscv.cpp | 170 ++++++++++++++++++++++++++++- tests/testutil.cpp | 4 +- 2 files changed, 171 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp index 0430336d3ed6..89d82106aa04 100644 --- a/src/layer/riscv/quantize_riscv.cpp +++ b/src/layer/riscv/quantize_riscv.cpp @@ -517,6 +517,174 @@ int Quantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const O int dims = bottom_blob.dims; int elempack = bottom_blob.elempack; int vl; + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + } + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + } + return 0; + } if (elempack == 4) { @@ -1007,7 +1175,7 @@ int Quantize_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const signed char* outptr0 = top_blob.row(i); vl = 8; - vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); // float16x8_t _scale = vcombine_f16(vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8)), vcvt_f16_f32(vld1q_f32((const float*)scale_data + i * 8 + 4))); + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); for (int j = 0; j < w; j++) { diff 
--git a/tests/testutil.cpp b/tests/testutil.cpp index e271eafb4185..96a7c903103f 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -523,7 +523,7 @@ int test_layer_cpu(int typeindex, const ncnn::ParamDict& pd, const std::vector Date: Thu, 15 Feb 2024 15:49:00 +0800 Subject: [PATCH 15/17] modify padding using pack8 (#12) --- src/layer/riscv/padding_packn.h | 9 ++++++--- src/mat.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 50f5efe1216d..647ef1bb0b13 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -15,7 +15,8 @@ #define _PADDING_PACKN_RVV(SEW, TSEW, LMUL, T, VT) \ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -64,7 +65,8 @@ \ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -143,7 +145,8 @@ \ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ diff --git a/src/mat.h b/src/mat.h index fdf5cc597c4e..b6849565fe13 100644 --- a/src/mat.h +++ b/src/mat.h @@ -1120,7 +1120,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) { - const int packn = cpu_riscv_vlenb() / 1; + const int packn = cpu_riscv_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); int size = (int)total(); From da87ff652b7328091661fa03ef3f3535fb98a142 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Thu, 15 Feb 2024 07:52:41 +0000 Subject: [PATCH 16/17] apply code-format changes --- src/layer/riscv/padding_packn.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 647ef1bb0b13..60465da64ffc 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -15,7 +15,7 @@ #define _PADDING_PACKN_RVV(SEW, TSEW, LMUL, T, VT) \ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ @@ -65,7 +65,7 @@ \ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ @@ -145,7 +145,7 @@ \ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ From 
9586bc59798ecab0a64c0e53d197859025de1d72 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Fri, 16 Feb 2024 23:43:26 +0800 Subject: [PATCH 17/17] finish requantize but has bug (#13) * finish requantize but has bug * fix bug * delete comment * apply code-format changes --------- Co-authored-by: Xinyu302 --- src/layer/riscv/requantize_leakyrelu_pack4.h | 263 ++++ src/layer/riscv/requantize_leakyrelu_pack8.h | 160 +++ src/layer/riscv/requantize_relu_pack4.h | 259 ++++ src/layer/riscv/requantize_relu_pack8.h | 155 +++ src/layer/riscv/requantize_riscv.cpp | 1240 ++++++++++++++++++ src/layer/riscv/requantize_riscv.h | 34 + src/layer/riscv/riscv_usability.h | 64 + 7 files changed, 2175 insertions(+) create mode 100644 src/layer/riscv/requantize_leakyrelu_pack4.h create mode 100644 src/layer/riscv/requantize_leakyrelu_pack8.h create mode 100644 src/layer/riscv/requantize_relu_pack4.h create mode 100644 src/layer/riscv/requantize_relu_pack8.h create mode 100644 src/layer/riscv/requantize_riscv.cpp create mode 100644 src/layer/riscv/requantize_riscv.h diff --git a/src/layer/riscv/requantize_leakyrelu_pack4.h b/src/layer/riscv/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..fd5c4dfd93d2 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack4.h @@ -0,0 +1,263 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_leakyrelu_pack8.h b/src/layer/riscv/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..87991ad94394 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack8.h @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
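+
+// Reference scalar sketch of the fused requantize + leakyrelu computed by the
+// vector kernel below. Illustrative only (the kernel never calls it); it
+// assumes scale_out > 0, so the activation commutes with the folded scale,
+// and it reuses the saturating scalar float2int8() helper from riscv_usability.h.
+static inline signed char requantize_leakyrelu_ref(int v, float scale_in, float scale_out, float bias, float slope)
+{
+    // dequantize (v * scale_in + bias) and requantize (* scale_out) folded into one multiply-add
+    float f = v * (scale_in * scale_out) + bias * scale_out;
+    // leakyrelu applied to the rescaled value
+    if (f < 0.f)
+        f *= slope;
+    // round and saturate to [-127, 127]
+    return float2int8(f);
+}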
+ +static void requantize_leakyrelu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack4.h b/src/layer/riscv/requantize_relu_pack4.h new file mode 100644 index 000000000000..ca5285de57c5 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack4.h @@ -0,0 +1,259 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
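+
+// Layout note for the pack4 relu requantize below (descriptive sketch only):
+// when out_elempack == 8, two consecutive pack4 int32 input channels (q * 2
+// and q * 2 + 1) are fused into one pack8 int8 output channel, so each
+// float2int8relu() call packs four lanes from each input into a single
+// 8-byte store; the out_elempack == 1 path mirrors the leakyrelu pack4
+// variant and scatters one byte to each of the four scalar output channels.
+// Folding the activation into the scales assumes scale_out > 0 (quantization
+// scales are positive), so relu commutes with the final multiply:
+//   int8(relu(v * scale_in + bias) * scale_out) == int8_relu(v * (scale_in * scale_out) + bias * scale_out)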
+ +static void requantize_relu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 
= bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = 
float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack8.h b/src/layer/riscv/requantize_relu_pack8.h new file mode 100644 index 000000000000..e3f18dbb98a3 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack8.h @@ -0,0 +1,155 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
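+
+// Rough scalar picture of one output lane of float2int8relu() as used by the
+// kernel below (an illustrative sketch, not called anywhere; the vector helper
+// rounds with the active FP rounding mode and saturates through vnclip, so
+// roundf() may differ marginally on ties):
+static inline signed char requantize_relu_lane_ref(float x)
+{
+    int i = (int)roundf(x);              // round to nearest
+    if (i > 127) i = 127;                // saturate like the i32 -> i16 -> i8 vnclip chain
+    return (signed char)(i < 0 ? 0 : i); // the final vmax with 0 implements the ReLU clamp
+}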
+ +static void requantize_relu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp new file mode 100644 index 000000000000..81e0e9aaa8da --- /dev/null +++ b/src/layer/riscv/requantize_riscv.cpp @@ -0,0 +1,1240 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
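+
+// Summary of the dispatch in this file (comment added for readability):
+// - elempack == 8 blobs run on LMUL=2 kernels (vfloat32m2_t, 8 lanes per step).
+// - elempack == 4 blobs run on LMUL=1 kernels (vfloat32m1_t) and may repack the
+//   output to elempack 8 or 1 depending on opt.use_packing_layout and whether
+//   the packed dimension is a multiple of 8.
+// - For dims == 3, activation_type 1 (ReLU) and activation_type 2 with a positive
+//   slope (LeakyReLU) take the fused requantize_*_pack4/pack8 kernels included
+//   below; other activations go through activation_ps() on the dequantized value.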
+ +#include "requantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +#if __riscv_vector +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __riscv_vector + +Requantize_riscv::Requantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + +#if __riscv_vector + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? 
vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + vl = 4; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
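+                        // float2int8() rounds, saturates each lane to [-127, 127] and returns
+                        // the four int8 results packed for a single 32-bit store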
+ *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + // float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + // float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + // float32x4_t _bias = vdupq_n_f32(bias_data[0]); + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
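+                        // note: the activation is evaluated on the dequantized value (after
+                        // scale_in and bias) and the output scale is applied afterwards,
+                        // matching int8(act(v * scale_in + bias) * scale_out)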
+ *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + // float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); + // float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * 
scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? 
+                const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q];
+
+                for (int i = 0; i < size; i++)
+                {
+                    float v = intptr[i] * scale_in + bias;
+                    ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out);
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/riscv/requantize_riscv.h b/src/layer/riscv/requantize_riscv.h
new file mode 100644
index 000000000000..df12b54ce3a4
--- /dev/null
+++ b/src/layer/riscv/requantize_riscv.h
@@ -0,0 +1,34 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 Xinyu302. All rights reserved.
+// Copyright (C) 2019 BUG1989. All rights reserved.
+// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_REQUANTIZE_RISCV_H
+#define LAYER_REQUANTIZE_RISCV_H
+
+#include "requantize.h"
+
+namespace ncnn {
+
+class Requantize_riscv : public Requantize
+{
+public:
+    Requantize_riscv();
+
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_REQUANTIZE_RISCV_H
diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h
index 54d98834fb7c..1802a28336dd 100644
--- a/src/layer/riscv/riscv_usability.h
+++ b/src/layer/riscv/riscv_usability.h
@@ -110,6 +110,70 @@ static inline int32_t float2int8(vfloat32m1_t _v)
     return _ret;
 }
 
+static inline int64_t float2int8(vfloat32m2_t _v)
+{
+    int vl = vsetvlmax_e32m2();
+    vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl);
+    vint32m4_t _v32 = vundefined_i32m4();
+    _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2);
+    vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl);
+    vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl);
+    _v8 = vmax_vx_i8m1(_v8, -127, vl);
+    int64_t _ret;
+    vse8_v_i8m1((int8_t*)&_ret, _v8, vl);
+    return _ret;
+}
+
+static inline int64_t float2int8relu(vfloat32m2_t _v)
+{
+    int vl = vsetvlmax_e32m2();
+    vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl);
+    vint32m4_t _v32 = vundefined_i32m4();
+    _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2);
+    vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl);
+    vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl);
+    _v8 = vmax_vx_i8m1(_v8, 0, vl);
+    int64_t _ret;
+    vse8_v_i8m1((int8_t*)&_ret, _v8, vl);
+    return _ret;
+}
+
+static inline int64_t float2int8leakyrelu(vfloat32m2_t _v, vfloat32m2_t _slope)
+{
+    int vl = vsetvlmax_e32m2();
+
+    vfloat32m2_t _v_leaky = vfmul_vv_f32m2(_v, _slope, vl);
+    vint32m2_t _v_int32 = vfcvt_x_f_v_i32m2(_v, vl);
+
+    // vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl);
+    // vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl);
+
+    // vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl);
+    // vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl);
+
+    // vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl);
+    // vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl);
+    vint32m2_t _v_int32_leaky = vfcvt_x_f_v_i32m2(_v_leaky, vl);
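+    // both v and slope * v are converted to int32 here; after narrowing to int8 below,
+    // vmax_vv keeps v for positive inputs and slope * v for negative ones,
+    // which is exactly leaky relu for slopes in [0, 1]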
+
+    vl = 2 * vsetvlmax_e32m1();
+
+    vint32m4_t _v32 = vundefined_i32m4();
+    vint32m4_t _v32_leaky = vundefined_i32m4();
+    _v32 = vset_v_i32m2_i32m4(_v32, 0, _v_int32);
+    _v32_leaky = vset_v_i32m2_i32m4(_v32_leaky, 0, _v_int32_leaky);
+
+    vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl);
+    vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl);
+
+    vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl);
+    vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl);
+
+    _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl);
+    int64_t _ret;
+    vse8_v_i8m1((int8_t*)&_ret, _v8, vl);
+    return _ret;
+}
+
 static void print_vint8m1(vint8m1_t _v, int vl = 16)
 {
     int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t));
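+    // note: fixed 16-byte scratch buffer, so this debug helper assumes vl <= 16 (VLEN <= 128)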