diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h new file mode 100644 index 000000000000..36a36abca8b6 --- /dev/null +++ b/src/layer/riscv/convolution_packed_int8.h @@ -0,0 +1,988 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) +// #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 +// void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +// void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +// #endif + +// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD +// void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +// void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +// #endif +// #endif + +static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + + // clang-format off + // *INDENT-OFF* +#if __riscv_vector + if (outch >= 8) + { + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)64u, 64); + else + kernel_tm.create(maxk, inch, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)8u, 8); + } + else if (outch >= 4) + { + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)32u, 32); + else + kernel_tm.create(maxk, inch, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)4u, 4); + } + else +#endif // __riscv_vector + if (outch >= 2) + { +#if __riscv_vector + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 2 + outch % 2, (size_t)16u, 16); + else +#endif // __riscv_vector + kernel_tm.create(maxk, inch, outch / 2 + outch % 2, (size_t)2u, 2); + } + else + { +#if __riscv_vector + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch, (size_t)8u, 8); + else +#endif // __riscv_vector + kernel_tm.create(maxk, inch, outch, (size_t)1u, 1); + } + // *INDENT-ON* + // clang-format on + + int q = 0; +#if __riscv_vector + for (; q + 7 < outch; q += 8) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + const signed 
char* kptr2 = (const signed char*)kernel + (q + 2) * inch * maxk; + const signed char* kptr3 = (const signed char*)kernel + (q + 3) * inch * maxk; + const signed char* kptr4 = (const signed char*)kernel + (q + 4) * inch * maxk; + const signed char* kptr5 = (const signed char*)kernel + (q + 5) * inch * maxk; + const signed char* kptr6 = (const signed char*)kernel + (q + 6) * inch * maxk; + const signed char* kptr7 = (const signed char*)kernel + (q + 7) * inch * maxk; + + signed char* g00 = kernel_tm.channel(q / 8); + + int p = 0; + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + const signed char* k4 = kptr4 + k; + const signed char* k5 = kptr5 + k; + const signed char* k6 = kptr6 + k; + const signed char* k7 = kptr7 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00[2] = k1[0]; + g00[3] = k1[maxk]; + g00[4] = k2[0]; + g00[5] = k2[maxk]; + g00[6] = k3[0]; + g00[7] = k3[maxk]; + g00[8] = k4[0]; + g00[9] = k4[maxk]; + g00[10] = k5[0]; + g00[11] = k5[maxk]; + g00[12] = k6[0]; + g00[13] = k6[maxk]; + g00[14] = k7[0]; + g00[15] = k7[maxk]; + g00 += 16; + k0 += maxk * 2; + k1 += maxk * 2; + k2 += maxk * 2; + k3 += maxk * 2; + k4 += maxk * 2; + k5 += maxk * 2; + k6 += maxk * 2; + k7 += maxk * 2; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + kptr2 += maxk * 8; + kptr3 += maxk * 8; + kptr4 += maxk * 8; + kptr5 += maxk * 8; + kptr6 += maxk * 8; + kptr7 += maxk * 8; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + const signed char* k4 = kptr4 + k; + const signed char* k5 = kptr5 + k; + const signed char* k6 = kptr6 + k; + const signed char* k7 = kptr7 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00[4] = k4[0]; + g00[5] = k5[0]; + g00[6] = k6[0]; + g00[7] = k7[0]; + g00 += 8; + } + + kptr0 += maxk; + kptr1 += maxk; + kptr2 += maxk; + kptr3 += maxk; + kptr4 += maxk; + kptr5 += maxk; + kptr6 += maxk; + kptr7 += maxk; + } + } + for (; q + 3 < outch; q += 4) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + const signed char* kptr2 = (const signed char*)kernel + (q + 2) * inch * maxk; + const signed char* kptr3 = (const signed char*)kernel + (q + 3) * inch * maxk; + + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + int p = 0; + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00[2] = k1[0]; + g00[3] = k1[maxk]; + g00[4] = k2[0]; + g00[5] = k2[maxk]; + g00[6] = k3[0]; + g00[7] = k3[maxk]; + g00 += 8; + k0 += maxk * 2; + k1 += maxk * 2; + k2 += maxk * 2; + k3 += maxk * 2; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + kptr2 += maxk * 8; + kptr3 += maxk * 8; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = 
k2[0]; + g00[3] = k3[0]; + g00 += 4; + } + + kptr0 += maxk; + kptr1 += maxk; + kptr2 += maxk; + kptr3 += maxk; + } + } +#endif // __riscv_vector + for (; q + 1 < outch; q += 2) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + +#if __riscv_vector + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2); +#else + signed char* g00 = kernel_tm.channel(q / 2); +#endif + + int p = 0; +#if __riscv_vector + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + for (int i = 0; i < 4; i++) + { + g00[0] = k1[0]; + k1 += maxk; + g00 += 1; + } + + for (int i = 4; i < 8; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + for (int i = 4; i < 8; i++) + { + g00[0] = k1[0]; + k1 += maxk; + g00 += 1; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + } +#endif // __riscv_vector + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00 += 2; + } + + kptr0 += maxk; + kptr1 += maxk; + } + } + for (; q < outch; q++) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; + +#if __riscv_vector + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2); +#else + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + int p = 0; +#if __riscv_vector + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + for (int i = 0; i < 8; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + } + + kptr += maxk * 8; + } +#endif // __riscv_vector + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + g00[0] = k0[0]; + g00++; + } + + kptr += maxk; + } + } +} + +static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + // #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) + // #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 + // if (ncnn::cpu_support_arm_i8mm()) + // { + // convolution_packed_int8_i8mm(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + // return; + // } + // #endif + + // #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD + // if (ncnn::cpu_support_arm_asimddp()) + // { + // convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + // return; + // } + // #endif + // #endif + int vl; + + const int w = bottom_blob.w; + const int elempack = bottom_blob.elempack; + const int inch = bottom_blob.c * elempack; + + const int N = bottom_blob.cstep * elempack; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int outch = top_blob.c * out_elempack; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + 
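+            // record the flat input offset of every kernel tap (scaled by elempack) so the conv loops below can reach dilated taps with a single add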
for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2 * elempack; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + int nn_outch = 0; + int remain_outch_start = 0; +#if __riscv_vector + nn_outch = (outch - remain_outch_start) / 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 8; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + const int M = top_blob.cstep * out_elempack; + + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + // int32x4_t _sum2 = vdupq_n_s32(0); + // int32x4_t _sum3 = vdupq_n_s32(0); + + vl = 8; + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // vint32m2_t _sum23 = vmv_v_x_i32m2(0, vl); + + const signed char* kptr = weight_data_tm.channel(p / 8); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + // int8x16_t _w2 = vld1q_s8(kptr + 32); + // int8x16_t _w3 = vld1q_s8(kptr + 48); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vint8m1_t _w1 = vle8_v_i8m1(kptr + 16, vl); + vint8m1_t _w2 = vle8_v_i8m1(kptr + 32, vl); + vint8m1_t _w3 = vle8_v_i8m1(kptr + 48, vl); + + vl = 8; + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + vint16m1_t _rr0 = vreinterpret_v_i8m1_i16m1(_r0); + + vint8m1_t _r0ll = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 0, vl)); + vint8m1_t _r0lh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 1, vl)); + vint8m1_t _r0hl = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 2, vl)); + vint8m1_t _r0hh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 3, vl)); + + // uint8_t mask[8] = {8, 9, 10, 11, 12, 13, 14, 15}; + // vuint8m1_t _index = vle8_v_u8m1(mask, vl); + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + vint16m2_t _s0l_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); + vint16m2_t _s1l_m2 = vwmul_vv_i16m2(_r0ll, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + vint16m2_t _s0h_m2 = vwmul_vv_i16m2(_r0lh, _w1, vl); + vint16m2_t _s1h_m2 = vwmul_vv_i16m2(_r0lh, vslidedown_vx_i8m1(_w1, _w1, 8, vl), vl); + + // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + // int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); + // int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + + // vint16m1_t _s0l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl), 0); + // vint16m1_t _s1l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1l_m2, _r0hl, 
vrgather_vv_i8m1(_w2, _index, vl), vl), 0); + // vint16m1_t _s2l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl), 0); + // vint16m1_t _s3l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1h_m2, _r0hh, vrgather_vv_i8m1(_w3, _index, vl), vl), 0); + + _s0l_m2 = vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl); + _s1l_m2 = vwmacc_vv_i16m2(_s1l_m2, _r0hl, vslidedown_vx_i8m1(_w2, _w2, 8, vl), vl); + _s0h_m2 = vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl); + _s1h_m2 = vwmacc_vv_i16m2(_s1h_m2, _r0hh, vslidedown_vx_i8m1(_w3, _w3, 8, vl), vl); + + // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + // _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); + + vint16m2_t _s01l = vset_v_i16m1_i16m2(_s0l_m2, 1, vget_v_i16m2_i16m1(_s1l_m2, 0)); + vint16m2_t _s01h = vset_v_i16m1_i16m2(_s0h_m2, 1, vget_v_i16m2_i16m1(_s1h_m2, 0)); + uint16_t odd_index[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + uint16_t even_index[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + vuint16m2_t _odd_index = vle16_v_u16m2(odd_index, vl); + vuint16m2_t _even_index = vle16_v_u16m2(even_index, vl); + + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01l, _odd_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01l, _even_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01h, _odd_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01h, _even_index, vl), 0), vl); + + // _sum0 = vpadalq_s16(_sum0, _s0l); + // _sum1 = vpadalq_s16(_sum1, _s1l); + // _sum2 = vpadalq_s16(_sum2, _s0h); + // _sum3 = vpadalq_s16(_sum3, _s1h); + + kptr += 64; + } + } + + { + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + // _sum01 = vadd_vv_i32m2(_sum01, _sum23, vl); + } + } + for (; q < inch; q++) + { + vl = 8; + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + vint8m1_t _val = vmv_v_x_i8m1(r0s[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _val = vdup_n_s8(r0s[0]); + // int8x8_t _w = vld1_s8(kptr); + // int16x8_t _s0 = vmull_s8(_val, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + + kptr += 8; + } + } + } + vl = 8; + + if (out_elempack == 8) + { + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + 4, _sum1); + vse32_v_i32m2(outptr, _sum01, vl); + outptr += 8; + } + if (out_elempack == 4) + { + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + M, _sum1); + vl = 4; + vse32_v_i32m1(outptr, vget_v_i32m2_i32m1(_sum01, 0), vl); + vse32_v_i32m1(outptr + M, vget_v_i32m2_i32m1(_sum01, 1), vl); + outptr += 4; + } + if (out_elempack == 1) + { + vsse32_v_i32m2(outptr, M * sizeof(int), _sum01, vl); + // outptr[0] = vgetq_lane_s32(_sum0, 0); + // outptr[M] = vgetq_lane_s32(_sum0, 1); + // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + // outptr[M * 4] = vgetq_lane_s32(_sum1, 0); + // outptr[M * 5] = vgetq_lane_s32(_sum1, 1); + // outptr[M * 6] = vgetq_lane_s32(_sum1, 2); + // outptr[M * 7] = vgetq_lane_s32(_sum1, 3); + outptr += 1; + } + } + } + remain_outch_start += nn_outch * 8; + nn_outch = (outch - remain_outch_start) / 4; + 
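+    // leftover output channels are handled 4, then 2, then 1 at a time below, reusing the same widening multiply-accumulate pattern as the 8-channel block above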
for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 4; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + const int M = top_blob.cstep * out_elempack; + + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + vl = 4; + + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vint8m1_t _w1 = vle8_v_i8m1(kptr + 16, vl); + vl = 8; + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + vint16m1_t _rr0 = vreinterpret_v_i8m1_i16m1(_r0); + + vint8m1_t _r0ll = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 0, vl)); + vint8m1_t _r0lh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 1, vl)); + vint8m1_t _r0hl = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 2, vl)); + vint8m1_t _r0hh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 3, vl)); + + vint16m2_t _sl_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); + vint16m2_t _sh_m2 = vwmul_vv_i16m2(_r0lh, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + _sl_m2 = vwmacc_vv_i16m2(_sl_m2, _r0hl, _w1, vl); + _sh_m2 = vwmacc_vv_i16m2(_sh_m2, _r0hh, vslidedown_vx_i8m1(_w1, _w1, 8, vl), vl); + + vint16m1_t _sl = vget_v_i16m2_i16m1(_sl_m2, 0); + vint16m1_t _sh = vget_v_i16m2_i16m1(_sh_m2, 0); + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + // int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0)); + // _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1)); + // _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1)); + vl = 4; + + uint16_t odd_index[4] = {1, 3, 5, 7}; + uint16_t even_index[4] = {0, 2, 4, 6}; + vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); + vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); + + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sl, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sl, _even_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sh, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sh, _even_index, vl), vl); + + // _sum0 = vpadalq_s16(_sum0, _sl); + // _sum1 = vpadalq_s16(_sum1, _sh); + + kptr += 32; + } + } + // { + // _sum0 = vaddq_s32(_sum0, _sum1); + // } + } + for (; q < inch; q++) + { + vl = 4; + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * 
stride_w; + + for (int k = 0; k < maxk; k++) + { + vl = 4; + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + vint8m1_t _val = vmv_v_x_i8m1(r0s[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _val = vdup_n_s8(r0s[0]); + // int8x8_t _w = vld1_s8(kptr); + // int16x8_t _s0 = vmull_s8(_val, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + + kptr += 4; + } + } + } + vl = 4; + + if (out_elempack == 4) + { + // vst1q_s32(outptr, _sum0); + vse32_v_i32m2(outptr, _sum01, vl); + outptr += 4; + } + if (out_elempack == 1) + { + vsse32_v_i32m2(outptr, M * sizeof(int), _sum01, vl); + // outptr[0] = vgetq_lane_s32(_sum0, 0); + // outptr[M] = vgetq_lane_s32(_sum0, 1); + // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + outptr += 1; + } + } + } + remain_outch_start += nn_outch * 4; + nn_outch = (outch - remain_outch_start) / 2; +#else // __riscv_vector + nn_outch = (outch - remain_outch_start) / 2; + #pragma omp parallel for num_threads(opt.num_threads) +#endif // __riscv_vector + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 2; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum0 = 0; + int sum1 = 0; + +#if __riscv_vector + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2); +#endif + + int q = 0; +#if __riscv_vector + { + // int32x4_t _sum01 = vdupq_n_s32(0); + vl = 4; + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + for (; q + 7 < inch; q += 8) + { + vl = 8; + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vl = 8; + // fprintf(stderr, "r0: \n"); + // print_vint8m1(_r0, 8); + vint8m1_t _r0l = vslideup_vx_i8m1(_r0, _r0, 4, vl); + vint8m1_t _r0h = vslidedown_vx_i8m1(_r0, _r0, 4, vl); + _r0h = vslideup_vx_i8m1(_r0h, _r0h, 4, vl); + + // vint32m1_t _r0_i16 = vreinterpret_v_i32m1_i8m1(_r0); + + // int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); + // int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); + // int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); + + vint16m2_t _s01_m2 = vwmul_vv_i16m2(_r0l, _w0, vl); + _s01_m2 = vwmacc_vv_i16m2(_s01_m2, _r0h, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + vint16m1_t _s01 = vget_v_i16m2_i16m1(_s01_m2, 0); + + vl = 4; + uint16_t odd_index[4] = {1, 3, 5, 7}; + uint16_t even_index[4] = {0, 2, 4, 6}; + vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); + vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); + 
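+                        // gather the odd and even int16 lanes separately and widen-add both halves into the int32 accumulator;
+                        // together this reproduces the pairwise add-accumulate (vpadalq_s16) of the commented NEON reference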
_sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _even_index, vl), vl); + + // int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); + // _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); + // _sum01 = vpadalq_s16(_sum01, _s01); + + kptr += 16; + } + } + int res[4] = {0, 0, 0, 0}; + vl = 4; + vse32_v_i32m2(res, _sum01, vl); + sum0 += (res[0] + res[1]); + sum1 += (res[2] + res[3]); + // int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); + // sum0 += vget_lane_s32(_s0, 0); + // sum1 += vget_lane_s32(_s0, 1); + } +#endif // __riscv_vector + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r0s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr0 += 1; + outptr1 += 1; + } + } + remain_outch_start += nn_outch * 2; + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum = 0; + +#if __riscv_vector + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); +#endif + + int q = 0; +#if __riscv_vector + { + vl = 8; + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + vint8m1_t _r0; + // int8x8_t _r0; + if (elempack == 8) + { + // _r0 = vld1_s8(r0s); + _r0 = vle8_v_i8m1(r0s, vl); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_r0, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _w = vld1_s8(kptr); + + // int16x8_t _s0 = vmull_s8(_r0, _w); + + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + + kptr += 8; + } + } + // int32x4_t _sum = vaddq_s32(_sum0, _sum1); + // #if __aarch64__ + vl = 8; + vint32m1_t _scalar_sum = vmv_s_x_i32m1(vint32m1_t(), sum, vl); + sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(_scalar_sum, _sum01, _scalar_sum, vl)); + // int res[8] = {0, 0, 0, 0}; + // vl = 4; + // vse32_v_i32m2(res, _sum01, vl); + // sum += (res[0] + res[1] + res[2] + res[3]); + // sum += vaddvq_s32(_sum); + // #else + // int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); + // _ss = vpadd_s32(_ss, _ss); + // sum += vget_lane_s32(_ss, 0); + // #endif + } +#endif // __riscv_vector + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum += r0s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum; + outptr += 1; + } + } +} diff --git 
a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index be413e5be252..6aaea2b90fdf 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -35,6 +35,10 @@ namespace ncnn { #include "convolution_1x1.h" #include "convolution_3x3.h" +#if NCNN_INT8 +#include "convolution_packed_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolution_packn.h" #include "convolution_pack1ton.h" @@ -134,7 +138,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -259,27 +263,28 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8(bottom_blob, top_blob, opt); + // Mat bottom_blob_unpacked = bottom_blob; + // if (bottom_blob.elempack != 1) + // { + // Option opt_pack1 = opt; + // opt_pack1.blob_allocator = opt.workspace_allocator; + + // convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + // } + + // Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; + // if (bottom_blob_unpacked.elembits() == 16) + // { + // Option opt_pack1 = opt; + // opt_pack1.blob_allocator = opt.workspace_allocator; + + // cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); + // } + + // Option opt_unpacked = opt; + // opt_unpacked.use_packing_layout = false; + // return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif @@ -1102,4 +1107,179 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int Convolution_riscv::create_pipeline_int8(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; + // #if NCNN_ARM82DOT + // if (ncnn::cpu_support_arm_asimddp()) + // { + // prefer_winograd = false; + // } + // #endif + +#if 0 + if (opt.use_winograd_convolution && prefer_winograd) + { + if (opt.use_winograd43_convolution) + conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); + else + conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); + } + else +#endif + { + 
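+        // generic path: repack the weights into the pb-pa-kw-kh-inch/pa-outch/pb layout consumed by convolution_packed_int8 (the winograd and im2col-gemm branches above are disabled for now)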
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + weight_data.release(); + + return 0; +} + +int Convolution_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + // NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h); + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; +#if __riscv_vector && __riscv_zfh + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } +#endif // __riscv_vector && __riscv_zfh + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + // NCNN_LOGE("forward_int8_arm %d %d %d %d %d", w, h, bottom_blob_bordered.c, elempack, out_elempack); + + int channels = bottom_blob_bordered.c; + const int num_input = channels * elempack; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; + // #if NCNN_ARM82DOT + // if (ncnn::cpu_support_arm_asimddp()) + // { + // prefer_winograd = false; + // } + // #endif + + int out_elempack_int32 = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + // int _nT = nT ? 
nT : opt.num_threads; + // if (nT != 0 && opt.num_threads != nT) + // { + // // force num_threads the same as in create_pipeline + // // so we could use pre-packed A/B from the same tile config + // NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); + // } +#if 0 + if (opt.use_winograd_convolution && prefer_winograd) + { + if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) + conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); + else + conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + } + else +#endif + { + convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + bottom_blob_bordered.release(); + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index a4e008c9dd1d..8c5ee015de1a 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -38,6 +38,11 @@ class Convolution_riscv : public Convolution int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* activation; @@ -48,6 +53,10 @@ class Convolution_riscv : public Convolution // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h new file mode 100644 index 000000000000..b86932b66b28 --- /dev/null +++ b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h @@ -0,0 +1,283 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302 Limited. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
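Not part of the patch: a minimal scalar sketch of the int8 scale bookkeeping that create_pipeline_int8 / forward_int8 above (and the depthwise path further below) rely on. The helper names here are invented for illustration, and the clamping/rounding only approximates ncnn's float2int8.

```cpp
// Illustrative sketch only -- not ncnn API. The patch performs the same
// bookkeeping per output channel via scale_in_data, dequantize_from_int32
// and requantize_from_int32_to_int8.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// int32 accumulator -> float, using scale_in = 1 / (bottom_scale * weight_scale)
static float dequantize_acc(int32_t sum, float bottom_scale, float weight_scale, float bias)
{
    const float scale_in = (weight_scale == 0.f) ? 0.f : 1.f / (bottom_scale * weight_scale);
    return sum * scale_in + bias;
}

// optional requantize back to int8 (selected when int8_scale_term > 100)
static int8_t requantize_acc(int32_t sum, float bottom_scale, float weight_scale, float bias, float top_scale)
{
    const float f = dequantize_acc(sum, bottom_scale, weight_scale, bias);
    const float q = std::round(f * top_scale);
    return (int8_t)std::min(127.f, std::max(-127.f, q));
}

int main()
{
    // example numbers, chosen arbitrarily
    std::printf("%f\n", dequantize_acc(1234, 64.f, 32.f, 0.5f));
    std::printf("%d\n", (int)requantize_acc(1234, 64.f, 32.f, 0.5f, 48.f));
    return 0;
}
```

In the patch the activation is folded into requantize_from_int32_to_int8 on the requantize path, while the dequantize path applies it afterwards with activation->forward_inplace.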
+ +static void convdw3x3s1_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + fprintf(stderr, "convdw3x3s1_pack8_int8_rvv\n"); + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + int vl = csrr_vlenb() / 2; + + const int group = bottom_blob.c; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const signed char* k0 = kernel.row(g); + + int* outptr0 = out.row(0); + int* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const signed char* r0 = img0.row(0); + const signed char* r1 = img0.row(1); + const signed char* r2 = img0.row(2); + const signed char* r3 = img0.row(3); + + vl = 8; + vint8m1_t _k00 = vle8_v_i8m1(k0, vl); + vint8m1_t _k01 = vle8_v_i8m1(k0 + 8, vl); + vint8m1_t _k02 = vle8_v_i8m1(k0 + 16, vl); + vint8m1_t _k10 = vle8_v_i8m1(k0 + 24, vl); + vint8m1_t _k11 = vle8_v_i8m1(k0 + 32, vl); + vint8m1_t _k12 = vle8_v_i8m1(k0 + 40, vl); + vint8m1_t _k20 = vle8_v_i8m1(k0 + 48, vl); + vint8m1_t _k21 = vle8_v_i8m1(k0 + 56, vl); + vint8m1_t _k22 = vle8_v_i8m1(k0 + 64, vl); + + // int8x8_t _k00 = vld1_s8(k0); + // int8x8_t _k01 = vld1_s8(k0 + 8); + // int8x8_t _k02 = vld1_s8(k0 + 16); + // int8x8_t _k10 = vld1_s8(k0 + 24); + // int8x8_t _k11 = vld1_s8(k0 + 32); + // int8x8_t _k12 = vld1_s8(k0 + 40); + // int8x8_t _k20 = vld1_s8(k0 + 48); + // int8x8_t _k21 = vld1_s8(k0 + 56); + // int8x8_t _k22 = vld1_s8(k0 + 64); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _r00 = vle8_v_i8m1(r0, vl); + vint8m1_t _r01 = vle8_v_i8m1(r0 + 8, vl); + vint8m1_t _r02 = vle8_v_i8m1(r0 + 16, vl); + vint8m1_t _r10 = vle8_v_i8m1(r1, vl); + vint8m1_t _r11 = vle8_v_i8m1(r1 + 8, vl); + vint8m1_t _r12 = vle8_v_i8m1(r1 + 16, vl); + vint8m1_t _r20 = vle8_v_i8m1(r2, vl); + vint8m1_t _r21 = vle8_v_i8m1(r2 + 8, vl); + vint8m1_t _r22 = vle8_v_i8m1(r2 + 16, vl); + + // int8x8_t _r00 = vld1_s8(r0); + // int8x8_t _r01 = vld1_s8(r0 + 8); + // int8x8_t _r02 = vld1_s8(r0 + 16); + // int8x8_t _r10 = vld1_s8(r1); + // int8x8_t _r11 = vld1_s8(r1 + 8); + // int8x8_t _r12 = vld1_s8(r1 + 16); + // int8x8_t _r20 = vld1_s8(r2); + // int8x8_t _r21 = vld1_s8(r2 + 8); + // int8x8_t _r22 = vld1_s8(r2 + 16); + + vint16m2_t _s0 = vwmul_vv_i16m2(_r00, _k00, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_r01, _k01, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_r02, _k02, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_r10, _k10, vl); + + // int16x8_t _s0 = vmull_s8(_r00, _k00); + // int16x8_t _s1 = vmull_s8(_r01, _k01); + // int16x8_t _s2 = vmull_s8(_r02, _k02); + // int16x8_t _s3 = vmull_s8(_r10, _k10); + + _s0 = vwmacc_vv_i16m2(_s0, _r11, _k11, vl); + _s1 = vwmacc_vv_i16m2(_s1, _r12, _k12, vl); + _s2 = vwmacc_vv_i16m2(_s2, _r20, _k20, vl); + _s3 = vwmacc_vv_i16m2(_s3, _r21, _k21, vl); + + // _s0 = vmlal_s8(_s0, _r11, _k11); + // _s1 = vmlal_s8(_s1, _r12, _k12); + // _s2 = vmlal_s8(_s2, _r20, _k20); + // _s3 = vmlal_s8(_s3, _r21, _k21); + + vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); + // int16x8_t _s4 = vmull_s8(_r22, _k22); + + vint16m1_t _s0_m1 = vget_v_i16m2_i16m1(_s0, 0); + vint16m1_t _s1_m1 = vget_v_i16m2_i16m1(_s1, 0); + vint16m1_t _s2_m1 = vget_v_i16m2_i16m1(_s2, 0); + vint16m1_t _s3_m1 = vget_v_i16m2_i16m1(_s3, 0); + vint16m1_t _s4_m1 = vget_v_i16m2_i16m1(_s4, 0); + + vint32m2_t _sum = vwadd_vv_i32m2(_s0_m1, _s1_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s2_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s3_m1, vl); + _sum = 
vwadd_wv_i32m2(_sum, _s4_m1, vl); + + // int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1)); + // int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1)); + // int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3)); + // int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3)); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4)); + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + + vse32_v_i32m2(outptr0, _sum, vl); + // vst1q_s32(outptr0, _sum0); + // vst1q_s32(outptr0 + 4, _sum1); + r0 += 8; + r1 += 8; + r2 += 8; + outptr0 += 8; + } + + r0 += 2 * 8; + r1 += 2 * 8; + r2 += 2 * 8; + } + } +} + +static void convdw3x3s2_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + fprintf(stderr, "convdw3x3s2_pack8_int8_rvv\n"); + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + int vl = csrr_vlenb() / 2; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 8; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const signed char* k0 = kernel.row(g); + + int* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const signed char* r0 = img0.row(0); + const signed char* r1 = img0.row(1); + const signed char* r2 = img0.row(2); + + vint8m1_t _k00 = vle8_v_i8m1(k0, vl); + vint8m1_t _k01 = vle8_v_i8m1(k0 + 8, vl); + vint8m1_t _k02 = vle8_v_i8m1(k0 + 16, vl); + vint8m1_t _k10 = vle8_v_i8m1(k0 + 24, vl); + vint8m1_t _k11 = vle8_v_i8m1(k0 + 32, vl); + vint8m1_t _k12 = vle8_v_i8m1(k0 + 40, vl); + vint8m1_t _k20 = vle8_v_i8m1(k0 + 48, vl); + vint8m1_t _k21 = vle8_v_i8m1(k0 + 56, vl); + vint8m1_t _k22 = vle8_v_i8m1(k0 + 64, vl); + + // int8x8_t _k00 = vld1_s8(k0); + // int8x8_t _k01 = vld1_s8(k0 + 8); + // int8x8_t _k02 = vld1_s8(k0 + 16); + // int8x8_t _k10 = vld1_s8(k0 + 24); + // int8x8_t _k11 = vld1_s8(k0 + 32); + // int8x8_t _k12 = vld1_s8(k0 + 40); + // int8x8_t _k20 = vld1_s8(k0 + 48); + // int8x8_t _k21 = vld1_s8(k0 + 56); + // int8x8_t _k22 = vld1_s8(k0 + 64); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _r00 = vle8_v_i8m1(r0, vl); + vint8m1_t _r01 = vle8_v_i8m1(r0 + 8, vl); + vint8m1_t _r02 = vle8_v_i8m1(r0 + 16, vl); + vint8m1_t _r10 = vle8_v_i8m1(r1, vl); + vint8m1_t _r11 = vle8_v_i8m1(r1 + 8, vl); + vint8m1_t _r12 = vle8_v_i8m1(r1 + 16, vl); + vint8m1_t _r20 = vle8_v_i8m1(r2, vl); + vint8m1_t _r21 = vle8_v_i8m1(r2 + 8, vl); + vint8m1_t _r22 = vle8_v_i8m1(r2 + 16, vl); + + // int8x8_t _r00 = vld1_s8(r0); + // int8x8_t _r01 = vld1_s8(r0 + 8); + // int8x8_t _r02 = vld1_s8(r0 + 16); + // int8x8_t _r10 = vld1_s8(r1); + // int8x8_t _r11 = vld1_s8(r1 + 8); + // int8x8_t _r12 = vld1_s8(r1 + 16); + // int8x8_t _r20 = vld1_s8(r2); + // int8x8_t _r21 = vld1_s8(r2 + 8); + // int8x8_t _r22 = vld1_s8(r2 + 16); + + vint16m2_t _s0 = vwmul_vv_i16m2(_r00, _k00, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_r01, _k01, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_r02, _k02, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_r10, _k10, vl); + + _s0 = vwmacc_vv_i16m2(_s0, _r11, _k11, vl); + _s1 = vwmacc_vv_i16m2(_s1, _r12, _k12, vl); + _s2 = vwmacc_vv_i16m2(_s2, _r20, _k20, vl); + _s3 = vwmacc_vv_i16m2(_s3, _r21, _k21, vl); + + vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); + + // int16x8_t _s0 = vmull_s8(_r00, _k00); + // int16x8_t _s1 = vmull_s8(_r01, _k01); + // int16x8_t _s2 = 
vmull_s8(_r02, _k02); + // int16x8_t _s3 = vmull_s8(_r10, _k10); + // _s0 = vmlal_s8(_s0, _r11, _k11); + // _s1 = vmlal_s8(_s1, _r12, _k12); + // _s2 = vmlal_s8(_s2, _r20, _k20); + // _s3 = vmlal_s8(_s3, _r21, _k21); + // int16x8_t _s4 = vmull_s8(_r22, _k22); + + vint16m1_t _s0_m1 = vget_v_i16m2_i16m1(_s0, 0); + vint16m1_t _s1_m1 = vget_v_i16m2_i16m1(_s1, 0); + vint16m1_t _s2_m1 = vget_v_i16m2_i16m1(_s2, 0); + vint16m1_t _s3_m1 = vget_v_i16m2_i16m1(_s3, 0); + vint16m1_t _s4_m1 = vget_v_i16m2_i16m1(_s4, 0); + + vint32m2_t _sum = vwadd_vv_i32m2(_s0_m1, _s1_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s2_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s3_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s4_m1, vl); + + vse32_v_i32m2(outptr0, _sum, vl); + + // int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1)); + // int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1)); + // int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3)); + // int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3)); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4)); + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + + // vst1q_s32(outptr0, _sum0); + // vst1q_s32(outptr0 + 4, _sum1); + + r0 += 16; + r1 += 16; + r2 += 16; + outptr0 += 8; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index d913fe7e1d59..7d7f77d1ca39 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -28,6 +28,10 @@ namespace ncnn { #include "convolutiondepthwise_3x3.h" +#if NCNN_INT8 +#include "convolutiondepthwise_3x3_pack8_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolutiondepthwise_3x3_packn.h" #include "convolutiondepthwise_5x5_packn.h" @@ -61,7 +65,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -238,27 +242,8 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + fprintf(stderr, "ConvolutionDepthWise_riscv::forward int8 scale is called\n"); + return forward_int8(bottom_blob, top_blob, opt); } #endif @@ -1153,4 +1138,473 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int ConvolutionDepthWise_riscv::create_pipeline_int8(const Option& opt) +{ + int vl; + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels 
== group && group == num_output) + { + int elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + elempack = channels % 8 == 0 ? 8 : 1; + } +#endif // __riscv_vector + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 8, opt); + } + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + weight_data.release(); + + return 0; + } + + // group convolution + create_group_ops(opt); + + weight_data.release(); + + return 0; +} + +int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + fprintf(stderr, "bottom_blob_bordered %d %d %d %d %d\n", bottom_blob_bordered.w, bottom_blob_bordered.h, bottom_blob_bordered.c, bottom_blob_bordered.elempack, bottom_blob_bordered.elemsize); + + // depth-wise + if (channels * elempack == group && group == num_output) // depth-wise conv, 逐通道卷积 + { + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __riscv_vector + bool use_int8_requantize = int8_scale_term > 100; + fprintf(stderr, "In 1246 use_int8_requantize = %d\n", use_int8_requantize); + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } + + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // TODO use fp16 / bf16 + out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + fprintf(stderr, "kernel_w = %d, kernel_h = %d, dilation_w = %d, dilation_h = %d, stride_w = %d, stride_h = %d, activation_type = %d\n", kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type); + fprintf(stderr, "elempack = %d, out_elempack = %d\n", elempack, out_elempack); + +#if __riscv_vector + if (elempack == 8) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s1_pack8_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + + Mat scale_in_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s2_pack8_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + + Mat scale_in_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + fprintf(stderr, "use_int8_requantize = %d\n", use_int8_requantize); + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < 
outw; j++) + { + vl = 8; + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + vint8m1_t _val = vle8_v_i8m1(sptr + space_ofs[k] * 8, vl); + vint8m1_t _w = vle8_v_i8m1(kptr + k * 8, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + + _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); + } + + vfloat32m2_t _scale_in; + { + vfloat32m2_t _bottom_blob_int8_scales = vle32_v_f32m2((const float*)bottom_blob_int8_scales + g * 8, vl); + vfloat32m2_t _weight_data_int8_scales = vle32_v_f32m2((const float*)weight_data_int8_scales + g * 8, vl); + + _scale_in = vfdiv_vv_f32m2(vfmv_v_f_f32m2(1.f, vl), vfmul_vv_f32m2(_bottom_blob_int8_scales, _weight_data_int8_scales, vl), vl); + vbool16_t _is_zero = vmfeq_vv_f32m2_b16(_bottom_blob_int8_scales, vfmv_v_f_f32m2(0.f, vl), vl); + _scale_in = vfsub_vv_f32m2_m(_is_zero, _scale_in, _scale_in, _scale_in, vl); + } + + vfloat32m2_t _sumfp32 = vfmul_vv_f32m2(vfcvt_f_x_v_f32m2(_sum0, vl), _scale_in, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + g * 8, vl); + _sumfp32 = vfadd_vv_f32m2(_sumfp32, _bias, vl); + } + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + if (use_int8_requantize) + { + // requantize + vfloat32m2_t _scale_out = vle32_v_f32m2((const float*)top_blob_int8_scales + g * 8, vl); + vfloat32m2_t _res = vfmul_vv_f32m2(_sumfp32, _scale_out, vl); + int64_t _sum8 = float2int8(vget_v_f32m2_f32m1(_res, 0), vget_v_f32m2_f32m1(_res, 1)); + *(int64_t*)outptr_s8 = _sum8; + outptr_s8 += 8; + } + else + { + // dequantize + vse32_v_f32m2(outptr_f32, _sumfp32, vl); + outptr_f32 += 8; + } + } + } + } + } + } +#endif // __riscv_vector + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + if (use_int8_requantize) 
+ out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + // #if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } + // #endif + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + fprintf(stderr, "group = %d, num_output = %d\n", group, num_output); + + fprintf(stderr, "channels_g = %d, num_output_g = %d\n", channels_g, num_output_g); + + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + g_elempack = channels_g % 8 == 0 ? 8 : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; + else + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index f9503975296d..98b1884d298b 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -39,6 +39,11 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* activation; std::vector group_ops; diff --git a/src/net.cpp b/src/net.cpp index ff2ab6091373..c365fd3e174f 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -708,7 +708,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio if (elembits == 8) { #if NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / 1; + const int packn = ncnn::cpu_riscv_vlenb() / 2; 
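+            // halving packn for int8 (vlenb / 2) makes VLEN=128 targets use elempack 8, matching the pack8 int8 kernels added in this patch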
if (elemcount % packn == 0) dst_elempack = packn; #else