From fe578026e9dc6f3ebf2b9c0bbceb8f37b9ec4313 Mon Sep 17 00:00:00 2001 From: yxy Date: Mon, 12 Feb 2024 15:05:53 +0000 Subject: [PATCH 01/29] copy arm convolutiondepthwise to convolutiondepthwise_riscv.cpp --- .../riscv/convolutiondepthwise_riscv.cpp | 535 ++++++++++++++++++ 1 file changed, 535 insertions(+) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index d913fe7e1d59..5c4be90e8009 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -1153,4 +1153,539 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } #endif // __riscv_vector && __riscv_zfh +int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + // depth-wise + if (channels * elempack == group && group == num_output) + { + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __ARM_NEON + bool use_int8_requantize = int8_scale_term > 100; + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } +#endif + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // TODO use fp16 / bf16 + out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __ARM_NEON + if (elempack == 8) + { +#if NCNN_GNU_INLINE_ASM + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s1_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + + Mat scale_in_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s2_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + + Mat scale_in_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + else +#endif // NCNN_GNU_INLINE_ASM + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int32x4_t _sum0 = vdupq_n_s32(0); + int32x4_t _sum1 = vdupq_n_s32(0); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + int8x8_t _val = vld1_s8(sptr + space_ofs[k] * 8); + int8x8_t _w = vld1_s8(kptr + k * 8); + int16x8_t _s0 = vmull_s8(_val, 
_w); + _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + } + + float32x4_t _scale_in0; + float32x4_t _scale_in1; + { + float32x4_t _bottom_blob_int8_scales0 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8); + float32x4_t _bottom_blob_int8_scales1 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8 + 4); + float32x4_t _weight_data_int8_scales0 = vld1q_f32((const float*)weight_data_int8_scales + g * 8); + float32x4_t _weight_data_int8_scales1 = vld1q_f32((const float*)weight_data_int8_scales + g * 8 + 4); + _scale_in0 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); + _scale_in1 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); + + uint32x4_t _m0 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales0, vdupq_n_f32(0.f))); + uint32x4_t _m1 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales1, vdupq_n_f32(0.f))); + _scale_in0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in0), _m0)); + _scale_in1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in1), _m1)); + } + + float32x4_t _sumfp32_0 = vmulq_f32(vcvtq_f32_s32(_sum0), _scale_in0); + float32x4_t _sumfp32_1 = vmulq_f32(vcvtq_f32_s32(_sum1), _scale_in1); + + if (bias_term) + { + float32x4_t _bias0 = vld1q_f32((const float*)bias_data + g * 8); + float32x4_t _bias1 = vld1q_f32((const float*)bias_data + g * 8 + 4); + _sumfp32_0 = vaddq_f32(_sumfp32_0, _bias0); + _sumfp32_1 = vaddq_f32(_sumfp32_1, _bias1); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float32x4_t _scale_out0 = vld1q_f32((const float*)top_blob_int8_scales + g * 8); + float32x4_t _scale_out1 = vld1q_f32((const float*)top_blob_int8_scales + g * 8 + 4); + int8x8_t _sum8 = float2int8(vmulq_f32(_sumfp32_0, _scale_out0), vmulq_f32(_sumfp32_1, _scale_out1)); + vst1_s8(outptr_s8, _sum8); + outptr_s8 += 8; + } + else + { + // dequantize + vst1q_f32(outptr_f32, _sumfp32_0); + vst1q_f32(outptr_f32 + 4, _sumfp32_1); + outptr_f32 += 8; + } + } + } + } + } + } +#endif // __ARM_NEON + + if (elempack == 1) + { +#if NCNN_GNU_INLINE_ASM + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) + { + if (use_int8_requantize) + { + std::vector requantize_scales; + for (int g = 0; g < group; g++) + { + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float scale_out = top_blob_int8_scales[g]; + + requantize_scales.push_back(scale_in); + requantize_scales.push_back(scale_out); + } + + convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); + } + else + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + // convdw3x3s1_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); + + Mat scale_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / 
(bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_data[g] = scale_in; + } + + dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); + } + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) + { + if (use_int8_requantize) + { + std::vector requantize_scales; + for (int g = 0; g < group; g++) + { + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float scale_out = top_blob_int8_scales[g]; + + requantize_scales.push_back(scale_in); + requantize_scales.push_back(scale_out); + } + + convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); + } + else + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + // convdw3x3s2_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); + + Mat scale_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_data[g] = scale_in; + } + + dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); + } + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + else +#endif // NCNN_GNU_INLINE_ASM + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + if (use_int8_requantize) + 
out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __ARM_NEON + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; +#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } +#endif + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + int g_elempack = 1; + int out_g_elempack = 1; +#if __ARM_NEON + if (opt.use_packing_layout) + { + g_elempack = channels_g % 8 == 0 ? 8 : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; + else + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif // __ARM_NEON + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} + } // namespace ncnn From 97775a9e943b28eb7dfbffd8fd3a2e31de703872 Mon Sep 17 00:00:00 2001 From: yxy Date: Wed, 14 Feb 2024 13:11:25 +0000 Subject: [PATCH 02/29] finish convolutiondepthwise_3x3_pack8_int8 --- .../convolutiondepthwise_3x3_pack8_int8.h | 283 +++++++++ .../riscv/convolutiondepthwise_riscv.cpp | 589 ++++++++++-------- src/layer/riscv/convolutiondepthwise_riscv.h | 5 + 3 files changed, 619 insertions(+), 258 deletions(-) create mode 100644 src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h diff --git a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h new file mode 100644 index 000000000000..bd5624e3c2e4 --- /dev/null +++ b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h @@ -0,0 +1,283 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302 Limited. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void convdw3x3s1_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + int vl = csrr_vlenb() / 1; + + const int group = bottom_blob.c; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const signed char* k0 = kernel.row(g); + + int* outptr0 = out.row(0); + int* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const signed char* r0 = img0.row(0); + const signed char* r1 = img0.row(1); + const signed char* r2 = img0.row(2); + const signed char* r3 = img0.row(3); + + vl = 8; + vint8m1_t _k00 = vle8_v_i8m1(k0, vl); + vint8m1_t _k01 = vle8_v_i8m1(k0 + 8, vl); + vint8m1_t _k02 = vle8_v_i8m1(k0 + 16, vl); + vint8m1_t _k10 = vle8_v_i8m1(k0 + 24, vl); + vint8m1_t _k11 = vle8_v_i8m1(k0 + 32, vl); + vint8m1_t _k12 = vle8_v_i8m1(k0 + 40, vl); + vint8m1_t _k20 = vle8_v_i8m1(k0 + 48, vl); + vint8m1_t _k21 = vle8_v_i8m1(k0 + 56, vl); + vint8m1_t _k22 = vle8_v_i8m1(k0 + 64, vl); + + // int8x8_t _k00 = vld1_s8(k0); + // int8x8_t _k01 = vld1_s8(k0 + 8); + // int8x8_t _k02 = vld1_s8(k0 + 16); + // int8x8_t _k10 = vld1_s8(k0 + 24); + // int8x8_t _k11 = vld1_s8(k0 + 32); + // int8x8_t _k12 = vld1_s8(k0 + 40); + // int8x8_t _k20 = vld1_s8(k0 + 48); + // int8x8_t _k21 = vld1_s8(k0 + 56); + // int8x8_t _k22 = vld1_s8(k0 + 64); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _r00 = vle8_v_i8m1(r0, vl); + vint8m1_t _r01 = vle8_v_i8m1(r0 + 8, vl); + vint8m1_t _r02 = vle8_v_i8m1(r0 + 16, vl); + vint8m1_t _r10 = vle8_v_i8m1(r1, vl); + vint8m1_t _r11 = vle8_v_i8m1(r1 + 8, vl); + vint8m1_t _r12 = vle8_v_i8m1(r1 + 16, vl); + vint8m1_t _r20 = vle8_v_i8m1(r2, vl); + vint8m1_t _r21 = vle8_v_i8m1(r2 + 8, vl); + vint8m1_t _r22 = vle8_v_i8m1(r2 + 16, vl); + + // int8x8_t _r00 = vld1_s8(r0); + // int8x8_t _r01 = vld1_s8(r0 + 8); + // int8x8_t _r02 = vld1_s8(r0 + 16); + // int8x8_t _r10 = vld1_s8(r1); + // int8x8_t _r11 = vld1_s8(r1 + 8); + // int8x8_t _r12 = vld1_s8(r1 + 16); + // int8x8_t _r20 = vld1_s8(r2); + // int8x8_t _r21 = vld1_s8(r2 + 8); + // int8x8_t _r22 = vld1_s8(r2 + 16); + + vint16m2_t _s0 = vwmul_vv_i16m2(_r00, _k00, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_r01, _k01, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_r02, _k02, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_r10, _k10, vl); + + // int16x8_t _s0 = vmull_s8(_r00, _k00); + // int16x8_t _s1 = vmull_s8(_r01, _k01); + // int16x8_t _s2 = vmull_s8(_r02, _k02); + // int16x8_t _s3 = vmull_s8(_r10, _k10); + + _s0 = vwmacc_vv_i16m2(_s0, _r11, _k11, vl); + _s1 = vwmacc_vv_i16m2(_s1, _r12, _k12, vl); + _s2 = vwmacc_vv_i16m2(_s2, _r20, _k20, vl); + _s3 = vwmacc_vv_i16m2(_s3, _r21, _k21, vl); + + // _s0 = vmlal_s8(_s0, _r11, _k11); + // _s1 = vmlal_s8(_s1, _r12, _k12); + // _s2 = vmlal_s8(_s2, _r20, _k20); + // _s3 = vmlal_s8(_s3, _r21, _k21); + + vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); + // int16x8_t _s4 = vmull_s8(_r22, _k22); + + vint16m1_t _s0_m1 = vget_v_i16m2_i16m1(_s0, 0); + 
vint16m1_t _s1_m1 = vget_v_i16m2_i16m1(_s1, 0); + vint16m1_t _s2_m1 = vget_v_i16m2_i16m1(_s2, 0); + vint16m1_t _s3_m1 = vget_v_i16m2_i16m1(_s3, 0); + vint16m1_t _s4_m1 = vget_v_i16m2_i16m1(_s4, 0); + + vint32m2_t _sum = vwadd_vv_i32m2(_s0_m1, _s1_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s2_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s3_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s4_m1, vl); + + // int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1)); + // int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1)); + // int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3)); + // int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3)); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4)); + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + + vse32_v_i32m2(outptr0, _sum, vl); + // vst1q_s32(outptr0, _sum0); + // vst1q_s32(outptr0 + 4, _sum1); + r0 += 8; + r1 += 8; + r2 += 8; + outptr0 += 8; + } + + r0 += 2 * 8; + r1 += 2 * 8; + r2 += 2 * 8; + } + } +} + +static void convdw3x3s2_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + int vl = 8; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 8; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const signed char* k0 = kernel.row(g); + + int* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const signed char* r0 = img0.row(0); + const signed char* r1 = img0.row(1); + const signed char* r2 = img0.row(2); + + vint8m1_t _k00 = vle8_v_i8m1(k0, vl); + vint8m1_t _k01 = vle8_v_i8m1(k0 + 8, vl); + vint8m1_t _k02 = vle8_v_i8m1(k0 + 16, vl); + vint8m1_t _k10 = vle8_v_i8m1(k0 + 24, vl); + vint8m1_t _k11 = vle8_v_i8m1(k0 + 32, vl); + vint8m1_t _k12 = vle8_v_i8m1(k0 + 40, vl); + vint8m1_t _k20 = vle8_v_i8m1(k0 + 48, vl); + vint8m1_t _k21 = vle8_v_i8m1(k0 + 56, vl); + vint8m1_t _k22 = vle8_v_i8m1(k0 + 64, vl); + + // int8x8_t _k00 = vld1_s8(k0); + // int8x8_t _k01 = vld1_s8(k0 + 8); + // int8x8_t _k02 = vld1_s8(k0 + 16); + // int8x8_t _k10 = vld1_s8(k0 + 24); + // int8x8_t _k11 = vld1_s8(k0 + 32); + // int8x8_t _k12 = vld1_s8(k0 + 40); + // int8x8_t _k20 = vld1_s8(k0 + 48); + // int8x8_t _k21 = vld1_s8(k0 + 56); + // int8x8_t _k22 = vld1_s8(k0 + 64); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _r00 = vle8_v_i8m1(r0, vl); + vint8m1_t _r01 = vle8_v_i8m1(r0 + 8, vl); + vint8m1_t _r02 = vle8_v_i8m1(r0 + 16, vl); + vint8m1_t _r10 = vle8_v_i8m1(r1, vl); + vint8m1_t _r11 = vle8_v_i8m1(r1 + 8, vl); + vint8m1_t _r12 = vle8_v_i8m1(r1 + 16, vl); + vint8m1_t _r20 = vle8_v_i8m1(r2, vl); + vint8m1_t _r21 = vle8_v_i8m1(r2 + 8, vl); + vint8m1_t _r22 = vle8_v_i8m1(r2 + 16, vl); + + // int8x8_t _r00 = vld1_s8(r0); + // int8x8_t _r01 = vld1_s8(r0 + 8); + // int8x8_t _r02 = vld1_s8(r0 + 16); + // int8x8_t _r10 = vld1_s8(r1); + // int8x8_t _r11 = vld1_s8(r1 + 8); + // int8x8_t _r12 = vld1_s8(r1 + 16); + // int8x8_t _r20 = vld1_s8(r2); + // int8x8_t _r21 = vld1_s8(r2 + 8); + // int8x8_t _r22 = vld1_s8(r2 + 16); + + vint16m2_t _s0 = vwmul_vv_i16m2(_r00, _k00, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_r01, _k01, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_r02, _k02, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_r10, _k10, vl); + + _s0 = vwmacc_vv_i16m2(_s0, _r11, _k11, vl); + _s1 = 
vwmacc_vv_i16m2(_s1, _r12, _k12, vl); + _s2 = vwmacc_vv_i16m2(_s2, _r20, _k20, vl); + _s3 = vwmacc_vv_i16m2(_s3, _r21, _k21, vl); + + vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); + + + + // int16x8_t _s0 = vmull_s8(_r00, _k00); + // int16x8_t _s1 = vmull_s8(_r01, _k01); + // int16x8_t _s2 = vmull_s8(_r02, _k02); + // int16x8_t _s3 = vmull_s8(_r10, _k10); + // _s0 = vmlal_s8(_s0, _r11, _k11); + // _s1 = vmlal_s8(_s1, _r12, _k12); + // _s2 = vmlal_s8(_s2, _r20, _k20); + // _s3 = vmlal_s8(_s3, _r21, _k21); + // int16x8_t _s4 = vmull_s8(_r22, _k22); + + vint16m1_t _s0_m1 = vget_v_i16m2_i16m1(_s0, 0); + vint16m1_t _s1_m1 = vget_v_i16m2_i16m1(_s1, 0); + vint16m1_t _s2_m1 = vget_v_i16m2_i16m1(_s2, 0); + vint16m1_t _s3_m1 = vget_v_i16m2_i16m1(_s3, 0); + vint16m1_t _s4_m1 = vget_v_i16m2_i16m1(_s4, 0); + + vint32m2_t _sum = vwadd_vv_i32m2(_s0_m1, _s1_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s2_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s3_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s4_m1, vl); + + vse32_v_i32m2(outptr0, _sum, vl); + + // int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1)); + // int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1)); + // int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3)); + // int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3)); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4)); + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + + // vst1q_s32(outptr0, _sum0); + // vst1q_s32(outptr0 + 4, _sum1); + + r0 += 16; + r1 += 16; + r2 += 16; + outptr0 += 8; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 5c4be90e8009..4a29374a56e1 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -28,6 +28,10 @@ namespace ncnn { #include "convolutiondepthwise_3x3.h" +#if NCNN_INT8 +#include "convolutiondepthwise_3x3_pack8_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolutiondepthwise_3x3_packn.h" #include "convolutiondepthwise_5x5_packn.h" @@ -61,7 +65,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -238,27 +242,29 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + fprintf(stderr, "ConvolutionDepthWise_riscv::forward int8 scale is called\n"); + return forward_int8(bottom_blob, top_blob, opt); + // Mat bottom_blob_unpacked = bottom_blob; + // if (bottom_blob.elempack != 1) + // { + // Option opt_pack1 = opt; 
+ // opt_pack1.blob_allocator = opt.workspace_allocator; + + // convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + // } + + // Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; + // if (bottom_blob_unpacked.elembits() == 16) + // { + // Option opt_pack1 = opt; + // opt_pack1.blob_allocator = opt.workspace_allocator; + + // cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); + // } + + // Option opt_unpacked = opt; + // opt_unpacked.use_packing_layout = false; + // return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif @@ -1153,8 +1159,52 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int ConvolutionDepthWise_riscv::create_pipeline_int8(const Option& opt) +{ + int vl; + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels == group && group == num_output) + { + int elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + elempack = channels % 8 == 0 ? 8 : 1; + } +#endif // __riscv_vector + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 8, opt); + } + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + weight_data.release(); + + return 0; + } + + // group convolution + create_group_ops(opt); + + weight_data.release(); + + return 0; +} + + int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { + int vl; int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; @@ -1205,12 +1255,12 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl if (channels * elempack == group && group == num_output) { int out_elempack = 1; -#if __ARM_NEON +#if __riscv_vector if (opt.use_packing_layout) { out_elempack = num_output % 8 == 0 ? 8 : 1; } -#endif // __ARM_NEON +#endif // __riscv_vector bool use_int8_requantize = int8_scale_term > 100; size_t out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; #if NCNN_ARM82 @@ -1232,10 +1282,9 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl if (top_blob.empty()) return -100; -#if __ARM_NEON +#if __riscv_vector if (elempack == 8) { -#if NCNN_GNU_INLINE_ASM if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) { Mat top_blob_int32; @@ -1243,7 +1292,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl if (top_blob_int32.empty()) return -100; - convdw3x3s1_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + convdw3x3s1_pack8_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); Mat scale_in_data(group); for (int g = 0; g < group; g++) @@ -1279,7 +1328,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl if (top_blob_int32.empty()) return -100; - convdw3x3s2_pack8_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + convdw3x3s2_pack8_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); Mat scale_in_data(group); for (int g = 0; g < group; g++) @@ -1309,7 +1358,6 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } } else -#endif // NCNN_GNU_INLINE_ASM { const int maxk = kernel_w * kernel_h; @@ -1344,64 +1392,88 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl { for (int j = 0; j < outw; j++) { - int32x4_t _sum0 = vdupq_n_s32(0); - int32x4_t _sum1 = vdupq_n_s32(0); + vl = 8; + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; for (int k = 0; k < maxk; k++) { - int8x8_t _val = vld1_s8(sptr + space_ofs[k] * 8); - int8x8_t _w = vld1_s8(kptr + k * 8); - int16x8_t _s0 = vmull_s8(_val, _w); - _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + vint8m1_t _val = vle8_v_i8m1(sptr + space_ofs[k] * 8, vl); + vint8m1_t _w = vle8_v_i8m1(kptr + k * 8, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + + // int8x8_t _val = vld1_s8(sptr + space_ofs[k] * 8); + // int8x8_t _w = vld1_s8(kptr + k * 8); + // int16x8_t _s0 = vmull_s8(_val, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); } - float32x4_t _scale_in0; - float32x4_t _scale_in1; + // float32x4_t _scale_in0; + // float32x4_t _scale_in1; + vfloat32m2_t _scale_in; { - float32x4_t _bottom_blob_int8_scales0 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8); - float32x4_t _bottom_blob_int8_scales1 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8 + 4); - float32x4_t _weight_data_int8_scales0 = vld1q_f32((const float*)weight_data_int8_scales + g * 8); - float32x4_t _weight_data_int8_scales1 = vld1q_f32((const float*)weight_data_int8_scales + g * 8 + 4); - _scale_in0 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); - _scale_in1 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); - - uint32x4_t _m0 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales0, vdupq_n_f32(0.f))); - uint32x4_t _m1 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales1, vdupq_n_f32(0.f))); - _scale_in0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in0), _m0)); - 
_scale_in1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in1), _m1)); + vfloat32m2_t _bottom_blob_int8_scales = vle32_v_f32m2((const float*)bottom_blob_int8_scales + g * 8, vl); + vfloat32m2_t _weight_data_int8_scales = vle32_v_f32m2((const float*)weight_data_int8_scales + g * 8, vl); + + + // float32x4_t _bottom_blob_int8_scales0 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8); + // float32x4_t _bottom_blob_int8_scales1 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8 + 4); + // float32x4_t _weight_data_int8_scales0 = vld1q_f32((const float*)weight_data_int8_scales + g * 8); + // float32x4_t _weight_data_int8_scales1 = vld1q_f32((const float*)weight_data_int8_scales + g * 8 + 4); + _scale_in = vfdiv_vv_f32m2(vfmv_v_f_f32m2(1.f, vl), vfmul_vv_f32m2(_bottom_blob_int8_scales, _weight_data_int8_scales, vl), vl); + // _scale_in0 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); + // _scale_in1 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); + vbool16_t _is_zero = vmfeq_vv_f32m2_b16(_bottom_blob_int8_scales, vfmv_v_f_f32m2(0.f, vl), vl); + _scale_in = vfsub_vv_f32m2_m(_is_zero, _scale_in, _scale_in, _scale_in, vl); + // uint32x4_t _m0 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales0, vdupq_n_f32(0.f))); + // uint32x4_t _m1 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales1, vdupq_n_f32(0.f))); + // _scale_in0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in0), _m0)); + // _scale_in1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in1), _m1)); } - float32x4_t _sumfp32_0 = vmulq_f32(vcvtq_f32_s32(_sum0), _scale_in0); - float32x4_t _sumfp32_1 = vmulq_f32(vcvtq_f32_s32(_sum1), _scale_in1); + vfloat32m2_t _sumfp32 = vfmul_vv_f32m2(vfcvt_f_x_v_f32m2(_sum0, vl), _scale_in, vl); + + // float32x4_t _sumfp32_0 = vmulq_f32(vcvtq_f32_s32(_sum0), _scale_in0); + // float32x4_t _sumfp32_1 = vmulq_f32(vcvtq_f32_s32(_sum1), _scale_in1); if (bias_term) { - float32x4_t _bias0 = vld1q_f32((const float*)bias_data + g * 8); - float32x4_t _bias1 = vld1q_f32((const float*)bias_data + g * 8 + 4); - _sumfp32_0 = vaddq_f32(_sumfp32_0, _bias0); - _sumfp32_1 = vaddq_f32(_sumfp32_1, _bias1); + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + g * 8, vl); + _sumfp32 = vfadd_vv_f32m2(_sumfp32, _bias, vl); + // float32x4_t _bias0 = vld1q_f32((const float*)bias_data + g * 8); + // float32x4_t _bias1 = vld1q_f32((const float*)bias_data + g * 8 + 4); + // _sumfp32_0 = vaddq_f32(_sumfp32_0, _bias0); + // _sumfp32_1 = vaddq_f32(_sumfp32_1, _bias1); } + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); - _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); - _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); + // _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); + // _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); if (use_int8_requantize) { // requantize - float32x4_t _scale_out0 = vld1q_f32((const float*)top_blob_int8_scales + g * 8); - float32x4_t _scale_out1 = vld1q_f32((const float*)top_blob_int8_scales + g * 8 + 4); - int8x8_t _sum8 = float2int8(vmulq_f32(_sumfp32_0, _scale_out0), vmulq_f32(_sumfp32_1, _scale_out1)); - vst1_s8(outptr_s8, _sum8); + vfloat32m2_t _scale_out = vle32_v_f32m2((const float*)top_blob_int8_scales + g * 8, vl); + vfloat32m2_t _res = vfmul_vv_f32m2(_sumfp32, _scale_out, vl); + int64_t _sum8 = 
float2int8(vget_v_f32m2_f32m1(_res, 0), vget_v_f32m2_f32m1(_res, 1)); + *(int64_t*)outptr_s8 = _sum8; + // float32x4_t _scale_out0 = vld1q_f32((const float*)top_blob_int8_scales + g * 8); + // float32x4_t _scale_out1 = vld1q_f32((const float*)top_blob_int8_scales + g * 8 + 4); + // int8x8_t _sum8 = float2int8(vmulq_f32(_sumfp32_0, _scale_out0), vmulq_f32(_sumfp32_1, _scale_out1)); + // vst1_s8(outptr_s8, _sum8); outptr_s8 += 8; } else { // dequantize - vst1q_f32(outptr_f32, _sumfp32_0); - vst1q_f32(outptr_f32 + 4, _sumfp32_1); + vse32_v_f32m2(outptr_f32, _sumfp32, vl); + // vst1q_f32(outptr_f32, _sumfp32_0); + // vst1q_f32(outptr_f32 + 4, _sumfp32_1); outptr_f32 += 8; } } @@ -1409,201 +1481,201 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } } } -#endif // __ARM_NEON - - if (elempack == 1) - { -#if NCNN_GNU_INLINE_ASM - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) - { - if (use_int8_requantize) - { - std::vector requantize_scales; - for (int g = 0; g < group; g++) - { - float scale_in; - if (weight_data_int8_scales[g] == 0) - scale_in = 0; - else - scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - float scale_out = top_blob_int8_scales[g]; - - requantize_scales.push_back(scale_in); - requantize_scales.push_back(scale_out); - } - - convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); - } - else - { - Mat top_blob_int32; - top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); - if (top_blob_int32.empty()) - return -100; - - convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); - // convdw3x3s1_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); - - Mat scale_data(group); - for (int g = 0; g < group; g++) - { - // dequantize - float scale_in; - if (weight_data_int8_scales[g] == 0) - scale_in = 0; - else - scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - scale_data[g] = scale_in; - } - - dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); - } - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) - { - if (use_int8_requantize) - { - std::vector requantize_scales; - for (int g = 0; g < group; g++) - { - float scale_in; - if (weight_data_int8_scales[g] == 0) - scale_in = 0; - else - scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - float scale_out = top_blob_int8_scales[g]; - - requantize_scales.push_back(scale_in); - requantize_scales.push_back(scale_out); - } - - convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); - } - else - { - Mat top_blob_int32; - top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); - if (top_blob_int32.empty()) - return -100; - - convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); - // convdw3x3s2_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); - - Mat scale_data(group); - for (int g = 0; g < group; g++) - { - // dequantize - float scale_in; - if (weight_data_int8_scales[g] 
== 0) - scale_in = 0; - else - scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - scale_data[g] = scale_in; - } - - dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); - } - - if (activation) - { - activation->forward_inplace(top_blob, opt); - } - } - else -#endif // NCNN_GNU_INLINE_ASM - { - const int maxk = kernel_w * kernel_h; - - // kernel offsets - std::vector _space_ofs(maxk); - int* space_ofs = &_space_ofs[0]; - { - int p1 = 0; - int p2 = 0; - int gap = w * dilation_h - kernel_w * dilation_w; - for (int i = 0; i < kernel_h; i++) - { - for (int j = 0; j < kernel_w; j++) - { - space_ofs[p1] = p2; - p1++; - p2 += dilation_w; - } - p2 += gap; - } - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int g = 0; g < group; g++) - { - signed char* outptr_s8 = top_blob.channel(g); - float* outptr_f32 = top_blob.channel(g); - const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; - const Mat m = bottom_blob_bordered.channel(g); - - for (int i = 0; i < outh; i++) - { - for (int j = 0; j < outw; j++) - { - int sum = 0; - - const signed char* sptr = m.row(i * stride_h) + j * stride_w; - - for (int k = 0; k < maxk; k++) - { - signed char val = sptr[space_ofs[k]]; - signed char w = kptr[k]; - sum += val * w; - } - - float scale_in; - if (weight_data_int8_scales[g] == 0) - scale_in = 0; - else - scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - float sumfp32 = sum * scale_in; - - if (bias_term) - sumfp32 += bias_data[g]; - - sumfp32 = activation_ss(sumfp32, activation_type, activation_params); +#endif // __riscv_vector - if (use_int8_requantize) - { - // requantize - float scale_out = top_blob_int8_scales[g]; - signed char sums8 = float2int8(sumfp32 * scale_out); - outptr_s8[0] = sums8; - outptr_s8 += 1; - } - else - { - // dequantize - outptr_f32[0] = sumfp32; - outptr_f32 += 1; - } - } - } - } - } - } +// if (elempack == 1) +// { +// #if NCNN_GNU_INLINE_ASM +// if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) +// { +// if (use_int8_requantize) +// { +// std::vector requantize_scales; +// for (int g = 0; g < group; g++) +// { +// float scale_in; +// if (weight_data_int8_scales[g] == 0) +// scale_in = 0; +// else +// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + +// float scale_out = top_blob_int8_scales[g]; + +// requantize_scales.push_back(scale_in); +// requantize_scales.push_back(scale_out); +// } + +// convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); +// } +// else +// { +// Mat top_blob_int32; +// top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); +// if (top_blob_int32.empty()) +// return -100; + +// convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); +// // convdw3x3s1_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); + +// Mat scale_data(group); +// for (int g = 0; g < group; g++) +// { +// // dequantize +// float scale_in; +// if (weight_data_int8_scales[g] == 0) +// scale_in = 0; +// else +// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + +// scale_data[g] = scale_in; +// } + +// dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); +// } + +// if (activation) +// { +// 
activation->forward_inplace(top_blob, opt); +// } +// } +// else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) +// { +// if (use_int8_requantize) +// { +// std::vector requantize_scales; +// for (int g = 0; g < group; g++) +// { +// float scale_in; +// if (weight_data_int8_scales[g] == 0) +// scale_in = 0; +// else +// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + +// float scale_out = top_blob_int8_scales[g]; + +// requantize_scales.push_back(scale_in); +// requantize_scales.push_back(scale_out); +// } + +// convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); +// } +// else +// { +// Mat top_blob_int32; +// top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); +// if (top_blob_int32.empty()) +// return -100; + +// convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); +// // convdw3x3s2_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); + +// Mat scale_data(group); +// for (int g = 0; g < group; g++) +// { +// // dequantize +// float scale_in; +// if (weight_data_int8_scales[g] == 0) +// scale_in = 0; +// else +// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + +// scale_data[g] = scale_in; +// } + +// dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); +// } + +// if (activation) +// { +// activation->forward_inplace(top_blob, opt); +// } +// } +// else +// #endif // NCNN_GNU_INLINE_ASM +// { +// const int maxk = kernel_w * kernel_h; + +// // kernel offsets +// std::vector _space_ofs(maxk); +// int* space_ofs = &_space_ofs[0]; +// { +// int p1 = 0; +// int p2 = 0; +// int gap = w * dilation_h - kernel_w * dilation_w; +// for (int i = 0; i < kernel_h; i++) +// { +// for (int j = 0; j < kernel_w; j++) +// { +// space_ofs[p1] = p2; +// p1++; +// p2 += dilation_w; +// } +// p2 += gap; +// } +// } + +// #pragma omp parallel for num_threads(opt.num_threads) +// for (int g = 0; g < group; g++) +// { +// signed char* outptr_s8 = top_blob.channel(g); +// float* outptr_f32 = top_blob.channel(g); +// const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; +// const Mat m = bottom_blob_bordered.channel(g); + +// for (int i = 0; i < outh; i++) +// { +// for (int j = 0; j < outw; j++) +// { +// int sum = 0; + +// const signed char* sptr = m.row(i * stride_h) + j * stride_w; + +// for (int k = 0; k < maxk; k++) +// { +// signed char val = sptr[space_ofs[k]]; +// signed char w = kptr[k]; +// sum += val * w; +// } + +// float scale_in; +// if (weight_data_int8_scales[g] == 0) +// scale_in = 0; +// else +// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + +// float sumfp32 = sum * scale_in; + +// if (bias_term) +// sumfp32 += bias_data[g]; + +// sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + +// if (use_int8_requantize) +// { +// // requantize +// float scale_out = top_blob_int8_scales[g]; +// signed char sums8 = float2int8(sumfp32 * scale_out); +// outptr_s8[0] = sums8; +// outptr_s8 += 1; +// } +// else +// { +// // dequantize +// outptr_f32[0] = sumfp32; +// outptr_f32 += 1; +// } +// } +// } +// } +// } +// } return 0; } bool use_int8_requantize = int8_scale_term > 100; int out_elempack = 1; -#if __ARM_NEON +#if __riscv_vector if (opt.use_packing_layout) { if 
(use_int8_requantize) @@ -1611,16 +1683,16 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl else out_elempack = num_output % 4 == 0 ? 4 : 1; } -#endif // __ARM_NEON +#endif // __riscv_vector size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; -#if NCNN_ARM82 +// #if NCNN_ARM82 if (support_fp16_storage && opt.use_fp16_storage) { out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; } -#endif - if (opt.use_bf16_storage) - out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; +// #endif + // if (opt.use_bf16_storage) + // out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) @@ -1632,7 +1704,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl int g_elempack = 1; int out_g_elempack = 1; -#if __ARM_NEON +#if __riscv_vector if (opt.use_packing_layout) { g_elempack = channels_g % 8 == 0 ? 8 : 1; @@ -1641,7 +1713,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl else out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; } -#endif // __ARM_NEON +#endif // __riscv_vector // unpacking Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; @@ -1687,5 +1759,6 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl return 0; } +#endif // NCNN_INT8 } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index f9503975296d..98b1884d298b 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -39,6 +39,11 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* activation; std::vector group_ops; From 9e7e01d284bf1e879107e102ef09b21b064b9426 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Wed, 14 Feb 2024 13:12:55 +0000 Subject: [PATCH 03/29] apply code-format changes --- .../convolutiondepthwise_3x3_pack8_int8.h | 6 +- .../riscv/convolutiondepthwise_riscv.cpp | 378 +++++++++--------- 2 files changed, 190 insertions(+), 194 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h index bd5624e3c2e4..3c2f23f20f00 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h @@ -107,10 +107,10 @@ static void convdw3x3s1_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, co // _s1 = vmlal_s8(_s1, _r12, _k12); // _s2 = vmlal_s8(_s2, _r20, _k20); // _s3 = vmlal_s8(_s3, _r21, _k21); - + vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); // int16x8_t _s4 = vmull_s8(_r22, _k22); - + vint16m1_t _s0_m1 = vget_v_i16m2_i16m1(_s0, 0); vint16m1_t _s1_m1 = vget_v_i16m2_i16m1(_s1, 0); vint16m1_t _s2_m1 = vget_v_i16m2_i16m1(_s2, 0); @@ -232,8 +232,6 @@ static void convdw3x3s2_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, co vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); - - // int16x8_t _s0 = vmull_s8(_r00, _k00); // int16x8_t _s1 = vmull_s8(_r01, _k01); // int16x8_t _s2 = vmull_s8(_r02, _k02); diff --git 
a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 4a29374a56e1..82af129fd6f6 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -1201,7 +1201,6 @@ int ConvolutionDepthWise_riscv::create_pipeline_int8(const Option& opt) return 0; } - int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { int vl; @@ -1420,7 +1419,6 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl vfloat32m2_t _bottom_blob_int8_scales = vle32_v_f32m2((const float*)bottom_blob_int8_scales + g * 8, vl); vfloat32m2_t _weight_data_int8_scales = vle32_v_f32m2((const float*)weight_data_int8_scales + g * 8, vl); - // float32x4_t _bottom_blob_int8_scales0 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8); // float32x4_t _bottom_blob_int8_scales1 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8 + 4); // float32x4_t _weight_data_int8_scales0 = vld1q_f32((const float*)weight_data_int8_scales + g * 8); @@ -1483,192 +1481,192 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } #endif // __riscv_vector -// if (elempack == 1) -// { -// #if NCNN_GNU_INLINE_ASM -// if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) -// { -// if (use_int8_requantize) -// { -// std::vector requantize_scales; -// for (int g = 0; g < group; g++) -// { -// float scale_in; -// if (weight_data_int8_scales[g] == 0) -// scale_in = 0; -// else -// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - -// float scale_out = top_blob_int8_scales[g]; - -// requantize_scales.push_back(scale_in); -// requantize_scales.push_back(scale_out); -// } - -// convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); -// } -// else -// { -// Mat top_blob_int32; -// top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); -// if (top_blob_int32.empty()) -// return -100; - -// convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); -// // convdw3x3s1_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); - -// Mat scale_data(group); -// for (int g = 0; g < group; g++) -// { -// // dequantize -// float scale_in; -// if (weight_data_int8_scales[g] == 0) -// scale_in = 0; -// else -// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - -// scale_data[g] = scale_in; -// } - -// dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); -// } - -// if (activation) -// { -// activation->forward_inplace(top_blob, opt); -// } -// } -// else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) -// { -// if (use_int8_requantize) -// { -// std::vector requantize_scales; -// for (int g = 0; g < group; g++) -// { -// float scale_in; -// if (weight_data_int8_scales[g] == 0) -// scale_in = 0; -// else -// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - -// float scale_out = top_blob_int8_scales[g]; - -// requantize_scales.push_back(scale_in); -// requantize_scales.push_back(scale_out); -// } - -// convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, 
bias_data, requantize_scales, opt); -// } -// else -// { -// Mat top_blob_int32; -// top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); -// if (top_blob_int32.empty()) -// return -100; - -// convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); -// // convdw3x3s2_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); - -// Mat scale_data(group); -// for (int g = 0; g < group; g++) -// { -// // dequantize -// float scale_in; -// if (weight_data_int8_scales[g] == 0) -// scale_in = 0; -// else -// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - -// scale_data[g] = scale_in; -// } - -// dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); -// } - -// if (activation) -// { -// activation->forward_inplace(top_blob, opt); -// } -// } -// else -// #endif // NCNN_GNU_INLINE_ASM -// { -// const int maxk = kernel_w * kernel_h; - -// // kernel offsets -// std::vector _space_ofs(maxk); -// int* space_ofs = &_space_ofs[0]; -// { -// int p1 = 0; -// int p2 = 0; -// int gap = w * dilation_h - kernel_w * dilation_w; -// for (int i = 0; i < kernel_h; i++) -// { -// for (int j = 0; j < kernel_w; j++) -// { -// space_ofs[p1] = p2; -// p1++; -// p2 += dilation_w; -// } -// p2 += gap; -// } -// } - -// #pragma omp parallel for num_threads(opt.num_threads) -// for (int g = 0; g < group; g++) -// { -// signed char* outptr_s8 = top_blob.channel(g); -// float* outptr_f32 = top_blob.channel(g); -// const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; -// const Mat m = bottom_blob_bordered.channel(g); - -// for (int i = 0; i < outh; i++) -// { -// for (int j = 0; j < outw; j++) -// { -// int sum = 0; - -// const signed char* sptr = m.row(i * stride_h) + j * stride_w; - -// for (int k = 0; k < maxk; k++) -// { -// signed char val = sptr[space_ofs[k]]; -// signed char w = kptr[k]; -// sum += val * w; -// } - -// float scale_in; -// if (weight_data_int8_scales[g] == 0) -// scale_in = 0; -// else -// scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - -// float sumfp32 = sum * scale_in; - -// if (bias_term) -// sumfp32 += bias_data[g]; - -// sumfp32 = activation_ss(sumfp32, activation_type, activation_params); - -// if (use_int8_requantize) -// { -// // requantize -// float scale_out = top_blob_int8_scales[g]; -// signed char sums8 = float2int8(sumfp32 * scale_out); -// outptr_s8[0] = sums8; -// outptr_s8 += 1; -// } -// else -// { -// // dequantize -// outptr_f32[0] = sumfp32; -// outptr_f32 += 1; -// } -// } -// } -// } -// } -// } + // if (elempack == 1) + // { + // #if NCNN_GNU_INLINE_ASM + // if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) + // { + // if (use_int8_requantize) + // { + // std::vector requantize_scales; + // for (int g = 0; g < group; g++) + // { + // float scale_in; + // if (weight_data_int8_scales[g] == 0) + // scale_in = 0; + // else + // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + // float scale_out = top_blob_int8_scales[g]; + + // requantize_scales.push_back(scale_in); + // requantize_scales.push_back(scale_out); + // } + + // convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); + // } + // else + // { + // Mat top_blob_int32; + // top_blob_int32.create(outw, outh, num_output, 
(size_t)4u, opt.workspace_allocator); + // if (top_blob_int32.empty()) + // return -100; + + // convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + // // convdw3x3s1_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); + + // Mat scale_data(group); + // for (int g = 0; g < group; g++) + // { + // // dequantize + // float scale_in; + // if (weight_data_int8_scales[g] == 0) + // scale_in = 0; + // else + // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + // scale_data[g] = scale_in; + // } + + // dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); + // } + + // if (activation) + // { + // activation->forward_inplace(top_blob, opt); + // } + // } + // else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) + // { + // if (use_int8_requantize) + // { + // std::vector requantize_scales; + // for (int g = 0; g < group; g++) + // { + // float scale_in; + // if (weight_data_int8_scales[g] == 0) + // scale_in = 0; + // else + // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + // float scale_out = top_blob_int8_scales[g]; + + // requantize_scales.push_back(scale_in); + // requantize_scales.push_back(scale_out); + // } + + // convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); + // } + // else + // { + // Mat top_blob_int32; + // top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); + // if (top_blob_int32.empty()) + // return -100; + + // convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + // // convdw3x3s2_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); + + // Mat scale_data(group); + // for (int g = 0; g < group; g++) + // { + // // dequantize + // float scale_in; + // if (weight_data_int8_scales[g] == 0) + // scale_in = 0; + // else + // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + // scale_data[g] = scale_in; + // } + + // dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); + // } + + // if (activation) + // { + // activation->forward_inplace(top_blob, opt); + // } + // } + // else + // #endif // NCNN_GNU_INLINE_ASM + // { + // const int maxk = kernel_w * kernel_h; + + // // kernel offsets + // std::vector _space_ofs(maxk); + // int* space_ofs = &_space_ofs[0]; + // { + // int p1 = 0; + // int p2 = 0; + // int gap = w * dilation_h - kernel_w * dilation_w; + // for (int i = 0; i < kernel_h; i++) + // { + // for (int j = 0; j < kernel_w; j++) + // { + // space_ofs[p1] = p2; + // p1++; + // p2 += dilation_w; + // } + // p2 += gap; + // } + // } + + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int g = 0; g < group; g++) + // { + // signed char* outptr_s8 = top_blob.channel(g); + // float* outptr_f32 = top_blob.channel(g); + // const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + // const Mat m = bottom_blob_bordered.channel(g); + + // for (int i = 0; i < outh; i++) + // { + // for (int j = 0; j < outw; j++) + // { + // int sum = 0; + + // const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + // for (int k = 0; k < maxk; k++) + // { + // signed char val = sptr[space_ofs[k]]; + // signed char w = kptr[k]; + // sum += val * w; + 
// } + + // float scale_in; + // if (weight_data_int8_scales[g] == 0) + // scale_in = 0; + // else + // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + // float sumfp32 = sum * scale_in; + + // if (bias_term) + // sumfp32 += bias_data[g]; + + // sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + // if (use_int8_requantize) + // { + // // requantize + // float scale_out = top_blob_int8_scales[g]; + // signed char sums8 = float2int8(sumfp32 * scale_out); + // outptr_s8[0] = sums8; + // outptr_s8 += 1; + // } + // else + // { + // // dequantize + // outptr_f32[0] = sumfp32; + // outptr_f32 += 1; + // } + // } + // } + // } + // } + // } return 0; } @@ -1685,12 +1683,12 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } #endif // __riscv_vector size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; -// #if NCNN_ARM82 + // #if NCNN_ARM82 if (support_fp16_storage && opt.use_fp16_storage) { out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; } -// #endif + // #endif // if (opt.use_bf16_storage) // out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; From ca8b112e79cd41fe7bcb817a1a7fe1d18705b584 Mon Sep 17 00:00:00 2001 From: yxy Date: Wed, 14 Feb 2024 13:27:58 +0000 Subject: [PATCH 04/29] delete comment --- .../riscv/convolutiondepthwise_riscv.cpp | 21 ------------------- 1 file changed, 21 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 82af129fd6f6..9f6e35a6c4a9 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -244,27 +244,6 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c { fprintf(stderr, "ConvolutionDepthWise_riscv::forward int8 scale is called\n"); return forward_int8(bottom_blob, top_blob, opt); - // Mat bottom_blob_unpacked = bottom_blob; - // if (bottom_blob.elempack != 1) - // { - // Option opt_pack1 = opt; - // opt_pack1.blob_allocator = opt.workspace_allocator; - - // convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - // } - - // Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - // if (bottom_blob_unpacked.elembits() == 16) - // { - // Option opt_pack1 = opt; - // opt_pack1.blob_allocator = opt.workspace_allocator; - - // cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - // } - - // Option opt_unpacked = opt; - // opt_unpacked.use_packing_layout = false; - // return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif From 9edfd27ff5966e8d17d4e758811ef7f08262a9f9 Mon Sep 17 00:00:00 2001 From: yxy Date: Thu, 15 Feb 2024 02:55:02 +0000 Subject: [PATCH 05/29] pack8 maybe right --- .../riscv/convolutiondepthwise_riscv.cpp | 298 ++++++------------ 1 file changed, 99 insertions(+), 199 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 9f6e35a6c4a9..ff66d8da89fd 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -361,7 +361,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { float* outptr = top_blob.channel(g); @@ -443,7 
+443,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { float* outptr = top_blob.channel(g); @@ -738,7 +738,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); @@ -800,7 +800,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { __fp16* outptr = top_blob.channel(g); @@ -985,7 +985,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); @@ -1047,7 +1047,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { __fp16* outptr = top_blob.channel(g); @@ -1229,8 +1229,10 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl int outw = (w - kernel_extent_w) / stride_w + 1; int outh = (h - kernel_extent_h) / stride_h + 1; + fprintf(stderr, "bottom_blob_bordered %d %d %d %d %zu\n", bottom_blob_bordered.w, bottom_blob_bordered.h, bottom_blob_bordered.c, bottom_blob_bordered.elempack, bottom_blob_bordered.elemsize); + // depth-wise - if (channels * elempack == group && group == num_output) + if (channels * elempack == group && group == num_output) // depth-wise conv, per-channel convolution { int out_elempack = 1; #if __riscv_vector @@ -1240,13 +1242,14 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } #endif // __riscv_vector bool use_int8_requantize = int8_scale_term > 100; + fprintf(stderr, "In 1246 use_int8_requantize = %d\n", use_int8_requantize); size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; -#if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage) { out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; } -#endif + if (opt.use_bf16_storage) out_elemsize = use_int8_requantize ?
1u * out_elempack : 2u * out_elempack; @@ -1358,7 +1361,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { signed char* outptr_s8 = top_blob.channel(g); @@ -1460,192 +1463,85 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } #endif // __riscv_vector - // if (elempack == 1) - // { - // #if NCNN_GNU_INLINE_ASM - // if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) - // { - // if (use_int8_requantize) - // { - // std::vector requantize_scales; - // for (int g = 0; g < group; g++) - // { - // float scale_in; - // if (weight_data_int8_scales[g] == 0) - // scale_in = 0; - // else - // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - // float scale_out = top_blob_int8_scales[g]; - - // requantize_scales.push_back(scale_in); - // requantize_scales.push_back(scale_out); - // } - - // convdw3x3s1_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); - // } - // else - // { - // Mat top_blob_int32; - // top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); - // if (top_blob_int32.empty()) - // return -100; - - // convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); - // // convdw3x3s1_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); - - // Mat scale_data(group); - // for (int g = 0; g < group; g++) - // { - // // dequantize - // float scale_in; - // if (weight_data_int8_scales[g] == 0) - // scale_in = 0; - // else - // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - // scale_data[g] = scale_in; - // } - - // dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); - // } - - // if (activation) - // { - // activation->forward_inplace(top_blob, opt); - // } - // } - // else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) - // { - // if (use_int8_requantize) - // { - // std::vector requantize_scales; - // for (int g = 0; g < group; g++) - // { - // float scale_in; - // if (weight_data_int8_scales[g] == 0) - // scale_in = 0; - // else - // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - // float scale_out = top_blob_int8_scales[g]; - - // requantize_scales.push_back(scale_in); - // requantize_scales.push_back(scale_out); - // } - - // convdw3x3s2_int8_requant_neon(bottom_blob_bordered, top_blob, weight_data_tm, bias_data, requantize_scales, opt); - // } - // else - // { - // Mat top_blob_int32; - // top_blob_int32.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); - // if (top_blob_int32.empty()) - // return -100; - - // convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); - // // convdw3x3s2_int8_dequant_neon(bottom_blob_bordered, top_blob_int32, weight_data_tm, bias_data, dequantize_scales, opt); - - // Mat scale_data(group); - // for (int g = 0; g < group; g++) - // { - // // dequantize - // float scale_in; - // if (weight_data_int8_scales[g] == 0) - // scale_in = 0; - // else - // scale_in = 1.f / (bottom_blob_int8_scales[g] * 
weight_data_int8_scales[g]); - - // scale_data[g] = scale_in; - // } - - // dequantize_from_int32(top_blob_int32, top_blob, scale_data, bias_data, opt); - // } - - // if (activation) - // { - // activation->forward_inplace(top_blob, opt); - // } - // } - // else - // #endif // NCNN_GNU_INLINE_ASM - // { - // const int maxk = kernel_w * kernel_h; - - // // kernel offsets - // std::vector _space_ofs(maxk); - // int* space_ofs = &_space_ofs[0]; - // { - // int p1 = 0; - // int p2 = 0; - // int gap = w * dilation_h - kernel_w * dilation_w; - // for (int i = 0; i < kernel_h; i++) - // { - // for (int j = 0; j < kernel_w; j++) - // { - // space_ofs[p1] = p2; - // p1++; - // p2 += dilation_w; - // } - // p2 += gap; - // } - // } - - // #pragma omp parallel for num_threads(opt.num_threads) - // for (int g = 0; g < group; g++) - // { - // signed char* outptr_s8 = top_blob.channel(g); - // float* outptr_f32 = top_blob.channel(g); - // const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; - // const Mat m = bottom_blob_bordered.channel(g); - - // for (int i = 0; i < outh; i++) - // { - // for (int j = 0; j < outw; j++) - // { - // int sum = 0; - - // const signed char* sptr = m.row(i * stride_h) + j * stride_w; - - // for (int k = 0; k < maxk; k++) - // { - // signed char val = sptr[space_ofs[k]]; - // signed char w = kptr[k]; - // sum += val * w; - // } - - // float scale_in; - // if (weight_data_int8_scales[g] == 0) - // scale_in = 0; - // else - // scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); - - // float sumfp32 = sum * scale_in; - - // if (bias_term) - // sumfp32 += bias_data[g]; - - // sumfp32 = activation_ss(sumfp32, activation_type, activation_params); - - // if (use_int8_requantize) - // { - // // requantize - // float scale_out = top_blob_int8_scales[g]; - // signed char sums8 = float2int8(sumfp32 * scale_out); - // outptr_s8[0] = sums8; - // outptr_s8 += 1; - // } - // else - // { - // // dequantize - // outptr_f32[0] = sumfp32; - // outptr_f32 += 1; - // } - // } - // } - // } - // } - // } + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = 
sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } return 0; } @@ -1668,8 +1564,8 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; } // #endif - // if (opt.use_bf16_storage) - // out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) @@ -1679,6 +1575,10 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl const int channels_g = channels * elempack / group; const int num_output_g = num_output / group; + fprintf(stderr, "group = %d, num_output = %d\n", group, num_output); + + fprintf(stderr, "channels_g = %d, num_output_g = %d\n", channels_g, num_output_g); + int g_elempack = 1; int out_g_elempack = 1; #if __riscv_vector @@ -1709,7 +1609,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl return -100; } - #pragma omp parallel for num_threads(opt.num_threads) +#pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); From 19c0b9d9f6435f0d513a503567f57017dd265914 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Thu, 15 Feb 2024 02:56:25 +0000 Subject: [PATCH 06/29] apply code-format changes --- src/layer/riscv/convolutiondepthwise_riscv.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index ff66d8da89fd..1a0879225e90 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -361,7 +361,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { float* outptr = top_blob.channel(g); @@ -443,7 +443,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { float* outptr = top_blob.channel(g); @@ -738,7 +738,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); @@ -800,7 +800,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { __fp16* outptr = top_blob.channel(g); @@ -985,7 +985,7 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { __fp16* outptr = top_blob.channel(g); @@ -1047,7 +1047,7 @@ int 
ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { __fp16* outptr = top_blob.channel(g); @@ -1361,7 +1361,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < channels; g++) { signed char* outptr_s8 = top_blob.channel(g); @@ -1609,7 +1609,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl return -100; } -#pragma omp parallel for num_threads(opt.num_threads) + #pragma omp parallel for num_threads(opt.num_threads) for (int g = 0; g < group; g++) { const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); From 5b5bac434d4a77591c20a88a733846083c4f5195 Mon Sep 17 00:00:00 2001 From: yxy Date: Thu, 15 Feb 2024 05:41:07 +0000 Subject: [PATCH 07/29] debug --- src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h | 2 ++ src/layer/riscv/convolutiondepthwise_riscv.cpp | 2 ++ src/layer/riscv/padding_riscv.cpp | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h index 3c2f23f20f00..c8fc22c528cd 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h @@ -15,6 +15,7 @@ static void convdw3x3s1_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) { + fprintf(stderr, "convdw3x3s1_pack8_int8_rvv\n"); int w = bottom_blob.w; int outw = top_blob.w; @@ -149,6 +150,7 @@ static void convdw3x3s1_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, co static void convdw3x3s2_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) { + fprintf(stderr, "convdw3x3s2_pack8_int8_rvv\n"); int w = bottom_blob.w; int outw = top_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index ff66d8da89fd..fb43ed9024dd 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -1263,6 +1263,8 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl if (top_blob.empty()) return -100; + fprintf(stderr, "kernel_w = %d, kernel_h = %d, dilation_w = %d, dilation_h = %d, stride_w = %d, stride_h = %d, activation_type = %d\n", kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type); + #if __riscv_vector if (elempack == 8) { diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index 2e2d7471f477..8f4b54da5904 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -510,7 +510,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if __riscv_vector - const int packn = csrr_vlenb() / 2; + const int packn = csrr_vlenb() / 1; const size_t vl = vsetvl_e8m1(packn); #endif From bd5da8baa9934b7576e1402f9fc6f52f22d50620 Mon Sep 17 00:00:00 2001 From: yxy Date: Thu, 15 Feb 2024 07:38:29 +0000 Subject: [PATCH 08/29] debug --- src/layer/riscv/convolutiondepthwise_riscv.cpp | 1 + 1 
file changed, 1 insertion(+) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index 7cd4c28a3172..b64e1edd6f00 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -1264,6 +1264,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl return -100; fprintf(stderr, "kernel_w = %d, kernel_h = %d, dilation_w = %d, dilation_h = %d, stride_w = %d, stride_h = %d, activation_type = %d\n", kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type); + fprintf(stderr, "elempack = %d, out_elempack = %d\n", elempack, out_elempack); #if __riscv_vector if (elempack == 8) From f40dec84810bc979575bb2d45198a8fb198e00fa Mon Sep 17 00:00:00 2001 From: yxy Date: Thu, 15 Feb 2024 07:55:24 +0000 Subject: [PATCH 09/29] use pack8 --- src/layer/riscv/padding_riscv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index 8f4b54da5904..2e2d7471f477 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -510,7 +510,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); #endif From b5ec194ca80f187990e95c34535ba675e166f2f3 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Thu, 15 Feb 2024 07:56:49 +0000 Subject: [PATCH 10/29] apply code-format changes --- src/layer/riscv/padding_packn.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 647ef1bb0b13..60465da64ffc 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -15,7 +15,7 @@ #define _PADDING_PACKN_RVV(SEW, TSEW, LMUL, T, VT) \ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ @@ -65,7 +65,7 @@ \ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ @@ -145,7 +145,7 @@ \ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ From 58a72a414360f51a9adc868195ec36e53a120a1a Mon Sep 17 00:00:00 2001 From: yxy Date: Thu, 15 Feb 2024 08:50:04 +0000 Subject: [PATCH 11/29] pack8 right --- src/layer/riscv/convolutiondepthwise_riscv.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index b64e1edd6f00..cd2ce6363fb9 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -1269,6 +1269,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& 
bottom_blob, Mat& top_bl #if __riscv_vector if (elempack == 8) { + /* if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) { Mat top_blob_int32; @@ -1326,6 +1327,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl scale_in_data[g] = scale_in; } + fprintf(stderr, "use_int8_requantize = %d\n", use_int8_requantize); if (use_int8_requantize) { @@ -1342,6 +1344,7 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } } else + */ { const int maxk = kernel_w * kernel_h; From e69c01fd1aaa05846a25460f796223ac64eab9a8 Mon Sep 17 00:00:00 2001 From: yxy Date: Thu, 15 Feb 2024 11:04:33 +0000 Subject: [PATCH 12/29] add basic conv --- src/layer/riscv/convolution_riscv.cpp | 175 ++++++++++++++++++++++++++ src/layer/riscv/convolution_riscv.h | 9 ++ 2 files changed, 184 insertions(+) diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index be413e5be252..2b7d5e3b39a2 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -1102,4 +1102,179 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int Convolution_riscv::create_pipeline_int8(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; +// #if NCNN_ARM82DOT +// if (ncnn::cpu_support_arm_asimddp()) +// { +// prefer_winograd = false; +// } +// #endif + +#if 0 + if (opt.use_winograd_convolution && prefer_winograd) + { + if (opt.use_winograd43_convolution) + conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); + else + conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); + } + else +#endif + { + convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + weight_data.release(); + + return 0; +} + +int Convolution_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + // NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h); + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = 
bottom_blob_bordered.h; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; +#if __riscv_vector && __riscv_zfh + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } +#endif // __riscv_vector && __riscv_zfh + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + // NCNN_LOGE("forward_int8_arm %d %d %d %d %d", w, h, bottom_blob_bordered.c, elempack, out_elempack); + + int channels = bottom_blob_bordered.c; + const int num_input = channels * elempack; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; +// #if NCNN_ARM82DOT +// if (ncnn::cpu_support_arm_asimddp()) +// { +// prefer_winograd = false; +// } +// #endif + + int out_elempack_int32 = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + int _nT = nT ? 
nT : opt.num_threads; + if (nT != 0 && opt.num_threads != nT) + { + // force num_threads the same as in create_pipeline + // so we could use pre-packed A/B from the same tile config + NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); + } +#if 0 + if (opt.use_winograd_convolution && prefer_winograd) + { + if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) + conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); + else + conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + } + else +#endif + { + convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + bottom_blob_bordered.release(); + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index a4e008c9dd1d..8c5ee015de1a 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -38,6 +38,11 @@ class Convolution_riscv : public Convolution int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* activation; @@ -48,6 +53,10 @@ class Convolution_riscv : public Convolution // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn From 631c8fbd302713e27cd08e64e68de6c6fe3966e9 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Thu, 15 Feb 2024 11:06:01 +0000 Subject: [PATCH 13/29] apply code-format changes --- src/layer/riscv/convolution_riscv.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index 2b7d5e3b39a2..dc8a2e987eba 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -1109,12 +1109,12 @@ int Convolution_riscv::create_pipeline_int8(const Option& opt) const int num_input = weight_data_size / maxk / num_output; bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; -// #if NCNN_ARM82DOT -// if (ncnn::cpu_support_arm_asimddp()) -// { -// prefer_winograd = false; -// } -// #endif + // #if NCNN_ARM82DOT + // if (ncnn::cpu_support_arm_asimddp()) + // { + // prefer_winograd = false; + // } + // #endif #if 0 if (opt.use_winograd_convolution && prefer_winograd) @@ -1208,12 +1208,12 @@ 
int Convolution_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const const int num_input = channels * elempack; bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; -// #if NCNN_ARM82DOT -// if (ncnn::cpu_support_arm_asimddp()) -// { -// prefer_winograd = false; -// } -// #endif + // #if NCNN_ARM82DOT + // if (ncnn::cpu_support_arm_asimddp()) + // { + // prefer_winograd = false; + // } + // #endif int out_elempack_int32 = 1; #if __riscv_vector From 9f6a7e483aca1950b50380a75c91909b8e65b0d5 Mon Sep 17 00:00:00 2001 From: yxy Date: Fri, 16 Feb 2024 16:42:21 +0000 Subject: [PATCH 14/29] now can use requantize --- src/layer/riscv/convolutiondepthwise_riscv.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index cd2ce6363fb9..f58d58bc7a5f 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -1269,7 +1269,6 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl #if __riscv_vector if (elempack == 8) { - /* if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) { Mat top_blob_int32; @@ -1344,7 +1343,6 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl } } else - */ { const int maxk = kernel_w * kernel_h; From d2ebcbd169c8ba43edc0e2b7c85aba87238d90c1 Mon Sep 17 00:00:00 2001 From: yxy Date: Fri, 16 Feb 2024 16:44:05 +0000 Subject: [PATCH 15/29] delete comment --- .../riscv/convolutiondepthwise_riscv.cpp | 35 ------------------- 1 file changed, 35 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index f58d58bc7a5f..7d7f77d1ca39 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -1379,8 +1379,6 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl { vl = 8; vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); - // int32x4_t _sum0 = vdupq_n_s32(0); - // int32x4_t _sum1 = vdupq_n_s32(0); const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; @@ -1390,55 +1388,28 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl vint8m1_t _w = vle8_v_i8m1(kptr + k * 8, vl); vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); - // int8x8_t _val = vld1_s8(sptr + space_ofs[k] * 8); - // int8x8_t _w = vld1_s8(kptr + k * 8); - // int16x8_t _s0 = vmull_s8(_val, _w); - // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); } - // float32x4_t _scale_in0; - // float32x4_t _scale_in1; vfloat32m2_t _scale_in; { vfloat32m2_t _bottom_blob_int8_scales = vle32_v_f32m2((const float*)bottom_blob_int8_scales + g * 8, vl); vfloat32m2_t _weight_data_int8_scales = vle32_v_f32m2((const float*)weight_data_int8_scales + g * 8, vl); - // float32x4_t _bottom_blob_int8_scales0 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8); - // float32x4_t _bottom_blob_int8_scales1 = vld1q_f32((const float*)bottom_blob_int8_scales + g * 8 + 4); - // float32x4_t _weight_data_int8_scales0 = vld1q_f32((const float*)weight_data_int8_scales + g 
* 8); - // float32x4_t _weight_data_int8_scales1 = vld1q_f32((const float*)weight_data_int8_scales + g * 8 + 4); _scale_in = vfdiv_vv_f32m2(vfmv_v_f_f32m2(1.f, vl), vfmul_vv_f32m2(_bottom_blob_int8_scales, _weight_data_int8_scales, vl), vl); - // _scale_in0 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales0, _weight_data_int8_scales0)); - // _scale_in1 = div_ps(vdupq_n_f32(1.f), vmulq_f32(_bottom_blob_int8_scales1, _weight_data_int8_scales1)); vbool16_t _is_zero = vmfeq_vv_f32m2_b16(_bottom_blob_int8_scales, vfmv_v_f_f32m2(0.f, vl), vl); _scale_in = vfsub_vv_f32m2_m(_is_zero, _scale_in, _scale_in, _scale_in, vl); - // uint32x4_t _m0 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales0, vdupq_n_f32(0.f))); - // uint32x4_t _m1 = vmvnq_u32(vceqq_f32(_weight_data_int8_scales1, vdupq_n_f32(0.f))); - // _scale_in0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in0), _m0)); - // _scale_in1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(_scale_in1), _m1)); } vfloat32m2_t _sumfp32 = vfmul_vv_f32m2(vfcvt_f_x_v_f32m2(_sum0, vl), _scale_in, vl); - // float32x4_t _sumfp32_0 = vmulq_f32(vcvtq_f32_s32(_sum0), _scale_in0); - // float32x4_t _sumfp32_1 = vmulq_f32(vcvtq_f32_s32(_sum1), _scale_in1); - if (bias_term) { vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + g * 8, vl); _sumfp32 = vfadd_vv_f32m2(_sumfp32, _bias, vl); - // float32x4_t _bias0 = vld1q_f32((const float*)bias_data + g * 8); - // float32x4_t _bias1 = vld1q_f32((const float*)bias_data + g * 8 + 4); - // _sumfp32_0 = vaddq_f32(_sumfp32_0, _bias0); - // _sumfp32_1 = vaddq_f32(_sumfp32_1, _bias1); } _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); - // _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params); - // _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params); - if (use_int8_requantize) { // requantize @@ -1446,18 +1417,12 @@ int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_bl vfloat32m2_t _res = vfmul_vv_f32m2(_sumfp32, _scale_out, vl); int64_t _sum8 = float2int8(vget_v_f32m2_f32m1(_res, 0), vget_v_f32m2_f32m1(_res, 1)); *(int64_t*)outptr_s8 = _sum8; - // float32x4_t _scale_out0 = vld1q_f32((const float*)top_blob_int8_scales + g * 8); - // float32x4_t _scale_out1 = vld1q_f32((const float*)top_blob_int8_scales + g * 8 + 4); - // int8x8_t _sum8 = float2int8(vmulq_f32(_sumfp32_0, _scale_out0), vmulq_f32(_sumfp32_1, _scale_out1)); - // vst1_s8(outptr_s8, _sum8); outptr_s8 += 8; } else { // dequantize vse32_v_f32m2(outptr_f32, _sumfp32, vl); - // vst1q_f32(outptr_f32, _sumfp32_0); - // vst1q_f32(outptr_f32 + 4, _sumfp32_1); outptr_f32 += 8; } } From d5b82b93e5907226d4c008262fbbd2748bbb0207 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sat, 17 Feb 2024 14:09:34 +0800 Subject: [PATCH 16/29] add arm base, now to rewrite it to riscv-v extension --- src/layer/riscv/convolution_packed_int8.h | 1316 +++++++++++++++++++++ 1 file changed, 1316 insertions(+) create mode 100644 src/layer/riscv/convolution_packed_int8.h diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h new file mode 100644 index 000000000000..c1c9432490f6 --- /dev/null +++ b/src/layer/riscv/convolution_packed_int8.h @@ -0,0 +1,1316 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. 
+// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) +#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 +void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +#endif + +#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD +void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +#endif +#endif + +static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ +#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) +#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 + if (ncnn::cpu_support_arm_i8mm()) + { + convolution_transform_kernel_packed_int8_i8mm(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); + return; + } +#endif + +#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD + if (ncnn::cpu_support_arm_asimddp()) + { + convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); + return; + } +#endif +#endif + + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + + // clang-format off + // *INDENT-OFF* +#if __ARM_NEON + if (outch >= 8) + { + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)64u, 64); + else + kernel_tm.create(maxk, inch, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)8u, 8); + } + else if (outch >= 4) + { + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)32u, 32); + else + kernel_tm.create(maxk, inch, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)4u, 4); + } + else +#endif // __ARM_NEON + if (outch >= 2) + { +#if __ARM_NEON + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 2 + outch % 2, (size_t)16u, 16); + else +#endif // __ARM_NEON + kernel_tm.create(maxk, inch, outch / 2 + outch % 2, (size_t)2u, 2); + } + else + { +#if __ARM_NEON + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch, (size_t)8u, 8); + else +#endif // __ARM_NEON + kernel_tm.create(maxk, inch, outch, (size_t)1u, 1); + } + // *INDENT-ON* + // clang-format on + + int q = 0; +#if __ARM_NEON + for (; q + 7 < outch; q += 8) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch 
* maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + const signed char* kptr2 = (const signed char*)kernel + (q + 2) * inch * maxk; + const signed char* kptr3 = (const signed char*)kernel + (q + 3) * inch * maxk; + const signed char* kptr4 = (const signed char*)kernel + (q + 4) * inch * maxk; + const signed char* kptr5 = (const signed char*)kernel + (q + 5) * inch * maxk; + const signed char* kptr6 = (const signed char*)kernel + (q + 6) * inch * maxk; + const signed char* kptr7 = (const signed char*)kernel + (q + 7) * inch * maxk; + + signed char* g00 = kernel_tm.channel(q / 8); + + int p = 0; + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + const signed char* k4 = kptr4 + k; + const signed char* k5 = kptr5 + k; + const signed char* k6 = kptr6 + k; + const signed char* k7 = kptr7 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00[2] = k1[0]; + g00[3] = k1[maxk]; + g00[4] = k2[0]; + g00[5] = k2[maxk]; + g00[6] = k3[0]; + g00[7] = k3[maxk]; + g00[8] = k4[0]; + g00[9] = k4[maxk]; + g00[10] = k5[0]; + g00[11] = k5[maxk]; + g00[12] = k6[0]; + g00[13] = k6[maxk]; + g00[14] = k7[0]; + g00[15] = k7[maxk]; + g00 += 16; + k0 += maxk * 2; + k1 += maxk * 2; + k2 += maxk * 2; + k3 += maxk * 2; + k4 += maxk * 2; + k5 += maxk * 2; + k6 += maxk * 2; + k7 += maxk * 2; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + kptr2 += maxk * 8; + kptr3 += maxk * 8; + kptr4 += maxk * 8; + kptr5 += maxk * 8; + kptr6 += maxk * 8; + kptr7 += maxk * 8; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + const signed char* k4 = kptr4 + k; + const signed char* k5 = kptr5 + k; + const signed char* k6 = kptr6 + k; + const signed char* k7 = kptr7 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00[4] = k4[0]; + g00[5] = k5[0]; + g00[6] = k6[0]; + g00[7] = k7[0]; + g00 += 8; + } + + kptr0 += maxk; + kptr1 += maxk; + kptr2 += maxk; + kptr3 += maxk; + kptr4 += maxk; + kptr5 += maxk; + kptr6 += maxk; + kptr7 += maxk; + } + } + for (; q + 3 < outch; q += 4) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + const signed char* kptr2 = (const signed char*)kernel + (q + 2) * inch * maxk; + const signed char* kptr3 = (const signed char*)kernel + (q + 3) * inch * maxk; + + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + int p = 0; + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00[2] = k1[0]; + g00[3] = k1[maxk]; + g00[4] = k2[0]; + g00[5] = k2[maxk]; + g00[6] = k3[0]; + g00[7] = k3[maxk]; + g00 += 8; + k0 += maxk * 2; + k1 += maxk * 2; + k2 += maxk * 2; + k3 += maxk * 2; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + kptr2 += maxk * 8; + kptr3 += maxk * 8; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* 
k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00 += 4; + } + + kptr0 += maxk; + kptr1 += maxk; + kptr2 += maxk; + kptr3 += maxk; + } + } +#endif // __ARM_NEON + for (; q + 1 < outch; q += 2) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + +#if __ARM_NEON + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2); +#else + signed char* g00 = kernel_tm.channel(q / 2); +#endif + + int p = 0; +#if __ARM_NEON + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + for (int i = 0; i < 4; i++) + { + g00[0] = k1[0]; + k1 += maxk; + g00 += 1; + } + + for (int i = 4; i < 8; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + for (int i = 4; i < 8; i++) + { + g00[0] = k1[0]; + k1 += maxk; + g00 += 1; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + } +#endif // __ARM_NEON + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00 += 2; + } + + kptr0 += maxk; + kptr1 += maxk; + } + } + for (; q < outch; q++) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; + +#if __ARM_NEON + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2); +#else + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + int p = 0; +#if __ARM_NEON + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + for (int i = 0; i < 8; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + } + + kptr += maxk * 8; + } +#endif // __ARM_NEON + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + g00[0] = k0[0]; + g00++; + } + + kptr += maxk; + } + } +} + +static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ +// #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) +// #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 +// if (ncnn::cpu_support_arm_i8mm()) +// { +// convolution_packed_int8_i8mm(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); +// return; +// } +// #endif + +// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD +// if (ncnn::cpu_support_arm_asimddp()) +// { +// convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); +// return; +// } +// #endif +// #endif + + const int w = bottom_blob.w; + const int elempack = bottom_blob.elempack; + const int inch = bottom_blob.c * elempack; + + const int N = bottom_blob.cstep * elempack; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int outch = top_blob.c * out_elempack; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * 
dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2 * elempack; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + int nn_outch = 0; + int remain_outch_start = 0; +#if __ARM_NEON + nn_outch = (outch - remain_outch_start) / 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 8; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + const int M = top_blob.cstep * out_elempack; + + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int32x4_t _sum0 = vdupq_n_s32(0); + int32x4_t _sum1 = vdupq_n_s32(0); + int32x4_t _sum2 = vdupq_n_s32(0); + int32x4_t _sum3 = vdupq_n_s32(0); + + const signed char* kptr = weight_data_tm.channel(p / 8); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + int8x8_t _r0; + int8x8_t _r1; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + _r1 = vld1_s8(r1s); + } + else // if (elempack == 1) + { + signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + _r0 = vld1_s8(tmp0); + _r1 = vld1_s8(tmp1); + } + + int8x16_t _w0 = vld1q_s8(kptr); + int8x16_t _w1 = vld1q_s8(kptr + 16); + int8x16_t _w2 = vld1q_s8(kptr + 32); + int8x16_t _w3 = vld1q_s8(kptr + 48); + + int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + int16x4_t _rr1 = vreinterpret_s16_s8(_r1); + + int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); + int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); + + int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + int16x8_t _s2l = vmull_s8(_r1ll, vget_low_s8(_w0)); + int16x8_t _s3l = vmull_s8(_r1ll, vget_high_s8(_w0)); + _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + _s2l = vmlal_s8(_s2l, _r1hl, vget_low_s8(_w2)); + _s3l = vmlal_s8(_s3l, _r1hl, vget_high_s8(_w2)); + + _sum0 = vpadalq_s16(_sum0, _s0l); + _sum1 = vpadalq_s16(_sum1, _s1l); + _sum2 = vpadalq_s16(_sum2, _s2l); + _sum3 = vpadalq_s16(_sum3, _s3l); + + int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); + int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); + + int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); + int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + int16x8_t _s2h = vmull_s8(_r1lh, vget_low_s8(_w1)); + int16x8_t _s3h = vmull_s8(_r1lh, vget_high_s8(_w1)); + _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + _s1h = vmlal_s8(_s1h, _r0hh, 
vget_high_s8(_w3)); + _s2h = vmlal_s8(_s2h, _r1hh, vget_low_s8(_w3)); + _s3h = vmlal_s8(_s3h, _r1hh, vget_high_s8(_w3)); + + _sum0 = vpadalq_s16(_sum0, _s0h); + _sum1 = vpadalq_s16(_sum1, _s1h); + _sum2 = vpadalq_s16(_sum2, _s2h); + _sum3 = vpadalq_s16(_sum3, _s3h); + + kptr += 64; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + int8x8_t _r0 = vdup_n_s8(r0s[0]); + int8x8_t _r1 = vdup_n_s8(r1s[0]); + int8x8_t _w = vld1_s8(kptr); + int16x8_t _s0 = vmull_s8(_r0, _w); + int16x8_t _s1 = vmull_s8(_r1, _w); + _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); + _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); + + kptr += 8; + } + } + } + + if (out_elempack == 8) + { + vst1q_s32(outptr, _sum0); + vst1q_s32(outptr + 4, _sum1); + vst1q_s32(outptr + 8, _sum2); + vst1q_s32(outptr + 12, _sum3); + outptr += 16; + } + if (out_elempack == 4) + { + vst1q_s32(outptr, _sum0); + vst1q_s32(outptr + 4, _sum2); + vst1q_s32(outptr + M, _sum1); + vst1q_s32(outptr + M + 4, _sum3); + outptr += 8; + } + if (out_elempack == 1) + { + outptr[0] = vgetq_lane_s32(_sum0, 0); + outptr[1] = vgetq_lane_s32(_sum2, 0); + outptr[M] = vgetq_lane_s32(_sum0, 1); + outptr[M + 1] = vgetq_lane_s32(_sum2, 1); + outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + outptr[M * 2 + 1] = vgetq_lane_s32(_sum2, 2); + outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + outptr[M * 3 + 1] = vgetq_lane_s32(_sum2, 3); + outptr[M * 4] = vgetq_lane_s32(_sum1, 0); + outptr[M * 4 + 1] = vgetq_lane_s32(_sum3, 0); + outptr[M * 5] = vgetq_lane_s32(_sum1, 1); + outptr[M * 5 + 1] = vgetq_lane_s32(_sum3, 1); + outptr[M * 6] = vgetq_lane_s32(_sum1, 2); + outptr[M * 6 + 1] = vgetq_lane_s32(_sum3, 2); + outptr[M * 7] = vgetq_lane_s32(_sum1, 3); + outptr[M * 7 + 1] = vgetq_lane_s32(_sum3, 3); + outptr += 2; + } + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int32x4_t _sum0 = vdupq_n_s32(0); + int32x4_t _sum1 = vdupq_n_s32(0); + int32x4_t _sum2 = vdupq_n_s32(0); + int32x4_t _sum3 = vdupq_n_s32(0); + + const signed char* kptr = weight_data_tm.channel(p / 8); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + int8x8_t _r0; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + _r0 = vld1_s8(tmp); + } + + int8x16_t _w0 = vld1q_s8(kptr); + int8x16_t _w1 = vld1q_s8(kptr + 16); + int8x16_t _w2 = vld1q_s8(kptr + 32); + int8x16_t _w3 = vld1q_s8(kptr + 48); + + int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + int16x8_t _s0h = vmull_s8(_r0lh, 
vget_low_s8(_w1)); + int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); + + _sum0 = vpadalq_s16(_sum0, _s0l); + _sum1 = vpadalq_s16(_sum1, _s1l); + _sum2 = vpadalq_s16(_sum2, _s0h); + _sum3 = vpadalq_s16(_sum3, _s1h); + + kptr += 64; + } + } + + { + _sum0 = vaddq_s32(_sum0, _sum2); + _sum1 = vaddq_s32(_sum1, _sum3); + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + int8x8_t _val = vdup_n_s8(r0s[0]); + int8x8_t _w = vld1_s8(kptr); + int16x8_t _s0 = vmull_s8(_val, _w); + _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + + kptr += 8; + } + } + } + + if (out_elempack == 8) + { + vst1q_s32(outptr, _sum0); + vst1q_s32(outptr + 4, _sum1); + outptr += 8; + } + if (out_elempack == 4) + { + vst1q_s32(outptr, _sum0); + vst1q_s32(outptr + M, _sum1); + outptr += 4; + } + if (out_elempack == 1) + { + outptr[0] = vgetq_lane_s32(_sum0, 0); + outptr[M] = vgetq_lane_s32(_sum0, 1); + outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + outptr[M * 4] = vgetq_lane_s32(_sum1, 0); + outptr[M * 5] = vgetq_lane_s32(_sum1, 1); + outptr[M * 6] = vgetq_lane_s32(_sum1, 2); + outptr[M * 7] = vgetq_lane_s32(_sum1, 3); + outptr += 1; + } + } + } + remain_outch_start += nn_outch * 8; + nn_outch = (outch - remain_outch_start) / 4; + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 4; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + const int M = top_blob.cstep * out_elempack; + + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int32x4_t _sum0 = vdupq_n_s32(0); + int32x4_t _sum1 = vdupq_n_s32(0); + + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + int8x8_t _r0; + int8x8_t _r1; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + _r1 = vld1_s8(r1s); + } + else // if (elempack == 1) + { + signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + _r0 = vld1_s8(tmp0); + _r1 = vld1_s8(tmp1); + } + + int8x16_t _w0 = vld1q_s8(kptr); + int8x16_t _w1 = vld1q_s8(kptr + 16); + + int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + int16x4_t _rr1 = vreinterpret_s16_s8(_r1); + + int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); + int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + 
int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); + + int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + int16x8_t _s1l = vmull_s8(_r1ll, vget_low_s8(_w0)); + int16x8_t _s0h = vmull_s8(_r0lh, vget_high_s8(_w0)); + int16x8_t _s1h = vmull_s8(_r1lh, vget_high_s8(_w0)); + + int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); + int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); + + _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w1)); + _s1l = vmlal_s8(_s1l, _r1hl, vget_low_s8(_w1)); + _s0h = vmlal_s8(_s0h, _r0hh, vget_high_s8(_w1)); + _s1h = vmlal_s8(_s1h, _r1hh, vget_high_s8(_w1)); + + _sum0 = vpadalq_s16(_sum0, _s0l); + _sum1 = vpadalq_s16(_sum1, _s1l); + _sum0 = vpadalq_s16(_sum0, _s0h); + _sum1 = vpadalq_s16(_sum1, _s1h); + + kptr += 32; + } + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + int8x8_t _r0 = vdup_n_s8(r0s[0]); + int8x8_t _r1 = vdup_n_s8(r1s[0]); + int8x8_t _r01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r1)).val[0]); + int8x8_t _w = vld1_s8(kptr); + int8x8_t _ww = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_w), vreinterpret_s32_s8(_w)).val[0]); + int16x8_t _s01 = vmull_s8(_r01, _ww); + _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01)); + _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01)); + + kptr += 4; + } + } + } + + if (out_elempack == 4) + { + vst1q_s32(outptr, _sum0); + vst1q_s32(outptr + 4, _sum1); + outptr += 8; + } + if (out_elempack == 1) + { + int32x4x2_t _sum01 = vzipq_s32(_sum0, _sum1); + vst1_s32(outptr, vget_low_s32(_sum01.val[0])); + vst1_s32(outptr + M, vget_high_s32(_sum01.val[0])); + vst1_s32(outptr + M * 2, vget_low_s32(_sum01.val[1])); + vst1_s32(outptr + M * 3, vget_high_s32(_sum01.val[1])); + outptr += 2; + } + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int32x4_t _sum0 = vdupq_n_s32(0); + int32x4_t _sum1 = vdupq_n_s32(0); + + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + int8x8_t _r0; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + _r0 = vld1_s8(tmp); + } + + int8x16_t _w0 = vld1q_s8(kptr); + int8x16_t _w1 = vld1q_s8(kptr + 16); + + int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0)); + int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0)); + _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1)); + _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1)); + + _sum0 = vpadalq_s16(_sum0, _sl); + _sum1 = 
vpadalq_s16(_sum1, _sh); + + kptr += 32; + } + } + { + _sum0 = vaddq_s32(_sum0, _sum1); + } + } + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + int8x8_t _val = vdup_n_s8(r0s[0]); + int8x8_t _w = vld1_s8(kptr); + int16x8_t _s0 = vmull_s8(_val, _w); + _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + + kptr += 4; + } + } + } + + if (out_elempack == 4) + { + vst1q_s32(outptr, _sum0); + outptr += 4; + } + if (out_elempack == 1) + { + outptr[0] = vgetq_lane_s32(_sum0, 0); + outptr[M] = vgetq_lane_s32(_sum0, 1); + outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + outptr += 1; + } + } + } + remain_outch_start += nn_outch * 4; + nn_outch = (outch - remain_outch_start) / 2; +#else // __ARM_NEON + nn_outch = (outch - remain_outch_start) / 2; + #pragma omp parallel for num_threads(opt.num_threads) +#endif // __ARM_NEON + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 2; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + +#if __ARM_NEON + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2); +#endif + + int q = 0; +#if __ARM_NEON + { + int32x4_t _sum01 = vdupq_n_s32(0); + int32x4_t _sum23 = vdupq_n_s32(0); + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + int8x8_t _r0; + int8x8_t _r1; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + _r1 = vld1_s8(r1s); + } + else // if (elempack == 1) + { + signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + _r0 = vld1_s8(tmp0); + _r1 = vld1_s8(tmp1); + } + + int8x16_t _w0 = vld1q_s8(kptr); + + int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); + int32x2x2_t _rr1 = vzip_s32(vreinterpret_s32_s8(_r1), vreinterpret_s32_s8(_r1)); + int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); + int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); + int8x8_t _r1l = vreinterpret_s8_s32(_rr1.val[0]); + int8x8_t _r1h = vreinterpret_s8_s32(_rr1.val[1]); + + int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); + int16x8_t _s23 = vmull_s8(_r1l, vget_low_s8(_w0)); + _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); + _s23 = vmlal_s8(_s23, _r1h, vget_high_s8(_w0)); + + _sum01 = vpadalq_s16(_sum01, _s01); + _sum23 = vpadalq_s16(_sum23, _s23); + + kptr += 16; + } + } + int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); + int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum23), 
vget_high_s32(_sum23)); + sum00 += vget_lane_s32(_s0, 0); + sum01 += vget_lane_s32(_s1, 0); + sum10 += vget_lane_s32(_s0, 1); + sum11 += vget_lane_s32(_s1, 1); + } +#endif // __ARM_NEON + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum00 += r0s[0] * kptr[0]; + sum01 += r1s[0] * kptr[0]; + sum10 += r0s[0] * kptr[1]; + sum11 += r1s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum00; + outptr0[1] = sum01; + outptr1[0] = sum10; + outptr1[1] = sum11; + outptr0 += 2; + outptr1 += 2; + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum0 = 0; + int sum1 = 0; + +#if __ARM_NEON + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2); +#endif + + int q = 0; +#if __ARM_NEON + { + int32x4_t _sum01 = vdupq_n_s32(0); + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + int8x8_t _r0; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + _r0 = vld1_s8(tmp); + } + + int8x16_t _w0 = vld1q_s8(kptr); + + int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); + int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); + int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); + + int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); + _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); + + _sum01 = vpadalq_s16(_sum01, _s01); + + kptr += 16; + } + } + int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); + sum0 += vget_lane_s32(_s0, 0); + sum1 += vget_lane_s32(_s0, 1); + } +#endif // __ARM_NEON + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r0s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr0 += 1; + outptr1 += 1; + } + } + remain_outch_start += nn_outch * 2; + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int i1 = (ij + 1) / outw; + const int j0 = ij % outw; + const int j1 = (ij + 1) % outw; + + int sum0 = 0; + int sum1 = 0; + +#if __ARM_NEON + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); +#endif + + int q = 0; +#if __ARM_NEON + { + int32x4_t _sum0 = vdupq_n_s32(0); + int32x4_t _sum1 = vdupq_n_s32(0); + int32x4_t _sum2 = vdupq_n_s32(0); + int32x4_t _sum3 = vdupq_n_s32(0); + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + 
j1 * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + int8x8_t _r0; + int8x8_t _r1; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + _r1 = vld1_s8(r1s); + } + else // if (elempack == 1) + { + signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + _r0 = vld1_s8(tmp0); + _r1 = vld1_s8(tmp1); + } + + int8x8_t _w = vld1_s8(kptr); + + int16x8_t _s0 = vmull_s8(_r0, _w); + int16x8_t _s1 = vmull_s8(_r1, _w); + + _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); + _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); + + kptr += 8; + } + } + _sum0 = vaddq_s32(_sum0, _sum1); + _sum2 = vaddq_s32(_sum2, _sum3); +#if __aarch64__ + sum0 += vaddvq_s32(_sum0); + sum1 += vaddvq_s32(_sum2); +#else + int32x2_t _ss0 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0)); + int32x2_t _ss2 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2)); + _ss0 = vpadd_s32(_ss0, _ss2); + sum0 += vget_lane_s32(_ss0, 0); + sum1 += vget_lane_s32(_ss0, 1); +#endif + } +#endif // __ARM_NEON + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r1s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum = 0; + +#if __ARM_NEON + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); +#endif + + int q = 0; +#if __ARM_NEON + { + int32x4_t _sum0 = vdupq_n_s32(0); + int32x4_t _sum1 = vdupq_n_s32(0); + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + int8x8_t _r0; + if (elempack == 8) + { + _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + _r0 = vld1_s8(tmp); + } + + int8x8_t _w = vld1_s8(kptr); + + int16x8_t _s0 = vmull_s8(_r0, _w); + + _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + + kptr += 8; + } + } + int32x4_t _sum = vaddq_s32(_sum0, _sum1); +#if __aarch64__ + sum += vaddvq_s32(_sum); +#else + int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); + _ss = vpadd_s32(_ss, _ss); + sum += vget_lane_s32(_ss, 0); +#endif + } +#endif // __ARM_NEON + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum += r0s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum; + outptr += 1; + } + } +} From ef86b864c02ebab18284885e30e31599e3b74457 Mon Sep 17 00:00:00 
2001 From: Xinyu302 Date: Sat, 17 Feb 2024 06:40:07 +0000 Subject: [PATCH 17/29] apply code-format changes --- src/layer/riscv/convolution_packed_int8.h | 34 +++++++++++------------ 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index c1c9432490f6..cbbbf8797394 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -368,23 +368,23 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { -// #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) -// #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 -// if (ncnn::cpu_support_arm_i8mm()) -// { -// convolution_packed_int8_i8mm(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); -// return; -// } -// #endif - -// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD -// if (ncnn::cpu_support_arm_asimddp()) -// { -// convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); -// return; -// } -// #endif -// #endif + // #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) + // #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 + // if (ncnn::cpu_support_arm_i8mm()) + // { + // convolution_packed_int8_i8mm(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + // return; + // } + // #endif + + // #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD + // if (ncnn::cpu_support_arm_asimddp()) + // { + // convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + // return; + // } + // #endif + // #endif const int w = bottom_blob.w; const int elempack = bottom_blob.elempack; From 4e5fd13881514cd42f307ad1c5485966f0f25520 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sun, 18 Feb 2024 00:30:14 +0800 Subject: [PATCH 18/29] try to finish --- src/layer/riscv/convolution_packed_int8.h | 1441 +++++++++++---------- 1 file changed, 742 insertions(+), 699 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index c1c9432490f6..7a01c4773e97 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -26,23 +26,23 @@ void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, cons static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) { -#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) -#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 - if (ncnn::cpu_support_arm_i8mm()) - { - convolution_transform_kernel_packed_int8_i8mm(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); - return; - } -#endif +// #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) +// #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 +// if (ncnn::cpu_support_arm_i8mm()) +// { +// 
convolution_transform_kernel_packed_int8_i8mm(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); +// return; +// } +// #endif -#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD - if (ncnn::cpu_support_arm_asimddp()) - { - convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); - return; - } -#endif -#endif +// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD +// if (ncnn::cpu_support_arm_asimddp()) +// { +// convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); +// return; +// } +// #endif +// #endif const int maxk = kernel_w * kernel_h; @@ -51,7 +51,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker // clang-format off // *INDENT-OFF* -#if __ARM_NEON +#if __riscv_vector if (outch >= 8) { if (inch >= 8) @@ -67,30 +67,30 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker kernel_tm.create(maxk, inch, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)4u, 4); } else -#endif // __ARM_NEON +#endif // __riscv_vector if (outch >= 2) { -#if __ARM_NEON +#if __riscv_vector if (inch >= 8) kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 2 + outch % 2, (size_t)16u, 16); else -#endif // __ARM_NEON +#endif // __riscv_vector kernel_tm.create(maxk, inch, outch / 2 + outch % 2, (size_t)2u, 2); } else { -#if __ARM_NEON +#if __riscv_vector if (inch >= 8) kernel_tm.create(maxk, inch / 8 + inch % 8, outch, (size_t)8u, 8); else -#endif // __ARM_NEON +#endif // __riscv_vector kernel_tm.create(maxk, inch, outch, (size_t)1u, 1); } // *INDENT-ON* // clang-format on int q = 0; -#if __ARM_NEON +#if __riscv_vector for (; q + 7 < outch; q += 8) { const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; @@ -255,20 +255,20 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker kptr3 += maxk; } } -#endif // __ARM_NEON +#endif // __riscv_vector for (; q + 1 < outch; q += 2) { const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; -#if __ARM_NEON +#if __riscv_vector signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2); #else signed char* g00 = kernel_tm.channel(q / 2); #endif int p = 0; -#if __ARM_NEON +#if __riscv_vector for (; p + 7 < inch; p += 8) { for (int k = 0; k < maxk; k++) @@ -306,7 +306,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker kptr0 += maxk * 8; kptr1 += maxk * 8; } -#endif // __ARM_NEON +#endif // __riscv_vector for (; p < inch; p++) { for (int k = 0; k < maxk; k++) @@ -327,14 +327,14 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker { const signed char* kptr = (const signed char*)kernel + q * inch * maxk; -#if __ARM_NEON +#if __riscv_vector signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2); #else signed char* g00 = kernel_tm.channel(q / 2 + q % 2); #endif int p = 0; -#if __ARM_NEON +#if __riscv_vector for (; p + 7 < inch; p += 8) { for (int k = 0; k < maxk; k++) @@ -351,7 +351,7 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& ker kptr += maxk * 8; } -#endif // __ARM_NEON +#endif // __riscv_vector for (; p < inch; p++) { for (int k = 0; k < maxk; k++) @@ -385,6 +385,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // } // #endif // #endif + int vl; const int w 
= bottom_blob.w; const int elempack = bottom_blob.elempack; @@ -420,7 +421,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int nn_outch = 0; int remain_outch_start = 0; -#if __ARM_NEON +#if __riscv_vector nn_outch = (outch - remain_outch_start) / 8; #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) @@ -436,228 +437,231 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p / out_elempack); int ij = 0; - for (; ij + 1 < outw * outh; ij += 2) - { - const int i0 = ij / outw; - const int i1 = (ij + 1) / outw; - const int j0 = ij % outw; - const int j1 = (ij + 1) % outw; - - int32x4_t _sum0 = vdupq_n_s32(0); - int32x4_t _sum1 = vdupq_n_s32(0); - int32x4_t _sum2 = vdupq_n_s32(0); - int32x4_t _sum3 = vdupq_n_s32(0); - - const signed char* kptr = weight_data_tm.channel(p / 8); - - int q = 0; - { - for (; q + 7 < inch; q += 8) - { - const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - int8x8_t _r0; - int8x8_t _r1; - if (elempack == 8) - { - _r0 = vld1_s8(r0s); - _r1 = vld1_s8(r1s); - } - else // if (elempack == 1) - { - signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - _r0 = vld1_s8(tmp0); - _r1 = vld1_s8(tmp1); - } - - int8x16_t _w0 = vld1q_s8(kptr); - int8x16_t _w1 = vld1q_s8(kptr + 16); - int8x16_t _w2 = vld1q_s8(kptr + 32); - int8x16_t _w3 = vld1q_s8(kptr + 48); - - int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - int16x4_t _rr1 = vreinterpret_s16_s8(_r1); - - int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); - int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); - - int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); - int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); - int16x8_t _s2l = vmull_s8(_r1ll, vget_low_s8(_w0)); - int16x8_t _s3l = vmull_s8(_r1ll, vget_high_s8(_w0)); - _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); - _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); - _s2l = vmlal_s8(_s2l, _r1hl, vget_low_s8(_w2)); - _s3l = vmlal_s8(_s3l, _r1hl, vget_high_s8(_w2)); - - _sum0 = vpadalq_s16(_sum0, _s0l); - _sum1 = vpadalq_s16(_sum1, _s1l); - _sum2 = vpadalq_s16(_sum2, _s2l); - _sum3 = vpadalq_s16(_sum3, _s3l); - - int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); - int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); - - int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); - int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); - int16x8_t _s2h = vmull_s8(_r1lh, vget_low_s8(_w1)); - int16x8_t _s3h = vmull_s8(_r1lh, vget_high_s8(_w1)); - _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); - _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); - _s2h = vmlal_s8(_s2h, _r1hh, vget_low_s8(_w3)); - _s3h = vmlal_s8(_s3h, _r1hh, vget_high_s8(_w3)); - - _sum0 = vpadalq_s16(_sum0, _s0h); - _sum1 = 
vpadalq_s16(_sum1, _s1h); - _sum2 = vpadalq_s16(_sum2, _s2h); - _sum3 = vpadalq_s16(_sum3, _s3h); - - kptr += 64; - } - } - } - for (; q < inch; q++) - { - const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - // if (elempack == 1) - { - int8x8_t _r0 = vdup_n_s8(r0s[0]); - int8x8_t _r1 = vdup_n_s8(r1s[0]); - int8x8_t _w = vld1_s8(kptr); - int16x8_t _s0 = vmull_s8(_r0, _w); - int16x8_t _s1 = vmull_s8(_r1, _w); - _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); - _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); - _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); - - kptr += 8; - } - } - } - - if (out_elempack == 8) - { - vst1q_s32(outptr, _sum0); - vst1q_s32(outptr + 4, _sum1); - vst1q_s32(outptr + 8, _sum2); - vst1q_s32(outptr + 12, _sum3); - outptr += 16; - } - if (out_elempack == 4) - { - vst1q_s32(outptr, _sum0); - vst1q_s32(outptr + 4, _sum2); - vst1q_s32(outptr + M, _sum1); - vst1q_s32(outptr + M + 4, _sum3); - outptr += 8; - } - if (out_elempack == 1) - { - outptr[0] = vgetq_lane_s32(_sum0, 0); - outptr[1] = vgetq_lane_s32(_sum2, 0); - outptr[M] = vgetq_lane_s32(_sum0, 1); - outptr[M + 1] = vgetq_lane_s32(_sum2, 1); - outptr[M * 2] = vgetq_lane_s32(_sum0, 2); - outptr[M * 2 + 1] = vgetq_lane_s32(_sum2, 2); - outptr[M * 3] = vgetq_lane_s32(_sum0, 3); - outptr[M * 3 + 1] = vgetq_lane_s32(_sum2, 3); - outptr[M * 4] = vgetq_lane_s32(_sum1, 0); - outptr[M * 4 + 1] = vgetq_lane_s32(_sum3, 0); - outptr[M * 5] = vgetq_lane_s32(_sum1, 1); - outptr[M * 5 + 1] = vgetq_lane_s32(_sum3, 1); - outptr[M * 6] = vgetq_lane_s32(_sum1, 2); - outptr[M * 6 + 1] = vgetq_lane_s32(_sum3, 2); - outptr[M * 7] = vgetq_lane_s32(_sum1, 3); - outptr[M * 7 + 1] = vgetq_lane_s32(_sum3, 3); - outptr += 2; - } - } + // for (; ij + 1 < outw * outh; ij += 2) + // { + // const int i0 = ij / outw; + // const int i1 = (ij + 1) / outw; + // const int j0 = ij % outw; + // const int j1 = (ij + 1) % outw; + // // vl = 4; + + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + // int32x4_t _sum2 = vdupq_n_s32(0); + // int32x4_t _sum3 = vdupq_n_s32(0); + + // const signed char* kptr = weight_data_tm.channel(p / 8); + + // int q = 0; + // { + // for (; q + 7 < inch; q += 8) + // { + // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // int8x8_t _r0; + // int8x8_t _r1; + // if (elempack == 8) + // { + // _r0 = vld1_s8(r0s); + // _r1 = vld1_s8(r1s); + // } + // else // if (elempack == 1) + // { + // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + // _r0 = vld1_s8(tmp0); + // _r1 = vld1_s8(tmp1); + // } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + // int8x16_t _w2 = vld1q_s8(kptr + 32); + // int8x16_t _w3 = vld1q_s8(kptr + 48); + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + // int16x4_t _rr1 = 
vreinterpret_s16_s8(_r1); + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); + + // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + // int16x8_t _s2l = vmull_s8(_r1ll, vget_low_s8(_w0)); + // int16x8_t _s3l = vmull_s8(_r1ll, vget_high_s8(_w0)); + // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + // _s2l = vmlal_s8(_s2l, _r1hl, vget_low_s8(_w2)); + // _s3l = vmlal_s8(_s3l, _r1hl, vget_high_s8(_w2)); + + // _sum0 = vpadalq_s16(_sum0, _s0l); + // _sum1 = vpadalq_s16(_sum1, _s1l); + // _sum2 = vpadalq_s16(_sum2, _s2l); + // _sum3 = vpadalq_s16(_sum3, _s3l); + + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + // int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); + + // int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); + // int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + // int16x8_t _s2h = vmull_s8(_r1lh, vget_low_s8(_w1)); + // int16x8_t _s3h = vmull_s8(_r1lh, vget_high_s8(_w1)); + // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + // _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); + // _s2h = vmlal_s8(_s2h, _r1hh, vget_low_s8(_w3)); + // _s3h = vmlal_s8(_s3h, _r1hh, vget_high_s8(_w3)); + + // _sum0 = vpadalq_s16(_sum0, _s0h); + // _sum1 = vpadalq_s16(_sum1, _s1h); + // _sum2 = vpadalq_s16(_sum2, _s2h); + // _sum3 = vpadalq_s16(_sum3, _s3h); + + // kptr += 64; + // } + // } + // } + // for (; q < inch; q++) + // { + // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // // if (elempack == 1) + // { + // int8x8_t _r0 = vdup_n_s8(r0s[0]); + // int8x8_t _r1 = vdup_n_s8(r1s[0]); + // int8x8_t _w = vld1_s8(kptr); + // int16x8_t _s0 = vmull_s8(_r0, _w); + // int16x8_t _s1 = vmull_s8(_r1, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + // _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); + // _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); + + // kptr += 8; + // } + // } + // } + + // if (out_elempack == 8) + // { + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + 4, _sum1); + // vst1q_s32(outptr + 8, _sum2); + // vst1q_s32(outptr + 12, _sum3); + // outptr += 16; + // } + // if (out_elempack == 4) + // { + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + 4, _sum2); + // vst1q_s32(outptr + M, _sum1); + // vst1q_s32(outptr + M + 4, _sum3); + // outptr += 8; + // } + // if (out_elempack == 1) + // { + // outptr[0] = vgetq_lane_s32(_sum0, 0); + // outptr[1] = vgetq_lane_s32(_sum2, 0); + // outptr[M] = vgetq_lane_s32(_sum0, 1); + // outptr[M + 1] = vgetq_lane_s32(_sum2, 1); + // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + // outptr[M * 2 + 1] = vgetq_lane_s32(_sum2, 2); + // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + // outptr[M * 3 + 1] = vgetq_lane_s32(_sum2, 3); + // outptr[M * 4] = vgetq_lane_s32(_sum1, 0); + // outptr[M * 4 + 1] = vgetq_lane_s32(_sum3, 0); + // outptr[M 
* 5] = vgetq_lane_s32(_sum1, 1); + // outptr[M * 5 + 1] = vgetq_lane_s32(_sum3, 1); + // outptr[M * 6] = vgetq_lane_s32(_sum1, 2); + // outptr[M * 6 + 1] = vgetq_lane_s32(_sum3, 2); + // outptr[M * 7] = vgetq_lane_s32(_sum1, 3); + // outptr[M * 7 + 1] = vgetq_lane_s32(_sum3, 3); + // outptr += 2; + // } + // } for (; ij < outw * outh; ij++) { const int i = ij / outw; const int j = ij % outw; - int32x4_t _sum0 = vdupq_n_s32(0); - int32x4_t _sum1 = vdupq_n_s32(0); - int32x4_t _sum2 = vdupq_n_s32(0); - int32x4_t _sum3 = vdupq_n_s32(0); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + // int32x4_t _sum2 = vdupq_n_s32(0); + // int32x4_t _sum3 = vdupq_n_s32(0); + vl = 8; + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); const signed char* kptr = weight_data_tm.channel(p / 8); int q = 0; - { - for (; q + 7 < inch; q += 8) - { - const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - - int8x8_t _r0; - if (elempack == 8) - { - _r0 = vld1_s8(r0s); - } - else // if (elempack == 1) - { - signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - _r0 = vld1_s8(tmp); - } - - int8x16_t _w0 = vld1q_s8(kptr); - int8x16_t _w1 = vld1q_s8(kptr + 16); - int8x16_t _w2 = vld1q_s8(kptr + 32); - int8x16_t _w3 = vld1q_s8(kptr + 48); - - int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - - int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); - int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); - int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); - int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); - _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); - _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); - _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); - _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); - - _sum0 = vpadalq_s16(_sum0, _s0l); - _sum1 = vpadalq_s16(_sum1, _s1l); - _sum2 = vpadalq_s16(_sum2, _s0h); - _sum3 = vpadalq_s16(_sum3, _s1h); - - kptr += 64; - } - } - - { - _sum0 = vaddq_s32(_sum0, _sum2); - _sum1 = vaddq_s32(_sum1, _sum3); - } - } + // { + // for (; q + 7 < inch; q += 8) + // { + // const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + // if (elempack == 8) + // { + // _r0 = vld1_s8(r0s); + // } + // else // if (elempack == 1) + // { + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + // } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + // int8x16_t _w2 = vld1q_s8(kptr + 32); + // int8x16_t _w3 = vld1q_s8(kptr + 48); + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + // int16x8_t 
_s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); + // int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + // _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); + + // _sum0 = vpadalq_s16(_sum0, _s0l); + // _sum1 = vpadalq_s16(_sum1, _s1l); + // _sum2 = vpadalq_s16(_sum2, _s0h); + // _sum3 = vpadalq_s16(_sum3, _s1h); + + // kptr += 64; + // } + // } + + // { + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + // } + // } for (; q < inch; q++) { const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; @@ -668,11 +672,17 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // if (elempack == 1) { - int8x8_t _val = vdup_n_s8(r0s[0]); - int8x8_t _w = vld1_s8(kptr); - int16x8_t _s0 = vmull_s8(_val, _w); - _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + + vint8m1_t _val = vmv_v_x_i32m2(r0s[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i8m1(_val, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + + // int8x8_t _val = vdup_n_s8(r0s[0]); + // int8x8_t _w = vld1_s8(kptr); + // int16x8_t _s0 = vmull_s8(_val, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); kptr += 8; } @@ -681,26 +691,32 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const if (out_elempack == 8) { - vst1q_s32(outptr, _sum0); - vst1q_s32(outptr + 4, _sum1); + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + 4, _sum1); + vse32_v_i32m2(outptr, _sum01, vl); outptr += 8; } if (out_elempack == 4) { - vst1q_s32(outptr, _sum0); - vst1q_s32(outptr + M, _sum1); + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + M, _sum1); + vl = 4; + vse32_v_i32m1(outptr, vget_v_i32m2_i32m1(_sum01, 0), vl); + vse32_v_i32m1(outptr + M, vget_v_i32m2_i32m1(_sum01, 1), vl); + vl = 8; outptr += 4; } if (out_elempack == 1) { - outptr[0] = vgetq_lane_s32(_sum0, 0); - outptr[M] = vgetq_lane_s32(_sum0, 1); - outptr[M * 2] = vgetq_lane_s32(_sum0, 2); - outptr[M * 3] = vgetq_lane_s32(_sum0, 3); - outptr[M * 4] = vgetq_lane_s32(_sum1, 0); - outptr[M * 5] = vgetq_lane_s32(_sum1, 1); - outptr[M * 6] = vgetq_lane_s32(_sum1, 2); - outptr[M * 7] = vgetq_lane_s32(_sum1, 3); + vsse32_v_f32m2(outptr, M * sizeof(int), _sum01, vl); + // outptr[0] = vgetq_lane_s32(_sum0, 0); + // outptr[M] = vgetq_lane_s32(_sum0, 1); + // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + // outptr[M * 4] = vgetq_lane_s32(_sum1, 0); + // outptr[M * 5] = vgetq_lane_s32(_sum1, 1); + // outptr[M * 6] = vgetq_lane_s32(_sum1, 2); + // outptr[M * 7] = vgetq_lane_s32(_sum1, 3); outptr += 1; } } @@ -720,177 +736,189 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p / out_elempack); int ij = 0; - for (; ij + 1 < outw * outh; ij += 2) - { - const int i0 = ij / outw; - const int i1 = (ij + 1) / outw; - const int j0 = ij % outw; - const int j1 = (ij + 1) % outw; - - int32x4_t _sum0 = vdupq_n_s32(0); - int32x4_t _sum1 = vdupq_n_s32(0); - - const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); - - int q = 0; - { - for (; q + 7 < inch; q += 8) - { - const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - const 
signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - int8x8_t _r0; - int8x8_t _r1; - if (elempack == 8) - { - _r0 = vld1_s8(r0s); - _r1 = vld1_s8(r1s); - } - else // if (elempack == 1) - { - signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - _r0 = vld1_s8(tmp0); - _r1 = vld1_s8(tmp1); - } - - int8x16_t _w0 = vld1q_s8(kptr); - int8x16_t _w1 = vld1q_s8(kptr + 16); - - int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - int16x4_t _rr1 = vreinterpret_s16_s8(_r1); - - int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); - int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); - - int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); - int16x8_t _s1l = vmull_s8(_r1ll, vget_low_s8(_w0)); - int16x8_t _s0h = vmull_s8(_r0lh, vget_high_s8(_w0)); - int16x8_t _s1h = vmull_s8(_r1lh, vget_high_s8(_w0)); - - int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); - int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); - - _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w1)); - _s1l = vmlal_s8(_s1l, _r1hl, vget_low_s8(_w1)); - _s0h = vmlal_s8(_s0h, _r0hh, vget_high_s8(_w1)); - _s1h = vmlal_s8(_s1h, _r1hh, vget_high_s8(_w1)); - - _sum0 = vpadalq_s16(_sum0, _s0l); - _sum1 = vpadalq_s16(_sum1, _s1l); - _sum0 = vpadalq_s16(_sum0, _s0h); - _sum1 = vpadalq_s16(_sum1, _s1h); - - kptr += 32; - } - } - } - for (; q < inch; q++) - { - const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - // if (elempack == 1) - { - int8x8_t _r0 = vdup_n_s8(r0s[0]); - int8x8_t _r1 = vdup_n_s8(r1s[0]); - int8x8_t _r01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r1)).val[0]); - int8x8_t _w = vld1_s8(kptr); - int8x8_t _ww = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_w), vreinterpret_s32_s8(_w)).val[0]); - int16x8_t _s01 = vmull_s8(_r01, _ww); - _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01)); - _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01)); - - kptr += 4; - } - } - } - - if (out_elempack == 4) - { - vst1q_s32(outptr, _sum0); - vst1q_s32(outptr + 4, _sum1); - outptr += 8; - } - if (out_elempack == 1) - { - int32x4x2_t _sum01 = vzipq_s32(_sum0, _sum1); - vst1_s32(outptr, vget_low_s32(_sum01.val[0])); - vst1_s32(outptr + M, vget_high_s32(_sum01.val[0])); - vst1_s32(outptr + M * 2, vget_low_s32(_sum01.val[1])); - vst1_s32(outptr + M * 3, vget_high_s32(_sum01.val[1])); - outptr += 2; - } - } + // for (; ij + 1 < outw * outh; ij += 2) + // { + // const int i0 = ij / outw; + // const int i1 = (ij + 1) / outw; + // const int j0 = ij % outw; + // const int j1 = (ij + 1) % outw; + + // vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + + // const signed char* kptr = 
weight_data_tm.channel(p / 8 + (p % 8) / 4); + + // int q = 0; + // { + // for (; q + 7 < inch; q += 8) + // { + // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // int8x8_t _r0; + // int8x8_t _r1; + // if (elempack == 8) + // { + // _r0 = vld1_s8(r0s); + // _r1 = vld1_s8(r1s); + // } + // else // if (elempack == 1) + // { + // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + // _r0 = vld1_s8(tmp0); + // _r1 = vld1_s8(tmp1); + // } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + // int16x4_t _rr1 = vreinterpret_s16_s8(_r1); + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); + + // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _s1l = vmull_s8(_r1ll, vget_low_s8(_w0)); + // int16x8_t _s0h = vmull_s8(_r0lh, vget_high_s8(_w0)); + // int16x8_t _s1h = vmull_s8(_r1lh, vget_high_s8(_w0)); + + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + // int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); + + // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w1)); + // _s1l = vmlal_s8(_s1l, _r1hl, vget_low_s8(_w1)); + // _s0h = vmlal_s8(_s0h, _r0hh, vget_high_s8(_w1)); + // _s1h = vmlal_s8(_s1h, _r1hh, vget_high_s8(_w1)); + + // _sum0 = vpadalq_s16(_sum0, _s0l); + // _sum1 = vpadalq_s16(_sum1, _s1l); + // _sum0 = vpadalq_s16(_sum0, _s0h); + // _sum1 = vpadalq_s16(_sum1, _s1h); + + // kptr += 32; + // } + // } + // } + // for (; q < inch; q++) + // { + // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // // if (elempack == 1) + // { + // vint8m1_t _r0 = vmv_v_x_i8m1(r0s[0], vl); + // vint8m1_t _r1 = vmv_v_x_i8m1(r1s[0], vl); + + // // vint32m4_t _r01_int32 = vundefined_i32m4(); + + + // // int8x8_t _r0 = vdup_n_s8(r0s[0]); + // // int8x8_t _r1 = vdup_n_s8(r1s[0]); + // int8x8_t _r01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r1)).val[0]); + // int8x8_t _w = vld1_s8(kptr); + // int8x8_t _ww = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_w), vreinterpret_s32_s8(_w)).val[0]); + // int16x8_t _s01 = vmull_s8(_r01, _ww); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01)); + + // kptr += 4; + // } + // } + // } + + // if (out_elempack == 4) + // { + // vse32_v_i32m2(outptr, _sum01, vl); + // // vst1q_s32(outptr, _sum0); + // // vst1q_s32(outptr + 4, _sum1); + // outptr += 8; + // } + // if 
(out_elempack == 1) + // { + + + // // int32x4x2_t _sum01 = vzipq_s32(_sum0, _sum1); + // // vst1_s32(outptr, vget_low_s32(_sum01.val[0])); + // // vst1_s32(outptr + M, vget_high_s32(_sum01.val[0])); + // // vst1_s32(outptr + M * 2, vget_low_s32(_sum01.val[1])); + // // vst1_s32(outptr + M * 3, vget_high_s32(_sum01.val[1])); + // outptr += 2; + // } + // } for (; ij < outw * outh; ij++) { const int i = ij / outw; const int j = ij % outw; + vl = 4; - int32x4_t _sum0 = vdupq_n_s32(0); - int32x4_t _sum1 = vdupq_n_s32(0); + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); int q = 0; - { - for (; q + 7 < inch; q += 8) - { - const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - - int8x8_t _r0; - if (elempack == 8) - { - _r0 = vld1_s8(r0s); - } - else // if (elempack == 1) - { - signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - _r0 = vld1_s8(tmp); - } - - int8x16_t _w0 = vld1q_s8(kptr); - int8x16_t _w1 = vld1q_s8(kptr + 16); - - int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - - int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0)); - int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0)); - _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1)); - _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1)); - - _sum0 = vpadalq_s16(_sum0, _sl); - _sum1 = vpadalq_s16(_sum1, _sh); - - kptr += 32; - } - } - { - _sum0 = vaddq_s32(_sum0, _sum1); - } - } + // { + // for (; q + 7 < inch; q += 8) + // { + // const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + // if (elempack == 8) + // { + // _r0 = vld1_s8(r0s); + // } + // else // if (elempack == 1) + // { + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + // } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + // int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0)); + // _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1)); + // _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1)); + + // _sum0 = vpadalq_s16(_sum0, _sl); + // _sum1 = vpadalq_s16(_sum1, _sh); + + // kptr += 32; + // } + // } + // { + // _sum0 = vaddq_s32(_sum0, _sum1); + // } + // } for (; q < inch; q++) { const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; @@ -901,10 +929,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // if (elempack == 1) { - int8x8_t _val = vdup_n_s8(r0s[0]); - int8x8_t _w = vld1_s8(kptr); - int16x8_t _s0 = 
vmull_s8(_val, _w); - _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + vint8m1_t _val = vmv_v_x_i8m1(r0s[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i8m1(_val, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _val = vdup_n_s8(r0s[0]); + // int8x8_t _w = vld1_s8(kptr); + // int16x8_t _s0 = vmull_s8(_val, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); kptr += 4; } @@ -913,25 +945,27 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const if (out_elempack == 4) { - vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr, _sum0); + vse32_v_i32m1(outptr, _sum01, vl); outptr += 4; } if (out_elempack == 1) { - outptr[0] = vgetq_lane_s32(_sum0, 0); - outptr[M] = vgetq_lane_s32(_sum0, 1); - outptr[M * 2] = vgetq_lane_s32(_sum0, 2); - outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + vsse32_v_i32m2(outptr, M * sizeof(int), _sum01, vl); + // outptr[0] = vgetq_lane_s32(_sum0, 0); + // outptr[M] = vgetq_lane_s32(_sum0, 1); + // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); outptr += 1; } } } remain_outch_start += nn_outch * 4; nn_outch = (outch - remain_outch_start) / 2; -#else // __ARM_NEON +#else // __riscv_vector nn_outch = (outch - remain_outch_start) / 2; #pragma omp parallel for num_threads(opt.num_threads) -#endif // __ARM_NEON +#endif // __riscv_vector for (int pp = 0; pp < nn_outch; pp++) { const int p = remain_outch_start + pp * 2; @@ -945,111 +979,111 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr1 = top_blob.channel(p + 1); int ij = 0; - for (; ij + 1 < outw * outh; ij += 2) - { - const int i0 = ij / outw; - const int i1 = (ij + 1) / outw; - const int j0 = ij % outw; - const int j1 = (ij + 1) % outw; - - int sum00 = 0; - int sum01 = 0; - int sum10 = 0; - int sum11 = 0; - -#if __ARM_NEON - const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); -#else - const signed char* kptr = weight_data_tm.channel(p / 2); -#endif - - int q = 0; -#if __ARM_NEON - { - int32x4_t _sum01 = vdupq_n_s32(0); - int32x4_t _sum23 = vdupq_n_s32(0); - for (; q + 7 < inch; q += 8) - { - const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - int8x8_t _r0; - int8x8_t _r1; - if (elempack == 8) - { - _r0 = vld1_s8(r0s); - _r1 = vld1_s8(r1s); - } - else // if (elempack == 1) - { - signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - _r0 = vld1_s8(tmp0); - _r1 = vld1_s8(tmp1); - } - - int8x16_t _w0 = vld1q_s8(kptr); - - int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); - int32x2x2_t _rr1 = vzip_s32(vreinterpret_s32_s8(_r1), vreinterpret_s32_s8(_r1)); - int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); - int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); - int8x8_t _r1l = vreinterpret_s8_s32(_rr1.val[0]); - int8x8_t _r1h = vreinterpret_s8_s32(_rr1.val[1]); - - int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); - int16x8_t _s23 = vmull_s8(_r1l, vget_low_s8(_w0)); - _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); - _s23 = 
vmlal_s8(_s23, _r1h, vget_high_s8(_w0)); - - _sum01 = vpadalq_s16(_sum01, _s01); - _sum23 = vpadalq_s16(_sum23, _s23); - - kptr += 16; - } - } - int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); - int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23)); - sum00 += vget_lane_s32(_s0, 0); - sum01 += vget_lane_s32(_s1, 0); - sum10 += vget_lane_s32(_s0, 1); - sum11 += vget_lane_s32(_s1, 1); - } -#endif // __ARM_NEON - for (; q < inch; q++) - { - const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - // if (elempack == 1) - { - sum00 += r0s[0] * kptr[0]; - sum01 += r1s[0] * kptr[0]; - sum10 += r0s[0] * kptr[1]; - sum11 += r1s[0] * kptr[1]; - - kptr += 2; - } - } - } +// for (; ij + 1 < outw * outh; ij += 2) +// { +// const int i0 = ij / outw; +// const int i1 = (ij + 1) / outw; +// const int j0 = ij % outw; +// const int j1 = (ij + 1) % outw; + +// int sum00 = 0; +// int sum01 = 0; +// int sum10 = 0; +// int sum11 = 0; + +// #if __riscv_vector +// const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); +// #else +// const signed char* kptr = weight_data_tm.channel(p / 2); +// #endif - outptr0[0] = sum00; - outptr0[1] = sum01; - outptr1[0] = sum10; - outptr1[1] = sum11; - outptr0 += 2; - outptr1 += 2; - } +// int q = 0; +// #if __riscv_vector +// { +// int32x4_t _sum01 = vdupq_n_s32(0); +// int32x4_t _sum23 = vdupq_n_s32(0); +// for (; q + 7 < inch; q += 8) +// { +// const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; +// const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + +// for (int k = 0; k < maxk; k++) +// { +// const signed char* r0s = r0 + space_ofs[k]; +// const signed char* r1s = r1 + space_ofs[k]; + +// int8x8_t _r0; +// int8x8_t _r1; +// if (elempack == 8) +// { +// _r0 = vld1_s8(r0s); +// _r1 = vld1_s8(r1s); +// } +// else // if (elempack == 1) +// { +// signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; +// signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; +// _r0 = vld1_s8(tmp0); +// _r1 = vld1_s8(tmp1); +// } + +// int8x16_t _w0 = vld1q_s8(kptr); + +// int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); +// int32x2x2_t _rr1 = vzip_s32(vreinterpret_s32_s8(_r1), vreinterpret_s32_s8(_r1)); +// int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); +// int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); +// int8x8_t _r1l = vreinterpret_s8_s32(_rr1.val[0]); +// int8x8_t _r1h = vreinterpret_s8_s32(_rr1.val[1]); + +// int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); +// int16x8_t _s23 = vmull_s8(_r1l, vget_low_s8(_w0)); +// _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); +// _s23 = vmlal_s8(_s23, _r1h, vget_high_s8(_w0)); + +// _sum01 = vpadalq_s16(_sum01, _s01); +// _sum23 = vpadalq_s16(_sum23, _s23); + +// kptr += 16; +// } +// } +// int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); +// int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23)); +// sum00 += vget_lane_s32(_s0, 0); +// sum01 += vget_lane_s32(_s1, 0); +// sum10 += vget_lane_s32(_s0, 1); +// sum11 += vget_lane_s32(_s1, 1); +// } 
+// #endif // __riscv_vector +// for (; q < inch; q++) +// { +// const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; +// const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + +// for (int k = 0; k < maxk; k++) +// { +// const signed char* r0s = r0 + space_ofs[k]; +// const signed char* r1s = r1 + space_ofs[k]; + +// // if (elempack == 1) +// { +// sum00 += r0s[0] * kptr[0]; +// sum01 += r1s[0] * kptr[0]; +// sum10 += r0s[0] * kptr[1]; +// sum11 += r1s[0] * kptr[1]; + +// kptr += 2; +// } +// } +// } + +// outptr0[0] = sum00; +// outptr0[1] = sum01; +// outptr1[0] = sum10; +// outptr1[1] = sum11; +// outptr0 += 2; +// outptr1 += 2; +// } for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -1058,54 +1092,54 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int sum0 = 0; int sum1 = 0; -#if __ARM_NEON +#if __riscv_vector const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); #else const signed char* kptr = weight_data_tm.channel(p / 2); #endif int q = 0; -#if __ARM_NEON - { - int32x4_t _sum01 = vdupq_n_s32(0); - for (; q + 7 < inch; q += 8) - { - const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - - int8x8_t _r0; - if (elempack == 8) - { - _r0 = vld1_s8(r0s); - } - else // if (elempack == 1) - { - signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - _r0 = vld1_s8(tmp); - } - - int8x16_t _w0 = vld1q_s8(kptr); - - int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); - int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); - int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); - - int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); - _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); - - _sum01 = vpadalq_s16(_sum01, _s01); - - kptr += 16; - } - } - int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); - sum0 += vget_lane_s32(_s0, 0); - sum1 += vget_lane_s32(_s0, 1); - } -#endif // __ARM_NEON +#if __riscv_vector + // { + // int32x4_t _sum01 = vdupq_n_s32(0); + // for (; q + 7 < inch; q += 8) + // { + // const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + // if (elempack == 8) + // { + // _r0 = vld1_s8(r0s); + // } + // else // if (elempack == 1) + // { + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + // } + + // int8x16_t _w0 = vld1q_s8(kptr); + + // int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); + // int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); + // int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); + + // int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); + // _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); + + // _sum01 = vpadalq_s16(_sum01, _s01); + + // kptr += 16; + // } + // } + // int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); + // sum0 += vget_lane_s32(_s0, 0); + // sum1 += vget_lane_s32(_s0, 1); + // } +#endif // __riscv_vector for (; q < inch; q++) { const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; @@ -1136,105 +1170,105 @@ static void convolution_packed_int8(const Mat& 
bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p); int ij = 0; - for (; ij + 1 < outw * outh; ij += 2) - { - const int i0 = ij / outw; - const int i1 = (ij + 1) / outw; - const int j0 = ij % outw; - const int j1 = (ij + 1) % outw; - - int sum0 = 0; - int sum1 = 0; - -#if __ARM_NEON - const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); -#else - const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); -#endif - - int q = 0; -#if __ARM_NEON - { - int32x4_t _sum0 = vdupq_n_s32(0); - int32x4_t _sum1 = vdupq_n_s32(0); - int32x4_t _sum2 = vdupq_n_s32(0); - int32x4_t _sum3 = vdupq_n_s32(0); - for (; q + 7 < inch; q += 8) - { - const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - int8x8_t _r0; - int8x8_t _r1; - if (elempack == 8) - { - _r0 = vld1_s8(r0s); - _r1 = vld1_s8(r1s); - } - else // if (elempack == 1) - { - signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - _r0 = vld1_s8(tmp0); - _r1 = vld1_s8(tmp1); - } - - int8x8_t _w = vld1_s8(kptr); - - int16x8_t _s0 = vmull_s8(_r0, _w); - int16x8_t _s1 = vmull_s8(_r1, _w); - - _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); - _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); - _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); - - kptr += 8; - } - } - _sum0 = vaddq_s32(_sum0, _sum1); - _sum2 = vaddq_s32(_sum2, _sum3); -#if __aarch64__ - sum0 += vaddvq_s32(_sum0); - sum1 += vaddvq_s32(_sum2); -#else - int32x2_t _ss0 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0)); - int32x2_t _ss2 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2)); - _ss0 = vpadd_s32(_ss0, _ss2); - sum0 += vget_lane_s32(_ss0, 0); - sum1 += vget_lane_s32(_ss0, 1); -#endif - } -#endif // __ARM_NEON - for (; q < inch; q++) - { - const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - for (int k = 0; k < maxk; k++) - { - const signed char* r0s = r0 + space_ofs[k]; - const signed char* r1s = r1 + space_ofs[k]; - - // if (elempack == 1) - { - sum0 += r0s[0] * kptr[0]; - sum1 += r1s[0] * kptr[0]; - - kptr += 1; - } - } - } +// for (; ij + 1 < outw * outh; ij += 2) +// { +// const int i0 = ij / outw; +// const int i1 = (ij + 1) / outw; +// const int j0 = ij % outw; +// const int j1 = (ij + 1) % outw; + +// int sum0 = 0; +// int sum1 = 0; + +// #if __riscv_vector +// const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); +// #else +// const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); +// #endif - outptr[0] = sum0; - outptr[1] = sum1; - outptr += 2; - } +// int q = 0; +// #if __riscv_vector +// { +// int32x4_t _sum0 = vdupq_n_s32(0); +// int32x4_t _sum1 = vdupq_n_s32(0); +// int32x4_t _sum2 = vdupq_n_s32(0); +// int32x4_t _sum3 = vdupq_n_s32(0); +// for (; q + 7 < inch; q += 8) +// { +// const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; +// const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * 
stride_h) + j1 * stride_w * elempack; + +// for (int k = 0; k < maxk; k++) +// { +// const signed char* r0s = r0 + space_ofs[k]; +// const signed char* r1s = r1 + space_ofs[k]; + +// int8x8_t _r0; +// int8x8_t _r1; +// if (elempack == 8) +// { +// _r0 = vld1_s8(r0s); +// _r1 = vld1_s8(r1s); +// } +// else // if (elempack == 1) +// { +// signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; +// signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; +// _r0 = vld1_s8(tmp0); +// _r1 = vld1_s8(tmp1); +// } + +// int8x8_t _w = vld1_s8(kptr); + +// int16x8_t _s0 = vmull_s8(_r0, _w); +// int16x8_t _s1 = vmull_s8(_r1, _w); + +// _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); +// _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); +// _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); +// _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); + +// kptr += 8; +// } +// } +// _sum0 = vaddq_s32(_sum0, _sum1); +// _sum2 = vaddq_s32(_sum2, _sum3); +// #if __aarch64__ +// sum0 += vaddvq_s32(_sum0); +// sum1 += vaddvq_s32(_sum2); +// #else +// int32x2_t _ss0 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0)); +// int32x2_t _ss2 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2)); +// _ss0 = vpadd_s32(_ss0, _ss2); +// sum0 += vget_lane_s32(_ss0, 0); +// sum1 += vget_lane_s32(_ss0, 1); +// #endif +// } +// #endif // __riscv_vector +// for (; q < inch; q++) +// { +// const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; +// const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + +// for (int k = 0; k < maxk; k++) +// { +// const signed char* r0s = r0 + space_ofs[k]; +// const signed char* r1s = r1 + space_ofs[k]; + +// // if (elempack == 1) +// { +// sum0 += r0s[0] * kptr[0]; +// sum1 += r1s[0] * kptr[0]; + +// kptr += 1; +// } +// } +// } + +// outptr[0] = sum0; +// outptr[1] = sum1; +// outptr += 2; +// } for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -1242,17 +1276,19 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int sum = 0; -#if __ARM_NEON +#if __riscv_vector const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); #else const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); #endif int q = 0; -#if __ARM_NEON +#if __riscv_vector { - int32x4_t _sum0 = vdupq_n_s32(0); - int32x4_t _sum1 = vdupq_n_s32(0); + vl = 8; + vfloat32m2_t _sum01 = vfmv_v_f_f32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); for (; q + 7 < inch; q += 8) { const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; @@ -1261,37 +1297,44 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const { const signed char* r0s = r0 + space_ofs[k]; - int8x8_t _r0; + vint8m1_t _r0; + // int8x8_t _r0; if (elempack == 8) { - _r0 = vld1_s8(r0s); + // _r0 = vld1_s8(r0s); + _r0 = vle8_v_i8m1(r0s, vl); } else // if (elempack == 1) { - signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - _r0 = vld1_s8(tmp); + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); } - int8x8_t _w = vld1_s8(kptr); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i8m1(_r0, _w, vl), 
0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _w = vld1_s8(kptr); - int16x8_t _s0 = vmull_s8(_r0, _w); + // int16x8_t _s0 = vmull_s8(_r0, _w); - _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); kptr += 8; } } - int32x4_t _sum = vaddq_s32(_sum0, _sum1); -#if __aarch64__ - sum += vaddvq_s32(_sum); -#else - int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); - _ss = vpadd_s32(_ss, _ss); - sum += vget_lane_s32(_ss, 0); -#endif + // int32x4_t _sum = vaddq_s32(_sum0, _sum1); +// #if __aarch64__ + sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(vint32m1_t(), _sum01, vfmv_v_f_f32m1(sum, vl), vl)); + // sum += vaddvq_s32(_sum); +// #else +// int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); +// _ss = vpadd_s32(_ss, _ss); +// sum += vget_lane_s32(_ss, 0); +// #endif } -#endif // __ARM_NEON +#endif // __riscv_vector for (; q < inch; q++) { const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; From 2bb8cd26e05923ad533cfd630340fe27022d95df Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Sat, 17 Feb 2024 16:33:59 +0000 Subject: [PATCH 19/29] apply code-format changes --- src/layer/riscv/convolution_packed_int8.h | 475 +++++++++++----------- 1 file changed, 236 insertions(+), 239 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index c18aab2b10ab..2d624fcdbac9 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -26,23 +26,23 @@ void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, cons static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) { -// #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) -// #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 -// if (ncnn::cpu_support_arm_i8mm()) -// { -// convolution_transform_kernel_packed_int8_i8mm(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); -// return; -// } -// #endif - -// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD -// if (ncnn::cpu_support_arm_asimddp()) -// { -// convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); -// return; -// } -// #endif -// #endif + // #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) + // #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 + // if (ncnn::cpu_support_arm_i8mm()) + // { + // convolution_transform_kernel_packed_int8_i8mm(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); + // return; + // } + // #endif + + // #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD + // if (ncnn::cpu_support_arm_asimddp()) + // { + // convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); + // return; + // } + // #endif + // #endif const int maxk = kernel_w * kernel_h; @@ -377,14 +377,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // } // #endif -// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD -// if (ncnn::cpu_support_arm_asimddp()) -// { -// convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); -// 
return; -// } -// #endif -// #endif + // #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD + // if (ncnn::cpu_support_arm_asimddp()) + // { + // convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + // return; + // } + // #endif + // #endif int vl; const int w = bottom_blob.w; @@ -672,7 +672,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // if (elempack == 1) { - vint8m1_t _val = vmv_v_x_i32m2(r0s[0], vl); vint8m1_t _w = vle8_v_i8m1(kptr, vl); vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i8m1(_val, _w, vl), 0); @@ -828,7 +827,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // // vint32m4_t _r01_int32 = vundefined_i32m4(); - // // int8x8_t _r0 = vdup_n_s8(r0s[0]); // // int8x8_t _r1 = vdup_n_s8(r1s[0]); // int8x8_t _r01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r1)).val[0]); @@ -852,8 +850,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // } // if (out_elempack == 1) // { - - + // // int32x4x2_t _sum01 = vzipq_s32(_sum0, _sum1); // // vst1_s32(outptr, vget_low_s32(_sum01.val[0])); // // vst1_s32(outptr + M, vget_high_s32(_sum01.val[0])); @@ -979,111 +976,111 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr1 = top_blob.channel(p + 1); int ij = 0; -// for (; ij + 1 < outw * outh; ij += 2) -// { -// const int i0 = ij / outw; -// const int i1 = (ij + 1) / outw; -// const int j0 = ij % outw; -// const int j1 = (ij + 1) % outw; - -// int sum00 = 0; -// int sum01 = 0; -// int sum10 = 0; -// int sum11 = 0; - -// #if __riscv_vector -// const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); -// #else -// const signed char* kptr = weight_data_tm.channel(p / 2); -// #endif - -// int q = 0; -// #if __riscv_vector -// { -// int32x4_t _sum01 = vdupq_n_s32(0); -// int32x4_t _sum23 = vdupq_n_s32(0); -// for (; q + 7 < inch; q += 8) -// { -// const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; -// const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - -// for (int k = 0; k < maxk; k++) -// { -// const signed char* r0s = r0 + space_ofs[k]; -// const signed char* r1s = r1 + space_ofs[k]; - -// int8x8_t _r0; -// int8x8_t _r1; -// if (elempack == 8) -// { -// _r0 = vld1_s8(r0s); -// _r1 = vld1_s8(r1s); -// } -// else // if (elempack == 1) -// { -// signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; -// signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; -// _r0 = vld1_s8(tmp0); -// _r1 = vld1_s8(tmp1); -// } - -// int8x16_t _w0 = vld1q_s8(kptr); - -// int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); -// int32x2x2_t _rr1 = vzip_s32(vreinterpret_s32_s8(_r1), vreinterpret_s32_s8(_r1)); -// int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); -// int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); -// int8x8_t _r1l = vreinterpret_s8_s32(_rr1.val[0]); -// int8x8_t _r1h = vreinterpret_s8_s32(_rr1.val[1]); - -// int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); -// int16x8_t _s23 = vmull_s8(_r1l, vget_low_s8(_w0)); -// _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); -// _s23 = vmlal_s8(_s23, _r1h, vget_high_s8(_w0)); - -// _sum01 
= vpadalq_s16(_sum01, _s01); -// _sum23 = vpadalq_s16(_sum23, _s23); - -// kptr += 16; -// } -// } -// int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); -// int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23)); -// sum00 += vget_lane_s32(_s0, 0); -// sum01 += vget_lane_s32(_s1, 0); -// sum10 += vget_lane_s32(_s0, 1); -// sum11 += vget_lane_s32(_s1, 1); -// } -// #endif // __riscv_vector -// for (; q < inch; q++) -// { -// const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; -// const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - -// for (int k = 0; k < maxk; k++) -// { -// const signed char* r0s = r0 + space_ofs[k]; -// const signed char* r1s = r1 + space_ofs[k]; - -// // if (elempack == 1) -// { -// sum00 += r0s[0] * kptr[0]; -// sum01 += r1s[0] * kptr[0]; -// sum10 += r0s[0] * kptr[1]; -// sum11 += r1s[0] * kptr[1]; - -// kptr += 2; -// } -// } -// } - -// outptr0[0] = sum00; -// outptr0[1] = sum01; -// outptr1[0] = sum10; -// outptr1[1] = sum11; -// outptr0 += 2; -// outptr1 += 2; -// } + // for (; ij + 1 < outw * outh; ij += 2) + // { + // const int i0 = ij / outw; + // const int i1 = (ij + 1) / outw; + // const int j0 = ij % outw; + // const int j1 = (ij + 1) % outw; + + // int sum00 = 0; + // int sum01 = 0; + // int sum10 = 0; + // int sum11 = 0; + + // #if __riscv_vector + // const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); + // #else + // const signed char* kptr = weight_data_tm.channel(p / 2); + // #endif + + // int q = 0; + // #if __riscv_vector + // { + // int32x4_t _sum01 = vdupq_n_s32(0); + // int32x4_t _sum23 = vdupq_n_s32(0); + // for (; q + 7 < inch; q += 8) + // { + // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // int8x8_t _r0; + // int8x8_t _r1; + // if (elempack == 8) + // { + // _r0 = vld1_s8(r0s); + // _r1 = vld1_s8(r1s); + // } + // else // if (elempack == 1) + // { + // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + // _r0 = vld1_s8(tmp0); + // _r1 = vld1_s8(tmp1); + // } + + // int8x16_t _w0 = vld1q_s8(kptr); + + // int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); + // int32x2x2_t _rr1 = vzip_s32(vreinterpret_s32_s8(_r1), vreinterpret_s32_s8(_r1)); + // int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); + // int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); + // int8x8_t _r1l = vreinterpret_s8_s32(_rr1.val[0]); + // int8x8_t _r1h = vreinterpret_s8_s32(_rr1.val[1]); + + // int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); + // int16x8_t _s23 = vmull_s8(_r1l, vget_low_s8(_w0)); + // _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); + // _s23 = vmlal_s8(_s23, _r1h, vget_high_s8(_w0)); + + // _sum01 = vpadalq_s16(_sum01, _s01); + // _sum23 = vpadalq_s16(_sum23, _s23); + + // kptr += 16; + // } + // } + // int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); + // int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23)); + // sum00 += vget_lane_s32(_s0, 0); + // sum01 += 
vget_lane_s32(_s1, 0); + // sum10 += vget_lane_s32(_s0, 1); + // sum11 += vget_lane_s32(_s1, 1); + // } + // #endif // __riscv_vector + // for (; q < inch; q++) + // { + // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // // if (elempack == 1) + // { + // sum00 += r0s[0] * kptr[0]; + // sum01 += r1s[0] * kptr[0]; + // sum10 += r0s[0] * kptr[1]; + // sum11 += r1s[0] * kptr[1]; + + // kptr += 2; + // } + // } + // } + + // outptr0[0] = sum00; + // outptr0[1] = sum01; + // outptr1[0] = sum10; + // outptr1[1] = sum11; + // outptr0 += 2; + // outptr1 += 2; + // } for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -1170,105 +1167,105 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p); int ij = 0; -// for (; ij + 1 < outw * outh; ij += 2) -// { -// const int i0 = ij / outw; -// const int i1 = (ij + 1) / outw; -// const int j0 = ij % outw; -// const int j1 = (ij + 1) % outw; - -// int sum0 = 0; -// int sum1 = 0; - -// #if __riscv_vector -// const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); -// #else -// const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); -// #endif - -// int q = 0; -// #if __riscv_vector -// { -// int32x4_t _sum0 = vdupq_n_s32(0); -// int32x4_t _sum1 = vdupq_n_s32(0); -// int32x4_t _sum2 = vdupq_n_s32(0); -// int32x4_t _sum3 = vdupq_n_s32(0); -// for (; q + 7 < inch; q += 8) -// { -// const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; -// const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - -// for (int k = 0; k < maxk; k++) -// { -// const signed char* r0s = r0 + space_ofs[k]; -// const signed char* r1s = r1 + space_ofs[k]; - -// int8x8_t _r0; -// int8x8_t _r1; -// if (elempack == 8) -// { -// _r0 = vld1_s8(r0s); -// _r1 = vld1_s8(r1s); -// } -// else // if (elempack == 1) -// { -// signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; -// signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; -// _r0 = vld1_s8(tmp0); -// _r1 = vld1_s8(tmp1); -// } - -// int8x8_t _w = vld1_s8(kptr); - -// int16x8_t _s0 = vmull_s8(_r0, _w); -// int16x8_t _s1 = vmull_s8(_r1, _w); - -// _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); -// _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); -// _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); -// _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); - -// kptr += 8; -// } -// } -// _sum0 = vaddq_s32(_sum0, _sum1); -// _sum2 = vaddq_s32(_sum2, _sum3); -// #if __aarch64__ -// sum0 += vaddvq_s32(_sum0); -// sum1 += vaddvq_s32(_sum2); -// #else -// int32x2_t _ss0 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0)); -// int32x2_t _ss2 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2)); -// _ss0 = vpadd_s32(_ss0, _ss2); -// sum0 += vget_lane_s32(_ss0, 0); -// sum1 += vget_lane_s32(_ss0, 1); -// #endif -// } -// #endif // __riscv_vector -// for (; q < inch; q++) -// { -// const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; -// const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - -// for (int k = 
0; k < maxk; k++) -// { -// const signed char* r0s = r0 + space_ofs[k]; -// const signed char* r1s = r1 + space_ofs[k]; - -// // if (elempack == 1) -// { -// sum0 += r0s[0] * kptr[0]; -// sum1 += r1s[0] * kptr[0]; - -// kptr += 1; -// } -// } -// } - -// outptr[0] = sum0; -// outptr[1] = sum1; -// outptr += 2; -// } + // for (; ij + 1 < outw * outh; ij += 2) + // { + // const int i0 = ij / outw; + // const int i1 = (ij + 1) / outw; + // const int j0 = ij % outw; + // const int j1 = (ij + 1) % outw; + + // int sum0 = 0; + // int sum1 = 0; + + // #if __riscv_vector + // const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); + // #else + // const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); + // #endif + + // int q = 0; + // #if __riscv_vector + // { + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + // int32x4_t _sum2 = vdupq_n_s32(0); + // int32x4_t _sum3 = vdupq_n_s32(0); + // for (; q + 7 < inch; q += 8) + // { + // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // int8x8_t _r0; + // int8x8_t _r1; + // if (elempack == 8) + // { + // _r0 = vld1_s8(r0s); + // _r1 = vld1_s8(r1s); + // } + // else // if (elempack == 1) + // { + // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; + // _r0 = vld1_s8(tmp0); + // _r1 = vld1_s8(tmp1); + // } + + // int8x8_t _w = vld1_s8(kptr); + + // int16x8_t _s0 = vmull_s8(_r0, _w); + // int16x8_t _s1 = vmull_s8(_r1, _w); + + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + // _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); + // _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); + + // kptr += 8; + // } + // } + // _sum0 = vaddq_s32(_sum0, _sum1); + // _sum2 = vaddq_s32(_sum2, _sum3); + // #if __aarch64__ + // sum0 += vaddvq_s32(_sum0); + // sum1 += vaddvq_s32(_sum2); + // #else + // int32x2_t _ss0 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0)); + // int32x2_t _ss2 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2)); + // _ss0 = vpadd_s32(_ss0, _ss2); + // sum0 += vget_lane_s32(_ss0, 0); + // sum1 += vget_lane_s32(_ss0, 1); + // #endif + // } + // #endif // __riscv_vector + // for (; q < inch; q++) + // { + // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + + // for (int k = 0; k < maxk; k++) + // { + // const signed char* r0s = r0 + space_ofs[k]; + // const signed char* r1s = r1 + space_ofs[k]; + + // // if (elempack == 1) + // { + // sum0 += r0s[0] * kptr[0]; + // sum1 += r1s[0] * kptr[0]; + + // kptr += 1; + // } + // } + // } + + // outptr[0] = sum0; + // outptr[1] = sum1; + // outptr += 2; + // } for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -1325,14 +1322,14 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } } // int32x4_t _sum = vaddq_s32(_sum0, _sum1); -// #if __aarch64__ + // #if __aarch64__ sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(vint32m1_t(), _sum01, 
vfmv_v_f_f32m1(sum, vl), vl)); // sum += vaddvq_s32(_sum); -// #else -// int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); -// _ss = vpadd_s32(_ss, _ss); -// sum += vget_lane_s32(_ss, 0); -// #endif + // #else + // int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); + // _ss = vpadd_s32(_ss, _ss); + // sum += vget_lane_s32(_ss, 0); + // #endif } #endif // __riscv_vector for (; q < inch; q++) From ca99b50ead0eabe8ac44c75ae4ce2416974a967c Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sun, 18 Feb 2024 00:56:45 +0800 Subject: [PATCH 20/29] try to add pack8 --- src/layer/riscv/convolution_packed_int8.h | 103 ++++++++++++---------- 1 file changed, 55 insertions(+), 48 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index c18aab2b10ab..058d65b523cc 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -608,60 +608,67 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const const signed char* kptr = weight_data_tm.channel(p / 8); int q = 0; - // { - // for (; q + 7 < inch; q += 8) - // { - // const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; - // int8x8_t _r0; - // if (elempack == 8) - // { - // _r0 = vld1_s8(r0s); - // } - // else // if (elempack == 1) - // { - // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - // _r0 = vld1_s8(tmp); - // } + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl);` + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } - // int8x16_t _w0 = vld1q_s8(kptr); - // int8x16_t _w1 = vld1q_s8(kptr + 16); - // int8x16_t _w2 = vld1q_s8(kptr + 32); - // int8x16_t _w3 = vld1q_s8(kptr + 48); + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + // int8x16_t _w2 = vld1q_s8(kptr + 32); + // int8x16_t _w3 = vld1q_s8(kptr + 48); - // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); - // int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); - // int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); - // int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); - // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); - // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); - // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); - // _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); - - // _sum0 = vpadalq_s16(_sum0, _s0l); - // _sum1 = vpadalq_s16(_sum1, _s1l); - // _sum2 = vpadalq_s16(_sum2, _s0h); - // _sum3 = vpadalq_s16(_sum3, _s1h); - - // kptr += 64; - // } - // } + - 
// { - // _sum0 = vaddq_s32(_sum0, _sum2); - // _sum1 = vaddq_s32(_sum1, _sum3); - // } - // } + int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); + int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); + + _sum0 = vpadalq_s16(_sum0, _s0l); + _sum1 = vpadalq_s16(_sum1, _s1l); + _sum2 = vpadalq_s16(_sum2, _s0h); + _sum3 = vpadalq_s16(_sum3, _s1h); + + kptr += 64; + } + } + + { + _sum0 = vaddq_s32(_sum0, _sum2); + _sum1 = vaddq_s32(_sum1, _sum3); + } + } for (; q < inch; q++) { const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; From c69da25e092ea799022e40424b8af5f6432333d3 Mon Sep 17 00:00:00 2001 From: Xinyu Yang Date: Sun, 18 Feb 2024 12:53:32 +0800 Subject: [PATCH 21/29] try to handle vpadalq_s16 --- src/layer/riscv/convolution_packed_int8.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index 0e2af9bfeee6..b7df03bbdd3e 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -621,7 +621,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const vint8m1_t _r0; if (elempack == 8) { - _r0 = vle8_v_i8m1(r0s, vl);` + _r0 = vle8_v_i8m1(r0s, vl); // _r0 = vld1_s8(r0s); } else // if (elempack == 1) From 41d08e89d3dd982e163a8116806fb94b0fd8d7ec Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Sun, 18 Feb 2024 04:54:54 +0000 Subject: [PATCH 22/29] apply code-format changes --- src/layer/riscv/convolution_packed_int8.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index b7df03bbdd3e..1db9c5dbae48 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -631,7 +631,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // _r0 = vld1_s8(tmp); } - // int8x16_t _w0 = vld1q_s8(kptr); // int8x16_t _w1 = vld1q_s8(kptr + 16); // int8x16_t _w2 = vld1q_s8(kptr + 32); @@ -639,8 +638,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - - int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); From 66542ff15aa354c657c94d54a7d446edc7fa57ee Mon Sep 17 00:00:00 2001 From: yxy Date: Sun, 18 Feb 2024 14:44:34 +0000 Subject: [PATCH 23/29] finish kernel. 
pass test --- src/layer/riscv/convolution_packed_int8.h | 361 ++++++++++++++-------- src/layer/riscv/convolution_riscv.cpp | 18 +- 2 files changed, 248 insertions(+), 131 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index 1db9c5dbae48..f3ac93d8fc7a 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -12,17 +12,17 @@ // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -#if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) -#if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 -void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); -void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); -#endif - -#if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD -void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); -void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); -#endif -#endif +// #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) +// #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 +// void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +// void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +// #endif + +// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD +// void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +// void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +// #endif +// #endif static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) { @@ -602,8 +602,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // int32x4_t _sum1 = vdupq_n_s32(0); // int32x4_t _sum2 = vdupq_n_s32(0); // int32x4_t _sum3 = vdupq_n_s32(0); + vl = 8; vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum23 = vmv_v_x_i32m2(0, vl); const signed char* kptr = weight_data_tm.channel(p / 8); @@ -635,35 +637,81 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // int8x16_t _w1 = vld1q_s8(kptr + 16); // int8x16_t _w2 = vld1q_s8(kptr + 32); // int8x16_t _w3 = vld1q_s8(kptr + 48); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vint8m1_t _w1 = vle8_v_i8m1(kptr + 16, vl); + vint8m1_t _w2 = vle8_v_i8m1(kptr + 32, vl); + vint8m1_t _w3 = vle8_v_i8m1(kptr + 48, vl); - // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + vl = 8; - int8x8_t _r0ll = 
vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - - int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); - int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); - int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); - int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); - _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); - _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); - _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); - _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); - - _sum0 = vpadalq_s16(_sum0, _s0l); - _sum1 = vpadalq_s16(_sum1, _s1l); - _sum2 = vpadalq_s16(_sum2, _s0h); - _sum3 = vpadalq_s16(_sum3, _s1h); + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + vint16m1_t _rr0 = vreinterpret_v_i8m1_i16m1(_r0); + + vint8m1_t _r0ll = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 0, vl)); + vint8m1_t _r0lh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 1, vl)); + vint8m1_t _r0hl = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 2, vl)); + vint8m1_t _r0hh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 3, vl)); + + // uint8_t mask[8] = {8, 9, 10, 11, 12, 13, 14, 15}; + // vuint8m1_t _index = vle8_v_u8m1(mask, vl); + + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + vint16m2_t _s0l_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); + vint16m2_t _s1l_m2 = vwmul_vv_i16m2(_r0ll, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + vint16m2_t _s0h_m2 = vwmul_vv_i16m2(_r0lh, _w1, vl); + vint16m2_t _s1h_m2 = vwmul_vv_i16m2(_r0lh, vslidedown_vx_i8m1(_w1, _w1, 8, vl), vl); + + // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + // int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); + // int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + + // vint16m1_t _s0l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl), 0); + // vint16m1_t _s1l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1l_m2, _r0hl, vrgather_vv_i8m1(_w2, _index, vl), vl), 0); + // vint16m1_t _s2l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl), 0); + // vint16m1_t _s3l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1h_m2, _r0hh, vrgather_vv_i8m1(_w3, _index, vl), vl), 0); + + _s0l_m2 = vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl); + _s1l_m2 = vwmacc_vv_i16m2(_s1l_m2, _r0hl, vslidedown_vx_i8m1(_w2, _w2, 8, vl), vl); + _s0h_m2 = vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl); + _s1h_m2 = vwmacc_vv_i16m2(_s1h_m2, _r0hh, vslidedown_vx_i8m1(_w3, _w2, 8, vl), vl); + + // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + // _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); + + vint16m2_t _s01l = vset_v_i16m1_i16m2(_s0l_m2, 1, vget_v_i16m2_i16m1(_s1l_m2, 0)); + vint16m2_t _s01h = vset_v_i16m1_i16m2(_s0h_m2, 1, vget_v_i16m2_i16m1(_s1h_m2, 0)); + uint16_t odd_index[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + uint16_t even_index[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + vuint16m2_t _odd_index = vle16_v_u16m2(odd_index, vl); + vuint16m2_t _even_index = vle16_v_u16m2(even_index, vl); + + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01l, _odd_index, vl), 0), vl); + _sum01 = 
vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01l, _even_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01h, _odd_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01h, _even_index, vl), 0), vl); + + // _sum0 = vpadalq_s16(_sum0, _s0l); + // _sum1 = vpadalq_s16(_sum1, _s1l); + // _sum2 = vpadalq_s16(_sum2, _s0h); + // _sum3 = vpadalq_s16(_sum3, _s1h); kptr += 64; } } { - _sum0 = vaddq_s32(_sum0, _sum2); - _sum1 = vaddq_s32(_sum1, _sum3); + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + // _sum01 = vadd_vv_i32m2(_sum01, _sum23, vl); } } for (; q < inch; q++) @@ -676,11 +724,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // if (elempack == 1) { - vint8m1_t _val = vmv_v_x_i32m2(r0s[0], vl); + vint8m1_t _val = vmv_v_x_i8m1(r0s[0], vl); vint8m1_t _w = vle8_v_i8m1(kptr, vl); - vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i8m1(_val, _w, vl), 0); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); - // int8x8_t _val = vdup_n_s8(r0s[0]); // int8x8_t _w = vld1_s8(kptr); // int16x8_t _s0 = vmull_s8(_val, _w); @@ -711,7 +758,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } if (out_elempack == 1) { - vsse32_v_f32m2(outptr, M * sizeof(int), _sum01, vl); + vsse32_v_i32m2(outptr, M * sizeof(int), _sum01, vl); // outptr[0] = vgetq_lane_s32(_sum0, 0); // outptr[M] = vgetq_lane_s32(_sum0, 1); // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); @@ -876,50 +923,86 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); int q = 0; - // { - // for (; q + 7 < inch; q += 8) - // { - // const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - - // int8x8_t _r0; - // if (elempack == 8) - // { - // _r0 = vld1_s8(r0s); - // } - // else // if (elempack == 1) - // { - // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - // _r0 = vld1_s8(tmp); - // } - - // int8x16_t _w0 = vld1q_s8(kptr); - // int8x16_t _w1 = vld1q_s8(kptr + 16); - - // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - - // int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0)); - // int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0)); - // _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1)); - // _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1)); - - // _sum0 = vpadalq_s16(_sum0, _sl); - // _sum1 = vpadalq_s16(_sum1, _sh); - - // kptr += 32; - // } - // } - // { - // _sum0 = vaddq_s32(_sum0, _sum1); - // } - // } + { + vl = 8; + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 
2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vint8m1_t _w1 = vle8_v_i8m1(kptr + 16, vl); + vl = 8; + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + vint16m1_t _rr0 = vreinterpret_v_i8m1_i16m1(_r0); + + vint8m1_t _r0ll = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 0, vl)); + vint8m1_t _r0lh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 1, vl)); + vint8m1_t _r0hl = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 2, vl)); + vint8m1_t _r0hh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 3, vl)); + + vint16m2_t _sl_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); + vint16m2_t _sh_m2 = vwmul_vv_i16m2(_r0lh, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + _sl_m2 = vwmacc_vv_i16m2(_sl_m2, _r0hl, _w1, vl); + _sh_m2 = vwmacc_vv_i16m2(_sh_m2, _r0hh, vslidedown_vx_i8m1(_w1, _w1, 8, vl), vl); + + vint16m1_t _sl = vget_v_i16m2_i16m1(_sl_m2, 0); + vint16m1_t _sh = vget_v_i16m2_i16m1(_sh_m2, 0); + + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + // int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0)); + // _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1)); + // _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1)); + vl = 4; + + uint16_t odd_index[4] = {1, 3, 5, 7}; + uint16_t even_index[4] = {0, 2, 4, 6}; + vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); + vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); + + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sl, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sl, _even_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sh, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sh, _even_index, vl), vl); + + // _sum0 = vpadalq_s16(_sum0, _sl); + // _sum1 = vpadalq_s16(_sum1, _sh); + + + kptr += 32; + } + } + // { + // _sum0 = vaddq_s32(_sum0, _sum1); + // } + } for (; q < inch; q++) { const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; @@ -932,7 +1015,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const { vint8m1_t _val = vmv_v_x_i8m1(r0s[0], vl); vint8m1_t _w = vle8_v_i8m1(kptr, vl); - vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i8m1(_val, _w, vl), 0); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); // int8x8_t _val = vdup_n_s8(r0s[0]); // int8x8_t _w = vld1_s8(kptr); @@ -943,11 +1026,12 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } } } + vl = 4; if (out_elempack == 4) { // vst1q_s32(outptr, _sum0); - vse32_v_i32m1(outptr, _sum01, vl); + vse32_v_i32m2(outptr, _sum01, vl); outptr += 4; } if (out_elempack == 1) @@ -1101,45 +1185,74 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int q = 0; #if __riscv_vector - // { - // int32x4_t _sum01 = vdupq_n_s32(0); - // for (; q + 7 < inch; q += 8) - // { - // const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = 
r0 + space_ofs[k]; - - // int8x8_t _r0; - // if (elempack == 8) - // { - // _r0 = vld1_s8(r0s); - // } - // else // if (elempack == 1) - // { - // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - // _r0 = vld1_s8(tmp); - // } - - // int8x16_t _w0 = vld1q_s8(kptr); - - // int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); - // int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); - // int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); - - // int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); - // _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); - - // _sum01 = vpadalq_s16(_sum01, _s01); - - // kptr += 16; - // } - // } - // int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); - // sum0 += vget_lane_s32(_s0, 0); - // sum1 += vget_lane_s32(_s0, 1); - // } + { + // int32x4_t _sum01 = vdupq_n_s32(0); + vl = 4; + uint16_t odd_index[4] = {1, 3, 5, 7}; + uint16_t even_index[4] = {0, 2, 4, 6}; + vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); + vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + for (; q + 7 < inch; q += 8) + { + vl = 8; + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vl = 8; + vint8m1_t _r0l = vslideup_vx_i8m1(_r0, _r0, 4, vl); + vint8m1_t _r0h = vslidedown_vx_i8m1(_r0, _r0, 4, vl); + + // vint32m1_t _r0_i16 = vreinterpret_v_i32m1_i8m1(_r0); + + // int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); + // int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); + // int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); + + vint16m2_t _s01_m2 = vwmul_vv_i16m2(_r0l, _w0, vl); + _s01_m2 = vwmacc_vv_i16m2(_s01_m2, _r0h, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + vint16m1_t _s01 = vget_v_i16m2_i16m1(_s01_m2, 0); + + vl = 4; + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _even_index, vl), vl); + + // int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); + // _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); + // _sum01 = vpadalq_s16(_sum01, _s01); + + kptr += 16; + } + } + int res[4] = {0, 0, 0, 0}; + vl = 4; + vse32_v_i32m2(res, _sum01, vl); + sum0 += (res[0] + res[1]); + sum1 += (res[2] + res[3]); + // int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); + // sum0 += vget_lane_s32(_s0, 0); + // sum1 += vget_lane_s32(_s0, 1); + } #endif // __riscv_vector for (; q < inch; q++) { @@ -1287,7 +1400,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const #if __riscv_vector { vl = 8; - vfloat32m2_t _sum01 = vfmv_v_f_f32m2(0, vl); + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); // int32x4_t _sum0 = vdupq_n_s32(0); // int32x4_t _sum1 = vdupq_n_s32(0); for (; q + 7 < inch; q += 8) @@ -1313,7 +1426,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } vint8m1_t _w = 
vle8_v_i8m1(kptr, vl); - vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i8m1(_r0, _w, vl), 0); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_r0, _w, vl), 0); _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); // int8x8_t _w = vld1_s8(kptr); @@ -1327,7 +1440,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } // int32x4_t _sum = vaddq_s32(_sum0, _sum1); // #if __aarch64__ - sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(vint32m1_t(), _sum01, vfmv_v_f_f32m1(sum, vl), vl)); + sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(vint32m1_t(), _sum01, vmv_v_x_i32m1(sum, vl), vl)); // sum += vaddvq_s32(_sum); // #else // int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index dc8a2e987eba..37661157c77e 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -35,6 +35,10 @@ namespace ncnn { #include "convolution_1x1.h" #include "convolution_3x3.h" +#if NCNN_INT8 +#include "convolution_packed_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolution_packn.h" #include "convolution_pack1ton.h" @@ -1228,13 +1232,13 @@ int Convolution_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const if (top_blob_int32.empty()) return -100; - int _nT = nT ? nT : opt.num_threads; - if (nT != 0 && opt.num_threads != nT) - { - // force num_threads the same as in create_pipeline - // so we could use pre-packed A/B from the same tile config - NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); - } + // int _nT = nT ? nT : opt.num_threads; + // if (nT != 0 && opt.num_threads != nT) + // { + // // force num_threads the same as in create_pipeline + // // so we could use pre-packed A/B from the same tile config + // NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); + // } #if 0 if (opt.use_winograd_convolution && prefer_winograd) { From c6d272615919ad699684abae39b5188def3db658 Mon Sep 17 00:00:00 2001 From: yxy Date: Sun, 18 Feb 2024 15:08:52 +0000 Subject: [PATCH 24/29] use new kernel --- src/layer/riscv/convolution_riscv.cpp | 45 ++++++++++++++------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index 37661157c77e..6aaea2b90fdf 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -138,7 +138,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -263,27 +263,28 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return 
Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8(bottom_blob, top_blob, opt); + // Mat bottom_blob_unpacked = bottom_blob; + // if (bottom_blob.elempack != 1) + // { + // Option opt_pack1 = opt; + // opt_pack1.blob_allocator = opt.workspace_allocator; + + // convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + // } + + // Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; + // if (bottom_blob_unpacked.elembits() == 16) + // { + // Option opt_pack1 = opt; + // opt_pack1.blob_allocator = opt.workspace_allocator; + + // cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); + // } + + // Option opt_unpacked = opt; + // opt_unpacked.use_packing_layout = false; + // return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif From 1f6ed12719f2824b607613e867324535518cc2b4 Mon Sep 17 00:00:00 2001 From: yxy Date: Sun, 18 Feb 2024 16:42:36 +0000 Subject: [PATCH 25/29] fix kernel bug --- src/layer/riscv/convolution_packed_int8.h | 534 +--------------------- 1 file changed, 24 insertions(+), 510 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index f3ac93d8fc7a..c8130b64d4d4 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -26,24 +26,6 @@ static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) { - // #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) - // #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 - // if (ncnn::cpu_support_arm_i8mm()) - // { - // convolution_transform_kernel_packed_int8_i8mm(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); - // return; - // } - // #endif - - // #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD - // if (ncnn::cpu_support_arm_asimddp()) - // { - // convolution_transform_kernel_packed_int8_asimddp(kernel, kernel_tm, inch, outch, kernel_w, kernel_h); - // return; - // } - // #endif - // #endif - const int maxk = kernel_w * kernel_h; // src = kw-kh-inch-outch @@ -437,162 +419,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p / out_elempack); int ij = 0; - // for (; ij + 1 < outw * outh; ij += 2) - // { - // const int i0 = ij / outw; - // const int i1 = (ij + 1) / outw; - // const int j0 = ij % outw; - // const int j1 = (ij + 1) % outw; - // // vl = 4; - - // int32x4_t _sum0 = vdupq_n_s32(0); - // int32x4_t _sum1 = vdupq_n_s32(0); - // int32x4_t _sum2 = vdupq_n_s32(0); - // int32x4_t _sum3 = vdupq_n_s32(0); - - // const signed char* kptr = weight_data_tm.channel(p / 8); - - // int q = 0; - // { - // for (; q + 7 < inch; q += 8) - // { - // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // int8x8_t _r0; - // int8x8_t _r1; - // if (elempack == 8) - // { - // _r0 = vld1_s8(r0s); - // _r1 = vld1_s8(r1s); - // } - // else // if (elempack == 1) - // { - // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - // signed char tmp1[8] = 
{r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - // _r0 = vld1_s8(tmp0); - // _r1 = vld1_s8(tmp1); - // } - - // int8x16_t _w0 = vld1q_s8(kptr); - // int8x16_t _w1 = vld1q_s8(kptr + 16); - // int8x16_t _w2 = vld1q_s8(kptr + 32); - // int8x16_t _w3 = vld1q_s8(kptr + 48); - - // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - // int16x4_t _rr1 = vreinterpret_s16_s8(_r1); - - // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - // int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); - // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - // int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); - - // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); - // int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); - // int16x8_t _s2l = vmull_s8(_r1ll, vget_low_s8(_w0)); - // int16x8_t _s3l = vmull_s8(_r1ll, vget_high_s8(_w0)); - // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); - // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); - // _s2l = vmlal_s8(_s2l, _r1hl, vget_low_s8(_w2)); - // _s3l = vmlal_s8(_s3l, _r1hl, vget_high_s8(_w2)); - - // _sum0 = vpadalq_s16(_sum0, _s0l); - // _sum1 = vpadalq_s16(_sum1, _s1l); - // _sum2 = vpadalq_s16(_sum2, _s2l); - // _sum3 = vpadalq_s16(_sum3, _s3l); - - // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - // int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); - // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - // int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); - - // int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); - // int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); - // int16x8_t _s2h = vmull_s8(_r1lh, vget_low_s8(_w1)); - // int16x8_t _s3h = vmull_s8(_r1lh, vget_high_s8(_w1)); - // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); - // _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); - // _s2h = vmlal_s8(_s2h, _r1hh, vget_low_s8(_w3)); - // _s3h = vmlal_s8(_s3h, _r1hh, vget_high_s8(_w3)); - - // _sum0 = vpadalq_s16(_sum0, _s0h); - // _sum1 = vpadalq_s16(_sum1, _s1h); - // _sum2 = vpadalq_s16(_sum2, _s2h); - // _sum3 = vpadalq_s16(_sum3, _s3h); - - // kptr += 64; - // } - // } - // } - // for (; q < inch; q++) - // { - // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // // if (elempack == 1) - // { - // int8x8_t _r0 = vdup_n_s8(r0s[0]); - // int8x8_t _r1 = vdup_n_s8(r1s[0]); - // int8x8_t _w = vld1_s8(kptr); - // int16x8_t _s0 = vmull_s8(_r0, _w); - // int16x8_t _s1 = vmull_s8(_r1, _w); - // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); - // _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); - // _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); - - // kptr += 8; - // } - // } - // } - - // if (out_elempack == 8) - // { - // vst1q_s32(outptr, _sum0); - // vst1q_s32(outptr + 4, _sum1); - // vst1q_s32(outptr + 8, _sum2); - // vst1q_s32(outptr + 12, _sum3); - // outptr += 16; - // } - // if (out_elempack == 4) - // { - // vst1q_s32(outptr, _sum0); - // vst1q_s32(outptr + 4, _sum2); - // vst1q_s32(outptr + M, _sum1); - // vst1q_s32(outptr + M + 4, _sum3); - // outptr += 8; - // } - // if (out_elempack == 1) - // { - // outptr[0] = vgetq_lane_s32(_sum0, 0); - // outptr[1] = vgetq_lane_s32(_sum2, 0); - // 
outptr[M] = vgetq_lane_s32(_sum0, 1); - // outptr[M + 1] = vgetq_lane_s32(_sum2, 1); - // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); - // outptr[M * 2 + 1] = vgetq_lane_s32(_sum2, 2); - // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); - // outptr[M * 3 + 1] = vgetq_lane_s32(_sum2, 3); - // outptr[M * 4] = vgetq_lane_s32(_sum1, 0); - // outptr[M * 4 + 1] = vgetq_lane_s32(_sum3, 0); - // outptr[M * 5] = vgetq_lane_s32(_sum1, 1); - // outptr[M * 5 + 1] = vgetq_lane_s32(_sum3, 1); - // outptr[M * 6] = vgetq_lane_s32(_sum1, 2); - // outptr[M * 6 + 1] = vgetq_lane_s32(_sum3, 2); - // outptr[M * 7] = vgetq_lane_s32(_sum1, 3); - // outptr[M * 7 + 1] = vgetq_lane_s32(_sum3, 3); - // outptr += 2; - // } - // } + for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -605,7 +432,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const vl = 8; vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); - vint32m2_t _sum23 = vmv_v_x_i32m2(0, vl); + // vint32m2_t _sum23 = vmv_v_x_i32m2(0, vl); const signed char* kptr = weight_data_tm.channel(p / 8); @@ -680,7 +507,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const _s0l_m2 = vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl); _s1l_m2 = vwmacc_vv_i16m2(_s1l_m2, _r0hl, vslidedown_vx_i8m1(_w2, _w2, 8, vl), vl); _s0h_m2 = vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl); - _s1h_m2 = vwmacc_vv_i16m2(_s1h_m2, _r0hh, vslidedown_vx_i8m1(_w3, _w2, 8, vl), vl); + _s1h_m2 = vwmacc_vv_i16m2(_s1h_m2, _r0hh, vslidedown_vx_i8m1(_w3, _w3, 8, vl), vl); // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); @@ -716,6 +543,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } for (; q < inch; q++) { + vl = 8; const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; for (int k = 0; k < maxk; k++) @@ -738,6 +566,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } } } + vl = 8; if (out_elempack == 8) { @@ -753,7 +582,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const vl = 4; vse32_v_i32m1(outptr, vget_v_i32m2_i32m1(_sum01, 0), vl); vse32_v_i32m1(outptr + M, vget_v_i32m2_i32m1(_sum01, 1), vl); - vl = 8; outptr += 4; } if (out_elempack == 1) @@ -786,130 +614,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p / out_elempack); int ij = 0; - // for (; ij + 1 < outw * outh; ij += 2) - // { - // const int i0 = ij / outw; - // const int i1 = (ij + 1) / outw; - // const int j0 = ij % outw; - // const int j1 = (ij + 1) % outw; - - // vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); - // int32x4_t _sum0 = vdupq_n_s32(0); - // int32x4_t _sum1 = vdupq_n_s32(0); - - // const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); - - // int q = 0; - // { - // for (; q + 7 < inch; q += 8) - // { - // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // int8x8_t _r0; - // int8x8_t _r1; - // if (elempack == 8) - // { - // _r0 = vld1_s8(r0s); - // _r1 = vld1_s8(r1s); - // } - // else // if (elempack == 1) - // { - // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], 
r0s[N * 7]}; - // signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - // _r0 = vld1_s8(tmp0); - // _r1 = vld1_s8(tmp1); - // } - - // int8x16_t _w0 = vld1q_s8(kptr); - // int8x16_t _w1 = vld1q_s8(kptr + 16); - - // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); - // int16x4_t _rr1 = vreinterpret_s16_s8(_r1); - - // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); - // int8x8_t _r1ll = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 0)); - // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); - // int8x8_t _r1lh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 1)); - - // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); - // int16x8_t _s1l = vmull_s8(_r1ll, vget_low_s8(_w0)); - // int16x8_t _s0h = vmull_s8(_r0lh, vget_high_s8(_w0)); - // int16x8_t _s1h = vmull_s8(_r1lh, vget_high_s8(_w0)); - - // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); - // int8x8_t _r1hl = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 2)); - // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); - // int8x8_t _r1hh = vreinterpret_s8_s16(vdup_lane_s16(_rr1, 3)); - - // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w1)); - // _s1l = vmlal_s8(_s1l, _r1hl, vget_low_s8(_w1)); - // _s0h = vmlal_s8(_s0h, _r0hh, vget_high_s8(_w1)); - // _s1h = vmlal_s8(_s1h, _r1hh, vget_high_s8(_w1)); - - // _sum0 = vpadalq_s16(_sum0, _s0l); - // _sum1 = vpadalq_s16(_sum1, _s1l); - // _sum0 = vpadalq_s16(_sum0, _s0h); - // _sum1 = vpadalq_s16(_sum1, _s1h); - - // kptr += 32; - // } - // } - // } - // for (; q < inch; q++) - // { - // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // // if (elempack == 1) - // { - // vint8m1_t _r0 = vmv_v_x_i8m1(r0s[0], vl); - // vint8m1_t _r1 = vmv_v_x_i8m1(r1s[0], vl); - - // // vint32m4_t _r01_int32 = vundefined_i32m4(); - - // // int8x8_t _r0 = vdup_n_s8(r0s[0]); - // // int8x8_t _r1 = vdup_n_s8(r1s[0]); - // int8x8_t _r01 = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r1)).val[0]); - // int8x8_t _w = vld1_s8(kptr); - // int8x8_t _ww = vreinterpret_s8_s32(vzip_s32(vreinterpret_s32_s8(_w), vreinterpret_s32_s8(_w)).val[0]); - // int16x8_t _s01 = vmull_s8(_r01, _ww); - // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s01)); - // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s01)); - - // kptr += 4; - // } - // } - // } - - // if (out_elempack == 4) - // { - // vse32_v_i32m2(outptr, _sum01, vl); - // // vst1q_s32(outptr, _sum0); - // // vst1q_s32(outptr + 4, _sum1); - // outptr += 8; - // } - // if (out_elempack == 1) - // { - - // // int32x4x2_t _sum01 = vzipq_s32(_sum0, _sum1); - // // vst1_s32(outptr, vget_low_s32(_sum01.val[0])); - // // vst1_s32(outptr + M, vget_high_s32(_sum01.val[0])); - // // vst1_s32(outptr + M * 2, vget_low_s32(_sum01.val[1])); - // // vst1_s32(outptr + M * 3, vget_high_s32(_sum01.val[1])); - // outptr += 2; - // } - // } + for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -1005,6 +710,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } for (; q < inch; q++) { + vl = 4; const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; for (int k = 0; k < maxk; k++) @@ -1064,111 +770,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& 
top_blob, const int* outptr1 = top_blob.channel(p + 1); int ij = 0; - // for (; ij + 1 < outw * outh; ij += 2) - // { - // const int i0 = ij / outw; - // const int i1 = (ij + 1) / outw; - // const int j0 = ij % outw; - // const int j1 = (ij + 1) % outw; - - // int sum00 = 0; - // int sum01 = 0; - // int sum10 = 0; - // int sum11 = 0; - - // #if __riscv_vector - // const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); - // #else - // const signed char* kptr = weight_data_tm.channel(p / 2); - // #endif - - // int q = 0; - // #if __riscv_vector - // { - // int32x4_t _sum01 = vdupq_n_s32(0); - // int32x4_t _sum23 = vdupq_n_s32(0); - // for (; q + 7 < inch; q += 8) - // { - // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // int8x8_t _r0; - // int8x8_t _r1; - // if (elempack == 8) - // { - // _r0 = vld1_s8(r0s); - // _r1 = vld1_s8(r1s); - // } - // else // if (elempack == 1) - // { - // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - // signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - // _r0 = vld1_s8(tmp0); - // _r1 = vld1_s8(tmp1); - // } - - // int8x16_t _w0 = vld1q_s8(kptr); - - // int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); - // int32x2x2_t _rr1 = vzip_s32(vreinterpret_s32_s8(_r1), vreinterpret_s32_s8(_r1)); - // int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); - // int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); - // int8x8_t _r1l = vreinterpret_s8_s32(_rr1.val[0]); - // int8x8_t _r1h = vreinterpret_s8_s32(_rr1.val[1]); - - // int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); - // int16x8_t _s23 = vmull_s8(_r1l, vget_low_s8(_w0)); - // _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); - // _s23 = vmlal_s8(_s23, _r1h, vget_high_s8(_w0)); - - // _sum01 = vpadalq_s16(_sum01, _s01); - // _sum23 = vpadalq_s16(_sum23, _s23); - - // kptr += 16; - // } - // } - // int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); - // int32x2_t _s1 = vpadd_s32(vget_low_s32(_sum23), vget_high_s32(_sum23)); - // sum00 += vget_lane_s32(_s0, 0); - // sum01 += vget_lane_s32(_s1, 0); - // sum10 += vget_lane_s32(_s0, 1); - // sum11 += vget_lane_s32(_s1, 1); - // } - // #endif // __riscv_vector - // for (; q < inch; q++) - // { - // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // // if (elempack == 1) - // { - // sum00 += r0s[0] * kptr[0]; - // sum01 += r1s[0] * kptr[0]; - // sum10 += r0s[0] * kptr[1]; - // sum11 += r1s[0] * kptr[1]; - - // kptr += 2; - // } - // } - // } - - // outptr0[0] = sum00; - // outptr0[1] = sum01; - // outptr1[0] = sum10; - // outptr1[1] = sum11; - // outptr0 += 2; - // outptr1 += 2; - // } + for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -1188,10 +790,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const { // int32x4_t _sum01 = vdupq_n_s32(0); vl = 4; - 
uint16_t odd_index[4] = {1, 3, 5, 7}; - uint16_t even_index[4] = {0, 2, 4, 6}; - vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); - vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); for (; q + 7 < inch; q += 8) { @@ -1204,6 +802,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // int8x8_t _r0; vint8m1_t _r0; + vl = 8; if (elempack == 8) { _r0 = vle8_v_i8m1(r0s, vl); @@ -1220,8 +819,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const vl = 16; vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); vl = 8; + // fprintf(stderr, "r0: \n"); + // print_vint8m1(_r0, 8); vint8m1_t _r0l = vslideup_vx_i8m1(_r0, _r0, 4, vl); vint8m1_t _r0h = vslidedown_vx_i8m1(_r0, _r0, 4, vl); + _r0h = vslideup_vx_i8m1(_r0h, _r0h, 4, vl); // vint32m1_t _r0_i16 = vreinterpret_v_i32m1_i8m1(_r0); @@ -1234,6 +836,10 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const vint16m1_t _s01 = vget_v_i16m2_i16m1(_s01_m2, 0); vl = 4; + uint16_t odd_index[4] = {1, 3, 5, 7}; + uint16_t even_index[4] = {0, 2, 4, 6}; + vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); + vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _odd_index, vl), vl); _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _even_index, vl), vl); @@ -1284,105 +890,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p); int ij = 0; - // for (; ij + 1 < outw * outh; ij += 2) - // { - // const int i0 = ij / outw; - // const int i1 = (ij + 1) / outw; - // const int j0 = ij % outw; - // const int j1 = (ij + 1) % outw; - - // int sum0 = 0; - // int sum1 = 0; - - // #if __riscv_vector - // const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); - // #else - // const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); - // #endif - - // int q = 0; - // #if __riscv_vector - // { - // int32x4_t _sum0 = vdupq_n_s32(0); - // int32x4_t _sum1 = vdupq_n_s32(0); - // int32x4_t _sum2 = vdupq_n_s32(0); - // int32x4_t _sum3 = vdupq_n_s32(0); - // for (; q + 7 < inch; q += 8) - // { - // const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; - // const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // int8x8_t _r0; - // int8x8_t _r1; - // if (elempack == 8) - // { - // _r0 = vld1_s8(r0s); - // _r1 = vld1_s8(r1s); - // } - // else // if (elempack == 1) - // { - // signed char tmp0[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; - // signed char tmp1[8] = {r1s[0], r1s[N], r1s[N * 2], r1s[N * 3], r1s[N * 4], r1s[N * 5], r1s[N * 6], r1s[N * 7]}; - // _r0 = vld1_s8(tmp0); - // _r1 = vld1_s8(tmp1); - // } - - // int8x8_t _w = vld1_s8(kptr); - - // int16x8_t _s0 = vmull_s8(_r0, _w); - // int16x8_t _s1 = vmull_s8(_r1, _w); - - // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); - // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); - // _sum2 = vaddw_s16(_sum2, vget_low_s16(_s1)); - // _sum3 = vaddw_s16(_sum3, vget_high_s16(_s1)); - - // kptr += 8; - // } - // } - // _sum0 = vaddq_s32(_sum0, _sum1); - // _sum2 = vaddq_s32(_sum2, _sum3); - // #if __aarch64__ - // sum0 += vaddvq_s32(_sum0); - // 
sum1 += vaddvq_s32(_sum2); - // #else - // int32x2_t _ss0 = vadd_s32(vget_low_s32(_sum0), vget_high_s32(_sum0)); - // int32x2_t _ss2 = vadd_s32(vget_low_s32(_sum2), vget_high_s32(_sum2)); - // _ss0 = vpadd_s32(_ss0, _ss2); - // sum0 += vget_lane_s32(_ss0, 0); - // sum1 += vget_lane_s32(_ss0, 1); - // #endif - // } - // #endif // __riscv_vector - // for (; q < inch; q++) - // { - // const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; - // const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; - - // for (int k = 0; k < maxk; k++) - // { - // const signed char* r0s = r0 + space_ofs[k]; - // const signed char* r1s = r1 + space_ofs[k]; - - // // if (elempack == 1) - // { - // sum0 += r0s[0] * kptr[0]; - // sum1 += r1s[0] * kptr[0]; - - // kptr += 1; - // } - // } - // } - - // outptr[0] = sum0; - // outptr[1] = sum1; - // outptr += 2; - // } + for (; ij < outw * outh; ij++) { const int i = ij / outw; @@ -1440,7 +948,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const } // int32x4_t _sum = vaddq_s32(_sum0, _sum1); // #if __aarch64__ - sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(vint32m1_t(), _sum01, vmv_v_x_i32m1(sum, vl), vl)); + vl = 8; + vint32m1_t _scalar_sum = vmv_s_x_i32m1(vint32m1_t(), sum, vl); + sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(_scalar_sum, _sum01, _scalar_sum, vl)); + // int res[8] = {0, 0, 0, 0}; + // vl = 4; + // vse32_v_i32m2(res, _sum01, vl); + // sum += (res[0] + res[1] + res[2] + res[3]); // sum += vaddvq_s32(_sum); // #else // int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); From f838baa14aa603966168a2c62a4b6803eac3ebae Mon Sep 17 00:00:00 2001 From: yxy Date: Sun, 18 Feb 2024 16:50:02 +0000 Subject: [PATCH 26/29] pass test --- src/layer/riscv/convolution_packed_int8.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index c8130b64d4d4..557a01423d74 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -444,6 +444,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const for (int k = 0; k < maxk; k++) { + vl = 8; const signed char* r0s = r0 + space_ofs[k]; // int8x8_t _r0; @@ -548,6 +549,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const for (int k = 0; k < maxk; k++) { + vl = 8; const signed char* r0s = r0 + space_ofs[k]; // if (elempack == 1) @@ -629,13 +631,13 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int q = 0; { - vl = 8; for (; q + 7 < inch; q += 8) { const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; for (int k = 0; k < maxk; k++) { + vl = 8; const signed char* r0s = r0 + space_ofs[k]; // int8x8_t _r0; @@ -715,6 +717,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const for (int k = 0; k < maxk; k++) { + vl = 4; const signed char* r0s = r0 + space_ofs[k]; // if (elempack == 1) @@ -798,11 +801,11 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const for (int k = 0; k < maxk; k++) { + vl = 8; const signed char* r0s = r0 + space_ofs[k]; // int8x8_t _r0; vint8m1_t _r0; - vl = 8; if (elempack == 8) { _r0 = vle8_v_i8m1(r0s, vl); @@ -917,6 +920,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const for (int k = 0; k < maxk; k++) { + vl = 8; const 
signed char* r0s = r0 + space_ofs[k]; vint8m1_t _r0; From b54c59e587f831bf118546322c378b2c5cbd9f46 Mon Sep 17 00:00:00 2001 From: Xinyu302 Date: Sun, 18 Feb 2024 16:56:29 +0000 Subject: [PATCH 27/29] apply code-format changes --- src/layer/riscv/convolution_packed_int8.h | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h index 557a01423d74..36a36abca8b6 100644 --- a/src/layer/riscv/convolution_packed_int8.h +++ b/src/layer/riscv/convolution_packed_int8.h @@ -484,7 +484,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // uint8_t mask[8] = {8, 9, 10, 11, 12, 13, 14, 15}; // vuint8m1_t _index = vle8_v_u8m1(mask, vl); - // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); @@ -504,12 +503,12 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // vint16m1_t _s1l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1l_m2, _r0hl, vrgather_vv_i8m1(_w2, _index, vl), vl), 0); // vint16m1_t _s2l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl), 0); // vint16m1_t _s3l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1h_m2, _r0hh, vrgather_vv_i8m1(_w3, _index, vl), vl), 0); - + _s0l_m2 = vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl); _s1l_m2 = vwmacc_vv_i16m2(_s1l_m2, _r0hl, vslidedown_vx_i8m1(_w2, _w2, 8, vl), vl); _s0h_m2 = vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl); _s1h_m2 = vwmacc_vv_i16m2(_s1h_m2, _r0hh, vslidedown_vx_i8m1(_w3, _w3, 8, vl), vl); - + // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); @@ -669,7 +668,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const vint8m1_t _r0hl = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 2, vl)); vint8m1_t _r0hh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 3, vl)); - vint16m2_t _sl_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); + vint16m2_t _sl_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); vint16m2_t _sh_m2 = vwmul_vv_i16m2(_r0lh, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); _sl_m2 = vwmacc_vv_i16m2(_sl_m2, _r0hl, _w1, vl); _sh_m2 = vwmacc_vv_i16m2(_sh_m2, _r0hh, vslidedown_vx_i8m1(_w1, _w1, 8, vl), vl); @@ -677,7 +676,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const vint16m1_t _sl = vget_v_i16m2_i16m1(_sl_m2, 0); vint16m1_t _sh = vget_v_i16m2_i16m1(_sh_m2, 0); - // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); @@ -701,7 +699,6 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const // _sum0 = vpadalq_s16(_sum0, _sl); // _sum1 = vpadalq_s16(_sum1, _sh); - kptr += 32; } @@ -893,7 +890,7 @@ static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const int* outptr = top_blob.channel(p); int ij = 0; - + for (; ij < outw * outh; ij++) { const int i = ij / outw; From 2795f0e43648eeeee785ef65e824c4de1d5ae220 Mon Sep 17 00:00:00 2001 From: yxy Date: Sun, 18 Feb 2024 17:22:39 +0000 Subject: [PATCH 28/29] fix net.cpp layer pack --- src/net.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/net.cpp b/src/net.cpp index ff2ab6091373..c365fd3e174f 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -708,7 
+708,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio if (elembits == 8) { #if NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / 1; + const int packn = ncnn::cpu_riscv_vlenb() / 2; if (elemcount % packn == 0) dst_elempack = packn; #else From 14e53f6699d176e2deec9685e94e274abd5ac2fd Mon Sep 17 00:00:00 2001 From: yxy Date: Sun, 18 Feb 2024 17:30:15 +0000 Subject: [PATCH 29/29] fix segfault bug --- src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h index c8fc22c528cd..b86932b66b28 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h @@ -21,7 +21,7 @@ static void convdw3x3s1_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, co int outw = top_blob.w; int outh = top_blob.h; - int vl = csrr_vlenb() / 1; + int vl = csrr_vlenb() / 2; const int group = bottom_blob.c; @@ -155,7 +155,7 @@ static void convdw3x3s2_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, co int outw = top_blob.w; int outh = top_blob.h; - int vl = 8; + int vl = csrr_vlenb() / 2; const int group = bottom_blob.c;
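
Note (not part of the patch series): the int8 accumulation and reduction idiom that convolution_packed_int8.h converges on after the "fix kernel bug" patch can be summarized in a standalone sketch. This is a hedged illustration only, assuming the same pre-`__riscv_`-prefixed rvv intrinsic naming and <riscv_vector.h> toolchain used throughout these patches; dot_s8() is a hypothetical helper name, not an ncnn symbol.

    // C++ sketch of the widening-MAC + vredsum pattern used by the packed int8 kernels
    #include <riscv_vector.h>
    #include <stddef.h>

    static int dot_s8(const signed char* a, const signed char* b, size_t n)
    {
        size_t vl = 8;
        vint32m2_t _sum = vmv_v_x_i32m2(0, vl); // int32 partial sums, as in the kernels

        size_t i = 0;
        for (; i + 7 < n; i += 8)
        {
            vint8m1_t _a = vle8_v_i8m1(a + i, vl);
            vint8m1_t _b = vle8_v_i8m1(b + i, vl);

            // widen int8*int8 products to int16, then widen-add into the int32 accumulator,
            // mirroring vwmul_vv_i16m2 / vwadd_wv_i32m2 in convolution_packed_int8.h
            vint16m1_t _p = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_a, _b, vl), 0);
            _sum = vwadd_wv_i32m2(_sum, _p, vl);
        }

        // horizontal reduction across the int32 lanes, seeded with 0,
        // as in the fixed vredsum_vs_i32m2_i32m1 path
        vint32m1_t _seed = vmv_v_x_i32m1(0, 4);
        int sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(_seed, _sum, _seed, vl));

        for (; i < n; i++) // scalar tail for lengths that are not a multiple of 8
            sum += (int)a[i] * (int)b[i];

        return sum;
    }

The design point this reflects: NEON's vpadalq_s16 pairwise-accumulate has no direct RVV counterpart, so the patches keep the products in separate int32 lanes via vwadd_wv and collapse them only once per output with a single vredsum, rather than pairwise-adding after every multiply.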