diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp new file mode 100644 index 000000000000..4c3900e4ef15 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -0,0 +1,892 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" +#include "cpu.h" + +namespace ncnn { + +Dequantize_riscv::Dequantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vf_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + 
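// For reference, every branch of this dims == 1 path is the same strip-mined
// affine transform y = x * scale + bias over outw * 4 ints; only where scale
// and bias come from differs. A minimal sketch of the idiom, reusing the
// intrinsics above (the helper name is illustrative, not ncnn API):
//
//   static void dequantize_strip(const int* intptr, float* ptr, int n, float scale, float bias)
//   {
//       while (n > 0)
//       {
//           size_t vl = vsetvl_e32m8(n);                                    // lanes this pass can process
//           vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl);
//           _v = vfmadd_vf_f32m8(_v, scale, vfmv_v_f_f32m8(bias, vl), vl);  // v * scale + bias
//           vse32_v_f32m8(ptr, _v, vl);
//           intptr += vl;
//           ptr += vl;
//           n -= vl;
//       }
//   }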
offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + vl = 4; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
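// Note on the dims == 2 layout above: one elempack-8 input row packs 8 of the
// original rows, but the fp32 output is written as elempack-4 (elemsize 16u),
// so packed input row i is split into output rows 2*i and 2*i+1. The low 4
// lanes use scale_data[i*8 + 0..3] and the high 4 lanes scale_data[i*8 + 4..7]
// (or the single scale when scale_data_size == 1). Scalar picture of one j
// iteration (illustrative only):
//
//   for (int k = 0; k < 4; k++)
//   {
//       ptr0[k] = intptr[k]     * scale[i * 8 + k];      // low half  -> row 2*i
//       ptr1[k] = intptr[4 + k] * scale[i * 8 + 4 + k];  // high half -> row 2*i + 1
//   }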
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m1(_v2, _scale0, vl); + _v3 = vfmul_vv_f32m1(_v3, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + vl = 4; + + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + _v2 = vfmadd_vv_f32m1(_v2, _scale0, _bias0, vl); + _v3 = vfmadd_vv_f32m1(_v3, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + if (bias_data_size == 0) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + 
int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + int n = w * 4; + int offset = 0; + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + int n = size * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
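// The vset_v_f32m1_f32m8 ladder used in these elempack == 4 paths builds an
// m8 register out of 8 copies of the 4-lane per-pack scale/bias, so a single
// m8 multiply applies the right per-lane factor however many pack-4 groups
// fit in vl. The replication (and the vl = 4 m1 load feeding it) implicitly
// assumes VLEN == 128, i.e. one f32m1 register holds exactly 4 floats:
//
//   // m1: [s0 s1 s2 s3]
//   // m8: [s0 s1 s2 s3 | s0 s1 s2 s3 | ... 8 copies ...]
//   // so lane k of the m8 vector is scaled by s[k % 4]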
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = size * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale; \ +// } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale + bias; \ +// } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/dequantize_riscv.h b/src/layer/riscv/dequantize_riscv.h new file mode 100644 index 000000000000..be7bc6bc90c5 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_RISCV_H +#define LAYER_DEQUANTIZE_RISCV_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_riscv : public Dequantize +{ +public: + Dequantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_RISCV_H diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 491c051c7fea..e6a97798a0fb 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -348,7 +348,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; // packn should be 8 #endif int w = bottom_blob.w; @@ -394,7 +394,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 2) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) @@ -405,7 +405,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -422,7 +422,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 3 || dims == 4) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -433,7 +433,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index accfc683584f..08cddbc151f7 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. 
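// Re the flatten_riscv int8 hunks above: int8 blobs produced by the new
// innerproduct int8 path use elempack 8, while csrr_vlenb() / 1 would give
// packn 16 on a 128-bit VLEN part, so packn is halved and vl is pinned to
// elempack. With vl == elempack, each vle8/vsse8 pair de-interleaves exactly
// one packed element: 8 contiguous bytes are scattered with a stride of w
// (or size) bytes into the 8 planar rows they belong to. Scalar picture
// (illustrative only):
//
//   for (int k = 0; k < elempack; k++)
//       outptr[k * w] = ptr[k];   // lane k of this packed element -> row k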
// // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except @@ -53,7 +54,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -128,27 +129,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return InnerProduct::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8(bottom_blob, top_blob, opt); } #endif @@ -1090,4 +1071,512 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int InnerProduct_riscv::create_pipeline_int8(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + weight_data.release(); + + return 0; +} + +int InnerProduct_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 
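// How scale_in_data is used downstream: the int8 GEMM accumulates
// sum = sum_i(x_q[i] * w_q[i]) in int32, where x_q ~ x * s_in and w_q ~ w * s_w,
// so the fp32 result is recovered as sum * scale_in (+ bias) with
// scale_in = 1 / (s_in * s_w). Scalar reference for one output neuron,
// mirroring the vector paths below (illustrative only):
//
//   int sum = 0;
//   for (int i = 0; i < num_input; i++)
//       sum += (int)m[i] * (int)kptr[i];        // int8 x int8 accumulated in int32
//   float out = sum * scale_in_data[p];         // dequantize
//   if (bias_term) out += bias_data[p];
//   out = activation_ss(out, activation_type, activation_params);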
4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif + +#if __riscv_vector + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + vl = 8; + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum1 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum2 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum3 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val0 = vmv_v_x_i8m1(m0[0], vl); + vint8m1_t _val1 = vmv_v_x_i8m1(m1[0], vl); + vint8m1_t _val2 = vmv_v_x_i8m1(m2[0], vl); + vint8m1_t _val3 = vmv_v_x_i8m1(m3[0], vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val0, _w, vl), 0); + vint16m1_t _s1 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val1, _w, vl), 0); + vint16m1_t _s2 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val2, _w, vl), 0); + vint16m1_t _s3 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val3, _w, vl), 0); + + _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m2(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m2(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m2(_sum3, _s3, vl); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32_0 = vfcvt_f_x_v_f32m2(_sum0, vl); + vfloat32m2_t _sumfp32_1 = vfcvt_f_x_v_f32m2(_sum1, vl); + vfloat32m2_t _sumfp32_2 = vfcvt_f_x_v_f32m2(_sum2, vl); + vfloat32m2_t _sumfp32_3 = vfcvt_f_x_v_f32m2(_sum3, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32_0 = vfmacc_vv_f32m2(_bias, _sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmacc_vv_f32m2(_bias, _sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmacc_vv_f32m2(_bias, _sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmacc_vv_f32m2(_bias, _sumfp32_3, _scale_in, vl); + } + else + { + _sumfp32_0 = vfmul_vv_f32m2(_sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmul_vv_f32m2(_sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmul_vv_f32m2(_sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmul_vv_f32m2(_sumfp32_3, _scale_in, vl); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params, vl); + _sumfp32_2 = activation_ps(_sumfp32_2, activation_type, activation_params, vl); + _sumfp32_3 = activation_ps(_sumfp32_3, activation_type, activation_params, vl); + + vsseg4e32_v_f32m2(outptr, _sumfp32_0, _sumfp32_1, _sumfp32_2, _sumfp32_3, vl); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed 
char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + + int n = num_input; + + vl = vsetvlmax_e32m4(); + vint32m4_t _sum0 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3 = vmv_v_x_i32m4(0, vl); + + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val0 = vle8_v_i8m1(m0, vl); + vint8m1_t _val1 = vle8_v_i8m1(m1, vl); + vint8m1_t _val2 = vle8_v_i8m1(m2, vl); + vint8m1_t _val3 = vle8_v_i8m1(m3, vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s0 = vwmul_vv_i16m2(_val0, _w, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_val1, _w, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_val2, _w, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_val3, _w, vl); + + _sum0 = vwadd_wv_i32m4(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m4(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m4(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m4(_sum3, _s3, vl); + + m0 += vl; + m1 += vl; + m2 += vl; + m3 += vl; + + kptr += vl; + n -= vl; + } + + vint32m1_t _sum0_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum1_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum2_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum3_scala = vmv_v_x_i32m1(0, vl); + + vl = vsetvlmax_e32m4(); + _sum0_scala = vredsum_vs_i32m4_i32m1(_sum0_scala, _sum0, _sum0_scala, vl); + _sum1_scala = vredsum_vs_i32m4_i32m1(_sum1_scala, _sum1, _sum1_scala, vl); + _sum2_scala = vredsum_vs_i32m4_i32m1(_sum2_scala, _sum2, _sum2_scala, vl); + _sum3_scala = vredsum_vs_i32m4_i32m1(_sum3_scala, _sum3, _sum3_scala, vl); + sum0 = vmv_x_s_i32m1_i32(_sum0_scala); + sum1 = vmv_x_s_i32m1_i32(_sum1_scala); + sum2 = vmv_x_s_i32m1_i32(_sum2_scala); + sum3 = vmv_x_s_i32m1_i32(_sum3_scala); + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + vint32m2_t _sum = vmv_v_x_i32m2(0, vl); + + int i = 0; + + vl = 8; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(m[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + // int8x8_t _val = vld1_dup_s8(m); + // int8x8_t _w = vld1_s8(kptr); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(_s, 0); + + _sum = vwadd_wv_i32m2(_sum, _s0, vl); + + m++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = 
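// The int8 dot products in these paths widen twice: vwmul_vv takes
// int8 x int8 -> int16, and vwadd_wv folds the int16 partial products into an
// int32 accumulator, so nothing saturates (each product is at most 127 * 127,
// and an int32 lane can accumulate hundreds of thousands of them). In the
// per-output-scalar paths, vredsum_vs then collapses the int32 lanes into the
// scalar sum that gets dequantized. Per-lane picture for one strip
// (illustrative only):
//
//   s16[k]  = (int16_t)val[k] * (int16_t)w[k];   // vwmul_vv_i16m2
//   acc[k] += (int32_t)s16[k];                   // vwadd_wv_i32m4
//   // after the loop: sum = sum_k acc[k]        // vredsum_vs_i32m4_i32m1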
vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + } + + // _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m2(outptr, _sumfp32, vl); + outptr += 8; + } + } + } +#endif // __riscv_vector + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; +#if __riscv_vector + + int n = num_input; + vint32m4_t _sum = vmv_v_x_i32m4(0, vsetvlmax_e32m4()); + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val = vle8_v_i8m1(m, vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + _sum = vwadd_wv_i32m4(_sum, _s, vl); + + m += vl; + kptr += vl; + n -= vl; + } + + vint32m1_t _sum_scala = vmv_v_x_i32m1(0, vl); + _sum_scala = vredsum_vs_i32m4_i32m1(_sum_scala, _sum, _sum_scala, vl); + sum = vmv_x_s_i32m1_i32(_sum_scala); + +#endif // __riscv_vector + + // for (; i < num_input; i++) \ +// { \ +// sum += *m++ * *kptr++; \ +// } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_vector + if (out_elempack == 8) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + vl = 8; + + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(sptr[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum0 = vwadd_wv_i32m2(_sum0, _s, vl); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum0, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + } + + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + float* outptr = (float*)top_blob + p * 8; + vse32_v_f32m2(outptr, _sumfp32, vl); + } + } +#endif // __riscv_vector + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index d3056d5801d0..77a3b93de12a 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -36,6 +36,11 @@ class InnerProduct_riscv : public InnerProduct int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if __riscv_vector + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* flatten; @@ -43,6 +48,10 @@ class InnerProduct_riscv : public InnerProduct // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 50f5efe1216d..60465da64ffc 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -15,7 +15,8 @@ #define _PADDING_PACKN_RVV(SEW, TSEW, LMUL, T, VT) \ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -64,7 +65,8 @@ \ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, 
int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -143,7 +145,8 @@ \ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index 8f4b54da5904..2e2d7471f477 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -510,7 +510,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); #endif diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp new file mode 100644 index 000000000000..89d82106aa04 --- /dev/null +++ b/src/layer/riscv/quantize_riscv.cpp @@ -0,0 +1,1529 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +#include "cpu.h" + +namespace ncnn { + +Quantize_riscv::Quantize_riscv() +{ +#if __riscv_vector + support_packing = true; + +#if __riscv_zfh + support_fp16_storage = true; +#endif // __riscv_zfh +#endif // __riscv_vector +} + +int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); +#if __riscv_vector && __riscv_zfh + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_fp16sa(bottom_blob, top_blob, opt); + else + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif // __riscv_vector && __riscv_zfh + + int vl = vsetvlmax_e32m1(); + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale_data[0]); + // float32x4_t _scale = vdupq_n_f32(scale_data[0]); + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + 
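// float2int8 is used in three flavours in this file: the scalar helper
// (round to nearest, saturate to [-127, 127]), a two-vector form packing
// 2 x 4 fp32 lanes into one int64 of 8 int8s (hence the *(int64_t*)outptr
// stores), and a four-vector form returning a vint8m1_t of 16 results.
// A scalar reference matching the element-wise behaviour (the name is
// illustrative, not ncnn API):
//
//   static inline signed char float2int8_ref(float v)
//   {
//       int i32 = (int)roundf(v);     // round to nearest
//       if (i32 > 127) i32 = 127;     // saturate to int8 range used by ncnn
//       if (i32 < -127) i32 = -127;
//       return (signed char)i32;
//   }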
outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr0 + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr1, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr1 + 4, vl); + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v2, _v1, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = 
bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + float _scale = scale; + + for (; i + 15 < size; i += 16) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr + 8, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr + 12, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v1, _v2, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + + int64_t _v = float2int8(_v0, _v1); + *(int64_t*)outptr = _v; + ptr += 8; + outptr += 8; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +#if __riscv_vector && __riscv_zfh + +int Quantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + } + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + } + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale); + outptr[1] = float2int8((float)ptr0[1] * scale); + outptr[2] = float2int8((float)ptr0[2] * scale); + outptr[3] = float2int8((float)ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8((float)ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8((float)ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8((float)ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + vl = 4; + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 
0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 
1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8((float)*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8((float)*ptr++ * scale); + } + } + } + + return 0; +} + +int Quantize_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); + + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); + + for (int j = 0; j < w; j++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + q * 
8, vl), vl); + + for (int i = 0; i < size; i++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * elempack; + + top_blob.create(outw, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * (__fp16)scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * (__fp16)scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * (__fp16)scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * (__fp16)scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * elempack; + + top_blob.create(w, outh, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const __fp16 s0 = scale_data[i * 4]; + const __fp16 s1 = scale_data[i * 4 + 1]; + const __fp16 s2 = scale_data[i * 4 + 2]; + const __fp16 s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * elempack; + + top_blob.create(w, h, outc, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* 
ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const __fp16 s0 = scale_data[q * 4]; + const __fp16 s1 = scale_data[q * 4 + 1]; + const __fp16 s2 = scale_data[q * 4 + 2]; + const __fp16 s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * (__fp16)scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +#endif // __riscv_vector && __riscv_zfh + +} // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h new file mode 100644 index 000000000000..0eb90aed7e5b --- /dev/null +++ b/src/layer/riscv/quantize_riscv.h @@ -0,0 +1,37 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. 
All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_RISCV_H +#define LAYER_QUANTIZE_RISCV_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_riscv : public Quantize +{ +public: + Quantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +#if __riscv_vector && __riscv_zfh + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif // __riscv_vector && __riscv_zfh +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_RISCV_H diff --git a/src/layer/riscv/requantize_leakyrelu_pack4.h b/src/layer/riscv/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..fd5c4dfd93d2 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack4.h @@ -0,0 +1,263 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_leakyrelu_pack8.h b/src/layer/riscv/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..87991ad94394 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack8.h @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
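The leakyrelu pack8 kernel that follows relies on the folding already noted in the comments of these headers: because leaky_relu(s * x) = s * leaky_relu(x) for any positive s, the output scale can be multiplied into both the input scale and the bias ahead of the loop, leaving one fused multiply-add plus the activation per element. A minimal scalar sketch of that identity, assuming a float2int8_scalar helper that rounds and saturates like ncnn's float2int8 (the helper names are illustrative, not part of this patch):

#include <algorithm>
#include <cmath>
#include <cstdint>

// assumed stand-in for ncnn's float2int8: round to nearest, saturate to [-127, 127]
static inline int8_t float2int8_scalar(float v)
{
    int r = (int)std::nearbyintf(v);
    return (int8_t)std::min(std::max(r, -127), 127);
}

// reference requantize of one int32 value with a fused leaky relu
static inline int8_t requantize_leakyrelu_ref(int v, float scale_in, float bias,
                                              float scale_out, float slope)
{
    // naive form: int8(leaky_relu(v * scale_in + bias) * scale_out)
    float x = v * scale_in + bias;
    float naive = (x > 0.f ? x : x * slope) * scale_out;

    // folded form used by the vector kernels:
    // y = v * (scale_in * scale_out) + bias * scale_out, then leaky relu on y
    float y = v * (scale_in * scale_out) + bias * scale_out;
    float folded = y > 0.f ? y : y * slope;

    (void)naive; // both forms agree whenever scale_out > 0
    return float2int8_scalar(folded);
}

The vector kernel below performs exactly this pre-folding once per channel (_scale0 = _scale_in0 * _scale_out0, _bias0 = _bias0 * _scale_out0) and then streams the plane.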
+ +static void requantize_leakyrelu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack4.h b/src/layer/riscv/requantize_relu_pack4.h new file mode 100644 index 000000000000..ca5285de57c5 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack4.h @@ -0,0 +1,259 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
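Like the leakyrelu variant above, the pack4 ReLU kernel below handles two output layouts: with out_elempack == 8 it interleaves two pack4 input channels into one pack8 output channel, and with out_elempack == 1 it scatters the four lanes of each pack4 result into four planar channels by unpacking the packed 32-bit value returned by float2int8relu. A small sketch of that byte unpacking, assuming lane 0 sits in the lowest byte as the shift pattern in these kernels implies (the helper name is illustrative):

#include <cstdint>

// Illustrative only: split a packed result (four int8 lanes in one 32-bit int,
// lane 0 in the lowest byte) across four planar output channels.
static inline void scatter_pack4_to_planar(int res,
                                           signed char* ptr0, signed char* ptr1,
                                           signed char* ptr2, signed char* ptr3)
{
    ptr0[0] = (signed char)(res & 0xff);
    ptr1[0] = (signed char)((res >> 8) & 0xff);
    ptr2[0] = (signed char)((res >> 16) & 0xff);
    ptr3[0] = (signed char)((res >> 24) & 0xff);
}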
+ +static void requantize_relu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 
= bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = 
float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack8.h b/src/layer/riscv/requantize_relu_pack8.h new file mode 100644 index 000000000000..e3f18dbb98a3 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack8.h @@ -0,0 +1,155 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
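The pack8 ReLU kernel that follows hoists every per-channel constant out of the inner loop: the effective scale (scale_in * scale_out) and the pre-scaled bias (bias * scale_out) are computed once per channel, and the plane is then walked with the loop unrolled by 4, 2 and 1 vectors of 8 floats. A scalar reference of that per-channel structure, assuming a float2int8_relu_scalar helper that clamps negatives to zero before rounding (both names here are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

// assumed scalar equivalent of float2int8relu: relu, round, saturate to 127
static inline int8_t float2int8_relu_scalar(float v)
{
    float r = v > 0.f ? v : 0.f;
    return (int8_t)std::min((int)std::nearbyintf(r), 127);
}

// reference per-channel requantize with fused ReLU (pack layout ignored)
static void requantize_relu_channel_ref(const int* intptr, signed char* ptr, int size,
                                        float scale_in, float scale_out, float bias)
{
    // folded once per channel, which is what the vector kernel precomputes
    const float scale = scale_in * scale_out;
    const float bias_out = bias * scale_out;

    for (int i = 0; i < size; i++)
    {
        ptr[i] = float2int8_relu_scalar(intptr[i] * scale + bias_out);
    }
}

Only the folding matters for correctness; the 4/2/1 unroll simply trades register pressure for fewer loop iterations.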
+ +static void requantize_relu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp new file mode 100644 index 000000000000..81e0e9aaa8da --- /dev/null +++ b/src/layer/riscv/requantize_riscv.cpp @@ -0,0 +1,1240 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
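The pack8 ReLU path above folds the two scales into one, as the comments in requantize_relu_pack8_rvv note: int8(relu(v * scale_in + bias) * scale_out) equals int8_relu(v * (scale_in * scale_out) + bias * scale_out) whenever scale_out is positive, so the kernel multiplies once per element and clamps at zero after the narrowing conversion. A minimal scalar sketch of both forms follows; it is illustrative only, and the names requantize_scalar and requantize_relu_folded are not part of this patch or of ncnn.

#include <math.h>

// Reference form: int8(relu(v * scale_in + bias) * scale_out), clamped to [-127, 127]
// in the same way as the float2int8(float) helper added in riscv_usability.h.
static inline signed char requantize_scalar(int v, float scale_in, float bias, float scale_out)
{
    float x = v * scale_in + bias;
    if (x < 0.f) x = 0.f; // ReLU (activation_type == 1)
    x *= scale_out;
    int q = (int)roundf(x);
    if (q > 127) q = 127;
    if (q < -127) q = -127;
    return (signed char)q;
}

// Folded form used by requantize_relu_pack8_rvv:
// int8_relu(v * (scale_in * scale_out) + bias * scale_out)
static inline signed char requantize_relu_folded(int v, float scale_in, float bias, float scale_out)
{
    const float scale = scale_in * scale_out; // computed once per channel in the vector kernel
    const float b = bias * scale_out;
    int q = (int)roundf(v * scale + b);
    if (q < 0) q = 0;     // ReLU moved after the conversion (valid because scale_out > 0)
    if (q > 127) q = 127; // saturate to the int8 output range
    return (signed char)q;
}

The vector kernels compute the same arithmetic eight lanes at a time with vfmacc_vv_f32m2/vfmul_vv_f32m2, then narrow through int16 to int8 with vnclip and apply the zero clamp with vmax inside the float2int8relu helpers.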
+ +#include "requantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +#if __riscv_vector +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __riscv_vector + +Requantize_riscv::Requantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + +#if __riscv_vector + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? 
vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + vl = 4; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
+ *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + // float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + // float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + // float32x4_t _bias = vdupq_n_f32(bias_data[0]); + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
+ *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + // float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); + // float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * 
scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? 
scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/requantize_riscv.h b/src/layer/riscv/requantize_riscv.h new file mode 100644 index 000000000000..df12b54ce3a4 --- /dev/null +++ b/src/layer/riscv/requantize_riscv.h @@ -0,0 +1,34 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_REQUANTIZE_RISCV_H +#define LAYER_REQUANTIZE_RISCV_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_riscv : public Requantize +{ +public: + Requantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_RISCV_H diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e2824646f871..1802a28336dd 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -19,6 +19,14 @@ #include #endif // __riscv_vector +static inline signed char float2int8(float v) +{ + int int32 = (int)roundf(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} + #if __riscv_vector static inline int csrr_vl() { @@ -50,6 +58,362 @@ static inline int csrr_vlenb() return a; } +#if __riscv_zfh +static inline int64_t float2int8(vfloat16m1_t _v) +{ + int vl = vsetvlmax_e16m1(); + vint16m2_t _v16 = vundefined_i16m2(); + _v16 = vset_v_i16m1_i16m2(_v16, 0, vfcvt_x_f_v_i16m1(_v, vl)); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; + // int16x8_t _v16 = vcvtaq_s16_f16(_v); + // int8x8_t _v8 = vqmovn_s16(_v16); + // return vmax_s8(_v8, vdup_n_s8(-127)); +} +#endif // __riscv_zfh + +static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + // combine _vlow32 and _vhigh32 to a single vint32m2_t + vl = 2 * vsetvlmax_e32m1(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8(vfloat32m1_t _v) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + + vint32m4_t _v32 
= vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int32_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int64_t float2int8(vfloat32m2_t _v) +{ + int vl = vsetvlmax_e32m2(); + vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2); + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int64_t float2int8relu(vfloat32m2_t _v) +{ + int vl = vsetvlmax_e32m2(); + vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2); + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int64_t float2int8leakyrelu(vfloat32m2_t _v, vfloat32m2_t _slope) +{ + int vl = vsetvlmax_e32m2(); + + vfloat32m2_t _v_leaky = vfmul_vv_f32m2(_v, _slope, vl); + vint32m2_t _v_int32 = vfcvt_x_f_v_i32m2(_v, vl); + + // vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); + // vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); + + // vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + // vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + // vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl); + // vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl); + vint32m2_t _v_int32_leaky = vfcvt_x_f_v_i32m2(_v_leaky, vl); + + vl = 2 * vsetvlmax_e32m1(); + + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32 = vset_v_i32m2_i32m4(_v32, 0, _v_int32); + _v32_leaky = vset_v_i32m2_i32m4(_v32_leaky, 0, _v_int32_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static void print_vint8m1(vint8m1_t _v, int vl = 16) +{ + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "i8[%d]: %d\n", i, i8[i]); + } + free(i8); +} + +static void print_vint32m1(vint32m1_t _v) +{ + int32_t* i32 = (int32_t*)malloc(4 * sizeof(int32_t)); + vse32_v_i32m1(i32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} + +static void print_vint32m2(vint32m2_t _v, int vl = 8) +{ + int32_t* i32 = (int32_t*)malloc(8 * sizeof(int32_t)); + vse32_v_i32m2(i32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} + +static void print_vfloat32m2(vfloat32m2_t _v, int vl = 8) +{ + float* f32 = (float*)malloc(8 * sizeof(float)); + vse32_v_f32m2(f32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "f32[%d]: %f\n", i, f32[i]); + } + free(f32); +} + +static void print_vfloat32m1(vfloat32m1_t _v) +{ + float* f32 = (float*)malloc(4 * sizeof(float)); + vse32_v_f32m1(f32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "f32[%d]: 
%f\n", i, f32[i]); + } + free(f32); +} + +static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v8, 16); + return _v8; +} + +static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + vl = 2 * vsetvlmax_e32m1(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); + + vint16m2_t _v16_2 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8relu(vfloat32m1_t _v) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int32_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + return _v8; +} + +static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); + vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl); + vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl); + + vl = 2 * vsetvlmax_e32m1(); + + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); + + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _vlow32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _vhigh32_leaky); + + vint16m2_t _v16 = 
vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); + + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v_leaky, vl); + + // vl = vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v32m1_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int32_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v0_leaky = vfmul_vv_f32m1(_v0, _slope, vl); + vfloat32m1_t _v1_leaky = vfmul_vv_f32m1(_v1, _slope, vl); + vfloat32m1_t _v2_leaky = vfmul_vv_f32m1(_v2, _slope, vl); + vfloat32m1_t _v3_leaky = vfmul_vv_f32m1(_v3, _slope, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); + + vint32m1_t _v0_32_leaky = vfcvt_x_f_v_i32m1(_v0_leaky, vl); + vint32m1_t _v1_32_leaky = vfcvt_x_f_v_i32m1(_v1_leaky, vl); + vint32m1_t _v2_32_leaky = vfcvt_x_f_v_i32m1(_v2_leaky, vl); + vint32m1_t _v3_32_leaky = vfcvt_x_f_v_i32m1(_v3_leaky, vl); + + vl = vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v0_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _v1_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 2, _v2_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 3, _v3_32_leaky); + + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + return _v8; +} + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; diff --git a/src/mat.h b/src/mat.h index fdf5cc597c4e..b6849565fe13 100644 --- a/src/mat.h +++ b/src/mat.h @@ -1120,7 +1120,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) { - const int packn = cpu_riscv_vlenb() / 1; + const int packn = cpu_riscv_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); int size = (int)total(); diff --git a/tests/testutil.cpp b/tests/testutil.cpp index f0bf3c51a20d..96a7c903103f 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -533,7 +533,7 @@ int test_layer_cpu(int typeindex, 
const ncnn::ParamDict& pd, const std::vector