diff --git a/src/layer/riscv/dequantize_riscv.cpp b/src/layer/riscv/dequantize_riscv.cpp new file mode 100644 index 000000000000..4c3900e4ef15 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.cpp @@ -0,0 +1,892 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "dequantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" +#include "cpu.h" + +namespace ncnn { + +Dequantize_riscv::Dequantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Dequantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * 2; + + top_blob.create(outw, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vf_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vf_f32m8(_v, _scale, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + 
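// For reference, every branch of this dims == 1 path is the same strip-mined
// affine transform y = x * scale + bias over outw * 4 ints; only where scale
// and bias come from differs. A minimal sketch of the idiom, reusing the
// intrinsics above (the helper name is illustrative, not ncnn API):
//
//   static void dequantize_strip(const int* intptr, float* ptr, int n, float scale, float bias)
//   {
//       while (n > 0)
//       {
//           size_t vl = vsetvl_e32m8(n);                                    // lanes this pass can process
//           vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl);
//           _v = vfmadd_vf_f32m8(_v, scale, vfmv_v_f_f32m8(bias, vl), vl);  // v * scale + bias
//           vse32_v_f32m8(ptr, _v, vl);
//           intptr += vl;
//           ptr += vl;
//           n -= vl;
//       }
//   }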
offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = outw * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * 2; + vl = 4; + + top_blob.create(w, outh, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr0 = top_blob.row(i * 2); + float* ptr1 = top_blob.row(i * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
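// Note on the dims == 2 layout above: one elempack-8 input row packs 8 of the
// original rows, but the fp32 output is written as elempack-4 (elemsize 16u),
// so packed input row i is split into output rows 2*i and 2*i+1. The low 4
// lanes use scale_data[i*8 + 0..3] and the high 4 lanes scale_data[i*8 + 4..7]
// (or the single scale when scale_data_size == 1). Scalar picture of one j
// iteration (illustrative only):
//
//   for (int k = 0; k < 4; k++)
//   {
//       ptr0[k] = intptr[k]     * scale[i * 8 + k];      // low half  -> row 2*i
//       ptr1[k] = intptr[4 + k] * scale[i * 8 + 4 + k];  // high half -> row 2*i + 1
//   }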
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * 2; + + top_blob.create(w, h, outc, (size_t)16u, 4, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + + vl = 4; + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + _v2 = vfmul_vv_f32m1(_v2, _scale0, vl); + _v3 = vfmul_vv_f32m1(_v3, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr0 = top_blob.channel(q * 2); + float* ptr1 = top_blob.channel(q * 2 + 1); + vl = 4; + + vfloat32m1_t _scale0 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + vfloat32m1_t _v2 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 8, vl), vl); + vfloat32m1_t _v3 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 12, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + _v2 = vfmadd_vv_f32m1(_v2, _scale0, _bias0, vl); + _v3 = vfmadd_vv_f32m1(_v3, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr0 + 4, _v2, vl); + vse32_v_f32m1(ptr1, _v1, vl); + vse32_v_f32m1(ptr1 + 4, _v3, vl); + + intptr += 16; + ptr0 += 8; + ptr1 += 8; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr + 4, vl), vl); + + _v0 = vfmadd_vv_f32m1(_v0, _scale0, _bias0, vl); + _v1 = vfmadd_vv_f32m1(_v1, _scale1, _bias1, vl); + + vse32_v_f32m1(ptr0, _v0, vl); + vse32_v_f32m1(ptr1, _v1, vl); + + intptr += 8; + ptr0 += 4; + ptr1 += 4; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + if (bias_data_size == 0) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale_data[0], vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + if (bias_data_size == 0) + { + // #pragma omp parallel for num_threads(opt.num_threads) + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else if (bias_data_size == 1) + { + 
int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias_data[0], vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + else + { + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + const int* intptr = (const int*)bottom_blob + offset; + float* ptr = (float*)top_blob + offset; + vfloat32m8_t _scale = vle32_v_f32m8((const float*)scale_data + offset, vl); + vfloat32m8_t _bias = vle32_v_f32m8((const float*)bias_data + offset, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + int n = w * 4; + int offset = 0; + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + i * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = w * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)16u, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + int n = size * 4; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale_m1 = scale_data_size == 1 ? vfmv_v_f_f32m1(scale_data[0], vl) : vle32_v_f32m1((const float*)scale_data + q * 4, vl); + vfloat32m1_t _bias_m1 = bias_data_size == 1 ? 
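// The vset_v_f32m1_f32m8 ladder used in these elempack == 4 paths builds an
// m8 register out of 8 copies of the 4-lane per-pack scale/bias, so a single
// m8 multiply applies the right per-lane factor however many pack-4 groups
// fit in vl. The replication (and the vl = 4 m1 load feeding it) implicitly
// assumes VLEN == 128, i.e. one f32m1 register holds exactly 4 floats:
//
//   // m1: [s0 s1 s2 s3]
//   // m8: [s0 s1 s2 s3 | s0 s1 s2 s3 | ... 8 copies ...]
//   // so lane k of the m8 vector is scaled by s[k % 4]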
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m8_t _scale = vundefined_f32m8(); + _scale = vset_v_f32m1_f32m8(_scale, 0, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 1, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 2, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 3, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 4, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 5, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 6, _scale_m1); + _scale = vset_v_f32m1_f32m8(_scale, 7, _scale_m1); + + vfloat32m8_t _bias = vundefined_f32m8(); + _bias = vset_v_f32m1_f32m8(_bias, 0, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 1, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 2, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 3, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 4, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 5, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 6, _bias_m1); + _bias = vset_v_f32m1_f32m8(_bias, 7, _bias_m1); + + int n = size * 4; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + float* ptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale + bias_data[i]; + } + } + } + else + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i]; + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias; + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + ptr[i] = intptr[i] * scale_data[i] + bias_data[i]; + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + float* ptr = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + int n = w; + int offset = 0; + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmul_vv_f32m8(_v, _scale, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale; \ +// } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + float* ptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + int i = 0; +#if __riscv_vector + int n = size; + int offset = 0; + + while (n > 0) + { + vl = vsetvl_e32m8(n); + vfloat32m8_t _scale = vfmv_v_f_f32m8(scale, vl); + vfloat32m8_t _bias = vfmv_v_f_f32m8(bias, vl); + vfloat32m8_t _v = vfcvt_f_x_v_f32m8(vle32_v_i32m8(intptr + offset, vl), vl); + _v = vfmadd_vv_f32m8(_scale, _v, _bias, vl); + vse32_v_f32m8(ptr + offset, _v, vl); + offset += vl; + n -= vl; + } +#endif // __riscv_vector \ +// for (; i < size; i++) \ +// { \ +// *ptr++ = *intptr++ * scale + bias; \ +// } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/dequantize_riscv.h b/src/layer/riscv/dequantize_riscv.h new file mode 100644 index 000000000000..be7bc6bc90c5 --- /dev/null +++ b/src/layer/riscv/dequantize_riscv.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_DEQUANTIZE_RISCV_H +#define LAYER_DEQUANTIZE_RISCV_H + +#include "dequantize.h" + +namespace ncnn { + +class Dequantize_riscv : public Dequantize +{ +public: + Dequantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_DEQUANTIZE_RISCV_H diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 491c051c7fea..e6a97798a0fb 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -348,7 +348,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt } #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; // packn should be 8 #endif int w = bottom_blob.w; @@ -394,7 +394,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 2) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int i = 0; i < h; i++) @@ -405,7 +405,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -422,7 +422,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt if (dims == 3 || dims == 4) { #if __riscv_vector - if (elempack == packn) // out_elempack == packn + if (elempack == packn) // must add, because in innerproduct, elempack is 8 { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) @@ -433,7 +433,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - size_t vl = vsetvl_e8m1(n); + size_t vl = elempack; vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index accfc683584f..08cddbc151f7 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -1,5 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // +// Copyright (C) 2024 Xinyu302. All rights reserved. // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. 
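// Re the flatten_riscv int8 hunks above: int8 blobs produced by the new
// innerproduct int8 path use elempack 8, while csrr_vlenb() / 1 would give
// packn 16 on a 128-bit VLEN part, so packn is halved and vl is pinned to
// elempack. With vl == elempack, each vle8/vsse8 pair de-interleaves exactly
// one packed element: 8 contiguous bytes are scattered with a stride of w
// (or size) bytes into the 8 planar rows they belong to. Scalar picture
// (illustrative only):
//
//   for (int k = 0; k < elempack; k++)
//       outptr[k * w] = ptr[k];   // lane k of this packed element -> row k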
// // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except @@ -53,7 +54,7 @@ int InnerProduct_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -128,27 +129,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return InnerProduct::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8(bottom_blob, top_blob, opt); } #endif @@ -1090,4 +1071,512 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int InnerProduct_riscv::create_pipeline_int8(const Option& opt) +{ + const int num_input = weight_data_size / num_output; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif + + // src = inch-outch + // dst = pb-inch-outch/pb + { + Mat weight_data_r2 = weight_data.reshape(num_input, num_output); + + weight_data_tm.create(num_input, num_output / out_elempack, (size_t)out_elempack, out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + signed char* g0 = weight_data_tm.row(q / out_elempack); + + for (int p = 0; p < num_input; p++) + { + for (int j = 0; j < out_elempack; j++) + { + *g0++ = weight_data_r2.row(q + j)[p]; + } + } + } + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + weight_data.release(); + + return 0; +} + +int InnerProduct_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + const int num_input = weight_data_size / num_output; + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + if (bottom_blob_int8.dims == 2 && bottom_blob_int8.w == num_input) + { + // gemm + Mat bottom_blob_int8_unpacked; + Option opt_unpack = opt; + opt_unpack.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_int8, bottom_blob_int8_unpacked, 1, opt_unpack); + + int h = bottom_blob_int8_unpacked.h; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = h % 4 == 0 ? 
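// How scale_in_data is used downstream: the int8 GEMM accumulates
// sum = sum_i(x_q[i] * w_q[i]) in int32, where x_q ~ x * s_in and w_q ~ w * s_w,
// so the fp32 result is recovered as sum * scale_in (+ bias) with
// scale_in = 1 / (s_in * s_w). Scalar reference for one output neuron,
// mirroring the vector paths below (illustrative only):
//
//   int sum = 0;
//   for (int i = 0; i < num_input; i++)
//       sum += (int)m[i] * (int)kptr[i];        // int8 x int8 accumulated in int32
//   float out = sum * scale_in_data[p];         // dequantize
//   if (bias_term) out += bias_data[p];
//   out = activation_ss(out, activation_type, activation_params);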
4 : 1; + } +#endif + + int outh = h / out_elempack; + + top_blob.create(num_output, outh, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + int num_output_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + num_output_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif + +#if __riscv_vector + if (num_output_elempack == 8 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + vl = 8; + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum1 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum2 = vmv_v_x_i32m2(0, vl); + vint32m2_t _sum3 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val0 = vmv_v_x_i8m1(m0[0], vl); + vint8m1_t _val1 = vmv_v_x_i8m1(m1[0], vl); + vint8m1_t _val2 = vmv_v_x_i8m1(m2[0], vl); + vint8m1_t _val3 = vmv_v_x_i8m1(m3[0], vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val0, _w, vl), 0); + vint16m1_t _s1 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val1, _w, vl), 0); + vint16m1_t _s2 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val2, _w, vl), 0); + vint16m1_t _s3 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val3, _w, vl), 0); + + _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m2(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m2(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m2(_sum3, _s3, vl); + + m0++; + m1++; + m2++; + m3++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32_0 = vfcvt_f_x_v_f32m2(_sum0, vl); + vfloat32m2_t _sumfp32_1 = vfcvt_f_x_v_f32m2(_sum1, vl); + vfloat32m2_t _sumfp32_2 = vfcvt_f_x_v_f32m2(_sum2, vl); + vfloat32m2_t _sumfp32_3 = vfcvt_f_x_v_f32m2(_sum3, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32_0 = vfmacc_vv_f32m2(_bias, _sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmacc_vv_f32m2(_bias, _sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmacc_vv_f32m2(_bias, _sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmacc_vv_f32m2(_bias, _sumfp32_3, _scale_in, vl); + } + else + { + _sumfp32_0 = vfmul_vv_f32m2(_sumfp32_0, _scale_in, vl); + _sumfp32_1 = vfmul_vv_f32m2(_sumfp32_1, _scale_in, vl); + _sumfp32_2 = vfmul_vv_f32m2(_sumfp32_2, _scale_in, vl); + _sumfp32_3 = vfmul_vv_f32m2(_sumfp32_3, _scale_in, vl); + } + + _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32_1 = activation_ps(_sumfp32_1, activation_type, activation_params, vl); + _sumfp32_2 = activation_ps(_sumfp32_2, activation_type, activation_params, vl); + _sumfp32_3 = activation_ps(_sumfp32_3, activation_type, activation_params, vl); + + vsseg4e32_v_f32m2(outptr, _sumfp32_0, _sumfp32_1, _sumfp32_2, _sumfp32_3, vl); + + outptr += 32; + } + } + } + + if (num_output_elempack == 1 && out_elempack == 4) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed 
char* kptr = weight_data_tm.row(p); + const signed char* m0 = bottom_blob_int8_unpacked.row(j * 4); + const signed char* m1 = bottom_blob_int8_unpacked.row(j * 4 + 1); + const signed char* m2 = bottom_blob_int8_unpacked.row(j * 4 + 2); + const signed char* m3 = bottom_blob_int8_unpacked.row(j * 4 + 3); + + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + int i = 0; + + int n = num_input; + + vl = vsetvlmax_e32m4(); + vint32m4_t _sum0 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum2 = vmv_v_x_i32m4(0, vl); + vint32m4_t _sum3 = vmv_v_x_i32m4(0, vl); + + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val0 = vle8_v_i8m1(m0, vl); + vint8m1_t _val1 = vle8_v_i8m1(m1, vl); + vint8m1_t _val2 = vle8_v_i8m1(m2, vl); + vint8m1_t _val3 = vle8_v_i8m1(m3, vl); + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s0 = vwmul_vv_i16m2(_val0, _w, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_val1, _w, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_val2, _w, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_val3, _w, vl); + + _sum0 = vwadd_wv_i32m4(_sum0, _s0, vl); + _sum1 = vwadd_wv_i32m4(_sum1, _s1, vl); + _sum2 = vwadd_wv_i32m4(_sum2, _s2, vl); + _sum3 = vwadd_wv_i32m4(_sum3, _s3, vl); + + m0 += vl; + m1 += vl; + m2 += vl; + m3 += vl; + + kptr += vl; + n -= vl; + } + + vint32m1_t _sum0_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum1_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum2_scala = vmv_v_x_i32m1(0, vl); + vint32m1_t _sum3_scala = vmv_v_x_i32m1(0, vl); + + vl = vsetvlmax_e32m4(); + _sum0_scala = vredsum_vs_i32m4_i32m1(_sum0_scala, _sum0, _sum0_scala, vl); + _sum1_scala = vredsum_vs_i32m4_i32m1(_sum1_scala, _sum1, _sum1_scala, vl); + _sum2_scala = vredsum_vs_i32m4_i32m1(_sum2_scala, _sum2, _sum2_scala, vl); + _sum3_scala = vredsum_vs_i32m4_i32m1(_sum3_scala, _sum3, _sum3_scala, vl); + sum0 = vmv_x_s_i32m1_i32(_sum0_scala); + sum1 = vmv_x_s_i32m1_i32(_sum1_scala); + sum2 = vmv_x_s_i32m1_i32(_sum2_scala); + sum3 = vmv_x_s_i32m1_i32(_sum3_scala); + + // dequantize and relu + float sumfp32_0 = sum0 * scale_in_data[p]; + float sumfp32_1 = sum1 * scale_in_data[p]; + float sumfp32_2 = sum2 * scale_in_data[p]; + float sumfp32_3 = sum3 * scale_in_data[p]; + + if (bias_term) + { + sumfp32_0 += bias_data[p]; + sumfp32_1 += bias_data[p]; + sumfp32_2 += bias_data[p]; + sumfp32_3 += bias_data[p]; + } + + outptr[0] = activation_ss(sumfp32_0, activation_type, activation_params); + outptr[1] = activation_ss(sumfp32_1, activation_type, activation_params); + outptr[2] = activation_ss(sumfp32_2, activation_type, activation_params); + outptr[3] = activation_ss(sumfp32_3, activation_type, activation_params); + outptr += 4; + } + } + } + + if (num_output_elempack == 8 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output / num_output_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + vint32m2_t _sum = vmv_v_x_i32m2(0, vl); + + int i = 0; + + vl = 8; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(m[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + // int8x8_t _val = vld1_dup_s8(m); + // int8x8_t _w = vld1_s8(kptr); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(_s, 0); + + _sum = vwadd_wv_i32m2(_sum, _s0, vl); + + m++; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = 
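// The int8 dot products in these paths widen twice: vwmul_vv takes
// int8 x int8 -> int16, and vwadd_wv folds the int16 partial products into an
// int32 accumulator, so nothing saturates (each product is at most 127 * 127,
// and an int32 lane can accumulate hundreds of thousands of them). In the
// per-output-scalar paths, vredsum_vs then collapses the int32 lanes into the
// scalar sum that gets dequantized. Per-lane picture for one strip
// (illustrative only):
//
//   s16[k]  = (int16_t)val[k] * (int16_t)w[k];   // vwmul_vv_i16m2
//   acc[k] += (int32_t)s16[k];                   // vwadd_wv_i32m4
//   // after the loop: sum = sum_k acc[k]        // vredsum_vs_i32m4_i32m1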
vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + } + + // _sumfp32_0 = activation_ps(_sumfp32_0, activation_type, activation_params, vl); + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + vse32_v_f32m2(outptr, _sumfp32, vl); + outptr += 8; + } + } + } +#endif // __riscv_vector + + if (num_output_elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int j = 0; j < outh; j++) + { + float* outptr = top_blob.row(j); + + for (int p = 0; p < num_output; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* m = bottom_blob_int8_unpacked.row(j); + + int sum = 0; + + int i = 0; +#if __riscv_vector + + int n = num_input; + vint32m4_t _sum = vmv_v_x_i32m4(0, vsetvlmax_e32m4()); + while (n > 0) + { + vl = vsetvl_e32m4(n); + vint8m1_t _val = vle8_v_i8m1(m, vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m2_t _s = vwmul_vv_i16m2(_val, _w, vl); + _sum = vwadd_wv_i32m4(_sum, _s, vl); + + m += vl; + kptr += vl; + n -= vl; + } + + vint32m1_t _sum_scala = vmv_v_x_i32m1(0, vl); + _sum_scala = vredsum_vs_i32m4_i32m1(_sum_scala, _sum, _sum_scala, vl); + sum = vmv_x_s_i32m1_i32(_sum_scala); + +#endif // __riscv_vector + + // for (; i < num_input; i++) \ +// { \ +// sum += *m++ * *kptr++; \ +// } + + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + + outptr[0] = activation_ss(sumfp32, activation_type, activation_params); + outptr += 1; + } + } + } + + return 0; + } + + Mat bottom_blob_int8_flattened = bottom_blob_int8; + if (bottom_blob_int8.dims != 1) + { + Option opt_flatten = opt; + opt_flatten.blob_allocator = opt.workspace_allocator; + flatten->forward(bottom_blob_int8, bottom_blob_int8_flattened, opt_flatten); + } + + // int elempack = bottom_blob_int8_flattened.elempack; + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 
8 : 1; + } +#endif + + top_blob.create(num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + +#if __riscv_vector + if (out_elempack == 8) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + vl = 8; + + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + + int i = 0; + for (; i < num_input; i++) + { + vint8m1_t _val = vmv_v_x_i8m1(sptr[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + + vint16m1_t _s = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum0 = vwadd_wv_i32m2(_sum0, _s, vl); + + sptr += 1; + kptr += 8; + } + + // dequantize and relu + vfloat32m2_t _scale_in = vle32_v_f32m2((const float*)scale_in_data + p * 8, vl); + + vfloat32m2_t _sumfp32 = vfcvt_f_x_v_f32m2(_sum0, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + p * 8, vl); + _sumfp32 = vfmacc_vv_f32m2(_bias, _sumfp32, _scale_in, vl); + } + else + { + _sumfp32 = vfmul_vv_f32m2(_sumfp32, _scale_in, vl); + } + + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + float* outptr = (float*)top_blob + p * 8; + vse32_v_f32m2(outptr, _sumfp32, vl); + } + } +#endif // __riscv_vector + + if (out_elempack == 1) + { + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = 0; p < num_output / out_elempack; p++) + { + const signed char* kptr = weight_data_tm.row(p); + const signed char* sptr = bottom_blob_int8_flattened; + + int sum = 0; + + int i = 0; + for (; i < num_input; i++) + { + signed char val = sptr[0]; + signed char w = kptr[0]; + + sum += val * w; + + sptr += 1; + kptr += 1; + } + // dequantize and relu + float sumfp32 = sum * scale_in_data[p]; + + if (bias_term) + sumfp32 += bias_data[p]; + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + top_blob[p] = sumfp32; + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/innerproduct_riscv.h b/src/layer/riscv/innerproduct_riscv.h index d3056d5801d0..77a3b93de12a 100644 --- a/src/layer/riscv/innerproduct_riscv.h +++ b/src/layer/riscv/innerproduct_riscv.h @@ -36,6 +36,11 @@ class InnerProduct_riscv : public InnerProduct int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if __riscv_vector + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* flatten; @@ -43,6 +48,10 @@ class InnerProduct_riscv : public InnerProduct // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 50f5efe1216d..60465da64ffc 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -15,7 +15,8 @@ #define _PADDING_PACKN_RVV(SEW, TSEW, LMUL, T, VT) \ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -64,7 +65,8 @@ \ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, 
int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ @@ -143,7 +145,8 @@ \ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ - const int packn = csrr_vlenb() / sizeof(T); \ + int packn = csrr_vlenb() / sizeof(T); \ + if (packn > 8) packn = 8; \ const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index 8f4b54da5904..2e2d7471f477 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -510,7 +510,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { #if __riscv_vector - const int packn = csrr_vlenb() / 1; + const int packn = csrr_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); #endif diff --git a/src/layer/riscv/quantize_riscv.cpp b/src/layer/riscv/quantize_riscv.cpp new file mode 100644 index 000000000000..89d82106aa04 --- /dev/null +++ b/src/layer/riscv/quantize_riscv.cpp @@ -0,0 +1,1529 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "quantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +#include "cpu.h" + +namespace ncnn { + +Quantize_riscv::Quantize_riscv() +{ +#if __riscv_vector + support_packing = true; + +#if __riscv_zfh + support_fp16_storage = true; +#endif // __riscv_zfh +#endif // __riscv_vector +} + +int Quantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); +#if __riscv_vector && __riscv_zfh + if (support_fp16_storage && opt.use_fp16_storage && elembits == 16) + { + if (opt.use_fp16_arithmetic) + return forward_fp16sa(bottom_blob, top_blob, opt); + else + return forward_fp16s(bottom_blob, top_blob, opt); + } +#endif // __riscv_vector && __riscv_zfh + + int vl = vsetvlmax_e32m1(); + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + +#if __riscv_vector + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const float* ptr0 = (const float*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + // vfloat32m1_t _scale = vfmv_v_f_f32m1(scale_data[0]); + // float32x4_t _scale = vdupq_n_f32(scale_data[0]); + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const float* ptr0 = bottom_blob.row(i * 2); + const float* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + 
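// float2int8 is used in three flavours in this file: the scalar helper
// (round to nearest, saturate to [-127, 127]), a two-vector form packing
// 2 x 4 fp32 lanes into one int64 of 8 int8s (hence the *(int64_t*)outptr
// stores), and a four-vector form returning a vint8m1_t of 16 results.
// A scalar reference matching the element-wise behaviour (the name is
// illustrative, not ncnn API):
//
//   static inline signed char float2int8_ref(float v)
//   {
//       int i32 = (int)roundf(v);     // round to nearest
//       if (i32 > 127) i32 = 127;     // saturate to int8 range used by ncnn
//       if (i32 < -127) i32 = -127;
//       return (signed char)i32;
//   }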
outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + int i = 0; + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr0 + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr1, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr1 + 4, vl); + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v2, _v1, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + ptr0 += 8; + ptr1 += 8; + outptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const float* ptr0 = bottom_blob.channel(q * 2); + const float* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _vlow = vle32_v_f32m1(ptr0, vl); + vfloat32m1_t _vhigh = vle32_v_f32m1(ptr1, vl); + + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = 
bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const float* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const float* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + int i = 0; +#if __riscv_vector + float _scale = scale; + + for (; i + 15 < size; i += 16) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + vfloat32m1_t _v2 = vle32_v_f32m1(ptr + 8, vl); + vfloat32m1_t _v3 = vle32_v_f32m1(ptr + 12, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + _v2 = vfmul_vf_f32m1(_v2, _scale, vl); + _v3 = vfmul_vf_f32m1(_v3, _scale, vl); + + vint8m1_t _v = float2int8(_v0, _v1, _v2, _v3); + vse8_v_i8m1(outptr, _v, 4 * vl); + + ptr += 16; + outptr += 16; + } + for (; i + 7 < size; i += 8) + { + vfloat32m1_t _v0 = vle32_v_f32m1(ptr, vl); + vfloat32m1_t _v1 = vle32_v_f32m1(ptr + 4, vl); + + _v0 = vfmul_vf_f32m1(_v0, _scale, vl); + _v1 = vfmul_vf_f32m1(_v1, _scale, vl); + + int64_t _v = float2int8(_v0, _v1); + *(int64_t*)outptr = _v; + ptr += 8; + outptr += 8; + } +#endif // __riscv_vector + for (; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +#if __riscv_vector && __riscv_zfh + +int Quantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + } + } + } + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + i * 8, vl); + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + vfloat32m2_t _scale = vfmv_v_f_f32m2(scale, vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + vfloat32m2_t _scale = vle32_v_f32m2((const float*)scale_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v0 = vle16_v_f16m1(ptr0, vl); + vfloat32m2_t _v = vfwcvt_f_f_v_f32m2(_v0, vl); + _v = vfmul_vv_f32m2(_v, _scale, vl); + *(int64_t*)outptr = float2int8(vget_v_f32m2_f32m1(_v, 0), vget_v_f32m2_f32m1(_v, 1)); + ptr0 += 8; + outptr += 8; + } + } + } + } + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 
8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale); + outptr[1] = float2int8((float)ptr0[1] * scale); + outptr[2] = float2int8((float)ptr0[2] * scale); + outptr[3] = float2int8((float)ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8((float)ptr0[0] * scale_data[i * 4]); + outptr[1] = float2int8((float)ptr0[1] * scale_data[i * 4 + 1]); + outptr[2] = float2int8((float)ptr0[2] * scale_data[i * 4 + 2]); + outptr[3] = float2int8((float)ptr0[3] * scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + vl = 4; + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const __fp16* ptr0 = bottom_blob.row(i * 2); + const __fp16* ptr1 = bottom_blob.row(i * 2 + 1); + signed char* outptr = top_blob.row(i); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + i * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 
0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const float s0 = scale_data[i * 4]; + const float s1 = scale_data[i * 4 + 1]; + const float s2 = scale_data[i * 4 + 2]; + const float s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (scale_data_size == 1) + { + float _scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vf_f32m1(_vlow, _scale, vl); + _vhigh = vfmul_vf_f32m1(_vhigh, _scale, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q * 2); + const __fp16* ptr1 = bottom_blob.channel(q * 2 + 1); + signed char* outptr = top_blob.channel(q); + + vl = 4; + vfloat32m1_t _scale0 = vle32_v_f32m1((const float*)scale_data + q * 8, vl); + vfloat32m1_t _scale1 = vle32_v_f32m1((const float*)scale_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _vlow = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr0, vl), vl), 0); + vfloat32m1_t _vhigh = vget_v_f32m2_f32m1(vfwcvt_f_f_v_f32m2(vle16_v_f16m1(ptr1, vl), vl), 0); + _vlow = vfmul_vv_f32m1(_vlow, _scale0, vl); + _vhigh = vfmul_vv_f32m1(_vhigh, _scale1, vl); + int64_t _v = float2int8(_vlow, _vhigh); + *(int64_t*)outptr = _v; + + ptr0 += 4; + ptr1 += 4; + outptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 
1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * scale); + outptr1[0] = float2int8((float)ptr0[1] * scale); + outptr2[0] = float2int8((float)ptr0[2] * scale); + outptr3[0] = float2int8((float)ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const float s0 = scale_data[q * 4]; + const float s1 = scale_data[q * 4 + 1]; + const float s2 = scale_data[q * 4 + 2]; + const float s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8((float)ptr0[0] * s0); + outptr1[0] = float2int8((float)ptr0[1] * s1); + outptr2[0] = float2int8((float)ptr0[2] * s2); + outptr3[0] = float2int8((float)ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const float scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8((float)ptr[i] * scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const float scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8((float)*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const float scale = scale_data_size == 1 ? 
scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8((float)*ptr++ * scale); + } + } + } + + return 0; +} + +int Quantize_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + + if (elempack == 8) + { + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 8; + signed char* outptr = (signed char*)top_blob + i * 8; + + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); + + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr = float2int8(_v); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + for (int j = 0; j < w; j++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + i * 8, vl), vl); + + for (int j = 0; j < w; j++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + __fp16 _scale = (__fp16)scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + for (int i = 0; i < size; i++) + { + vl = 8; + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vf_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q); + + vl = 8; + vfloat16m1_t _scale = vfncvt_f_f_w_f16m1(vle32_v_f32m2((const float*)scale_data + q * 
8, vl), vl); + + for (int i = 0; i < size; i++) + { + vfloat16m1_t _v = vle16_v_f16m1(ptr0, vl); + _v = vfmul_vv_f16m1(_v, _scale, vl); + *(int64_t*)outptr0 = float2int8(_v); + + ptr0 += 8; + outptr0 += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + if (dims == 1) + { + int w = bottom_blob.w; + int outw = w * elempack; + + top_blob.create(outw, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * scale); + outptr[1] = float2int8(ptr0[1] * scale); + outptr[2] = float2int8(ptr0[2] * scale); + outptr[3] = float2int8(ptr0[3] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const __fp16* ptr0 = (const __fp16*)bottom_blob + i * 4; + signed char* outptr = (signed char*)top_blob + i * 4; + + outptr[0] = float2int8(ptr0[0] * (__fp16)scale_data[i * 4]); + outptr[1] = float2int8(ptr0[1] * (__fp16)scale_data[i * 4 + 1]); + outptr[2] = float2int8(ptr0[2] * (__fp16)scale_data[i * 4 + 2]); + outptr[3] = float2int8(ptr0[3] * (__fp16)scale_data[i * 4 + 3]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int outh = h * elempack; + + top_blob.create(w, outh, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i * 4); + signed char* outptr1 = top_blob.row(i * 4 + 1); + signed char* outptr2 = top_blob.row(i * 4 + 2); + signed char* outptr3 = top_blob.row(i * 4 + 3); + + const __fp16 s0 = scale_data[i * 4]; + const __fp16 s1 = scale_data[i * 4 + 1]; + const __fp16 s2 = scale_data[i * 4 + 2]; + const __fp16 s3 = scale_data[i * 4 + 3]; + + for (int j = 0; j < w; j++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = channels * elempack; + + top_blob.create(w, h, outc, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* 
ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * scale); + outptr1[0] = float2int8(ptr0[1] * scale); + outptr2[0] = float2int8(ptr0[2] * scale); + outptr3[0] = float2int8(ptr0[3] * scale); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr0 = bottom_blob.channel(q); + signed char* outptr0 = top_blob.channel(q * 4); + signed char* outptr1 = top_blob.channel(q * 4 + 1); + signed char* outptr2 = top_blob.channel(q * 4 + 2); + signed char* outptr3 = top_blob.channel(q * 4 + 3); + + const __fp16 s0 = scale_data[q * 4]; + const __fp16 s1 = scale_data[q * 4 + 1]; + const __fp16 s2 = scale_data[q * 4 + 2]; + const __fp16 s3 = scale_data[q * 4 + 3]; + + for (int i = 0; i < size; i++) + { + outptr0[0] = float2int8(ptr0[0] * s0); + outptr1[0] = float2int8(ptr0[1] * s1); + outptr2[0] = float2int8(ptr0[2] * s2); + outptr3[0] = float2int8(ptr0[3] * s3); + + ptr0 += 4; + outptr0 += 1; + outptr1 += 1; + outptr2 += 1; + outptr3 += 1; + } + } + } + } + + return 0; + } + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const __fp16* ptr = bottom_blob; + signed char* outptr = top_blob; + + if (scale_data_size == 1) + { + const __fp16 scale = scale_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * scale); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + outptr[i] = float2int8(ptr[i] * (__fp16)scale_data[i]); + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const __fp16* ptr0 = bottom_blob.row(i); + signed char* outptr0 = top_blob.row(i); + + const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[i]; + + for (int j = 0; j < w; j++) + { + *outptr0++ = float2int8(*ptr0++ * scale); + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const __fp16* ptr = bottom_blob.channel(q); + signed char* outptr = top_blob.channel(q); + + const __fp16 scale = scale_data_size == 1 ? scale_data[0] : scale_data[q]; + + for (int i = 0; i < size; i++) + { + *outptr++ = float2int8(*ptr++ * scale); + } + } + } + + return 0; +} + +#endif // __riscv_vector && __riscv_zfh + +} // namespace ncnn diff --git a/src/layer/riscv/quantize_riscv.h b/src/layer/riscv/quantize_riscv.h new file mode 100644 index 000000000000..0eb90aed7e5b --- /dev/null +++ b/src/layer/riscv/quantize_riscv.h @@ -0,0 +1,37 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2018 THL A29 Limited, a Tencent company. 
All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_QUANTIZE_RISCV_H +#define LAYER_QUANTIZE_RISCV_H + +#include "quantize.h" + +namespace ncnn { + +class Quantize_riscv : public Quantize +{ +public: + Quantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + +#if __riscv_vector && __riscv_zfh + int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif // __riscv_vector && __riscv_zfh +}; + +} // namespace ncnn + +#endif // LAYER_QUANTIZE_RISCV_H diff --git a/src/layer/riscv/requantize_leakyrelu_pack4.h b/src/layer/riscv/requantize_leakyrelu_pack4.h new file mode 100644 index 000000000000..fd5c4dfd93d2 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack4.h @@ -0,0 +1,263 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void requantize_leakyrelu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v02, _v12, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v03, _v13, _slope); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v00, _v10, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v01, _v11, _slope); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v0, _v1, _slope); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? 
vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + vfloat32m1_t _slope = vfmv_v_f_f32m1(slope, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8leakyrelu(_v, _slope); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_leakyrelu_pack8.h b/src/layer/riscv/requantize_leakyrelu_pack8.h new file mode 100644 index 000000000000..87991ad94394 --- /dev/null +++ b/src/layer/riscv/requantize_leakyrelu_pack8.h @@ -0,0 +1,160 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
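The leakyrelu pack8 kernel that follows relies on the folding already noted in the comments of these headers: because leaky_relu(s * x) = s * leaky_relu(x) for any positive s, the output scale can be multiplied into both the input scale and the bias ahead of the loop, leaving one fused multiply-add plus the activation per element. A minimal scalar sketch of that identity, assuming a float2int8_scalar helper that rounds and saturates like ncnn's float2int8 (the helper names are illustrative, not part of this patch):

#include <algorithm>
#include <cmath>
#include <cstdint>

// assumed stand-in for ncnn's float2int8: round to nearest, saturate to [-127, 127]
static inline int8_t float2int8_scalar(float v)
{
    int r = (int)std::nearbyintf(v);
    return (int8_t)std::min(std::max(r, -127), 127);
}

// reference requantize of one int32 value with a fused leaky relu
static inline int8_t requantize_leakyrelu_ref(int v, float scale_in, float bias,
                                              float scale_out, float slope)
{
    // naive form: int8(leaky_relu(v * scale_in + bias) * scale_out)
    float x = v * scale_in + bias;
    float naive = (x > 0.f ? x : x * slope) * scale_out;

    // folded form used by the vector kernels:
    // y = v * (scale_in * scale_out) + bias * scale_out, then leaky relu on y
    float y = v * (scale_in * scale_out) + bias * scale_out;
    float folded = y > 0.f ? y : y * slope;

    (void)naive; // both forms agree whenever scale_out > 0
    return float2int8_scalar(folded);
}

The vector kernel below performs exactly this pre-folding once per channel (_scale0 = _scale_in0 * _scale_out0, _bias0 = _bias0 * _scale_out0) and then streams the plane.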
+ +static void requantize_leakyrelu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, float slope, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + + vfloat32m2_t _slope = vfmv_v_f_f32m2(slope, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + *(int64_t*)(ptr + 16) = float2int8leakyrelu(_v45, _slope); + *(int64_t*)(ptr + 24) = float2int8leakyrelu(_v67, _slope); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + *(int64_t*)(ptr + 8) = float2int8leakyrelu(_v23, _slope); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8leakyrelu(_v01, _slope); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack4.h b/src/layer/riscv/requantize_relu_pack4.h new file mode 100644 index 000000000000..ca5285de57c5 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack4.h @@ -0,0 +1,259 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
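Like the leakyrelu variant above, the pack4 ReLU kernel below handles two output layouts: with out_elempack == 8 it interleaves two pack4 input channels into one pack8 output channel, and with out_elempack == 1 it scatters the four lanes of each pack4 result into four planar channels by unpacking the packed 32-bit value returned by float2int8relu. A small sketch of that byte unpacking, assuming lane 0 sits in the lowest byte as the shift pattern in these kernels implies (the helper name is illustrative):

#include <cstdint>

// Illustrative only: split a packed result (four int8 lanes in one 32-bit int,
// lane 0 in the lowest byte) across four planar output channels.
static inline void scatter_pack4_to_planar(int res,
                                           signed char* ptr0, signed char* ptr1,
                                           signed char* ptr2, signed char* ptr3)
{
    ptr0[0] = (signed char)(res & 0xff);
    ptr1[0] = (signed char)((res >> 8) & 0xff);
    ptr2[0] = (signed char)((res >> 16) & 0xff);
    ptr3[0] = (signed char)((res >> 24) & 0xff);
}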
+ +static void requantize_relu_pack4_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int vl = 4; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int outc = top_blob.c; + int out_elempack = top_blob.elempack; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + _v00 = vfmul_vv_f32m1(_v00, _scale0, vl); + _v01 = vfmul_vv_f32m1(_v01, _scale0, vl); + _v02 = vfmul_vv_f32m1(_v02, _scale0, vl); + _v03 = vfmul_vv_f32m1(_v03, _scale0, vl); + _v10 = vfmul_vv_f32m1(_v10, _scale1, vl); + _v11 = vfmul_vv_f32m1(_v11, _scale1, vl); + _v12 = vfmul_vv_f32m1(_v12, _scale1, vl); + _v13 = vfmul_vv_f32m1(_v13, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale1, vl); + *(int64_t*)ptr = float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 
= bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + + vfloat32m1_t _scale0 = vfmul_vv_f32m1(_scale_in0, _scale_out0, vl); + vfloat32m1_t _scale1 = vfmul_vv_f32m1(_scale_in1, _scale_out1, vl); + _bias0 = vfmul_vv_f32m1(_bias0, _scale_out0, vl); + _bias1 = vfmul_vv_f32m1(_bias1, _scale_out1, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v02 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 8, vl), vl); + vfloat32m1_t _v03 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 12, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + vfloat32m1_t _v12 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 8, vl), vl); + vfloat32m1_t _v13 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 12, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v02 = vfmacc_vv_f32m1(_bias0, _v02, _scale0, vl); + _v03 = vfmacc_vv_f32m1(_bias0, _v03, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + _v12 = vfmacc_vv_f32m1(_bias1, _v12, _scale1, vl); + _v13 = vfmacc_vv_f32m1(_bias1, _v13, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + *(int64_t*)(ptr + 16) = float2int8relu(_v02, _v12); + *(int64_t*)(ptr + 24) = float2int8relu(_v03, _v13); + + intptr0 += 16; + intptr1 += 16; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m1_t _v00 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v01 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0 + 4, vl), vl); + vfloat32m1_t _v10 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + vfloat32m1_t _v11 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1 + 4, vl), vl); + + _v00 = vfmacc_vv_f32m1(_bias0, _v00, _scale0, vl); + _v01 = vfmacc_vv_f32m1(_bias0, _v01, _scale0, vl); + _v10 = vfmacc_vv_f32m1(_bias1, _v10, _scale1, vl); + _v11 = vfmacc_vv_f32m1(_bias1, _v11, _scale1, vl); + + *(int64_t*)ptr = float2int8relu(_v00, _v10); + *(int64_t*)(ptr + 8) = float2int8relu(_v01, _v11); + + intptr0 += 8; + intptr1 += 8; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale1, vl); + + *(int64_t*)ptr = 
float2int8relu(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale, vl); + + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + vfloat32m1_t _scale = vfmul_vv_f32m1(_scale_in, _scale_out, vl); + _bias = vfmul_vv_f32m1(_bias, _scale_out, vl); + + int i = 0; + for (; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale, vl); + int res = float2int8relu(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } +} diff --git a/src/layer/riscv/requantize_relu_pack8.h b/src/layer/riscv/requantize_relu_pack8.h new file mode 100644 index 000000000000..e3f18dbb98a3 --- /dev/null +++ b/src/layer/riscv/requantize_relu_pack8.h @@ -0,0 +1,155 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
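The pack8 ReLU kernel that follows hoists every per-channel constant out of the inner loop: the effective scale (scale_in * scale_out) and the pre-scaled bias (bias * scale_out) are computed once per channel, and the plane is then walked with the loop unrolled by 4, 2 and 1 vectors of 8 floats. A scalar reference of that per-channel structure, assuming a float2int8_relu_scalar helper that clamps negatives to zero before rounding (both names here are illustrative):

#include <algorithm>
#include <cmath>
#include <cstdint>

// assumed scalar equivalent of float2int8relu: relu, round, saturate to 127
static inline int8_t float2int8_relu_scalar(float v)
{
    float r = v > 0.f ? v : 0.f;
    return (int8_t)std::min((int)std::nearbyintf(r), 127);
}

// reference per-channel requantize with fused ReLU (pack layout ignored)
static void requantize_relu_channel_ref(const int* intptr, signed char* ptr, int size,
                                        float scale_in, float scale_out, float bias)
{
    // folded once per channel, which is what the vector kernel precomputes
    const float scale = scale_in * scale_out;
    const float bias_out = bias * scale_out;

    for (int i = 0; i < size; i++)
    {
        ptr[i] = float2int8_relu_scalar(intptr[i] * scale + bias_out);
    }
}

Only the folding matters for correctness; the 4/2/1 unroll simply trades register pressure for fewer loop iterations.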
+ +static void requantize_relu_pack8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& scale_in_data, const Mat& scale_out_data, const Mat& bias_data, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + int scale_in_data_size = scale_in_data.w; + int scale_out_data_size = scale_out_data.w; + int bias_data_size = bias_data.w; + int vl = 8; + + // int8(relu(v * scale_in) * scale_out) + // int8_relu(v * (scale_in * scale_out)) + + // int8(relu(v * scale_in + bias) * scale_out) + // int8_relu(v * (scale_in * scale_out) + (bias * scale_out)) + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + _v45 = vfmul_vv_f32m2(_v45, _scale0, vl); + _v67 = vfmul_vv_f32m2(_v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + _v23 = vfmul_vv_f32m2(_v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmul_vv_f32m2(_v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + vfloat32m2_t _scale0 = vfmul_vv_f32m2(_scale_in0, _scale_out0, vl); + _bias0 = vfmul_vv_f32m2(_bias0, _scale_out0, vl); + int i = 0; + for (; i + 3 < size; i += 4) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + vfloat32m2_t _v45 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 16, vl), vl); + vfloat32m2_t _v67 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 24, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + _v45 = vfmacc_vv_f32m2(_bias0, _v45, _scale0, vl); + _v67 = vfmacc_vv_f32m2(_bias0, _v67, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + *(int64_t*)(ptr + 16) = float2int8relu(_v45); + *(int64_t*)(ptr + 24) = float2int8relu(_v67); + + intptr += 32; + ptr += 32; + } + for (; i + 1 < size; i += 2) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + vfloat32m2_t _v23 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr + 8, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + _v23 = vfmacc_vv_f32m2(_bias0, _v23, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + *(int64_t*)(ptr + 8) = float2int8relu(_v23); + + intptr += 16; + ptr += 16; + } + for (; i < size; i++) + { + vfloat32m2_t _v01 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + + _v01 = vfmacc_vv_f32m2(_bias0, _v01, _scale0, vl); + + *(int64_t*)ptr = float2int8relu(_v01); + + intptr += 8; + ptr += 8; + } + } + } +} diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp new file mode 100644 index 000000000000..81e0e9aaa8da --- /dev/null +++ b/src/layer/riscv/requantize_riscv.cpp @@ -0,0 +1,1240 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
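The pack8 ReLU path above folds the two scales into one, as the comments in requantize_relu_pack8_rvv note: int8(relu(v * scale_in + bias) * scale_out) equals int8_relu(v * (scale_in * scale_out) + bias * scale_out) whenever scale_out is positive, so the kernel multiplies once per element and clamps at zero after the narrowing conversion. A minimal scalar sketch of both forms follows; it is illustrative only, and the names requantize_scalar and requantize_relu_folded are not part of this patch or of ncnn.

#include <math.h>

// Reference form: int8(relu(v * scale_in + bias) * scale_out), clamped to [-127, 127]
// in the same way as the float2int8(float) helper added in riscv_usability.h.
static inline signed char requantize_scalar(int v, float scale_in, float bias, float scale_out)
{
    float x = v * scale_in + bias;
    if (x < 0.f) x = 0.f; // ReLU (activation_type == 1)
    x *= scale_out;
    int q = (int)roundf(x);
    if (q > 127) q = 127;
    if (q < -127) q = -127;
    return (signed char)q;
}

// Folded form used by requantize_relu_pack8_rvv:
// int8_relu(v * (scale_in * scale_out) + bias * scale_out)
static inline signed char requantize_relu_folded(int v, float scale_in, float bias, float scale_out)
{
    const float scale = scale_in * scale_out; // computed once per channel in the vector kernel
    const float b = bias * scale_out;
    int q = (int)roundf(v * scale + b);
    if (q < 0) q = 0;     // ReLU moved after the conversion (valid because scale_out > 0)
    if (q > 127) q = 127; // saturate to the int8 output range
    return (signed char)q;
}

The vector kernels compute the same arithmetic eight lanes at a time with vfmacc_vv_f32m2/vfmul_vv_f32m2, then narrow through int16 to int8 with vnclip and apply the zero clamp with vmax inside the float2int8relu helpers.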
+ +#include "requantize_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_activation.h" +#include "riscv_usability.h" + +namespace ncnn { + +#if __riscv_vector +#include "requantize_leakyrelu_pack4.h" +#include "requantize_leakyrelu_pack8.h" +#include "requantize_relu_pack4.h" +#include "requantize_relu_pack8.h" +#endif // __riscv_vector + +Requantize_riscv::Requantize_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector +} + +int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + int elempack = bottom_blob.elempack; + int vl; + +#if __riscv_vector + if (elempack == 8) + { + vl = 8; + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m2_t _scale_in = vfmv_v_f_f32m2(scale_in_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m2_t _scale_out = vfmv_v_f_f32m2(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? 
vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else if (bias_data_size == 1) + { + vfloat32m2_t _bias = vfmv_v_f_f32m2(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 8; + signed char* ptr = (signed char*)top_blob + i * 8; + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? 
vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + i * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + i * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + i * 8, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)8u, 8, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack8_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? 
vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + vfloat32m2_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m2(scale_in_data[0], vl) : vle32_v_f32m2((const float*)scale_in_data + q * 8, vl); + vfloat32m2_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m2(scale_out_data[0], vl) : vle32_v_f32m2((const float*)scale_out_data + q * 8, vl); + vfloat32m2_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m2(bias_data[0], vl) : vle32_v_f32m2((const float*)bias_data + q * 8, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m2_t _v0 = vfcvt_f_x_v_f32m2(vle32_v_i32m2(intptr, vl), vl); + _v0 = vfmacc_vv_f32m2(_bias0, _v0, _scale_in0, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m2(_v0, _scale_out0, vl); + *(int64_t*)ptr = float2int8(_v0); + + intptr += 8; + ptr += 8; + } + } + } + } + + return 0; + } + + if (elempack == 4) + { + vl = 4; + if (dims == 1) + { + int w = bottom_blob.w; + int out_elempack = opt.use_packing_layout && w * elempack % 8 == 0 ? 8 : 1; + int outw = w * elempack / out_elempack; + + top_blob.create(outw, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
+ *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + vfloat32m1_t _scale_in = vfmv_v_f_f32m1(scale_in_data[0], vl); + // float32x4_t _scale_in = vdupq_n_f32(scale_in_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + vfloat32m1_t _scale_out = vfmv_v_f_f32m1(scale_out_data[0], vl); + // float32x4_t _scale_out = vdupq_n_f32(scale_out_data[0]); + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + // float32x4_t _bias = vdupq_n_f32(bias_data[0]); + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); 
+ *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else if (bias_data_size == 1) + { + vfloat32m1_t _bias = vfmv_v_f_f32m1(bias_data[0], vl); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + const int* intptr = (const int*)bottom_blob + i * 4; + signed char* ptr = (signed char*)top_blob + i * 4; + + vfloat32m1_t _scale_in = vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = vle32_v_f32m1((const float*)bias_data + i * 4, vl); + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + *(int32_t*)ptr = float2int8(_v); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int out_elempack = opt.use_packing_layout && h * elempack % 8 == 0 ? 
8 : 1; + int outh = h * elempack / out_elempack; + + top_blob.create(w, outh, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* intptr0 = bottom_blob.row(i * 2); + const int* intptr1 = bottom_blob.row(i * 2 + 1); + signed char* ptr = top_blob.row(i); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 8 + 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr0 = top_blob.row(i * 4); + signed char* ptr1 = top_blob.row(i * 4 + 1); + signed char* ptr2 = top_blob.row(i * 4 + 2); + signed char* ptr3 = top_blob.row(i * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + i * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + i * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + i * 4, vl); + + for (int j = 0; j < w; j++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + int out_elempack = opt.use_packing_layout && channels * elempack % 8 == 0 ? 
8 : 1; + int outc = channels * elempack / out_elempack; + + top_blob.create(w, h, outc, (size_t)out_elempack, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (activation_type == 1) + { + requantize_relu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, opt); + return 0; + } + + if (activation_type == 2 && activation_params[0] > 0.f) + { + requantize_leakyrelu_pack4_rvv(bottom_blob, top_blob, scale_in_data, scale_out_data, bias_data, activation_params[0], opt); + return 0; + } + + if (out_elempack == 8) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_in0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* intptr0 = bottom_blob.channel(q * 2); + const int* intptr1 = bottom_blob.channel(q * 2 + 1); + signed char* ptr = top_blob.channel(q); + + vfloat32m1_t _scale_in0 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8, vl); + vfloat32m1_t _scale_in1 = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 8 + 4, vl); + vfloat32m1_t _scale_out0 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8, vl); + vfloat32m1_t _scale_out1 = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 8 + 4, vl); + vfloat32m1_t _bias0 = bias_data_size == 1 ? vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8, vl); + vfloat32m1_t _bias1 = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 8 + 4, vl); + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v0 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr0, vl), vl); + vfloat32m1_t _v1 = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr1, vl), vl); + _v0 = vfmacc_vv_f32m1(_bias0, _v0, _scale_in0, vl); + _v1 = vfmacc_vv_f32m1(_bias1, _v1, _scale_in1, vl); + _v0 = activation_ps(_v0, activation_type, activation_params, vl); + _v1 = activation_ps(_v1, activation_type, activation_params, vl); + _v0 = vfmul_vv_f32m1(_v0, _scale_out0, vl); + _v1 = vfmul_vv_f32m1(_v1, _scale_out1, vl); + *(int64_t*)ptr = float2int8(_v0, _v1); + + intptr0 += 4; + intptr1 += 4; + ptr += 8; + } + } + } + } + if (out_elempack == 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + // float32x4_t _scale_in = scale_in_data_size == 1 ? vdupq_n_f32(scale_in_data[0]) : vld1q_f32((const float*)scale_in_data + q * 4); + // float32x4_t _scale_out = scale_out_data_size == 1 ? vdupq_n_f32(scale_out_data[0]) : vld1q_f32((const float*)scale_out_data + q * 4); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmul_vv_f32m1(_v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr0 = top_blob.channel(q * 4); + signed char* ptr1 = top_blob.channel(q * 4 + 1); + signed char* ptr2 = top_blob.channel(q * 4 + 2); + signed char* ptr3 = top_blob.channel(q * 4 + 3); + + vfloat32m1_t _scale_in = scale_in_data_size == 1 ? vfmv_v_f_f32m1(scale_in_data[0], vl) : vle32_v_f32m1((const float*)scale_in_data + q * 4, vl); + vfloat32m1_t _scale_out = scale_out_data_size == 1 ? vfmv_v_f_f32m1(scale_out_data[0], vl) : vle32_v_f32m1((const float*)scale_out_data + q * 4, vl); + vfloat32m1_t _bias = bias_data_size == 1 ? 
vfmv_v_f_f32m1(bias_data[0], vl) : vle32_v_f32m1((const float*)bias_data + q * 4, vl); + + for (int i = 0; i < size; i++) + { + vfloat32m1_t _v = vfcvt_f_x_v_f32m1(vle32_v_i32m1(intptr, vl), vl); + _v = vfmacc_vv_f32m1(_bias, _v, _scale_in, vl); + _v = activation_ps(_v, activation_type, activation_params, vl); + _v = vfmul_vv_f32m1(_v, _scale_out, vl); + int res = float2int8(_v); + ptr0[0] = (res)&0xff; + ptr1[0] = (res >> 8) & 0xff; + ptr2[0] = (res >> 16) & 0xff; + ptr3[0] = (res >> 24) & 0xff; + + intptr += 4; + ptr0 += 1; + ptr1 += 1; + ptr2 += 1; + ptr3 += 1; + } + } + } + } + } + + return 0; + } +#endif // __riscv_vector + + if (dims == 1) + { + int w = bottom_blob.w; + + top_blob.create(w, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + const int* intptr = bottom_blob; + signed char* ptr = top_blob; + + if (scale_in_data_size == 1 && scale_out_data_size == 1) + { + const float scale_in = scale_in_data[0]; + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else if (scale_in_data_size == 1 && scale_out_data_size > 1) + { + const float scale_in = scale_in_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + else if (scale_in_data_size > 1 && scale_out_data_size == 1) + { + const float scale_out = scale_out_data[0]; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * 
scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else // if (scale_in_data_size > 1 && scale_out_data_size > 1) + { + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else if (bias_data_size == 1) + { + const float bias = bias_data[0]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < w; i++) + { + float v = intptr[i] * scale_in_data[i] + bias_data[i]; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out_data[i]); + } + } + } + } + + if (dims == 2) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + + top_blob.create(w, h, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[i]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[i]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[i]; + + for (int j = 0; j < w; j++) + { + float v = intptr[j] * scale_in + bias; + ptr[j] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + if (dims == 3) + { + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int size = w * h; + + top_blob.create(w, h, channels, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (bias_data_size == 0) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? scale_out_data[0] : scale_out_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const float scale_in = scale_in_data_size == 1 ? scale_in_data[0] : scale_in_data[q]; + const float scale_out = scale_out_data_size == 1 ? 
scale_out_data[0] : scale_out_data[q]; + const float bias = bias_data_size == 1 ? bias_data[0] : bias_data[q]; + + for (int i = 0; i < size; i++) + { + float v = intptr[i] * scale_in + bias; + ptr[i] = float2int8(activation_ss(v, activation_type, activation_params) * scale_out); + } + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/requantize_riscv.h b/src/layer/riscv/requantize_riscv.h new file mode 100644 index 000000000000..df12b54ce3a4 --- /dev/null +++ b/src/layer/riscv/requantize_riscv.h @@ -0,0 +1,34 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302. All rights reserved. +// Copyright (C) 2019 BUG1989. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_REQUANTIZE_RISCV_H +#define LAYER_REQUANTIZE_RISCV_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_riscv : public Requantize +{ +public: + Requantize_riscv(); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_RISCV_H diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index e2824646f871..1802a28336dd 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -19,6 +19,14 @@ #include #endif // __riscv_vector +static inline signed char float2int8(float v) +{ + int int32 = (int)roundf(v); + if (int32 > 127) return 127; + if (int32 < -127) return -127; + return (signed char)int32; +} + #if __riscv_vector static inline int csrr_vl() { @@ -50,6 +58,362 @@ static inline int csrr_vlenb() return a; } +#if __riscv_zfh +static inline int64_t float2int8(vfloat16m1_t _v) +{ + int vl = vsetvlmax_e16m1(); + vint16m2_t _v16 = vundefined_i16m2(); + _v16 = vset_v_i16m1_i16m2(_v16, 0, vfcvt_x_f_v_i16m1(_v, vl)); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; + // int16x8_t _v16 = vcvtaq_s16_f16(_v); + // int8x8_t _v8 = vqmovn_s16(_v16); + // return vmax_s8(_v8, vdup_n_s8(-127)); +} +#endif // __riscv_zfh + +static inline int64_t float2int8(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + // combine _vlow32 and _vhigh32 to a single vint32m2_t + vl = 2 * vsetvlmax_e32m1(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8(vfloat32m1_t _v) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + + vint32m4_t _v32 
= vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int32_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int64_t float2int8(vfloat32m2_t _v) +{ + int vl = vsetvlmax_e32m2(); + vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2); + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int64_t float2int8relu(vfloat32m2_t _v) +{ + int vl = vsetvlmax_e32m2(); + vint32m2_t _v32m2 = vfcvt_x_f_v_i32m2(_v, vl); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m2_i32m4(_v32, 0, _v32m2); + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int64_t float2int8leakyrelu(vfloat32m2_t _v, vfloat32m2_t _slope) +{ + int vl = vsetvlmax_e32m2(); + + vfloat32m2_t _v_leaky = vfmul_vv_f32m2(_v, _slope, vl); + vint32m2_t _v_int32 = vfcvt_x_f_v_i32m2(_v, vl); + + // vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); + // vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); + + // vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + // vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + // vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl); + // vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl); + vint32m2_t _v_int32_leaky = vfcvt_x_f_v_i32m2(_v_leaky, vl); + + vl = 2 * vsetvlmax_e32m1(); + + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32 = vset_v_i32m2_i32m4(_v32, 0, _v_int32); + _v32_leaky = vset_v_i32m2_i32m4(_v32_leaky, 0, _v_int32_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static void print_vint8m1(vint8m1_t _v, int vl = 16) +{ + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "i8[%d]: %d\n", i, i8[i]); + } + free(i8); +} + +static void print_vint32m1(vint32m1_t _v) +{ + int32_t* i32 = (int32_t*)malloc(4 * sizeof(int32_t)); + vse32_v_i32m1(i32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} + +static void print_vint32m2(vint32m2_t _v, int vl = 8) +{ + int32_t* i32 = (int32_t*)malloc(8 * sizeof(int32_t)); + vse32_v_i32m2(i32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "i32[%d]: %d\n", i, i32[i]); + } + free(i32); +} + +static void print_vfloat32m2(vfloat32m2_t _v, int vl = 8) +{ + float* f32 = (float*)malloc(8 * sizeof(float)); + vse32_v_f32m2(f32, _v, vl); + for (int i = 0; i < vl; i++) + { + fprintf(stderr, "f32[%d]: %f\n", i, f32[i]); + } + free(f32); +} + +static void print_vfloat32m1(vfloat32m1_t _v) +{ + float* f32 = (float*)malloc(4 * sizeof(float)); + vse32_v_f32m1(f32, _v, 4); + for (int i = 0; i < 4; i++) + { + fprintf(stderr, "f32[%d]: 
%f\n", i, f32[i]); + } + free(f32); +} + +static inline vint8m1_t float2int8(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, -127, vl); + int8_t* i8 = (int8_t*)malloc(16 * sizeof(int8_t)); + vse8_v_i8m1(i8, _v8, 16); + return _v8; +} + +static inline int64_t float2int8relu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh) +{ + int vl = vsetvlmax_e32m1(); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + vl = 2 * vsetvlmax_e32m1(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); + + vint16m2_t _v16_2 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16_2, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8relu(vfloat32m1_t _v) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + int32_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8relu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3) +{ + int vl = vsetvlmax_e32m1(); + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); + + vl = vsetvlmax_e32m4(); + + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + _v8 = vmax_vx_i8m1(_v8, 0, vl); + return _v8; +} + +static inline int64_t float2int8leakyrelu(vfloat32m1_t _vlow, vfloat32m1_t _vhigh, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _vlow_leaky = vfmul_vv_f32m1(_vlow, _slope, vl); + vfloat32m1_t _vhigh_leaky = vfmul_vv_f32m1(_vhigh, _slope, vl); + + vint32m1_t _vlow32 = vfcvt_x_f_v_i32m1(_vlow, vl); + vint32m1_t _vhigh32 = vfcvt_x_f_v_i32m1(_vhigh, vl); + + vint32m1_t _vlow32_leaky = vfcvt_x_f_v_i32m1(_vlow_leaky, vl); + vint32m1_t _vhigh32_leaky = vfcvt_x_f_v_i32m1(_vhigh_leaky, vl); + + vl = 2 * vsetvlmax_e32m1(); + + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _vlow32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _vhigh32); + + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _vlow32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _vhigh32_leaky); + + vint16m2_t _v16 = 
vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int64_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline int32_t float2int8leakyrelu(vfloat32m1_t _v, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v_leaky = vfmul_vv_f32m1(_v, _slope, vl); + + vint32m1_t _v32m1 = vfcvt_x_f_v_i32m1(_v, vl); + vint32m1_t _v32m1_leaky = vfcvt_x_f_v_i32m1(_v_leaky, vl); + + // vl = vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + vint32m4_t _v32_leaky = vundefined_i32m4(); + + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v32m1); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v32m1_leaky); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + int32_t _ret; + vse8_v_i8m1((int8_t*)&_ret, _v8, vl); + return _ret; +} + +static inline vint8m1_t float2int8leakyrelu(vfloat32m1_t _v0, vfloat32m1_t _v1, vfloat32m1_t _v2, vfloat32m1_t _v3, vfloat32m1_t _slope) +{ + int vl = vsetvlmax_e32m1(); + vfloat32m1_t _v0_leaky = vfmul_vv_f32m1(_v0, _slope, vl); + vfloat32m1_t _v1_leaky = vfmul_vv_f32m1(_v1, _slope, vl); + vfloat32m1_t _v2_leaky = vfmul_vv_f32m1(_v2, _slope, vl); + vfloat32m1_t _v3_leaky = vfmul_vv_f32m1(_v3, _slope, vl); + + vint32m1_t _v0_32 = vfcvt_x_f_v_i32m1(_v0, vl); + vint32m1_t _v1_32 = vfcvt_x_f_v_i32m1(_v1, vl); + vint32m1_t _v2_32 = vfcvt_x_f_v_i32m1(_v2, vl); + vint32m1_t _v3_32 = vfcvt_x_f_v_i32m1(_v3, vl); + + vint32m1_t _v0_32_leaky = vfcvt_x_f_v_i32m1(_v0_leaky, vl); + vint32m1_t _v1_32_leaky = vfcvt_x_f_v_i32m1(_v1_leaky, vl); + vint32m1_t _v2_32_leaky = vfcvt_x_f_v_i32m1(_v2_leaky, vl); + vint32m1_t _v3_32_leaky = vfcvt_x_f_v_i32m1(_v3_leaky, vl); + + vl = vsetvlmax_e32m4(); + vint32m4_t _v32 = vundefined_i32m4(); + _v32 = vset_v_i32m1_i32m4(_v32, 0, _v0_32); + _v32 = vset_v_i32m1_i32m4(_v32, 1, _v1_32); + _v32 = vset_v_i32m1_i32m4(_v32, 2, _v2_32); + _v32 = vset_v_i32m1_i32m4(_v32, 3, _v3_32); + + vint16m2_t _v16 = vnclip_wx_i16m2(_v32, 0, vl); + vint8m1_t _v8 = vnclip_wx_i8m1(_v16, 0, vl); + + vint32m4_t _v32_leaky = vundefined_i32m4(); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 0, _v0_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 1, _v1_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 2, _v2_32_leaky); + _v32_leaky = vset_v_i32m1_i32m4(_v32_leaky, 3, _v3_32_leaky); + + vint16m2_t _v16_leaky = vnclip_wx_i16m2(_v32_leaky, 0, vl); + vint8m1_t _v8_leaky = vnclip_wx_i8m1(_v16_leaky, 0, vl); + + _v8 = vmax_vv_i8m1(_v8, _v8_leaky, vl); + return _v8; +} + static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; diff --git a/src/mat.h b/src/mat.h index fdf5cc597c4e..b6849565fe13 100644 --- a/src/mat.h +++ b/src/mat.h @@ -1120,7 +1120,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v) NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v) { - const int packn = cpu_riscv_vlenb() / 1; + const int packn = cpu_riscv_vlenb() / 2; const size_t vl = vsetvl_e8m1(packn); int size = (int)total(); diff --git a/tests/testutil.cpp b/tests/testutil.cpp index f0bf3c51a20d..96a7c903103f 100644 --- a/tests/testutil.cpp +++ b/tests/testutil.cpp @@ -533,7 +533,7 @@ int test_layer_cpu(int typeindex, 
const ncnn::ParamDict& pd, const std::vector