diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h new file mode 100644 index 000000000000..36a36abca8b6 --- /dev/null +++ b/src/layer/riscv/convolution_packed_int8.h @@ -0,0 +1,988 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +// #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) +// #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 +// void convolution_transform_kernel_packed_int8_i8mm(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +// void convolution_packed_int8_i8mm(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +// #endif + +// #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD +// void convolution_transform_kernel_packed_int8_asimddp(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h); +// void convolution_packed_int8_asimddp(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt); +// #endif +// #endif + +static void convolution_transform_kernel_packed_int8(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/pa-outch/pb + + // clang-format off + // *INDENT-OFF* +#if __riscv_vector + if (outch >= 8) + { + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)64u, 64); + else + kernel_tm.create(maxk, inch, outch / 8 + (outch % 8) / 4 + (outch % 4) / 2 + outch % 2, (size_t)8u, 8); + } + else if (outch >= 4) + { + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)32u, 32); + else + kernel_tm.create(maxk, inch, outch / 4 + (outch % 4) / 2 + outch % 2, (size_t)4u, 4); + } + else +#endif // __riscv_vector + if (outch >= 2) + { +#if __riscv_vector + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch / 2 + outch % 2, (size_t)16u, 16); + else +#endif // __riscv_vector + kernel_tm.create(maxk, inch, outch / 2 + outch % 2, (size_t)2u, 2); + } + else + { +#if __riscv_vector + if (inch >= 8) + kernel_tm.create(maxk, inch / 8 + inch % 8, outch, (size_t)8u, 8); + else +#endif // __riscv_vector + kernel_tm.create(maxk, inch, outch, (size_t)1u, 1); + } + // *INDENT-ON* + // clang-format on + + int q = 0; +#if __riscv_vector + for (; q + 7 < outch; q += 8) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + const signed 
char* kptr2 = (const signed char*)kernel + (q + 2) * inch * maxk; + const signed char* kptr3 = (const signed char*)kernel + (q + 3) * inch * maxk; + const signed char* kptr4 = (const signed char*)kernel + (q + 4) * inch * maxk; + const signed char* kptr5 = (const signed char*)kernel + (q + 5) * inch * maxk; + const signed char* kptr6 = (const signed char*)kernel + (q + 6) * inch * maxk; + const signed char* kptr7 = (const signed char*)kernel + (q + 7) * inch * maxk; + + signed char* g00 = kernel_tm.channel(q / 8); + + int p = 0; + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + const signed char* k4 = kptr4 + k; + const signed char* k5 = kptr5 + k; + const signed char* k6 = kptr6 + k; + const signed char* k7 = kptr7 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00[2] = k1[0]; + g00[3] = k1[maxk]; + g00[4] = k2[0]; + g00[5] = k2[maxk]; + g00[6] = k3[0]; + g00[7] = k3[maxk]; + g00[8] = k4[0]; + g00[9] = k4[maxk]; + g00[10] = k5[0]; + g00[11] = k5[maxk]; + g00[12] = k6[0]; + g00[13] = k6[maxk]; + g00[14] = k7[0]; + g00[15] = k7[maxk]; + g00 += 16; + k0 += maxk * 2; + k1 += maxk * 2; + k2 += maxk * 2; + k3 += maxk * 2; + k4 += maxk * 2; + k5 += maxk * 2; + k6 += maxk * 2; + k7 += maxk * 2; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + kptr2 += maxk * 8; + kptr3 += maxk * 8; + kptr4 += maxk * 8; + kptr5 += maxk * 8; + kptr6 += maxk * 8; + kptr7 += maxk * 8; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + const signed char* k4 = kptr4 + k; + const signed char* k5 = kptr5 + k; + const signed char* k6 = kptr6 + k; + const signed char* k7 = kptr7 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = k2[0]; + g00[3] = k3[0]; + g00[4] = k4[0]; + g00[5] = k5[0]; + g00[6] = k6[0]; + g00[7] = k7[0]; + g00 += 8; + } + + kptr0 += maxk; + kptr1 += maxk; + kptr2 += maxk; + kptr3 += maxk; + kptr4 += maxk; + kptr5 += maxk; + kptr6 += maxk; + kptr7 += maxk; + } + } + for (; q + 3 < outch; q += 4) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + const signed char* kptr2 = (const signed char*)kernel + (q + 2) * inch * maxk; + const signed char* kptr3 = (const signed char*)kernel + (q + 3) * inch * maxk; + + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4); + + int p = 0; + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + g00[1] = k0[maxk]; + g00[2] = k1[0]; + g00[3] = k1[maxk]; + g00[4] = k2[0]; + g00[5] = k2[maxk]; + g00[6] = k3[0]; + g00[7] = k3[maxk]; + g00 += 8; + k0 += maxk * 2; + k1 += maxk * 2; + k2 += maxk * 2; + k3 += maxk * 2; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + kptr2 += maxk * 8; + kptr3 += maxk * 8; + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + const signed char* k2 = kptr2 + k; + const signed char* k3 = kptr3 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00[2] = 
k2[0]; + g00[3] = k3[0]; + g00 += 4; + } + + kptr0 += maxk; + kptr1 += maxk; + kptr2 += maxk; + kptr3 += maxk; + } + } +#endif // __riscv_vector + for (; q + 1 < outch; q += 2) + { + const signed char* kptr0 = (const signed char*)kernel + q * inch * maxk; + const signed char* kptr1 = (const signed char*)kernel + (q + 1) * inch * maxk; + +#if __riscv_vector + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2); +#else + signed char* g00 = kernel_tm.channel(q / 2); +#endif + + int p = 0; +#if __riscv_vector + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + for (int i = 0; i < 4; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + for (int i = 0; i < 4; i++) + { + g00[0] = k1[0]; + k1 += maxk; + g00 += 1; + } + + for (int i = 4; i < 8; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + for (int i = 4; i < 8; i++) + { + g00[0] = k1[0]; + k1 += maxk; + g00 += 1; + } + } + + kptr0 += maxk * 8; + kptr1 += maxk * 8; + } +#endif // __riscv_vector + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr0 + k; + const signed char* k1 = kptr1 + k; + + g00[0] = k0[0]; + g00[1] = k1[0]; + g00 += 2; + } + + kptr0 += maxk; + kptr1 += maxk; + } + } + for (; q < outch; q++) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; + +#if __riscv_vector + signed char* g00 = kernel_tm.channel(q / 8 + (q % 8) / 4 + (q % 4) / 2 + q % 2); +#else + signed char* g00 = kernel_tm.channel(q / 2 + q % 2); +#endif + + int p = 0; +#if __riscv_vector + for (; p + 7 < inch; p += 8) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + for (int i = 0; i < 8; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + } + + kptr += maxk * 8; + } +#endif // __riscv_vector + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + g00[0] = k0[0]; + g00++; + } + + kptr += maxk; + } + } +} + +static void convolution_packed_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + // #if !(__ARM_FEATURE_MATMUL_INT8 || __ARM_FEATURE_DOTPROD) + // #if NCNN_RUNTIME_CPU && NCNN_ARM84I8MM && __aarch64__ && !__ARM_FEATURE_MATMUL_INT8 + // if (ncnn::cpu_support_arm_i8mm()) + // { + // convolution_packed_int8_i8mm(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + // return; + // } + // #endif + + // #if NCNN_RUNTIME_CPU && NCNN_ARM82DOT && __aarch64__ && !__ARM_FEATURE_DOTPROD + // if (ncnn::cpu_support_arm_asimddp()) + // { + // convolution_packed_int8_asimddp(bottom_blob, top_blob, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + // return; + // } + // #endif + // #endif + int vl; + + const int w = bottom_blob.w; + const int elempack = bottom_blob.elempack; + const int inch = bottom_blob.c * elempack; + + const int N = bottom_blob.cstep * elempack; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int outch = top_blob.c * out_elempack; + + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + 
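+            // record the flat input offset of every kernel tap (scaled by elempack) so the conv loops below can reach dilated taps with a single add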
for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2 * elempack; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + int nn_outch = 0; + int remain_outch_start = 0; +#if __riscv_vector + nn_outch = (outch - remain_outch_start) / 8; + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 8; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + const int M = top_blob.cstep * out_elempack; + + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + // int32x4_t _sum2 = vdupq_n_s32(0); + // int32x4_t _sum3 = vdupq_n_s32(0); + + vl = 8; + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // vint32m2_t _sum23 = vmv_v_x_i32m2(0, vl); + + const signed char* kptr = weight_data_tm.channel(p / 8); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + // int8x16_t _w2 = vld1q_s8(kptr + 32); + // int8x16_t _w3 = vld1q_s8(kptr + 48); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vint8m1_t _w1 = vle8_v_i8m1(kptr + 16, vl); + vint8m1_t _w2 = vle8_v_i8m1(kptr + 32, vl); + vint8m1_t _w3 = vle8_v_i8m1(kptr + 48, vl); + + vl = 8; + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + vint16m1_t _rr0 = vreinterpret_v_i8m1_i16m1(_r0); + + vint8m1_t _r0ll = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 0, vl)); + vint8m1_t _r0lh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 1, vl)); + vint8m1_t _r0hl = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 2, vl)); + vint8m1_t _r0hh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 3, vl)); + + // uint8_t mask[8] = {8, 9, 10, 11, 12, 13, 14, 15}; + // vuint8m1_t _index = vle8_v_u8m1(mask, vl); + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + vint16m2_t _s0l_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); + vint16m2_t _s1l_m2 = vwmul_vv_i16m2(_r0ll, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + vint16m2_t _s0h_m2 = vwmul_vv_i16m2(_r0lh, _w1, vl); + vint16m2_t _s1h_m2 = vwmul_vv_i16m2(_r0lh, vslidedown_vx_i8m1(_w1, _w1, 8, vl), vl); + + // int16x8_t _s0l = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _s1l = vmull_s8(_r0ll, vget_high_s8(_w0)); + // int16x8_t _s0h = vmull_s8(_r0lh, vget_low_s8(_w1)); + // int16x8_t _s1h = vmull_s8(_r0lh, vget_high_s8(_w1)); + + // vint16m1_t _s0l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl), 0); + // vint16m1_t _s1l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1l_m2, _r0hl, 
vrgather_vv_i8m1(_w2, _index, vl), vl), 0); + // vint16m1_t _s2l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl), 0); + // vint16m1_t _s3l = vget_v_i16m2_i16m2(vwmacc_vv_i16m2(_s1h_m2, _r0hh, vrgather_vv_i8m1(_w3, _index, vl), vl), 0); + + _s0l_m2 = vwmacc_vv_i16m2(_s0l_m2, _r0hl, _w2, vl); + _s1l_m2 = vwmacc_vv_i16m2(_s1l_m2, _r0hl, vslidedown_vx_i8m1(_w2, _w2, 8, vl), vl); + _s0h_m2 = vwmacc_vv_i16m2(_s0h_m2, _r0hh, _w3, vl); + _s1h_m2 = vwmacc_vv_i16m2(_s1h_m2, _r0hh, vslidedown_vx_i8m1(_w3, _w3, 8, vl), vl); + + // _s0l = vmlal_s8(_s0l, _r0hl, vget_low_s8(_w2)); + // _s1l = vmlal_s8(_s1l, _r0hl, vget_high_s8(_w2)); + // _s0h = vmlal_s8(_s0h, _r0hh, vget_low_s8(_w3)); + // _s1h = vmlal_s8(_s1h, _r0hh, vget_high_s8(_w3)); + + vint16m2_t _s01l = vset_v_i16m1_i16m2(_s0l_m2, 1, vget_v_i16m2_i16m1(_s1l_m2, 0)); + vint16m2_t _s01h = vset_v_i16m1_i16m2(_s0h_m2, 1, vget_v_i16m2_i16m1(_s1h_m2, 0)); + uint16_t odd_index[8] = {1, 3, 5, 7, 9, 11, 13, 15}; + uint16_t even_index[8] = {0, 2, 4, 6, 8, 10, 12, 14}; + vuint16m2_t _odd_index = vle16_v_u16m2(odd_index, vl); + vuint16m2_t _even_index = vle16_v_u16m2(even_index, vl); + + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01l, _odd_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01l, _even_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01h, _odd_index, vl), 0), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vget_v_i16m2_i16m1(vrgather_vv_i16m2(_s01h, _even_index, vl), 0), vl); + + // _sum0 = vpadalq_s16(_sum0, _s0l); + // _sum1 = vpadalq_s16(_sum1, _s1l); + // _sum2 = vpadalq_s16(_sum2, _s0h); + // _sum3 = vpadalq_s16(_sum3, _s1h); + + kptr += 64; + } + } + + { + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + // _sum01 = vadd_vv_i32m2(_sum01, _sum23, vl); + } + } + for (; q < inch; q++) + { + vl = 8; + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + vint8m1_t _val = vmv_v_x_i8m1(r0s[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _val = vdup_n_s8(r0s[0]); + // int8x8_t _w = vld1_s8(kptr); + // int16x8_t _s0 = vmull_s8(_val, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + + kptr += 8; + } + } + } + vl = 8; + + if (out_elempack == 8) + { + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + 4, _sum1); + vse32_v_i32m2(outptr, _sum01, vl); + outptr += 8; + } + if (out_elempack == 4) + { + // vst1q_s32(outptr, _sum0); + // vst1q_s32(outptr + M, _sum1); + vl = 4; + vse32_v_i32m1(outptr, vget_v_i32m2_i32m1(_sum01, 0), vl); + vse32_v_i32m1(outptr + M, vget_v_i32m2_i32m1(_sum01, 1), vl); + outptr += 4; + } + if (out_elempack == 1) + { + vsse32_v_i32m2(outptr, M * sizeof(int), _sum01, vl); + // outptr[0] = vgetq_lane_s32(_sum0, 0); + // outptr[M] = vgetq_lane_s32(_sum0, 1); + // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + // outptr[M * 4] = vgetq_lane_s32(_sum1, 0); + // outptr[M * 5] = vgetq_lane_s32(_sum1, 1); + // outptr[M * 6] = vgetq_lane_s32(_sum1, 2); + // outptr[M * 7] = vgetq_lane_s32(_sum1, 3); + outptr += 1; + } + } + } + remain_outch_start += nn_outch * 8; + nn_outch = (outch - remain_outch_start) / 4; + 
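+    // leftover output channels are handled 4, then 2, then 1 at a time below, reusing the same widening multiply-accumulate pattern as the 8-channel block above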
for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 4; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + const int M = top_blob.cstep * out_elempack; + + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + vl = 4; + + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4); + + int q = 0; + { + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + // int8x16_t _w1 = vld1q_s8(kptr + 16); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vint8m1_t _w1 = vle8_v_i8m1(kptr + 16, vl); + vl = 8; + + // int16x4_t _rr0 = vreinterpret_s16_s8(_r0); + vint16m1_t _rr0 = vreinterpret_v_i8m1_i16m1(_r0); + + vint8m1_t _r0ll = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 0, vl)); + vint8m1_t _r0lh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 1, vl)); + vint8m1_t _r0hl = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 2, vl)); + vint8m1_t _r0hh = vreinterpret_v_i16m1_i8m1(vrgather_vx_i16m1(_rr0, 3, vl)); + + vint16m2_t _sl_m2 = vwmul_vv_i16m2(_r0ll, _w0, vl); + vint16m2_t _sh_m2 = vwmul_vv_i16m2(_r0lh, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + _sl_m2 = vwmacc_vv_i16m2(_sl_m2, _r0hl, _w1, vl); + _sh_m2 = vwmacc_vv_i16m2(_sh_m2, _r0hh, vslidedown_vx_i8m1(_w1, _w1, 8, vl), vl); + + vint16m1_t _sl = vget_v_i16m2_i16m1(_sl_m2, 0); + vint16m1_t _sh = vget_v_i16m2_i16m1(_sh_m2, 0); + + // int8x8_t _r0ll = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 0)); + // int8x8_t _r0lh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 1)); + // int8x8_t _r0hl = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 2)); + // int8x8_t _r0hh = vreinterpret_s8_s16(vdup_lane_s16(_rr0, 3)); + + // int16x8_t _sl = vmull_s8(_r0ll, vget_low_s8(_w0)); + // int16x8_t _sh = vmull_s8(_r0lh, vget_high_s8(_w0)); + // _sl = vmlal_s8(_sl, _r0hl, vget_low_s8(_w1)); + // _sh = vmlal_s8(_sh, _r0hh, vget_high_s8(_w1)); + vl = 4; + + uint16_t odd_index[4] = {1, 3, 5, 7}; + uint16_t even_index[4] = {0, 2, 4, 6}; + vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); + vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); + + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sl, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sl, _even_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sh, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_sh, _even_index, vl), vl); + + // _sum0 = vpadalq_s16(_sum0, _sl); + // _sum1 = vpadalq_s16(_sum1, _sh); + + kptr += 32; + } + } + // { + // _sum0 = vaddq_s32(_sum0, _sum1); + // } + } + for (; q < inch; q++) + { + vl = 4; + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * 
stride_w; + + for (int k = 0; k < maxk; k++) + { + vl = 4; + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + vint8m1_t _val = vmv_v_x_i8m1(r0s[0], vl); + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _val = vdup_n_s8(r0s[0]); + // int8x8_t _w = vld1_s8(kptr); + // int16x8_t _s0 = vmull_s8(_val, _w); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + + kptr += 4; + } + } + } + vl = 4; + + if (out_elempack == 4) + { + // vst1q_s32(outptr, _sum0); + vse32_v_i32m2(outptr, _sum01, vl); + outptr += 4; + } + if (out_elempack == 1) + { + vsse32_v_i32m2(outptr, M * sizeof(int), _sum01, vl); + // outptr[0] = vgetq_lane_s32(_sum0, 0); + // outptr[M] = vgetq_lane_s32(_sum0, 1); + // outptr[M * 2] = vgetq_lane_s32(_sum0, 2); + // outptr[M * 3] = vgetq_lane_s32(_sum0, 3); + outptr += 1; + } + } + } + remain_outch_start += nn_outch * 4; + nn_outch = (outch - remain_outch_start) / 2; +#else // __riscv_vector + nn_outch = (outch - remain_outch_start) / 2; + #pragma omp parallel for num_threads(opt.num_threads) +#endif // __riscv_vector + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * 2; + + // shadowed variable for less openmp task args + const int outw = top_blob.w; + const int outh = top_blob.h; + const int N = bottom_blob.cstep * elempack; + + int* outptr0 = top_blob.channel(p); + int* outptr1 = top_blob.channel(p + 1); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum0 = 0; + int sum1 = 0; + +#if __riscv_vector + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2); +#endif + + int q = 0; +#if __riscv_vector + { + // int32x4_t _sum01 = vdupq_n_s32(0); + vl = 4; + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + for (; q + 7 < inch; q += 8) + { + vl = 8; + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + // int8x8_t _r0; + vint8m1_t _r0; + if (elempack == 8) + { + _r0 = vle8_v_i8m1(r0s, vl); + // _r0 = vld1_s8(r0s); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } + + // int8x16_t _w0 = vld1q_s8(kptr); + vl = 16; + vint8m1_t _w0 = vle8_v_i8m1(kptr, vl); + vl = 8; + // fprintf(stderr, "r0: \n"); + // print_vint8m1(_r0, 8); + vint8m1_t _r0l = vslideup_vx_i8m1(_r0, _r0, 4, vl); + vint8m1_t _r0h = vslidedown_vx_i8m1(_r0, _r0, 4, vl); + _r0h = vslideup_vx_i8m1(_r0h, _r0h, 4, vl); + + // vint32m1_t _r0_i16 = vreinterpret_v_i32m1_i8m1(_r0); + + // int32x2x2_t _rr0 = vzip_s32(vreinterpret_s32_s8(_r0), vreinterpret_s32_s8(_r0)); + // int8x8_t _r0l = vreinterpret_s8_s32(_rr0.val[0]); + // int8x8_t _r0h = vreinterpret_s8_s32(_rr0.val[1]); + + vint16m2_t _s01_m2 = vwmul_vv_i16m2(_r0l, _w0, vl); + _s01_m2 = vwmacc_vv_i16m2(_s01_m2, _r0h, vslidedown_vx_i8m1(_w0, _w0, 8, vl), vl); + vint16m1_t _s01 = vget_v_i16m2_i16m1(_s01_m2, 0); + + vl = 4; + uint16_t odd_index[4] = {1, 3, 5, 7}; + uint16_t even_index[4] = {0, 2, 4, 6}; + vuint16m1_t _odd_index = vle16_v_u16m1(odd_index, vl); + vuint16m1_t _even_index = vle16_v_u16m1(even_index, vl); + 
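+                        // gather the odd and even int16 lanes separately and widen-add both halves into the int32 accumulator;
+                        // together this reproduces the pairwise add-accumulate (vpadalq_s16) of the commented NEON reference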
_sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _odd_index, vl), vl); + _sum01 = vwadd_wv_i32m2(_sum01, vrgather_vv_i16m1(_s01, _even_index, vl), vl); + + // int16x8_t _s01 = vmull_s8(_r0l, vget_low_s8(_w0)); + // _s01 = vmlal_s8(_s01, _r0h, vget_high_s8(_w0)); + // _sum01 = vpadalq_s16(_sum01, _s01); + + kptr += 16; + } + } + int res[4] = {0, 0, 0, 0}; + vl = 4; + vse32_v_i32m2(res, _sum01, vl); + sum0 += (res[0] + res[1]); + sum1 += (res[2] + res[3]); + // int32x2_t _s0 = vpadd_s32(vget_low_s32(_sum01), vget_high_s32(_sum01)); + // sum0 += vget_lane_s32(_s0, 0); + // sum1 += vget_lane_s32(_s0, 1); + } +#endif // __riscv_vector + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum0 += r0s[0] * kptr[0]; + sum1 += r0s[0] * kptr[1]; + + kptr += 2; + } + } + } + + outptr0[0] = sum0; + outptr1[0] = sum1; + outptr0 += 1; + outptr1 += 1; + } + } + remain_outch_start += nn_outch * 2; + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + int ij = 0; + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum = 0; + +#if __riscv_vector + const signed char* kptr = weight_data_tm.channel(p / 8 + (p % 8) / 4 + (p % 4) / 2 + p % 2); +#else + const signed char* kptr = weight_data_tm.channel(p / 2 + p % 2); +#endif + + int q = 0; +#if __riscv_vector + { + vl = 8; + vint32m2_t _sum01 = vmv_v_x_i32m2(0, vl); + // int32x4_t _sum0 = vdupq_n_s32(0); + // int32x4_t _sum1 = vdupq_n_s32(0); + for (; q + 7 < inch; q += 8) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + vl = 8; + const signed char* r0s = r0 + space_ofs[k]; + + vint8m1_t _r0; + // int8x8_t _r0; + if (elempack == 8) + { + // _r0 = vld1_s8(r0s); + _r0 = vle8_v_i8m1(r0s, vl); + } + else // if (elempack == 1) + { + _r0 = vlse8_v_i8m1(r0s, N * sizeof(signed char), vl); + // signed char tmp[8] = {r0s[0], r0s[N], r0s[N * 2], r0s[N * 3], r0s[N * 4], r0s[N * 5], r0s[N * 6], r0s[N * 7]}; + // _r0 = vld1_s8(tmp); + } + + vint8m1_t _w = vle8_v_i8m1(kptr, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_r0, _w, vl), 0); + _sum01 = vwadd_wv_i32m2(_sum01, _s0, vl); + // int8x8_t _w = vld1_s8(kptr); + + // int16x8_t _s0 = vmull_s8(_r0, _w); + + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s0)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s0)); + + kptr += 8; + } + } + // int32x4_t _sum = vaddq_s32(_sum0, _sum1); + // #if __aarch64__ + vl = 8; + vint32m1_t _scalar_sum = vmv_s_x_i32m1(vint32m1_t(), sum, vl); + sum = vmv_x_s_i32m1_i32(vredsum_vs_i32m2_i32m1(_scalar_sum, _sum01, _scalar_sum, vl)); + // int res[8] = {0, 0, 0, 0}; + // vl = 4; + // vse32_v_i32m2(res, _sum01, vl); + // sum += (res[0] + res[1] + res[2] + res[3]); + // sum += vaddvq_s32(_sum); + // #else + // int32x2_t _ss = vadd_s32(vget_low_s32(_sum), vget_high_s32(_sum)); + // _ss = vpadd_s32(_ss, _ss); + // sum += vget_lane_s32(_ss, 0); + // #endif + } +#endif // __riscv_vector + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + + // if (elempack == 1) + { + sum += r0s[0] * kptr[0]; + + kptr += 1; + } + } + } + + outptr[0] = sum; + outptr += 1; + } + } +} diff --git 
a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index be413e5be252..6aaea2b90fdf 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -35,6 +35,10 @@ namespace ncnn { #include "convolution_1x1.h" #include "convolution_3x3.h" +#if NCNN_INT8 +#include "convolution_packed_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolution_packn.h" #include "convolution_pack1ton.h" @@ -134,7 +138,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -259,27 +263,28 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8(bottom_blob, top_blob, opt); + // Mat bottom_blob_unpacked = bottom_blob; + // if (bottom_blob.elempack != 1) + // { + // Option opt_pack1 = opt; + // opt_pack1.blob_allocator = opt.workspace_allocator; + + // convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); + // } + + // Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; + // if (bottom_blob_unpacked.elembits() == 16) + // { + // Option opt_pack1 = opt; + // opt_pack1.blob_allocator = opt.workspace_allocator; + + // cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); + // } + + // Option opt_unpacked = opt; + // opt_unpacked.use_packing_layout = false; + // return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); } #endif @@ -1102,4 +1107,179 @@ int Convolution_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, con } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int Convolution_riscv::create_pipeline_int8(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; + // #if NCNN_ARM82DOT + // if (ncnn::cpu_support_arm_asimddp()) + // { + // prefer_winograd = false; + // } + // #endif + +#if 0 + if (opt.use_winograd_convolution && prefer_winograd) + { + if (opt.use_winograd43_convolution) + conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); + else + conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_transform_kernel_int8(weight_data, weight_sgemm_data, num_input, num_output, kernel_w, kernel_h, opt); + } + else +#endif + { + 
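+        // generic path: repack the weights into the pb-pa-kw-kh-inch/pa-outch/pb layout consumed by convolution_packed_int8 (the winograd and im2col-gemm branches above are disabled for now)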
convolution_transform_kernel_packed_int8(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + } + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + weight_data.release(); + + return 0; +} + +int Convolution_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + } + + // NCNN_LOGE("Convolution_arm input %d x %d ksize=%d %d stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h); + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + if (use_int8_requantize) + out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; +#if __riscv_vector && __riscv_zfh + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } +#endif // __riscv_vector && __riscv_zfh + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + // NCNN_LOGE("forward_int8_arm %d %d %d %d %d", w, h, bottom_blob_bordered.c, elempack, out_elempack); + + int channels = bottom_blob_bordered.c; + const int num_input = channels * elempack; + + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input >= 8 && num_output >= 8) && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1; + // #if NCNN_ARM82DOT + // if (ncnn::cpu_support_arm_asimddp()) + // { + // prefer_winograd = false; + // } + // #endif + + int out_elempack_int32 = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack_int32 = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + // int _nT = nT ? 
nT : opt.num_threads; + // if (nT != 0 && opt.num_threads != nT) + // { + // // force num_threads the same as in create_pipeline + // // so we could use pre-packed A/B from the same tile config + // NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); + // } +#if 0 + if (opt.use_winograd_convolution && prefer_winograd) + { + if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) + conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); + else + conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); + } + else if (opt.use_sgemm_convolution) + { + convolution_im2col_gemm_int8(bottom_blob_bordered, top_blob_int32, weight_sgemm_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, _nT, opt); + } + else +#endif + { + convolution_packed_int8(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + } + + bottom_blob_bordered.release(); + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index a4e008c9dd1d..8c5ee015de1a 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -38,6 +38,11 @@ class Convolution_riscv : public Convolution int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* activation; @@ -48,6 +53,10 @@ class Convolution_riscv : public Convolution // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h new file mode 100644 index 000000000000..b86932b66b28 --- /dev/null +++ b/src/layer/riscv/convolutiondepthwise_3x3_pack8_int8.h @@ -0,0 +1,283 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2024 Xinyu302 Limited. All rights reserved. +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
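Not part of the patch: a minimal scalar sketch of the int8 scale bookkeeping that create_pipeline_int8 / forward_int8 above (and the depthwise path further below) rely on. The helper names here are invented for illustration, and the clamping/rounding only approximates ncnn's float2int8.

```cpp
// Illustrative sketch only -- not ncnn API. The patch performs the same
// bookkeeping per output channel via scale_in_data, dequantize_from_int32
// and requantize_from_int32_to_int8.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// int32 accumulator -> float, using scale_in = 1 / (bottom_scale * weight_scale)
static float dequantize_acc(int32_t sum, float bottom_scale, float weight_scale, float bias)
{
    const float scale_in = (weight_scale == 0.f) ? 0.f : 1.f / (bottom_scale * weight_scale);
    return sum * scale_in + bias;
}

// optional requantize back to int8 (selected when int8_scale_term > 100)
static int8_t requantize_acc(int32_t sum, float bottom_scale, float weight_scale, float bias, float top_scale)
{
    const float f = dequantize_acc(sum, bottom_scale, weight_scale, bias);
    const float q = std::round(f * top_scale);
    return (int8_t)std::min(127.f, std::max(-127.f, q));
}

int main()
{
    // example numbers, chosen arbitrarily
    std::printf("%f\n", dequantize_acc(1234, 64.f, 32.f, 0.5f));
    std::printf("%d\n", (int)requantize_acc(1234, 64.f, 32.f, 0.5f, 48.f));
    return 0;
}
```

In the patch the activation is folded into requantize_from_int32_to_int8 on the requantize path, while the dequantize path applies it afterwards with activation->forward_inplace.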
+ +static void convdw3x3s1_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + fprintf(stderr, "convdw3x3s1_pack8_int8_rvv\n"); + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + + int vl = csrr_vlenb() / 2; + + const int group = bottom_blob.c; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const signed char* k0 = kernel.row(g); + + int* outptr0 = out.row(0); + int* outptr1 = out.row(1); + + const Mat img0 = bottom_blob.channel(g); + + const signed char* r0 = img0.row(0); + const signed char* r1 = img0.row(1); + const signed char* r2 = img0.row(2); + const signed char* r3 = img0.row(3); + + vl = 8; + vint8m1_t _k00 = vle8_v_i8m1(k0, vl); + vint8m1_t _k01 = vle8_v_i8m1(k0 + 8, vl); + vint8m1_t _k02 = vle8_v_i8m1(k0 + 16, vl); + vint8m1_t _k10 = vle8_v_i8m1(k0 + 24, vl); + vint8m1_t _k11 = vle8_v_i8m1(k0 + 32, vl); + vint8m1_t _k12 = vle8_v_i8m1(k0 + 40, vl); + vint8m1_t _k20 = vle8_v_i8m1(k0 + 48, vl); + vint8m1_t _k21 = vle8_v_i8m1(k0 + 56, vl); + vint8m1_t _k22 = vle8_v_i8m1(k0 + 64, vl); + + // int8x8_t _k00 = vld1_s8(k0); + // int8x8_t _k01 = vld1_s8(k0 + 8); + // int8x8_t _k02 = vld1_s8(k0 + 16); + // int8x8_t _k10 = vld1_s8(k0 + 24); + // int8x8_t _k11 = vld1_s8(k0 + 32); + // int8x8_t _k12 = vld1_s8(k0 + 40); + // int8x8_t _k20 = vld1_s8(k0 + 48); + // int8x8_t _k21 = vld1_s8(k0 + 56); + // int8x8_t _k22 = vld1_s8(k0 + 64); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _r00 = vle8_v_i8m1(r0, vl); + vint8m1_t _r01 = vle8_v_i8m1(r0 + 8, vl); + vint8m1_t _r02 = vle8_v_i8m1(r0 + 16, vl); + vint8m1_t _r10 = vle8_v_i8m1(r1, vl); + vint8m1_t _r11 = vle8_v_i8m1(r1 + 8, vl); + vint8m1_t _r12 = vle8_v_i8m1(r1 + 16, vl); + vint8m1_t _r20 = vle8_v_i8m1(r2, vl); + vint8m1_t _r21 = vle8_v_i8m1(r2 + 8, vl); + vint8m1_t _r22 = vle8_v_i8m1(r2 + 16, vl); + + // int8x8_t _r00 = vld1_s8(r0); + // int8x8_t _r01 = vld1_s8(r0 + 8); + // int8x8_t _r02 = vld1_s8(r0 + 16); + // int8x8_t _r10 = vld1_s8(r1); + // int8x8_t _r11 = vld1_s8(r1 + 8); + // int8x8_t _r12 = vld1_s8(r1 + 16); + // int8x8_t _r20 = vld1_s8(r2); + // int8x8_t _r21 = vld1_s8(r2 + 8); + // int8x8_t _r22 = vld1_s8(r2 + 16); + + vint16m2_t _s0 = vwmul_vv_i16m2(_r00, _k00, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_r01, _k01, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_r02, _k02, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_r10, _k10, vl); + + // int16x8_t _s0 = vmull_s8(_r00, _k00); + // int16x8_t _s1 = vmull_s8(_r01, _k01); + // int16x8_t _s2 = vmull_s8(_r02, _k02); + // int16x8_t _s3 = vmull_s8(_r10, _k10); + + _s0 = vwmacc_vv_i16m2(_s0, _r11, _k11, vl); + _s1 = vwmacc_vv_i16m2(_s1, _r12, _k12, vl); + _s2 = vwmacc_vv_i16m2(_s2, _r20, _k20, vl); + _s3 = vwmacc_vv_i16m2(_s3, _r21, _k21, vl); + + // _s0 = vmlal_s8(_s0, _r11, _k11); + // _s1 = vmlal_s8(_s1, _r12, _k12); + // _s2 = vmlal_s8(_s2, _r20, _k20); + // _s3 = vmlal_s8(_s3, _r21, _k21); + + vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); + // int16x8_t _s4 = vmull_s8(_r22, _k22); + + vint16m1_t _s0_m1 = vget_v_i16m2_i16m1(_s0, 0); + vint16m1_t _s1_m1 = vget_v_i16m2_i16m1(_s1, 0); + vint16m1_t _s2_m1 = vget_v_i16m2_i16m1(_s2, 0); + vint16m1_t _s3_m1 = vget_v_i16m2_i16m1(_s3, 0); + vint16m1_t _s4_m1 = vget_v_i16m2_i16m1(_s4, 0); + + vint32m2_t _sum = vwadd_vv_i32m2(_s0_m1, _s1_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s2_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s3_m1, vl); + _sum = 
vwadd_wv_i32m2(_sum, _s4_m1, vl); + + // int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1)); + // int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1)); + // int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3)); + // int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3)); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4)); + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + + vse32_v_i32m2(outptr0, _sum, vl); + // vst1q_s32(outptr0, _sum0); + // vst1q_s32(outptr0 + 4, _sum1); + r0 += 8; + r1 += 8; + r2 += 8; + outptr0 += 8; + } + + r0 += 2 * 8; + r1 += 2 * 8; + r2 += 2 * 8; + } + } +} + +static void convdw3x3s2_pack8_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +{ + fprintf(stderr, "convdw3x3s2_pack8_int8_rvv\n"); + int w = bottom_blob.w; + + int outw = top_blob.w; + int outh = top_blob.h; + int vl = csrr_vlenb() / 2; + + const int group = bottom_blob.c; + + const int tailstep = (w - 2 * outw + w) * 8; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + Mat out = top_blob.channel(g); + + const signed char* k0 = kernel.row(g); + + int* outptr0 = out; + + const Mat img0 = bottom_blob.channel(g); + + const signed char* r0 = img0.row(0); + const signed char* r1 = img0.row(1); + const signed char* r2 = img0.row(2); + + vint8m1_t _k00 = vle8_v_i8m1(k0, vl); + vint8m1_t _k01 = vle8_v_i8m1(k0 + 8, vl); + vint8m1_t _k02 = vle8_v_i8m1(k0 + 16, vl); + vint8m1_t _k10 = vle8_v_i8m1(k0 + 24, vl); + vint8m1_t _k11 = vle8_v_i8m1(k0 + 32, vl); + vint8m1_t _k12 = vle8_v_i8m1(k0 + 40, vl); + vint8m1_t _k20 = vle8_v_i8m1(k0 + 48, vl); + vint8m1_t _k21 = vle8_v_i8m1(k0 + 56, vl); + vint8m1_t _k22 = vle8_v_i8m1(k0 + 64, vl); + + // int8x8_t _k00 = vld1_s8(k0); + // int8x8_t _k01 = vld1_s8(k0 + 8); + // int8x8_t _k02 = vld1_s8(k0 + 16); + // int8x8_t _k10 = vld1_s8(k0 + 24); + // int8x8_t _k11 = vld1_s8(k0 + 32); + // int8x8_t _k12 = vld1_s8(k0 + 40); + // int8x8_t _k20 = vld1_s8(k0 + 48); + // int8x8_t _k21 = vld1_s8(k0 + 56); + // int8x8_t _k22 = vld1_s8(k0 + 64); + + int i = 0; + for (; i < outh; i++) + { + int j = 0; + for (; j < outw; j++) + { + vint8m1_t _r00 = vle8_v_i8m1(r0, vl); + vint8m1_t _r01 = vle8_v_i8m1(r0 + 8, vl); + vint8m1_t _r02 = vle8_v_i8m1(r0 + 16, vl); + vint8m1_t _r10 = vle8_v_i8m1(r1, vl); + vint8m1_t _r11 = vle8_v_i8m1(r1 + 8, vl); + vint8m1_t _r12 = vle8_v_i8m1(r1 + 16, vl); + vint8m1_t _r20 = vle8_v_i8m1(r2, vl); + vint8m1_t _r21 = vle8_v_i8m1(r2 + 8, vl); + vint8m1_t _r22 = vle8_v_i8m1(r2 + 16, vl); + + // int8x8_t _r00 = vld1_s8(r0); + // int8x8_t _r01 = vld1_s8(r0 + 8); + // int8x8_t _r02 = vld1_s8(r0 + 16); + // int8x8_t _r10 = vld1_s8(r1); + // int8x8_t _r11 = vld1_s8(r1 + 8); + // int8x8_t _r12 = vld1_s8(r1 + 16); + // int8x8_t _r20 = vld1_s8(r2); + // int8x8_t _r21 = vld1_s8(r2 + 8); + // int8x8_t _r22 = vld1_s8(r2 + 16); + + vint16m2_t _s0 = vwmul_vv_i16m2(_r00, _k00, vl); + vint16m2_t _s1 = vwmul_vv_i16m2(_r01, _k01, vl); + vint16m2_t _s2 = vwmul_vv_i16m2(_r02, _k02, vl); + vint16m2_t _s3 = vwmul_vv_i16m2(_r10, _k10, vl); + + _s0 = vwmacc_vv_i16m2(_s0, _r11, _k11, vl); + _s1 = vwmacc_vv_i16m2(_s1, _r12, _k12, vl); + _s2 = vwmacc_vv_i16m2(_s2, _r20, _k20, vl); + _s3 = vwmacc_vv_i16m2(_s3, _r21, _k21, vl); + + vint16m2_t _s4 = vwmul_vv_i16m2(_r22, _k22, vl); + + // int16x8_t _s0 = vmull_s8(_r00, _k00); + // int16x8_t _s1 = vmull_s8(_r01, _k01); + // int16x8_t _s2 = 
vmull_s8(_r02, _k02); + // int16x8_t _s3 = vmull_s8(_r10, _k10); + // _s0 = vmlal_s8(_s0, _r11, _k11); + // _s1 = vmlal_s8(_s1, _r12, _k12); + // _s2 = vmlal_s8(_s2, _r20, _k20); + // _s3 = vmlal_s8(_s3, _r21, _k21); + // int16x8_t _s4 = vmull_s8(_r22, _k22); + + vint16m1_t _s0_m1 = vget_v_i16m2_i16m1(_s0, 0); + vint16m1_t _s1_m1 = vget_v_i16m2_i16m1(_s1, 0); + vint16m1_t _s2_m1 = vget_v_i16m2_i16m1(_s2, 0); + vint16m1_t _s3_m1 = vget_v_i16m2_i16m1(_s3, 0); + vint16m1_t _s4_m1 = vget_v_i16m2_i16m1(_s4, 0); + + vint32m2_t _sum = vwadd_vv_i32m2(_s0_m1, _s1_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s2_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s3_m1, vl); + _sum = vwadd_wv_i32m2(_sum, _s4_m1, vl); + + vse32_v_i32m2(outptr0, _sum, vl); + + // int32x4_t _sum0 = vaddl_s16(vget_low_s16(_s0), vget_low_s16(_s1)); + // int32x4_t _sum1 = vaddl_s16(vget_high_s16(_s0), vget_high_s16(_s1)); + // int32x4_t _sum2 = vaddl_s16(vget_low_s16(_s2), vget_low_s16(_s3)); + // int32x4_t _sum3 = vaddl_s16(vget_high_s16(_s2), vget_high_s16(_s3)); + // _sum0 = vaddw_s16(_sum0, vget_low_s16(_s4)); + // _sum1 = vaddw_s16(_sum1, vget_high_s16(_s4)); + // _sum0 = vaddq_s32(_sum0, _sum2); + // _sum1 = vaddq_s32(_sum1, _sum3); + + // vst1q_s32(outptr0, _sum0); + // vst1q_s32(outptr0 + 4, _sum1); + + r0 += 16; + r1 += 16; + r2 += 16; + outptr0 += 8; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } +} diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index d913fe7e1d59..7d7f77d1ca39 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -28,6 +28,10 @@ namespace ncnn { #include "convolutiondepthwise_3x3.h" +#if NCNN_INT8 +#include "convolutiondepthwise_3x3_pack8_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolutiondepthwise_3x3_packn.h" #include "convolutiondepthwise_5x5_packn.h" @@ -61,7 +65,7 @@ int ConvolutionDepthWise_riscv::create_pipeline(const Option& opt) if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { // TODO implement int8 - return 0; + return create_pipeline_int8(opt); } #endif @@ -238,27 +242,8 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return ConvolutionDepthWise::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + fprintf(stderr, "ConvolutionDepthWise_riscv::forward int8 scale is called\n"); + return forward_int8(bottom_blob, top_blob, opt); } #endif @@ -1153,4 +1138,473 @@ int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_ } #endif // __riscv_vector && __riscv_zfh +#if NCNN_INT8 +int ConvolutionDepthWise_riscv::create_pipeline_int8(const Option& opt) +{ + int vl; + const int maxk = kernel_w * kernel_h; + int channels = (weight_data_size / group) / maxk / (num_output / group) * group; + + // depth-wise + if (channels 
== group && group == num_output) + { + int elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + elempack = channels % 8 == 0 ? 8 : 1; + } +#endif // __riscv_vector + + if (elempack == 8) + { + Mat weight_data_r2 = weight_data.reshape(maxk, group); + convert_packing(weight_data_r2, weight_data_tm, 8, opt); + } + + if (elempack == 1) + { + weight_data_tm = weight_data; + } + + weight_data.release(); + + return 0; + } + + // group convolution + create_group_ops(opt); + + weight_data.release(); + + return 0; +} + +int ConvolutionDepthWise_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int vl; + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + int elempack = bottom_blob.elempack; + + int elembits = bottom_blob.elembits(); + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + const int channels_g = channels * elempack / group; + + Mat scales(channels * elempack); + { + float* ps = scales; + for (int g = 0; g < group; g++) + { + float scale = bottom_blob_int8_scales[g]; + for (int q = 0; q < channels_g; q++) + { + *ps++ = scale; + } + } + } + + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q); + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + w = bottom_blob_bordered.w; + h = bottom_blob_bordered.h; + channels = bottom_blob_bordered.c; + elempack = bottom_blob_bordered.elempack; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + fprintf(stderr, "bottom_blob_bordered %d %d %d %d %d\n", bottom_blob_bordered.w, bottom_blob_bordered.h, bottom_blob_bordered.c, bottom_blob_bordered.elempack, bottom_blob_bordered.elemsize); + + // depth-wise + if (channels * elempack == group && group == num_output) // depth-wise conv, 逐通道卷积 + { + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = num_output % 8 == 0 ? 8 : 1; + } +#endif // __riscv_vector + bool use_int8_requantize = int8_scale_term > 100; + fprintf(stderr, "In 1246 use_int8_requantize = %d\n", use_int8_requantize); + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } + + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // TODO use fp16 / bf16 + out_elemsize = use_int8_requantize ? 
1u * out_elempack : 4u * out_elempack; + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + fprintf(stderr, "kernel_w = %d, kernel_h = %d, dilation_w = %d, dilation_h = %d, stride_w = %d, stride_h = %d, activation_type = %d\n", kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, activation_type); + fprintf(stderr, "elempack = %d, out_elempack = %d\n", elempack, out_elempack); + +#if __riscv_vector + if (elempack == 8) + { + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && (activation_type == 0 || activation_type == 1)) + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s1_pack8_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + + Mat scale_in_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1)) + { + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)4u * out_elempack, out_elempack, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + convdw3x3s2_pack8_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, opt); + + Mat scale_in_data(group); + for (int g = 0; g < group; g++) + { + // dequantize + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + scale_in_data[g] = scale_in; + } + fprintf(stderr, "use_int8_requantize = %d\n", use_int8_requantize); + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + } + else + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < channels; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g * 8; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < 
outw; j++) + { + vl = 8; + vint32m2_t _sum0 = vmv_v_x_i32m2(0, vl); + + const signed char* sptr = m.row(i * stride_h) + j * stride_w * 8; + + for (int k = 0; k < maxk; k++) + { + vint8m1_t _val = vle8_v_i8m1(sptr + space_ofs[k] * 8, vl); + vint8m1_t _w = vle8_v_i8m1(kptr + k * 8, vl); + vint16m1_t _s0 = vget_v_i16m2_i16m1(vwmul_vv_i16m2(_val, _w, vl), 0); + + _sum0 = vwadd_wv_i32m2(_sum0, _s0, vl); + } + + vfloat32m2_t _scale_in; + { + vfloat32m2_t _bottom_blob_int8_scales = vle32_v_f32m2((const float*)bottom_blob_int8_scales + g * 8, vl); + vfloat32m2_t _weight_data_int8_scales = vle32_v_f32m2((const float*)weight_data_int8_scales + g * 8, vl); + + _scale_in = vfdiv_vv_f32m2(vfmv_v_f_f32m2(1.f, vl), vfmul_vv_f32m2(_bottom_blob_int8_scales, _weight_data_int8_scales, vl), vl); + vbool16_t _is_zero = vmfeq_vv_f32m2_b16(_bottom_blob_int8_scales, vfmv_v_f_f32m2(0.f, vl), vl); + _scale_in = vfsub_vv_f32m2_m(_is_zero, _scale_in, _scale_in, _scale_in, vl); + } + + vfloat32m2_t _sumfp32 = vfmul_vv_f32m2(vfcvt_f_x_v_f32m2(_sum0, vl), _scale_in, vl); + + if (bias_term) + { + vfloat32m2_t _bias = vle32_v_f32m2((const float*)bias_data + g * 8, vl); + _sumfp32 = vfadd_vv_f32m2(_sumfp32, _bias, vl); + } + _sumfp32 = activation_ps(_sumfp32, activation_type, activation_params, vl); + + if (use_int8_requantize) + { + // requantize + vfloat32m2_t _scale_out = vle32_v_f32m2((const float*)top_blob_int8_scales + g * 8, vl); + vfloat32m2_t _res = vfmul_vv_f32m2(_sumfp32, _scale_out, vl); + int64_t _sum8 = float2int8(vget_v_f32m2_f32m1(_res, 0), vget_v_f32m2_f32m1(_res, 1)); + *(int64_t*)outptr_s8 = _sum8; + outptr_s8 += 8; + } + else + { + // dequantize + vse32_v_f32m2(outptr_f32, _sumfp32, vl); + outptr_f32 += 8; + } + } + } + } + } + } +#endif // __riscv_vector + + if (elempack == 1) + { + { + const int maxk = kernel_w * kernel_h; + + // kernel offsets + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + signed char* outptr_s8 = top_blob.channel(g); + float* outptr_f32 = top_blob.channel(g); + const signed char* kptr = (const signed char*)weight_data_tm + maxk * g; + const Mat m = bottom_blob_bordered.channel(g); + + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* sptr = m.row(i * stride_h) + j * stride_w; + + for (int k = 0; k < maxk; k++) + { + signed char val = sptr[space_ofs[k]]; + signed char w = kptr[k]; + sum += val * w; + } + + float scale_in; + if (weight_data_int8_scales[g] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + + float sumfp32 = sum * scale_in; + + if (bias_term) + sumfp32 += bias_data[g]; + + sumfp32 = activation_ss(sumfp32, activation_type, activation_params); + + if (use_int8_requantize) + { + // requantize + float scale_out = top_blob_int8_scales[g]; + signed char sums8 = float2int8(sumfp32 * scale_out); + outptr_s8[0] = sums8; + outptr_s8 += 1; + } + else + { + // dequantize + outptr_f32[0] = sumfp32; + outptr_f32 += 1; + } + } + } + } + } + } + + return 0; + } + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + if (use_int8_requantize) 
+ out_elempack = num_output % 8 == 0 ? 8 : 1; + else + out_elempack = num_output % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; + // #if NCNN_ARM82 + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } + // #endif + if (opt.use_bf16_storage) + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // group convolution + const int channels_g = channels * elempack / group; + const int num_output_g = num_output / group; + + fprintf(stderr, "group = %d, num_output = %d\n", group, num_output); + + fprintf(stderr, "channels_g = %d, num_output_g = %d\n", channels_g, num_output_g); + + int g_elempack = 1; + int out_g_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + g_elempack = channels_g % 8 == 0 ? 8 : 1; + if (use_int8_requantize) + out_g_elempack = num_output_g % 8 == 0 ? 8 : 1; + else + out_g_elempack = num_output_g % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + + // unpacking + Mat bottom_blob_bordered_unpacked = bottom_blob_bordered; + if (elempack > g_elempack) + { + Option opt_p = opt; + opt_p.blob_allocator = opt.workspace_allocator; + convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p); + } + + Mat top_blob_unpacked = top_blob; + if (out_g_elempack < out_elempack) + { + top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator); + if (top_blob_unpacked.empty()) + return -100; + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int g = 0; g < group; g++) + { + const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack); + Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack); + + const ncnn::Layer* op = group_ops[g]; + + Option opt_g = opt; + opt_g.blob_allocator = top_blob_unpacked.allocator; + + // forward + op->forward(bottom_blob_bordered_g, top_blob_g, opt_g); + } + + // packing + if (out_g_elempack < out_elempack) + { + convert_packing(top_blob_unpacked, top_blob, out_elempack, opt); + } + else + { + top_blob = top_blob_unpacked; + } + + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolutiondepthwise_riscv.h b/src/layer/riscv/convolutiondepthwise_riscv.h index f9503975296d..98b1884d298b 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.h +++ b/src/layer/riscv/convolutiondepthwise_riscv.h @@ -39,6 +39,11 @@ class ConvolutionDepthWise_riscv : public ConvolutionDepthWise int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8(const Option& opt); + int forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif + public: Layer* activation; std::vector group_ops; diff --git a/src/net.cpp b/src/net.cpp index ff2ab6091373..c365fd3e174f 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -708,7 +708,7 @@ int NetPrivate::convert_layout(Mat& bottom_blob, const Layer* layer, const Optio if (elembits == 8) { #if NCNN_RVV - const int packn = ncnn::cpu_riscv_vlenb() / 1; + const int packn = ncnn::cpu_riscv_vlenb() / 2; 
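+            // halving packn for int8 (vlenb / 2) makes VLEN=128 targets use elempack 8, matching the pack8 int8 kernels added in this patch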
if (elemcount % packn == 0) dst_elempack = packn; #else