diff --git a/.gitignore b/.gitignore
index f7687219..f9309490 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,7 @@ build*
 third_party/googletest
 third_party/turbojpeg
 third_party/benchmark
-tests/data
\ No newline at end of file
+tests/data
+
+# IDE configs
+.vscode/settings.json
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c518d85e..1740fae5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -279,10 +279,10 @@ else()
     add_compile_options(-march=armv8-a)
     add_compile_options(-fno-lax-vector-conversions)
   elseif(ARCH STREQUAL "riscv64")
-    add_compile_options(-march=rv64gc)
+    add_compile_options(-march=rv64gcv)
     add_compile_options(-mabi=lp64d)
   elseif(ARCH STREQUAL "riscv32")
-    add_compile_options(-march=rv32gc)
+    add_compile_options(-march=rv32gcv)
     add_compile_options(-mabi=ilp32d)
   elseif(ARCH STREQUAL "loong64")
     add_compile_options(-march=loongarch64)
@@ -553,6 +553,14 @@ if(UHDR_ENABLE_INTRINSICS)
     file(GLOB UHDR_CORE_NEON_SRCS_LIST "${SOURCE_DIR}/src/dsp/arm/*.cpp")
     list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_NEON_SRCS_LIST})
   endif()
+  if(ARCH STREQUAL "riscv64")
+    file(GLOB UHDR_CORE_RVV_SRCS_LIST "${SOURCE_DIR}/src/dsp/riscv/*.cpp")
+    list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_RVV_SRCS_LIST})
+  endif()
+  if(ARCH STREQUAL "riscv32")
+    file(GLOB UHDR_CORE_RVV_SRCS_LIST "${SOURCE_DIR}/src/dsp/riscv/*.cpp")
+    list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_RVV_SRCS_LIST})
+  endif()
 endif()
 if(UHDR_ENABLE_GLES)
   file(GLOB UHDR_CORE_GLES_SRCS_LIST "${SOURCE_DIR}/src/gpu/*.cpp")
diff --git a/lib/include/ultrahdr/gainmapmath.h b/lib/include/ultrahdr/gainmapmath.h
index d604ad2b..61144e12 100644
--- a/lib/include/ultrahdr/gainmapmath.h
+++ b/lib/include/ultrahdr/gainmapmath.h
@@ -414,14 +414,20 @@ extern const std::array<float, 9> kYuvBt601ToBt2100;
 extern const std::array<float, 9> kYuvBt2100ToBt709;
 extern const std::array<float, 9> kYuvBt2100ToBt601;
 
-#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+#ifdef UHDR_ENABLE_INTRINSICS
+
+extern const int16_t kYuv709To601_coeffs_simd[8];
+extern const int16_t kYuv709To2100_coeffs_simd[8];
+extern const int16_t kYuv601To709_coeffs_simd[8];
+extern const int16_t kYuv601To2100_coeffs_simd[8];
+extern const int16_t kYuv2100To709_coeffs_simd[8];
+extern const int16_t kYuv2100To601_coeffs_simd[8];
+
+extern const uint16_t kRgb709ToYuv_coeffs_simd[8];
+extern const uint16_t kRgbDispP3ToYuv_coeffs_simd[8];
+extern const uint16_t kRgb2100ToYuv_coeffs_simd[8];
 
-extern const int16_t kYuv709To601_coeffs_neon[8];
-extern const int16_t kYuv709To2100_coeffs_neon[8];
-extern const int16_t kYuv601To709_coeffs_neon[8];
-extern const int16_t kYuv601To2100_coeffs_neon[8];
-extern const int16_t kYuv2100To709_coeffs_neon[8];
-extern const int16_t kYuv2100To601_coeffs_neon[8];
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
 
 /*
  * The Y values are provided at half the width of U & V values to allow use of the widening
@@ -435,6 +441,15 @@ void transformYuv444_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);
 
 uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
                                   uhdr_color_gamut_t dst_encoding);
+
+#elif defined(__riscv_v_intrinsic)
+
+void transformYuv420_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);
+
+uhdr_error_info_t convertYuv_rvv(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
+                                 uhdr_color_gamut_t dst_encoding);
+
+#endif
 #endif
 
 // Performs a color gamut transformation on an yuv image.
@@ -588,6 +603,8 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr(
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
 std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t* src);
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_rvv(uhdr_raw_image_t* src);
 #endif
 
 bool floatToSignedFraction(float v, int32_t* numerator, uint32_t* denominator);
diff --git a/lib/src/dsp/arm/gainmapmath_neon.cpp b/lib/src/dsp/arm/gainmapmath_neon.cpp
index 306a971a..aba42898 100644
--- a/lib/src/dsp/arm/gainmapmath_neon.cpp
+++ b/lib/src/dsp/arm/gainmapmath_neon.cpp
@@ -27,55 +27,6 @@
 
 namespace ultrahdr {
 
-// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
-// by one error compared to the scalar floating-point implementation.
-
-// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
-// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
-// {Y1, Y2, U1, U2, V1, V2, 0, 0}
-
-// Yuv Bt709 -> Yuv Bt601
-// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
-// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
-// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
-ALIGNED(16)
-const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};
-
-// Yuv Bt709 -> Yuv Bt2100
-// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
-// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
-// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
-ALIGNED(16)
-const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};
-
-// Yuv Bt601 -> Yuv Bt709
-// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V),
-// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V),
-// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V);
-ALIGNED(16)
-const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};
-
-// Yuv Bt601 -> Yuv Bt2100
-// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
-// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
-// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
-ALIGNED(16)
-const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};
-
-// Yuv Bt2100 -> Yuv Bt709
-// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
-// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
-// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
-ALIGNED(16)
-const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};
-
-// Yuv Bt2100 -> Yuv Bt601
-// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
-// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
-// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
-ALIGNED(16)
-const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
-
 static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
   int32x4_t lo = vmull_lane_s16(vget_low_s16(u), vget_low_s16(coeffs), 0);
   int32x4_t hi = vmull_lane_s16(vget_high_s16(u), vget_low_s16(coeffs), 0);
@@ -244,10 +195,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
         case UHDR_CG_BT_709:
           return status;
         case UHDR_CG_DISPLAY_P3:
-          coeffs = kYuv709To601_coeffs_neon;
+          coeffs = kYuv709To601_coeffs_simd;
          break;
        case UHDR_CG_BT_2100:
-          coeffs = kYuv709To2100_coeffs_neon;
+          coeffs = kYuv709To2100_coeffs_simd;
          break;
        default:
          status.error_code = UHDR_CODEC_INVALID_PARAM;
@@ -260,12 +211,12 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
     case UHDR_CG_DISPLAY_P3:
       switch (dst_encoding) {
         case UHDR_CG_BT_709:
-          coeffs = kYuv601To709_coeffs_neon;
+          coeffs = kYuv601To709_coeffs_simd;
          break;
        case UHDR_CG_DISPLAY_P3:
          return status;
        case UHDR_CG_BT_2100:
-          coeffs = kYuv601To2100_coeffs_neon;
+          coeffs = kYuv601To2100_coeffs_simd;
          break;
        default:
          status.error_code = UHDR_CODEC_INVALID_PARAM;
@@ -278,10 +229,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
     case UHDR_CG_BT_2100:
       switch (dst_encoding) {
         case UHDR_CG_BT_709:
-          coeffs = kYuv2100To709_coeffs_neon;
+          coeffs = kYuv2100To709_coeffs_simd;
          break;
        case UHDR_CG_DISPLAY_P3:
-          coeffs = kYuv2100To601_coeffs_neon;
+          coeffs = kYuv2100To601_coeffs_simd;
          break;
        case UHDR_CG_BT_2100:
          return status;
@@ -317,33 +268,6 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
   return status;
 }
 
-// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
-// by one error compared to the scalar floating-point implementation.
-
-// In the 3x3 conversion matrix, 0.5 is duplicated. But represented as only one entry in lut leaving
-// with an array size of 8 elements.
-
-// RGB Bt709 -> Yuv Bt709
-// Y = 0.212639 * R + 0.715169 * G + 0.072192 * B
-// U = -0.114592135 * R + -0.385407865 * G + 0.5 * B
-// V = 0.5 * R + -0.454155718 * G + -0.045844282 * B
-ALIGNED(16)
-const uint16_t kRgb709ToYuv_coeffs_neon[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};
-
-// RGB Display P3 -> Yuv Display P3
-// Y = 0.2289746 * R + 0.6917385 * G + 0.0792869 * B
-// U = -0.124346335 * R + -0.375653665 * G + 0.5 * B
-// V = 0.5 * R + -0.448583471 * G + -0.051416529 * B
-ALIGNED(16)
-const uint16_t kRgbDispP3ToYuv_coeffs_neon[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};
-
-// RGB Bt2100 -> Yuv Bt2100
-// Y = 0.2627 * R + 0.677998 * G + 0.059302 * B
-// U = -0.13963036 * R + -0.36036964 * G + 0.5 * B
-// V = 0.5 * R + -0.459784348 * G + -0.040215652 * B
-ALIGNED(16)
-const uint16_t kRgb2100ToYuv_coeffs_neon[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};
-
 // The core logic is taken from jsimd_rgb_ycc_convert_neon implementation in jccolext-neon.c of
 // libjpeg-turbo
 static void ConvertRgba8888ToYuv444_neon(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
@@ -460,11 +384,11 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_i
   const uint16_t* coeffs_ptr = nullptr;
 
   if (src->cg == UHDR_CG_BT_709) {
-    coeffs_ptr = kRgb709ToYuv_coeffs_neon;
+    coeffs_ptr = kRgb709ToYuv_coeffs_simd;
   } else if (src->cg == UHDR_CG_BT_2100) {
-    coeffs_ptr = kRgbDispP3ToYuv_coeffs_neon;
+    coeffs_ptr = kRgbDispP3ToYuv_coeffs_simd;
   } else if (src->cg == UHDR_CG_DISPLAY_P3) {
-    coeffs_ptr = kRgb2100ToYuv_coeffs_neon;
+    coeffs_ptr = kRgb2100ToYuv_coeffs_simd;
   } else {
     return dst;
   }
diff --git a/lib/src/dsp/riscv/gainmapmath_rvv.cpp b/lib/src/dsp/riscv/gainmapmath_rvv.cpp
new file mode 100644
index 00000000..9d9d958b
--- /dev/null
+++ b/lib/src/dsp/riscv/gainmapmath_rvv.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright 2024 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ultrahdr/gainmapmath.h"
+
+#include <riscv_vector.h>
+#include <cassert>
+
+namespace ultrahdr {
+
+// Duplicates each 16-bit element in place: {a0, a1, ...} -> {a0, a0, a1, a1, ...}. The
+// zero-extension interleaves a zero lane above every element; sliding that vector up by one
+// lane and adding merges the two copies.
+static inline vuint16m8_t zip_self(vuint16m4_t a, size_t vl) {
+  vuint32m8_t a_wide = __riscv_vzext_vf2_u32m8(a, vl / 2);
+  vuint16m8_t a_zero = __riscv_vreinterpret_v_u32m8_u16m8(a_wide);
+  vuint16m8_t a_zero_slide = __riscv_vslide1up_vx_u16m8(a_zero, 0, vl);
+  vuint16m8_t a_zip = __riscv_vadd_vv_u16m8(a_zero, a_zero_slide, vl);
+  return a_zip;
+}
+
+// The helpers below emulate the Arm NEON idioms of the same names on top of RVV register
+// groups (m8 vectors play the role of NEON "q" registers, m4/m2 vectors their halves), so
+// the conversion routines read like their NEON counterparts.
+static inline vint16m4_t vqrshrn_n_s32(vint32m8_t a, const int b, size_t vl) {
+  return __riscv_vnclip_wx_i16m4(a, b, vl);
+}
+
+static inline vuint8m4_t vget_low_u8(vuint8m8_t u) { return __riscv_vget_v_u8m8_u8m4(u, 0); }
+
+static inline vuint8m4_t vget_high_u8(vuint8m8_t u, size_t vl) {
+  return __riscv_vget_v_u8m8_u8m4(__riscv_vslidedown_vx_u8m8(u, vl / 2, vl), 0);
+}
+
+static inline vint16m4_t vget_low_s16(vint16m8_t u) { return __riscv_vget_v_i16m8_i16m4(u, 0); }
+
+static inline vint16m4_t vget_high_s16(vint16m8_t u, size_t vl) {
+  return __riscv_vget_v_i16m8_i16m4(__riscv_vslidedown_vx_i16m8(u, vl / 2, vl), 0);
+}
+
+static inline vuint16m4_t vget_low_u16(vuint16m8_t u) { return __riscv_vget_v_u16m8_u16m4(u, 0); }
+
+static inline vuint16m2_t vget_low_u16m4(vuint16m4_t u) { return __riscv_vget_v_u16m4_u16m2(u, 0); }
+
+static inline vuint16m4_t vget_high_u16(vuint16m8_t u, size_t vl) {
+  return __riscv_vget_v_u16m8_u16m4(__riscv_vslidedown_vx_u16m8(u, vl / 2, vl), 0);
+}
+
+static inline vuint16m2_t vget_high_u16m4(vuint16m4_t u, size_t vl) {
+  return __riscv_vget_v_u16m4_u16m2(__riscv_vslidedown_vx_u16m4(u, vl / 2, vl), 0);
+}
+
+static inline vint16m8_t vcombine_s16(vint16m4_t a, vint16m4_t b, size_t vl) {
+  vint16m8_t a_wide = __riscv_vlmul_ext_v_i16m4_i16m8(a);
+  vint16m8_t b_wide = __riscv_vlmul_ext_v_i16m4_i16m8(b);
+  return __riscv_vslideup_vx_i16m8(a_wide, b_wide, vl / 2, vl);
+}
+
+static inline vuint8m8_t vcombine_u8(vuint8m4_t a, vuint8m4_t b, size_t vl) {
+  vuint8m8_t a_wide = __riscv_vlmul_ext_v_u8m4_u8m8(a);
+  vuint8m8_t b_wide = __riscv_vlmul_ext_v_u8m4_u8m8(b);
+  return __riscv_vslideup_vx_u8m8(a_wide, b_wide, vl / 2, vl);
+}
+
+static inline vuint16m8_t vcombine_u16(vuint16m4_t a, vuint16m4_t b, size_t vl) {
+  vuint16m8_t a_wide = __riscv_vlmul_ext_v_u16m4_u16m8(a);
+  vuint16m8_t b_wide = __riscv_vlmul_ext_v_u16m4_u16m8(b);
+  return __riscv_vslideup_vx_u16m8(a_wide, b_wide, vl / 2, vl);
+}
+
+static inline vuint8m4_t vmovn_u16(vuint16m8_t a, size_t vl) {
+  return __riscv_vnsrl_wx_u8m4(a, 0, vl);
+}
+
+static inline vuint8m4_t vqmovun_s16(vint16m8_t a, size_t vl) {
+  vuint16m8_t a_non_neg = __riscv_vreinterpret_v_i16m8_u16m8(__riscv_vmax_vx_i16m8(a, 0, vl));
+  return __riscv_vnclipu_wx_u8m4(a_non_neg, 0, vl);
+}
+
+static inline vuint16m4_t vmovl_u8(vuint8m4_t a, size_t vl) {
+  vuint16m8_t a_16 = __riscv_vzext_vf2_u16m8(a, vl);
+  return __riscv_vlmul_trunc_v_u16m8_u16m4(a_16);
+}
+
+static inline vuint16m4_t vrshrn_n_u32(vuint32m4_t a, const int b, size_t vl) {
+  vuint32m4_t a_round = __riscv_vadd_vx_u32m4(a, 1 << (b - 1), vl);
+  return __riscv_vnsrl_wx_u16m4(__riscv_vlmul_ext_v_u32m4_u32m8(a_round), b, vl);
+}
+
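+// The three conversion helpers below multiply the widened U/V samples by the packed Q14
+// gamut coefficients using widening multiplies, accumulate in 32 bits, and narrow back to
+// 16 bits with a rounding right shift by 14. yConversion_rvv additionally adds the original
+// Y samples on top, mirroring yConversion_neon/uConversion_neon/vConversion_neon.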
+static inline vint16m8_t yConversion_rvv(vuint8m4_t y, vint16m8_t u, vint16m8_t v,
+                                         const int16_t* coeffs, size_t vl) {
+  vint32m8_t u_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(u), coeffs[0], vl / 2);
+  vint32m8_t u_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(u, vl), coeffs[0], vl / 2);
+
+  vint32m8_t v_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(v), coeffs[1], vl / 2);
+  vint32m8_t v_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(v, vl), coeffs[1], vl / 2);
+
+  vint32m8_t lo = __riscv_vadd_vv_i32m8(u_lo, v_lo, vl / 2);
+  vint32m8_t hi = __riscv_vadd_vv_i32m8(u_hi, v_hi, vl / 2);
+
+  vint16m4_t lo_shr = vqrshrn_n_s32(lo, 14, vl / 2);
+  vint16m4_t hi_shr = vqrshrn_n_s32(hi, 14, vl / 2);
+
+  vint16m8_t y_output = vcombine_s16(lo_shr, hi_shr, vl);
+  vuint16m8_t y_u16 = __riscv_vreinterpret_v_i16m8_u16m8(y_output);
+  vuint16m8_t y_ret = __riscv_vwaddu_wv_u16m8(y_u16, y, vl);
+  return __riscv_vreinterpret_v_u16m8_i16m8(y_ret);
+}
+
+static inline vint16m8_t uConversion_rvv(vint16m8_t u, vint16m8_t v, const int16_t* coeffs,
+                                         size_t vl) {
+  vint32m8_t u_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(u), coeffs[2], vl / 2);
+  vint32m8_t u_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(u, vl), coeffs[2], vl / 2);
+
+  vint32m8_t v_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(v), coeffs[3], vl / 2);
+  vint32m8_t v_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(v, vl), coeffs[3], vl / 2);
+
+  vint32m8_t lo = __riscv_vadd_vv_i32m8(u_lo, v_lo, vl / 2);
+  vint32m8_t hi = __riscv_vadd_vv_i32m8(u_hi, v_hi, vl / 2);
+
+  vint16m4_t lo_shr = vqrshrn_n_s32(lo, 14, vl / 2);
+  vint16m4_t hi_shr = vqrshrn_n_s32(hi, 14, vl / 2);
+
+  vint16m8_t u_output = vcombine_s16(lo_shr, hi_shr, vl);
+  return u_output;
+}
+
+static inline vint16m8_t vConversion_rvv(vint16m8_t u, vint16m8_t v, const int16_t* coeffs,
+                                         size_t vl) {
+  vint32m8_t u_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(u), coeffs[4], vl / 2);
+  vint32m8_t u_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(u, vl), coeffs[4], vl / 2);
+
+  vint32m8_t v_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(v), coeffs[5], vl / 2);
+  vint32m8_t v_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(v, vl), coeffs[5], vl / 2);
+
+  vint32m8_t lo = __riscv_vadd_vv_i32m8(u_lo, v_lo, vl / 2);
+  vint32m8_t hi = __riscv_vadd_vv_i32m8(u_hi, v_hi, vl / 2);
+
+  vint16m4_t lo_shr = vqrshrn_n_s32(lo, 14, vl / 2);
+  vint16m4_t hi_shr = vqrshrn_n_s32(hi, 14, vl / 2);
+
+  vint16m8_t v_output = vcombine_s16(lo_shr, hi_shr, vl);
+  return v_output;
+}
+
+void transformYuv420_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
+  assert(image->w % 16 == 0);
+  uint8_t* y0_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
+  uint8_t* y1_ptr = y0_ptr + image->stride[UHDR_PLANE_Y];
+  uint8_t* u_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
+  uint8_t* v_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);
+  size_t vl;
+  size_t h = 0;
+  do {
+    size_t w = 0;
+    do {
+      // w indexes chroma samples, so the remaining luma count is image->w - w * 2.
+      vl = __riscv_vsetvl_e8m8(image->w - w * 2);
+      assert((vl % 4) == 0 && vl >= 4);
+
+      vuint8m8_t y0 = __riscv_vle8_v_u8m8(y0_ptr + w * 2, vl);
+      vuint8m8_t y1 = __riscv_vle8_v_u8m8(y1_ptr + w * 2, vl);
+
+      vuint8m4_t u8 = __riscv_vle8_v_u8m4(u_ptr + w, vl / 2);
+      vuint8m4_t v8 = __riscv_vle8_v_u8m4(v_ptr + w, vl / 2);
+
+      vuint16m8_t u16_wide = __riscv_vwsubu_vx_u16m8(u8, 128, vl / 2);
+      vuint16m8_t v16_wide = __riscv_vwsubu_vx_u16m8(v8, 128, vl / 2);
+
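+      // Each 4:2:0 chroma sample covers two luma columns; duplicate every U/V sample so the
+      // chroma vectors line up 1:1 with the luma vectors.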
+      vuint16m8_t uu_wide_lo = zip_self(vget_low_u16(u16_wide), vl / 2);
+      vuint16m8_t uu_wide_hi = zip_self(vget_high_u16(u16_wide, vl / 2), vl / 2);
+      vuint16m8_t uv_wide_lo = zip_self(vget_low_u16(v16_wide), vl / 2);
+      vuint16m8_t uv_wide_hi = zip_self(vget_high_u16(v16_wide, vl / 2), vl / 2);
+
+      vint16m8_t u_wide_lo = __riscv_vreinterpret_v_u16m8_i16m8(uu_wide_lo);
+      vint16m8_t v_wide_lo = __riscv_vreinterpret_v_u16m8_i16m8(uv_wide_lo);
+      vint16m8_t u_wide_hi = __riscv_vreinterpret_v_u16m8_i16m8(uu_wide_hi);
+      vint16m8_t v_wide_hi = __riscv_vreinterpret_v_u16m8_i16m8(uv_wide_hi);
+
+      vint16m8_t y0_lo = yConversion_rvv(vget_low_u8(y0), u_wide_lo, v_wide_lo, coeffs_ptr, vl / 2);
+      vint16m8_t y1_lo = yConversion_rvv(vget_low_u8(y1), u_wide_lo, v_wide_lo, coeffs_ptr, vl / 2);
+      vint16m8_t y0_hi =
+          yConversion_rvv(vget_high_u8(y0, vl), u_wide_hi, v_wide_hi, coeffs_ptr, vl / 2);
+      vint16m8_t y1_hi =
+          yConversion_rvv(vget_high_u8(y1, vl), u_wide_hi, v_wide_hi, coeffs_ptr, vl / 2);
+
+      vint16m8_t u_wide_s16 = __riscv_vreinterpret_v_u16m8_i16m8(u16_wide);
+      vint16m8_t v_wide_s16 = __riscv_vreinterpret_v_u16m8_i16m8(v16_wide);
+      vint16m8_t new_u = uConversion_rvv(u_wide_s16, v_wide_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_v = vConversion_rvv(u_wide_s16, v_wide_s16, coeffs_ptr, vl / 2);
+
+      vuint8m8_t y0_output =
+          vcombine_u8(vqmovun_s16(y0_lo, vl / 2), vqmovun_s16(y0_hi, vl / 2), vl);
+      vuint8m8_t y1_output =
+          vcombine_u8(vqmovun_s16(y1_lo, vl / 2), vqmovun_s16(y1_hi, vl / 2), vl);
+      vuint8m4_t u_output = vqmovun_s16(__riscv_vadd_vx_i16m8(new_u, 128, vl / 2), vl / 2);
+      vuint8m4_t v_output = vqmovun_s16(__riscv_vadd_vx_i16m8(new_v, 128, vl / 2), vl / 2);
+
+      __riscv_vse8_v_u8m8(y0_ptr + w * 2, y0_output, vl);
+      __riscv_vse8_v_u8m8(y1_ptr + w * 2, y1_output, vl);
+      __riscv_vse8_v_u8m4(u_ptr + w, u_output, vl / 2);
+      __riscv_vse8_v_u8m4(v_ptr + w, v_output, vl / 2);
+
+      w += (vl / 2);
+    } while (w < image->w / 2);
+    y0_ptr += image->stride[UHDR_PLANE_Y] * 2;
+    y1_ptr += image->stride[UHDR_PLANE_Y] * 2;
+    u_ptr += image->stride[UHDR_PLANE_U];
+    v_ptr += image->stride[UHDR_PLANE_V];
+  } while (++h < image->h / 2);
+}
+
+void transformYuv444_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
+  // Implementation assumes the image width is a multiple of 16.
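+  // The low/high half splits below rely on vl staying even on every iteration.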
+  assert(image->w % 16 == 0);
+  uint8_t* y_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
+  uint8_t* u_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
+  uint8_t* v_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);
+
+  size_t vl;
+  size_t h = 0;
+  do {
+    size_t w = 0;
+    do {
+      vl = __riscv_vsetvl_e8m8(image->w - w);
+
+      vuint8m8_t y = __riscv_vle8_v_u8m8(y_ptr + w, vl);
+      vuint8m8_t u = __riscv_vle8_v_u8m8(u_ptr + w, vl);
+      vuint8m8_t v = __riscv_vle8_v_u8m8(v_ptr + w, vl);
+
+      vuint16m8_t u16_wide_low = __riscv_vwsubu_vx_u16m8(vget_low_u8(u), 128, vl / 2);
+      vuint16m8_t v16_wide_low = __riscv_vwsubu_vx_u16m8(vget_low_u8(v), 128, vl / 2);
+      vuint16m8_t u16_wide_high = __riscv_vwsubu_vx_u16m8(vget_high_u8(u, vl), 128, vl / 2);
+      vuint16m8_t v16_wide_high = __riscv_vwsubu_vx_u16m8(vget_high_u8(v, vl), 128, vl / 2);
+
+      vint16m8_t u_wide_low_s16 = __riscv_vreinterpret_v_u16m8_i16m8(u16_wide_low);
+      vint16m8_t v_wide_low_s16 = __riscv_vreinterpret_v_u16m8_i16m8(v16_wide_low);
+      vint16m8_t u_wide_high_s16 = __riscv_vreinterpret_v_u16m8_i16m8(u16_wide_high);
+      vint16m8_t v_wide_high_s16 = __riscv_vreinterpret_v_u16m8_i16m8(v16_wide_high);
+
+      vint16m8_t y_lo =
+          yConversion_rvv(vget_low_u8(y), u_wide_low_s16, v_wide_low_s16, coeffs_ptr, vl / 2);
+      vint16m8_t y_hi = yConversion_rvv(vget_high_u8(y, vl), u_wide_high_s16, v_wide_high_s16,
+                                        coeffs_ptr, vl / 2);
+
+      vint16m8_t new_u_lo = uConversion_rvv(u_wide_low_s16, v_wide_low_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_v_lo = vConversion_rvv(u_wide_low_s16, v_wide_low_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_u_hi = uConversion_rvv(u_wide_high_s16, v_wide_high_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_v_hi = vConversion_rvv(u_wide_high_s16, v_wide_high_s16, coeffs_ptr, vl / 2);
+
+      // Narrow from 16-bit to 8-bit with saturation.
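+      // Chroma gets its +128 bias restored before the saturating narrow, undoing the earlier
+      // centering around zero.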
+      vuint8m8_t y_output = vcombine_u8(vqmovun_s16(y_lo, vl / 2), vqmovun_s16(y_hi, vl / 2), vl);
+      vuint8m4_t u_output_hi = vqmovun_s16(__riscv_vadd_vx_i16m8(new_u_hi, 128, vl / 2), vl / 2);
+      vuint8m4_t u_output_lo = vqmovun_s16(__riscv_vadd_vx_i16m8(new_u_lo, 128, vl / 2), vl / 2);
+      vuint8m4_t v_output_hi = vqmovun_s16(__riscv_vadd_vx_i16m8(new_v_hi, 128, vl / 2), vl / 2);
+      vuint8m4_t v_output_lo = vqmovun_s16(__riscv_vadd_vx_i16m8(new_v_lo, 128, vl / 2), vl / 2);
+
+      vuint8m8_t u_output = vcombine_u8(u_output_lo, u_output_hi, vl);
+      vuint8m8_t v_output = vcombine_u8(v_output_lo, v_output_hi, vl);
+
+      __riscv_vse8_v_u8m8(y_ptr + w, y_output, vl);
+      __riscv_vse8_v_u8m8(u_ptr + w, u_output, vl);
+      __riscv_vse8_v_u8m8(v_ptr + w, v_output, vl);
+
+      w += vl;
+    } while (w < image->w);
+    y_ptr += image->stride[UHDR_PLANE_Y];
+    u_ptr += image->stride[UHDR_PLANE_U];
+    v_ptr += image->stride[UHDR_PLANE_V];
+  } while (++h < image->h);
+}
+
+uhdr_error_info_t convertYuv_rvv(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
+                                 uhdr_color_gamut_t dst_encoding) {
+  uhdr_error_info_t status = g_no_error;
+  const int16_t* coeffs = nullptr;
+
+  switch (src_encoding) {
+    case UHDR_CG_BT_709:
+      switch (dst_encoding) {
+        case UHDR_CG_BT_709:
+          return status;
+        case UHDR_CG_DISPLAY_P3:
+          coeffs = kYuv709To601_coeffs_simd;
+          break;
+        case UHDR_CG_BT_2100:
+          coeffs = kYuv709To2100_coeffs_simd;
+          break;
+        default:
+          status.error_code = UHDR_CODEC_INVALID_PARAM;
+          status.has_detail = 1;
+          snprintf(status.detail, sizeof status.detail, "Unrecognized dest color gamut %d",
+                   dst_encoding);
+          return status;
+      }
+      break;
+    case UHDR_CG_DISPLAY_P3:
+      switch (dst_encoding) {
+        case UHDR_CG_BT_709:
+          coeffs = kYuv601To709_coeffs_simd;
+          break;
+        case UHDR_CG_DISPLAY_P3:
+          return status;
+        case UHDR_CG_BT_2100:
+          coeffs = kYuv601To2100_coeffs_simd;
+          break;
+        default:
+          status.error_code = UHDR_CODEC_INVALID_PARAM;
+          status.has_detail = 1;
+          snprintf(status.detail, sizeof status.detail, "Unrecognized dest color gamut %d",
+                   dst_encoding);
+          return status;
+      }
+      break;
+    case UHDR_CG_BT_2100:
+      switch (dst_encoding) {
+        case UHDR_CG_BT_709:
+          coeffs = kYuv2100To709_coeffs_simd;
+          break;
+        case UHDR_CG_DISPLAY_P3:
+          coeffs = kYuv2100To601_coeffs_simd;
+          break;
+        case UHDR_CG_BT_2100:
+          return status;
+        default:
+          status.error_code = UHDR_CODEC_INVALID_PARAM;
+          status.has_detail = 1;
+          snprintf(status.detail, sizeof status.detail, "Unrecognized dest color gamut %d",
+                   dst_encoding);
+          return status;
+      }
+      break;
+    default:
+      status.error_code = UHDR_CODEC_INVALID_PARAM;
+      status.has_detail = 1;
+      snprintf(status.detail, sizeof status.detail, "Unrecognized src color gamut %d",
+               src_encoding);
+      return status;
+  }
+
+  if (image->fmt == UHDR_IMG_FMT_12bppYCbCr420) {
+    transformYuv420_rvv(image, coeffs);
+  } else if (image->fmt == UHDR_IMG_FMT_24bppYCbCr444) {
+    transformYuv444_rvv(image, coeffs);
+  } else {
+    status.error_code = UHDR_CODEC_UNSUPPORTED_FEATURE;
+    status.has_detail = 1;
+    snprintf(status.detail, sizeof status.detail,
+             "No implementation available for performing gamut conversion for color format %d",
+             image->fmt);
+    return status;
+  }
+
+  return status;
+}
+
+static void ConvertRgba8888ToYuv444_rvv(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
+                                        const uint16_t* coeffs_ptr) {
+  assert(src->stride[UHDR_PLANE_PACKED] % 16 == 0);
+  uint8_t* rgba_base_ptr = static_cast<uint8_t*>(src->planes[UHDR_PLANE_PACKED]);
+
+  uint8_t* y_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_Y]);
+  uint8_t* u_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_U]);
+  uint8_t* v_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_V]);
+
+  uint32_t bias = (128 << 14) + 8191;
+
+  size_t vl;
+  size_t h = 0;
+  do {
+    size_t w = 0;
+    uint8_t* rgba_ptr = rgba_base_ptr + (size_t)src->stride[UHDR_PLANE_PACKED] * 4 * h;
+    uint8_t* y_ptr = y_base_ptr + (size_t)dst->stride[UHDR_PLANE_Y] * h;
+    uint8_t* u_ptr = u_base_ptr + (size_t)dst->stride[UHDR_PLANE_U] * h;
+    uint8_t* v_ptr = v_base_ptr + (size_t)dst->stride[UHDR_PLANE_V] * h;
+    do {
+      vl = __riscv_vsetvl_e8m8(src->w - w);
+      assert(vl % 4 == 0);
+
+      // De-interleave the packed RGBA pixels with strided loads; alpha is skipped.
+      vuint8m8_t r = __riscv_vlse8_v_u8m8(rgba_ptr, 4, vl);
+      vuint8m8_t g = __riscv_vlse8_v_u8m8(rgba_ptr + 1, 4, vl);
+      vuint8m8_t b = __riscv_vlse8_v_u8m8(rgba_ptr + 2, 4, vl);
+
+      vuint16m4_t r_l = vmovl_u8(vget_low_u8(r), vl / 2);
+      vuint16m4_t r_h = vmovl_u8(vget_high_u8(r, vl), vl / 2);
+      vuint16m4_t g_l = vmovl_u8(vget_low_u8(g), vl / 2);
+      vuint16m4_t g_h = vmovl_u8(vget_high_u8(g, vl), vl / 2);
+      vuint16m4_t b_l = vmovl_u8(vget_low_u8(b), vl / 2);
+      vuint16m4_t b_h = vmovl_u8(vget_high_u8(b, vl), vl / 2);
+
+      vuint32m4_t y_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_l), coeffs_ptr[0], vl / 4);
+      y_ll = __riscv_vwmaccu_vx_u32m4(y_ll, coeffs_ptr[1], vget_low_u16m4(g_l), vl / 4);
+      y_ll = __riscv_vwmaccu_vx_u32m4(y_ll, coeffs_ptr[2], vget_low_u16m4(b_l), vl / 4);
+      vuint32m4_t y_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_l, vl / 2), coeffs_ptr[0], vl / 4);
+      y_lh = __riscv_vwmaccu_vx_u32m4(y_lh, coeffs_ptr[1], vget_high_u16m4(g_l, vl / 2), vl / 4);
+      y_lh = __riscv_vwmaccu_vx_u32m4(y_lh, coeffs_ptr[2], vget_high_u16m4(b_l, vl / 2), vl / 4);
+      vuint32m4_t y_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_h), coeffs_ptr[0], vl / 4);
+      y_hl = __riscv_vwmaccu_vx_u32m4(y_hl, coeffs_ptr[1], vget_low_u16m4(g_h), vl / 4);
+      y_hl = __riscv_vwmaccu_vx_u32m4(y_hl, coeffs_ptr[2], vget_low_u16m4(b_h), vl / 4);
+      vuint32m4_t y_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_h, vl / 2), coeffs_ptr[0], vl / 4);
+      y_hh = __riscv_vwmaccu_vx_u32m4(y_hh, coeffs_ptr[1], vget_high_u16m4(g_h, vl / 2), vl / 4);
+      y_hh = __riscv_vwmaccu_vx_u32m4(y_hh, coeffs_ptr[2], vget_high_u16m4(b_h, vl / 2), vl / 4);
+
+      // B - R - G + bias
+      vuint32m4_t cb_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_l), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_l), coeffs_ptr[3], vl / 4);
+      cb_ll = __riscv_vsub_vv_u32m4(cb_ll, cb_r_ll, vl / 4);
+      vuint32m4_t cb_g_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_l), coeffs_ptr[4], vl / 4);
+      cb_ll = __riscv_vsub_vv_u32m4(cb_ll, cb_g_ll, vl / 4);
+      cb_ll = __riscv_vadd_vx_u32m4(cb_ll, bias, vl / 4);
+
+      vuint32m4_t cb_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_l, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_l, vl / 2), coeffs_ptr[3], vl / 4);
+      cb_lh = __riscv_vsub_vv_u32m4(cb_lh, cb_r_lh, vl / 4);
+      vuint32m4_t cb_g_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_l, vl / 2), coeffs_ptr[4], vl / 4);
+      cb_lh = __riscv_vsub_vv_u32m4(cb_lh, cb_g_lh, vl / 4);
+      cb_lh = __riscv_vadd_vx_u32m4(cb_lh, bias, vl / 4);
+
+      vuint32m4_t cb_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_h), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_h), coeffs_ptr[3], vl / 4);
+      cb_hl = __riscv_vsub_vv_u32m4(cb_hl, cb_r_hl, vl / 4);
+      vuint32m4_t cb_g_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_h), coeffs_ptr[4], vl / 4);
+      cb_hl = __riscv_vsub_vv_u32m4(cb_hl, cb_g_hl, vl / 4);
+      cb_hl = __riscv_vadd_vx_u32m4(cb_hl, bias, vl / 4);
+
+      vuint32m4_t cb_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_h, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_h, vl / 2), coeffs_ptr[3], vl / 4);
+      cb_hh = __riscv_vsub_vv_u32m4(cb_hh, cb_r_hh, vl / 4);
+      vuint32m4_t cb_g_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_h, vl / 2), coeffs_ptr[4], vl / 4);
+      cb_hh = __riscv_vsub_vv_u32m4(cb_hh, cb_g_hh, vl / 4);
+      cb_hh = __riscv_vadd_vx_u32m4(cb_hh, bias, vl / 4);
+
+      // R - G - B + bias
+      vuint32m4_t cr_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_l), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_l), coeffs_ptr[6], vl / 4);
+      cr_ll = __riscv_vsub_vv_u32m4(cr_ll, cr_g_ll, vl / 4);
+      vuint32m4_t cr_b_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_l), coeffs_ptr[7], vl / 4);
+      cr_ll = __riscv_vsub_vv_u32m4(cr_ll, cr_b_ll, vl / 4);
+      cr_ll = __riscv_vadd_vx_u32m4(cr_ll, bias, vl / 4);
+
+      vuint32m4_t cr_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_l, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_l, vl / 2), coeffs_ptr[6], vl / 4);
+      cr_lh = __riscv_vsub_vv_u32m4(cr_lh, cr_g_lh, vl / 4);
+      vuint32m4_t cr_b_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_l, vl / 2), coeffs_ptr[7], vl / 4);
+      cr_lh = __riscv_vsub_vv_u32m4(cr_lh, cr_b_lh, vl / 4);
+      cr_lh = __riscv_vadd_vx_u32m4(cr_lh, bias, vl / 4);
+
+      vuint32m4_t cr_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_h), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_h), coeffs_ptr[6], vl / 4);
+      cr_hl = __riscv_vsub_vv_u32m4(cr_hl, cr_g_hl, vl / 4);
+      vuint32m4_t cr_b_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_h), coeffs_ptr[7], vl / 4);
+      cr_hl = __riscv_vsub_vv_u32m4(cr_hl, cr_b_hl, vl / 4);
+      cr_hl = __riscv_vadd_vx_u32m4(cr_hl, bias, vl / 4);
+
+      vuint32m4_t cr_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_h, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_h, vl / 2), coeffs_ptr[6], vl / 4);
+      cr_hh = __riscv_vsub_vv_u32m4(cr_hh, cr_g_hh, vl / 4);
+      vuint32m4_t cr_b_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_h, vl / 2), coeffs_ptr[7], vl / 4);
+      cr_hh = __riscv_vsub_vv_u32m4(cr_hh, cr_b_hh, vl / 4);
+      cr_hh = __riscv_vadd_vx_u32m4(cr_hh, bias, vl / 4);
+
+      vuint16m8_t y_l =
+          vcombine_u16(vrshrn_n_u32(y_ll, 14, vl / 4), vrshrn_n_u32(y_lh, 14, vl / 4), vl / 2);
+      vuint16m8_t y_h =
+          vcombine_u16(vrshrn_n_u32(y_hl, 14, vl / 4), vrshrn_n_u32(y_hh, 14, vl / 4), vl / 2);
+      vuint16m8_t cb_l =
+          vcombine_u16(vrshrn_n_u32(cb_ll, 14, vl / 4), vrshrn_n_u32(cb_lh, 14, vl / 4), vl / 2);
+      vuint16m8_t cb_h =
+          vcombine_u16(vrshrn_n_u32(cb_hl, 14, vl / 4), vrshrn_n_u32(cb_hh, 14, vl / 4), vl / 2);
+      vuint16m8_t cr_l =
+          vcombine_u16(vrshrn_n_u32(cr_ll, 14, vl / 4), vrshrn_n_u32(cr_lh, 14, vl / 4), vl / 2);
+      vuint16m8_t cr_h =
+          vcombine_u16(vrshrn_n_u32(cr_hl, 14, vl / 4), vrshrn_n_u32(cr_hh, 14, vl / 4), vl / 2);
+
+      __riscv_vse8_v_u8m8(y_ptr, vcombine_u8(vmovn_u16(y_l, vl / 2), vmovn_u16(y_h, vl / 2), vl),
+                          vl);
+      __riscv_vse8_v_u8m8(u_ptr, vcombine_u8(vmovn_u16(cb_l, vl / 2), vmovn_u16(cb_h, vl / 2), vl),
+                          vl);
+      __riscv_vse8_v_u8m8(v_ptr, vcombine_u8(vmovn_u16(cr_l, vl / 2), vmovn_u16(cr_h, vl / 2), vl),
+                          vl);
+
+      /* Increment pointers. */
+      rgba_ptr += (vl * 4);
+      y_ptr += vl;
+      u_ptr += vl;
+      v_ptr += vl;
+
+      w += vl;
+    } while (w < src->w);
+  } while (++h < src->h);
+}
+
+std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_rvv(uhdr_raw_image_t* src) {
+  if (src->fmt == UHDR_IMG_FMT_32bppRGBA8888) {
+    std::unique_ptr<uhdr_raw_image_ext_t> dst = nullptr;
+    const uint16_t* coeffs_ptr = nullptr;
+
+    if (src->cg == UHDR_CG_BT_709) {
+      coeffs_ptr = kRgb709ToYuv_coeffs_simd;
+    } else if (src->cg == UHDR_CG_BT_2100) {
+      coeffs_ptr = kRgbDispP3ToYuv_coeffs_simd;
+    } else if (src->cg == UHDR_CG_DISPLAY_P3) {
+      coeffs_ptr = kRgb2100ToYuv_coeffs_simd;
+    } else {
+      return dst;
+    }
+    dst = std::make_unique<uhdr_raw_image_ext_t>(UHDR_IMG_FMT_24bppYCbCr444, src->cg, src->ct,
+                                                 UHDR_CR_FULL_RANGE, src->w, src->h, 64);
+    ConvertRgba8888ToYuv444_rvv(src, dst.get(), coeffs_ptr);
+    return dst;
+  }
+  return nullptr;
+}
+
+}  // namespace ultrahdr
diff --git a/lib/src/gainmapmath.cpp b/lib/src/gainmapmath.cpp
index fa56c3e8..f90e3564 100644
--- a/lib/src/gainmapmath.cpp
+++ b/lib/src/gainmapmath.cpp
@@ -684,6 +684,90 @@ const std::array<float, 9> kYuvBt2100ToBt709 = {
 const std::array<float, 9> kYuvBt2100ToBt601 = {
     1.0f, 0.117887f, 0.105521f, 0.0f, 0.995211f, -0.059549f, 0.0f, -0.084085f, 0.976518f};
 
+#ifdef UHDR_ENABLE_INTRINSICS
+
+#ifdef _MSC_VER
+#define ALIGNED(x) __declspec(align(x))
+#else
+#define ALIGNED(x) __attribute__((aligned(x)))
+#endif
+
+// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
+// by one error compared to the scalar floating-point implementation.
+
+// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
+// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
+// {Y1, Y2, U1, U2, V1, V2, 0, 0}
+
+// Yuv Bt709 -> Yuv Bt601
+// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
+// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
+// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
+ALIGNED(16)
+const int16_t kYuv709To601_coeffs_simd[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};
+
+// Yuv Bt709 -> Yuv Bt2100
+// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
+// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
+// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
+ALIGNED(16)
+const int16_t kYuv709To2100_coeffs_simd[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt709
+// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V),
+// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V),
+// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V);
+ALIGNED(16)
+const int16_t kYuv601To709_coeffs_simd[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt2100
+// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
+// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
+// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
+ALIGNED(16)
+const int16_t kYuv601To2100_coeffs_simd[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt709
+// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
+// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
+// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
+ALIGNED(16)
+const int16_t kYuv2100To709_coeffs_simd[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt601
+// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
+// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
+// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
+ALIGNED(16)
+const int16_t kYuv2100To601_coeffs_simd[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
+
+// RGB -> Yuv
+
+// In the 3x3 conversion matrix, the coefficient 0.5 appears twice but is stored as a single
+// entry, leaving an array size of 8 elements.
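+// For example, in the Bt709 table below, 0.212639 * 2^14 rounds to 3484 (the first Y entry),
+// and the shared 0.5 * 2^14 = 8192 entry at index 5 serves both the U-row B term and the
+// V-row R term.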
+
+// RGB Bt709 -> Yuv Bt709
+// Y = 0.212639 * R + 0.715169 * G + 0.072192 * B
+// U = -0.114592135 * R + -0.385407865 * G + 0.5 * B
+// V = 0.5 * R + -0.454155718 * G + -0.045844282 * B
+ALIGNED(16)
+const uint16_t kRgb709ToYuv_coeffs_simd[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};
+
+// RGB Display P3 -> Yuv Display P3
+// Y = 0.2289746 * R + 0.6917385 * G + 0.0792869 * B
+// U = -0.124346335 * R + -0.375653665 * G + 0.5 * B
+// V = 0.5 * R + -0.448583471 * G + -0.051416529 * B
+ALIGNED(16)
+const uint16_t kRgbDispP3ToYuv_coeffs_simd[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};
+
+// RGB Bt2100 -> Yuv Bt2100
+// Y = 0.2627 * R + 0.677998 * G + 0.059302 * B
+// U = -0.13963036 * R + -0.36036964 * G + 0.5 * B
+// V = 0.5 * R + -0.459784348 * G + -0.040215652 * B
+ALIGNED(16)
+const uint16_t kRgb2100ToYuv_coeffs_simd[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};
+
+#endif
+
 Color yuvColorGamutConversion(Color e_gamma, const std::array<float, 9>& coeffs) {
   const float y = e_gamma.y * std::get<0>(coeffs) + e_gamma.u * std::get<1>(coeffs) +
                   e_gamma.v * std::get<2>(coeffs);
diff --git a/lib/src/jpegr.cpp b/lib/src/jpegr.cpp
index 1f83b34d..0b2aeb52 100644
--- a/lib/src/jpegr.cpp
+++ b/lib/src/jpegr.cpp
@@ -216,6 +216,8 @@ uhdr_error_info_t JpegR::encodeJPEGR(uhdr_raw_image_t* hdr_intent, uhdr_compress
   if (isPixelFormatRgb(sdr_intent->fmt)) {
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_neon(sdr_intent.get());
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+    sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_rvv(sdr_intent.get());
 #else
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr(sdr_intent.get());
 #endif
@@ -255,6 +257,8 @@ uhdr_error_info_t JpegR::encodeJPEGR(uhdr_raw_image_t* hdr_intent, uhdr_raw_imag
   if (isPixelFormatRgb(sdr_intent->fmt)) {
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_neon(sdr_intent);
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+    sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_rvv(sdr_intent);
 #else
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr(sdr_intent);
 #endif
@@ -264,6 +268,8 @@ uhdr_error_info_t JpegR::encodeJPEGR(uhdr_raw_image_t* hdr_intent, uhdr_raw_imag
   // convert to bt601 YUV encoding for JPEG encode
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
   UHDR_ERR_CHECK(convertYuv_neon(sdr_intent_yuv, sdr_intent_yuv->cg, UHDR_CG_DISPLAY_P3));
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+  UHDR_ERR_CHECK(convertYuv_rvv(sdr_intent_yuv, sdr_intent_yuv->cg, UHDR_CG_DISPLAY_P3));
 #else
   UHDR_ERR_CHECK(convertYuv(sdr_intent_yuv, sdr_intent_yuv->cg, UHDR_CG_DISPLAY_P3));
 #endif
diff --git a/tests/gainmapmath_test.cpp b/tests/gainmapmath_test.cpp
index 91d942a8..240b667c 100644
--- a/tests/gainmapmath_test.cpp
+++ b/tests/gainmapmath_test.cpp
@@ -788,12 +788,12 @@ TEST_F(GainMapMathTest, YuvConversionNeon) {
   const std::array<
       std::tuple<const int16_t*, const std::array<Color, 5>, const std::array<Color, 5>>, 6>
       coeffs_setup_correct{{
-          {kYuv709To601_coeffs_neon, SrgbYuvColors, P3YuvColors},
-          {kYuv709To2100_coeffs_neon, SrgbYuvColors, Bt2100YuvColors},
-          {kYuv601To709_coeffs_neon, P3YuvColors, SrgbYuvColors},
-          {kYuv601To2100_coeffs_neon, P3YuvColors, Bt2100YuvColors},
-          {kYuv2100To709_coeffs_neon, Bt2100YuvColors, SrgbYuvColors},
-          {kYuv2100To601_coeffs_neon, Bt2100YuvColors, P3YuvColors},
+          {kYuv709To601_coeffs_simd, SrgbYuvColors, P3YuvColors},
+          {kYuv709To2100_coeffs_simd, SrgbYuvColors, Bt2100YuvColors},
+          {kYuv601To709_coeffs_simd, P3YuvColors, SrgbYuvColors},
+          {kYuv601To2100_coeffs_simd, P3YuvColors, Bt2100YuvColors},
+          {kYuv2100To709_coeffs_simd, Bt2100YuvColors, SrgbYuvColors},
+          {kYuv2100To601_coeffs_simd, Bt2100YuvColors, P3YuvColors},
   }};
 
   for (const auto& [coeff_ptr, input, expected] : coeffs_setup_correct) {
@@ -954,16 +954,15 @@ TEST_F(GainMapMathTest, TransformYuv420) {
     }
   }
 }
-
-#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
-TEST_F(GainMapMathTest, TransformYuv420Neon) {
+#ifdef UHDR_ENABLE_INTRINSICS
+TEST_F(GainMapMathTest, TransformYuv420SIMD) {
   const std::array<std::pair<const int16_t*, std::array<float, 9>>, 6> fixed_floating_coeffs{
-      {{kYuv709To601_coeffs_neon, kYuvBt709ToBt601},
-       {kYuv709To2100_coeffs_neon, kYuvBt709ToBt2100},
-       {kYuv601To709_coeffs_neon, kYuvBt601ToBt709},
-       {kYuv601To2100_coeffs_neon, kYuvBt601ToBt2100},
-       {kYuv2100To709_coeffs_neon, kYuvBt2100ToBt709},
-       {kYuv2100To601_coeffs_neon, kYuvBt2100ToBt601}}};
+      {{kYuv709To601_coeffs_simd, kYuvBt709ToBt601},
+       {kYuv709To2100_coeffs_simd, kYuvBt709ToBt2100},
+       {kYuv601To709_coeffs_simd, kYuvBt601ToBt709},
+       {kYuv601To2100_coeffs_simd, kYuvBt601ToBt2100},
+       {kYuv2100To709_coeffs_simd, kYuvBt2100ToBt709},
+       {kYuv2100To601_coeffs_simd, kYuvBt2100ToBt601}}};
 
   for (const auto& [neon_coeffs_ptr, floating_point_coeffs] : fixed_floating_coeffs) {
     uhdr_raw_image_t input = Yuv420Image32x4();
@@ -980,8 +979,14 @@ TEST_F(GainMapMathTest, TransformYuv420Neon) {
     output.planes[UHDR_PLANE_Y] = luma;
     output.planes[UHDR_PLANE_U] = cb;
     output.planes[UHDR_PLANE_V] = cr;
-
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
     transformYuv420_neon(&output, neon_coeffs_ptr);
+#elif defined(__riscv_v_intrinsic)
+    transformYuv420_rvv(&output, neon_coeffs_ptr);
+#else
+    return;
+#endif
 
     for (size_t y = 0; y < input.h / 2; ++y) {
       for (size_t x = 0; x < input.w / 2; ++x) {
@@ -1014,11 +1019,17 @@ TEST_F(GainMapMathTest, TransformYuv420Neon) {
         // Due to the Neon version using a fixed-point approximation, this can result in an off by
         // one error compared with the standard floating-point version.
+#if defined(__riscv_v_intrinsic)
+        EXPECT_NEAR(expect_y1, out1.y, 2);
+        EXPECT_NEAR(expect_y2, out2.y, 2);
+        EXPECT_NEAR(expect_y3, out3.y, 2);
+        EXPECT_NEAR(expect_y4, out4.y, 2);
+#else
         EXPECT_NEAR(expect_y1, out1.y, 1);
         EXPECT_NEAR(expect_y2, out2.y, 1);
         EXPECT_NEAR(expect_y3, out3.y, 1);
         EXPECT_NEAR(expect_y4, out4.y, 1);
-
+#endif
         EXPECT_NEAR(expect_u, out1.u, 1);
         EXPECT_NEAR(expect_u, out2.u, 1);
         EXPECT_NEAR(expect_u, out3.u, 1);
         EXPECT_NEAR(expect_u, out4.u, 1);
@@ -1678,5 +1689,4 @@ TEST_F(GainMapMathTest, ApplyMap) {
   EXPECT_RGB_EQ(Recover(YuvWhite(), 0.25f, &metadata), RgbWhite());
   EXPECT_RGB_EQ(Recover(YuvWhite(), 0.0f, &metadata), RgbWhite() / 2.0f);
 }
-
 }  // namespace ultrahdr