diff --git a/.gitignore b/.gitignore
index f7687219..f9309490 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,7 @@ build*
 third_party/googletest
 third_party/turbojpeg
 third_party/benchmark
-tests/data
\ No newline at end of file
+tests/data
+
+# IDE configs
+.vscode/settings.json
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c518d85e..1740fae5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -279,10 +279,10 @@ else()
     add_compile_options(-march=armv8-a)
     add_compile_options(-fno-lax-vector-conversions)
   elseif(ARCH STREQUAL "riscv64")
-    add_compile_options(-march=rv64gc)
+    add_compile_options(-march=rv64gcv)
     add_compile_options(-mabi=lp64d)
   elseif(ARCH STREQUAL "riscv32")
-    add_compile_options(-march=rv32gc)
+    add_compile_options(-march=rv32gcv)
     add_compile_options(-mabi=ilp32d)
   elseif(ARCH STREQUAL "loong64")
     add_compile_options(-march=loongarch64)
@@ -553,6 +553,14 @@ if(UHDR_ENABLE_INTRINSICS)
     file(GLOB UHDR_CORE_NEON_SRCS_LIST "${SOURCE_DIR}/src/dsp/arm/*.cpp")
     list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_NEON_SRCS_LIST})
   endif()
+  if(ARCH STREQUAL "riscv64")
+    file(GLOB UHDR_CORE_RVV_SRCS_LIST "${SOURCE_DIR}/src/dsp/riscv/*.cpp")
+    list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_RVV_SRCS_LIST})
+  endif()
+  if(ARCH STREQUAL "riscv32")
+    file(GLOB UHDR_CORE_RVV_SRCS_LIST "${SOURCE_DIR}/src/dsp/riscv/*.cpp")
+    list(APPEND UHDR_CORE_SRCS_LIST ${UHDR_CORE_RVV_SRCS_LIST})
+  endif()
 endif()
 if(UHDR_ENABLE_GLES)
   file(GLOB UHDR_CORE_GLES_SRCS_LIST "${SOURCE_DIR}/src/gpu/*.cpp")
diff --git a/lib/include/ultrahdr/gainmapmath.h b/lib/include/ultrahdr/gainmapmath.h
index d604ad2b..61144e12 100644
--- a/lib/include/ultrahdr/gainmapmath.h
+++ b/lib/include/ultrahdr/gainmapmath.h
@@ -414,14 +414,20 @@ extern const std::array<float, 9> kYuvBt601ToBt2100;
 extern const std::array<float, 9> kYuvBt2100ToBt709;
 extern const std::array<float, 9> kYuvBt2100ToBt601;
 
-#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
+#ifdef UHDR_ENABLE_INTRINSICS
+
+extern const int16_t kYuv709To601_coeffs_simd[8];
+extern const int16_t kYuv709To2100_coeffs_simd[8];
+extern const int16_t kYuv601To709_coeffs_simd[8];
+extern const int16_t kYuv601To2100_coeffs_simd[8];
+extern const int16_t kYuv2100To709_coeffs_simd[8];
+extern const int16_t kYuv2100To601_coeffs_simd[8];
+
+extern const uint16_t kRgb709ToYuv_coeffs_simd[8];
+extern const uint16_t kRgbDispP3ToYuv_coeffs_simd[8];
+extern const uint16_t kRgb2100ToYuv_coeffs_simd[8];
 
-extern const int16_t kYuv709To601_coeffs_neon[8];
-extern const int16_t kYuv709To2100_coeffs_neon[8];
-extern const int16_t kYuv601To709_coeffs_neon[8];
-extern const int16_t kYuv601To2100_coeffs_neon[8];
-extern const int16_t kYuv2100To709_coeffs_neon[8];
-extern const int16_t kYuv2100To601_coeffs_neon[8];
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
 
 /*
  * The Y values are provided at half the width of U & V values to allow use of the widening
@@ -435,6 +441,15 @@ void transformYuv444_neon(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);
 
 uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
                                   uhdr_color_gamut_t dst_encoding);
+
+#elif defined(__riscv_v_intrinsic)
+
+void transformYuv420_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr);
+
+uhdr_error_info_t convertYuv_rvv(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
+                                 uhdr_color_gamut_t dst_encoding);
+
+#endif
 #endif
 
 // Performs a color gamut transformation on an yuv image.
@@ -588,6 +603,8 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr(
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
 std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_image_t* src);
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_rvv(uhdr_raw_image_t* src);
 #endif
 
 bool floatToSignedFraction(float v, int32_t* numerator, uint32_t* denominator);
diff --git a/lib/src/dsp/arm/gainmapmath_neon.cpp b/lib/src/dsp/arm/gainmapmath_neon.cpp
index 306a971a..aba42898 100644
--- a/lib/src/dsp/arm/gainmapmath_neon.cpp
+++ b/lib/src/dsp/arm/gainmapmath_neon.cpp
@@ -27,55 +27,6 @@
 
 namespace ultrahdr {
 
-// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
-// by one error compared to the scalar floating-point implementation.
-
-// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
-// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
-// {Y1, Y2, U1, U2, V1, V2, 0, 0}
-
-// Yuv Bt709 -> Yuv Bt601
-// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
-// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
-// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
-ALIGNED(16)
-const int16_t kYuv709To601_coeffs_neon[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};
-
-// Yuv Bt709 -> Yuv Bt2100
-// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
-// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
-// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
-ALIGNED(16)
-const int16_t kYuv709To2100_coeffs_neon[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};
-
-// Yuv Bt601 -> Yuv Bt709
-// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V),
-// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V),
-// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V);
-ALIGNED(16)
-const int16_t kYuv601To709_coeffs_neon[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};
-
-// Yuv Bt601 -> Yuv Bt2100
-// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
-// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
-// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
-ALIGNED(16)
-const int16_t kYuv601To2100_coeffs_neon[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};
-
-// Yuv Bt2100 -> Yuv Bt709
-// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
-// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
-// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
-ALIGNED(16)
-const int16_t kYuv2100To709_coeffs_neon[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};
-
-// Yuv Bt2100 -> Yuv Bt601
-// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
-// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
-// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
-ALIGNED(16)
-const int16_t kYuv2100To601_coeffs_neon[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
-
 static inline int16x8_t yConversion_neon(uint8x8_t y, int16x8_t u, int16x8_t v, int16x8_t coeffs) {
   int32x4_t lo = vmull_lane_s16(vget_low_s16(u), vget_low_s16(coeffs), 0);
   int32x4_t hi = vmull_lane_s16(vget_high_s16(u), vget_low_s16(coeffs), 0);
@@ -244,10 +195,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
         case UHDR_CG_BT_709:
           return status;
         case UHDR_CG_DISPLAY_P3:
-          coeffs = kYuv709To601_coeffs_neon;
+          coeffs = kYuv709To601_coeffs_simd;
          break;
        case UHDR_CG_BT_2100:
-          coeffs = kYuv709To2100_coeffs_neon;
+          coeffs = kYuv709To2100_coeffs_simd;
          break;
        default:
          status.error_code = UHDR_CODEC_INVALID_PARAM;
@@ -260,12 +211,12 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
     case UHDR_CG_DISPLAY_P3:
       switch (dst_encoding) {
         case UHDR_CG_BT_709:
-          coeffs = kYuv601To709_coeffs_neon;
+          coeffs = kYuv601To709_coeffs_simd;
          break;
        case UHDR_CG_DISPLAY_P3:
          return status;
        case UHDR_CG_BT_2100:
-          coeffs = kYuv601To2100_coeffs_neon;
+          coeffs = kYuv601To2100_coeffs_simd;
          break;
        default:
          status.error_code = UHDR_CODEC_INVALID_PARAM;
@@ -278,10 +229,10 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
     case UHDR_CG_BT_2100:
       switch (dst_encoding) {
         case UHDR_CG_BT_709:
-          coeffs = kYuv2100To709_coeffs_neon;
+          coeffs = kYuv2100To709_coeffs_simd;
          break;
        case UHDR_CG_DISPLAY_P3:
-          coeffs = kYuv2100To601_coeffs_neon;
+          coeffs = kYuv2100To601_coeffs_simd;
          break;
        case UHDR_CG_BT_2100:
          return status;
@@ -317,33 +268,6 @@ uhdr_error_info_t convertYuv_neon(uhdr_raw_image_t* image, uhdr_color_gamut_t sr
   return status;
 }
 
-// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
-// by one error compared to the scalar floating-point implementation.
-
-// In the 3x3 conversion matrix, 0.5 is duplicated. But represented as only one entry in lut leaving
-// with an array size of 8 elements.
-
-// RGB Bt709 -> Yuv Bt709
-// Y = 0.212639 * R + 0.715169 * G + 0.072192 * B
-// U = -0.114592135 * R + -0.385407865 * G + 0.5 * B
-// V = 0.5 * R + -0.454155718 * G + -0.045844282 * B
-ALIGNED(16)
-const uint16_t kRgb709ToYuv_coeffs_neon[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};
-
-// RGB Display P3 -> Yuv Display P3
-// Y = 0.2289746 * R + 0.6917385 * G + 0.0792869 * B
-// U = -0.124346335 * R + -0.375653665 * G + 0.5 * B
-// V = 0.5 * R + -0.448583471 * G + -0.051416529 * B
-ALIGNED(16)
-const uint16_t kRgbDispP3ToYuv_coeffs_neon[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};
-
-// RGB Bt2100 -> Yuv Bt2100
-// Y = 0.2627 * R + 0.677998 * G + 0.059302 * B
-// U = -0.13963036 * R + -0.36036964 * G + 0.5 * B
-// V = 0.5 * R + -0.459784348 * G + -0.040215652 * B
-ALIGNED(16)
-const uint16_t kRgb2100ToYuv_coeffs_neon[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};
-
 // The core logic is taken from jsimd_rgb_ycc_convert_neon implementation in jccolext-neon.c of
 // libjpeg-turbo
 static void ConvertRgba8888ToYuv444_neon(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
@@ -460,11 +384,11 @@ std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_neon(uhdr_raw_i
   const uint16_t* coeffs_ptr = nullptr;
 
   if (src->cg == UHDR_CG_BT_709) {
-    coeffs_ptr = kRgb709ToYuv_coeffs_neon;
+    coeffs_ptr = kRgb709ToYuv_coeffs_simd;
   } else if (src->cg == UHDR_CG_BT_2100) {
-    coeffs_ptr = kRgbDispP3ToYuv_coeffs_neon;
+    coeffs_ptr = kRgbDispP3ToYuv_coeffs_simd;
   } else if (src->cg == UHDR_CG_DISPLAY_P3) {
-    coeffs_ptr = kRgb2100ToYuv_coeffs_neon;
+    coeffs_ptr = kRgb2100ToYuv_coeffs_simd;
   } else {
     return dst;
   }
diff --git a/lib/src/dsp/riscv/gainmapmath_rvv.cpp b/lib/src/dsp/riscv/gainmapmath_rvv.cpp
new file mode 100644
index 00000000..9d9d958b
--- /dev/null
+++ b/lib/src/dsp/riscv/gainmapmath_rvv.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright 2024 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ultrahdr/gainmapmath.h"
+
+#include <riscv_vector.h>
+#include <cassert>
+
+namespace ultrahdr {
+
+// Duplicates each 16-bit element in place: {a0, a1, ...} -> {a0, a0, a1, a1, ...}. The
+// zero-extension interleaves a zero lane above every element; sliding that vector up by one
+// lane and adding merges the two copies.
+static inline vuint16m8_t zip_self(vuint16m4_t a, size_t vl) {
+  vuint32m8_t a_wide = __riscv_vzext_vf2_u32m8(a, vl / 2);
+  vuint16m8_t a_zero = __riscv_vreinterpret_v_u32m8_u16m8(a_wide);
+  vuint16m8_t a_zero_slide = __riscv_vslide1up_vx_u16m8(a_zero, 0, vl);
+  vuint16m8_t a_zip = __riscv_vadd_vv_u16m8(a_zero, a_zero_slide, vl);
+  return a_zip;
+}
+
+// The helpers below emulate the Arm NEON idioms of the same names on top of RVV register
+// groups (m8 vectors play the role of NEON "q" registers, m4/m2 vectors their halves), so
+// the conversion routines read like their NEON counterparts.
+static inline vint16m4_t vqrshrn_n_s32(vint32m8_t a, const int b, size_t vl) {
+  return __riscv_vnclip_wx_i16m4(a, b, vl);
+}
+
+static inline vuint8m4_t vget_low_u8(vuint8m8_t u) { return __riscv_vget_v_u8m8_u8m4(u, 0); }
+
+static inline vuint8m4_t vget_high_u8(vuint8m8_t u, size_t vl) {
+  return __riscv_vget_v_u8m8_u8m4(__riscv_vslidedown_vx_u8m8(u, vl / 2, vl), 0);
+}
+
+static inline vint16m4_t vget_low_s16(vint16m8_t u) { return __riscv_vget_v_i16m8_i16m4(u, 0); }
+
+static inline vint16m4_t vget_high_s16(vint16m8_t u, size_t vl) {
+  return __riscv_vget_v_i16m8_i16m4(__riscv_vslidedown_vx_i16m8(u, vl / 2, vl), 0);
+}
+
+static inline vuint16m4_t vget_low_u16(vuint16m8_t u) { return __riscv_vget_v_u16m8_u16m4(u, 0); }
+
+static inline vuint16m2_t vget_low_u16m4(vuint16m4_t u) { return __riscv_vget_v_u16m4_u16m2(u, 0); }
+
+static inline vuint16m4_t vget_high_u16(vuint16m8_t u, size_t vl) {
+  return __riscv_vget_v_u16m8_u16m4(__riscv_vslidedown_vx_u16m8(u, vl / 2, vl), 0);
+}
+
+static inline vuint16m2_t vget_high_u16m4(vuint16m4_t u, size_t vl) {
+  return __riscv_vget_v_u16m4_u16m2(__riscv_vslidedown_vx_u16m4(u, vl / 2, vl), 0);
+}
+
+static inline vint16m8_t vcombine_s16(vint16m4_t a, vint16m4_t b, size_t vl) {
+  vint16m8_t a_wide = __riscv_vlmul_ext_v_i16m4_i16m8(a);
+  vint16m8_t b_wide = __riscv_vlmul_ext_v_i16m4_i16m8(b);
+  return __riscv_vslideup_vx_i16m8(a_wide, b_wide, vl / 2, vl);
+}
+
+static inline vuint8m8_t vcombine_u8(vuint8m4_t a, vuint8m4_t b, size_t vl) {
+  vuint8m8_t a_wide = __riscv_vlmul_ext_v_u8m4_u8m8(a);
+  vuint8m8_t b_wide = __riscv_vlmul_ext_v_u8m4_u8m8(b);
+  return __riscv_vslideup_vx_u8m8(a_wide, b_wide, vl / 2, vl);
+}
+
+static inline vuint16m8_t vcombine_u16(vuint16m4_t a, vuint16m4_t b, size_t vl) {
+  vuint16m8_t a_wide = __riscv_vlmul_ext_v_u16m4_u16m8(a);
+  vuint16m8_t b_wide = __riscv_vlmul_ext_v_u16m4_u16m8(b);
+  return __riscv_vslideup_vx_u16m8(a_wide, b_wide, vl / 2, vl);
+}
+
+static inline vuint8m4_t vmovn_u16(vuint16m8_t a, size_t vl) {
+  return __riscv_vnsrl_wx_u8m4(a, 0, vl);
+}
+
+static inline vuint8m4_t vqmovun_s16(vint16m8_t a, size_t vl) {
+  vuint16m8_t a_non_neg = __riscv_vreinterpret_v_i16m8_u16m8(__riscv_vmax_vx_i16m8(a, 0, vl));
+  return __riscv_vnclipu_wx_u8m4(a_non_neg, 0, vl);
+}
+
+static inline vuint16m4_t vmovl_u8(vuint8m4_t a, size_t vl) {
+  vuint16m8_t a_16 = __riscv_vzext_vf2_u16m8(a, vl);
+  return __riscv_vlmul_trunc_v_u16m8_u16m4(a_16);
+}
+
+static inline vuint16m4_t vrshrn_n_u32(vuint32m4_t a, const int b, size_t vl) {
+  vuint32m4_t a_round = __riscv_vadd_vx_u32m4(a, 1 << (b - 1), vl);
+  return __riscv_vnsrl_wx_u16m4(__riscv_vlmul_ext_v_u32m4_u32m8(a_round), b, vl);
+}
+
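+// The three conversion helpers below multiply the widened U/V samples by the packed Q14
+// gamut coefficients using widening multiplies, accumulate in 32 bits, and narrow back to
+// 16 bits with a rounding right shift by 14. yConversion_rvv additionally adds the original
+// Y samples on top, mirroring yConversion_neon/uConversion_neon/vConversion_neon.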
+static inline vint16m8_t yConversion_rvv(vuint8m4_t y, vint16m8_t u, vint16m8_t v,
+                                         const int16_t* coeffs, size_t vl) {
+  vint32m8_t u_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(u), coeffs[0], vl / 2);
+  vint32m8_t u_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(u, vl), coeffs[0], vl / 2);
+
+  vint32m8_t v_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(v), coeffs[1], vl / 2);
+  vint32m8_t v_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(v, vl), coeffs[1], vl / 2);
+
+  vint32m8_t lo = __riscv_vadd_vv_i32m8(u_lo, v_lo, vl / 2);
+  vint32m8_t hi = __riscv_vadd_vv_i32m8(u_hi, v_hi, vl / 2);
+
+  vint16m4_t lo_shr = vqrshrn_n_s32(lo, 14, vl / 2);
+  vint16m4_t hi_shr = vqrshrn_n_s32(hi, 14, vl / 2);
+
+  vint16m8_t y_output = vcombine_s16(lo_shr, hi_shr, vl);
+  vuint16m8_t y_u16 = __riscv_vreinterpret_v_i16m8_u16m8(y_output);
+  vuint16m8_t y_ret = __riscv_vwaddu_wv_u16m8(y_u16, y, vl);
+  return __riscv_vreinterpret_v_u16m8_i16m8(y_ret);
+}
+
+static inline vint16m8_t uConversion_rvv(vint16m8_t u, vint16m8_t v, const int16_t* coeffs,
+                                         size_t vl) {
+  vint32m8_t u_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(u), coeffs[2], vl / 2);
+  vint32m8_t u_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(u, vl), coeffs[2], vl / 2);
+
+  vint32m8_t v_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(v), coeffs[3], vl / 2);
+  vint32m8_t v_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(v, vl), coeffs[3], vl / 2);
+
+  vint32m8_t lo = __riscv_vadd_vv_i32m8(u_lo, v_lo, vl / 2);
+  vint32m8_t hi = __riscv_vadd_vv_i32m8(u_hi, v_hi, vl / 2);
+
+  vint16m4_t lo_shr = vqrshrn_n_s32(lo, 14, vl / 2);
+  vint16m4_t hi_shr = vqrshrn_n_s32(hi, 14, vl / 2);
+
+  vint16m8_t u_output = vcombine_s16(lo_shr, hi_shr, vl);
+  return u_output;
+}
+
+static inline vint16m8_t vConversion_rvv(vint16m8_t u, vint16m8_t v, const int16_t* coeffs,
+                                         size_t vl) {
+  vint32m8_t u_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(u), coeffs[4], vl / 2);
+  vint32m8_t u_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(u, vl), coeffs[4], vl / 2);
+
+  vint32m8_t v_lo = __riscv_vwmul_vx_i32m8(vget_low_s16(v), coeffs[5], vl / 2);
+  vint32m8_t v_hi = __riscv_vwmul_vx_i32m8(vget_high_s16(v, vl), coeffs[5], vl / 2);
+
+  vint32m8_t lo = __riscv_vadd_vv_i32m8(u_lo, v_lo, vl / 2);
+  vint32m8_t hi = __riscv_vadd_vv_i32m8(u_hi, v_hi, vl / 2);
+
+  vint16m4_t lo_shr = vqrshrn_n_s32(lo, 14, vl / 2);
+  vint16m4_t hi_shr = vqrshrn_n_s32(hi, 14, vl / 2);
+
+  vint16m8_t v_output = vcombine_s16(lo_shr, hi_shr, vl);
+  return v_output;
+}
+
+void transformYuv420_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
+  assert(image->w % 16 == 0);
+  uint8_t* y0_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
+  uint8_t* y1_ptr = y0_ptr + image->stride[UHDR_PLANE_Y];
+  uint8_t* u_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
+  uint8_t* v_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);
+  size_t vl;
+  size_t h = 0;
+  do {
+    size_t w = 0;
+    do {
+      // w indexes chroma samples, so the remaining luma count is image->w - w * 2.
+      vl = __riscv_vsetvl_e8m8(image->w - w * 2);
+      assert((vl % 4) == 0 && vl >= 4);
+
+      vuint8m8_t y0 = __riscv_vle8_v_u8m8(y0_ptr + w * 2, vl);
+      vuint8m8_t y1 = __riscv_vle8_v_u8m8(y1_ptr + w * 2, vl);
+
+      vuint8m4_t u8 = __riscv_vle8_v_u8m4(u_ptr + w, vl / 2);
+      vuint8m4_t v8 = __riscv_vle8_v_u8m4(v_ptr + w, vl / 2);
+
+      vuint16m8_t u16_wide = __riscv_vwsubu_vx_u16m8(u8, 128, vl / 2);
+      vuint16m8_t v16_wide = __riscv_vwsubu_vx_u16m8(v8, 128, vl / 2);
+
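+      // Each 4:2:0 chroma sample covers two luma columns; duplicate every U/V sample so the
+      // chroma vectors line up 1:1 with the luma vectors.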
+      vuint16m8_t uu_wide_lo = zip_self(vget_low_u16(u16_wide), vl / 2);
+      vuint16m8_t uu_wide_hi = zip_self(vget_high_u16(u16_wide, vl / 2), vl / 2);
+      vuint16m8_t uv_wide_lo = zip_self(vget_low_u16(v16_wide), vl / 2);
+      vuint16m8_t uv_wide_hi = zip_self(vget_high_u16(v16_wide, vl / 2), vl / 2);
+
+      vint16m8_t u_wide_lo = __riscv_vreinterpret_v_u16m8_i16m8(uu_wide_lo);
+      vint16m8_t v_wide_lo = __riscv_vreinterpret_v_u16m8_i16m8(uv_wide_lo);
+      vint16m8_t u_wide_hi = __riscv_vreinterpret_v_u16m8_i16m8(uu_wide_hi);
+      vint16m8_t v_wide_hi = __riscv_vreinterpret_v_u16m8_i16m8(uv_wide_hi);
+
+      vint16m8_t y0_lo = yConversion_rvv(vget_low_u8(y0), u_wide_lo, v_wide_lo, coeffs_ptr, vl / 2);
+      vint16m8_t y1_lo = yConversion_rvv(vget_low_u8(y1), u_wide_lo, v_wide_lo, coeffs_ptr, vl / 2);
+      vint16m8_t y0_hi =
+          yConversion_rvv(vget_high_u8(y0, vl), u_wide_hi, v_wide_hi, coeffs_ptr, vl / 2);
+      vint16m8_t y1_hi =
+          yConversion_rvv(vget_high_u8(y1, vl), u_wide_hi, v_wide_hi, coeffs_ptr, vl / 2);
+
+      vint16m8_t u_wide_s16 = __riscv_vreinterpret_v_u16m8_i16m8(u16_wide);
+      vint16m8_t v_wide_s16 = __riscv_vreinterpret_v_u16m8_i16m8(v16_wide);
+      vint16m8_t new_u = uConversion_rvv(u_wide_s16, v_wide_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_v = vConversion_rvv(u_wide_s16, v_wide_s16, coeffs_ptr, vl / 2);
+
+      vuint8m8_t y0_output =
+          vcombine_u8(vqmovun_s16(y0_lo, vl / 2), vqmovun_s16(y0_hi, vl / 2), vl);
+      vuint8m8_t y1_output =
+          vcombine_u8(vqmovun_s16(y1_lo, vl / 2), vqmovun_s16(y1_hi, vl / 2), vl);
+      vuint8m4_t u_output = vqmovun_s16(__riscv_vadd_vx_i16m8(new_u, 128, vl / 2), vl / 2);
+      vuint8m4_t v_output = vqmovun_s16(__riscv_vadd_vx_i16m8(new_v, 128, vl / 2), vl / 2);
+
+      __riscv_vse8_v_u8m8(y0_ptr + w * 2, y0_output, vl);
+      __riscv_vse8_v_u8m8(y1_ptr + w * 2, y1_output, vl);
+      __riscv_vse8_v_u8m4(u_ptr + w, u_output, vl / 2);
+      __riscv_vse8_v_u8m4(v_ptr + w, v_output, vl / 2);
+
+      w += (vl / 2);
+    } while (w < image->w / 2);
+    y0_ptr += image->stride[UHDR_PLANE_Y] * 2;
+    y1_ptr += image->stride[UHDR_PLANE_Y] * 2;
+    u_ptr += image->stride[UHDR_PLANE_U];
+    v_ptr += image->stride[UHDR_PLANE_V];
+  } while (++h < image->h / 2);
+}
+
+void transformYuv444_rvv(uhdr_raw_image_t* image, const int16_t* coeffs_ptr) {
+  // Implementation assumes the image width is a multiple of 16.
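+  // The low/high half splits below rely on vl staying even on every iteration.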
+  assert(image->w % 16 == 0);
+  uint8_t* y_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_Y]);
+  uint8_t* u_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_U]);
+  uint8_t* v_ptr = static_cast<uint8_t*>(image->planes[UHDR_PLANE_V]);
+
+  size_t vl;
+  size_t h = 0;
+  do {
+    size_t w = 0;
+    do {
+      vl = __riscv_vsetvl_e8m8(image->w - w);
+
+      vuint8m8_t y = __riscv_vle8_v_u8m8(y_ptr + w, vl);
+      vuint8m8_t u = __riscv_vle8_v_u8m8(u_ptr + w, vl);
+      vuint8m8_t v = __riscv_vle8_v_u8m8(v_ptr + w, vl);
+
+      vuint16m8_t u16_wide_low = __riscv_vwsubu_vx_u16m8(vget_low_u8(u), 128, vl / 2);
+      vuint16m8_t v16_wide_low = __riscv_vwsubu_vx_u16m8(vget_low_u8(v), 128, vl / 2);
+      vuint16m8_t u16_wide_high = __riscv_vwsubu_vx_u16m8(vget_high_u8(u, vl), 128, vl / 2);
+      vuint16m8_t v16_wide_high = __riscv_vwsubu_vx_u16m8(vget_high_u8(v, vl), 128, vl / 2);
+
+      vint16m8_t u_wide_low_s16 = __riscv_vreinterpret_v_u16m8_i16m8(u16_wide_low);
+      vint16m8_t v_wide_low_s16 = __riscv_vreinterpret_v_u16m8_i16m8(v16_wide_low);
+      vint16m8_t u_wide_high_s16 = __riscv_vreinterpret_v_u16m8_i16m8(u16_wide_high);
+      vint16m8_t v_wide_high_s16 = __riscv_vreinterpret_v_u16m8_i16m8(v16_wide_high);
+
+      vint16m8_t y_lo =
+          yConversion_rvv(vget_low_u8(y), u_wide_low_s16, v_wide_low_s16, coeffs_ptr, vl / 2);
+      vint16m8_t y_hi = yConversion_rvv(vget_high_u8(y, vl), u_wide_high_s16, v_wide_high_s16,
+                                        coeffs_ptr, vl / 2);
+
+      vint16m8_t new_u_lo = uConversion_rvv(u_wide_low_s16, v_wide_low_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_v_lo = vConversion_rvv(u_wide_low_s16, v_wide_low_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_u_hi = uConversion_rvv(u_wide_high_s16, v_wide_high_s16, coeffs_ptr, vl / 2);
+      vint16m8_t new_v_hi = vConversion_rvv(u_wide_high_s16, v_wide_high_s16, coeffs_ptr, vl / 2);
+
+      // Narrow from 16-bit to 8-bit with saturation.
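+      // Chroma gets its +128 bias restored before the saturating narrow, undoing the earlier
+      // centering around zero.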
+      vuint8m8_t y_output = vcombine_u8(vqmovun_s16(y_lo, vl / 2), vqmovun_s16(y_hi, vl / 2), vl);
+      vuint8m4_t u_output_hi = vqmovun_s16(__riscv_vadd_vx_i16m8(new_u_hi, 128, vl / 2), vl / 2);
+      vuint8m4_t u_output_lo = vqmovun_s16(__riscv_vadd_vx_i16m8(new_u_lo, 128, vl / 2), vl / 2);
+      vuint8m4_t v_output_hi = vqmovun_s16(__riscv_vadd_vx_i16m8(new_v_hi, 128, vl / 2), vl / 2);
+      vuint8m4_t v_output_lo = vqmovun_s16(__riscv_vadd_vx_i16m8(new_v_lo, 128, vl / 2), vl / 2);
+
+      vuint8m8_t u_output = vcombine_u8(u_output_lo, u_output_hi, vl);
+      vuint8m8_t v_output = vcombine_u8(v_output_lo, v_output_hi, vl);
+
+      __riscv_vse8_v_u8m8(y_ptr + w, y_output, vl);
+      __riscv_vse8_v_u8m8(u_ptr + w, u_output, vl);
+      __riscv_vse8_v_u8m8(v_ptr + w, v_output, vl);
+
+      w += vl;
+    } while (w < image->w);
+    y_ptr += image->stride[UHDR_PLANE_Y];
+    u_ptr += image->stride[UHDR_PLANE_U];
+    v_ptr += image->stride[UHDR_PLANE_V];
+  } while (++h < image->h);
+}
+
+uhdr_error_info_t convertYuv_rvv(uhdr_raw_image_t* image, uhdr_color_gamut_t src_encoding,
+                                 uhdr_color_gamut_t dst_encoding) {
+  uhdr_error_info_t status = g_no_error;
+  const int16_t* coeffs = nullptr;
+
+  switch (src_encoding) {
+    case UHDR_CG_BT_709:
+      switch (dst_encoding) {
+        case UHDR_CG_BT_709:
+          return status;
+        case UHDR_CG_DISPLAY_P3:
+          coeffs = kYuv709To601_coeffs_simd;
+          break;
+        case UHDR_CG_BT_2100:
+          coeffs = kYuv709To2100_coeffs_simd;
+          break;
+        default:
+          status.error_code = UHDR_CODEC_INVALID_PARAM;
+          status.has_detail = 1;
+          snprintf(status.detail, sizeof status.detail, "Unrecognized dest color gamut %d",
+                   dst_encoding);
+          return status;
+      }
+      break;
+    case UHDR_CG_DISPLAY_P3:
+      switch (dst_encoding) {
+        case UHDR_CG_BT_709:
+          coeffs = kYuv601To709_coeffs_simd;
+          break;
+        case UHDR_CG_DISPLAY_P3:
+          return status;
+        case UHDR_CG_BT_2100:
+          coeffs = kYuv601To2100_coeffs_simd;
+          break;
+        default:
+          status.error_code = UHDR_CODEC_INVALID_PARAM;
+          status.has_detail = 1;
+          snprintf(status.detail, sizeof status.detail, "Unrecognized dest color gamut %d",
+                   dst_encoding);
+          return status;
+      }
+      break;
+    case UHDR_CG_BT_2100:
+      switch (dst_encoding) {
+        case UHDR_CG_BT_709:
+          coeffs = kYuv2100To709_coeffs_simd;
+          break;
+        case UHDR_CG_DISPLAY_P3:
+          coeffs = kYuv2100To601_coeffs_simd;
+          break;
+        case UHDR_CG_BT_2100:
+          return status;
+        default:
+          status.error_code = UHDR_CODEC_INVALID_PARAM;
+          status.has_detail = 1;
+          snprintf(status.detail, sizeof status.detail, "Unrecognized dest color gamut %d",
+                   dst_encoding);
+          return status;
+      }
+      break;
+    default:
+      status.error_code = UHDR_CODEC_INVALID_PARAM;
+      status.has_detail = 1;
+      snprintf(status.detail, sizeof status.detail, "Unrecognized src color gamut %d",
+               src_encoding);
+      return status;
+  }
+
+  if (image->fmt == UHDR_IMG_FMT_12bppYCbCr420) {
+    transformYuv420_rvv(image, coeffs);
+  } else if (image->fmt == UHDR_IMG_FMT_24bppYCbCr444) {
+    transformYuv444_rvv(image, coeffs);
+  } else {
+    status.error_code = UHDR_CODEC_UNSUPPORTED_FEATURE;
+    status.has_detail = 1;
+    snprintf(status.detail, sizeof status.detail,
+             "No implementation available for performing gamut conversion for color format %d",
+             image->fmt);
+    return status;
+  }
+
+  return status;
+}
+
+static void ConvertRgba8888ToYuv444_rvv(uhdr_raw_image_t* src, uhdr_raw_image_t* dst,
+                                        const uint16_t* coeffs_ptr) {
+  assert(src->stride[UHDR_PLANE_PACKED] % 16 == 0);
+  uint8_t* rgba_base_ptr = static_cast<uint8_t*>(src->planes[UHDR_PLANE_PACKED]);
+
+  uint8_t* y_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_Y]);
+  uint8_t* u_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_U]);
+  uint8_t* v_base_ptr = static_cast<uint8_t*>(dst->planes[UHDR_PLANE_V]);
+
+  uint32_t bias = (128 << 14) + 8191;
+
+  size_t vl;
+  size_t h = 0;
+  do {
+    size_t w = 0;
+    uint8_t* rgba_ptr = rgba_base_ptr + (size_t)src->stride[UHDR_PLANE_PACKED] * 4 * h;
+    uint8_t* y_ptr = y_base_ptr + (size_t)dst->stride[UHDR_PLANE_Y] * h;
+    uint8_t* u_ptr = u_base_ptr + (size_t)dst->stride[UHDR_PLANE_U] * h;
+    uint8_t* v_ptr = v_base_ptr + (size_t)dst->stride[UHDR_PLANE_V] * h;
+    do {
+      vl = __riscv_vsetvl_e8m8(src->w - w);
+      assert(vl % 4 == 0);
+
+      // De-interleave the packed RGBA pixels with strided loads; alpha is skipped.
+      vuint8m8_t r = __riscv_vlse8_v_u8m8(rgba_ptr, 4, vl);
+      vuint8m8_t g = __riscv_vlse8_v_u8m8(rgba_ptr + 1, 4, vl);
+      vuint8m8_t b = __riscv_vlse8_v_u8m8(rgba_ptr + 2, 4, vl);
+
+      vuint16m4_t r_l = vmovl_u8(vget_low_u8(r), vl / 2);
+      vuint16m4_t r_h = vmovl_u8(vget_high_u8(r, vl), vl / 2);
+      vuint16m4_t g_l = vmovl_u8(vget_low_u8(g), vl / 2);
+      vuint16m4_t g_h = vmovl_u8(vget_high_u8(g, vl), vl / 2);
+      vuint16m4_t b_l = vmovl_u8(vget_low_u8(b), vl / 2);
+      vuint16m4_t b_h = vmovl_u8(vget_high_u8(b, vl), vl / 2);
+
+      vuint32m4_t y_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_l), coeffs_ptr[0], vl / 4);
+      y_ll = __riscv_vwmaccu_vx_u32m4(y_ll, coeffs_ptr[1], vget_low_u16m4(g_l), vl / 4);
+      y_ll = __riscv_vwmaccu_vx_u32m4(y_ll, coeffs_ptr[2], vget_low_u16m4(b_l), vl / 4);
+      vuint32m4_t y_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_l, vl / 2), coeffs_ptr[0], vl / 4);
+      y_lh = __riscv_vwmaccu_vx_u32m4(y_lh, coeffs_ptr[1], vget_high_u16m4(g_l, vl / 2), vl / 4);
+      y_lh = __riscv_vwmaccu_vx_u32m4(y_lh, coeffs_ptr[2], vget_high_u16m4(b_l, vl / 2), vl / 4);
+      vuint32m4_t y_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_h), coeffs_ptr[0], vl / 4);
+      y_hl = __riscv_vwmaccu_vx_u32m4(y_hl, coeffs_ptr[1], vget_low_u16m4(g_h), vl / 4);
+      y_hl = __riscv_vwmaccu_vx_u32m4(y_hl, coeffs_ptr[2], vget_low_u16m4(b_h), vl / 4);
+      vuint32m4_t y_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_h, vl / 2), coeffs_ptr[0], vl / 4);
+      y_hh = __riscv_vwmaccu_vx_u32m4(y_hh, coeffs_ptr[1], vget_high_u16m4(g_h, vl / 2), vl / 4);
+      y_hh = __riscv_vwmaccu_vx_u32m4(y_hh, coeffs_ptr[2], vget_high_u16m4(b_h, vl / 2), vl / 4);
+
+      // B - R - G + bias
+      vuint32m4_t cb_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_l), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_l), coeffs_ptr[3], vl / 4);
+      cb_ll = __riscv_vsub_vv_u32m4(cb_ll, cb_r_ll, vl / 4);
+      vuint32m4_t cb_g_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_l), coeffs_ptr[4], vl / 4);
+      cb_ll = __riscv_vsub_vv_u32m4(cb_ll, cb_g_ll, vl / 4);
+      cb_ll = __riscv_vadd_vx_u32m4(cb_ll, bias, vl / 4);
+
+      vuint32m4_t cb_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_l, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_l, vl / 2), coeffs_ptr[3], vl / 4);
+      cb_lh = __riscv_vsub_vv_u32m4(cb_lh, cb_r_lh, vl / 4);
+      vuint32m4_t cb_g_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_l, vl / 2), coeffs_ptr[4], vl / 4);
+      cb_lh = __riscv_vsub_vv_u32m4(cb_lh, cb_g_lh, vl / 4);
+      cb_lh = __riscv_vadd_vx_u32m4(cb_lh, bias, vl / 4);
+
+      vuint32m4_t cb_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_h), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_h), coeffs_ptr[3], vl / 4);
+      cb_hl = __riscv_vsub_vv_u32m4(cb_hl, cb_r_hl, vl / 4);
+      vuint32m4_t cb_g_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_h), coeffs_ptr[4], vl / 4);
+      cb_hl = __riscv_vsub_vv_u32m4(cb_hl, cb_g_hl, vl / 4);
+      cb_hl = __riscv_vadd_vx_u32m4(cb_hl, bias, vl / 4);
+
+      vuint32m4_t cb_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_h, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cb_r_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_h, vl / 2), coeffs_ptr[3], vl / 4);
+      cb_hh = __riscv_vsub_vv_u32m4(cb_hh, cb_r_hh, vl / 4);
+      vuint32m4_t cb_g_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_h, vl / 2), coeffs_ptr[4], vl / 4);
+      cb_hh = __riscv_vsub_vv_u32m4(cb_hh, cb_g_hh, vl / 4);
+      cb_hh = __riscv_vadd_vx_u32m4(cb_hh, bias, vl / 4);
+
+      // R - G - B + bias
+      vuint32m4_t cr_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_l), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_l), coeffs_ptr[6], vl / 4);
+      cr_ll = __riscv_vsub_vv_u32m4(cr_ll, cr_g_ll, vl / 4);
+      vuint32m4_t cr_b_ll = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_l), coeffs_ptr[7], vl / 4);
+      cr_ll = __riscv_vsub_vv_u32m4(cr_ll, cr_b_ll, vl / 4);
+      cr_ll = __riscv_vadd_vx_u32m4(cr_ll, bias, vl / 4);
+
+      vuint32m4_t cr_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_l, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_l, vl / 2), coeffs_ptr[6], vl / 4);
+      cr_lh = __riscv_vsub_vv_u32m4(cr_lh, cr_g_lh, vl / 4);
+      vuint32m4_t cr_b_lh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_l, vl / 2), coeffs_ptr[7], vl / 4);
+      cr_lh = __riscv_vsub_vv_u32m4(cr_lh, cr_b_lh, vl / 4);
+      cr_lh = __riscv_vadd_vx_u32m4(cr_lh, bias, vl / 4);
+
+      vuint32m4_t cr_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(r_h), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(g_h), coeffs_ptr[6], vl / 4);
+      cr_hl = __riscv_vsub_vv_u32m4(cr_hl, cr_g_hl, vl / 4);
+      vuint32m4_t cr_b_hl = __riscv_vwmulu_vx_u32m4(vget_low_u16m4(b_h), coeffs_ptr[7], vl / 4);
+      cr_hl = __riscv_vsub_vv_u32m4(cr_hl, cr_b_hl, vl / 4);
+      cr_hl = __riscv_vadd_vx_u32m4(cr_hl, bias, vl / 4);
+
+      vuint32m4_t cr_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(r_h, vl / 2), coeffs_ptr[5], vl / 4);
+      vuint32m4_t cr_g_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(g_h, vl / 2), coeffs_ptr[6], vl / 4);
+      cr_hh = __riscv_vsub_vv_u32m4(cr_hh, cr_g_hh, vl / 4);
+      vuint32m4_t cr_b_hh =
+          __riscv_vwmulu_vx_u32m4(vget_high_u16m4(b_h, vl / 2), coeffs_ptr[7], vl / 4);
+      cr_hh = __riscv_vsub_vv_u32m4(cr_hh, cr_b_hh, vl / 4);
+      cr_hh = __riscv_vadd_vx_u32m4(cr_hh, bias, vl / 4);
+
+      vuint16m8_t y_l =
+          vcombine_u16(vrshrn_n_u32(y_ll, 14, vl / 4), vrshrn_n_u32(y_lh, 14, vl / 4), vl / 2);
+      vuint16m8_t y_h =
+          vcombine_u16(vrshrn_n_u32(y_hl, 14, vl / 4), vrshrn_n_u32(y_hh, 14, vl / 4), vl / 2);
+      vuint16m8_t cb_l =
+          vcombine_u16(vrshrn_n_u32(cb_ll, 14, vl / 4), vrshrn_n_u32(cb_lh, 14, vl / 4), vl / 2);
+      vuint16m8_t cb_h =
+          vcombine_u16(vrshrn_n_u32(cb_hl, 14, vl / 4), vrshrn_n_u32(cb_hh, 14, vl / 4), vl / 2);
+      vuint16m8_t cr_l =
+          vcombine_u16(vrshrn_n_u32(cr_ll, 14, vl / 4), vrshrn_n_u32(cr_lh, 14, vl / 4), vl / 2);
+      vuint16m8_t cr_h =
+          vcombine_u16(vrshrn_n_u32(cr_hl, 14, vl / 4), vrshrn_n_u32(cr_hh, 14, vl / 4), vl / 2);
+
+      __riscv_vse8_v_u8m8(y_ptr, vcombine_u8(vmovn_u16(y_l, vl / 2), vmovn_u16(y_h, vl / 2), vl),
+                          vl);
+      __riscv_vse8_v_u8m8(u_ptr, vcombine_u8(vmovn_u16(cb_l, vl / 2), vmovn_u16(cb_h, vl / 2), vl),
+                          vl);
+      __riscv_vse8_v_u8m8(v_ptr, vcombine_u8(vmovn_u16(cr_l, vl / 2), vmovn_u16(cr_h, vl / 2), vl),
+                          vl);
+
+      /* Increment pointers. */
+      rgba_ptr += (vl * 4);
+      y_ptr += vl;
+      u_ptr += vl;
+      v_ptr += vl;
+
+      w += vl;
+    } while (w < src->w);
+  } while (++h < src->h);
+}
+
+std::unique_ptr<uhdr_raw_image_ext_t> convert_raw_input_to_ycbcr_rvv(uhdr_raw_image_t* src) {
+  if (src->fmt == UHDR_IMG_FMT_32bppRGBA8888) {
+    std::unique_ptr<uhdr_raw_image_ext_t> dst = nullptr;
+    const uint16_t* coeffs_ptr = nullptr;
+
+    if (src->cg == UHDR_CG_BT_709) {
+      coeffs_ptr = kRgb709ToYuv_coeffs_simd;
+    } else if (src->cg == UHDR_CG_BT_2100) {
+      coeffs_ptr = kRgbDispP3ToYuv_coeffs_simd;
+    } else if (src->cg == UHDR_CG_DISPLAY_P3) {
+      coeffs_ptr = kRgb2100ToYuv_coeffs_simd;
+    } else {
+      return dst;
+    }
+    dst = std::make_unique<uhdr_raw_image_ext_t>(UHDR_IMG_FMT_24bppYCbCr444, src->cg, src->ct,
+                                                 UHDR_CR_FULL_RANGE, src->w, src->h, 64);
+    ConvertRgba8888ToYuv444_rvv(src, dst.get(), coeffs_ptr);
+    return dst;
+  }
+  return nullptr;
+}
+
+}  // namespace ultrahdr
diff --git a/lib/src/gainmapmath.cpp b/lib/src/gainmapmath.cpp
index fa56c3e8..f90e3564 100644
--- a/lib/src/gainmapmath.cpp
+++ b/lib/src/gainmapmath.cpp
@@ -684,6 +684,90 @@ const std::array<float, 9> kYuvBt2100ToBt709 = {
 const std::array<float, 9> kYuvBt2100ToBt601 = {
     1.0f, 0.117887f, 0.105521f, 0.0f, 0.995211f, -0.059549f, 0.0f, -0.084085f, 0.976518f};
 
+#ifdef UHDR_ENABLE_INTRINSICS
+
+#ifdef _MSC_VER
+#define ALIGNED(x) __declspec(align(x))
+#else
+#define ALIGNED(x) __attribute__((aligned(x)))
+#endif
+
+// Scale all coefficients by 2^14 to avoid needing floating-point arithmetic. This can cause an off
+// by one error compared to the scalar floating-point implementation.
+
+// Removing conversion coefficients 1 and 0 from the group for each standard leaves 6 coefficients.
+// Pack them into a single 128-bit vector as follows, zeroing the remaining elements:
+// {Y1, Y2, U1, U2, V1, V2, 0, 0}
+
+// Yuv Bt709 -> Yuv Bt601
+// Y' = (1.0f * Y) + ( 0.101579f * U) + ( 0.196076f * V)
+// U' = (0.0f * Y) + ( 0.989854f * U) + (-0.110653f * V)
+// V' = (0.0f * Y) + (-0.072453f * U) + ( 0.983398f * V)
+ALIGNED(16)
+const int16_t kYuv709To601_coeffs_simd[8] = {1664, 3213, 16218, -1813, -1187, 16112, 0, 0};
+
+// Yuv Bt709 -> Yuv Bt2100
+// Y' = (1.0f * Y) + (-0.016969f * U) + ( 0.096312f * V)
+// U' = (0.0f * Y) + ( 0.995306f * U) + (-0.051192f * V)
+// V' = (0.0f * Y) + ( 0.011507f * U) + ( 1.002637f * V)
+ALIGNED(16)
+const int16_t kYuv709To2100_coeffs_simd[8] = {-278, 1578, 16307, -839, 189, 16427, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt709
+// Y' = (1.0f * Y) + (-0.118188f * U) + (-0.212685f * V),
+// U' = (0.0f * Y) + ( 1.018640f * U) + ( 0.114618f * V),
+// V' = (0.0f * Y) + ( 0.075049f * U) + ( 1.025327f * V);
+ALIGNED(16)
+const int16_t kYuv601To709_coeffs_simd[8] = {-1936, -3485, 16689, 1878, 1230, 16799, 0, 0};
+
+// Yuv Bt601 -> Yuv Bt2100
+// Y' = (1.0f * Y) + (-0.128245f * U) + (-0.115879f * V)
+// U' = (0.0f * Y) + ( 1.010016f * U) + ( 0.061592f * V)
+// V' = (0.0f * Y) + ( 0.086969f * U) + ( 1.029350f * V)
+ALIGNED(16)
+const int16_t kYuv601To2100_coeffs_simd[8] = {-2101, -1899, 16548, 1009, 1425, 16865, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt709
+// Y' = (1.0f * Y) + ( 0.018149f * U) + (-0.095132f * V)
+// U' = (0.0f * Y) + ( 1.004123f * U) + ( 0.051267f * V)
+// V' = (0.0f * Y) + (-0.011524f * U) + ( 0.996782f * V)
+ALIGNED(16)
+const int16_t kYuv2100To709_coeffs_simd[8] = {297, -1559, 16452, 840, -189, 16331, 0, 0};
+
+// Yuv Bt2100 -> Yuv Bt601
+// Y' = (1.0f * Y) + ( 0.117887f * U) + ( 0.105521f * V)
+// U' = (0.0f * Y) + ( 0.995211f * U) + (-0.059549f * V)
+// V' = (0.0f * Y) + (-0.084085f * U) + ( 0.976518f * V)
+ALIGNED(16)
+const int16_t kYuv2100To601_coeffs_simd[8] = {1931, 1729, 16306, -976, -1378, 15999, 0, 0};
+
+// RGB -> Yuv
+
+// In the 3x3 conversion matrix, the coefficient 0.5 appears twice but is stored as a single
+// entry, leaving an array size of 8 elements.
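+// For example, in the Bt709 table below, 0.212639 * 2^14 rounds to 3484 (the first Y entry),
+// and the shared 0.5 * 2^14 = 8192 entry at index 5 serves both the U-row B term and the
+// V-row R term.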
+
+// RGB Bt709 -> Yuv Bt709
+// Y = 0.212639 * R + 0.715169 * G + 0.072192 * B
+// U = -0.114592135 * R + -0.385407865 * G + 0.5 * B
+// V = 0.5 * R + -0.454155718 * G + -0.045844282 * B
+ALIGNED(16)
+const uint16_t kRgb709ToYuv_coeffs_simd[8] = {3484, 11717, 1183, 1877, 6315, 8192, 7441, 751};
+
+// RGB Display P3 -> Yuv Display P3
+// Y = 0.2289746 * R + 0.6917385 * G + 0.0792869 * B
+// U = -0.124346335 * R + -0.375653665 * G + 0.5 * B
+// V = 0.5 * R + -0.448583471 * G + -0.051416529 * B
+ALIGNED(16)
+const uint16_t kRgbDispP3ToYuv_coeffs_simd[8] = {3752, 11333, 1299, 2037, 6155, 8192, 7350, 842};
+
+// RGB Bt2100 -> Yuv Bt2100
+// Y = 0.2627 * R + 0.677998 * G + 0.059302 * B
+// U = -0.13963036 * R + -0.36036964 * G + 0.5 * B
+// V = 0.5 * R + -0.459784348 * G + -0.040215652 * B
+ALIGNED(16)
+const uint16_t kRgb2100ToYuv_coeffs_simd[8] = {4304, 11108, 972, 2288, 5904, 8192, 7533, 659};
+
+#endif
+
 Color yuvColorGamutConversion(Color e_gamma, const std::array<float, 9>& coeffs) {
   const float y = e_gamma.y * std::get<0>(coeffs) + e_gamma.u * std::get<1>(coeffs) +
                   e_gamma.v * std::get<2>(coeffs);
diff --git a/lib/src/jpegr.cpp b/lib/src/jpegr.cpp
index 1f83b34d..0b2aeb52 100644
--- a/lib/src/jpegr.cpp
+++ b/lib/src/jpegr.cpp
@@ -216,6 +216,8 @@ uhdr_error_info_t JpegR::encodeJPEGR(uhdr_raw_image_t* hdr_intent, uhdr_compress
   if (isPixelFormatRgb(sdr_intent->fmt)) {
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_neon(sdr_intent.get());
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+    sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_rvv(sdr_intent.get());
 #else
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr(sdr_intent.get());
 #endif
@@ -255,6 +257,8 @@ uhdr_error_info_t JpegR::encodeJPEGR(uhdr_raw_image_t* hdr_intent, uhdr_raw_imag
   if (isPixelFormatRgb(sdr_intent->fmt)) {
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_neon(sdr_intent);
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+    sdr_intent_yuv_ext = convert_raw_input_to_ycbcr_rvv(sdr_intent);
 #else
     sdr_intent_yuv_ext = convert_raw_input_to_ycbcr(sdr_intent);
 #endif
@@ -264,6 +268,8 @@ uhdr_error_info_t JpegR::encodeJPEGR(uhdr_raw_image_t* hdr_intent, uhdr_raw_imag
   // convert to bt601 YUV encoding for JPEG encode
 #if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
   UHDR_ERR_CHECK(convertYuv_neon(sdr_intent_yuv, sdr_intent_yuv->cg, UHDR_CG_DISPLAY_P3));
+#elif (defined(UHDR_ENABLE_INTRINSICS) && defined(__riscv_v_intrinsic))
+  UHDR_ERR_CHECK(convertYuv_rvv(sdr_intent_yuv, sdr_intent_yuv->cg, UHDR_CG_DISPLAY_P3));
 #else
   UHDR_ERR_CHECK(convertYuv(sdr_intent_yuv, sdr_intent_yuv->cg, UHDR_CG_DISPLAY_P3));
 #endif
diff --git a/tests/gainmapmath_test.cpp b/tests/gainmapmath_test.cpp
index 91d942a8..240b667c 100644
--- a/tests/gainmapmath_test.cpp
+++ b/tests/gainmapmath_test.cpp
@@ -788,12 +788,12 @@ TEST_F(GainMapMathTest, YuvConversionNeon) {
   const std::array<
       std::tuple<const int16_t*, const std::array<Color, 5>, const std::array<Color, 5>>, 6>
       coeffs_setup_correct{{
-          {kYuv709To601_coeffs_neon, SrgbYuvColors, P3YuvColors},
-          {kYuv709To2100_coeffs_neon, SrgbYuvColors, Bt2100YuvColors},
-          {kYuv601To709_coeffs_neon, P3YuvColors, SrgbYuvColors},
-          {kYuv601To2100_coeffs_neon, P3YuvColors, Bt2100YuvColors},
-          {kYuv2100To709_coeffs_neon, Bt2100YuvColors, SrgbYuvColors},
-          {kYuv2100To601_coeffs_neon, Bt2100YuvColors, P3YuvColors},
+          {kYuv709To601_coeffs_simd, SrgbYuvColors, P3YuvColors},
+          {kYuv709To2100_coeffs_simd, SrgbYuvColors, Bt2100YuvColors},
+          {kYuv601To709_coeffs_simd, P3YuvColors, SrgbYuvColors},
+          {kYuv601To2100_coeffs_simd, P3YuvColors, Bt2100YuvColors},
+          {kYuv2100To709_coeffs_simd, Bt2100YuvColors, SrgbYuvColors},
+          {kYuv2100To601_coeffs_simd, Bt2100YuvColors, P3YuvColors},
   }};
 
   for (const auto& [coeff_ptr, input, expected] : coeffs_setup_correct) {
@@ -954,16 +954,15 @@ TEST_F(GainMapMathTest, TransformYuv420) {
     }
   }
 }
-
-#if (defined(UHDR_ENABLE_INTRINSICS) && (defined(__ARM_NEON__) || defined(__ARM_NEON)))
-TEST_F(GainMapMathTest, TransformYuv420Neon) {
+#ifdef UHDR_ENABLE_INTRINSICS
+TEST_F(GainMapMathTest, TransformYuv420SIMD) {
   const std::array<std::pair<const int16_t*, std::array<float, 9>>, 6> fixed_floating_coeffs{
-      {{kYuv709To601_coeffs_neon, kYuvBt709ToBt601},
-       {kYuv709To2100_coeffs_neon, kYuvBt709ToBt2100},
-       {kYuv601To709_coeffs_neon, kYuvBt601ToBt709},
-       {kYuv601To2100_coeffs_neon, kYuvBt601ToBt2100},
-       {kYuv2100To709_coeffs_neon, kYuvBt2100ToBt709},
-       {kYuv2100To601_coeffs_neon, kYuvBt2100ToBt601}}};
+      {{kYuv709To601_coeffs_simd, kYuvBt709ToBt601},
+       {kYuv709To2100_coeffs_simd, kYuvBt709ToBt2100},
+       {kYuv601To709_coeffs_simd, kYuvBt601ToBt709},
+       {kYuv601To2100_coeffs_simd, kYuvBt601ToBt2100},
+       {kYuv2100To709_coeffs_simd, kYuvBt2100ToBt709},
+       {kYuv2100To601_coeffs_simd, kYuvBt2100ToBt601}}};
 
   for (const auto& [neon_coeffs_ptr, floating_point_coeffs] : fixed_floating_coeffs) {
     uhdr_raw_image_t input = Yuv420Image32x4();
@@ -980,8 +979,14 @@ TEST_F(GainMapMathTest, TransformYuv420Neon) {
     output.planes[UHDR_PLANE_Y] = luma;
     output.planes[UHDR_PLANE_U] = cb;
     output.planes[UHDR_PLANE_V] = cr;
-
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON))
     transformYuv420_neon(&output, neon_coeffs_ptr);
+#elif defined(__riscv_v_intrinsic)
+    transformYuv420_rvv(&output, neon_coeffs_ptr);
+#else
+    return;
+#endif
 
     for (size_t y = 0; y < input.h / 2; ++y) {
       for (size_t x = 0; x < input.w / 2; ++x) {
@@ -1014,11 +1019,17 @@ TEST_F(GainMapMathTest, TransformYuv420Neon) {
         // Due to the Neon version using a fixed-point approximation, this can result in an off by
         // one error compared with the standard floating-point version.
+#if defined(__riscv_v_intrinsic)
+        EXPECT_NEAR(expect_y1, out1.y, 2);
+        EXPECT_NEAR(expect_y2, out2.y, 2);
+        EXPECT_NEAR(expect_y3, out3.y, 2);
+        EXPECT_NEAR(expect_y4, out4.y, 2);
+#else
         EXPECT_NEAR(expect_y1, out1.y, 1);
         EXPECT_NEAR(expect_y2, out2.y, 1);
         EXPECT_NEAR(expect_y3, out3.y, 1);
         EXPECT_NEAR(expect_y4, out4.y, 1);
-
+#endif
         EXPECT_NEAR(expect_u, out1.u, 1);
         EXPECT_NEAR(expect_u, out2.u, 1);
         EXPECT_NEAR(expect_u, out3.u, 1);
         EXPECT_NEAR(expect_u, out4.u, 1);
@@ -1678,5 +1689,4 @@ TEST_F(GainMapMathTest, ApplyMap) {
   EXPECT_RGB_EQ(Recover(YuvWhite(), 0.25f, &metadata), RgbWhite());
   EXPECT_RGB_EQ(Recover(YuvWhite(), 0.0f, &metadata), RgbWhite() / 2.0f);
 }
-
 }  // namespace ultrahdr