From 02b968b751711bc0c506703c964bcda98781381b Mon Sep 17 00:00:00 2001 From: Cryptex <64497526+Cryptex-github@users.noreply.github.com> Date: Thu, 22 Dec 2022 21:58:34 -0800 Subject: [PATCH 1/4] Implement x86 specific merge impl --- src/arch/aarch64.rs | 0 src/arch/manual.rs | 40 ++++++++++++++++++++++++ src/arch/mod.rs | 17 ++++++++++ src/arch/x86.rs | 76 +++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 2 ++ src/pixel.rs | 39 ++--------------------- 6 files changed, 137 insertions(+), 37 deletions(-) create mode 100644 src/arch/aarch64.rs create mode 100644 src/arch/manual.rs create mode 100644 src/arch/mod.rs create mode 100644 src/arch/x86.rs diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs new file mode 100644 index 000000000..e69de29bb diff --git a/src/arch/manual.rs b/src/arch/manual.rs new file mode 100644 index 000000000..8f9c36080 --- /dev/null +++ b/src/arch/manual.rs @@ -0,0 +1,40 @@ +use crate::Rgba; + +#[allow(clippy::cast_lossless)] +pub fn _merge_impl(original: Rgba, other: Rgba) -> Rgba { + // Optimize for common cases + if other.a == 255 { + return other; + } else if other.a == 0 { + return original; + } + + let (base_r, base_g, base_b, base_a) = ( + original.r as f32 / 255., + original.g as f32 / 255., + original.b as f32 / 255., + original.a as f32 / 255., + ); + + let (overlay_r, overlay_g, overlay_b, overlay_a) = ( + other.r as f32 / 255., + other.g as f32 / 255., + other.b as f32 / 255., + other.a as f32 / 255., + ); + + let a_diff = 1. - overlay_a; + let a = a_diff.mul_add(base_a, overlay_a); + + let a_ratio = a_diff * base_a; + let r = a_ratio.mul_add(base_r, overlay_a * overlay_r) / a; + let g = a_ratio.mul_add(base_g, overlay_a * overlay_g) / a; + let b = a_ratio.mul_add(base_b, overlay_a * overlay_b) / a; + + Rgba { + r: (r * 255.) as u8, + g: (g * 255.) as u8, + b: (b * 255.) as u8, + a: (a * 255.) as u8, + } +} diff --git a/src/arch/mod.rs b/src/arch/mod.rs new file mode 100644 index 000000000..fc133c62b --- /dev/null +++ b/src/arch/mod.rs @@ -0,0 +1,17 @@ +mod aarch64; +mod manual; +mod x86; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +use std::is_x86_feature_detected; + +pub fn merge_impl() -> unsafe fn(crate::Rgba, crate::Rgba) -> crate::Rgba { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if is_x86_feature_detected!("sse") && is_x86_feature_detected!("fma") { + return x86::_merge_impl; + } + } + + manual::_merge_impl +} diff --git a/src/arch/x86.rs b/src/arch/x86.rs new file mode 100644 index 000000000..3219c6e49 --- /dev/null +++ b/src/arch/x86.rs @@ -0,0 +1,76 @@ +#[cfg(target_arch = "x86")] +use std::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use std::arch::x86_64::*; + + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "sse")] +#[target_feature(enable = "fma")] +pub unsafe fn _merge_impl(original: crate::pixel::Rgba, other: crate::pixel::Rgba) -> crate::pixel::Rgba { + // Optimize for common cases + if other.a == 255 { + return other; + } else if other.a == 0 { + return original; + } + + let mut base = [0_f32; 4]; + + _mm_store_ps( + base.as_mut_ptr(), + _mm_div_ps(_mm_setr_ps(original.r as f32, original.g as f32, original.b as f32, original.a as f32), _mm_set1_ps(255.)), + ); + + let [base_r, base_g, base_b, base_a] = base; + + let mut overlay = [0_f32; 4]; + + _mm_store_ps( + overlay.as_mut_ptr(), + _mm_div_ps(_mm_setr_ps(other.r as f32, other.g as f32, other.b as f32, other.a as f32), _mm_set1_ps(255.)), + ); + + let [overlay_r, overlay_g, overlay_b, overlay_a] = overlay; + + let a_diff = 1. - overlay_a; + + let mut overlay_rgba = [0_f32; 4]; + + _mm_store_ps( + overlay_rgba.as_mut_ptr(), + _mm_mul_ps( + _mm_setr_ps(overlay_r, overlay_g, overlay_b, base_a), + _mm_setr_ps(overlay_a, overlay_a, overlay_a, a_diff), + ), + ); + + let [overlay_r, overlay_g, overlay_b, a_ratio] = overlay_rgba; + + let mut rgba = [0_f32; 4]; + + _mm_store_ps( + rgba.as_mut_ptr(), + _mm_fmadd_ps( + _mm_setr_ps(a_ratio, a_ratio, a_ratio, a_diff), + _mm_setr_ps(base_r, base_g, base_b, base_a), + _mm_setr_ps(overlay_r, overlay_g, overlay_b, overlay_a), + ), + ); + + let [r, g, b, a] = rgba; + + let mut res = [0_f32; 4]; + + _mm_store_ps( + res.as_mut_ptr(), + _mm_mul_ps( + _mm_div_ps(_mm_setr_ps(r, g, b, a), _mm_setr_ps(a, a, a, 1.)), + _mm_set1_ps(255.), + ), + ); + + let [r, g, b, a] = res; + + crate::pixel::Rgba { r: r as u8, g: g as u8, b: b as u8, a: a as u8 } +} diff --git a/src/lib.rs b/src/lib.rs index ad20623a8..dbc1e1c6e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -276,6 +276,8 @@ pub mod sequence; #[cfg(feature = "text")] pub mod text; +mod arch; + macro_rules! inline_doc { ($($token:item)*) => { $(#[doc(inline)] $token)* diff --git a/src/pixel.rs b/src/pixel.rs index e98e88e85..775603e17 100644 --- a/src/pixel.rs +++ b/src/pixel.rs @@ -1,6 +1,7 @@ //! Encloses pixel-related traits and pixel type implementations. use crate::Error::DecodingError; +use crate::arch; use crate::{ encodings::ColorType, image::OverlayMode, @@ -888,44 +889,8 @@ impl Pixel for Rgba { [self.r, self.g, self.b, self.a] } - // TODO: SIMD could speed this up significantly - #[allow(clippy::cast_lossless)] fn merge(self, other: Self) -> Self { - // Optimize for common cases - if other.a == 255 { - return other; - } else if other.a == 0 { - return self; - } - - let (base_r, base_g, base_b, base_a) = ( - self.r as f32 / 255., - self.g as f32 / 255., - self.b as f32 / 255., - self.a as f32 / 255., - ); - - let (overlay_r, overlay_g, overlay_b, overlay_a) = ( - other.r as f32 / 255., - other.g as f32 / 255., - other.b as f32 / 255., - other.a as f32 / 255., - ); - - let a_diff = 1. - overlay_a; - let a = a_diff.mul_add(base_a, overlay_a); - - let a_ratio = a_diff * base_a; - let r = a_ratio.mul_add(base_r, overlay_a * overlay_r) / a; - let g = a_ratio.mul_add(base_g, overlay_a * overlay_g) / a; - let b = a_ratio.mul_add(base_b, overlay_a * overlay_b) / a; - - Self { - r: (r * 255.) as u8, - g: (g * 255.) as u8, - b: (b * 255.) as u8, - a: (a * 255.) as u8, - } + unsafe { arch::merge_impl()(self, other) } } #[allow(clippy::cast_lossless)] From 432c9a2327d037ba534258f2f44621f92acec2bf Mon Sep 17 00:00:00 2001 From: Cryptex <64497526+Cryptex-github@users.noreply.github.com> Date: Fri, 23 Dec 2022 12:29:42 -0800 Subject: [PATCH 2/4] Resolve requested changes --- src/arch/aarch64.rs | 1 + src/arch/manual.rs | 8 --- src/arch/mod.rs | 23 ++++++-- src/arch/x86.rs | 137 +++++++++++++++++++++++--------------------- src/lib.rs | 3 +- src/pixel.rs | 7 +-- 6 files changed, 94 insertions(+), 85 deletions(-) diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs index e69de29bb..8b1378917 100644 --- a/src/arch/aarch64.rs +++ b/src/arch/aarch64.rs @@ -0,0 +1 @@ + diff --git a/src/arch/manual.rs b/src/arch/manual.rs index 8f9c36080..4e71e10ba 100644 --- a/src/arch/manual.rs +++ b/src/arch/manual.rs @@ -1,14 +1,6 @@ use crate::Rgba; -#[allow(clippy::cast_lossless)] pub fn _merge_impl(original: Rgba, other: Rgba) -> Rgba { - // Optimize for common cases - if other.a == 255 { - return other; - } else if other.a == 0 { - return original; - } - let (base_r, base_g, base_b, base_a) = ( original.r as f32 / 255., original.g as f32 / 255., diff --git a/src/arch/mod.rs b/src/arch/mod.rs index fc133c62b..1be948743 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -1,17 +1,30 @@ +#![allow(clippy::cast_lossless)] +#![allow(clippy::wildcard_imports)] + mod aarch64; mod manual; mod x86; +use crate::Rgba; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] use std::is_x86_feature_detected; -pub fn merge_impl() -> unsafe fn(crate::Rgba, crate::Rgba) -> crate::Rgba { +#[inline] +pub fn merge_impl(base: Rgba, other: Rgba) -> Rgba { + // Optimize for common cases + if other.a == 255 { + return other; + } else if other.a == 0 { + return base; + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - if is_x86_feature_detected!("sse") && is_x86_feature_detected!("fma") { - return x86::_merge_impl; + if is_x86_feature_detected!("sse") && is_x86_feature_detected!("fma") { + unsafe { + return x86::_merge_impl(base, other); } } - manual::_merge_impl + manual::_merge_impl(base, other) } diff --git a/src/arch/x86.rs b/src/arch/x86.rs index 3219c6e49..8598f9216 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -3,74 +3,79 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse")] #[target_feature(enable = "fma")] -pub unsafe fn _merge_impl(original: crate::pixel::Rgba, other: crate::pixel::Rgba) -> crate::pixel::Rgba { - // Optimize for common cases - if other.a == 255 { - return other; - } else if other.a == 0 { - return original; - } - - let mut base = [0_f32; 4]; - - _mm_store_ps( - base.as_mut_ptr(), - _mm_div_ps(_mm_setr_ps(original.r as f32, original.g as f32, original.b as f32, original.a as f32), _mm_set1_ps(255.)), - ); - - let [base_r, base_g, base_b, base_a] = base; - - let mut overlay = [0_f32; 4]; - - _mm_store_ps( - overlay.as_mut_ptr(), - _mm_div_ps(_mm_setr_ps(other.r as f32, other.g as f32, other.b as f32, other.a as f32), _mm_set1_ps(255.)), - ); - - let [overlay_r, overlay_g, overlay_b, overlay_a] = overlay; - - let a_diff = 1. - overlay_a; - - let mut overlay_rgba = [0_f32; 4]; - - _mm_store_ps( - overlay_rgba.as_mut_ptr(), - _mm_mul_ps( - _mm_setr_ps(overlay_r, overlay_g, overlay_b, base_a), - _mm_setr_ps(overlay_a, overlay_a, overlay_a, a_diff), +pub unsafe fn _merge_impl( + base: crate::pixel::Rgba, + other: crate::pixel::Rgba, +) -> crate::pixel::Rgba { + let mut base_rgba = [0_f32; 4]; + let mut overlay = [0_f32; 4]; + let mut overlay_rgba = [0_f32; 4]; + let mut rgba = [0_f32; 4]; + let mut res = [0_f32; 4]; + + _mm_store_ps( + base_rgba.as_mut_ptr(), + _mm_div_ps( + _mm_setr_ps(base.r as f32, base.g as f32, base.b as f32, base.a as f32), + _mm_set1_ps(255.), + ), + ); + + let [base_r, base_g, base_b, base_a] = base_rgba; + _mm_store_ps( + overlay.as_mut_ptr(), + _mm_div_ps( + _mm_setr_ps( + other.r as f32, + other.g as f32, + other.b as f32, + other.a as f32, ), - ); - - let [overlay_r, overlay_g, overlay_b, a_ratio] = overlay_rgba; - - let mut rgba = [0_f32; 4]; - - _mm_store_ps( - rgba.as_mut_ptr(), - _mm_fmadd_ps( - _mm_setr_ps(a_ratio, a_ratio, a_ratio, a_diff), - _mm_setr_ps(base_r, base_g, base_b, base_a), - _mm_setr_ps(overlay_r, overlay_g, overlay_b, overlay_a), - ), - ); - - let [r, g, b, a] = rgba; - - let mut res = [0_f32; 4]; - - _mm_store_ps( - res.as_mut_ptr(), - _mm_mul_ps( - _mm_div_ps(_mm_setr_ps(r, g, b, a), _mm_setr_ps(a, a, a, 1.)), - _mm_set1_ps(255.), - ), - ); - - let [r, g, b, a] = res; - - crate::pixel::Rgba { r: r as u8, g: g as u8, b: b as u8, a: a as u8 } + _mm_set1_ps(255.), + ), + ); + + let [overlay_r, overlay_g, overlay_b, overlay_a] = overlay; + let a_diff = 1. - overlay_a; + + _mm_store_ps( + overlay_rgba.as_mut_ptr(), + _mm_mul_ps( + _mm_setr_ps(overlay_r, overlay_g, overlay_b, base_a), + _mm_setr_ps(overlay_a, overlay_a, overlay_a, a_diff), + ), + ); + + let [overlay_r, overlay_g, overlay_b, a_ratio] = overlay_rgba; + + _mm_store_ps( + rgba.as_mut_ptr(), + _mm_fmadd_ps( + _mm_setr_ps(a_ratio, a_ratio, a_ratio, a_diff), + _mm_setr_ps(base_r, base_g, base_b, base_a), + _mm_setr_ps(overlay_r, overlay_g, overlay_b, overlay_a), + ), + ); + + let [r, g, b, a] = rgba; + + _mm_store_ps( + res.as_mut_ptr(), + _mm_mul_ps( + _mm_div_ps(_mm_setr_ps(r, g, b, a), _mm_setr_ps(a, a, a, 1.)), + _mm_set1_ps(255.), + ), + ); + + let [r, g, b, a] = res; + + crate::pixel::Rgba { + r: r as u8, + g: g as u8, + b: b as u8, + a: a as u8, + } } diff --git a/src/lib.rs b/src/lib.rs index dbc1e1c6e..7cca54251 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -261,6 +261,7 @@ clippy::doc_markdown )] +mod arch; pub mod draw; pub mod encode; pub mod encodings; @@ -276,8 +277,6 @@ pub mod sequence; #[cfg(feature = "text")] pub mod text; -mod arch; - macro_rules! inline_doc { ($($token:item)*) => { $(#[doc(inline)] $token)* diff --git a/src/pixel.rs b/src/pixel.rs index 775603e17..0af2d7e72 100644 --- a/src/pixel.rs +++ b/src/pixel.rs @@ -1,11 +1,10 @@ //! Encloses pixel-related traits and pixel type implementations. -use crate::Error::DecodingError; -use crate::arch; use crate::{ + arch, encodings::ColorType, image::OverlayMode, - Error::{InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType}, + Error::{DecodingError, InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType}, Result, }; use std::borrow::Cow; @@ -890,7 +889,7 @@ impl Pixel for Rgba { } fn merge(self, other: Self) -> Self { - unsafe { arch::merge_impl()(self, other) } + arch::merge_impl(self, other) } #[allow(clippy::cast_lossless)] From 3ccae17e4135cc4b05ef596cead96954bbec559f Mon Sep 17 00:00:00 2001 From: Cryptex <64497526+Cryptex-github@users.noreply.github.com> Date: Fri, 23 Dec 2022 13:36:41 -0800 Subject: [PATCH 3/4] SIMD implementation of invert --- src/arch/manual.rs | 20 +++++++++++++++----- src/arch/mod.rs | 13 +++++++++++++ src/arch/x86.rs | 34 +++++++++++++++++++++++++++++----- src/pixel.rs | 13 ++----------- 4 files changed, 59 insertions(+), 21 deletions(-) diff --git a/src/arch/manual.rs b/src/arch/manual.rs index 4e71e10ba..8264b0f25 100644 --- a/src/arch/manual.rs +++ b/src/arch/manual.rs @@ -1,11 +1,11 @@ use crate::Rgba; -pub fn _merge_impl(original: Rgba, other: Rgba) -> Rgba { +pub fn _merge_impl(base: Rgba, other: Rgba) -> Rgba { let (base_r, base_g, base_b, base_a) = ( - original.r as f32 / 255., - original.g as f32 / 255., - original.b as f32 / 255., - original.a as f32 / 255., + base.r as f32 / 255., + base.g as f32 / 255., + base.b as f32 / 255., + base.a as f32 / 255., ); let (overlay_r, overlay_g, overlay_b, overlay_a) = ( @@ -30,3 +30,13 @@ pub fn _merge_impl(original: Rgba, other: Rgba) -> Rgba { a: (a * 255.) as u8, } } + +#[inline] +pub fn _invert_impl(base: Rgba) -> Rgba { + Rgba { + r: !base.r, + g: !base.g, + b: !base.b, + a: !base.a, + } +} diff --git a/src/arch/mod.rs b/src/arch/mod.rs index 1be948743..ccac3865a 100644 --- a/src/arch/mod.rs +++ b/src/arch/mod.rs @@ -1,5 +1,6 @@ #![allow(clippy::cast_lossless)] #![allow(clippy::wildcard_imports)] +#![allow(dead_code)] mod aarch64; mod manual; @@ -28,3 +29,15 @@ pub fn merge_impl(base: Rgba, other: Rgba) -> Rgba { manual::_merge_impl(base, other) } + +#[inline] +pub fn invert_impl(base: Rgba) -> Rgba { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + if is_x86_feature_detected!("sse") { + unsafe { + return x86::_invert_impl(base); + } + } + + manual::_invert_impl(base) +} diff --git a/src/arch/x86.rs b/src/arch/x86.rs index 8598f9216..f958db4c6 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -3,13 +3,14 @@ use std::arch::x86::*; #[cfg(target_arch = "x86_64")] use std::arch::x86_64::*; +use crate::Rgba; + +const ONES: f32 = unsafe { std::mem::transmute(0xff_ff_ff_ff_u32) }; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[target_feature(enable = "sse")] #[target_feature(enable = "fma")] -pub unsafe fn _merge_impl( - base: crate::pixel::Rgba, - other: crate::pixel::Rgba, -) -> crate::pixel::Rgba { +pub unsafe fn _merge_impl(base: Rgba, other: Rgba) -> Rgba { let mut base_rgba = [0_f32; 4]; let mut overlay = [0_f32; 4]; let mut overlay_rgba = [0_f32; 4]; @@ -72,7 +73,30 @@ pub unsafe fn _merge_impl( let [r, g, b, a] = res; - crate::pixel::Rgba { + Rgba { + r: r as u8, + g: g as u8, + b: b as u8, + a: a as u8, + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[target_feature(enable = "sse")] +pub unsafe fn _invert_impl(base: Rgba) -> Rgba { + let mut res = [0_f32; 4]; + + _mm_store_ps( + res.as_mut_ptr(), + _mm_xor_ps( + _mm_setr_ps(base.r as f32, base.g as f32, base.b as f32, base.a as f32), + _mm_set1_ps(ONES), + ), + ); + + let [r, g, b, a] = res; + + Rgba { r: r as u8, g: g as u8, b: b as u8, diff --git a/src/pixel.rs b/src/pixel.rs index c363c5653..0bdb9d4ed 100644 --- a/src/pixel.rs +++ b/src/pixel.rs @@ -625,11 +625,7 @@ impl Pixel for Rgb { type Data = [u8; 3]; fn inverted(&self) -> Self { - Self { - r: !self.r, - g: !self.g, - b: !self.b, - } + arch::invert_impl((*self).into()).into() } fn map_subpixels(self, f: F, _: A) -> Self @@ -791,12 +787,7 @@ impl Pixel for Rgba { type Data = [u8; 4]; fn inverted(&self) -> Self { - Self { - r: !self.r, - g: !self.g, - b: !self.b, - a: !self.a, - } + arch::invert_impl(*self) } fn map_subpixels(self, f: F, a: A) -> Self From bc3f458e097d8853b3743b92edd3408c5ff9cbd6 Mon Sep 17 00:00:00 2001 From: Cryptex <64497526+Cryptex-github@users.noreply.github.com> Date: Fri, 23 Dec 2022 14:37:34 -0800 Subject: [PATCH 4/4] Add a cfg flag to detect if rustc is nightly --- Cargo.toml | 3 +++ build.rs | 10 ++++++++++ 2 files changed, 13 insertions(+) create mode 100644 build.rs diff --git a/Cargo.toml b/Cargo.toml index 05e12aee5..3e0db7e1c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -37,6 +37,9 @@ quantize = ["dep:color_quant"] gradient = ["dep:colorgrad"] static = ["libwebp-sys2?/static"] +[build-dependencies] +rustc_version = "0.4" + [dev-dependencies] criterion = "^0.4" image = "^0" diff --git a/build.rs b/build.rs new file mode 100644 index 000000000..9506529b5 --- /dev/null +++ b/build.rs @@ -0,0 +1,10 @@ +use rustc_version::{version, version_meta, Channel}; + +fn main() { + assert!(version().unwrap().major >= 1); + + match version_meta().unwrap().channel { + Channel::Nightly => println!("cargo:rustc-cfg=RUSTC_IS_NIGHTLY"), + _ => {} + } +}