diff --git a/Cargo.toml b/Cargo.toml
index 05e12aee5..3e0db7e1c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -37,6 +37,9 @@ quantize = ["dep:color_quant"]
 gradient = ["dep:colorgrad"]
 static = ["libwebp-sys2?/static"]
 
+[build-dependencies]
+rustc_version = "0.4"
+
 [dev-dependencies]
 criterion = "^0.4"
 image = "^0"
diff --git a/build.rs b/build.rs
new file mode 100644
index 000000000..9506529b5
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,12 @@
+use rustc_version::{version, version_meta, Channel};
+
+fn main() {
+    // Sanity check that the toolchain version can be queried at all.
+    assert!(version().expect("failed to query rustc version").major >= 1);
+
+    // Expose a `RUSTC_IS_NIGHTLY` cfg flag when building on a nightly toolchain.
+    let meta = version_meta().expect("failed to query rustc version metadata");
+    if matches!(meta.channel, Channel::Nightly) {
+        println!("cargo:rustc-cfg=RUSTC_IS_NIGHTLY");
+    }
+}
diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/src/arch/aarch64.rs
@@ -0,0 +1 @@
+
diff --git a/src/arch/manual.rs b/src/arch/manual.rs
new file mode 100644
index 000000000..8264b0f25
--- /dev/null
+++ b/src/arch/manual.rs
@@ -0,0 +1,47 @@
+use crate::Rgba;
+
+/// Portable alpha-compositing of `other` over `base`.
+///
+/// Channels are normalized to `0.0..=1.0`, blended in `f32`, and scaled back to `u8` with a
+/// truncating cast.
+pub fn _merge_impl(base: Rgba, other: Rgba) -> Rgba {
+    let (base_r, base_g, base_b, base_a) = (
+        base.r as f32 / 255.,
+        base.g as f32 / 255.,
+        base.b as f32 / 255.,
+        base.a as f32 / 255.,
+    );
+
+    let (overlay_r, overlay_g, overlay_b, overlay_a) = (
+        other.r as f32 / 255.,
+        other.g as f32 / 255.,
+        other.b as f32 / 255.,
+        other.a as f32 / 255.,
+    );
+
+    let a_diff = 1. - overlay_a;
+    let a = a_diff.mul_add(base_a, overlay_a);
+
+    let a_ratio = a_diff * base_a;
+    let r = a_ratio.mul_add(base_r, overlay_a * overlay_r) / a;
+    let g = a_ratio.mul_add(base_g, overlay_a * overlay_g) / a;
+    let b = a_ratio.mul_add(base_b, overlay_a * overlay_b) / a;
+
+    Rgba {
+        r: (r * 255.) as u8,
+        g: (g * 255.) as u8,
+        b: (b * 255.) as u8,
+        a: (a * 255.) as u8,
+    }
+}
+
+/// Portable per-channel bitwise inversion (alpha included).
+#[inline]
+pub fn _invert_impl(base: Rgba) -> Rgba {
+    Rgba {
+        r: !base.r,
+        g: !base.g,
+        b: !base.b,
+        a: !base.a,
+    }
+}
diff --git a/src/arch/mod.rs b/src/arch/mod.rs
new file mode 100644
index 000000000..ccac3865a
--- /dev/null
+++ b/src/arch/mod.rs
@@ -0,0 +1,46 @@
+#![allow(clippy::cast_lossless)]
+#![allow(clippy::wildcard_imports)]
+#![allow(dead_code)]
+
+mod aarch64;
+mod manual;
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+mod x86;
+
+use crate::Rgba;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use std::is_x86_feature_detected;
+
+/// Alpha-composites `other` over `base`, using the x86 SIMD implementation when the running CPU
+/// supports it and the portable implementation otherwise.
+#[inline]
+pub fn merge_impl(base: Rgba, other: Rgba) -> Rgba {
+    // Optimize for common cases
+    if other.a == 255 {
+        return other;
+    } else if other.a == 0 {
+        return base;
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if is_x86_feature_detected!("sse") && is_x86_feature_detected!("fma") {
+        // SAFETY: both target features required by `x86::_merge_impl` were detected above.
+        return unsafe { x86::_merge_impl(base, other) };
+    }
+
+    manual::_merge_impl(base, other)
+}
+
+/// Inverts every channel of `base` (alpha included), using the x86 SIMD implementation when the
+/// running CPU supports it and the portable implementation otherwise.
+#[inline]
+pub fn invert_impl(base: Rgba) -> Rgba {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if is_x86_feature_detected!("sse") {
+        // SAFETY: the target feature required by `x86::_invert_impl` was detected above.
+        return unsafe { x86::_invert_impl(base) };
+    }
+
+    manual::_invert_impl(base)
+}
diff --git a/src/arch/x86.rs b/src/arch/x86.rs
new file mode 100644
index 000000000..f958db4c6
--- /dev/null
+++ b/src/arch/x86.rs
@@ -0,0 +1,95 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+use crate::Rgba;
+
+/// Copies the four lanes of `v` into an array.
+///
+/// Uses an unaligned store: `[f32; 4]` is only guaranteed 4-byte alignment, whereas
+/// `_mm_store_ps` requires a 16-byte aligned destination.
+#[inline]
+#[target_feature(enable = "sse")]
+unsafe fn lanes(v: __m128) -> [f32; 4] {
+    let mut out = [0_f32; 4];
+    _mm_storeu_ps(out.as_mut_ptr(), v);
+    out
+}
+
+/// SSE/FMA alpha-compositing of `other` over `base`; mirrors `manual::_merge_impl`.
+///
+/// # Safety
+///
+/// The running CPU must support the `sse` and `fma` target features.
+#[target_feature(enable = "sse")]
+#[target_feature(enable = "fma")]
+pub unsafe fn _merge_impl(base: Rgba, other: Rgba) -> Rgba {
+    let scale = _mm_set1_ps(255.);
+
+    // Normalize both pixels to the 0.0..=1.0 range.
+    let [base_r, base_g, base_b, base_a] = lanes(_mm_div_ps(
+        _mm_setr_ps(base.r as f32, base.g as f32, base.b as f32, base.a as f32),
+        scale,
+    ));
+    let [overlay_r, overlay_g, overlay_b, overlay_a] = lanes(_mm_div_ps(
+        _mm_setr_ps(
+            other.r as f32,
+            other.g as f32,
+            other.b as f32,
+            other.a as f32,
+        ),
+        scale,
+    ));
+    let a_diff = 1. - overlay_a;
+
+    // Premultiply the overlay color by its alpha; the last lane yields `a_diff * base_a`.
+    let [overlay_r, overlay_g, overlay_b, a_ratio] = lanes(_mm_mul_ps(
+        _mm_setr_ps(overlay_r, overlay_g, overlay_b, base_a),
+        _mm_setr_ps(overlay_a, overlay_a, overlay_a, a_diff),
+    ));
+
+    // Color lanes: `a_ratio * base + overlay`; alpha lane: `a_diff * base_a + overlay_a`.
+    let [r, g, b, a] = lanes(_mm_fmadd_ps(
+        _mm_setr_ps(a_ratio, a_ratio, a_ratio, a_diff),
+        _mm_setr_ps(base_r, base_g, base_b, base_a),
+        _mm_setr_ps(overlay_r, overlay_g, overlay_b, overlay_a),
+    ));
+
+    // Divide the color lanes by the resulting alpha (alpha itself by 1), then rescale.
+    let [r, g, b, a] = lanes(_mm_mul_ps(
+        _mm_div_ps(_mm_setr_ps(r, g, b, a), _mm_setr_ps(a, a, a, 1.)),
+        scale,
+    ));
+
+    Rgba {
+        r: r as u8,
+        g: g as u8,
+        b: b as u8,
+        a: a as u8,
+    }
+}
+
+/// SSE per-channel inversion (alpha included); equivalent to `manual::_invert_impl`.
+///
+/// Each channel is computed as `255 - x`, which equals the bitwise NOT of a `u8` and is exact
+/// in `f32`. XOR-ing the float lanes with an all-ones mask would flip the IEEE-754 bit
+/// pattern instead, producing NaN or negative values that all cast to 0.
+///
+/// # Safety
+///
+/// The running CPU must support the `sse` target feature.
+#[target_feature(enable = "sse")]
+pub unsafe fn _invert_impl(base: Rgba) -> Rgba {
+    let [r, g, b, a] = lanes(_mm_sub_ps(
+        _mm_set1_ps(255.),
+        _mm_setr_ps(base.r as f32, base.g as f32, base.b as f32, base.a as f32),
+    ));
+
+    Rgba {
+        r: r as u8,
+        g: g as u8,
+        b: b as u8,
+        a: a as u8,
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index ad20623a8..7cca54251 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -261,6 +261,7 @@
     clippy::doc_markdown
 )]
 
+mod arch;
 pub mod draw;
 pub mod encode;
 pub mod encodings;
diff --git a/src/pixel.rs b/src/pixel.rs
index ab8e2fbc4..0bdb9d4ed 100644
--- a/src/pixel.rs
+++ b/src/pixel.rs
@@ -1,10 +1,10 @@
 //! Encloses pixel-related traits and pixel type implementations.
 
-use crate::Error::DecodingError;
 use crate::{
+    arch,
     encodings::ColorType,
     image::OverlayMode,
-    Error::{InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType},
+    Error::{DecodingError, InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType},
     Result,
 };
 use std::borrow::Cow;
@@ -625,11 +625,7 @@ impl Pixel for Rgb {
     type Data = [u8; 3];
 
     fn inverted(&self) -> Self {
-        Self {
-            r: !self.r,
-            g: !self.g,
-            b: !self.b,
-        }
+        arch::invert_impl((*self).into()).into()
     }
 
     fn map_subpixels(self, f: F, _: A) -> Self
@@ -791,12 +787,7 @@ impl Pixel for Rgba {
     type Data = [u8; 4];
 
     fn inverted(&self) -> Self {
-        Self {
-            r: !self.r,
-            g: !self.g,
-            b: !self.b,
-            a: !self.a,
-        }
+        arch::invert_impl(*self)
     }
 
     fn map_subpixels(self, f: F, a: A) -> Self
@@ -875,44 +866,8 @@ impl Pixel for Rgba {
         [self.r, self.g, self.b, self.a]
     }
 
-    // TODO: SIMD could speed this up significantly
-    #[allow(clippy::cast_lossless)]
     fn merge(self, other: Self) -> Self {
-        // Optimize for common cases
-        if other.a == 255 {
-            return other;
-        } else if other.a == 0 {
-            return self;
-        }
-
-        let (base_r, base_g, base_b, base_a) = (
-            self.r as f32 / 255.,
-            self.g as f32 / 255.,
-            self.b as f32 / 255.,
-            self.a as f32 / 255.,
-        );
-
-        let (overlay_r, overlay_g, overlay_b, overlay_a) = (
-            other.r as f32 / 255.,
-            other.g as f32 / 255.,
-            other.b as f32 / 255.,
-            other.a as f32 / 255.,
-        );
-
-        let a_diff = 1. - overlay_a;
-        let a = a_diff.mul_add(base_a, overlay_a);
-
-        let a_ratio = a_diff * base_a;
-        let r = a_ratio.mul_add(base_r, overlay_a * overlay_r) / a;
-        let g = a_ratio.mul_add(base_g, overlay_a * overlay_g) / a;
-        let b = a_ratio.mul_add(base_b, overlay_a * overlay_b) / a;
-
-        Self {
-            r: (r * 255.) as u8,
-            g: (g * 255.) as u8,
-            b: (b * 255.) as u8,
-            a: (a * 255.) as u8,
-        }
+        arch::merge_impl(self, other)
     }
 
     #[allow(clippy::cast_lossless)]