Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ quantize = ["dep:color_quant"]
gradient = ["dep:colorgrad"]
static = ["libwebp-sys2?/static"]

[build-dependencies]
rustc_version = "0.4"

[dev-dependencies]
criterion = "^0.4"
image = "^0"
Expand Down
10 changes: 10 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
use rustc_version::{version, version_meta, Channel};

/// Build-script entry point.
///
/// Emits the `RUSTC_IS_NIGHTLY` cfg flag when the crate is being compiled by a nightly
/// toolchain, so the library can gate code on it.
///
/// # Panics
///
/// Panics if `version()` or `version_meta()` fails, or if the reported major version is
/// below 1.
fn main() {
    assert!(version().unwrap().major >= 1);

    let toolchain = version_meta().unwrap();
    if matches!(toolchain.channel, Channel::Nightly) {
        println!("cargo:rustc-cfg=RUSTC_IS_NIGHTLY");
    }
}
1 change: 1 addition & 0 deletions src/arch/aarch64.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

42 changes: 42 additions & 0 deletions src/arch/manual.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
use crate::Rgba;

/// Portable (non-SIMD) implementation of merging `other` on top of `base`.
///
/// Every channel is first normalised to the `0.0..=1.0` range. The resulting alpha is
/// `overlay_a + (1 - overlay_a) * base_a`; each colour channel is the alpha-weighted mix of
/// the two inputs divided by that resulting alpha. The results are scaled back by 255 and
/// converted with `as u8`, which truncates toward zero and saturates.
pub fn _merge_impl(base: Rgba, other: Rgba) -> Rgba {
    let unit = |channel: u8| channel as f32 / 255.;
    let quantize = |value: f32| (value * 255.) as u8;

    let src_alpha = unit(other.a);
    let dst_alpha = unit(base.a);

    // Share of the result that is still contributed by the base pixel.
    let inv_src_alpha = 1. - src_alpha;
    let out_alpha = inv_src_alpha.mul_add(dst_alpha, src_alpha);
    let dst_weight = inv_src_alpha * dst_alpha;

    let blend = |dst: u8, src: u8| {
        quantize(dst_weight.mul_add(unit(dst), src_alpha * unit(src)) / out_alpha)
    };

    Rgba {
        r: blend(base.r, other.r),
        g: blend(base.g, other.g),
        b: blend(base.b, other.b),
        a: quantize(out_alpha),
    }
}

/// Portable implementation of pixel inversion.
///
/// Returns `base` with each of its four channels (alpha included) bitwise-complemented,
/// i.e. every channel value `v` becomes `255 - v`.
#[inline]
pub fn _invert_impl(base: Rgba) -> Rgba {
    let Rgba { r, g, b, a } = base;

    Rgba {
        r: !r,
        g: !g,
        b: !b,
        a: !a,
    }
}
43 changes: 43 additions & 0 deletions src/arch/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#![allow(clippy::cast_lossless)]
#![allow(clippy::wildcard_imports)]
#![allow(dead_code)]

mod aarch64;
mod manual;
mod x86;

use crate::Rgba;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
use std::is_x86_feature_detected;

/// Merges `other` on top of `base`.
///
/// A fully opaque overlay is returned as-is and a fully transparent overlay leaves `base`
/// untouched. Otherwise the call is dispatched to the x86 SIMD implementation when the CPU
/// reports both `sse` and `fma` at runtime, and to the portable implementation in every
/// other case.
#[inline]
pub fn merge_impl(base: Rgba, other: Rgba) -> Rgba {
    // Optimize for common cases: these need no arithmetic at all.
    match other.a {
        255 => return other,
        0 => return base,
        _ => {}
    }

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("sse") && is_x86_feature_detected!("fma") {
            // SAFETY: the CPU features `x86::_merge_impl` is compiled for were detected above.
            return unsafe { x86::_merge_impl(base, other) };
        }
    }

    manual::_merge_impl(base, other)
}

/// Inverts `base`.
///
/// Dispatches to the x86 SIMD implementation when the CPU reports `sse` at runtime and to
/// the portable implementation otherwise.
#[inline]
pub fn invert_impl(base: Rgba) -> Rgba {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("sse") {
            // SAFETY: the CPU feature `x86::_invert_impl` is compiled for was detected above.
            return unsafe { x86::_invert_impl(base) };
        }
    }

    manual::_invert_impl(base)
}
105 changes: 105 additions & 0 deletions src/arch/x86.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#[cfg(target_arch = "x86")]
use std::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;

use crate::Rgba;

// A 32-bit value with every bit set, reinterpreted as an `f32`. It is intended as a bit mask
// for bitwise SIMD operations, not as a number: read as a float, this bit pattern is a NaN.
const ONES: f32 = unsafe { std::mem::transmute(0xff_ff_ff_ff_u32) };

/// Copies the four lanes of `v` into a plain array.
///
/// Uses an unaligned store: a `[f32; 4]` is only guaranteed 4-byte alignment, whereas
/// `_mm_store_ps` requires its destination to be 16-byte aligned.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse")]
unsafe fn lanes(v: __m128) -> [f32; 4] {
    let mut out = [0_f32; 4];
    // SAFETY: `out` is exactly 16 writable bytes and `_mm_storeu_ps` has no alignment
    // requirement.
    _mm_storeu_ps(out.as_mut_ptr(), v);
    out
}

/// SSE/FMA implementation of merging `other` on top of `base`.
///
/// Performs the same arithmetic as the portable implementation, four lanes at a time: the
/// channels are normalised to `0.0..=1.0`, the resulting alpha is
/// `overlay_a + (1 - overlay_a) * base_a`, and each colour channel is the alpha-weighted mix
/// of both inputs divided by the resulting alpha. Results are scaled by 255 and converted
/// with `as u8` (truncating, saturating).
///
/// # Safety
///
/// The caller must ensure that the running CPU supports the `sse` and `fma` features.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse")]
#[target_feature(enable = "fma")]
pub unsafe fn _merge_impl(base: Rgba, other: Rgba) -> Rgba {
    let scale = _mm_set1_ps(255.);

    // Normalise both pixels to the unit range.
    let [base_r, base_g, base_b, base_a] = lanes(_mm_div_ps(
        _mm_setr_ps(base.r as f32, base.g as f32, base.b as f32, base.a as f32),
        scale,
    ));
    let [overlay_r, overlay_g, overlay_b, overlay_a] = lanes(_mm_div_ps(
        _mm_setr_ps(
            other.r as f32,
            other.g as f32,
            other.b as f32,
            other.a as f32,
        ),
        scale,
    ));

    let a_diff = 1. - overlay_a;

    // Lanes 0-2: overlay colour premultiplied by the overlay alpha.
    // Lane 3: weight of the base pixel, `(1 - overlay_a) * base_a`.
    let [overlay_r, overlay_g, overlay_b, a_ratio] = lanes(_mm_mul_ps(
        _mm_setr_ps(overlay_r, overlay_g, overlay_b, base_a),
        _mm_setr_ps(overlay_a, overlay_a, overlay_a, a_diff),
    ));

    // Lanes 0-2: un-normalised colour, `a_ratio * base + overlay_a * overlay`.
    // Lane 3: resulting alpha, `a_diff * base_a + overlay_a`.
    let [r, g, b, a] = lanes(_mm_fmadd_ps(
        _mm_setr_ps(a_ratio, a_ratio, a_ratio, a_diff),
        _mm_setr_ps(base_r, base_g, base_b, base_a),
        _mm_setr_ps(overlay_r, overlay_g, overlay_b, overlay_a),
    ));

    // Divide the colours by the resulting alpha (alpha itself by 1) and scale back to 0..=255.
    let [r, g, b, a] = lanes(_mm_mul_ps(
        _mm_div_ps(_mm_setr_ps(r, g, b, a), _mm_setr_ps(a, a, a, 1.)),
        scale,
    ));

    Rgba {
        r: r as u8,
        g: g as u8,
        b: b as u8,
        a: a as u8,
    }
}

/// SSE implementation of pixel inversion.
///
/// Returns `base` with each of its four channels (alpha included) bitwise-complemented.
///
/// The four channel bytes are packed into a single 32-bit lane and XOR-ed with an all-ones
/// mask. The XOR must act on the raw bytes: applying it to the channels after converting
/// them to `f32` would flip the bits of the IEEE-754 encoding instead, producing negative or
/// NaN floats that `as u8` converts to 0.
///
/// # Safety
///
/// The caller must ensure that the running CPU supports the `sse` feature.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse")]
pub unsafe fn _invert_impl(base: Rgba) -> Rgba {
    let packed = u32::from_ne_bytes([base.r, base.g, base.b, base.a]);

    // Comparing a register with itself for equality sets every bit of every lane.
    let zero = _mm_setzero_ps();
    let all_ones = _mm_cmpeq_ps(zero, zero);

    // SAFETY: `[u32; 4]` and `__m128` are both 16 bytes wide and every bit pattern is valid
    // for either type. Transmuting (rather than passing through `f32` values) keeps the
    // bytes intact even when they happen to form a NaN encoding.
    let lanes: __m128 = std::mem::transmute([packed, 0_u32, 0_u32, 0_u32]);
    let flipped: [u32; 4] = std::mem::transmute(_mm_xor_ps(lanes, all_ones));

    // Only lane 0 carries the pixel; the remaining lanes are padding.
    let [r, g, b, a] = flipped[0].to_ne_bytes();

    Rgba { r, g, b, a }
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@
clippy::doc_markdown
)]

mod arch;
pub mod draw;
pub mod encode;
pub mod encodings;
Expand Down
55 changes: 5 additions & 50 deletions src/pixel.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
//! Encloses pixel-related traits and pixel type implementations.

use crate::Error::DecodingError;
use crate::{
arch,
encodings::ColorType,
image::OverlayMode,
Error::{InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType},
Error::{DecodingError, InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType},
Result,
};
use std::borrow::Cow;
Expand Down Expand Up @@ -625,11 +625,7 @@ impl Pixel for Rgb {
type Data = [u8; 3];

fn inverted(&self) -> Self {
Self {
r: !self.r,
g: !self.g,
b: !self.b,
}
arch::invert_impl((*self).into()).into()
}

fn map_subpixels<F, A>(self, f: F, _: A) -> Self
Expand Down Expand Up @@ -791,12 +787,7 @@ impl Pixel for Rgba {
type Data = [u8; 4];

fn inverted(&self) -> Self {
Self {
r: !self.r,
g: !self.g,
b: !self.b,
a: !self.a,
}
arch::invert_impl(*self)
}

fn map_subpixels<F, A>(self, f: F, a: A) -> Self
Expand Down Expand Up @@ -875,44 +866,8 @@ impl Pixel for Rgba {
[self.r, self.g, self.b, self.a]
}

// TODO: SIMD could speed this up significantly
#[allow(clippy::cast_lossless)]
fn merge(self, other: Self) -> Self {
// Optimize for common cases
if other.a == 255 {
return other;
} else if other.a == 0 {
return self;
}

let (base_r, base_g, base_b, base_a) = (
self.r as f32 / 255.,
self.g as f32 / 255.,
self.b as f32 / 255.,
self.a as f32 / 255.,
);

let (overlay_r, overlay_g, overlay_b, overlay_a) = (
other.r as f32 / 255.,
other.g as f32 / 255.,
other.b as f32 / 255.,
other.a as f32 / 255.,
);

let a_diff = 1. - overlay_a;
let a = a_diff.mul_add(base_a, overlay_a);

let a_ratio = a_diff * base_a;
let r = a_ratio.mul_add(base_r, overlay_a * overlay_r) / a;
let g = a_ratio.mul_add(base_g, overlay_a * overlay_g) / a;
let b = a_ratio.mul_add(base_b, overlay_a * overlay_b) / a;

Self {
r: (r * 255.) as u8,
g: (g * 255.) as u8,
b: (b * 255.) as u8,
a: (a * 255.) as u8,
}
arch::merge_impl(self, other)
}

#[allow(clippy::cast_lossless)]
Expand Down