diff --git a/Cargo.toml b/Cargo.toml
index 05e12aee5..3e0db7e1c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -37,6 +37,9 @@ quantize = ["dep:color_quant"]
 gradient = ["dep:colorgrad"]
 static = ["libwebp-sys2?/static"]
 
+[build-dependencies]
+rustc_version = "0.4"
+
 [dev-dependencies]
 criterion = "^0.4"
 image = "^0"
diff --git a/build.rs b/build.rs
new file mode 100644
index 000000000..9506529b5
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,10 @@
+use rustc_version::{version_meta, Channel};
+
+/// Build script entry point: emits `cfg(RUSTC_IS_NIGHTLY)` when the crate is being
+/// compiled by a nightly `rustc`, and emits nothing on any other channel.
+fn main() {
+    let meta = version_meta().expect("failed to query rustc version metadata");
+    if matches!(meta.channel, Channel::Nightly) {
+        println!("cargo:rustc-cfg=RUSTC_IS_NIGHTLY");
+    }
+}
diff --git a/src/arch/aarch64.rs b/src/arch/aarch64.rs
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/src/arch/aarch64.rs
@@ -0,0 +1 @@
+
diff --git a/src/arch/manual.rs b/src/arch/manual.rs
new file mode 100644
index 000000000..8264b0f25
--- /dev/null
+++ b/src/arch/manual.rs
@@ -0,0 +1,42 @@
+use crate::Rgba;
+
+/// Portable alpha compositing of `other` over `base` in scalar `f32` math.
+///
+/// Channels are scaled to `0.0..=1.0`, blended, then scaled back to `0..=255`
+/// and truncated by the `as u8` casts.
+///
+/// This is the fallback used when no SIMD implementation is selected.
+pub fn _merge_impl(base: Rgba, other: Rgba) -> Rgba {
+    // Maps a byte channel onto the unit range.
+    let unit = |channel: u8| channel as f32 / 255.;
+
+    let below_alpha = unit(base.a);
+    let above_alpha = unit(other.a);
+
+    // Share of the base that stays visible underneath the overlay.
+    let see_through = 1. - above_alpha;
+    let out_alpha = see_through.mul_add(below_alpha, above_alpha);
+    let below_weight = see_through * below_alpha;
+
+    // Blends one colour channel and normalises it by the resulting alpha.
+    let mix = |below: u8, above: u8| {
+        below_weight.mul_add(unit(below), above_alpha * unit(above)) / out_alpha
+    };
+
+    Rgba {
+        r: (mix(base.r, other.r) * 255.) as u8,
+        g: (mix(base.g, other.g) * 255.) as u8,
+        b: (mix(base.b, other.b) * 255.) as u8,
+        a: (out_alpha * 255.) as u8,
+    }
+}
+
+/// Portable channel inversion: applies a bitwise NOT to all four channels.
+///
+/// For a `u8` this equals `255 - channel`; the alpha channel is inverted as well.
+#[inline]
+pub fn _invert_impl(base: Rgba) -> Rgba {
+    let [r, g, b, a] = [base.r, base.g, base.b, base.a].map(|channel| !channel);
+
+    Rgba { r, g, b, a }
+}
diff --git a/src/arch/mod.rs b/src/arch/mod.rs
new file mode 100644
index 000000000..ccac3865a
--- /dev/null
+++ b/src/arch/mod.rs
@@ -0,0 +1,43 @@
+#![allow(clippy::cast_lossless)]
+#![allow(clippy::wildcard_imports)]
+#![allow(dead_code)]
+
+mod aarch64;
+mod manual;
+mod x86;
+
+use crate::Rgba;
+
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+use std::is_x86_feature_detected;
+
+/// Composites `other` over `base`, using the x86 SSE/FMA path when the CPU supports it.
+#[inline]
+pub fn merge_impl(base: Rgba, other: Rgba) -> Rgba {
+    // A fully opaque or fully transparent overlay needs no arithmetic at all.
+    match other.a {
+        255 => return other,
+        0 => return base,
+        _ => {}
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if is_x86_feature_detected!("sse") && is_x86_feature_detected!("fma") {
+        // SAFETY: both target features enabled on `x86::_merge_impl` were just detected.
+        return unsafe { x86::_merge_impl(base, other) };
+    }
+
+    manual::_merge_impl(base, other)
+}
+
+/// Inverts `base`, preferring the x86 implementation when SSE is detected at runtime.
+#[inline]
+pub fn invert_impl(base: Rgba) -> Rgba {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if is_x86_feature_detected!("sse") {
+        // SAFETY: the only target feature enabled on `x86::_invert_impl` was just detected.
+        return unsafe { x86::_invert_impl(base) };
+    }
+
+    manual::_invert_impl(base)
+}
diff --git a/src/arch/x86.rs b/src/arch/x86.rs
new file mode 100644
index 000000000..f958db4c6
--- /dev/null
+++ b/src/arch/x86.rs
@@ -0,0 +1,105 @@
+#[cfg(target_arch = "x86")]
+use std::arch::x86::*;
+#[cfg(target_arch = "x86_64")]
+use std::arch::x86_64::*;
+
+use crate::Rgba;
+
+const ONES: f32 = unsafe { std::mem::transmute(0xff_ff_ff_ff_u32) };
+
+/// Alpha-composites `other` over `base` using SSE and FMA intrinsics.
+///
+/// Evaluates the same formula as the scalar implementation on channels
+/// scaled to `0.0..=1.0`:
+/// - `a = (1 - oa) * ba + oa`
+/// - `c = ((1 - oa) * ba * bc + oa * oc) / a` for each colour channel
+///
+/// where `oa`/`oc` belong to `other` and `ba`/`bc` to `base`. Results are
+/// scaled back to `0..=255` and truncated.
+///
+/// # Safety
+///
+/// The executing CPU must support both the `sse` and `fma` target features.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[target_feature(enable = "sse")]
+#[target_feature(enable = "fma")]
+pub unsafe fn _merge_impl(base: Rgba, other: Rgba) -> Rgba {
+    // Scratch buffer for moving lanes between registers and scalars. A
+    // `[f32; 4]` is only guaranteed 4-byte alignment, so it must be written
+    // with `_mm_storeu_ps`; `_mm_store_ps` requires a 16-byte aligned address.
+    let mut lanes = [0_f32; 4];
+    let scale = _mm_set1_ps(255.);
+
+    // Normalise both pixels to the unit range.
+    let base_px = _mm_setr_ps(base.r as f32, base.g as f32, base.b as f32, base.a as f32);
+    _mm_storeu_ps(lanes.as_mut_ptr(), _mm_div_ps(base_px, scale));
+    let [base_r, base_g, base_b, base_a] = lanes;
+
+    let overlay_px = _mm_setr_ps(
+        other.r as f32,
+        other.g as f32,
+        other.b as f32,
+        other.a as f32,
+    );
+    _mm_storeu_ps(lanes.as_mut_ptr(), _mm_div_ps(overlay_px, scale));
+    let [overlay_r, overlay_g, overlay_b, overlay_a] = lanes;
+    let a_diff = 1. - overlay_a;
+
+    // Premultiply the overlay colours by its alpha; the fourth lane yields
+    // the weight of the base colours, `(1 - oa) * ba`.
+    let weighted = _mm_mul_ps(
+        _mm_setr_ps(overlay_r, overlay_g, overlay_b, base_a),
+        _mm_setr_ps(overlay_a, overlay_a, overlay_a, a_diff),
+    );
+    _mm_storeu_ps(lanes.as_mut_ptr(), weighted);
+    let [overlay_r, overlay_g, overlay_b, a_ratio] = lanes;
+
+    // One fused multiply-add produces the three colour numerators and, in
+    // the fourth lane, the output alpha.
+    let blended = _mm_fmadd_ps(
+        _mm_setr_ps(a_ratio, a_ratio, a_ratio, a_diff),
+        _mm_setr_ps(base_r, base_g, base_b, base_a),
+        _mm_setr_ps(overlay_r, overlay_g, overlay_b, overlay_a),
+    );
+    _mm_storeu_ps(lanes.as_mut_ptr(), blended);
+    let [r, g, b, a] = lanes;
+
+    // Divide the colours by the output alpha (alpha itself by 1) and scale
+    // everything back to the byte range.
+    let scaled = _mm_mul_ps(
+        _mm_div_ps(_mm_setr_ps(r, g, b, a), _mm_setr_ps(a, a, a, 1.)),
+        scale,
+    );
+    _mm_storeu_ps(lanes.as_mut_ptr(), scaled);
+    let [r, g, b, a] = lanes;
+
+    Rgba {
+        r: r as u8,
+        g: g as u8,
+        b: b as u8,
+        a: a as u8,
+    }
+}
+
+/// Inverts every channel of `base`, alpha included.
+///
+/// Inverting a `u8` channel is a bitwise NOT (`255 - c`), so it is applied
+/// to the bytes directly. XOR-ing the channels as `f32` lanes with an
+/// all-ones mask would flip the bits of their IEEE-754 encoding instead,
+/// giving NaN or a negative value that the `as u8` cast turns into `0`.
+///
+/// # Safety
+///
+/// Compiled with the `sse` target feature enabled, so the executing CPU
+/// must support SSE.
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+#[target_feature(enable = "sse")]
+pub unsafe fn _invert_impl(base: Rgba) -> Rgba {
+    // Plain integer NOT per channel; no float round-trip is needed.
+    Rgba {
+        r: !base.r,
+        g: !base.g,
+        b: !base.b,
+        a: !base.a,
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index ad20623a8..7cca54251 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -261,6 +261,7 @@
     clippy::doc_markdown
 )]
 
+mod arch;
 pub mod draw;
 pub mod encode;
 pub mod encodings;
diff --git a/src/pixel.rs b/src/pixel.rs
index ab8e2fbc4..0bdb9d4ed 100644
--- a/src/pixel.rs
+++ b/src/pixel.rs
@@ -1,10 +1,10 @@
 //! Encloses pixel-related traits and pixel type implementations.
 
-use crate::Error::DecodingError;
 use crate::{
+    arch,
     encodings::ColorType,
     image::OverlayMode,
-    Error::{InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType},
+    Error::{DecodingError, InvalidHexCode, InvalidPaletteIndex, UnsupportedColorType},
     Result,
 };
 use std::borrow::Cow;
@@ -625,11 +625,7 @@ impl Pixel for Rgb {
     type Data = [u8; 3];
 
     fn inverted(&self) -> Self {
-        Self {
-            r: !self.r,
-            g: !self.g,
-            b: !self.b,
-        }
+        arch::invert_impl((*self).into()).into()
     }
 
     fn map_subpixels<F, A>(self, f: F, _: A) -> Self
@@ -791,12 +787,7 @@ impl Pixel for Rgba {
     type Data = [u8; 4];
 
     fn inverted(&self) -> Self {
-        Self {
-            r: !self.r,
-            g: !self.g,
-            b: !self.b,
-            a: !self.a,
-        }
+        arch::invert_impl(*self)
     }
 
     fn map_subpixels<F, A>(self, f: F, a: A) -> Self
@@ -875,44 +866,8 @@ impl Pixel for Rgba {
         [self.r, self.g, self.b, self.a]
     }
 
-    // TODO: SIMD could speed this up significantly
-    #[allow(clippy::cast_lossless)]
     fn merge(self, other: Self) -> Self {
-        // Optimize for common cases
-        if other.a == 255 {
-            return other;
-        } else if other.a == 0 {
-            return self;
-        }
-
-        let (base_r, base_g, base_b, base_a) = (
-            self.r as f32 / 255.,
-            self.g as f32 / 255.,
-            self.b as f32 / 255.,
-            self.a as f32 / 255.,
-        );
-
-        let (overlay_r, overlay_g, overlay_b, overlay_a) = (
-            other.r as f32 / 255.,
-            other.g as f32 / 255.,
-            other.b as f32 / 255.,
-            other.a as f32 / 255.,
-        );
-
-        let a_diff = 1. - overlay_a;
-        let a = a_diff.mul_add(base_a, overlay_a);
-
-        let a_ratio = a_diff * base_a;
-        let r = a_ratio.mul_add(base_r, overlay_a * overlay_r) / a;
-        let g = a_ratio.mul_add(base_g, overlay_a * overlay_g) / a;
-        let b = a_ratio.mul_add(base_b, overlay_a * overlay_b) / a;
-
-        Self {
-            r: (r * 255.) as u8,
-            g: (g * 255.) as u8,
-            b: (b * 255.) as u8,
-            a: (a * 255.) as u8,
-        }
+        arch::merge_impl(self, other)
     }
 
     #[allow(clippy::cast_lossless)]