diff --git a/.github/workflows/fuzzing.yml b/.github/workflows/fuzzing.yml
index 799bc3d..0fce51e 100644
--- a/.github/workflows/fuzzing.yml
+++ b/.github/workflows/fuzzing.yml
@@ -36,6 +36,9 @@ jobs:
         mkdir -p build_fuzz/corpus_fuzzing/fuzz_bitmap_file_corpus
         mkdir -p build_fuzz/corpus_fuzzing/fuzz_image_operations_corpus
         mkdir -p build_fuzz/corpus_fuzzing/fuzz_matrix_corpus
+        mkdir -p build_fuzz/corpus_fuzzing/fuzz_convert_bgr_to_bgra_corpus
+        mkdir -p build_fuzz/corpus_fuzzing/fuzz_swizzle_bgra_to_rgba_corpus
+        mkdir -p build_fuzz/corpus_fuzzing/fuzz_swizzle_rgba_to_bgra_corpus
 
     - name: Run fuzz_bitmap
       run: |
@@ -56,3 +59,15 @@ jobs:
     - name: Run fuzz_matrix
       run: |
         ./build_fuzz/tests/fuzz_matrix -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_matrix_corpus/
+    
+    - name: Run fuzz_convert_bgr_to_bgra
+      run: |
+        ./build_fuzz/tests/fuzz_convert_bgr_to_bgra -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_convert_bgr_to_bgra_corpus/
+    
+    - name: Run fuzz_swizzle_bgra_to_rgba
+      run: |
+        ./build_fuzz/tests/fuzz_swizzle_bgra_to_rgba -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_swizzle_bgra_to_rgba_corpus/
+
+    - name: Run fuzz_swizzle_rgba_to_bgra
+      run: |
+        ./build_fuzz/tests/fuzz_swizzle_rgba_to_bgra -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_swizzle_rgba_to_bgra_corpus/
diff --git a/src/bitmap/bitmap.cpp b/src/bitmap/bitmap.cpp
index 9a46a3b..f248681 100644
--- a/src/bitmap/bitmap.cpp
+++ b/src/bitmap/bitmap.cpp
@@ -2,6 +2,8 @@
 #include <iostream>  // For standard I/O (though not explicitly used in this file's current state).
 #include <algorithm> // For std::min and std::max, used in ApplyBoxBlur and color adjustments.
 #include <vector>    // For std::vector, used by Matrix class and underlying bitmap data.
+#include <cstring>   // For std::memcpy, used in SIMD NEON path
+#include "../simd_utils.hpp" // Added include
 
 // Define BI_RGB as 0 if not already defined, to ensure cross-platform compatibility for bitmap compression type.
 #ifndef BI_RGB
@@ -183,6 +185,73 @@ Bitmap::File ApplyBoxBlur(Bitmap::File bitmapFile, int blurRadius)
     return CreateBitmapFromMatrix(blurredMatrix);
 }
 
+// Helper function for BGR to BGRA conversion with SIMD (declaration in bitmap.h)
+void internal_convert_bgr_to_bgra_simd(const uint8_t* src_row_bgr_ptr, ::Pixel* dest_row_pixel_ptr, size_t num_pixels_in_row) {
+    size_t current_src_byte_offset = 0;
+    size_t current_dest_pixel_idx = 0;
+    size_t num_pixels_to_process = num_pixels_in_row;
+
+#if defined(__AVX2__)
+    // As per previous implementation, AVX2 uses SSSE3 logic.
+    // No distinct 256-bit AVX2 path implemented here.
+#endif
+
+#if defined(__AVX2__) || defined(__SSSE3__) // Use SSSE3 for AVX2 as well if no specific AVX2 code
+    const size_t pixels_per_step = 4; 
+    __m128i bgr_to_bgrX_mask = _mm_setr_epi8(
+        0, 1, 2, (char)0x80, 
+        3, 4, 5, (char)0x80, 
+        6, 7, 8, (char)0x80, 
+        9, 10, 11, (char)0x80 
+    );
+    __m128i alpha_channel_ff = _mm_setr_epi8(
+        0,0,0, (char)0xFF, 0,0,0,(char)0xFF, 0,0,0,(char)0xFF, 0,0,0,(char)0xFF
+    );
+
+    while (num_pixels_to_process >= pixels_per_step) {
+        __m128i bgr_data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_row_bgr_ptr + current_src_byte_offset));
+        __m128i bgra_pixels_expanded = _mm_shuffle_epi8(bgr_data, bgr_to_bgrX_mask);
+        __m128i bgra_pixels_final = _mm_or_si128(bgra_pixels_expanded, alpha_channel_ff);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_row_pixel_ptr + current_dest_pixel_idx), bgra_pixels_final);
+        current_src_byte_offset += pixels_per_step * 3; 
+        current_dest_pixel_idx += pixels_per_step;    
+        num_pixels_to_process -= pixels_per_step;
+    }
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    const size_t pixels_per_step = 4;
+    const uint8_t table_bgr_to_bgr0[] = {0,1,2,16, 3,4,5,16, 6,7,8,16, 9,10,11,16}; 
+    uint8x16_t neon_shuffle_table = vld1q_u8(table_bgr_to_bgr0);
+    const uint8_t alpha_bytes[] = {0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF};
+    uint8x16_t alpha_channel_ff_neon = vld1q_u8(alpha_bytes);
+    uint8_t temp_bgr_load[16];
+
+    while (num_pixels_to_process >= pixels_per_step) {
+        std::memcpy(temp_bgr_load, src_row_bgr_ptr + current_src_byte_offset, 12);
+        uint8x16_t bgr_data_loaded = vld1q_u8(temp_bgr_load);
+        uint8x16_t bgra_expanded = vqtbl1q_u8(bgr_data_loaded, neon_shuffle_table);
+        uint8x16_t bgra_final = vorrq_u8(bgra_expanded, alpha_channel_ff_neon);
+        vst1q_u8(reinterpret_cast<uint8_t*>(dest_row_pixel_ptr + current_dest_pixel_idx), bgra_final);
+        current_src_byte_offset += pixels_per_step * 3;
+        current_dest_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#else 
+    // This #else block ensures that if no SIMD path is taken (e.g. SSSE3/NEON not defined, or AVX2 defined but its specific block is empty and it's not grouped with SSSE3),
+    // the scalar loop below is the ONLY path for processing.
+    // The current structure with #if defined(__AVX2__) || defined(__SSSE3__) followed by #elif defined(__ARM_NEON)
+    // means this #else is for when NEITHER of those are true.
+    // If AVX2 is defined, it uses the SSSE3 block. If only NEON is defined, it uses NEON block.
+    // If none are defined, it falls to the scalar loop below.
+#endif
+    // Scalar fallback for remaining pixels OR if no SIMD defined/executed above
+    for (size_t k_rem = 0; k_rem < num_pixels_to_process; ++k_rem) {
+        (dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->blue  = *(src_row_bgr_ptr + current_src_byte_offset + k_rem * 3 + 0);
+        (dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->green = *(src_row_bgr_ptr + current_src_byte_offset + k_rem * 3 + 1);
+        (dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->red   = *(src_row_bgr_ptr + current_src_byte_offset + k_rem * 3 + 2);
+        (dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->alpha = 255;
+    }
+}
+
 // Converts a Bitmap::File object (containing raw bitmap data and headers)
 // into a Matrix::Matrix<Pixel> for easier pixel manipulation.
 Matrix::Matrix<Pixel> CreateMatrixFromBitmap(Bitmap::File bitmapFile)
@@ -236,16 +305,16 @@ Matrix::Matrix<Pixel> CreateMatrixFromBitmap(Bitmap::File bitmapFile)
     }
     else if (bitmapFile.bitmapInfoHeader.biBitCount == 24) // For 24-bit bitmaps (BGR)
     {
-        int k = 0; // Index for bitmapFile.bitmapData
-        for (int i = 0; i < imageMatrix.rows(); i++)
-            for (int j = 0; j < imageMatrix.cols(); j++)
-            {
-                imageMatrix[i][j].blue = bitmapFile.bitmapData[k];
-                imageMatrix[i][j].green = bitmapFile.bitmapData[k + 1];
-                imageMatrix[i][j].red = bitmapFile.bitmapData[k + 2];
-                imageMatrix[i][j].alpha = 0; // Default alpha to 0 (opaque) for 24-bit images.
-                k += 3; // Move to the next pixel (3 bytes)
-            }
+        // Calculate padding if necessary, though source data in bitmapFile.bitmapData should be packed according to BMP spec (rows padded to 4 bytes)
+        // However, we process pixel by pixel here from the linear bitmapData.
+        // The number of bytes per row in source data:
+        uint32_t src_bytes_per_row = (static_cast<uint32_t>(imageMatrix.cols()) * 3 + 3) & ~3u; // BMP rows are padded to 4 bytes for BGR
+
+        for (int i = 0; i < imageMatrix.rows(); i++) {
+            const uint8_t* src_row_bgr_ptr = bitmapFile.bitmapData.data() + (static_cast<size_t>(i) * src_bytes_per_row);
+            ::Pixel* dest_row_pixel_ptr = &imageMatrix[i][0];
+            internal_convert_bgr_to_bgra_simd(src_row_bgr_ptr, dest_row_pixel_ptr, imageMatrix.cols());
+        }
     }
     // Note: Other bit depths (e.g., 1, 4, 8, 16-bit) would require more complex handling.
 
diff --git a/src/bitmap/bitmap.h b/src/bitmap/bitmap.h
index 619beee..4f0edd7 100644
--- a/src/bitmap/bitmap.h
+++ b/src/bitmap/bitmap.h
@@ -417,4 +417,7 @@ Pixel ApplySepiaToPixel(Pixel pixel);
 
 // Note: Box blur is applied over a region within ApplyBoxBlur,
 // so it does not have a direct single-pixel helper function here.
+
+// Helper function for BGR to BGRA conversion (primarily for testing)
+void internal_convert_bgr_to_bgra_simd(const uint8_t* src_bgr_data, ::Pixel* dest_bgra_pixels, size_t num_pixels);
 #endif
\ No newline at end of file
diff --git a/src/format/bitmap.cpp b/src/format/bitmap.cpp
index f28e93f..1c0eb06 100644
--- a/src/format/bitmap.cpp
+++ b/src/format/bitmap.cpp
@@ -12,6 +12,8 @@
 #include "../../src/bitmapfile/bitmap_file.h" // For BITMAPFILEHEADER, BITMAPINFOHEADER from external lib
 #include "../../src/bitmap/bitmap.h"         // For ::Pixel, ::CreateMatrixFromBitmap, ::CreateBitmapFromMatrix
 #include "../../src/matrix/matrix.h"         // For Matrix::Matrix
+#include "../simd_utils.hpp" // Added include
+#include "format_internal_helpers.hpp" // Added include for the new helpers
 
 // Define constants for BMP format (can be used by Format::Internal helpers or if save needs them directly)
 constexpr uint16_t BMP_MAGIC_TYPE_CONST = 0x4D42; // 'BM'
@@ -150,20 +152,72 @@ Result<Bitmap, BitmapError> load(std::span<const uint8_t> bmp_data) {
     bmp_out.bpp = 32; 
     bmp_out.data.resize(static_cast<size_t>(bmp_out.w) * bmp_out.h * 4);
 
+    // The loop converting image_matrix to bmp_out.data
+    // bmp_out.data is already resized.
     for (uint32_t y = 0; y < bmp_out.h; ++y) {
-        for (uint32_t x = 0; x < bmp_out.w; ++x) {
-            const ::Pixel& src_pixel = image_matrix.at(y, x); // Changed Get(x,y) to at(y,x)
-            
-            size_t dest_idx = (static_cast<size_t>(y) * bmp_out.w + x) * 4;
-            bmp_out.data[dest_idx + 0] = src_pixel.red;   
-            bmp_out.data[dest_idx + 1] = src_pixel.green; 
-            bmp_out.data[dest_idx + 2] = src_pixel.blue;  
-            bmp_out.data[dest_idx + 3] = src_pixel.alpha; 
-        }
+        const ::Pixel* src_bgra_pixels_row = &image_matrix[y][0];
+        uint8_t* dest_rgba_data_row = bmp_out.data.data() + (static_cast<size_t>(y) * bmp_out.w * 4);
+        internal_swizzle_bgra_to_rgba_simd(src_bgra_pixels_row, dest_rgba_data_row, bmp_out.w);
     }
     return bmp_out;
 }
 
+
+// Helper function to convert an array of ::Pixel (BGRA order) to an array of uint8_t (RGBA order)
+void internal_swizzle_bgra_to_rgba_simd(const ::Pixel* src_bgra_pixels, uint8_t* dest_rgba_data, size_t num_pixels) {
+    size_t current_pixel_idx = 0;
+    size_t num_pixels_to_process = num_pixels;
+
+#if defined(__AVX2__)
+    const size_t pixels_per_step = 8; // 8 pixels = 32 bytes
+    __m256i shuffle_mask_bgra_to_rgba = _mm256_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15, // First 4 pixels (16 bytes)
+        18,17,16,19, 22,21,20,23, 26,25,24,27, 30,29,28,31  // Next 4 pixels (16 bytes)
+    );
+    while (num_pixels_to_process >= pixels_per_step) {
+        __m256i bgra_pixels_loaded = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src_bgra_pixels + current_pixel_idx));
+        __m256i rgba_pixels = _mm256_shuffle_epi8(bgra_pixels_loaded, shuffle_mask_bgra_to_rgba);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest_rgba_data + current_pixel_idx * 4), rgba_pixels);
+        current_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#elif defined(__SSSE3__) // _mm_shuffle_epi8 requires SSSE3
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
+    __m128i shuffle_mask_bgra_to_rgba = _mm_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15
+    );
+    while (num_pixels_to_process >= pixels_per_step) {
+        __m128i bgra_pixels_loaded = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_bgra_pixels + current_pixel_idx));
+        __m128i rgba_pixels = _mm_shuffle_epi8(bgra_pixels_loaded, shuffle_mask_bgra_to_rgba);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_rgba_data + current_pixel_idx * 4), rgba_pixels);
+        current_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes using uint8x16_t
+    const uint8_t shuffle_coeffs_array[] = {2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15};
+    uint8x16_t neon_shuffle_mask = vld1q_u8(shuffle_coeffs_array);
+
+    while (num_pixels_to_process >= pixels_per_step) {
+        uint8x16_t bgra_pixels_loaded = vld1q_u8(reinterpret_cast<const uint8_t*>(src_bgra_pixels + current_pixel_idx));
+        uint8x16_t rgba_pixels = vqtbl1q_u8(bgra_pixels_loaded, neon_shuffle_mask);
+        vst1q_u8(dest_rgba_data + current_pixel_idx * 4, rgba_pixels);
+        current_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#endif
+    // Scalar fallback for remaining pixels in the row
+    for (size_t i = 0; i < num_pixels_to_process; ++i) {
+        const ::Pixel& src_pixel = *(src_bgra_pixels + current_pixel_idx + i);
+        uint8_t* dest_pixel_ptr = dest_rgba_data + (current_pixel_idx + i) * 4;
+        dest_pixel_ptr[0] = src_pixel.red;   // R
+        dest_pixel_ptr[1] = src_pixel.green; // G
+        dest_pixel_ptr[2] = src_pixel.blue;  // B
+        dest_pixel_ptr[3] = src_pixel.alpha; // A
+    }
+}
+
+
 // The NEW BmpTool::save function using the external library
 Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_bmp_buffer) {
     // 1. Input Validation from BmpTool::Bitmap
@@ -185,17 +239,12 @@ Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_b
     // Assuming ::Pixel struct has members .red, .green, .blue, .alpha
     Matrix::Matrix<::Pixel> image_matrix(bitmap_in.h, bitmap_in.w); // Changed order to (rows, cols)
 
+    // The loop converting bitmap_in.data to image_matrix
+    // image_matrix is already sized.
     for (uint32_t y = 0; y < bitmap_in.h; ++y) {
-        for (uint32_t x = 0; x < bitmap_in.w; ++x) {
-            const uint8_t* src_pixel_ptr = &bitmap_in.data[(static_cast<size_t>(y) * bitmap_in.w + x) * 4]; // RGBA
-            ::Pixel dest_pixel; 
-            dest_pixel.red   = src_pixel_ptr[0];
-            dest_pixel.green = src_pixel_ptr[1];
-            dest_pixel.blue  = src_pixel_ptr[2];
-            dest_pixel.alpha = src_pixel_ptr[3];
-            
-            image_matrix.at(y, x) = dest_pixel; // Changed Set(x,y) to at(y,x)
-        }
+        const uint8_t* src_rgba_data_row = &bitmap_in.data[(static_cast<size_t>(y) * bitmap_in.w * 4)];
+        ::Pixel* dest_bgra_pixels_row = &image_matrix[y][0];
+        internal_swizzle_rgba_to_bgra_simd(src_rgba_data_row, dest_bgra_pixels_row, bitmap_in.w);
     }
 
     // 3. Convert Matrix<::Pixel> to ::Bitmap::File
@@ -255,4 +304,60 @@ Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_b
     return BmpTool::Success{}; // This will implicitly convert to Result<void, BitmapError>(Success{})
 }
 
+
+// Helper function to convert an array of uint8_t (RGBA order) to an array of ::Pixel (BGRA order)
+void internal_swizzle_rgba_to_bgra_simd(const uint8_t* src_rgba_data, ::Pixel* dest_bgra_pixels, size_t num_pixels) {
+    size_t current_pixel_idx = 0;
+    size_t num_pixels_to_process = num_pixels;
+
+#if defined(__AVX2__)
+    const size_t pixels_per_step = 8; // 8 pixels = 32 bytes
+    __m256i shuffle_mask_rgba_to_bgra = _mm256_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15,
+        18,17,16,19, 22,21,20,23, 26,25,24,27, 30,29,28,31
+    );
+    while (num_pixels_to_process >= pixels_per_step) {
+        __m256i rgba_pixels_loaded = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src_rgba_data + current_pixel_idx * 4));
+        __m256i bgra_pixels = _mm256_shuffle_epi8(rgba_pixels_loaded, shuffle_mask_rgba_to_bgra);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
+        current_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#elif defined(__SSSE3__) // _mm_shuffle_epi8 requires SSSE3
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
+    __m128i shuffle_mask_rgba_to_bgra = _mm_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15
+    );
+    while (num_pixels_to_process >= pixels_per_step) {
+        __m128i rgba_pixels_loaded = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_rgba_data + current_pixel_idx * 4));
+        __m128i bgra_pixels = _mm_shuffle_epi8(rgba_pixels_loaded, shuffle_mask_rgba_to_bgra);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
+        current_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
+    const uint8_t shuffle_coeffs_array[] = {2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15};
+    uint8x16_t neon_shuffle_mask = vld1q_u8(shuffle_coeffs_array);
+
+    while (num_pixels_to_process >= pixels_per_step) {
+        uint8x16_t rgba_pixels_loaded = vld1q_u8(src_rgba_data + current_pixel_idx * 4);
+        uint8x16_t bgra_pixels = vqtbl1q_u8(rgba_pixels_loaded, neon_shuffle_mask);
+        vst1q_u8(reinterpret_cast<uint8_t*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
+        current_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#endif
+    // Scalar fallback for remaining pixels in the row
+    for (size_t i = 0; i < num_pixels_to_process; ++i) {
+        const uint8_t* src_pixel_ptr = src_rgba_data + (current_pixel_idx + i) * 4;
+        ::Pixel& dest_pixel = *(dest_bgra_pixels + current_pixel_idx + i);
+        
+        dest_pixel.red   = src_pixel_ptr[0]; // R
+        dest_pixel.green = src_pixel_ptr[1]; // G
+        dest_pixel.blue  = src_pixel_ptr[2]; // B
+        dest_pixel.alpha = src_pixel_ptr[3]; // A
+    }
+}
+
 } // namespace BmpTool
diff --git a/src/format/format_internal_helpers.hpp b/src/format/format_internal_helpers.hpp
new file mode 100644
index 0000000..fa39270
--- /dev/null
+++ b/src/format/format_internal_helpers.hpp
@@ -0,0 +1,12 @@
+#pragma once
+
+#include <cstddef> // For size_t
+// Include bitmap.h for the definition of ::Pixel
+#include "../bitmap/bitmap.h" 
+
+namespace BmpTool {
+
+void internal_swizzle_bgra_to_rgba_simd(const ::Pixel* src_bgra_pixels, uint8_t* dest_rgba_data, size_t num_pixels);
+void internal_swizzle_rgba_to_bgra_simd(const uint8_t* src_rgba_data, ::Pixel* dest_bgra_pixels, size_t num_pixels);
+
+} // namespace BmpTool
diff --git a/src/simd_utils.hpp b/src/simd_utils.hpp
new file mode 100644
index 0000000..f2fe293
--- /dev/null
+++ b/src/simd_utils.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+// For x86/x64 SIMD intrinsics
+#if defined(__SSE2__) || defined(__AVX__) || defined(__AVX2__)
+#include <immintrin.h> // Includes SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA, etc.
+#endif
+
+// For ARM NEON intrinsics
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
+// Helper to check if a pointer is aligned to a certain byte boundary
+template <typename T>
+inline bool is_aligned(const T* ptr, std::size_t alignment) {
+    return reinterpret_cast<std::uintptr_t>(ptr) % alignment == 0;
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 246d741..c5c431b 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -9,6 +9,7 @@ set(TEST_SOURCES
     test_bitmap.cpp
     test_bitmap_file.cpp
     test_matrix.cpp
+    test_swizzle.cpp # <-- Add this line
     tests_main.cpp
     api_roundtrip.cpp
 )
@@ -60,6 +61,24 @@ if(ENABLE_FUZZING AND CMAKE_CXX_COMPILER_ID MATCHES "Clang")
   target_link_libraries(fuzz_matrix PRIVATE bitmap)
   set_target_properties(fuzz_matrix PROPERTIES LINK_FLAGS "${FUZZ_LINK_FLAGS}")
   message(STATUS "Fuzz target fuzz_matrix added.")
+
+  # Fuzzer for BGR to BGRA conversion
+  add_executable(fuzz_convert_bgr_to_bgra fuzz/fuzz_convert_bgr_to_bgra.cpp)
+  target_link_libraries(fuzz_convert_bgr_to_bgra PRIVATE bitmap)
+  set_target_properties(fuzz_convert_bgr_to_bgra PROPERTIES LINK_FLAGS "${FUZZ_LINK_FLAGS}")
+  message(STATUS "Fuzz target fuzz_convert_bgr_to_bgra added.")
+
+  # Fuzzer for BGRA to RGBA swizzle
+  add_executable(fuzz_swizzle_bgra_to_rgba fuzz/fuzz_swizzle_bgra_to_rgba.cpp)
+  target_link_libraries(fuzz_swizzle_bgra_to_rgba PRIVATE bitmap)
+  set_target_properties(fuzz_swizzle_bgra_to_rgba PROPERTIES LINK_FLAGS "${FUZZ_LINK_FLAGS}")
+  message(STATUS "Fuzz target fuzz_swizzle_bgra_to_rgba added.")
+
+  # Fuzzer for RGBA to BGRA swizzle
+  add_executable(fuzz_swizzle_rgba_to_bgra fuzz/fuzz_swizzle_rgba_to_bgra.cpp)
+  target_link_libraries(fuzz_swizzle_rgba_to_bgra PRIVATE bitmap)
+  set_target_properties(fuzz_swizzle_rgba_to_bgra PROPERTIES LINK_FLAGS "${FUZZ_LINK_FLAGS}")
+  message(STATUS "Fuzz target fuzz_swizzle_rgba_to_bgra added.")
   
 endif()
 # Note: The public include directories from the 'bitmap' target 
diff --git a/tests/fuzz/fuzz_convert_bgr_to_bgra.cpp b/tests/fuzz/fuzz_convert_bgr_to_bgra.cpp
new file mode 100644
index 0000000..afe57a3
--- /dev/null
+++ b/tests/fuzz/fuzz_convert_bgr_to_bgra.cpp
@@ -0,0 +1,34 @@
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include "../src/bitmap/bitmap.h" // For ::Pixel and internal_convert_bgr_to_bgra_simd
+
+// Limit max pixels to prevent excessive memory allocation during fuzzing.
+const size_t MAX_FUZZ_PIXELS = 1024 * 1024; // Approx 4MB for BGRA, 3MB for BGR
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+    if (Size < 3) { // Need at least one BGR pixel
+        return 0;
+    }
+
+    size_t num_pixels = Size / 3;
+    if (num_pixels == 0) {
+        return 0;
+    }
+    if (num_pixels > MAX_FUZZ_PIXELS) {
+        num_pixels = MAX_FUZZ_PIXELS;
+        // Adjust Size to match the truncated num_pixels for the source data,
+        // though internal_convert_bgr_to_bgra_simd only reads num_pixels * 3 bytes.
+        // No, Data pointer is const, and Size is the original size.
+        // The function will just process fewer pixels from the original Data buffer.
+    }
+
+    std::vector<::Pixel> output_pixels(num_pixels);
+    // The source data (Data) is used directly.
+    // internal_convert_bgr_to_bgra_simd will read num_pixels * 3 bytes from Data.
+    // This is safe because num_pixels was derived from Size/3.
+    // If num_pixels was capped by MAX_FUZZ_PIXELS, it reads a sub-segment of the original data.
+    internal_convert_bgr_to_bgra_simd(Data, output_pixels.data(), num_pixels);
+    
+    return 0; // Non-zero return values are reserved for future use.
+}
diff --git a/tests/fuzz/fuzz_swizzle_bgra_to_rgba.cpp b/tests/fuzz/fuzz_swizzle_bgra_to_rgba.cpp
new file mode 100644
index 0000000..682d220
--- /dev/null
+++ b/tests/fuzz/fuzz_swizzle_bgra_to_rgba.cpp
@@ -0,0 +1,37 @@
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include "../src/bitmap/bitmap.h" // For ::Pixel
+#include "../src/format/format_internal_helpers.hpp" // For internal_swizzle_bgra_to_rgba_simd
+
+const size_t MAX_FUZZ_PIXELS = 1024 * 1024;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+    if (Size < sizeof(::Pixel)) { // Need at least one ::Pixel
+        return 0;
+    }
+    // Ensure Size is a multiple of sizeof(::Pixel)
+    // If not, we can't safely cast Data to ::Pixel* for the full Size.
+    // The function expects a certain number of full Pixel structs.
+    if (Size % sizeof(::Pixel) != 0) {
+        // Or alternatively, could calculate num_pixels based on floor(Size / sizeof(::Pixel))
+        // and only pass that many. For stricter fuzzing, returning if not aligned might be intended.
+        return 0; 
+    }
+
+    size_t num_pixels = Size / sizeof(::Pixel);
+    if (num_pixels == 0) { // Should be caught by Size < sizeof(::Pixel) already
+        return 0;
+    }
+    if (num_pixels > MAX_FUZZ_PIXELS) {
+        num_pixels = MAX_FUZZ_PIXELS;
+    }
+    
+    std::vector<uint8_t> rgba_output(num_pixels * 4);
+    // Data is cast to const ::Pixel*. The function will read num_pixels from this array.
+    // This is safe because num_pixels was derived from Size / sizeof(::Pixel).
+    // If num_pixels was capped, it processes a sub-segment.
+    BmpTool::internal_swizzle_bgra_to_rgba_simd(reinterpret_cast<const ::Pixel*>(Data), rgba_output.data(), num_pixels);
+    
+    return 0;
+}
diff --git a/tests/fuzz/fuzz_swizzle_rgba_to_bgra.cpp b/tests/fuzz/fuzz_swizzle_rgba_to_bgra.cpp
new file mode 100644
index 0000000..aef6224
--- /dev/null
+++ b/tests/fuzz/fuzz_swizzle_rgba_to_bgra.cpp
@@ -0,0 +1,36 @@
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+#include "../src/bitmap/bitmap.h" // For ::Pixel
+#include "../src/format/format_internal_helpers.hpp" // For internal_swizzle_rgba_to_bgra_simd
+
+const size_t MAX_FUZZ_PIXELS = 1024 * 1024;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+    if (Size < 4) { // Need at least one RGBA pixel (4 bytes)
+        return 0;
+    }
+    // Ensure Size is a multiple of 4 for RGBA pixels
+    if (Size % 4 != 0) {
+        // Similar to the BGRA->RGBA fuzzer, could truncate or return.
+        // Returning if not perfectly divisible ensures strict input format.
+        return 0;
+    }
+
+    size_t num_pixels = Size / 4;
+    if (num_pixels == 0) { // Should be caught by Size < 4 already
+        return 0;
+    }
+    if (num_pixels > MAX_FUZZ_PIXELS) {
+        num_pixels = MAX_FUZZ_PIXELS;
+    }
+
+    std::vector<::Pixel> bgra_output(num_pixels);
+    // Data is uint8_t* source. internal_swizzle_rgba_to_bgra_simd expects this.
+    // It will read num_pixels * 4 bytes from Data.
+    // This is safe as num_pixels was derived from Size / 4.
+    // If num_pixels was capped, it processes a sub-segment.
+    BmpTool::internal_swizzle_rgba_to_bgra_simd(Data, bgra_output.data(), num_pixels);
+    
+    return 0;
+}
diff --git a/tests/test_swizzle.cpp b/tests/test_swizzle.cpp
new file mode 100644
index 0000000..a32df2b
--- /dev/null
+++ b/tests/test_swizzle.cpp
@@ -0,0 +1,149 @@
+#include <vector>
+#include <cstdint> // For uint8_t
+#include <string>  // For std::to_string
+#include "gtest/gtest.h"
+// For Pixel struct and internal_convert_bgr_to_bgra_simd declaration
+#include "../src/bitmap/bitmap.h" 
+#include "../src/format/format_internal_helpers.hpp" // For BGRA <-> RGBA swizzle helpers
+// #include "../src/simd_utils.hpp" // Not strictly needed here as func is in .cpp
+
+// --- Tests for BGR -> BGRA swizzling (from internal_convert_bgr_to_bgra_simd) ---
+
+// Test case for a single pixel conversion BGR -> BGRA
+TEST(SwizzleBGRtoBGRATest, SinglePixel) {
+    uint8_t bgr_input[] = {0x01, 0x02, 0x03}; // B, G, R
+    ::Pixel output_pixel_array[1];
+    internal_convert_bgr_to_bgra_simd(bgr_input, output_pixel_array, 1);
+    EXPECT_EQ(output_pixel_array[0].blue, 0x01);
+    EXPECT_EQ(output_pixel_array[0].green, 0x02);
+    EXPECT_EQ(output_pixel_array[0].red, 0x03);
+    EXPECT_EQ(output_pixel_array[0].alpha, 255);
+}
+
+// Helper function for various sizes test for BGR -> BGRA
+void check_bgr_to_bgra_conversion(size_t num_pixels) {
+    SCOPED_TRACE("num_pixels (BGRtoBGRA): " + std::to_string(num_pixels)); 
+    if (num_pixels == 0) {
+        internal_convert_bgr_to_bgra_simd(nullptr, nullptr, 0);
+        SUCCEED(); 
+        return;
+    }
+    std::vector<uint8_t> bgr_input(num_pixels * 3);
+    std::vector<::Pixel> output_pixels(num_pixels);
+    for (size_t i = 0; i < num_pixels; ++i) {
+        bgr_input[i*3+0] = static_cast<uint8_t>(((i*3+1)%250) + 1); // Blue
+        bgr_input[i*3+1] = static_cast<uint8_t>(((i*3+2)%250) + 1); // Green
+        bgr_input[i*3+2] = static_cast<uint8_t>(((i*3+3)%250) + 1); // Red
+    }
+    internal_convert_bgr_to_bgra_simd(bgr_input.data(), output_pixels.data(), num_pixels);
+    for (size_t j = 0; j < num_pixels; ++j) {
+        ASSERT_EQ(output_pixels[j].blue, bgr_input[j*3+0]);
+        ASSERT_EQ(output_pixels[j].green, bgr_input[j*3+1]);
+        ASSERT_EQ(output_pixels[j].red, bgr_input[j*3+2]);
+        ASSERT_EQ(output_pixels[j].alpha, 255);
+    }
+}
+
+TEST(SwizzleBGRtoBGRATest, VariousSizes) {
+    std::vector<size_t> sizes_to_test = {0, 1, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 100};
+    for (size_t size : sizes_to_test) {
+        check_bgr_to_bgra_conversion(size);
+    }
+}
+
+TEST(SwizzleBGRtoBGRATest, AllZeros) {
+    const size_t num_pixels = 32;
+    std::vector<uint8_t> bgr_input(num_pixels * 3, 0);
+    std::vector<::Pixel> output_pixels(num_pixels);
+    internal_convert_bgr_to_bgra_simd(bgr_input.data(), output_pixels.data(), num_pixels);
+    for (size_t j = 0; j < num_pixels; ++j) {
+        EXPECT_EQ(output_pixels[j].blue, 0);
+        EXPECT_EQ(output_pixels[j].green, 0);
+        EXPECT_EQ(output_pixels[j].red, 0);
+        EXPECT_EQ(output_pixels[j].alpha, 255);
+    }
+}
+
+TEST(SwizzleBGRtoBGRATest, AllMaxRGB) {
+    const size_t num_pixels = 32;
+    const uint8_t max_val = 254; 
+    std::vector<uint8_t> bgr_input(num_pixels * 3, max_val);
+    std::vector<::Pixel> output_pixels(num_pixels);
+    internal_convert_bgr_to_bgra_simd(bgr_input.data(), output_pixels.data(), num_pixels);
+    for (size_t j = 0; j < num_pixels; ++j) {
+        EXPECT_EQ(output_pixels[j].blue, max_val);
+        EXPECT_EQ(output_pixels[j].green, max_val);
+        EXPECT_EQ(output_pixels[j].red, max_val);
+        EXPECT_EQ(output_pixels[j].alpha, 255);
+    }
+}
+
+// --- Tests for BGRA <-> RGBA swizzling (from format_internal_helpers.hpp) ---
+
+// Helper function for BGRA -> RGBA tests
+void check_bgra_to_rgba_conversion(size_t num_pixels) {
+    SCOPED_TRACE("num_pixels (BGRAtoRGBA): " + std::to_string(num_pixels));
+    if (num_pixels == 0) {
+        BmpTool::internal_swizzle_bgra_to_rgba_simd(nullptr, nullptr, 0);
+        SUCCEED();
+        return;
+    }
+    std::vector<::Pixel> bgra_input(num_pixels);
+    for (size_t j = 0; j < num_pixels; ++j) {
+        bgra_input[j].blue  = static_cast<uint8_t>(((j * 4 + 0) % 250) + 1);
+        bgra_input[j].green = static_cast<uint8_t>(((j * 4 + 1) % 250) + 1);
+        bgra_input[j].red   = static_cast<uint8_t>(((j * 4 + 2) % 250) + 1);
+        bgra_input[j].alpha = static_cast<uint8_t>(((j * 4 + 3) % 250) + 1);
+    }
+    std::vector<uint8_t> rgba_output(num_pixels * 4);
+
+    BmpTool::internal_swizzle_bgra_to_rgba_simd(bgra_input.data(), rgba_output.data(), num_pixels);
+    
+    for (size_t j = 0; j < num_pixels; ++j) {
+        ASSERT_EQ(rgba_output[j*4+0], bgra_input[j].red);   // R
+        ASSERT_EQ(rgba_output[j*4+1], bgra_input[j].green); // G
+        ASSERT_EQ(rgba_output[j*4+2], bgra_input[j].blue);  // B
+        ASSERT_EQ(rgba_output[j*4+3], bgra_input[j].alpha); // A
+    }
+}
+
+TEST(SwizzleBGRARGATest, BGRAtoRGBAVariousSizes) {
+    std::vector<size_t> sizes_to_test = {0, 1, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 100};
+    for (size_t size : sizes_to_test) {
+        check_bgra_to_rgba_conversion(size);
+    }
+}
+
+// Helper function for RGBA -> BGRA tests
+void check_rgba_to_bgra_conversion(size_t num_pixels) {
+    SCOPED_TRACE("num_pixels (RGBAtoBGRA): " + std::to_string(num_pixels));
+    if (num_pixels == 0) {
+        BmpTool::internal_swizzle_rgba_to_bgra_simd(nullptr, nullptr, 0);
+        SUCCEED();
+        return;
+    }
+    std::vector<uint8_t> rgba_input(num_pixels * 4);
+    for (size_t j = 0; j < num_pixels; ++j) {
+        rgba_input[j*4+0] = static_cast<uint8_t>(((j * 4 + 0) % 250) + 1); // R
+        rgba_input[j*4+1] = static_cast<uint8_t>(((j * 4 + 1) % 250) + 1); // G
+        rgba_input[j*4+2] = static_cast<uint8_t>(((j * 4 + 2) % 250) + 1); // B
+        rgba_input[j*4+3] = static_cast<uint8_t>(((j * 4 + 3) % 250) + 1); // A
+    }
+    std::vector<::Pixel> bgra_output(num_pixels);
+
+    BmpTool::internal_swizzle_rgba_to_bgra_simd(rgba_input.data(), bgra_output.data(), num_pixels);
+
+    for (size_t j = 0; j < num_pixels; ++j) {
+        ASSERT_EQ(bgra_output[j].red,   rgba_input[j*4+0]); // R
+        ASSERT_EQ(bgra_output[j].green, rgba_input[j*4+1]); // G
+        ASSERT_EQ(bgra_output[j].blue,  rgba_input[j*4+2]); // B
+        ASSERT_EQ(bgra_output[j].alpha, rgba_input[j*4+3]); // A
+    }
+}
+
+TEST(SwizzleBGRARGATest, RGBAtoBGRAVariousSizes) {
+    std::vector<size_t> sizes_to_test = {0, 1, 3, 4, 7, 8, 15, 16, 31, 32, 63, 64, 100};
+    for (size_t size : sizes_to_test) {
+        check_rgba_to_bgra_conversion(size);
+    }
+}