JacobBorden · JacobBorden · May 30, 2025 · May 30, 2025 · May 30, 2025 · May 30, 2025
diff --git a/.github/workflows/fuzzing.yml b/.github/workflows/fuzzing.yml
@@ -36,6 +36,9 @@ jobs:
         mkdir -p build_fuzz/corpus_fuzzing/fuzz_bitmap_file_corpus
         mkdir -p build_fuzz/corpus_fuzzing/fuzz_image_operations_corpus
         mkdir -p build_fuzz/corpus_fuzzing/fuzz_matrix_corpus
+        mkdir -p build_fuzz/corpus_fuzzing/fuzz_convert_bgr_to_bgra_corpus
+        mkdir -p build_fuzz/corpus_fuzzing/fuzz_swizzle_bgra_to_rgba_corpus
+        mkdir -p build_fuzz/corpus_fuzzing/fuzz_swizzle_rgba_to_bgra_corpus
 
     - name: Run fuzz_bitmap
       run: |
@@ -56,3 +59,15 @@ jobs:
     - name: Run fuzz_matrix
       run: |
         ./build_fuzz/tests/fuzz_matrix -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_matrix_corpus/
+
+    - name: Run fuzz_convert_bgr_to_bgra
+      run: |
+        ./build_fuzz/tests/fuzz_convert_bgr_to_bgra -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_convert_bgr_to_bgra_corpus/
+
+    - name: Run fuzz_swizzle_bgra_to_rgba
+      run: |
+        ./build_fuzz/tests/fuzz_swizzle_bgra_to_rgba -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_swizzle_bgra_to_rgba_corpus/
+
+    - name: Run fuzz_swizzle_rgba_to_bgra
+      run: |
+        ./build_fuzz/tests/fuzz_swizzle_rgba_to_bgra -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_swizzle_rgba_to_bgra_corpus/
diff --git a/src/bitmap/bitmap.cpp b/src/bitmap/bitmap.cpp
@@ -2,6 +2,8 @@
 #include <iostream>  // For standard I/O (though not explicitly used in this file's current state).
 #include <algorithm> // For std::min and std::max, used in ApplyBoxBlur and color adjustments.
 #include <vector>    // For std::vector, used by Matrix class and underlying bitmap data.
+#include <cstring>   // For std::memcpy, used in SIMD NEON path
+#include "../simd_utils.hpp" // Added include
 
 // Define BI_RGB as 0 if not already defined, to ensure cross-platform compatibility for bitmap compression type.
 #ifndef BI_RGB
@@ -183,6 +185,73 @@ Bitmap::File ApplyBoxBlur(Bitmap::File bitmapFile, int blurRadius)
     return CreateBitmapFromMatrix(blurredMatrix);
 }
 
+// Expands one row of packed 3-byte BGR pixels into ::Pixel values with alpha = 255
+// (declared in bitmap.h so tests and fuzzers can call it directly).
+//   src_row_bgr_ptr    - packed B,G,R bytes; only num_pixels_in_row * 3 bytes are read.
+//   dest_row_pixel_ptr - receives num_pixels_in_row pixels.
+//   num_pixels_in_row  - number of pixels to convert; 0 is a no-op.
+// The SIMD paths store raw bytes over ::Pixel, so they require a 4-byte B,G,R,A layout.
+void internal_convert_bgr_to_bgra_simd(const uint8_t* src_row_bgr_ptr, ::Pixel* dest_row_pixel_ptr, size_t num_pixels_in_row) {
+    size_t current_src_byte_offset = 0;
+    size_t current_dest_pixel_idx = 0;
+    size_t num_pixels_to_process = num_pixels_in_row;
+
+#if defined(__AVX2__) || defined(__SSSE3__) // AVX2 builds reuse the 128-bit SSSE3 kernel
+    static_assert(sizeof(::Pixel) == 4, "SIMD path writes ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 4;
+    // Each step loads 16 bytes but consumes only 12, so the vector loop runs only while
+    // at least 16 source bytes (6 pixels) remain; a 16-byte load on the last 4 pixels
+    // would read past the end of the row. The scalar loop below finishes the tail.
+    const size_t min_pixels_for_full_load = 6; // ceil(16 / 3)
+    // For _mm_shuffle_epi8, an index with bit 7 set (0x80) produces a zero byte.
+    const __m128i bgr_to_bgrX_mask = _mm_setr_epi8(
+        0, 1, 2, static_cast<char>(0x80),
+        3, 4, 5, static_cast<char>(0x80),
+        6, 7, 8, static_cast<char>(0x80),
+        9, 10, 11, static_cast<char>(0x80)
+    );
+    const __m128i alpha_channel_ff = _mm_setr_epi8(
+        0,0,0, static_cast<char>(0xFF), 0,0,0, static_cast<char>(0xFF), 0,0,0, static_cast<char>(0xFF), 0,0,0, static_cast<char>(0xFF)
+    );
+
+    while (num_pixels_to_process >= min_pixels_for_full_load) {
+        __m128i bgr_data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_row_bgr_ptr + current_src_byte_offset));
+        __m128i bgra_pixels_expanded = _mm_shuffle_epi8(bgr_data, bgr_to_bgrX_mask);
+        __m128i bgra_pixels_final = _mm_or_si128(bgra_pixels_expanded, alpha_channel_ff);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_row_pixel_ptr + current_dest_pixel_idx), bgra_pixels_final);
+        current_src_byte_offset += pixels_per_step * 3;
+        current_dest_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64)) // vqtbl1q_u8 is A64-only
+    static_assert(sizeof(::Pixel) == 4, "SIMD path writes ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 4;
+    // vqtbl1q_u8 yields 0 for out-of-range indices, so index 16 clears the alpha byte.
+    const uint8_t table_bgr_to_bgr0[] = {0,1,2,16, 3,4,5,16, 6,7,8,16, 9,10,11,16};
+    uint8x16_t neon_shuffle_table = vld1q_u8(table_bgr_to_bgr0);
+    const uint8_t alpha_bytes[] = {0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF};
+    uint8x16_t alpha_channel_ff_neon = vld1q_u8(alpha_bytes);
+    uint8_t temp_bgr_load[16] = {}; // staging buffer: copy 12 bytes so the 16-byte load stays in bounds
+    while (num_pixels_to_process >= pixels_per_step) {
+        std::memcpy(temp_bgr_load, src_row_bgr_ptr + current_src_byte_offset, 12);
+        uint8x16_t bgr_data_loaded = vld1q_u8(temp_bgr_load);
+        uint8x16_t bgra_expanded = vqtbl1q_u8(bgr_data_loaded, neon_shuffle_table);
+        uint8x16_t bgra_final = vorrq_u8(bgra_expanded, alpha_channel_ff_neon);
+        vst1q_u8(reinterpret_cast<uint8_t*>(dest_row_pixel_ptr + current_dest_pixel_idx), bgra_final);
+        current_src_byte_offset += pixels_per_step * 3;
+        current_dest_pixel_idx += pixels_per_step;
+        num_pixels_to_process -= pixels_per_step;
+    }
+#endif
+    // Scalar path: finishes the tail left by a vector loop, or converts every pixel when no SIMD path is compiled in.
+    for (size_t k_rem = 0; k_rem < num_pixels_to_process; ++k_rem) {
+        dest_row_pixel_ptr[current_dest_pixel_idx + k_rem].blue  = src_row_bgr_ptr[current_src_byte_offset + k_rem * 3 + 0];
+        dest_row_pixel_ptr[current_dest_pixel_idx + k_rem].green = src_row_bgr_ptr[current_src_byte_offset + k_rem * 3 + 1];
+        dest_row_pixel_ptr[current_dest_pixel_idx + k_rem].red   = src_row_bgr_ptr[current_src_byte_offset + k_rem * 3 + 2];
+        dest_row_pixel_ptr[current_dest_pixel_idx + k_rem].alpha = 255;
+    }
+}
+
 // Converts a Bitmap::File object (containing raw bitmap data and headers)
 // into a Matrix::Matrix<Pixel> for easier pixel manipulation.
 Matrix::Matrix<Pixel> CreateMatrixFromBitmap(Bitmap::File bitmapFile)
@@ -236,16 +305,16 @@ Matrix::Matrix<Pixel> CreateMatrixFromBitmap(Bitmap::File bitmapFile)
     }
     else if (bitmapFile.bitmapInfoHeader.biBitCount == 24) // For 24-bit bitmaps (BGR)
     {
-        int k = 0; // Index for bitmapFile.bitmapData
-        for (int i = 0; i < imageMatrix.rows(); i++)
-            for (int j = 0; j < imageMatrix.cols(); j++)
-            {
-                imageMatrix[i][j].blue = bitmapFile.bitmapData[k];
-                imageMatrix[i][j].green = bitmapFile.bitmapData[k + 1];
-                imageMatrix[i][j].red = bitmapFile.bitmapData[k + 2];
-                imageMatrix[i][j].alpha = 0; // Default alpha to 0 (opaque) for 24-bit images.
-                k += 3; // Move to the next pixel (3 bytes)
-            }
+        // Calculate padding if necessary, though source data in bitmapFile.bitmapData should be packed according to BMP spec (rows padded to 4 bytes)
+        // However, we process pixel by pixel here from the linear bitmapData.
+        // The number of bytes per row in source data:
+        uint32_t src_bytes_per_row = (static_cast<uint32_t>(imageMatrix.cols()) * 3 + 3) & ~3u; // BMP rows are padded to 4 bytes for BGR
+
+        for (int i = 0; i < imageMatrix.rows(); i++) {
+            const uint8_t* src_row_bgr_ptr = bitmapFile.bitmapData.data() + (static_cast<size_t>(i) * src_bytes_per_row);
+            ::Pixel* dest_row_pixel_ptr = &imageMatrix[i][0];
+            internal_convert_bgr_to_bgra_simd(src_row_bgr_ptr, dest_row_pixel_ptr, imageMatrix.cols());
+        }
     }
     // Note: Other bit depths (e.g., 1, 4, 8, 16-bit) would require more complex handling.
 

diff --git a/src/bitmap/bitmap.h b/src/bitmap/bitmap.h
@@ -417,4 +417,7 @@ Pixel ApplySepiaToPixel(Pixel pixel);
 
 // Note: Box blur is applied over a region within ApplyBoxBlur,
 // so it does not have a direct single-pixel helper function here.
+
+// Converts num_pixels packed 3-byte BGR pixels from src_bgr_data into ::Pixel values with alpha = 255 (exposed primarily for testing).
+void internal_convert_bgr_to_bgra_simd(const uint8_t* src_bgr_data, ::Pixel* dest_bgra_pixels, size_t num_pixels);
 #endif
diff --git a/src/format/bitmap.cpp b/src/format/bitmap.cpp
@@ -12,6 +12,8 @@
 #include "../../src/bitmapfile/bitmap_file.h" // For BITMAPFILEHEADER, BITMAPINFOHEADER from external lib
 #include "../../src/bitmap/bitmap.h"         // For ::Pixel, ::CreateMatrixFromBitmap, ::CreateBitmapFromMatrix
 #include "../../src/matrix/matrix.h"         // For Matrix::Matrix
+#include "../simd_utils.hpp" // Added include
+#include "format_internal_helpers.hpp" // Added include for the new helpers
 
 // Define constants for BMP format (can be used by Format::Internal helpers or if save needs them directly)
 constexpr uint16_t BMP_MAGIC_TYPE_CONST = 0x4D42; // 'BM'
@@ -150,20 +152,72 @@ Result<Bitmap, BitmapError> load(std::span<const uint8_t> bmp_data) {
     bmp_out.bpp = 32; 
     bmp_out.data.resize(static_cast<size_t>(bmp_out.w) * bmp_out.h * 4);
 
+    // The loop converting image_matrix to bmp_out.data
+    // bmp_out.data is already resized.
     for (uint32_t y = 0; y < bmp_out.h; ++y) {
-        for (uint32_t x = 0; x < bmp_out.w; ++x) {
-            const ::Pixel& src_pixel = image_matrix.at(y, x); // Changed Get(x,y) to at(y,x)
-
-            size_t dest_idx = (static_cast<size_t>(y) * bmp_out.w + x) * 4;
-            bmp_out.data[dest_idx + 0] = src_pixel.red;   
-            bmp_out.data[dest_idx + 1] = src_pixel.green; 
-            bmp_out.data[dest_idx + 2] = src_pixel.blue;  
-            bmp_out.data[dest_idx + 3] = src_pixel.alpha; 
-        }
+        const ::Pixel* src_bgra_pixels_row = &image_matrix[y][0];
+        uint8_t* dest_rgba_data_row = bmp_out.data.data() + (static_cast<size_t>(y) * bmp_out.w * 4);
+        internal_swizzle_bgra_to_rgba_simd(src_bgra_pixels_row, dest_rgba_data_row, bmp_out.w);
     }
     return bmp_out;
 }
 
+
+// Converts num_pixels ::Pixel values (stored B,G,R,A) into 4-byte R,G,B,A groups written to dest_rgba_data.
+// The SIMD paths shuffle raw bytes, so they require ::Pixel to be exactly 4 bytes in B,G,R,A order.
+void internal_swizzle_bgra_to_rgba_simd(const ::Pixel* src_bgra_pixels, uint8_t* dest_rgba_data, size_t num_pixels) {
+    size_t current_pixel_idx = 0; // Number of pixels converted so far
+
+#if defined(__AVX2__)
+    static_assert(sizeof(::Pixel) == 4, "SIMD swizzle treats ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 8; // 8 pixels = 32 bytes
+    // _mm256_shuffle_epi8 shuffles each 128-bit lane independently, so both lanes use lane-relative indices.
+    const __m256i shuffle_mask_bgra_to_rgba = _mm256_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15, // Low lane: pixels 0-3
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15  // High lane: pixels 4-7
+    );
+    while (num_pixels - current_pixel_idx >= pixels_per_step) {
+        __m256i bgra_pixels_loaded = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src_bgra_pixels + current_pixel_idx));
+        __m256i rgba_pixels = _mm256_shuffle_epi8(bgra_pixels_loaded, shuffle_mask_bgra_to_rgba);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest_rgba_data + current_pixel_idx * 4), rgba_pixels);
+        current_pixel_idx += pixels_per_step;
+    }
+#elif defined(__SSSE3__) // _mm_shuffle_epi8 requires SSSE3
+    static_assert(sizeof(::Pixel) == 4, "SIMD swizzle treats ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
+    const __m128i shuffle_mask_bgra_to_rgba = _mm_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15
+    );
+    while (num_pixels - current_pixel_idx >= pixels_per_step) {
+        __m128i bgra_pixels_loaded = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_bgra_pixels + current_pixel_idx));
+        __m128i rgba_pixels = _mm_shuffle_epi8(bgra_pixels_loaded, shuffle_mask_bgra_to_rgba);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_rgba_data + current_pixel_idx * 4), rgba_pixels);
+        current_pixel_idx += pixels_per_step;
+    }
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64)) // vqtbl1q_u8 is A64-only
+    static_assert(sizeof(::Pixel) == 4, "SIMD swizzle treats ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes using uint8x16_t
+    const uint8_t shuffle_coeffs_array[] = {2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15};
+    const uint8x16_t neon_shuffle_mask = vld1q_u8(shuffle_coeffs_array);
+    while (num_pixels - current_pixel_idx >= pixels_per_step) {
+        uint8x16_t bgra_pixels_loaded = vld1q_u8(reinterpret_cast<const uint8_t*>(src_bgra_pixels + current_pixel_idx));
+        uint8x16_t rgba_pixels = vqtbl1q_u8(bgra_pixels_loaded, neon_shuffle_mask);
+        vst1q_u8(dest_rgba_data + current_pixel_idx * 4, rgba_pixels);
+        current_pixel_idx += pixels_per_step;
+    }
+#endif
+    // Scalar path: finishes the tail left by a vector loop, or converts every pixel when no SIMD path is compiled in.
+    for (; current_pixel_idx < num_pixels; ++current_pixel_idx) {
+        const ::Pixel& src_pixel = src_bgra_pixels[current_pixel_idx];
+        uint8_t* dest_pixel_ptr = dest_rgba_data + current_pixel_idx * 4;
+        dest_pixel_ptr[0] = src_pixel.red;   // R
+        dest_pixel_ptr[1] = src_pixel.green; // G
+        dest_pixel_ptr[2] = src_pixel.blue;  // B
+        dest_pixel_ptr[3] = src_pixel.alpha; // A
+    }
+}
+
+
 // The NEW BmpTool::save function using the external library
 Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_bmp_buffer) {
     // 1. Input Validation from BmpTool::Bitmap
@@ -185,17 +239,12 @@ Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_b
     // Assuming ::Pixel struct has members .red, .green, .blue, .alpha
     Matrix::Matrix<::Pixel> image_matrix(bitmap_in.h, bitmap_in.w); // Changed order to (rows, cols)
 
+    // The loop converting bitmap_in.data to image_matrix
+    // image_matrix is already sized.
     for (uint32_t y = 0; y < bitmap_in.h; ++y) {
-        for (uint32_t x = 0; x < bitmap_in.w; ++x) {
-            const uint8_t* src_pixel_ptr = &bitmap_in.data[(static_cast<size_t>(y) * bitmap_in.w + x) * 4]; // RGBA
-            ::Pixel dest_pixel; 
-            dest_pixel.red   = src_pixel_ptr[0];
-            dest_pixel.green = src_pixel_ptr[1];
-            dest_pixel.blue  = src_pixel_ptr[2];
-            dest_pixel.alpha = src_pixel_ptr[3];
-
-            image_matrix.at(y, x) = dest_pixel; // Changed Set(x,y) to at(y,x)
-        }
+        const uint8_t* src_rgba_data_row = &bitmap_in.data[(static_cast<size_t>(y) * bitmap_in.w * 4)];
+        ::Pixel* dest_bgra_pixels_row = &image_matrix[y][0];
+        internal_swizzle_rgba_to_bgra_simd(src_rgba_data_row, dest_bgra_pixels_row, bitmap_in.w);
     }
 
     // 3. Convert Matrix<::Pixel> to ::Bitmap::File
@@ -255,4 +304,60 @@ Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_b
     return BmpTool::Success{}; // This will implicitly convert to Result<void, BitmapError>(Success{})
 }
 
+
+// Converts num_pixels 4-byte R,G,B,A groups from src_rgba_data into ::Pixel values (stored B,G,R,A).
+// The SIMD paths shuffle raw bytes, so they require ::Pixel to be exactly 4 bytes in B,G,R,A order;
+// the scalar path assigns the named members and has no layout requirement.
+void internal_swizzle_rgba_to_bgra_simd(const uint8_t* src_rgba_data, ::Pixel* dest_bgra_pixels, size_t num_pixels) {
+    size_t current_pixel_idx = 0; // Number of pixels converted so far
+
+#if defined(__AVX2__)
+    static_assert(sizeof(::Pixel) == 4, "SIMD swizzle treats ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 8; // 8 pixels = 32 bytes
+    // _mm256_shuffle_epi8 shuffles each 128-bit lane independently, so both lanes use lane-relative indices.
+    const __m256i shuffle_mask_rgba_to_bgra = _mm256_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15, // Low lane: pixels 0-3
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15  // High lane: pixels 4-7
+    );
+    while (num_pixels - current_pixel_idx >= pixels_per_step) {
+        __m256i rgba_pixels_loaded = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src_rgba_data + current_pixel_idx * 4));
+        __m256i bgra_pixels = _mm256_shuffle_epi8(rgba_pixels_loaded, shuffle_mask_rgba_to_bgra);
+        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
+        current_pixel_idx += pixels_per_step;
+    }
+#elif defined(__SSSE3__) // _mm_shuffle_epi8 requires SSSE3
+    static_assert(sizeof(::Pixel) == 4, "SIMD swizzle treats ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
+    const __m128i shuffle_mask_rgba_to_bgra = _mm_setr_epi8(
+        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14,13,12,15
+    );
+    while (num_pixels - current_pixel_idx >= pixels_per_step) {
+        __m128i rgba_pixels_loaded = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_rgba_data + current_pixel_idx * 4));
+        __m128i bgra_pixels = _mm_shuffle_epi8(rgba_pixels_loaded, shuffle_mask_rgba_to_bgra);
+        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
+        current_pixel_idx += pixels_per_step;
+    }
+#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64)) // vqtbl1q_u8 is A64-only
+    static_assert(sizeof(::Pixel) == 4, "SIMD swizzle treats ::Pixel as 4 raw bytes");
+    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
+    const uint8_t shuffle_coeffs_array[] = {2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15};
+    const uint8x16_t neon_shuffle_mask = vld1q_u8(shuffle_coeffs_array);
+    while (num_pixels - current_pixel_idx >= pixels_per_step) {
+        uint8x16_t rgba_pixels_loaded = vld1q_u8(src_rgba_data + current_pixel_idx * 4);
+        uint8x16_t bgra_pixels = vqtbl1q_u8(rgba_pixels_loaded, neon_shuffle_mask);
+        vst1q_u8(reinterpret_cast<uint8_t*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
+        current_pixel_idx += pixels_per_step;
+    }
+#endif
+    // Scalar path: finishes the tail left by a vector loop, or converts every pixel when no SIMD path is compiled in.
+    for (; current_pixel_idx < num_pixels; ++current_pixel_idx) {
+        const uint8_t* src_pixel_ptr = src_rgba_data + current_pixel_idx * 4;
+        ::Pixel& dest_pixel = dest_bgra_pixels[current_pixel_idx];
+        dest_pixel.red   = src_pixel_ptr[0]; // R
+        dest_pixel.green = src_pixel_ptr[1]; // G
+        dest_pixel.blue  = src_pixel_ptr[2]; // B
+        dest_pixel.alpha = src_pixel_ptr[3]; // A
+    }
+}
+
 } // namespace BmpTool
diff --git a/src/format/format_internal_helpers.hpp b/src/format/format_internal_helpers.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <cstddef> // For size_t
+#include <cstdint> // For uint8_t
+
+// Include bitmap.h for the definition of ::Pixel
+#include "../bitmap/bitmap.h"
+
+namespace BmpTool {
+
+// Converts num_pixels ::Pixel values (B,G,R,A order) into 4-byte R,G,B,A groups written to dest_rgba_data.
+void internal_swizzle_bgra_to_rgba_simd(const ::Pixel* src_bgra_pixels, uint8_t* dest_rgba_data, size_t num_pixels);
+
+// Converts num_pixels 4-byte R,G,B,A groups read from src_rgba_data into ::Pixel values (B,G,R,A order).
+void internal_swizzle_rgba_to_bgra_simd(const uint8_t* src_rgba_data, ::Pixel* dest_bgra_pixels, size_t num_pixels);
+
+} // namespace BmpTool
diff --git a/src/simd_utils.hpp b/src/simd_utils.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include <cstddef> // For std::size_t
+#include <cstdint> // For std::uintptr_t
+
+// For x86/x64 SIMD intrinsics
+#if defined(__SSE2__) || defined(__AVX__) || defined(__AVX2__)
+#include <immintrin.h> // Includes SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA, etc.
+#endif
+
+// For ARM NEON intrinsics
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#endif
+
+// Returns true when the address held by ptr is a multiple of `alignment` bytes.
+// An alignment of 0 is never satisfied (it would otherwise be a division by zero).
+template <typename T>
+inline bool is_aligned(const T* ptr, std::size_t alignment) {
+    return alignment != 0 && reinterpret_cast<std::uintptr_t>(ptr) % alignment == 0;
+}