Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/fuzzing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ jobs:
mkdir -p build_fuzz/corpus_fuzzing/fuzz_bitmap_file_corpus
mkdir -p build_fuzz/corpus_fuzzing/fuzz_image_operations_corpus
mkdir -p build_fuzz/corpus_fuzzing/fuzz_matrix_corpus
mkdir -p build_fuzz/corpus_fuzzing/fuzz_convert_bgr_to_bgra_corpus
mkdir -p build_fuzz/corpus_fuzzing/fuzz_swizzle_bgra_to_rgba_corpus
mkdir -p build_fuzz/corpus_fuzzing/fuzz_swizzle_rgba_to_bgra_corpus

- name: Run fuzz_bitmap
run: |
Expand All @@ -56,3 +59,15 @@ jobs:
- name: Run fuzz_matrix
run: |
./build_fuzz/tests/fuzz_matrix -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_matrix_corpus/

- name: Run fuzz_convert_bgr_to_bgra
run: |
./build_fuzz/tests/fuzz_convert_bgr_to_bgra -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_convert_bgr_to_bgra_corpus/

- name: Run fuzz_swizzle_bgra_to_rgba
run: |
./build_fuzz/tests/fuzz_swizzle_bgra_to_rgba -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_swizzle_bgra_to_rgba_corpus/

- name: Run fuzz_swizzle_rgba_to_bgra
run: |
./build_fuzz/tests/fuzz_swizzle_rgba_to_bgra -max_total_time=60 -print_final_stats=1 -print_pcs=1 -error_exitcode=1 build_fuzz/corpus_fuzzing/fuzz_swizzle_rgba_to_bgra_corpus/
89 changes: 79 additions & 10 deletions src/bitmap/bitmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#include <iostream> // For standard I/O (though not explicitly used in this file's current state).
#include <algorithm> // For std::min and std::max, used in ApplyBoxBlur and color adjustments.
#include <vector> // For std::vector, used by Matrix class and underlying bitmap data.
#include <cstring> // For std::memcpy, used in SIMD NEON path
#include "../simd_utils.hpp" // Added include

// Define BI_RGB as 0 if not already defined, to ensure cross-platform compatibility for bitmap compression type.
#ifndef BI_RGB
Expand Down Expand Up @@ -183,6 +185,73 @@ Bitmap::File ApplyBoxBlur(Bitmap::File bitmapFile, int blurRadius)
return CreateBitmapFromMatrix(blurredMatrix);
}

// Helper function for BGR to BGRA conversion with SIMD (declaration in bitmap.h)
void internal_convert_bgr_to_bgra_simd(const uint8_t* src_row_bgr_ptr, ::Pixel* dest_row_pixel_ptr, size_t num_pixels_in_row) {
size_t current_src_byte_offset = 0;
size_t current_dest_pixel_idx = 0;
size_t num_pixels_to_process = num_pixels_in_row;

#if defined(__AVX2__)
// As per previous implementation, AVX2 uses SSSE3 logic.
// No distinct 256-bit AVX2 path implemented here.
#endif

#if defined(__AVX2__) || defined(__SSSE3__) // Use SSSE3 for AVX2 as well if no specific AVX2 code
const size_t pixels_per_step = 4;
__m128i bgr_to_bgrX_mask = _mm_setr_epi8(
0, 1, 2, (char)0x80,
3, 4, 5, (char)0x80,
6, 7, 8, (char)0x80,
9, 10, 11, (char)0x80
);
__m128i alpha_channel_ff = _mm_setr_epi8(
0,0,0, (char)0xFF, 0,0,0,(char)0xFF, 0,0,0,(char)0xFF, 0,0,0,(char)0xFF
);

while (num_pixels_to_process >= pixels_per_step) {
__m128i bgr_data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_row_bgr_ptr + current_src_byte_offset));
__m128i bgra_pixels_expanded = _mm_shuffle_epi8(bgr_data, bgr_to_bgrX_mask);
__m128i bgra_pixels_final = _mm_or_si128(bgra_pixels_expanded, alpha_channel_ff);
_mm_storeu_si128(reinterpret_cast<__m128i*>(dest_row_pixel_ptr + current_dest_pixel_idx), bgra_pixels_final);
current_src_byte_offset += pixels_per_step * 3;
current_dest_pixel_idx += pixels_per_step;
num_pixels_to_process -= pixels_per_step;
}
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
const size_t pixels_per_step = 4;
const uint8_t table_bgr_to_bgr0[] = {0,1,2,16, 3,4,5,16, 6,7,8,16, 9,10,11,16};
uint8x16_t neon_shuffle_table = vld1q_u8(table_bgr_to_bgr0);
const uint8_t alpha_bytes[] = {0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF, 0,0,0,0xFF};
uint8x16_t alpha_channel_ff_neon = vld1q_u8(alpha_bytes);
uint8_t temp_bgr_load[16];

while (num_pixels_to_process >= pixels_per_step) {
std::memcpy(temp_bgr_load, src_row_bgr_ptr + current_src_byte_offset, 12);
uint8x16_t bgr_data_loaded = vld1q_u8(temp_bgr_load);
uint8x16_t bgra_expanded = vqtbl1q_u8(bgr_data_loaded, neon_shuffle_table);
uint8x16_t bgra_final = vorrq_u8(bgra_expanded, alpha_channel_ff_neon);
vst1q_u8(reinterpret_cast<uint8_t*>(dest_row_pixel_ptr + current_dest_pixel_idx), bgra_final);
current_src_byte_offset += pixels_per_step * 3;
current_dest_pixel_idx += pixels_per_step;
num_pixels_to_process -= pixels_per_step;
}
#else
// This #else block ensures that if no SIMD path is taken (e.g. SSSE3/NEON not defined, or AVX2 defined but its specific block is empty and it's not grouped with SSSE3),
// the scalar loop below is the ONLY path for processing.
// The current structure with #if defined(__AVX2__) || defined(__SSSE3__) followed by #elif defined(__ARM_NEON)
// means this #else is for when NEITHER of those are true.
// If AVX2 is defined, it uses the SSSE3 block. If only NEON is defined, it uses NEON block.
// If none are defined, it falls to the scalar loop below.
#endif
// Scalar fallback for remaining pixels OR if no SIMD defined/executed above
for (size_t k_rem = 0; k_rem < num_pixels_to_process; ++k_rem) {
(dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->blue = *(src_row_bgr_ptr + current_src_byte_offset + k_rem * 3 + 0);
(dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->green = *(src_row_bgr_ptr + current_src_byte_offset + k_rem * 3 + 1);
(dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->red = *(src_row_bgr_ptr + current_src_byte_offset + k_rem * 3 + 2);
(dest_row_pixel_ptr + current_dest_pixel_idx + k_rem)->alpha = 255;
}
}

// Converts a Bitmap::File object (containing raw bitmap data and headers)
// into a Matrix::Matrix<Pixel> for easier pixel manipulation.
Matrix::Matrix<Pixel> CreateMatrixFromBitmap(Bitmap::File bitmapFile)
Expand Down Expand Up @@ -236,16 +305,16 @@ Matrix::Matrix<Pixel> CreateMatrixFromBitmap(Bitmap::File bitmapFile)
}
else if (bitmapFile.bitmapInfoHeader.biBitCount == 24) // For 24-bit bitmaps (BGR)
{
int k = 0; // Index for bitmapFile.bitmapData
for (int i = 0; i < imageMatrix.rows(); i++)
for (int j = 0; j < imageMatrix.cols(); j++)
{
imageMatrix[i][j].blue = bitmapFile.bitmapData[k];
imageMatrix[i][j].green = bitmapFile.bitmapData[k + 1];
imageMatrix[i][j].red = bitmapFile.bitmapData[k + 2];
imageMatrix[i][j].alpha = 0; // Default alpha to 0 (opaque) for 24-bit images.
k += 3; // Move to the next pixel (3 bytes)
}
// Calculate padding if necessary, though source data in bitmapFile.bitmapData should be packed according to BMP spec (rows padded to 4 bytes)
// However, we process pixel by pixel here from the linear bitmapData.
// The number of bytes per row in source data:
uint32_t src_bytes_per_row = (static_cast<uint32_t>(imageMatrix.cols()) * 3 + 3) & ~3u; // BMP rows are padded to 4 bytes for BGR

for (int i = 0; i < imageMatrix.rows(); i++) {
const uint8_t* src_row_bgr_ptr = bitmapFile.bitmapData.data() + (static_cast<size_t>(i) * src_bytes_per_row);
::Pixel* dest_row_pixel_ptr = &imageMatrix[i][0];
internal_convert_bgr_to_bgra_simd(src_row_bgr_ptr, dest_row_pixel_ptr, imageMatrix.cols());
}
}
// Note: Other bit depths (e.g., 1, 4, 8, 16-bit) would require more complex handling.

Expand Down
3 changes: 3 additions & 0 deletions src/bitmap/bitmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -417,4 +417,7 @@ Pixel ApplySepiaToPixel(Pixel pixel);

// Note: Box blur is applied over a region within ApplyBoxBlur,
// so it does not have a direct single-pixel helper function here.

// Helper function for BGR to BGRA conversion (primarily for testing).
// Converts num_pixels packed 3-byte B,G,R triplets read from src_bgr_data into
// num_pixels ::Pixel values written to dest_bgra_pixels; each output pixel's
// alpha is set to 255. The definition lives in bitmap.cpp and uses SSSE3/AVX2 or
// NEON intrinsics when those are enabled at compile time, with a scalar fallback.
void internal_convert_bgr_to_bgra_simd(const uint8_t* src_bgr_data, ::Pixel* dest_bgra_pixels, size_t num_pixels);
#endif
143 changes: 124 additions & 19 deletions src/format/bitmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include "../../src/bitmapfile/bitmap_file.h" // For BITMAPFILEHEADER, BITMAPINFOHEADER from external lib
#include "../../src/bitmap/bitmap.h" // For ::Pixel, ::CreateMatrixFromBitmap, ::CreateBitmapFromMatrix
#include "../../src/matrix/matrix.h" // For Matrix::Matrix
#include "../simd_utils.hpp" // Added include
#include "format_internal_helpers.hpp" // Added include for the new helpers

// Define constants for BMP format (can be used by Format::Internal helpers or if save needs them directly)
constexpr uint16_t BMP_MAGIC_TYPE_CONST = 0x4D42; // 'BM'
Expand Down Expand Up @@ -150,20 +152,72 @@ Result<Bitmap, BitmapError> load(std::span<const uint8_t> bmp_data) {
bmp_out.bpp = 32;
bmp_out.data.resize(static_cast<size_t>(bmp_out.w) * bmp_out.h * 4);

// The loop converting image_matrix to bmp_out.data
// bmp_out.data is already resized.
for (uint32_t y = 0; y < bmp_out.h; ++y) {
for (uint32_t x = 0; x < bmp_out.w; ++x) {
const ::Pixel& src_pixel = image_matrix.at(y, x); // Changed Get(x,y) to at(y,x)

size_t dest_idx = (static_cast<size_t>(y) * bmp_out.w + x) * 4;
bmp_out.data[dest_idx + 0] = src_pixel.red;
bmp_out.data[dest_idx + 1] = src_pixel.green;
bmp_out.data[dest_idx + 2] = src_pixel.blue;
bmp_out.data[dest_idx + 3] = src_pixel.alpha;
}
const ::Pixel* src_bgra_pixels_row = &image_matrix[y][0];
uint8_t* dest_rgba_data_row = bmp_out.data.data() + (static_cast<size_t>(y) * bmp_out.w * 4);
internal_swizzle_bgra_to_rgba_simd(src_bgra_pixels_row, dest_rgba_data_row, bmp_out.w);
}
return bmp_out;
}


// Helper function to convert an array of ::Pixel (BGRA order) to an array of uint8_t
// (RGBA order).
//
// Parameters:
//   src_bgra_pixels - first source pixel; num_pixels pixels are read.
//   dest_rgba_data  - destination buffer; num_pixels * 4 bytes are written as
//                     R,G,B,A per pixel.
//   num_pixels      - number of pixels to convert (0 is a no-op).
//
// The vector paths swap bytes 0 and 2 of every 4-byte group; the scalar loop copies
// the red/green/blue/alpha members by name. Both agree only if ::Pixel is stored as
// blue, green, red, alpha.
// NOTE(review): confirm that member order against the ::Pixel definition in bitmap.h.
void internal_swizzle_bgra_to_rgba_simd(const ::Pixel* src_bgra_pixels, uint8_t* dest_rgba_data, size_t num_pixels) {
    size_t current_pixel_idx = 0;
    size_t num_pixels_to_process = num_pixels;

#if defined(__AVX2__)
    static_assert(sizeof(::Pixel) == 4, "SIMD path loads exactly four bytes per ::Pixel");
    const size_t pixels_per_step = 8; // 8 pixels = 32 bytes
    // _mm256_shuffle_epi8 shuffles inside each 128-bit lane using only the low four
    // bits of every control byte, so the indices are lane-relative and the same
    // pattern is repeated for both halves.
    const __m256i shuffle_mask_bgra_to_rgba = _mm256_setr_epi8(
        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14, 13, 12, 15, // low lane: pixels 0-3
        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14, 13, 12, 15  // high lane: pixels 4-7
    );
    while (num_pixels_to_process >= pixels_per_step) {
        const __m256i bgra_pixels_loaded = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src_bgra_pixels + current_pixel_idx));
        const __m256i rgba_pixels = _mm256_shuffle_epi8(bgra_pixels_loaded, shuffle_mask_bgra_to_rgba);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest_rgba_data + current_pixel_idx * 4), rgba_pixels);
        current_pixel_idx += pixels_per_step;
        num_pixels_to_process -= pixels_per_step;
    }
#elif defined(__SSSE3__) // _mm_shuffle_epi8 requires SSSE3
    static_assert(sizeof(::Pixel) == 4, "SIMD path loads exactly four bytes per ::Pixel");
    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
    const __m128i shuffle_mask_bgra_to_rgba = _mm_setr_epi8(
        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14, 13, 12, 15
    );
    while (num_pixels_to_process >= pixels_per_step) {
        const __m128i bgra_pixels_loaded = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_bgra_pixels + current_pixel_idx));
        const __m128i rgba_pixels = _mm_shuffle_epi8(bgra_pixels_loaded, shuffle_mask_bgra_to_rgba);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_rgba_data + current_pixel_idx * 4), rgba_pixels);
        current_pixel_idx += pixels_per_step;
        num_pixels_to_process -= pixels_per_step;
    }
#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64))
    // vqtbl1q_u8 exists only on AArch64, so 32-bit ARM NEON builds use the scalar loop.
    static_assert(sizeof(::Pixel) == 4, "SIMD path loads exactly four bytes per ::Pixel");
    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes using uint8x16_t
    const uint8_t shuffle_coeffs_array[] = {2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15};
    const uint8x16_t neon_shuffle_mask = vld1q_u8(shuffle_coeffs_array);

    while (num_pixels_to_process >= pixels_per_step) {
        const uint8x16_t bgra_pixels_loaded = vld1q_u8(reinterpret_cast<const uint8_t*>(src_bgra_pixels + current_pixel_idx));
        const uint8x16_t rgba_pixels = vqtbl1q_u8(bgra_pixels_loaded, neon_shuffle_mask);
        vst1q_u8(dest_rgba_data + current_pixel_idx * 4, rgba_pixels);
        current_pixel_idx += pixels_per_step;
        num_pixels_to_process -= pixels_per_step;
    }
#endif

    // Scalar path: finishes the pixels the vector loop left over, or handles the
    // whole range when no SIMD path was compiled in.
    for (size_t i = 0; i < num_pixels_to_process; ++i) {
        const ::Pixel& src_pixel = src_bgra_pixels[current_pixel_idx + i];
        uint8_t* dest_pixel_ptr = dest_rgba_data + (current_pixel_idx + i) * 4;
        dest_pixel_ptr[0] = src_pixel.red;   // R
        dest_pixel_ptr[1] = src_pixel.green; // G
        dest_pixel_ptr[2] = src_pixel.blue;  // B
        dest_pixel_ptr[3] = src_pixel.alpha; // A
    }
}


// The NEW BmpTool::save function using the external library
Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_bmp_buffer) {
// 1. Input Validation from BmpTool::Bitmap
Expand All @@ -185,17 +239,12 @@ Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_b
// Assuming ::Pixel struct has members .red, .green, .blue, .alpha
Matrix::Matrix<::Pixel> image_matrix(bitmap_in.h, bitmap_in.w); // Changed order to (rows, cols)

// The loop converting bitmap_in.data to image_matrix
// image_matrix is already sized.
for (uint32_t y = 0; y < bitmap_in.h; ++y) {
for (uint32_t x = 0; x < bitmap_in.w; ++x) {
const uint8_t* src_pixel_ptr = &bitmap_in.data[(static_cast<size_t>(y) * bitmap_in.w + x) * 4]; // RGBA
::Pixel dest_pixel;
dest_pixel.red = src_pixel_ptr[0];
dest_pixel.green = src_pixel_ptr[1];
dest_pixel.blue = src_pixel_ptr[2];
dest_pixel.alpha = src_pixel_ptr[3];

image_matrix.at(y, x) = dest_pixel; // Changed Set(x,y) to at(y,x)
}
const uint8_t* src_rgba_data_row = &bitmap_in.data[(static_cast<size_t>(y) * bitmap_in.w * 4)];
::Pixel* dest_bgra_pixels_row = &image_matrix[y][0];
internal_swizzle_rgba_to_bgra_simd(src_rgba_data_row, dest_bgra_pixels_row, bitmap_in.w);
}

// 3. Convert Matrix<::Pixel> to ::Bitmap::File
Expand Down Expand Up @@ -255,4 +304,60 @@ Result<void, BitmapError> save(const Bitmap& bitmap_in, std::span<uint8_t> out_b
return BmpTool::Success{}; // This will implicitly convert to Result<void, BitmapError>(Success{})
}


// Helper function to convert an array of uint8_t (RGBA order) to an array of ::Pixel
// (BGRA order).
//
// Parameters:
//   src_rgba_data    - source buffer; num_pixels * 4 bytes are read as R,G,B,A per pixel.
//   dest_bgra_pixels - first destination pixel; num_pixels pixels are written.
//   num_pixels       - number of pixels to convert (0 is a no-op).
//
// The vector paths swap bytes 0 and 2 of every 4-byte group; the scalar loop assigns
// the red/green/blue/alpha members by name. Both agree only if ::Pixel is stored as
// blue, green, red, alpha.
// NOTE(review): confirm that member order against the ::Pixel definition in bitmap.h.
void internal_swizzle_rgba_to_bgra_simd(const uint8_t* src_rgba_data, ::Pixel* dest_bgra_pixels, size_t num_pixels) {
    size_t current_pixel_idx = 0;
    size_t num_pixels_to_process = num_pixels;

#if defined(__AVX2__)
    static_assert(sizeof(::Pixel) == 4, "SIMD path stores exactly four bytes per ::Pixel");
    const size_t pixels_per_step = 8; // 8 pixels = 32 bytes
    // _mm256_shuffle_epi8 shuffles inside each 128-bit lane using only the low four
    // bits of every control byte, so the indices are lane-relative and the same
    // pattern is repeated for both halves.
    const __m256i shuffle_mask_rgba_to_bgra = _mm256_setr_epi8(
        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14, 13, 12, 15, // low lane: pixels 0-3
        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14, 13, 12, 15  // high lane: pixels 4-7
    );
    while (num_pixels_to_process >= pixels_per_step) {
        const __m256i rgba_pixels_loaded = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src_rgba_data + current_pixel_idx * 4));
        const __m256i bgra_pixels = _mm256_shuffle_epi8(rgba_pixels_loaded, shuffle_mask_rgba_to_bgra);
        _mm256_storeu_si256(reinterpret_cast<__m256i*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
        current_pixel_idx += pixels_per_step;
        num_pixels_to_process -= pixels_per_step;
    }
#elif defined(__SSSE3__) // _mm_shuffle_epi8 requires SSSE3
    static_assert(sizeof(::Pixel) == 4, "SIMD path stores exactly four bytes per ::Pixel");
    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
    const __m128i shuffle_mask_rgba_to_bgra = _mm_setr_epi8(
        2, 1, 0, 3,  6, 5, 4, 7,  10, 9, 8, 11,  14, 13, 12, 15
    );
    while (num_pixels_to_process >= pixels_per_step) {
        const __m128i rgba_pixels_loaded = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src_rgba_data + current_pixel_idx * 4));
        const __m128i bgra_pixels = _mm_shuffle_epi8(rgba_pixels_loaded, shuffle_mask_rgba_to_bgra);
        _mm_storeu_si128(reinterpret_cast<__m128i*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
        current_pixel_idx += pixels_per_step;
        num_pixels_to_process -= pixels_per_step;
    }
#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && (defined(__aarch64__) || defined(_M_ARM64))
    // vqtbl1q_u8 exists only on AArch64, so 32-bit ARM NEON builds use the scalar loop.
    static_assert(sizeof(::Pixel) == 4, "SIMD path stores exactly four bytes per ::Pixel");
    const size_t pixels_per_step = 4; // 4 pixels = 16 bytes
    const uint8_t shuffle_coeffs_array[] = {2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15};
    const uint8x16_t neon_shuffle_mask = vld1q_u8(shuffle_coeffs_array);

    while (num_pixels_to_process >= pixels_per_step) {
        const uint8x16_t rgba_pixels_loaded = vld1q_u8(src_rgba_data + current_pixel_idx * 4);
        const uint8x16_t bgra_pixels = vqtbl1q_u8(rgba_pixels_loaded, neon_shuffle_mask);
        vst1q_u8(reinterpret_cast<uint8_t*>(dest_bgra_pixels + current_pixel_idx), bgra_pixels);
        current_pixel_idx += pixels_per_step;
        num_pixels_to_process -= pixels_per_step;
    }
#endif

    // Scalar path: finishes the pixels the vector loop left over, or handles the
    // whole range when no SIMD path was compiled in.
    for (size_t i = 0; i < num_pixels_to_process; ++i) {
        const uint8_t* src_pixel_ptr = src_rgba_data + (current_pixel_idx + i) * 4;
        ::Pixel& dest_pixel = dest_bgra_pixels[current_pixel_idx + i];

        dest_pixel.red = src_pixel_ptr[0];   // R
        dest_pixel.green = src_pixel_ptr[1]; // G
        dest_pixel.blue = src_pixel_ptr[2];  // B
        dest_pixel.alpha = src_pixel_ptr[3]; // A
    }
}

} // namespace BmpTool
12 changes: 12 additions & 0 deletions src/format/format_internal_helpers.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#pragma once

#include <cstddef> // For size_t
// Include bitmap.h for the definition of ::Pixel
#include "../bitmap/bitmap.h"

namespace BmpTool {

// Converts num_pixels ::Pixel values starting at src_bgra_pixels into
// num_pixels * 4 bytes at dest_rgba_data, written per pixel as red, green, blue,
// alpha. Defined in bitmap.cpp with AVX2 / SSSE3 / NEON paths and a scalar fallback.
void internal_swizzle_bgra_to_rgba_simd(const ::Pixel* src_bgra_pixels, uint8_t* dest_rgba_data, size_t num_pixels);
// Inverse of the above: reads num_pixels * 4 bytes (red, green, blue, alpha per
// pixel) from src_rgba_data and fills num_pixels ::Pixel values at dest_bgra_pixels.
void internal_swizzle_rgba_to_bgra_simd(const uint8_t* src_rgba_data, ::Pixel* dest_bgra_pixels, size_t num_pixels);

} // namespace BmpTool
17 changes: 17 additions & 0 deletions src/simd_utils.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

// Standard types used by is_aligned below (std::size_t, std::uintptr_t)
#include <cstddef>
#include <cstdint>

// For x86/x64 SIMD intrinsics
#if defined(__SSE2__) || defined(__AVX__) || defined(__AVX2__)
#include <immintrin.h> // Includes SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX, AVX2, FMA, etc.
#endif

// For ARM NEON intrinsics
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#endif

// Helper to check if a pointer is aligned to a certain byte boundary.
//
// Parameters:
//   ptr       - pointer whose numeric address is tested; it is never dereferenced.
//   alignment - boundary in bytes. Any non-zero value is accepted, not only powers
//               of two.
//
// Returns true when the address of ptr is an exact multiple of alignment.
// An alignment of 0 is not a valid boundary and returns false.
template <typename T>
inline bool is_aligned(const T* ptr, std::size_t alignment) {
    // Guard first: evaluating `address % 0` would be undefined behaviour.
    if (alignment == 0) {
        return false;
    }
    const auto address = reinterpret_cast<std::uintptr_t>(ptr);
    return address % alignment == 0;
}
Loading