From 979a251cff70f0db6a4c8be4168eee624438090c Mon Sep 17 00:00:00 2001 From: indevn Date: Tue, 27 May 2025 18:06:12 +0800 Subject: [PATCH 01/24] fix typo Signed-off-by: ZhouFANG --- README-cn.md | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README-cn.md b/README-cn.md index 5404fda..01ae8c6 100644 --- a/README-cn.md +++ b/README-cn.md @@ -85,7 +85,7 @@ cmake --build build-macos --target all #### 3. 运行示例应用程序 ```bash -./build/bin/system_test ../obj +./build/bin/system_test ./obj ``` --- diff --git a/README.md b/README.md index 95981fd..fab00dc 100755 --- a/README.md +++ b/README.md @@ -86,7 +86,7 @@ cmake --build build-macos --target all #### 3. Run the Example Application ```bash -./build/bin/system_test ../obj +./build/bin/system_test ./obj ``` --- From b9f2ae84177945d728c2a8a696f4c0537fbb29ea Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Mon, 4 Aug 2025 13:05:34 +0800 Subject: [PATCH 02/24] Implement perspective division and viewport transformation with perspective-correct interpolation Signed-off-by: ZhouFANG --- src/include/renderer.h | 15 ++++++++ src/rasterizer.cpp | 30 ++++++++++++---- src/renderer.cpp | 75 ++++++++++++++++++++++++++++++++++++--- src/shader.cpp | 13 ++++--- test/system_test/main.cpp | 9 +++-- 5 files changed, 123 insertions(+), 19 deletions(-) diff --git a/src/include/renderer.h b/src/include/renderer.h index bcc136f..456010b 100755 --- a/src/include/renderer.h +++ b/src/include/renderer.h @@ -69,6 +69,21 @@ class SimpleRenderer { */ void DrawModel(const Model &model, uint32_t *buffer); void DrawModelSlower(const Model &model, uint32_t *buffer); + + + /** + * 透视除法 - 将裁剪空间坐标转换为归一化设备坐标(NDC) + * @param vertex 裁剪空间坐标的顶点 + * @return 转换后的顶点(NDC坐标) + */ + Vertex PerspectiveDivision(const Vertex &vertex); + + /** + * 视口变换 - 将NDC坐标转换为屏幕坐标 + * @param vertex NDC坐标的顶点 + * @return 转换后的顶点(屏幕坐标) + */ + Vertex ViewportTransformation(const Vertex &vertex); }; } // namespace simple_renderer diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index 8bf2d34..0712f4a 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -46,18 +46,36 @@ std::vector Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1, if (!is_inside) { continue; } - // 计算该点的深度,通过重心坐标插值计算 + + // 透视矫正插值 + // 1. 获取三个顶点的1/w值 + float w0_inv = v0.GetPosition().w; + float w1_inv = v1.GetPosition().w; + float w2_inv = v2.GetPosition().w; + + // 2. 插值1/w + float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord); + + // 3. 计算透视矫正的重心坐标 + Vector3f corrected_bary( + barycentric_coord.x * w0_inv / w_inv_interpolated, + barycentric_coord.y * w1_inv / w_inv_interpolated, + barycentric_coord.z * w2_inv / w_inv_interpolated + ); + + // 4. 使用矫正的重心坐标进行插值 auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z, - v2.GetPosition().z, barycentric_coord); + v2.GetPosition().z, corrected_bary); + Fragment fragment; fragment.screen_coord = {x, y}; - fragment.normal = CalculateNormal(v0.GetPosition(), v1.GetPosition(), - v2.GetPosition()); + fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(), + v2.GetNormal(), corrected_bary); fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(), - v2.GetTexCoords(), barycentric_coord); + v2.GetTexCoords(), corrected_bary); fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(), - v2.GetColor(), barycentric_coord); + v2.GetColor(), corrected_bary); fragment.depth = z; local_fragments.push_back(fragment); diff --git a/src/renderer.cpp b/src/renderer.cpp index c7a5769..d433ff1 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -69,8 +69,16 @@ void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) { #pragma omp for for (const auto &v : model.GetVertices()) { - auto vertex = shader_->VertexShader(v); - processedVertices_per_thread.push_back(vertex); + // 顶点着色器:世界坐标 -> 裁剪坐标 + auto clipSpaceVertex = shader_->VertexShader(v); + + // 透视除法:裁剪坐标 -> NDC坐标 + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + + // 视口变换:NDC坐标 -> 屏幕坐标 + auto screenSpaceVertex = ViewportTransformation(ndcVertex); + + processedVertices_per_thread.push_back(screenSpaceVertex); } } @@ -192,8 +200,16 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) { #pragma omp for for (const auto &v : model.GetVertices()) { /* * * Vertex Shader * * */ - auto vertex = shader_->VertexShader(v); - local_vertices.push_back(vertex); + // 顶点着色器:世界坐标 -> 裁剪坐标 + auto clipSpaceVertex = shader_->VertexShader(v); + + // 透视除法:裁剪坐标 -> NDC坐标 + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + + // 视口变换:NDC坐标 -> 屏幕坐标 + auto screenSpaceVertex = ViewportTransformation(ndcVertex); + + local_vertices.push_back(screenSpaceVertex); } } @@ -274,4 +290,55 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) { /* * * * * * * */ } +Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { + Vector4f position = vertex.GetPosition(); + + // 检查w分量,避免除零和负数问题 + if (position.w <= 1e-6f) { + SPDLOG_DEBUG("PerspectiveDivision: w <= 1e-6f"); + Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f); + return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); + } + + // 保存原始w分量用于透视矫正插值 + float original_w = position.w; + + // 执行透视除法:(x, y, z, w) -> (x/w, y/w, z/w, 1/w) + Vector4f ndcPosition( + position.x / position.w, // x_ndc = x_clip / w_clip + position.y / position.w, // y_ndc = y_clip / w_clip + position.z / position.w, // z_ndc = z_clip / w_clip + 1.0f / original_w // 保存1/w用于透视矫正插值 + ); + + // 严格限制NDC坐标在标准范围内 + ndcPosition.x = std::clamp(ndcPosition.x, -1.0f, 1.0f); + ndcPosition.y = std::clamp(ndcPosition.y, -1.0f, 1.0f); + ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f); + + // 创建新的顶点,保持其他属性不变 + return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); +} + +Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) { + Vector4f ndcPosition = vertex.GetPosition(); + + // 视口变换:将NDC坐标[-1,1]转换为屏幕坐标[0,width]x[0,height] + float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f; + float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f; + + // 额外的屏幕坐标边界保护 + screen_x = std::clamp(screen_x, 0.0f, static_cast(width_ - 1)); + screen_y = std::clamp(screen_y, 0.0f, static_cast(height_ - 1)); + + Vector4f screenPosition( + screen_x, // x: 屏幕坐标 + screen_y, // y: 屏幕坐标 + ndcPosition.z, // z: NDC坐标用于深度测试 + ndcPosition.w // w: 保持1/w用于透视矫正插值 + ); + + return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); +} + } // namespace simple_renderer diff --git a/src/shader.cpp b/src/shader.cpp index 3438627..087cca5 100644 --- a/src/shader.cpp +++ b/src/shader.cpp @@ -9,12 +9,17 @@ Vertex Shader::VertexShader(const Vertex& vertex) { uniformbuffer_.GetUniform("projectionMatrix"); Matrix4f mvp_matrix = projection_matrix * view_matrix * model_matrix; - // auto normal_matrix = model_matrix.inverse().transpose(); + + Matrix3f normal_matrix = glm::transpose(glm::inverse(Matrix3f(model_matrix))); + Vector3f transformed_normal = normal_matrix * vertex.GetNormal(); - sharedDataInShader_.fragPos_varying = - Vector3f(model_matrix * vertex.GetPosition()); + sharedDataInShader_.fragPos_varying = Vector3f(model_matrix * vertex.GetPosition()); - return mvp_matrix * vertex; + // 返回变换后的顶点(包含变换后的法向量) + return Vertex(mvp_matrix * vertex.GetPosition(), + transformed_normal, + vertex.GetTexCoords(), + vertex.GetColor()); } Color Shader::FragmentShader(const Fragment& fragment) const { diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp index f75b29c..a844aa7 100755 --- a/test/system_test/main.cpp +++ b/test/system_test/main.cpp @@ -58,15 +58,14 @@ int main(int argc, char **argv) { auto modelMatrix = simple_renderer::Matrix4f(1.0f); simple_renderer::Matrix4f scale_matrix = glm::scale(simple_renderer::Matrix4f(1.0f), - simple_renderer::Vector3f(7.0f, 7.0f, 7.0f)); + simple_renderer::Vector3f(.02f, .02f, .02f)); - // Translation matrix simple_renderer::Matrix4f translation_matrix = glm::translate(simple_renderer::Matrix4f(1.0f), - simple_renderer::Vector3f(30.0f, 30.0f, 0.0f)); + simple_renderer::Vector3f(0.0f, -5.0f, 0.0f)); simple_renderer::Matrix4f rotation_matrix = - glm::rotate(simple_renderer::Matrix4f(1.0f), 90.0f, + glm::rotate(simple_renderer::Matrix4f(1.0f), glm::radians(-105.0f), simple_renderer::Vector3f(1.0f, 0.0f, 0.0f)); // Combined transformation matrix @@ -90,7 +89,7 @@ int main(int argc, char **argv) { shader.SetUniform("cameraPos", camera.GetPosition()); shader.SetUniform("viewMatrix", camera.GetViewMatrix()); shader.SetUniform("projectionMatrix", - camera.GetProjectionMatrix(60.0f, 1.0f, 0.1f, 100.0f)); + camera.GetProjectionMatrix(60.0f, float(kWidth)/float(kHeight), 0.1f, 100.0f)); buffer.ClearDrawBuffer(simple_renderer::Color::kBlack); for (auto &model : models) { From 7093d822d7c908d9989be838ac2d03fc15f8cd8d Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Thu, 17 Jul 2025 18:28:58 +0800 Subject: [PATCH 03/24] implement tile-based rasterizer and refractor the pipeline to support multi-rendering-mode Signed-off-by: ZhouFANG --- src/include/renderer.h | 148 ++++++- src/rasterizer.cpp | 2 +- src/renderer.cpp | 818 +++++++++++++++++++++++++++++++------- test/system_test/main.cpp | 30 +- 4 files changed, 851 insertions(+), 147 deletions(-) diff --git a/src/include/renderer.h b/src/include/renderer.h index 456010b..2464c19 100755 --- a/src/include/renderer.h +++ b/src/include/renderer.h @@ -31,6 +31,27 @@ namespace simple_renderer { +// 渲染模式枚举 +enum class RenderingMode { + TRADITIONAL, // 传统光栅化模式 - 立即深度测试 + TILE_BASED, // Tile-based光栅化模式 - 移动GPU架构 + DEFERRED // 延迟渲染模式 - 经典GPU管线教学模拟 +}; + +// Face 只包含顶点索引,不包含实际的顶点数据; +// Vertex 包含3D坐标,但没有屏幕坐标 +// Fragment 包含屏幕坐标,但它是光栅化的结果,不是输入 +struct TriangleInfo { + Vertex v0, v1, v2; + const Material *material; + size_t face_index; + TriangleInfo(const Vertex& vertex0, const Vertex& vertex1, const Vertex& vertex2, + const Material* mat, size_t face_idx = 0) + : v0(vertex0), v1(vertex1), v2(vertex2), material(mat), face_index(face_idx) {} + + TriangleInfo() = default; +}; + class SimpleRenderer { public: /** @@ -53,22 +74,122 @@ class SimpleRenderer { virtual ~SimpleRenderer() = default; /// @} - bool Render(const Model &model, const Shader &shader, uint32_t *buffer); + /** + * 绘制单个模型 + * @param model 要绘制的模型 + * @param shader 用于渲染的着色器 + * @param buffer 输出缓冲区 + * @return 绘制是否成功 + */ + bool DrawModel(const Model &model, const Shader &shader, uint32_t *buffer); + + /** + * 设置渲染模式 + * @param mode 渲染模式(传统或基于Tile) + */ + void SetRenderingMode(RenderingMode mode); + + /** + * 获取当前渲染模式 + * @return 当前渲染模式 + */ + RenderingMode GetRenderingMode() const; private: const size_t height_; const size_t width_; LogSystem log_system_; + RenderingMode current_mode_; // 当前渲染模式 std::shared_ptr shader_; std::shared_ptr rasterizer_; /** - * 绘制模型 + * 执行绘制管线 * @param model 模型 + * @param buffer 输出缓冲区 */ - void DrawModel(const Model &model, uint32_t *buffer); - void DrawModelSlower(const Model &model, uint32_t *buffer); + void ExecuteDrawPipeline(const Model &model, uint32_t *buffer); + + + /** + * 传统光栅化渲染 + * @param model 模型 + * @param processedVertices 已处理的顶点 + * @param buffer 输出缓冲区 + * @return 渲染统计信息 + */ + struct RenderStats { + double buffer_alloc_ms; + double rasterization_ms; + double merge_ms; + double total_ms; + }; + + RenderStats ExecuteTraditionalPipeline(const Model &model, + const std::vector &processedVertices, + uint32_t *buffer); + + /** + * Tile-based光栅化渲染 + * @param model 模型 + * @param processedVertices 已处理的顶点 + * @param buffer 输出缓冲区 + * @return 渲染统计信息 + */ + struct TileRenderStats { + double setup_ms; + double binning_ms; + double buffer_alloc_ms; + double rasterization_ms; + double merge_ms; + double visualization_ms; + double total_ms; + }; + + /** + * 延迟渲染统计信息 + */ + struct DeferredRenderStats { + double buffer_alloc_ms; + double rasterization_ms; + double fragment_collection_ms; + double fragment_merge_ms; + double deferred_shading_ms; + double total_ms; + }; + + TileRenderStats ExecuteTileBasedPipeline(const Model &model, + const std::vector &processedVertices, + uint32_t *buffer); + + /** + * 延迟渲染管线 + * @param model 模型 + * @param processedVertices 已处理的顶点 + * @param buffer 输出缓冲区 + * @return 渲染统计信息 + */ + DeferredRenderStats ExecuteDeferredPipeline(const Model &model, + const std::vector &processedVertices, + uint32_t *buffer); + + +private: + + void TriangleTileBinning( + const Model &model, + const std::vector &screenVertices, + std::vector> &tile_triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size); + + void RasterizeTile( + size_t tile_id, + const std::vector &triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size, + float* tile_depth_buffer, uint32_t* tile_color_buffer, + std::unique_ptr &global_depth_buffer, + std::unique_ptr &global_color_buffer); /** @@ -84,6 +205,25 @@ class SimpleRenderer { * @return 转换后的顶点(屏幕坐标) */ Vertex ViewportTransformation(const Vertex &vertex); + /** + * Tile可视化调试函数 - 在渲染结果上绘制tile网格和状态 + * @param buffer 渲染结果缓冲区 + * @param tile_triangles 每个tile包含的三角形列表 + * @param tiles_x X方向tile数量 + * @param tiles_y Y方向tile数量 + * @param tile_size 单个tile的像素大小 + */ + void DrawTileVisualization(uint32_t* buffer, + const std::vector>& tile_triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size); + + /** + * 颜色混合函数 - 用于半透明效果 + * @param base 基础颜色 + * @param overlay 叠加颜色(包含alpha通道) + * @return 混合后的颜色 + */ + uint32_t BlendColors(uint32_t base, uint32_t overlay); }; } // namespace simple_renderer diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index 0712f4a..7a8c602 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -116,7 +116,7 @@ std::pair Rasterizer::GetBarycentricCoord(const Vector3f& p0, return std::pair{true, Vector3f(x, y, z)}; } - + template T Rasterizer::Interpolate(const T& v0, const T& v1, const T& v2, const Vector3f& barycentric_coord) { diff --git a/src/renderer.cpp b/src/renderer.cpp index d433ff1..34866c2 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -35,18 +36,40 @@ namespace simple_renderer { SimpleRenderer::SimpleRenderer(size_t width, size_t height) : height_(height), width_(width), - log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)) { + log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)), + current_mode_(RenderingMode::TRADITIONAL) { // 默认使用传统渲染模式 rasterizer_ = std::make_shared(width, height); } -bool SimpleRenderer::Render(const Model &model, const Shader &shader, - uint32_t *buffer) { - SPDLOG_INFO("render model: {}", model.GetModelPath()); +bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, + uint32_t *buffer) { + SPDLOG_INFO("draw model: {}", model.GetModelPath()); shader_ = std::make_shared(shader); - DrawModel(model, buffer); + ExecuteDrawPipeline(model, buffer); return true; } +void SimpleRenderer::SetRenderingMode(RenderingMode mode) { + current_mode_ = mode; + std::string mode_name; + switch(mode) { + case RenderingMode::TRADITIONAL: + mode_name = "TRADITIONAL"; + break; + case RenderingMode::TILE_BASED: + mode_name = "TILE_BASED"; + break; + case RenderingMode::DEFERRED: + mode_name = "DEFERRED"; + break; + } + SPDLOG_INFO("rendering mode set to: {}", mode_name); +} + +RenderingMode SimpleRenderer::GetRenderingMode() const { + return current_mode_; +} + /* Optimizes performance by performing depth testing during rasterization, keeping only the closest fragment per pixel, and avoiding storing all @@ -54,10 +77,31 @@ fragments—resulting in faster rendering. 通过在光栅化过程中执行深度测试,仅保留每个像素的深度值最近的片段,避免存储所有片段,从而优化性能,实现更快的渲染。 */ -void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) { - SPDLOG_INFO("draw {}", model.GetModelPath()); +void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { + std::string mode_name; + switch(current_mode_) { + case RenderingMode::TRADITIONAL: + mode_name = "TRADITIONAL"; + break; + case RenderingMode::TILE_BASED: + mode_name = "TILE_BASED"; + break; + case RenderingMode::DEFERRED: + mode_name = "DEFERRED"; + break; + } + SPDLOG_INFO("execute draw pipeline for {} using {} mode", model.GetModelPath(), mode_name); + + if (!shader_) { + SPDLOG_ERROR("No shader set for DrawModel, cannot render"); + return; + } + + // === PERFORMANCE TIMING === + auto total_start_time = std::chrono::high_resolution_clock::now(); /* * * Vertex Shader * * */ + auto vertex_shader_start_time = std::chrono::high_resolution_clock::now(); std::vector processedVertices; std::vector> processed_vertices_all_thread(kNProc); #pragma omp parallel num_threads(kNProc) default(none) \ @@ -88,93 +132,64 @@ void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) { processedVertices_per_thread.begin(), processedVertices_per_thread.end()); } - /* * * * * * * */ - - /* * * Rasterization * * */ - std::vector> depthBuffer_all_thread(kNProc); - std::vector> colorBuffer_all_thread(kNProc); - - for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { - depthBuffer_all_thread[thread_id] = - std::make_unique(width_ * height_); - colorBuffer_all_thread[thread_id] = - std::make_unique(width_ * height_); - - std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_, - std::numeric_limits::infinity()); - std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0); - } - -#pragma omp parallel num_threads(kNProc) default(none) \ - shared(processedVertices, rasterizer_, shader_, width_, height_, \ - depthBuffer_all_thread, colorBuffer_all_thread) \ - firstprivate(model) - { - int thread_id = omp_get_thread_num(); - auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; - auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id]; -#pragma omp for - for (const auto &f : model.GetFaces()) { - auto v0 = processedVertices[f.GetIndex(0)]; - auto v1 = processedVertices[f.GetIndex(1)]; - auto v2 = processedVertices[f.GetIndex(2)]; - - const Material *material = &f.GetMaterial(); - - auto fragments = rasterizer_->Rasterize(v0, v1, v2); - - for (auto &fragment : fragments) { - fragment.material = material; - - size_t x = fragment.screen_coord[0]; - size_t y = fragment.screen_coord[1]; - - if (x >= width_ || y >= height_) { - continue; - } - - size_t index = x + y * width_; - - if (fragment.depth < depthBuffer_per_thread[index]) { - depthBuffer_per_thread[index] = fragment.depth; + auto vertex_shader_end_time = std::chrono::high_resolution_clock::now(); + auto vertex_shader_duration = std::chrono::duration_cast( + vertex_shader_end_time - vertex_shader_start_time); - /* * * Fragment Shader * * */ - auto color = shader_->FragmentShader(fragment); - colorBuffer_per_thread[index] = uint32_t(color); - } - } + // 根据当前设置的模式选择不同的渲染管线 + double vertex_ms = vertex_shader_duration.count() / 1000.0; + + switch (current_mode_) { + case RenderingMode::TRADITIONAL: { + auto stats = ExecuteTraditionalPipeline(model, processedVertices, buffer); + double total_ms = vertex_ms + stats.total_ms; + + SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ==="); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); + SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); + SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); + SPDLOG_INFO("Total: {:8.3f} ms", total_ms); + SPDLOG_INFO("=========================================="); + break; } - } - - // Merge - std::unique_ptr depthBuffer = - std::make_unique(width_ * height_); - std::unique_ptr colorBuffer = - std::make_unique(width_ * height_); - - std::fill_n(depthBuffer.get(), width_ * height_, - std::numeric_limits::infinity()); - std::fill_n(colorBuffer.get(), width_ * height_, 0); - -#pragma omp parallel for - for (size_t i = 0; i < width_ * height_; i++) { - float min_depth = std::numeric_limits::infinity(); - uint32_t color = 0; - - for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { - float depth = depthBuffer_all_thread[thread_id][i]; - if (depth < min_depth) { - min_depth = depth; - color = colorBuffer_all_thread[thread_id][i]; - } + + case RenderingMode::TILE_BASED: { + auto stats = ExecuteTileBasedPipeline(model, processedVertices, buffer); + double total_ms = vertex_ms + stats.total_ms; + + SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ==="); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); + SPDLOG_INFO("Setup: {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100); + SPDLOG_INFO("Binning: {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); + SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); + SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); + SPDLOG_INFO("Visualization: {:8.3f} ms ({:5.1f}%)", stats.visualization_ms, stats.visualization_ms/total_ms*100); + SPDLOG_INFO("Total: {:8.3f} ms", total_ms); + SPDLOG_INFO("=========================================="); + break; + } + + case RenderingMode::DEFERRED: { + auto stats = ExecuteDeferredPipeline(model, processedVertices, buffer); + double total_ms = vertex_ms + stats.total_ms; + + SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ==="); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); + SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); + SPDLOG_INFO("Fragment Collection: {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100); + SPDLOG_INFO("Fragment Merge: {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100); + SPDLOG_INFO("Deferred Shading: {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100); + SPDLOG_INFO("Total: {:8.3f} ms", total_ms); + SPDLOG_INFO("========================================="); + break; } - depthBuffer[i] = min_depth; - colorBuffer[i] = color; } - - std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); } + /* Organizes processing to simulate how OpenGL works with GPUs by collecting all fragments per pixel before processing, closely mimicking the GPU pipeline but @@ -182,61 +197,46 @@ leading to increased memory usage and slower performance. 组织处理方式模拟 OpenGL 在 GPU 上的工作原理,先收集每个像素的所有片段再并行处理屏幕上的每个像素,模仿 GPU -管线,但导致内存使用增加和渲染速度变慢 -*/ -void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) { - SPDLOG_INFO("draw {}", model.GetModelPath()); - - /* * * Vertex Shader * * */ - std::vector processedVertex; - std::vector> processed_vertices_per_thread(kNProc); -#pragma omp parallel num_threads(kNProc) default(none) \ - shared(shader_, processed_vertices_per_thread) firstprivate(model) - { - int thread_id = omp_get_thread_num(); - std::vector &local_vertices = - processed_vertices_per_thread[thread_id]; - -#pragma omp for - for (const auto &v : model.GetVertices()) { - /* * * Vertex Shader * * */ - // 顶点着色器:世界坐标 -> 裁剪坐标 - auto clipSpaceVertex = shader_->VertexShader(v); - - // 透视除法:裁剪坐标 -> NDC坐标 - auto ndcVertex = PerspectiveDivision(clipSpaceVertex); - - // 视口变换:NDC坐标 -> 屏幕坐标 - auto screenSpaceVertex = ViewportTransformation(ndcVertex); - - local_vertices.push_back(screenSpaceVertex); - } - } +管线,但导致内存使用增加和渲染速度变慢。 - for (const auto &local_vertices : processed_vertices_per_thread) { - processedVertex.insert(processedVertex.end(), local_vertices.begin(), - local_vertices.end()); - } +现在作为延迟渲染管线的一部分,用于教学演示经典GPU管线概念。 +*/ +SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( + const Model &model, + const std::vector &processedVertices, + uint32_t *buffer) { + + DeferredRenderStats stats; + SPDLOG_INFO("execute deferred pipeline for {}", model.GetModelPath()); /* * * * * * * */ /* * * Rasterization * * */ std::vector>> fragmentsBuffer_all_thread( kNProc, std::vector>(width_ * height_)); + // 预先缓存所有Material数据,避免指针悬垂问题 + std::vector material_cache; + material_cache.reserve(model.GetFaces().size()); + for (const auto &f : model.GetFaces()) { + material_cache.push_back(f.GetMaterial()); // 值拷贝 + } + SPDLOG_INFO("cached {} materials for deferred rendering", material_cache.size()); + #pragma omp parallel num_threads(kNProc) default(none) \ - shared(processedVertex, fragmentsBuffer_all_thread, rasterizer_, width_, \ - height_) firstprivate(model) + shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \ + height_, material_cache) firstprivate(model) { int thread_id = omp_get_thread_num(); auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id]; #pragma omp for - for (const auto &f : model.GetFaces()) { - auto v0 = processedVertex[f.GetIndex(0)]; - auto v1 = processedVertex[f.GetIndex(1)]; - auto v2 = processedVertex[f.GetIndex(2)]; + for (size_t face_idx = 0; face_idx < model.GetFaces().size(); ++face_idx) { + const auto &f = model.GetFaces()[face_idx]; + auto v0 = processedVertices[f.GetIndex(0)]; + auto v1 = processedVertices[f.GetIndex(1)]; + auto v2 = processedVertices[f.GetIndex(2)]; - const Material *material = &f.GetMaterial(); + const Material *material = &material_cache[face_idx]; // 使用缓存的Material auto fragments = rasterizer_->Rasterize(v0, v1, v2); @@ -268,7 +268,7 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) { /* * * * * * * */ /* * * Fragment Shader * * */ -#pragma omp parallel for +// #pragma omp parallel for for (size_t i = 0; i < fragmentsBuffer.size(); i++) { const auto &fragments = fragmentsBuffer[i]; if (fragments.empty()) { @@ -283,11 +283,26 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) { } if (renderFragment) { + // 添加Material指针有效性检查 + if (renderFragment->material == nullptr) { + SPDLOG_ERROR("Fragment material is nullptr at pixel {}", i); + continue; + } auto color = shader_->FragmentShader(*renderFragment); buffer[i] = uint32_t(color); } } /* * * * * * * */ + + // 填充基本统计信息(延迟渲染模式主要用于教学演示) + stats.buffer_alloc_ms = 0.0; + stats.rasterization_ms = 0.0; + stats.fragment_collection_ms = 0.0; + stats.fragment_merge_ms = 0.0; + stats.deferred_shading_ms = 0.0; + stats.total_ms = 0.0; + + return stats; } Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { @@ -295,7 +310,6 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { // 检查w分量,避免除零和负数问题 if (position.w <= 1e-6f) { - SPDLOG_DEBUG("PerspectiveDivision: w <= 1e-6f"); Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f); return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); } @@ -311,9 +325,8 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { 1.0f / original_w // 保存1/w用于透视矫正插值 ); - // 严格限制NDC坐标在标准范围内 - ndcPosition.x = std::clamp(ndcPosition.x, -1.0f, 1.0f); - ndcPosition.y = std::clamp(ndcPosition.y, -1.0f, 1.0f); + // 只对Z坐标进行深度范围限制,X和Y允许超出以支持屏幕外三角形 + // 这些坐标在后续的视口变换和裁剪阶段会被正确处理 ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f); // 创建新的顶点,保持其他属性不变 @@ -327,10 +340,6 @@ Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) { float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f; float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f; - // 额外的屏幕坐标边界保护 - screen_x = std::clamp(screen_x, 0.0f, static_cast(width_ - 1)); - screen_y = std::clamp(screen_y, 0.0f, static_cast(height_ - 1)); - Vector4f screenPosition( screen_x, // x: 屏幕坐标 screen_y, // y: 屏幕坐标 @@ -341,4 +350,537 @@ Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) { return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); } + + + +// Triangle-Tile binning函数 - 修正版本 +void SimpleRenderer::TriangleTileBinning( + const Model &model, + const std::vector &screenVertices, + std::vector> &tile_triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size) { + + size_t total_triangles = model.GetFaces().size(); + size_t processed_triangles = 0; + size_t clipped_triangles = 0; + size_t triangles_with_clipped_vertices = 0; + + SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles); + SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", + width_, height_, tile_size, tiles_x, tiles_y); + + for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) { + const auto &f = model.GetFaces()[tri_idx]; + auto v0 = screenVertices[f.GetIndex(0)]; + auto v1 = screenVertices[f.GetIndex(1)]; + auto v2 = screenVertices[f.GetIndex(2)]; + + // 获取屏幕空间坐标(现在已经是屏幕坐标了) + Vector4f pos0 = v0.GetPosition(); + Vector4f pos1 = v1.GetPosition(); + Vector4f pos2 = v2.GetPosition(); + + // 检查三角形是否有被裁剪的顶点(坐标为-1000的表示被裁剪) + bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f); + + if (has_clipped_vertex) { + triangles_with_clipped_vertices++; + if (triangles_with_clipped_vertices <= 3) { + SPDLOG_INFO("Triangle {} has clipped vertices:", tri_idx); + SPDLOG_INFO(" V0: ({:.1f},{:.1f}) V1: ({:.1f},{:.1f}) V2: ({:.1f},{:.1f})", + pos0.x, pos0.y, pos1.x, pos1.y, pos2.x, pos2.y); + } + continue; + } + + // 直接使用屏幕空间坐标 + float screen_x0 = pos0.x; + float screen_y0 = pos0.y; + float screen_x1 = pos1.x; + float screen_y1 = pos1.y; + float screen_x2 = pos2.x; + float screen_y2 = pos2.y; + + // 计算bounding box + float min_x = std::min({screen_x0, screen_x1, screen_x2}); + float max_x = std::max({screen_x0, screen_x1, screen_x2}); + float min_y = std::min({screen_y0, screen_y1, screen_y2}); + float max_y = std::max({screen_y0, screen_y1, screen_y2}); + + // 调试前几个有效三角形的坐标范围 + if (processed_triangles < 3) { + SPDLOG_INFO("Triangle {} coordinates:", tri_idx); + SPDLOG_INFO(" Screen coords: ({:.1f},{:.1f}) ({:.1f},{:.1f}) ({:.1f},{:.1f})", + screen_x0, screen_y0, screen_x1, screen_y1, screen_x2, screen_y2); + SPDLOG_INFO(" BBox: min({:.1f},{:.1f}) max({:.1f},{:.1f})", + min_x, min_y, max_x, max_y); + } + + // 临时:大幅放宽屏幕边界检查,让超出屏幕的三角形也能处理 + if (max_x < -5000.0f || min_x >= width_ + 5000.0f || + max_y < -5000.0f || min_y >= height_ + 5000.0f) { + clipped_triangles++; + if (processed_triangles < 3) { + SPDLOG_INFO(" -> CLIPPED by screen bounds"); + } + continue; + } + + // 计算影响的tile范围 + int start_tile_x = std::max(0, static_cast(min_x) / static_cast(tile_size)); + int end_tile_x = std::min(static_cast(tiles_x - 1), + static_cast(max_x) / static_cast(tile_size)); + int start_tile_y = std::max(0, static_cast(min_y) / static_cast(tile_size)); + int end_tile_y = std::min(static_cast(tiles_y - 1), + static_cast(max_y) / static_cast(tile_size)); + + // 添加三角形到相关tiles(多个三角形可能会映射到同一个tile当中,所以谨慎并行化) + if (start_tile_x <= end_tile_x && start_tile_y <= end_tile_y) { + TriangleInfo triangle_info = {v0, v1, v2, &f.GetMaterial(), processed_triangles}; + + for (int ty = start_tile_y; ty <= end_tile_y; ty++) { + for (int tx = start_tile_x; tx <= end_tile_x; tx++) { + size_t tile_id = ty * tiles_x + tx; + tile_triangles[tile_id].push_back(triangle_info); // 可能多个线程同时pushback的话有风险 + } + } + processed_triangles++; + + // 输出前几个成功添加的三角形信息 + if (processed_triangles <= 3) { + SPDLOG_INFO(" -> SUCCESSFULLY ADDED to tiles x[{}..{}] y[{}..{}]", + start_tile_x, end_tile_x, start_tile_y, end_tile_y); + } + } else { + if (processed_triangles < 3) { + SPDLOG_INFO(" -> FAILED tile calculation: x[{}..{}] y[{}..{}]", + start_tile_x, end_tile_x, start_tile_y, end_tile_y); + } + } + } + + // 输出统计信息 + SPDLOG_INFO("Triangle-Tile binning completed:"); + SPDLOG_INFO(" Total triangles: {}", total_triangles); + SPDLOG_INFO(" Triangles with clipped vertices: {}", triangles_with_clipped_vertices); + SPDLOG_INFO(" Processed triangles: {}", processed_triangles); + SPDLOG_INFO(" Clipped by screen bounds: {}", clipped_triangles); + + size_t total_triangle_refs = 0; + size_t non_empty_tiles = 0; + for (const auto& tile : tile_triangles) { + total_triangle_refs += tile.size(); + if (!tile.empty()) non_empty_tiles++; + } + + SPDLOG_INFO(" Total triangle references: {}", total_triangle_refs); + SPDLOG_INFO(" Non-empty tiles: {}", non_empty_tiles); + SPDLOG_INFO(" Average triangles per tile: {:.2f}", + total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f); +} + +// 单个tile光栅化函数 +void SimpleRenderer::RasterizeTile( + size_t tile_id, + const std::vector &triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size, + float* tile_depth_buffer, uint32_t* tile_color_buffer, + std::unique_ptr &global_depth_buffer, + std::unique_ptr &global_color_buffer) { + // 计算tile在屏幕空间的范围 + size_t tile_x = tile_id % tiles_x; + size_t tile_y = tile_id / tiles_x; + size_t screen_x_start = tile_x * tile_size; + size_t screen_y_start = tile_y * tile_size; + size_t screen_x_end = std::min(screen_x_start + tile_size, width_); + size_t screen_y_end = std::min(screen_y_start + tile_size, height_); + + // 初始化tile缓冲区 + size_t tile_width = screen_x_end - screen_x_start; + size_t tile_height = screen_y_end - screen_y_start; + std::fill_n(tile_depth_buffer, tile_width * tile_height, + std::numeric_limits::infinity()); + std::fill_n(tile_color_buffer, tile_width * tile_height, 0); + + // 在tile内光栅化所有三角形 + for (const auto &triangle : triangles) { + auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2); + + for (auto &fragment : fragments) { + fragment.material = triangle.material; + + size_t screen_x = fragment.screen_coord[0]; + size_t screen_y = fragment.screen_coord[1]; + + // 检查fragment是否在当前tile内 + if (screen_x >= screen_x_start && screen_x < screen_x_end && + screen_y >= screen_y_start && screen_y < screen_y_end) { + + size_t tile_local_x = screen_x - screen_x_start; + size_t tile_local_y = screen_y - screen_y_start; + size_t tile_index = tile_local_x + tile_local_y * tile_width; + + // tile内深度测试 + if (fragment.depth < tile_depth_buffer[tile_index]) { + tile_depth_buffer[tile_index] = fragment.depth; + + auto color = shader_->FragmentShader(fragment); + tile_color_buffer[tile_index] = uint32_t(color); + } + } + } + } + + // 将tile结果写入全局缓冲区 + for (size_t y = 0; y < tile_height; y++) { + for (size_t x = 0; x < tile_width; x++) { + size_t tile_index = x + y * tile_width; + size_t global_index = (screen_x_start + x) + (screen_y_start + y) * width_; + + if (tile_depth_buffer[tile_index] < global_depth_buffer[global_index]) { + global_depth_buffer[global_index] = tile_depth_buffer[tile_index]; + global_color_buffer[global_index] = tile_color_buffer[tile_index]; + } + } + } +} + +// Tile可视化调试函数,这里用于固定大小的tiles +void SimpleRenderer::DrawTileVisualization(uint32_t* buffer, + const std::vector>& tile_triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size) { + + SPDLOG_INFO("=== TILE VISUALIZATION DEBUG ==="); + SPDLOG_INFO("Drawing tile grid overlay for debugging"); + + // 颜色定义 (ABGR格式) + const uint32_t GRID_COLOR = 0xFF00FF00; // 绿色网格线 + const uint32_t NONEMPTY_COLOR = 0x4000FFFF; // 半透明黄色背景 (非空tile) + const uint32_t EMPTY_COLOR = 0x20FF0000; // 半透明蓝色背景 (空tile) + + // 1. 为非空tiles添加背景色 + for (size_t tile_y = 0; tile_y < tiles_y; tile_y++) { + for (size_t tile_x = 0; tile_x < tiles_x; tile_x++) { + size_t tile_id = tile_y * tiles_x + tile_x; + bool is_empty = tile_triangles[tile_id].empty(); + + // 计算tile在屏幕上的像素范围 + size_t pixel_start_x = tile_x * tile_size; + size_t pixel_end_x = std::min(pixel_start_x + tile_size, static_cast(width_)); + size_t pixel_start_y = tile_y * tile_size; + size_t pixel_end_y = std::min(pixel_start_y + tile_size, static_cast(height_)); + + uint32_t bg_color = is_empty ? EMPTY_COLOR : NONEMPTY_COLOR; + + // 给tile添加半透明背景 + for (size_t y = pixel_start_y; y < pixel_end_y; y++) { + for (size_t x = pixel_start_x; x < pixel_end_x; x++) { + size_t pixel_idx = y * static_cast(width_) + x; + // 简单的alpha混合:将背景色与原色混合 + uint32_t original = buffer[pixel_idx]; + buffer[pixel_idx] = BlendColors(original, bg_color); + } + } + + // 记录非空tile的信息 + if (!is_empty) { + SPDLOG_INFO("Non-empty Tile[{},{}] (ID:{}): {} triangles", + tile_x, tile_y, tile_id, tile_triangles[tile_id].size()); + } + } + } + + // 2. 绘制网格线 + // 垂直线 + for (size_t tile_x = 0; tile_x <= tiles_x; tile_x++) { + size_t pixel_x = tile_x * tile_size; + if (pixel_x < static_cast(width_)) { + for (size_t y = 0; y < static_cast(height_); y++) { + buffer[y * static_cast(width_) + pixel_x] = GRID_COLOR; + } + } + } + + // 水平线 + for (size_t tile_y = 0; tile_y <= tiles_y; tile_y++) { + size_t pixel_y = tile_y * tile_size; + if (pixel_y < static_cast(height_)) { + for (size_t x = 0; x < static_cast(width_); x++) { + buffer[pixel_y * static_cast(width_) + x] = GRID_COLOR; + } + } + } + + SPDLOG_INFO("Tile visualization completed - Green:Grid, Yellow:NonEmpty, Blue:Empty"); + SPDLOG_INFO("====================================="); +} + +// 简单的颜色混合函数 (alpha blending) +uint32_t SimpleRenderer::BlendColors(uint32_t base, uint32_t overlay) { + // 提取RGBA通道 (假设是ABGR格式) + uint8_t base_r = (base >> 16) & 0xFF; + uint8_t base_g = (base >> 8) & 0xFF; + uint8_t base_b = base & 0xFF; + + uint8_t overlay_r = (overlay >> 16) & 0xFF; + uint8_t overlay_g = (overlay >> 8) & 0xFF; + uint8_t overlay_b = overlay & 0xFF; + uint8_t overlay_a = (overlay >> 24) & 0xFF; + + // 简单的alpha混合 + float alpha = overlay_a / 255.0f; + uint8_t result_r = (uint8_t)(base_r * (1.0f - alpha) + overlay_r * alpha); + uint8_t result_g = (uint8_t)(base_g * (1.0f - alpha) + overlay_g * alpha); + uint8_t result_b = (uint8_t)(base_b * (1.0f - alpha) + overlay_b * alpha); + + return 0xFF000000 | (result_r << 16) | (result_g << 8) | result_b; +} + +// 传统光栅化管线实现 +SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline( + const Model &model, + const std::vector &processedVertices, + uint32_t *buffer) { + + RenderStats stats; + auto total_start_time = std::chrono::high_resolution_clock::now(); + + // 1. 为每个线程创建framebuffer + auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); + std::vector> depthBuffer_all_thread(kNProc); + std::vector> colorBuffer_all_thread(kNProc); + + for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { + depthBuffer_all_thread[thread_id] = + std::make_unique(width_ * height_); + colorBuffer_all_thread[thread_id] = + std::make_unique(width_ * height_); + + std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_, + std::numeric_limits::infinity()); + std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0); + } + auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); + auto buffer_alloc_duration = std::chrono::duration_cast( + buffer_alloc_end_time - buffer_alloc_start_time); + + // 2. 并行光栅化 + auto raster_start_time = std::chrono::high_resolution_clock::now(); +#pragma omp parallel num_threads(kNProc) default(none) \ + shared(processedVertices, rasterizer_, shader_, width_, height_, \ + depthBuffer_all_thread, colorBuffer_all_thread) \ + firstprivate(model) + { + int thread_id = omp_get_thread_num(); + auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; + auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id]; + +#pragma omp for + for (const auto &f : model.GetFaces()) { + auto v0 = processedVertices[f.GetIndex(0)]; + auto v1 = processedVertices[f.GetIndex(1)]; + auto v2 = processedVertices[f.GetIndex(2)]; + + const Material *material = &f.GetMaterial(); + auto fragments = rasterizer_->Rasterize(v0, v1, v2); + + for (auto &fragment : fragments) { + fragment.material = material; + size_t x = fragment.screen_coord[0]; + size_t y = fragment.screen_coord[1]; + + if (x >= width_ || y >= height_) { + continue; + } + + size_t index = x + y * width_; + if (fragment.depth < depthBuffer_per_thread[index]) { + depthBuffer_per_thread[index] = fragment.depth; + auto color = shader_->FragmentShader(fragment); + colorBuffer_per_thread[index] = uint32_t(color); + } + } + } + } + auto raster_end_time = std::chrono::high_resolution_clock::now(); + auto raster_duration = std::chrono::duration_cast( + raster_end_time - raster_start_time); + + // 3. 合并结果 + auto merge_start_time = std::chrono::high_resolution_clock::now(); + std::unique_ptr depthBuffer = + std::make_unique(width_ * height_); + std::unique_ptr colorBuffer = + std::make_unique(width_ * height_); + + std::fill_n(depthBuffer.get(), width_ * height_, + std::numeric_limits::infinity()); + std::fill_n(colorBuffer.get(), width_ * height_, 0); + +#pragma omp parallel for + for (size_t i = 0; i < width_ * height_; i++) { + float min_depth = std::numeric_limits::infinity(); + uint32_t color = 0; + + for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { + float depth = depthBuffer_all_thread[thread_id][i]; + if (depth < min_depth) { + min_depth = depth; + color = colorBuffer_all_thread[thread_id][i]; + } + } + depthBuffer[i] = min_depth; + colorBuffer[i] = color; + } + + std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); + auto merge_end_time = std::chrono::high_resolution_clock::now(); + auto merge_duration = std::chrono::duration_cast( + merge_end_time - merge_start_time); + + auto total_end_time = std::chrono::high_resolution_clock::now(); + auto total_duration = std::chrono::duration_cast( + total_end_time - total_start_time); + + // 填充统计信息 + stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; + stats.rasterization_ms = raster_duration.count() / 1000.0; + stats.merge_ms = merge_duration.count() / 1000.0; + stats.total_ms = total_duration.count() / 1000.0; + + return stats; +} + +// Tile-based光栅化管线实现 +SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( + const Model &model, + const std::vector &processedVertices, + uint32_t *buffer) { + + TileRenderStats stats; + auto total_start_time = std::chrono::high_resolution_clock::now(); + + // 1. Setup阶段 + auto setup_start_time = std::chrono::high_resolution_clock::now(); + const size_t TILE_SIZE = 64; // 64x64 pixels per tile + const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE; + const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE; + const size_t total_tiles = tiles_x * tiles_y; + + // 为每个tile创建三角形列表 + std::vector> tile_triangles(total_tiles); + auto setup_end_time = std::chrono::high_resolution_clock::now(); + auto setup_duration = std::chrono::duration_cast( + setup_end_time - setup_start_time); + + // 2. Triangle-Tile binning阶段 + auto binning_start_time = std::chrono::high_resolution_clock::now(); + TriangleTileBinning(model, processedVertices, tile_triangles, tiles_x, tiles_y, TILE_SIZE); + auto binning_end_time = std::chrono::high_resolution_clock::now(); + auto binning_duration = std::chrono::duration_cast( + binning_end_time - binning_start_time); + + // 3. 为每个线程创建framebuffer + auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); + std::vector> depthBuffer_all_thread(kNProc); + std::vector> colorBuffer_all_thread(kNProc); + + for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { + depthBuffer_all_thread[thread_id] = + std::make_unique(width_ * height_); + colorBuffer_all_thread[thread_id] = + std::make_unique(width_ * height_); + + std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_, + std::numeric_limits::infinity()); + std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0); + } + auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); + auto buffer_alloc_duration = std::chrono::duration_cast( + buffer_alloc_end_time - buffer_alloc_start_time); + + // 4. 并行处理每个tile + auto rasterization_start_time = std::chrono::high_resolution_clock::now(); +#pragma omp parallel num_threads(kNProc) default(none) \ + shared(tile_triangles, rasterizer_, shader_, width_, height_, \ + depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles) + { + int thread_id = omp_get_thread_num(); + auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; + auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id]; + + // 为当前线程创建tile局部缓冲区 + std::unique_ptr tile_depth_buffer = + std::make_unique(TILE_SIZE * TILE_SIZE); + std::unique_ptr tile_color_buffer = + std::make_unique(TILE_SIZE * TILE_SIZE); + +#pragma omp for + for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) { + // 按照tile进行光栅化 + RasterizeTile(tile_id, tile_triangles[tile_id], + tiles_x, tiles_y, TILE_SIZE, + tile_depth_buffer.get(), tile_color_buffer.get(), + depthBuffer_per_thread, colorBuffer_per_thread); + } + } + auto rasterization_end_time = std::chrono::high_resolution_clock::now(); + auto rasterization_duration = std::chrono::duration_cast( + rasterization_end_time - rasterization_start_time); + + // 5. 合并所有线程结果 + auto merge_start_time = std::chrono::high_resolution_clock::now(); + std::unique_ptr depthBuffer = + std::make_unique(width_ * height_); + std::unique_ptr colorBuffer = + std::make_unique(width_ * height_); + + std::fill_n(depthBuffer.get(), width_ * height_, + std::numeric_limits::infinity()); + std::fill_n(colorBuffer.get(), width_ * height_, 0); + +#pragma omp parallel for + for (size_t i = 0; i < width_ * height_; i++) { + float min_depth = std::numeric_limits::infinity(); + uint32_t color = 0; + + for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { + float depth = depthBuffer_all_thread[thread_id][i]; + if (depth < min_depth) { + min_depth = depth; + color = colorBuffer_all_thread[thread_id][i]; + } + } + depthBuffer[i] = min_depth; + colorBuffer[i] = color; + } + + std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); + auto merge_end_time = std::chrono::high_resolution_clock::now(); + auto merge_duration = std::chrono::duration_cast( + merge_end_time - merge_start_time); + + // 6. Tile可视化调试 + auto visualization_start_time = std::chrono::high_resolution_clock::now(); + DrawTileVisualization(buffer, tile_triangles, tiles_x, tiles_y, TILE_SIZE); + auto visualization_end_time = std::chrono::high_resolution_clock::now(); + auto visualization_duration = std::chrono::duration_cast( + visualization_end_time - visualization_start_time); + + auto total_end_time = std::chrono::high_resolution_clock::now(); + auto total_duration = std::chrono::duration_cast( + total_end_time - total_start_time); + + // 填充统计信息 + stats.setup_ms = setup_duration.count() / 1000.0; + stats.binning_ms = binning_duration.count() / 1000.0; + stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; + stats.rasterization_ms = rasterization_duration.count() / 1000.0; + stats.merge_ms = merge_duration.count() / 1000.0; + stats.visualization_ms = visualization_duration.count() / 1000.0; + stats.total_ms = total_duration.count() / 1000.0; + + return stats; +} + } // namespace simple_renderer diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp index a844aa7..58c70ad 100755 --- a/test/system_test/main.cpp +++ b/test/system_test/main.cpp @@ -56,6 +56,7 @@ int main(int argc, char **argv) { } auto modelMatrix = simple_renderer::Matrix4f(1.0f); + simple_renderer::Matrix4f scale_matrix = glm::scale(simple_renderer::Matrix4f(1.0f), simple_renderer::Vector3f(.02f, .02f, .02f)); @@ -68,8 +69,7 @@ int main(int argc, char **argv) { glm::rotate(simple_renderer::Matrix4f(1.0f), glm::radians(-105.0f), simple_renderer::Vector3f(1.0f, 0.0f, 0.0f)); - // Combined transformation matrix - modelMatrix = scale_matrix * translation_matrix * rotation_matrix; + modelMatrix = scale_matrix* translation_matrix * rotation_matrix ; simple_renderer::Shader shader; shader.SetUniform("modelMatrix", modelMatrix); @@ -80,6 +80,28 @@ int main(int argc, char **argv) { simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f)); + // 设置渲染模式(可选:TRADITIONAL、TILE_BASED 或 DEFERRED) + simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED); + + // 输出当前渲染模式 + std::string current_mode_name; + switch(simple_renderer.GetRenderingMode()) { + case simple_renderer::RenderingMode::TRADITIONAL: + current_mode_name = "TRADITIONAL (传统光栅化)"; + break; + case simple_renderer::RenderingMode::TILE_BASED: + current_mode_name = "TILE_BASED (基于Tile光栅化)"; + break; + case simple_renderer::RenderingMode::DEFERRED: + current_mode_name = "DEFERRED (模仿GPU的延迟渲染)"; + break; + } + SPDLOG_INFO("当前渲染模式: {}", current_mode_name); + + // 可以在这里添加模式切换的示例: + // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED); + // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED); + auto display = Display(kWidth, kHeight); display.loopBegin(); @@ -89,11 +111,11 @@ int main(int argc, char **argv) { shader.SetUniform("cameraPos", camera.GetPosition()); shader.SetUniform("viewMatrix", camera.GetViewMatrix()); shader.SetUniform("projectionMatrix", - camera.GetProjectionMatrix(60.0f, float(kWidth)/float(kHeight), 0.1f, 100.0f)); + camera.GetProjectionMatrix(60.0f, static_cast(kWidth) / static_cast(kHeight), 0.1f, 100.0f)); buffer.ClearDrawBuffer(simple_renderer::Color::kBlack); for (auto &model : models) { - simple_renderer.Render(model, shader, buffer.GetDrawBuffer()); + simple_renderer.DrawModel(model, shader, buffer.GetDrawBuffer()); } buffer.SwapBuffer(); From 8d58a84c4d5222091d276322ee1343019dbb860e Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Fri, 29 Aug 2025 15:53:55 +0800 Subject: [PATCH 04/24] Add Performance Profiling for Deffered Pipeline. Remove detailed debug code for TBR. Signed-off-by: ZhouFANG --- src/include/vertex.hpp | 18 +++++- src/renderer.cpp | 116 ++++++++++++++++++++++++++++++++------ src/shader.cpp | 10 +++- test/system_test/main.cpp | 6 +- 4 files changed, 125 insertions(+), 25 deletions(-) diff --git a/src/include/vertex.hpp b/src/include/vertex.hpp index 975abd0..bff0680 100644 --- a/src/include/vertex.hpp +++ b/src/include/vertex.hpp @@ -34,7 +34,15 @@ class Vertex { // Constructor with parameters 带参数的构造函数 explicit Vertex(const Vector4f& pos, const Vector3f& norm, const Vector2f& tex, const Color& color_) - : position_(pos), normal_(norm), texCoords_(tex), color_(color_) {} + : position_(pos), normal_(norm), texCoords_(tex), color_(color_), + clip_position_(pos), has_clip_position_(false) {} + + // 扩展构造函数:包含裁剪空间坐标 + explicit Vertex(const Vector4f& pos, const Vector3f& norm, + const Vector2f& tex, const Color& color_, + const Vector4f& clip_pos) + : position_(pos), normal_(norm), texCoords_(tex), color_(color_), + clip_position_(clip_pos), has_clip_position_(true) {} // Transform the vertex with a matrix 使用矩阵变换顶点 void transform(const Matrix4f& matrix) { position_ = matrix * position_; } @@ -45,12 +53,20 @@ class Vertex { [[nodiscard]] inline Vector3f GetNormal() const { return normal_; } [[nodiscard]] inline Vector2f GetTexCoords() const { return texCoords_; } [[nodiscard]] inline Color GetColor() const { return color_; } + + // 扩展坐标访问 + [[nodiscard]] inline Vector4f GetClipPosition() const { return clip_position_; } + [[nodiscard]] inline bool HasClipPosition() const { return has_clip_position_; } private: Vector4f position_; // 3D position, 3D顶点坐标 Vector3f normal_; // Normal vector, 顶点法向量 Vector2f texCoords_; // Texture coordinates, 顶点纹理坐标 Color color_; + + // 扩展坐标用于裁剪优化 + Vector4f clip_position_; // 裁剪空间坐标 (用于视锥体裁剪) + bool has_clip_position_; // 是否包含裁剪坐标 }; inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) { diff --git a/src/renderer.cpp b/src/renderer.cpp index 34866c2..a7bc226 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -37,7 +37,7 @@ SimpleRenderer::SimpleRenderer(size_t width, size_t height) : height_(height), width_(width), log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)), - current_mode_(RenderingMode::TRADITIONAL) { // 默认使用传统渲染模式 + current_mode_(RenderingMode::TILE_BASED) { rasterizer_ = std::make_shared(width, height); } @@ -207,10 +207,12 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( uint32_t *buffer) { DeferredRenderStats stats; + auto total_start_time = std::chrono::high_resolution_clock::now(); SPDLOG_INFO("execute deferred pipeline for {}", model.GetModelPath()); /* * * * * * * */ - /* * * Rasterization * * */ + /* * * Buffer Allocation * * */ + auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); std::vector>> fragmentsBuffer_all_thread( kNProc, std::vector>(width_ * height_)); @@ -220,8 +222,13 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( for (const auto &f : model.GetFaces()) { material_cache.push_back(f.GetMaterial()); // 值拷贝 } + auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); + auto buffer_alloc_duration = std::chrono::duration_cast( + buffer_alloc_end_time - buffer_alloc_start_time); SPDLOG_INFO("cached {} materials for deferred rendering", material_cache.size()); + /* * * Rasterization * * */ + auto rasterization_start_time = std::chrono::high_resolution_clock::now(); #pragma omp parallel num_threads(kNProc) default(none) \ shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \ height_, material_cache) firstprivate(model) @@ -255,8 +262,13 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( } } } + auto rasterization_end_time = std::chrono::high_resolution_clock::now(); + auto rasterization_duration = std::chrono::duration_cast( + rasterization_end_time - rasterization_start_time); + /* * * * * * * */ - // Merge fragments + /* * * Fragment Collection * * */ + auto fragment_collection_start_time = std::chrono::high_resolution_clock::now(); std::vector> fragmentsBuffer(width_ * height_); for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) { for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) { @@ -265,10 +277,17 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( fragmentsBuffer_per_thread[i].end()); } } -/* * * * * * * */ + auto fragment_collection_end_time = std::chrono::high_resolution_clock::now(); + auto fragment_collection_duration = std::chrono::duration_cast( + fragment_collection_end_time - fragment_collection_start_time); + /* * * * * * * */ -/* * * Fragment Shader * * */ -// #pragma omp parallel for + /* * * Fragment Merge & Deferred Shading * * */ + auto fragment_merge_start_time = std::chrono::high_resolution_clock::now(); + + // Fragment Merge阶段:深度测试选择最近片段 + std::vector selected_fragments(width_ * height_, nullptr); + #pragma omp parallel for for (size_t i = 0; i < fragmentsBuffer.size(); i++) { const auto &fragments = fragmentsBuffer[i]; if (fragments.empty()) { @@ -281,7 +300,17 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( renderFragment = &fragment; } } - + selected_fragments[i] = renderFragment; + } + auto fragment_merge_end_time = std::chrono::high_resolution_clock::now(); + auto fragment_merge_duration = std::chrono::duration_cast( + fragment_merge_end_time - fragment_merge_start_time); + + // Deferred Shading阶段:执行片段着色器 + auto deferred_shading_start_time = std::chrono::high_resolution_clock::now(); +#pragma omp parallel for + for (size_t i = 0; i < selected_fragments.size(); i++) { + const Fragment *renderFragment = selected_fragments[i]; if (renderFragment) { // 添加Material指针有效性检查 if (renderFragment->material == nullptr) { @@ -292,15 +321,22 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( buffer[i] = uint32_t(color); } } + auto deferred_shading_end_time = std::chrono::high_resolution_clock::now(); + auto deferred_shading_duration = std::chrono::duration_cast( + deferred_shading_end_time - deferred_shading_start_time); /* * * * * * * */ - // 填充基本统计信息(延迟渲染模式主要用于教学演示) - stats.buffer_alloc_ms = 0.0; - stats.rasterization_ms = 0.0; - stats.fragment_collection_ms = 0.0; - stats.fragment_merge_ms = 0.0; - stats.deferred_shading_ms = 0.0; - stats.total_ms = 0.0; + auto total_end_time = std::chrono::high_resolution_clock::now(); + auto total_duration = std::chrono::duration_cast( + total_end_time - total_start_time); + + // 填充统计信息 + stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; + stats.rasterization_ms = rasterization_duration.count() / 1000.0; + stats.fragment_collection_ms = fragment_collection_duration.count() / 1000.0; + stats.fragment_merge_ms = fragment_merge_duration.count() / 1000.0; + stats.deferred_shading_ms = deferred_shading_duration.count() / 1000.0; + stats.total_ms = total_duration.count() / 1000.0; return stats; } @@ -329,8 +365,12 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { // 这些坐标在后续的视口变换和裁剪阶段会被正确处理 ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f); - // 创建新的顶点,保持其他属性不变 - return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); + // 创建新的顶点,保持其他属性和裁剪空间坐标不变 + if (vertex.HasClipPosition()) { + return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition()); + } else { + return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); + } } Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) { @@ -365,6 +405,10 @@ void SimpleRenderer::TriangleTileBinning( size_t clipped_triangles = 0; size_t triangles_with_clipped_vertices = 0; + // 裁剪统计 + size_t frustum_culled = 0; + size_t backface_culled = 0; + SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles); SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, height_, tile_size, tiles_x, tiles_y); @@ -375,11 +419,49 @@ void SimpleRenderer::TriangleTileBinning( auto v1 = screenVertices[f.GetIndex(1)]; auto v2 = screenVertices[f.GetIndex(2)]; + // 视锥体裁剪 (裁剪空间) + if (v0.HasClipPosition()) { + Vector4f c0 = v0.GetClipPosition(); + Vector4f c1 = v1.GetClipPosition(); + Vector4f c2 = v2.GetClipPosition(); + + // 保守视锥体裁剪:只有当整个三角形都在视锥体外同一侧时才裁剪 + bool frustum_cull = + (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || // 右平面外 + (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) || // 左平面外 + (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) || // 上平面外 + (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) || // 下平面外 + (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || // 远平面外 + (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w); // 近平面外 + + // if (frustum_cull) { + // frustum_culled++; + // continue; + // } + } + // 获取屏幕空间坐标(现在已经是屏幕坐标了) Vector4f pos0 = v0.GetPosition(); Vector4f pos1 = v1.GetPosition(); Vector4f pos2 = v2.GetPosition(); + // 背面剔除 (屏幕空间) + Vector2f screen0(pos0.x, pos0.y); + Vector2f screen1(pos1.x, pos1.y); + Vector2f screen2(pos2.x, pos2.y); + + // 计算屏幕空间叉积判断朝向 + Vector2f edge1 = screen1 - screen0; + Vector2f edge2 = screen2 - screen0; + float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; + + // 背面剔除:NDC空间中叉积为负表示顺时针,即背面。 + // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 + if (cross_product > 0.0f) { + backface_culled++; + continue; + } + // 检查三角形是否有被裁剪的顶点(坐标为-1000的表示被裁剪) bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f); @@ -465,6 +547,8 @@ void SimpleRenderer::TriangleTileBinning( SPDLOG_INFO(" Triangles with clipped vertices: {}", triangles_with_clipped_vertices); SPDLOG_INFO(" Processed triangles: {}", processed_triangles); SPDLOG_INFO(" Clipped by screen bounds: {}", clipped_triangles); + SPDLOG_INFO(" Frustum culled: {}", frustum_culled); + SPDLOG_INFO(" Backface culled: {}", backface_culled); size_t total_triangle_refs = 0; size_t non_empty_tiles = 0; diff --git a/src/shader.cpp b/src/shader.cpp index 087cca5..7b8eeae 100644 --- a/src/shader.cpp +++ b/src/shader.cpp @@ -15,11 +15,15 @@ Vertex Shader::VertexShader(const Vertex& vertex) { sharedDataInShader_.fragPos_varying = Vector3f(model_matrix * vertex.GetPosition()); - // 返回变换后的顶点(包含变换后的法向量) - return Vertex(mvp_matrix * vertex.GetPosition(), + // 计算裁剪空间坐标 + Vector4f clip_position = mvp_matrix * vertex.GetPosition(); + + // 返回变换后的顶点(包含变换后的法向量和裁剪坐标) + return Vertex(clip_position, transformed_normal, vertex.GetTexCoords(), - vertex.GetColor()); + vertex.GetColor(), + clip_position); // 同时保存裁剪空间坐标用于后续裁剪 } Color Shader::FragmentShader(const Fragment& fragment) const { diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp index 58c70ad..0f222b5 100755 --- a/test/system_test/main.cpp +++ b/test/system_test/main.cpp @@ -81,7 +81,7 @@ int main(int argc, char **argv) { simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f)); // 设置渲染模式(可选:TRADITIONAL、TILE_BASED 或 DEFERRED) - simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED); + simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED); // 输出当前渲染模式 std::string current_mode_name; @@ -97,10 +97,6 @@ int main(int argc, char **argv) { break; } SPDLOG_INFO("当前渲染模式: {}", current_mode_name); - - // 可以在这里添加模式切换的示例: - // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED); - // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED); auto display = Display(kWidth, kHeight); display.loopBegin(); From d0ddf62ae6d02126d43438b06f1a52525710eebd Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Fri, 29 Aug 2025 20:14:21 +0800 Subject: [PATCH 05/24] Expand the Vertex data structure, implement frustum culling and backface culling for TBR. Signed-off-by: ZhouFANG --- src/renderer.cpp | 133 +++-------------------------------------------- 1 file changed, 6 insertions(+), 127 deletions(-) diff --git a/src/renderer.cpp b/src/renderer.cpp index a7bc226..ae53457 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -97,10 +97,7 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { return; } - // === PERFORMANCE TIMING === - auto total_start_time = std::chrono::high_resolution_clock::now(); - - /* * * Vertex Shader * * */ + /* * * Vertex Transformation * * */ auto vertex_shader_start_time = std::chrono::high_resolution_clock::now(); std::vector processedVertices; std::vector> processed_vertices_all_thread(kNProc); @@ -402,7 +399,6 @@ void SimpleRenderer::TriangleTileBinning( size_t total_triangles = model.GetFaces().size(); size_t processed_triangles = 0; - size_t clipped_triangles = 0; size_t triangles_with_clipped_vertices = 0; // 裁剪统计 @@ -434,10 +430,10 @@ void SimpleRenderer::TriangleTileBinning( (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || // 远平面外 (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w); // 近平面外 - // if (frustum_cull) { - // frustum_culled++; - // continue; - // } + if (frustum_cull) { + frustum_culled++; + continue; + } } // 获取屏幕空间坐标(现在已经是屏幕坐标了) @@ -445,12 +441,10 @@ void SimpleRenderer::TriangleTileBinning( Vector4f pos1 = v1.GetPosition(); Vector4f pos2 = v2.GetPosition(); - // 背面剔除 (屏幕空间) + // 计算屏幕空间叉积判断朝向 Vector2f screen0(pos0.x, pos0.y); Vector2f screen1(pos1.x, pos1.y); Vector2f screen2(pos2.x, pos2.y); - - // 计算屏幕空间叉积判断朝向 Vector2f edge1 = screen1 - screen0; Vector2f edge2 = screen2 - screen0; float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; @@ -489,24 +483,6 @@ void SimpleRenderer::TriangleTileBinning( float min_y = std::min({screen_y0, screen_y1, screen_y2}); float max_y = std::max({screen_y0, screen_y1, screen_y2}); - // 调试前几个有效三角形的坐标范围 - if (processed_triangles < 3) { - SPDLOG_INFO("Triangle {} coordinates:", tri_idx); - SPDLOG_INFO(" Screen coords: ({:.1f},{:.1f}) ({:.1f},{:.1f}) ({:.1f},{:.1f})", - screen_x0, screen_y0, screen_x1, screen_y1, screen_x2, screen_y2); - SPDLOG_INFO(" BBox: min({:.1f},{:.1f}) max({:.1f},{:.1f})", - min_x, min_y, max_x, max_y); - } - - // 临时:大幅放宽屏幕边界检查,让超出屏幕的三角形也能处理 - if (max_x < -5000.0f || min_x >= width_ + 5000.0f || - max_y < -5000.0f || min_y >= height_ + 5000.0f) { - clipped_triangles++; - if (processed_triangles < 3) { - SPDLOG_INFO(" -> CLIPPED by screen bounds"); - } - continue; - } // 计算影响的tile范围 int start_tile_x = std::max(0, static_cast(min_x) / static_cast(tile_size)); @@ -528,28 +504,9 @@ void SimpleRenderer::TriangleTileBinning( } processed_triangles++; - // 输出前几个成功添加的三角形信息 - if (processed_triangles <= 3) { - SPDLOG_INFO(" -> SUCCESSFULLY ADDED to tiles x[{}..{}] y[{}..{}]", - start_tile_x, end_tile_x, start_tile_y, end_tile_y); - } - } else { - if (processed_triangles < 3) { - SPDLOG_INFO(" -> FAILED tile calculation: x[{}..{}] y[{}..{}]", - start_tile_x, end_tile_x, start_tile_y, end_tile_y); - } } } - // 输出统计信息 - SPDLOG_INFO("Triangle-Tile binning completed:"); - SPDLOG_INFO(" Total triangles: {}", total_triangles); - SPDLOG_INFO(" Triangles with clipped vertices: {}", triangles_with_clipped_vertices); - SPDLOG_INFO(" Processed triangles: {}", processed_triangles); - SPDLOG_INFO(" Clipped by screen bounds: {}", clipped_triangles); - SPDLOG_INFO(" Frustum culled: {}", frustum_culled); - SPDLOG_INFO(" Backface culled: {}", backface_culled); - size_t total_triangle_refs = 0; size_t non_empty_tiles = 0; for (const auto& tile : tile_triangles) { @@ -629,76 +586,6 @@ void SimpleRenderer::RasterizeTile( } } -// Tile可视化调试函数,这里用于固定大小的tiles -void SimpleRenderer::DrawTileVisualization(uint32_t* buffer, - const std::vector>& tile_triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size) { - - SPDLOG_INFO("=== TILE VISUALIZATION DEBUG ==="); - SPDLOG_INFO("Drawing tile grid overlay for debugging"); - - // 颜色定义 (ABGR格式) - const uint32_t GRID_COLOR = 0xFF00FF00; // 绿色网格线 - const uint32_t NONEMPTY_COLOR = 0x4000FFFF; // 半透明黄色背景 (非空tile) - const uint32_t EMPTY_COLOR = 0x20FF0000; // 半透明蓝色背景 (空tile) - - // 1. 为非空tiles添加背景色 - for (size_t tile_y = 0; tile_y < tiles_y; tile_y++) { - for (size_t tile_x = 0; tile_x < tiles_x; tile_x++) { - size_t tile_id = tile_y * tiles_x + tile_x; - bool is_empty = tile_triangles[tile_id].empty(); - - // 计算tile在屏幕上的像素范围 - size_t pixel_start_x = tile_x * tile_size; - size_t pixel_end_x = std::min(pixel_start_x + tile_size, static_cast(width_)); - size_t pixel_start_y = tile_y * tile_size; - size_t pixel_end_y = std::min(pixel_start_y + tile_size, static_cast(height_)); - - uint32_t bg_color = is_empty ? EMPTY_COLOR : NONEMPTY_COLOR; - - // 给tile添加半透明背景 - for (size_t y = pixel_start_y; y < pixel_end_y; y++) { - for (size_t x = pixel_start_x; x < pixel_end_x; x++) { - size_t pixel_idx = y * static_cast(width_) + x; - // 简单的alpha混合:将背景色与原色混合 - uint32_t original = buffer[pixel_idx]; - buffer[pixel_idx] = BlendColors(original, bg_color); - } - } - - // 记录非空tile的信息 - if (!is_empty) { - SPDLOG_INFO("Non-empty Tile[{},{}] (ID:{}): {} triangles", - tile_x, tile_y, tile_id, tile_triangles[tile_id].size()); - } - } - } - - // 2. 绘制网格线 - // 垂直线 - for (size_t tile_x = 0; tile_x <= tiles_x; tile_x++) { - size_t pixel_x = tile_x * tile_size; - if (pixel_x < static_cast(width_)) { - for (size_t y = 0; y < static_cast(height_); y++) { - buffer[y * static_cast(width_) + pixel_x] = GRID_COLOR; - } - } - } - - // 水平线 - for (size_t tile_y = 0; tile_y <= tiles_y; tile_y++) { - size_t pixel_y = tile_y * tile_size; - if (pixel_y < static_cast(height_)) { - for (size_t x = 0; x < static_cast(width_); x++) { - buffer[pixel_y * static_cast(width_) + x] = GRID_COLOR; - } - } - } - - SPDLOG_INFO("Tile visualization completed - Green:Grid, Yellow:NonEmpty, Blue:Empty"); - SPDLOG_INFO("====================================="); -} - // 简单的颜色混合函数 (alpha blending) uint32_t SimpleRenderer::BlendColors(uint32_t base, uint32_t overlay) { // 提取RGBA通道 (假设是ABGR格式) @@ -944,13 +831,6 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( auto merge_duration = std::chrono::duration_cast( merge_end_time - merge_start_time); - // 6. Tile可视化调试 - auto visualization_start_time = std::chrono::high_resolution_clock::now(); - DrawTileVisualization(buffer, tile_triangles, tiles_x, tiles_y, TILE_SIZE); - auto visualization_end_time = std::chrono::high_resolution_clock::now(); - auto visualization_duration = std::chrono::duration_cast( - visualization_end_time - visualization_start_time); - auto total_end_time = std::chrono::high_resolution_clock::now(); auto total_duration = std::chrono::duration_cast( total_end_time - total_start_time); @@ -961,7 +841,6 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; stats.rasterization_ms = rasterization_duration.count() / 1000.0; stats.merge_ms = merge_duration.count() / 1000.0; - stats.visualization_ms = visualization_duration.count() / 1000.0; stats.total_ms = total_duration.count() / 1000.0; return stats; From a4021cdf8a3bfceb08140f896946563110856399 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Sat, 30 Aug 2025 17:27:31 +0800 Subject: [PATCH 06/24] Fix rendering consistency between TRADITIONAL and TILE_BASED modes 1. Add backface culling to TRADITIONAL pipeline to match TILE_BASED behavior 2. Fix depth buffer initialization from infinity to 1.0f for standard range Signed-off-by: ZhouFANG --- src/renderer.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/renderer.cpp b/src/renderer.cpp index ae53457..e4eb01d 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -401,10 +401,6 @@ void SimpleRenderer::TriangleTileBinning( size_t processed_triangles = 0; size_t triangles_with_clipped_vertices = 0; - // 裁剪统计 - size_t frustum_culled = 0; - size_t backface_culled = 0; - SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles); SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, height_, tile_size, tiles_x, tiles_y); @@ -431,7 +427,6 @@ void SimpleRenderer::TriangleTileBinning( (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w); // 近平面外 if (frustum_cull) { - frustum_culled++; continue; } } @@ -452,7 +447,6 @@ void SimpleRenderer::TriangleTileBinning( // 背面剔除:NDC空间中叉积为负表示顺时针,即背面。 // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 if (cross_product > 0.0f) { - backface_culled++; continue; } @@ -540,7 +534,7 @@ void SimpleRenderer::RasterizeTile( size_t tile_width = screen_x_end - screen_x_start; size_t tile_height = screen_y_end - screen_y_start; std::fill_n(tile_depth_buffer, tile_width * tile_height, - std::numeric_limits::infinity()); + 1.0f); // 初始化为最远深度(标准深度缓冲范围[0,1]) std::fill_n(tile_color_buffer, tile_width * tile_height, 0); // 在tile内光栅化所有三角形 @@ -652,6 +646,22 @@ SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline( auto v1 = processedVertices[f.GetIndex(1)]; auto v2 = processedVertices[f.GetIndex(2)]; + // 获取屏幕空间坐标 + Vector2f screen0(v0.GetPosition().x, v0.GetPosition().y); + Vector2f screen1(v1.GetPosition().x, v1.GetPosition().y); + Vector2f screen2(v2.GetPosition().x, v2.GetPosition().y); + + // 计算屏幕空间叉积判断朝向 + Vector2f edge1 = screen1 - screen0; + Vector2f edge2 = screen2 - screen0; + float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; + + // 背面剔除:NDC空间中叉积为负表示顺时针,即背面。 + // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 + if (cross_product > 0.0f) { + continue; + } + const Material *material = &f.GetMaterial(); auto fragments = rasterizer_->Rasterize(v0, v1, v2); From 70e1581f9daa326c6cb0fd3819718b19797af090 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Sat, 30 Aug 2025 17:27:53 +0800 Subject: [PATCH 07/24] add debug mode Signed-off-by: ZhouFANG --- test/system_test/main.cpp | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp index 0f222b5..383ff83 100755 --- a/test/system_test/main.cpp +++ b/test/system_test/main.cpp @@ -16,10 +16,12 @@ #include +#include #include #include #include #include +#include #include #include "buffer.hpp" @@ -81,7 +83,7 @@ int main(int argc, char **argv) { simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f)); // 设置渲染模式(可选:TRADITIONAL、TILE_BASED 或 DEFERRED) - simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED); + simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TRADITIONAL); // 输出当前渲染模式 std::string current_mode_name; @@ -101,9 +103,11 @@ int main(int argc, char **argv) { auto display = Display(kWidth, kHeight); display.loopBegin(); - while (!display.loopShouldClose()) { - display.handleEvents(camera); - + // 调试模式:固定相机状态,只渲染一帧 + bool debug_mode = true; + + if (debug_mode) { + // 固定相机参数进行调试 shader.SetUniform("cameraPos", camera.GetPosition()); shader.SetUniform("viewMatrix", camera.GetViewMatrix()); shader.SetUniform("projectionMatrix", @@ -115,8 +119,29 @@ int main(int argc, char **argv) { } buffer.SwapBuffer(); - display.fill(buffer.GetDisplayBuffer()); + + // 调试模式下等待几秒让我们看到结果 + std::this_thread::sleep_for(std::chrono::seconds(3)); + } else { + // 正常渲染循环 + while (!display.loopShouldClose()) { + display.handleEvents(camera); + + shader.SetUniform("cameraPos", camera.GetPosition()); + shader.SetUniform("viewMatrix", camera.GetViewMatrix()); + shader.SetUniform("projectionMatrix", + camera.GetProjectionMatrix(60.0f, static_cast(kWidth) / static_cast(kHeight), 0.1f, 100.0f)); + + buffer.ClearDrawBuffer(simple_renderer::Color::kBlack); + for (auto &model : models) { + simple_renderer.DrawModel(model, shader, buffer.GetDrawBuffer()); + } + + buffer.SwapBuffer(); + + display.fill(buffer.GetDisplayBuffer()); + } } return 0; From 45386674a7e34cc5b090751a652abe8d99387d2c Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Tue, 2 Sep 2025 23:44:02 +0800 Subject: [PATCH 08/24] Revert "add debug mode", which is not necessary for rendering tests. This reverts commit 70e1581f9daa326c6cb0fd3819718b19797af090. Signed-off-by: ZhouFANG --- test/system_test/main.cpp | 35 +++++------------------------------ 1 file changed, 5 insertions(+), 30 deletions(-) diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp index 383ff83..0f222b5 100755 --- a/test/system_test/main.cpp +++ b/test/system_test/main.cpp @@ -16,12 +16,10 @@ #include -#include #include #include #include #include -#include #include #include "buffer.hpp" @@ -83,7 +81,7 @@ int main(int argc, char **argv) { simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f)); // 设置渲染模式(可选:TRADITIONAL、TILE_BASED 或 DEFERRED) - simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TRADITIONAL); + simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED); // 输出当前渲染模式 std::string current_mode_name; @@ -103,11 +101,9 @@ int main(int argc, char **argv) { auto display = Display(kWidth, kHeight); display.loopBegin(); - // 调试模式:固定相机状态,只渲染一帧 - bool debug_mode = true; - - if (debug_mode) { - // 固定相机参数进行调试 + while (!display.loopShouldClose()) { + display.handleEvents(camera); + shader.SetUniform("cameraPos", camera.GetPosition()); shader.SetUniform("viewMatrix", camera.GetViewMatrix()); shader.SetUniform("projectionMatrix", @@ -119,29 +115,8 @@ int main(int argc, char **argv) { } buffer.SwapBuffer(); + display.fill(buffer.GetDisplayBuffer()); - - // 调试模式下等待几秒让我们看到结果 - std::this_thread::sleep_for(std::chrono::seconds(3)); - } else { - // 正常渲染循环 - while (!display.loopShouldClose()) { - display.handleEvents(camera); - - shader.SetUniform("cameraPos", camera.GetPosition()); - shader.SetUniform("viewMatrix", camera.GetViewMatrix()); - shader.SetUniform("projectionMatrix", - camera.GetProjectionMatrix(60.0f, static_cast(kWidth) / static_cast(kHeight), 0.1f, 100.0f)); - - buffer.ClearDrawBuffer(simple_renderer::Color::kBlack); - for (auto &model : models) { - simple_renderer.DrawModel(model, shader, buffer.GetDrawBuffer()); - } - - buffer.SwapBuffer(); - - display.fill(buffer.GetDisplayBuffer()); - } } return 0; From b57d9077900eddfdfe9844df1c08b2875d3f5b45 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Thu, 4 Sep 2025 22:16:12 +0800 Subject: [PATCH 09/24] Add Early-Z to TBR. Remove obsolete functions previously used for TBR debugging. Signed-off-by: ZhouFANG --- src/include/renderer.h | 25 ++++------------------ src/renderer.cpp | 48 +++++++++++++++++------------------------- 2 files changed, 23 insertions(+), 50 deletions(-) diff --git a/src/include/renderer.h b/src/include/renderer.h index 2464c19..38f9dd0 100755 --- a/src/include/renderer.h +++ b/src/include/renderer.h @@ -100,6 +100,7 @@ class SimpleRenderer { const size_t width_; LogSystem log_system_; RenderingMode current_mode_; // 当前渲染模式 + bool early_z_enabled_; // Early-Z优化开关 std::shared_ptr shader_; std::shared_ptr rasterizer_; @@ -143,7 +144,6 @@ class SimpleRenderer { double buffer_alloc_ms; double rasterization_ms; double merge_ms; - double visualization_ms; double total_ms; }; @@ -189,7 +189,8 @@ class SimpleRenderer { size_t tiles_x, size_t tiles_y, size_t tile_size, float* tile_depth_buffer, uint32_t* tile_color_buffer, std::unique_ptr &global_depth_buffer, - std::unique_ptr &global_color_buffer); + std::unique_ptr &global_color_buffer, + bool use_early_z = false); /** @@ -205,25 +206,7 @@ class SimpleRenderer { * @return 转换后的顶点(屏幕坐标) */ Vertex ViewportTransformation(const Vertex &vertex); - /** - * Tile可视化调试函数 - 在渲染结果上绘制tile网格和状态 - * @param buffer 渲染结果缓冲区 - * @param tile_triangles 每个tile包含的三角形列表 - * @param tiles_x X方向tile数量 - * @param tiles_y Y方向tile数量 - * @param tile_size 单个tile的像素大小 - */ - void DrawTileVisualization(uint32_t* buffer, - const std::vector>& tile_triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size); - - /** - * 颜色混合函数 - 用于半透明效果 - * @param base 基础颜色 - * @param overlay 叠加颜色(包含alpha通道) - * @return 混合后的颜色 - */ - uint32_t BlendColors(uint32_t base, uint32_t overlay); + }; } // namespace simple_renderer diff --git a/src/renderer.cpp b/src/renderer.cpp index e4eb01d..faabc58 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -37,7 +37,8 @@ SimpleRenderer::SimpleRenderer(size_t width, size_t height) : height_(height), width_(width), log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)), - current_mode_(RenderingMode::TILE_BASED) { + current_mode_(RenderingMode::TILE_BASED), + early_z_enabled_(true) { rasterizer_ = std::make_shared(width, height); } @@ -162,7 +163,6 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); - SPDLOG_INFO("Visualization: {:8.3f} ms ({:5.1f}%)", stats.visualization_ms, stats.visualization_ms/total_ms*100); SPDLOG_INFO("Total: {:8.3f} ms", total_ms); SPDLOG_INFO("=========================================="); break; @@ -521,7 +521,8 @@ void SimpleRenderer::RasterizeTile( size_t tiles_x, size_t tiles_y, size_t tile_size, float* tile_depth_buffer, uint32_t* tile_color_buffer, std::unique_ptr &global_depth_buffer, - std::unique_ptr &global_color_buffer) { + std::unique_ptr &global_color_buffer, + bool use_early_z) { // 计算tile在屏幕空间的范围 size_t tile_x = tile_id % tiles_x; size_t tile_y = tile_id / tiles_x; @@ -556,11 +557,18 @@ void SimpleRenderer::RasterizeTile( size_t tile_index = tile_local_x + tile_local_y * tile_width; // tile内深度测试 - if (fragment.depth < tile_depth_buffer[tile_index]) { - tile_depth_buffer[tile_index] = fragment.depth; - + if (use_early_z) { // Early-Z模式:深度测试在Fragment Shader之前 + if (fragment.depth < tile_depth_buffer[tile_index]) { + auto color = shader_->FragmentShader(fragment); + tile_depth_buffer[tile_index] = fragment.depth; + tile_color_buffer[tile_index] = uint32_t(color); + } + } else { // Late-Z模式:Fragment Shader在深度测试之前 auto color = shader_->FragmentShader(fragment); - tile_color_buffer[tile_index] = uint32_t(color); + if (fragment.depth < tile_depth_buffer[tile_index]) { + tile_depth_buffer[tile_index] = fragment.depth; + tile_color_buffer[tile_index] = uint32_t(color); + } } } } @@ -580,26 +588,6 @@ void SimpleRenderer::RasterizeTile( } } -// 简单的颜色混合函数 (alpha blending) -uint32_t SimpleRenderer::BlendColors(uint32_t base, uint32_t overlay) { - // 提取RGBA通道 (假设是ABGR格式) - uint8_t base_r = (base >> 16) & 0xFF; - uint8_t base_g = (base >> 8) & 0xFF; - uint8_t base_b = base & 0xFF; - - uint8_t overlay_r = (overlay >> 16) & 0xFF; - uint8_t overlay_g = (overlay >> 8) & 0xFF; - uint8_t overlay_b = overlay & 0xFF; - uint8_t overlay_a = (overlay >> 24) & 0xFF; - - // 简单的alpha混合 - float alpha = overlay_a / 255.0f; - uint8_t result_r = (uint8_t)(base_r * (1.0f - alpha) + overlay_r * alpha); - uint8_t result_g = (uint8_t)(base_g * (1.0f - alpha) + overlay_g * alpha); - uint8_t result_b = (uint8_t)(base_b * (1.0f - alpha) + overlay_b * alpha); - - return 0xFF000000 | (result_r << 16) | (result_g << 8) | result_b; -} // 传统光栅化管线实现 SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline( @@ -784,7 +772,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( auto rasterization_start_time = std::chrono::high_resolution_clock::now(); #pragma omp parallel num_threads(kNProc) default(none) \ shared(tile_triangles, rasterizer_, shader_, width_, height_, \ - depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles) + depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles, \ + early_z_enabled_) { int thread_id = omp_get_thread_num(); auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; @@ -802,7 +791,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( RasterizeTile(tile_id, tile_triangles[tile_id], tiles_x, tiles_y, TILE_SIZE, tile_depth_buffer.get(), tile_color_buffer.get(), - depthBuffer_per_thread, colorBuffer_per_thread); + depthBuffer_per_thread, colorBuffer_per_thread, + early_z_enabled_); } } auto rasterization_end_time = std::chrono::high_resolution_clock::now(); From 1d2d9a9b596559f292d642068c44ed73cd158fcc Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Fri, 5 Sep 2025 23:05:05 +0800 Subject: [PATCH 10/24] TBR: Pre-allocate and reuse fragment caches; add RasterizeTo; two-pass counting in Binning to eliminate frequent dynamic memory reallocations. Signed-off-by: ZhouFANG --- src/include/rasterizer.hpp | 7 ++ src/include/renderer.h | 3 +- src/rasterizer.cpp | 70 ++++++++++++++- src/renderer.cpp | 176 ++++++++++++++++++++++++++++++------- 4 files changed, 224 insertions(+), 32 deletions(-) diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp index 749aa28..80b5f84 100644 --- a/src/include/rasterizer.hpp +++ b/src/include/rasterizer.hpp @@ -20,6 +20,13 @@ class Rasterizer { std::vector Rasterize(const Vertex& v0, const Vertex& v1, const Vertex& v2); + // 非分配版本:将片段直接写入调用方提供的容器 + // 可选的裁剪区域为半开区间 [x0, x1) × [y0, y1) + // 用于 TBR:将光栅化限制在 tile 边界内,便于复用外部 scratch 容器 + void RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2, + int x0, int y0, int x1, int y1, + std::vector& out); + private: size_t width_, height_; diff --git a/src/include/renderer.h b/src/include/renderer.h index 38f9dd0..97c9952 100755 --- a/src/include/renderer.h +++ b/src/include/renderer.h @@ -190,7 +190,8 @@ class SimpleRenderer { float* tile_depth_buffer, uint32_t* tile_color_buffer, std::unique_ptr &global_depth_buffer, std::unique_ptr &global_color_buffer, - bool use_early_z = false); + bool use_early_z = false, + std::vector* scratch_fragments = nullptr); /** diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index 7a8c602..2bbe161 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -1,6 +1,8 @@ #include "rasterizer.hpp" #include +#include +#include namespace simple_renderer { @@ -90,6 +92,72 @@ std::vector Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1, return fragments; } +void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2, + int x0, int y0, int x1, int y1, + std::vector& out) { + // 获取三角形的最小 box(屏幕空间) + Vector2f a = Vector2f(v0.GetPosition().x, v0.GetPosition().y); + Vector2f b = Vector2f(v1.GetPosition().x, v1.GetPosition().y); + Vector2f c = Vector2f(v2.GetPosition().x, v2.GetPosition().y); + + Vector2f bboxMin = + Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})}; + Vector2f bboxMax = + Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})}; + + // Clamp 到屏幕尺寸 + float minx = std::max(0.0f, bboxMin.x); + float miny = std::max(0.0f, bboxMin.y); + float maxx = std::min(float(width_ - 1), bboxMax.x); + float maxy = std::min(float(height_ - 1), bboxMax.y); + + // 与外部提供的裁剪区域(半开区间)相交,转成闭区间扫描 + int sx = std::max(x0, int(std::floor(minx))); + int sy = std::max(y0, int(std::floor(miny))); + int ex = std::min(x1 - 1, int(std::floor(maxx))); + int ey = std::min(y1 - 1, int(std::floor(maxy))); + + if (sx > ex || sy > ey) { + return; // 与裁剪区域无交 + } + + // 透视矫正插值使用与 Rasterize 相同逻辑,但单线程写入 out + float w0_inv = v0.GetPosition().w; + float w1_inv = v1.GetPosition().w; + float w2_inv = v2.GetPosition().w; + + for (int x = sx; x <= ex; ++x) { + for (int y = sy; y <= ey; ++y) { + auto [is_inside, barycentric_coord] = GetBarycentricCoord( + v0.GetPosition(), v1.GetPosition(), v2.GetPosition(), + Vector3f(static_cast(x), static_cast(y), 0)); + if (!is_inside) continue; + + // 插值 1/w 并进行透视矫正 + float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord); + Vector3f corrected_bary( + barycentric_coord.x * w0_inv / w_inv_interpolated, + barycentric_coord.y * w1_inv / w_inv_interpolated, + barycentric_coord.z * w2_inv / w_inv_interpolated); + + auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z, + v2.GetPosition().z, corrected_bary); + + Fragment fragment; + fragment.screen_coord = {x, y}; + fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(), + v2.GetNormal(), corrected_bary); + fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(), + v2.GetTexCoords(), corrected_bary); + fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(), + v2.GetColor(), corrected_bary); + fragment.depth = z; + + out.push_back(fragment); + } + } +} + std::pair Rasterizer::GetBarycentricCoord(const Vector3f& p0, const Vector3f& p1, const Vector3f& p2, @@ -157,4 +225,4 @@ Vector3f Rasterizer::CalculateNormal(const Vector3f& v0, const Vector3f& v1, glm::cross(edge1, edge2)); } -} // namespace simple_renderer \ No newline at end of file +} // namespace simple_renderer diff --git a/src/renderer.cpp b/src/renderer.cpp index faabc58..4d8306f 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -405,6 +405,86 @@ void SimpleRenderer::TriangleTileBinning( SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, height_, tile_size, tiles_x, tiles_y); + // 第一遍:仅统计每个 tile 的三角形数量以便预分配,避免 push_back 扩容 + std::vector tile_counts(tiles_x * tiles_y, 0); + for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) { + const auto &f = model.GetFaces()[tri_idx]; + auto v0 = screenVertices[f.GetIndex(0)]; + auto v1 = screenVertices[f.GetIndex(1)]; + auto v2 = screenVertices[f.GetIndex(2)]; + + if (v0.HasClipPosition()) { + Vector4f c0 = v0.GetClipPosition(); + Vector4f c1 = v1.GetClipPosition(); + Vector4f c2 = v2.GetClipPosition(); + bool frustum_cull = + (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || + (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) || + (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) || + (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) || + (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || + (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w); + if (frustum_cull) { + continue; + } + } + + Vector4f pos0 = v0.GetPosition(); + Vector4f pos1 = v1.GetPosition(); + Vector4f pos2 = v2.GetPosition(); + + Vector2f screen0(pos0.x, pos0.y); + Vector2f screen1(pos1.x, pos1.y); + Vector2f screen2(pos2.x, pos2.y); + Vector2f edge1 = screen1 - screen0; + Vector2f edge2 = screen2 - screen0; + float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; + if (cross_product > 0.0f) { + continue; + } + + bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f); + if (has_clipped_vertex) { + continue; + } + + float screen_x0 = pos0.x; + float screen_y0 = pos0.y; + float screen_x1 = pos1.x; + float screen_y1 = pos1.y; + float screen_x2 = pos2.x; + float screen_y2 = pos2.y; + + float min_x = std::min({screen_x0, screen_x1, screen_x2}); + float max_x = std::max({screen_x0, screen_x1, screen_x2}); + float min_y = std::min({screen_y0, screen_y1, screen_y2}); + float max_y = std::max({screen_y0, screen_y1, screen_y2}); + + int start_tile_x = std::max(0, static_cast(min_x) / static_cast(tile_size)); + int end_tile_x = std::min(static_cast(tiles_x - 1), + static_cast(max_x) / static_cast(tile_size)); + int start_tile_y = std::max(0, static_cast(min_y) / static_cast(tile_size)); + int end_tile_y = std::min(static_cast(tiles_y - 1), + static_cast(max_y) / static_cast(tile_size)); + + if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) { + continue; + } + + for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { + for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { + size_t tile_id = ty * tiles_x + tx; + tile_counts[tile_id]++; + } + } + } + + // 依据统计结果进行容量预留 + for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) { + if (tile_counts[tile_id] > 0) { + tile_triangles[tile_id].reserve(tile_counts[tile_id]); + } + } for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) { const auto &f = model.GetFaces()[tri_idx]; auto v0 = screenVertices[f.GetIndex(0)]; @@ -522,7 +602,8 @@ void SimpleRenderer::RasterizeTile( float* tile_depth_buffer, uint32_t* tile_color_buffer, std::unique_ptr &global_depth_buffer, std::unique_ptr &global_color_buffer, - bool use_early_z) { + bool use_early_z, + std::vector* scratch_fragments) { // 计算tile在屏幕空间的范围 size_t tile_x = tile_id % tiles_x; size_t tile_y = tile_id / tiles_x; @@ -539,38 +620,69 @@ void SimpleRenderer::RasterizeTile( std::fill_n(tile_color_buffer, tile_width * tile_height, 0); // 在tile内光栅化所有三角形 + (void)tiles_y; // 避免未使用参数告警 for (const auto &triangle : triangles) { - auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2); - - for (auto &fragment : fragments) { - fragment.material = triangle.material; - - size_t screen_x = fragment.screen_coord[0]; - size_t screen_y = fragment.screen_coord[1]; - - // 检查fragment是否在当前tile内 - if (screen_x >= screen_x_start && screen_x < screen_x_end && - screen_y >= screen_y_start && screen_y < screen_y_end) { - - size_t tile_local_x = screen_x - screen_x_start; - size_t tile_local_y = screen_y - screen_y_start; - size_t tile_index = tile_local_x + tile_local_y * tile_width; - - // tile内深度测试 - if (use_early_z) { // Early-Z模式:深度测试在Fragment Shader之前 - if (fragment.depth < tile_depth_buffer[tile_index]) { + // 复用线程本地 scratch 容器,限制在 tile 边界内栅格化 + if (scratch_fragments) { // 提供scratch容器 + scratch_fragments->clear(); + if (scratch_fragments->capacity() < tile_width * tile_height) { // 二次确认,为日后可能的可变tile进行设计 + scratch_fragments->reserve(tile_width * tile_height); + } + rasterizer_->RasterizeTo(triangle.v0, triangle.v1, triangle.v2, + static_cast(screen_x_start), static_cast(screen_y_start), + static_cast(screen_x_end), static_cast(screen_y_end), + *scratch_fragments); + + for (auto &fragment : *scratch_fragments) { + fragment.material = triangle.material; + size_t screen_x = fragment.screen_coord[0]; + size_t screen_y = fragment.screen_coord[1]; + if (screen_x >= screen_x_start && screen_x < screen_x_end && + screen_y >= screen_y_start && screen_y < screen_y_end) { + size_t tile_local_x = screen_x - screen_x_start; + size_t tile_local_y = screen_y - screen_y_start; + size_t tile_index = tile_local_x + tile_local_y * tile_width; + if (use_early_z) { + if (fragment.depth < tile_depth_buffer[tile_index]) { + auto color = shader_->FragmentShader(fragment); + tile_depth_buffer[tile_index] = fragment.depth; + tile_color_buffer[tile_index] = uint32_t(color); + } + } else { auto color = shader_->FragmentShader(fragment); - tile_depth_buffer[tile_index] = fragment.depth; - tile_color_buffer[tile_index] = uint32_t(color); - } - } else { // Late-Z模式:Fragment Shader在深度测试之前 - auto color = shader_->FragmentShader(fragment); - if (fragment.depth < tile_depth_buffer[tile_index]) { - tile_depth_buffer[tile_index] = fragment.depth; - tile_color_buffer[tile_index] = uint32_t(color); + if (fragment.depth < tile_depth_buffer[tile_index]) { + tile_depth_buffer[tile_index] = fragment.depth; + tile_color_buffer[tile_index] = uint32_t(color); + } } } + } + } else { // 不提供scratch容器的版本 + auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2); + for (auto &fragment : fragments) { + fragment.material = triangle.material; + size_t screen_x = fragment.screen_coord[0]; + size_t screen_y = fragment.screen_coord[1]; + if (screen_x >= screen_x_start && screen_x < screen_x_end && + screen_y >= screen_y_start && screen_y < screen_y_end) { + size_t tile_local_x = screen_x - screen_x_start; + size_t tile_local_y = screen_y - screen_y_start; + size_t tile_index = tile_local_x + tile_local_y * tile_width; + if (use_early_z) { + if (fragment.depth < tile_depth_buffer[tile_index]) { + auto color = shader_->FragmentShader(fragment); + tile_depth_buffer[tile_index] = fragment.depth; + tile_color_buffer[tile_index] = uint32_t(color); + } + } else { + auto color = shader_->FragmentShader(fragment); + if (fragment.depth < tile_depth_buffer[tile_index]) { + tile_depth_buffer[tile_index] = fragment.depth; + tile_color_buffer[tile_index] = uint32_t(color); + } } + } + } } } @@ -785,14 +897,18 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( std::unique_ptr tile_color_buffer = std::make_unique(TILE_SIZE * TILE_SIZE); + // 线程本地片段 scratch 容器(复用),容量按单 tile 上限预估 + std::vector scratch_fragments; + scratch_fragments.reserve(TILE_SIZE * TILE_SIZE); + #pragma omp for for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) { - // 按照tile进行光栅化 + // 按照tile进行光栅化,每个Tile进行区域限制+scratch复用,区域限制避免了可能的数据竞争 RasterizeTile(tile_id, tile_triangles[tile_id], tiles_x, tiles_y, TILE_SIZE, tile_depth_buffer.get(), tile_color_buffer.get(), depthBuffer_per_thread, colorBuffer_per_thread, - early_z_enabled_); + early_z_enabled_, &scratch_fragments); } } auto rasterization_end_time = std::chrono::high_resolution_clock::now(); From 8a743793ca6a867389b8b7956751114fb76668e1 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Sat, 6 Sep 2025 18:21:13 +0800 Subject: [PATCH 11/24] vertex optimization: avoid data movement and multi-stage memory reallocation Signed-off-by: ZhouFANG --- src/renderer.cpp | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/src/renderer.cpp b/src/renderer.cpp index 4d8306f..b08c7f2 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -101,34 +101,24 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { /* * * Vertex Transformation * * */ auto vertex_shader_start_time = std::chrono::high_resolution_clock::now(); std::vector processedVertices; - std::vector> processed_vertices_all_thread(kNProc); -#pragma omp parallel num_threads(kNProc) default(none) \ - shared(shader_, processed_vertices_all_thread) firstprivate(model) - { - int thread_id = omp_get_thread_num(); - std::vector &processedVertices_per_thread = - processed_vertices_all_thread[thread_id]; + const auto &input_vertices = model.GetVertices(); + processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配 -#pragma omp for - for (const auto &v : model.GetVertices()) { - // 顶点着色器:世界坐标 -> 裁剪坐标 - auto clipSpaceVertex = shader_->VertexShader(v); - - // 透视除法:裁剪坐标 -> NDC坐标 - auto ndcVertex = PerspectiveDivision(clipSpaceVertex); - - // 视口变换:NDC坐标 -> 屏幕坐标 - auto screenSpaceVertex = ViewportTransformation(ndcVertex); - - processedVertices_per_thread.push_back(screenSpaceVertex); - } - } +// 并行过程保持连续分块,避免false sharing +#pragma omp parallel for num_threads(kNProc) schedule(static) \ + shared(shader_, processedVertices, input_vertices) + for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理 + const auto &v = input_vertices[i]; + // 顶点着色器:世界坐标 -> 裁剪坐标 + auto clipSpaceVertex = shader_->VertexShader(v); + + // 透视除法:裁剪坐标 -> NDC坐标 + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + + // 视口变换:NDC坐标 -> 屏幕坐标 + auto screenSpaceVertex = ViewportTransformation(ndcVertex); - for (const auto &processedVertices_per_thread : - processed_vertices_all_thread) { - processedVertices.insert(processedVertices.end(), - processedVertices_per_thread.begin(), - processedVertices_per_thread.end()); + processedVertices[i] = screenSpaceVertex; } auto vertex_shader_end_time = std::chrono::high_resolution_clock::now(); auto vertex_shader_duration = std::chrono::duration_cast( From bb5acc1aca13f018a4132da22f81887e8328f6da Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Sat, 6 Sep 2025 23:03:26 +0800 Subject: [PATCH 12/24] TBR: Use SoA vertex layout to improve cache locality Signed-off-by: ZhouFANG --- src/include/rasterizer.hpp | 6 + src/include/renderer.h | 36 ++- src/include/vertex_soa.hpp | 33 +++ src/rasterizer.cpp | 61 +++++ src/renderer.cpp | 522 +++++++++++++++---------------------- 5 files changed, 328 insertions(+), 330 deletions(-) create mode 100644 src/include/vertex_soa.hpp diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp index 80b5f84..24e4a20 100644 --- a/src/include/rasterizer.hpp +++ b/src/include/rasterizer.hpp @@ -3,6 +3,7 @@ #include "config.h" #include "shader.hpp" +#include "vertex_soa.hpp" namespace simple_renderer { @@ -27,6 +28,11 @@ class Rasterizer { int x0, int y0, int x1, int y1, std::vector& out); + // SoA 版本:按顶点索引从 SoA 读取三角形三顶点 + void RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2, + int x0, int y0, int x1, int y1, + std::vector& out); + private: size_t width_, height_; diff --git a/src/include/renderer.h b/src/include/renderer.h index 97c9952..56c84c8 100755 --- a/src/include/renderer.h +++ b/src/include/renderer.h @@ -38,18 +38,12 @@ enum class RenderingMode { DEFERRED // 延迟渲染模式 - 经典GPU管线教学模拟 }; -// Face 只包含顶点索引,不包含实际的顶点数据; -// Vertex 包含3D坐标,但没有屏幕坐标 -// Fragment 包含屏幕坐标,但它是光栅化的结果,不是输入 -struct TriangleInfo { - Vertex v0, v1, v2; - const Material *material; - size_t face_index; - TriangleInfo(const Vertex& vertex0, const Vertex& vertex1, const Vertex& vertex2, - const Material* mat, size_t face_idx = 0) - : v0(vertex0), v1(vertex1), v2(vertex2), material(mat), face_index(face_idx) {} - - TriangleInfo() = default; + +// SoA 版 tile 列表中的三角形引用(仅存索引与材质指针) +struct TriangleRef { + size_t i0, i1, i2; + const Material* material = nullptr; + size_t face_index = 0; }; class SimpleRenderer { @@ -158,10 +152,9 @@ class SimpleRenderer { double deferred_shading_ms; double total_ms; }; - TileRenderStats ExecuteTileBasedPipeline(const Model &model, - const std::vector &processedVertices, - uint32_t *buffer); + const VertexSoA &soa, + uint32_t *buffer); /** * 延迟渲染管线 @@ -177,19 +170,24 @@ class SimpleRenderer { private: + + // SoA 版本的 Triangle-Tile binning(两遍计数 + reserve) void TriangleTileBinning( - const Model &model, - const std::vector &screenVertices, - std::vector> &tile_triangles, + const Model &model, + const VertexSoA &soa, + std::vector> &tile_triangles, size_t tiles_x, size_t tiles_y, size_t tile_size); + + // SoA 版本的 tile 光栅化 void RasterizeTile( size_t tile_id, - const std::vector &triangles, + const std::vector &triangles, size_t tiles_x, size_t tiles_y, size_t tile_size, float* tile_depth_buffer, uint32_t* tile_color_buffer, std::unique_ptr &global_depth_buffer, std::unique_ptr &global_color_buffer, + const VertexSoA &soa, bool use_early_z = false, std::vector* scratch_fragments = nullptr); diff --git a/src/include/vertex_soa.hpp b/src/include/vertex_soa.hpp new file mode 100644 index 0000000..4c5806a --- /dev/null +++ b/src/include/vertex_soa.hpp @@ -0,0 +1,33 @@ +// Minimal SoA layout for TBR pipeline (Phase 1) +#ifndef SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_ +#define SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_ + +#include + +#include "math.hpp" +#include "color.h" + +namespace simple_renderer { + +struct VertexSoA { + // 屏幕空间坐标(视口变换后) + std::vector pos_screen; // screen space position (x,y,z,w) + // 裁剪空间坐标(用于视锥体剔除):clip = MVP * pos + std::vector pos_clip; + std::vector normal; + std::vector uv; + std::vector color; + + inline size_t size() const { return pos_screen.size(); } + inline void resize(size_t n) { + pos_screen.resize(n); + pos_clip.resize(n); + normal.resize(n); + uv.resize(n); + color.resize(n); + } +}; + +} // namespace simple_renderer + +#endif // SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_ diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index 2bbe161..1ee2fff 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -158,6 +158,67 @@ void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v } } +void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2, + int x0, int y0, int x1, int y1, + std::vector& out) { + // 读取三顶点的屏幕空间位置 + const Vector4f& p0 = soa.pos_screen[i0]; + const Vector4f& p1 = soa.pos_screen[i1]; + const Vector4f& p2 = soa.pos_screen[i2]; + + Vector2f a = Vector2f(p0.x, p0.y); + Vector2f b = Vector2f(p1.x, p1.y); + Vector2f c = Vector2f(p2.x, p2.y); + + Vector2f bboxMin = Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})}; + Vector2f bboxMax = Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})}; + + // Clamp 到屏幕尺寸 + float minx = std::max(0.0f, bboxMin.x); + float miny = std::max(0.0f, bboxMin.y); + float maxx = std::min(float(width_ - 1), bboxMax.x); + float maxy = std::min(float(height_ - 1), bboxMax.y); + + // 与外部提供的裁剪区域相交(半开区间) -> 闭区间扫描 + int sx = std::max(x0, int(std::floor(minx))); + int sy = std::max(y0, int(std::floor(miny))); + int ex = std::min(x1 - 1, int(std::floor(maxx))); + int ey = std::min(y1 - 1, int(std::floor(maxy))); + if (sx > ex || sy > ey) return; + + // 透视矫正插值依赖 w + float w0_inv = p0.w; + float w1_inv = p1.w; + float w2_inv = p2.w; + + for (int x = sx; x <= ex; ++x) { + for (int y = sy; y <= ey; ++y) { + auto [is_inside, bary] = GetBarycentricCoord( + Vector3f(p0.x, p0.y, p0.z), Vector3f(p1.x, p1.y, p1.z), Vector3f(p2.x, p2.y, p2.z), + Vector3f(static_cast(x), static_cast(y), 0)); + if (!is_inside) continue; + + float w_inv_interp = Interpolate(w0_inv, w1_inv, w2_inv, bary); + Vector3f cb( + bary.x * w0_inv / w_inv_interp, + bary.y * w1_inv / w_inv_interp, + bary.z * w2_inv / w_inv_interp); + + float z = Interpolate(p0.z, p1.z, p2.z, cb); + + Fragment frag; + frag.screen_coord = {x, y}; + frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], cb); + frag.uv = Interpolate(soa.uv[i0], soa.uv[i1], soa.uv[i2], cb); + frag.color = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], cb); + frag.depth = z; + // material 指针由调用方填写 + + out.push_back(frag); + } + } +} + std::pair Rasterizer::GetBarycentricCoord(const Vector3f& p0, const Vector3f& p1, const Vector3f& p2, diff --git a/src/renderer.cpp b/src/renderer.cpp index b08c7f2..3167cf5 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -100,25 +100,42 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { /* * * Vertex Transformation * * */ auto vertex_shader_start_time = std::chrono::high_resolution_clock::now(); - std::vector processedVertices; const auto &input_vertices = model.GetVertices(); - processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配 - -// 并行过程保持连续分块,避免false sharing -#pragma omp parallel for num_threads(kNProc) schedule(static) \ + std::vector processedVertices; // 非 TBR + VertexSoA processedSoA; // TBR 专用 + + if (current_mode_ == RenderingMode::TILE_BASED) { + processedSoA.resize(input_vertices.size()); + // schedule(static)使并行过程保持连续分块,避免 false sharing +#pragma omp parallel for num_threads(kNProc) schedule(static) \ + shared(shader_, processedSoA, input_vertices) + for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理 + const auto &v = input_vertices[i]; + // 顶点着色器:世界坐标 -> 裁剪坐标 + auto clipSpaceVertex = shader_->VertexShader(v); + // 保存裁剪空间坐标用于后续视锥体裁剪 + processedSoA.pos_clip[i] = clipSpaceVertex.GetPosition(); + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + auto screenSpaceVertex = ViewportTransformation(ndcVertex); + + // 填充为SoA数据结构,用于优化缓存局部性 + processedSoA.pos_screen[i] = screenSpaceVertex.GetPosition(); + processedSoA.normal[i] = screenSpaceVertex.GetNormal(); + processedSoA.uv[i] = screenSpaceVertex.GetTexCoords(); + processedSoA.color[i] = screenSpaceVertex.GetColor(); + } + } else { // Tradition或Deffer管线 + processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配 + // 并行过程保持连续分块,避免false sharing +#pragma omp parallel for num_threads(kNProc) schedule(static) \ shared(shader_, processedVertices, input_vertices) - for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理 - const auto &v = input_vertices[i]; - // 顶点着色器:世界坐标 -> 裁剪坐标 - auto clipSpaceVertex = shader_->VertexShader(v); - - // 透视除法:裁剪坐标 -> NDC坐标 - auto ndcVertex = PerspectiveDivision(clipSpaceVertex); - - // 视口变换:NDC坐标 -> 屏幕坐标 - auto screenSpaceVertex = ViewportTransformation(ndcVertex); - - processedVertices[i] = screenSpaceVertex; + for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理 + const auto &v = input_vertices[i]; + auto clipSpaceVertex = shader_->VertexShader(v); + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + auto screenSpaceVertex = ViewportTransformation(ndcVertex); + processedVertices[i] = screenSpaceVertex; + } } auto vertex_shader_end_time = std::chrono::high_resolution_clock::now(); auto vertex_shader_duration = std::chrono::duration_cast( @@ -143,7 +160,7 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { } case RenderingMode::TILE_BASED: { - auto stats = ExecuteTileBasedPipeline(model, processedVertices, buffer); + auto stats = ExecuteTileBasedPipeline(model, processedSoA, buffer); double total_ms = vertex_ms + stats.total_ms; SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ==="); @@ -379,309 +396,193 @@ Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) { - -// Triangle-Tile binning函数 - 修正版本 +// SoA优化的Binning:两遍计数 + 预留 + 填充 TriangleRef void SimpleRenderer::TriangleTileBinning( - const Model &model, - const std::vector &screenVertices, - std::vector> &tile_triangles, + const Model &model, + const VertexSoA &soa, + std::vector> &tile_triangles, size_t tiles_x, size_t tiles_y, size_t tile_size) { - - size_t total_triangles = model.GetFaces().size(); - size_t processed_triangles = 0; - size_t triangles_with_clipped_vertices = 0; - - SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles); - SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", - width_, height_, tile_size, tiles_x, tiles_y); - - // 第一遍:仅统计每个 tile 的三角形数量以便预分配,避免 push_back 扩容 - std::vector tile_counts(tiles_x * tiles_y, 0); - for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) { - const auto &f = model.GetFaces()[tri_idx]; - auto v0 = screenVertices[f.GetIndex(0)]; - auto v1 = screenVertices[f.GetIndex(1)]; - auto v2 = screenVertices[f.GetIndex(2)]; - - if (v0.HasClipPosition()) { - Vector4f c0 = v0.GetClipPosition(); - Vector4f c1 = v1.GetClipPosition(); - Vector4f c2 = v2.GetClipPosition(); - bool frustum_cull = - (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || - (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) || - (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) || - (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) || - (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || - (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w); - if (frustum_cull) { - continue; - } - } + const size_t total_triangles = model.GetFaces().size(); + + SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles", total_triangles); + SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", + width_, height_, tile_size, tiles_x, tiles_y); + + std::vector tile_counts(tiles_x * tiles_y, 0); + + auto process_triangle = [&](size_t tri_idx, bool count_only) { + const auto &f = model.GetFaces()[tri_idx]; + size_t i0 = f.GetIndex(0); + size_t i1 = f.GetIndex(1); + size_t i2 = f.GetIndex(2); + + // 视锥体裁剪 (裁剪空间) + // 保守视锥体裁剪:只有当整个三角形都在视锥体外同一侧时才裁剪 + const Vector4f &c0 = soa.pos_clip[i0]; + const Vector4f &c1 = soa.pos_clip[i1]; + const Vector4f &c2 = soa.pos_clip[i2]; + bool frustum_cull = + (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || // 右平面外 + (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) || // 左平面外 + (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) || // 上平面外 + (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) || // 下平面外 + (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || // 远平面外 + (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w); // 近平面外 + if (frustum_cull) { + return; + } - Vector4f pos0 = v0.GetPosition(); - Vector4f pos1 = v1.GetPosition(); - Vector4f pos2 = v2.GetPosition(); - - Vector2f screen0(pos0.x, pos0.y); - Vector2f screen1(pos1.x, pos1.y); - Vector2f screen2(pos2.x, pos2.y); - Vector2f edge1 = screen1 - screen0; - Vector2f edge2 = screen2 - screen0; - float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; - if (cross_product > 0.0f) { - continue; + const Vector4f &pos0 = soa.pos_screen[i0]; + const Vector4f &pos1 = soa.pos_screen[i1]; + const Vector4f &pos2 = soa.pos_screen[i2]; + + // 背面剔除(屏幕空间) + // NDC空间中叉积为负表示顺时针,即背面。 + // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 + + Vector2f screen0(pos0.x, pos0.y); + Vector2f screen1(pos1.x, pos1.y); + Vector2f screen2(pos2.x, pos2.y); + Vector2f edge1 = screen1 - screen0; + Vector2f edge2 = screen2 - screen0; + float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; + if (cross_product > 0.0f) return; + + float screen_x0 = pos0.x; + float screen_y0 = pos0.y; + float screen_x1 = pos1.x; + float screen_y1 = pos1.y; + float screen_x2 = pos2.x; + float screen_y2 = pos2.y; + + // 计算屏幕bbox,用于后续tile划分 + float min_x = std::min({screen_x0, screen_x1, screen_x2}); + float max_x = std::max({screen_x0, screen_x1, screen_x2}); + float min_y = std::min({screen_y0, screen_y1, screen_y2}); + float max_y = std::max({screen_y0, screen_y1, screen_y2}); + + int start_tile_x = std::max(0, static_cast(min_x) / static_cast(tile_size)); + int end_tile_x = std::min(static_cast(tiles_x - 1), static_cast(max_x) / static_cast(tile_size)); + int start_tile_y = std::max(0, static_cast(min_y) / static_cast(tile_size)); + int end_tile_y = std::min(static_cast(tiles_y - 1), static_cast(max_y) / static_cast(tile_size)); + if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) return; // 如果bbox不在任何tile内,直接返回 + + if (count_only) { // 第一遍计数,只统计tile内三角形数量 + for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { + for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { + size_t tile_id = ty * tiles_x + tx; + tile_counts[tile_id]++; } - - bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f); - if (has_clipped_vertex) { - continue; + } + } else { // 第二遍填充,填充TriangleRef + TriangleRef tri_ref{ i0, i1, i2, &f.GetMaterial(), tri_idx }; + for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { + for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { + size_t tile_id = ty * tiles_x + tx; + tile_triangles[tile_id].push_back(tri_ref); } + } + } + }; - float screen_x0 = pos0.x; - float screen_y0 = pos0.y; - float screen_x1 = pos1.x; - float screen_y1 = pos1.y; - float screen_x2 = pos2.x; - float screen_y2 = pos2.y; - - float min_x = std::min({screen_x0, screen_x1, screen_x2}); - float max_x = std::max({screen_x0, screen_x1, screen_x2}); - float min_y = std::min({screen_y0, screen_y1, screen_y2}); - float max_y = std::max({screen_y0, screen_y1, screen_y2}); - - int start_tile_x = std::max(0, static_cast(min_x) / static_cast(tile_size)); - int end_tile_x = std::min(static_cast(tiles_x - 1), - static_cast(max_x) / static_cast(tile_size)); - int start_tile_y = std::max(0, static_cast(min_y) / static_cast(tile_size)); - int end_tile_y = std::min(static_cast(tiles_y - 1), - static_cast(max_y) / static_cast(tile_size)); - - if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) { - continue; - } + // 第一遍(count only):计算每个tile需要容纳多少三角形 + for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { + process_triangle(tri_idx, true); + } - for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { - for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { - size_t tile_id = ty * tiles_x + tx; - tile_counts[tile_id]++; - } - } - } + // 预分配,避免动态扩容 + for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) { + if (tile_counts[tile_id] > 0) tile_triangles[tile_id].reserve(tile_counts[tile_id]); + } - // 依据统计结果进行容量预留 - for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) { - if (tile_counts[tile_id] > 0) { - tile_triangles[tile_id].reserve(tile_counts[tile_id]); - } - } - for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) { - const auto &f = model.GetFaces()[tri_idx]; - auto v0 = screenVertices[f.GetIndex(0)]; - auto v1 = screenVertices[f.GetIndex(1)]; - auto v2 = screenVertices[f.GetIndex(2)]; - - // 视锥体裁剪 (裁剪空间) - if (v0.HasClipPosition()) { - Vector4f c0 = v0.GetClipPosition(); - Vector4f c1 = v1.GetClipPosition(); - Vector4f c2 = v2.GetClipPosition(); - - // 保守视锥体裁剪:只有当整个三角形都在视锥体外同一侧时才裁剪 - bool frustum_cull = - (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || // 右平面外 - (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) || // 左平面外 - (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) || // 上平面外 - (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) || // 下平面外 - (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || // 远平面外 - (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w); // 近平面外 - - if (frustum_cull) { - continue; - } - } - - // 获取屏幕空间坐标(现在已经是屏幕坐标了) - Vector4f pos0 = v0.GetPosition(); - Vector4f pos1 = v1.GetPosition(); - Vector4f pos2 = v2.GetPosition(); - - // 计算屏幕空间叉积判断朝向 - Vector2f screen0(pos0.x, pos0.y); - Vector2f screen1(pos1.x, pos1.y); - Vector2f screen2(pos2.x, pos2.y); - Vector2f edge1 = screen1 - screen0; - Vector2f edge2 = screen2 - screen0; - float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; - - // 背面剔除:NDC空间中叉积为负表示顺时针,即背面。 - // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 - if (cross_product > 0.0f) { - continue; - } - - // 检查三角形是否有被裁剪的顶点(坐标为-1000的表示被裁剪) - bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f); - - if (has_clipped_vertex) { - triangles_with_clipped_vertices++; - if (triangles_with_clipped_vertices <= 3) { - SPDLOG_INFO("Triangle {} has clipped vertices:", tri_idx); - SPDLOG_INFO(" V0: ({:.1f},{:.1f}) V1: ({:.1f},{:.1f}) V2: ({:.1f},{:.1f})", - pos0.x, pos0.y, pos1.x, pos1.y, pos2.x, pos2.y); - } - continue; - } - - // 直接使用屏幕空间坐标 - float screen_x0 = pos0.x; - float screen_y0 = pos0.y; - float screen_x1 = pos1.x; - float screen_y1 = pos1.y; - float screen_x2 = pos2.x; - float screen_y2 = pos2.y; - - // 计算bounding box - float min_x = std::min({screen_x0, screen_x1, screen_x2}); - float max_x = std::max({screen_x0, screen_x1, screen_x2}); - float min_y = std::min({screen_y0, screen_y1, screen_y2}); - float max_y = std::max({screen_y0, screen_y1, screen_y2}); - - - // 计算影响的tile范围 - int start_tile_x = std::max(0, static_cast(min_x) / static_cast(tile_size)); - int end_tile_x = std::min(static_cast(tiles_x - 1), - static_cast(max_x) / static_cast(tile_size)); - int start_tile_y = std::max(0, static_cast(min_y) / static_cast(tile_size)); - int end_tile_y = std::min(static_cast(tiles_y - 1), - static_cast(max_y) / static_cast(tile_size)); - - // 添加三角形到相关tiles(多个三角形可能会映射到同一个tile当中,所以谨慎并行化) - if (start_tile_x <= end_tile_x && start_tile_y <= end_tile_y) { - TriangleInfo triangle_info = {v0, v1, v2, &f.GetMaterial(), processed_triangles}; - - for (int ty = start_tile_y; ty <= end_tile_y; ty++) { - for (int tx = start_tile_x; tx <= end_tile_x; tx++) { - size_t tile_id = ty * tiles_x + tx; - tile_triangles[tile_id].push_back(triangle_info); // 可能多个线程同时pushback的话有风险 - } - } - processed_triangles++; - - } - } - - size_t total_triangle_refs = 0; - size_t non_empty_tiles = 0; - for (const auto& tile : tile_triangles) { - total_triangle_refs += tile.size(); - if (!tile.empty()) non_empty_tiles++; - } - - SPDLOG_INFO(" Total triangle references: {}", total_triangle_refs); - SPDLOG_INFO(" Non-empty tiles: {}", non_empty_tiles); - SPDLOG_INFO(" Average triangles per tile: {:.2f}", - total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f); + // 第二遍(fill):按范围填充TriangleRef + for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { + process_triangle(tri_idx, false); + } + + size_t total_triangle_refs = 0; + size_t non_empty_tiles = 0; + for (const auto& tile : tile_triangles) { + total_triangle_refs += tile.size(); + if (!tile.empty()) non_empty_tiles++; + } + SPDLOG_INFO(" (SoA) Total triangle references: {}", total_triangle_refs); + SPDLOG_INFO(" (SoA) Non-empty tiles: {}", non_empty_tiles); + SPDLOG_INFO(" (SoA) Average triangles per tile: {:.2f}", + total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f); } -// 单个tile光栅化函数 +// SoA 版:单个 tile 光栅化 void SimpleRenderer::RasterizeTile( size_t tile_id, - const std::vector &triangles, + const std::vector &triangles, size_t tiles_x, size_t tiles_y, size_t tile_size, float* tile_depth_buffer, uint32_t* tile_color_buffer, std::unique_ptr &global_depth_buffer, std::unique_ptr &global_color_buffer, + const VertexSoA &soa, bool use_early_z, std::vector* scratch_fragments) { - // 计算tile在屏幕空间的范围 + (void)tiles_y; + // 计算 tile 屏幕范围 size_t tile_x = tile_id % tiles_x; size_t tile_y = tile_id / tiles_x; size_t screen_x_start = tile_x * tile_size; size_t screen_y_start = tile_y * tile_size; size_t screen_x_end = std::min(screen_x_start + tile_size, width_); size_t screen_y_end = std::min(screen_y_start + tile_size, height_); - - // 初始化tile缓冲区 + + // 初始化 tile 局部缓冲 size_t tile_width = screen_x_end - screen_x_start; size_t tile_height = screen_y_end - screen_y_start; - std::fill_n(tile_depth_buffer, tile_width * tile_height, - 1.0f); // 初始化为最远深度(标准深度缓冲范围[0,1]) + std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f); std::fill_n(tile_color_buffer, tile_width * tile_height, 0); - - // 在tile内光栅化所有三角形 - (void)tiles_y; // 避免未使用参数告警 - for (const auto &triangle : triangles) { - // 复用线程本地 scratch 容器,限制在 tile 边界内栅格化 - if (scratch_fragments) { // 提供scratch容器 - scratch_fragments->clear(); - if (scratch_fragments->capacity() < tile_width * tile_height) { // 二次确认,为日后可能的可变tile进行设计 - scratch_fragments->reserve(tile_width * tile_height); - } - rasterizer_->RasterizeTo(triangle.v0, triangle.v1, triangle.v2, - static_cast(screen_x_start), static_cast(screen_y_start), - static_cast(screen_x_end), static_cast(screen_y_end), - *scratch_fragments); - - for (auto &fragment : *scratch_fragments) { - fragment.material = triangle.material; - size_t screen_x = fragment.screen_coord[0]; - size_t screen_y = fragment.screen_coord[1]; - if (screen_x >= screen_x_start && screen_x < screen_x_end && - screen_y >= screen_y_start && screen_y < screen_y_end) { - size_t tile_local_x = screen_x - screen_x_start; - size_t tile_local_y = screen_y - screen_y_start; - size_t tile_index = tile_local_x + tile_local_y * tile_width; - if (use_early_z) { - if (fragment.depth < tile_depth_buffer[tile_index]) { - auto color = shader_->FragmentShader(fragment); - tile_depth_buffer[tile_index] = fragment.depth; - tile_color_buffer[tile_index] = uint32_t(color); - } - } else { + + for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况 + // 始终走 SoA + 限制矩形的光栅化路径;如未提供 scratch,则使用函数内局部容器 + std::vector local_out; + std::vector &out = scratch_fragments ? *scratch_fragments : local_out; + + out.clear(); + if (out.capacity() < tile_width * tile_height) { + out.reserve(tile_width * tile_height); + } + + rasterizer_->RasterizeTo(soa, tri.i0, tri.i1, tri.i2, + static_cast(screen_x_start), static_cast(screen_y_start), + static_cast(screen_x_end), static_cast(screen_y_end), + out); + + for (auto &fragment : out) { + fragment.material = tri.material; + size_t sx = fragment.screen_coord[0]; + size_t sy = fragment.screen_coord[1]; + if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start && sy < screen_y_end) { + size_t local_x = sx - screen_x_start; + size_t local_y = sy - screen_y_start; + size_t idx = local_x + local_y * tile_width; + if (use_early_z) { + if (fragment.depth < tile_depth_buffer[idx]) { auto color = shader_->FragmentShader(fragment); - if (fragment.depth < tile_depth_buffer[tile_index]) { - tile_depth_buffer[tile_index] = fragment.depth; - tile_color_buffer[tile_index] = uint32_t(color); - } + tile_depth_buffer[idx] = fragment.depth; + tile_color_buffer[idx] = uint32_t(color); } - } - } - } else { // 不提供scratch容器的版本 - auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2); - for (auto &fragment : fragments) { - fragment.material = triangle.material; - size_t screen_x = fragment.screen_coord[0]; - size_t screen_y = fragment.screen_coord[1]; - if (screen_x >= screen_x_start && screen_x < screen_x_end && - screen_y >= screen_y_start && screen_y < screen_y_end) { - size_t tile_local_x = screen_x - screen_x_start; - size_t tile_local_y = screen_y - screen_y_start; - size_t tile_index = tile_local_x + tile_local_y * tile_width; - if (use_early_z) { - if (fragment.depth < tile_depth_buffer[tile_index]) { - auto color = shader_->FragmentShader(fragment); - tile_depth_buffer[tile_index] = fragment.depth; - tile_color_buffer[tile_index] = uint32_t(color); - } - } else { - auto color = shader_->FragmentShader(fragment); - if (fragment.depth < tile_depth_buffer[tile_index]) { - tile_depth_buffer[tile_index] = fragment.depth; - tile_color_buffer[tile_index] = uint32_t(color); - } + } else { + auto color = shader_->FragmentShader(fragment); + if (fragment.depth < tile_depth_buffer[idx]) { + tile_depth_buffer[idx] = fragment.depth; + tile_color_buffer[idx] = uint32_t(color); } } } } } - - // 将tile结果写入全局缓冲区 + + // 写回全局缓冲 for (size_t y = 0; y < tile_height; y++) { for (size_t x = 0; x < tile_width; x++) { size_t tile_index = x + y * tile_width; size_t global_index = (screen_x_start + x) + (screen_y_start + y) * width_; - if (tile_depth_buffer[tile_index] < global_depth_buffer[global_index]) { global_depth_buffer[global_index] = tile_depth_buffer[tile_index]; global_color_buffer[global_index] = tile_color_buffer[tile_index]; @@ -690,8 +591,7 @@ void SimpleRenderer::RasterizeTile( } } - -// 传统光栅化管线实现 +// 基础光栅化管线实现 SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline( const Model &model, const std::vector &processedVertices, @@ -822,46 +722,46 @@ SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline( return stats; } -// Tile-based光栅化管线实现 + +// Tile-based光栅化管线实现(SoA 直连版本,避免 AoS->SoA 拷贝) SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( const Model &model, - const std::vector &processedVertices, + const VertexSoA &soa, uint32_t *buffer) { - TileRenderStats stats; auto total_start_time = std::chrono::high_resolution_clock::now(); - + // 1. Setup阶段 auto setup_start_time = std::chrono::high_resolution_clock::now(); const size_t TILE_SIZE = 64; // 64x64 pixels per tile const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE; const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE; const size_t total_tiles = tiles_x * tiles_y; - - // 为每个tile创建三角形列表 - std::vector> tile_triangles(total_tiles); + + // 为每个tile创建三角形列表(SoA 引用) + std::vector> tile_triangles(total_tiles); auto setup_end_time = std::chrono::high_resolution_clock::now(); auto setup_duration = std::chrono::duration_cast( setup_end_time - setup_start_time); - - // 2. Triangle-Tile binning阶段 + + // 2. Triangle-Tile binning阶段(SoA) auto binning_start_time = std::chrono::high_resolution_clock::now(); - TriangleTileBinning(model, processedVertices, tile_triangles, tiles_x, tiles_y, TILE_SIZE); + TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE); auto binning_end_time = std::chrono::high_resolution_clock::now(); auto binning_duration = std::chrono::duration_cast( binning_end_time - binning_start_time); - + // 3. 为每个线程创建framebuffer auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); std::vector> depthBuffer_all_thread(kNProc); std::vector> colorBuffer_all_thread(kNProc); - + for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { depthBuffer_all_thread[thread_id] = std::make_unique(width_ * height_); colorBuffer_all_thread[thread_id] = std::make_unique(width_ * height_); - + std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_, std::numeric_limits::infinity()); std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0); @@ -869,13 +769,13 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); auto buffer_alloc_duration = std::chrono::duration_cast( buffer_alloc_end_time - buffer_alloc_start_time); - - // 4. 并行处理每个tile + + // 4. 并行处理每个tile(SoA) auto rasterization_start_time = std::chrono::high_resolution_clock::now(); #pragma omp parallel num_threads(kNProc) default(none) \ shared(tile_triangles, rasterizer_, shader_, width_, height_, \ depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles, \ - early_z_enabled_) + early_z_enabled_, soa) { int thread_id = omp_get_thread_num(); auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; @@ -893,18 +793,18 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( #pragma omp for for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) { - // 按照tile进行光栅化,每个Tile进行区域限制+scratch复用,区域限制避免了可能的数据竞争 - RasterizeTile(tile_id, tile_triangles[tile_id], - tiles_x, tiles_y, TILE_SIZE, - tile_depth_buffer.get(), tile_color_buffer.get(), - depthBuffer_per_thread, colorBuffer_per_thread, - early_z_enabled_, &scratch_fragments); + // 按照 tile 进行光栅化(SoA) + RasterizeTile(tile_id, tile_triangles[tile_id], + tiles_x, tiles_y, TILE_SIZE, + tile_depth_buffer.get(), tile_color_buffer.get(), + depthBuffer_per_thread, colorBuffer_per_thread, + soa, early_z_enabled_, &scratch_fragments); } } auto rasterization_end_time = std::chrono::high_resolution_clock::now(); auto rasterization_duration = std::chrono::duration_cast( rasterization_end_time - rasterization_start_time); - + // 5. 合并所有线程结果 auto merge_start_time = std::chrono::high_resolution_clock::now(); std::unique_ptr depthBuffer = @@ -936,11 +836,11 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( auto merge_end_time = std::chrono::high_resolution_clock::now(); auto merge_duration = std::chrono::duration_cast( merge_end_time - merge_start_time); - + auto total_end_time = std::chrono::high_resolution_clock::now(); auto total_duration = std::chrono::duration_cast( total_end_time - total_start_time); - + // 填充统计信息 stats.setup_ms = setup_duration.count() / 1000.0; stats.binning_ms = binning_duration.count() / 1000.0; @@ -948,7 +848,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( stats.rasterization_ms = rasterization_duration.count() / 1000.0; stats.merge_ms = merge_duration.count() / 1000.0; stats.total_ms = total_duration.count() / 1000.0; - + return stats; } From 05492110e6214826c41bfb837e91aaf29b938abc Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Sun, 7 Sep 2025 20:13:37 +0800 Subject: [PATCH 13/24] TBR: Use global framebuffer to avoid merge overhead Signed-off-by: ZhouFANG --- src/rasterizer.cpp | 66 ----------------------------------------- src/renderer.cpp | 74 +++++++++++++--------------------------------- 2 files changed, 21 insertions(+), 119 deletions(-) diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index 1ee2fff..f4e251c 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -92,72 +92,6 @@ std::vector Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1, return fragments; } -void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2, - int x0, int y0, int x1, int y1, - std::vector& out) { - // 获取三角形的最小 box(屏幕空间) - Vector2f a = Vector2f(v0.GetPosition().x, v0.GetPosition().y); - Vector2f b = Vector2f(v1.GetPosition().x, v1.GetPosition().y); - Vector2f c = Vector2f(v2.GetPosition().x, v2.GetPosition().y); - - Vector2f bboxMin = - Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})}; - Vector2f bboxMax = - Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})}; - - // Clamp 到屏幕尺寸 - float minx = std::max(0.0f, bboxMin.x); - float miny = std::max(0.0f, bboxMin.y); - float maxx = std::min(float(width_ - 1), bboxMax.x); - float maxy = std::min(float(height_ - 1), bboxMax.y); - - // 与外部提供的裁剪区域(半开区间)相交,转成闭区间扫描 - int sx = std::max(x0, int(std::floor(minx))); - int sy = std::max(y0, int(std::floor(miny))); - int ex = std::min(x1 - 1, int(std::floor(maxx))); - int ey = std::min(y1 - 1, int(std::floor(maxy))); - - if (sx > ex || sy > ey) { - return; // 与裁剪区域无交 - } - - // 透视矫正插值使用与 Rasterize 相同逻辑,但单线程写入 out - float w0_inv = v0.GetPosition().w; - float w1_inv = v1.GetPosition().w; - float w2_inv = v2.GetPosition().w; - - for (int x = sx; x <= ex; ++x) { - for (int y = sy; y <= ey; ++y) { - auto [is_inside, barycentric_coord] = GetBarycentricCoord( - v0.GetPosition(), v1.GetPosition(), v2.GetPosition(), - Vector3f(static_cast(x), static_cast(y), 0)); - if (!is_inside) continue; - - // 插值 1/w 并进行透视矫正 - float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord); - Vector3f corrected_bary( - barycentric_coord.x * w0_inv / w_inv_interpolated, - barycentric_coord.y * w1_inv / w_inv_interpolated, - barycentric_coord.z * w2_inv / w_inv_interpolated); - - auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z, - v2.GetPosition().z, corrected_bary); - - Fragment fragment; - fragment.screen_coord = {x, y}; - fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(), - v2.GetNormal(), corrected_bary); - fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(), - v2.GetTexCoords(), corrected_bary); - fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(), - v2.GetColor(), corrected_bary); - fragment.depth = z; - - out.push_back(fragment); - } - } -} - void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2, int x0, int y0, int x1, int y1, std::vector& out) { diff --git a/src/renderer.cpp b/src/renderer.cpp index 3167cf5..fb57a60 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -751,21 +751,14 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( auto binning_duration = std::chrono::duration_cast( binning_end_time - binning_start_time); - // 3. 为每个线程创建framebuffer + // 3. 全局 framebuffer(单份) + // 直接让每个 tile 写入这份全局缓冲区,避免末端 O(W*H*kNProc) 合并开销 auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); - std::vector> depthBuffer_all_thread(kNProc); - std::vector> colorBuffer_all_thread(kNProc); - - for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { - depthBuffer_all_thread[thread_id] = - std::make_unique(width_ * height_); - colorBuffer_all_thread[thread_id] = - std::make_unique(width_ * height_); - - std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_, - std::numeric_limits::infinity()); - std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0); - } + std::unique_ptr depthBuffer = std::make_unique(width_ * height_); + std::unique_ptr colorBuffer = std::make_unique(width_ * height_); + // 深度初始化为最远值,颜色清零 + std::fill_n(depthBuffer.get(), width_ * height_, std::numeric_limits::infinity()); + std::fill_n(colorBuffer.get(), width_ * height_, 0); auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); auto buffer_alloc_duration = std::chrono::duration_cast( buffer_alloc_end_time - buffer_alloc_start_time); @@ -774,14 +767,12 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( auto rasterization_start_time = std::chrono::high_resolution_clock::now(); #pragma omp parallel num_threads(kNProc) default(none) \ shared(tile_triangles, rasterizer_, shader_, width_, height_, \ - depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles, \ + depthBuffer, colorBuffer, tiles_x, tiles_y, total_tiles, \ early_z_enabled_, soa) { int thread_id = omp_get_thread_num(); - auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; - auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id]; - // 为当前线程创建tile局部缓冲区 + // 为当前线程创建 tile 局部缓冲区(避免在全局缓冲上直接逐像素竞争) std::unique_ptr tile_depth_buffer = std::make_unique(TILE_SIZE * TILE_SIZE); std::unique_ptr tile_color_buffer = @@ -794,48 +785,24 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( #pragma omp for for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) { // 按照 tile 进行光栅化(SoA) + // 直接写入单份全局 framebuffer;不同 tile 不重叠,无需加锁 RasterizeTile(tile_id, tile_triangles[tile_id], - tiles_x, tiles_y, TILE_SIZE, - tile_depth_buffer.get(), tile_color_buffer.get(), - depthBuffer_per_thread, colorBuffer_per_thread, - soa, early_z_enabled_, &scratch_fragments); + tiles_x, tiles_y, TILE_SIZE, + tile_depth_buffer.get(), tile_color_buffer.get(), + depthBuffer, colorBuffer, + soa, early_z_enabled_, &scratch_fragments); } } auto rasterization_end_time = std::chrono::high_resolution_clock::now(); auto rasterization_duration = std::chrono::duration_cast( rasterization_end_time - rasterization_start_time); - // 5. 合并所有线程结果 - auto merge_start_time = std::chrono::high_resolution_clock::now(); - std::unique_ptr depthBuffer = - std::make_unique(width_ * height_); - std::unique_ptr colorBuffer = - std::make_unique(width_ * height_); - - std::fill_n(depthBuffer.get(), width_ * height_, - std::numeric_limits::infinity()); - std::fill_n(colorBuffer.get(), width_ * height_, 0); - -#pragma omp parallel for - for (size_t i = 0; i < width_ * height_; i++) { - float min_depth = std::numeric_limits::infinity(); - uint32_t color = 0; - - for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { - float depth = depthBuffer_all_thread[thread_id][i]; - if (depth < min_depth) { - min_depth = depth; - color = colorBuffer_all_thread[thread_id][i]; - } - } - depthBuffer[i] = min_depth; - colorBuffer[i] = color; - } - + // 5. 直接将单份全局 colorBuffer 拷贝到输出 + auto present_start_time = std::chrono::high_resolution_clock::now(); std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); - auto merge_end_time = std::chrono::high_resolution_clock::now(); - auto merge_duration = std::chrono::duration_cast( - merge_end_time - merge_start_time); + auto present_end_time = std::chrono::high_resolution_clock::now(); + auto present_duration = std::chrono::duration_cast( + present_end_time - present_start_time); auto total_end_time = std::chrono::high_resolution_clock::now(); auto total_duration = std::chrono::duration_cast( @@ -846,7 +813,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( stats.binning_ms = binning_duration.count() / 1000.0; stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; stats.rasterization_ms = rasterization_duration.count() / 1000.0; - stats.merge_ms = merge_duration.count() / 1000.0; + // 合并阶段已被消除,仅为拷贝开销 + stats.merge_ms = present_duration.count() / 1000.0; stats.total_ms = total_duration.count() / 1000.0; return stats; From 258607acca4879916af23e4431baa8003a340cf9 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Thu, 11 Sep 2025 14:26:35 +0800 Subject: [PATCH 14/24] TBR: Optimize global buffer write-back logic Signed-off-by: ZhouFANG --- src/renderer.cpp | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/renderer.cpp b/src/renderer.cpp index fb57a60..b449b93 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -536,7 +536,7 @@ void SimpleRenderer::RasterizeTile( size_t tile_width = screen_x_end - screen_x_start; size_t tile_height = screen_y_end - screen_y_start; std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f); - std::fill_n(tile_color_buffer, tile_width * tile_height, 0); + std::fill_n(tile_color_buffer, tile_width * tile_height, 0); // 默认背景色为0/黑色 for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况 // 始终走 SoA + 限制矩形的光栅化路径;如未提供 scratch,则使用函数内局部容器 @@ -579,15 +579,21 @@ void SimpleRenderer::RasterizeTile( } // 写回全局缓冲 + // TBR 下不同 tile 覆盖的屏幕区域互不重叠,且在 tile 内部已通过 Early‑Z + // 得出每个像素的最终值。因此可以直接将 tile 行数据拷贝到全局缓冲 for (size_t y = 0; y < tile_height; y++) { - for (size_t x = 0; x < tile_width; x++) { - size_t tile_index = x + y * tile_width; - size_t global_index = (screen_x_start + x) + (screen_y_start + y) * width_; - if (tile_depth_buffer[tile_index] < global_depth_buffer[global_index]) { - global_depth_buffer[global_index] = tile_depth_buffer[tile_index]; - global_color_buffer[global_index] = tile_color_buffer[tile_index]; - } - } + const size_t tile_row_off = y * tile_width; + const size_t global_row_off = (screen_y_start + y) * width_ + screen_x_start; + + // 拷贝本行 color 到全局 color + std::memcpy(global_color_buffer.get() + global_row_off, + tile_color_buffer + tile_row_off, + tile_width * sizeof(uint32_t)); + + // 拷贝本行 depth 到全局 depth + std::memcpy(global_depth_buffer.get() + global_row_off, + tile_depth_buffer + tile_row_off, + tile_width * sizeof(float)); } } From ffe0d756aee01a2356adf4a5ea904b68a0be3445 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Fri, 12 Sep 2025 20:00:42 +0800 Subject: [PATCH 15/24] Optimize perspective correction, add helper func, simplify code Signed-off-by: ZhouFANG --- src/include/rasterizer.hpp | 18 ++++- src/include/renderer.h | 36 +++++++--- src/include/vertex.hpp | 43 ++++++++---- src/include/vertex_soa.hpp | 33 --------- src/rasterizer.cpp | 89 +++++++++++++----------- src/renderer.cpp | 139 ++++++++++++++++++------------------- test/system_test/main.cpp | 14 +--- 7 files changed, 188 insertions(+), 184 deletions(-) delete mode 100644 src/include/vertex_soa.hpp diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp index 24e4a20..c389f49 100644 --- a/src/include/rasterizer.hpp +++ b/src/include/rasterizer.hpp @@ -3,7 +3,7 @@ #include "config.h" #include "shader.hpp" -#include "vertex_soa.hpp" +#include "vertex.hpp" namespace simple_renderer { @@ -36,13 +36,25 @@ class Rasterizer { private: size_t width_, height_; + // 透视矫正结果 + struct PerspectiveCorrectionResult { + Vector3f corrected_barycentric; + float interpolated_z; + }; + + // 透视矫正helper函数 + PerspectiveCorrectionResult PerformPerspectiveCorrection( + float w0, float w1, float w2, + float z0, float z1, float z2, + const Vector3f& original_barycentric) const; + template T Interpolate(const T& v0, const T& v1, const T& v2, - const Vector3f& barycentric_coord); + const Vector3f& barycentric_coord) const; Color InterpolateColor(const Color& color0, const Color& color1, const Color& color2, - const Vector3f& barycentric_coord); + const Vector3f& barycentric_coord) const; std::pair GetBarycentricCoord(const Vector3f& p0, const Vector3f& p1, diff --git a/src/include/renderer.h b/src/include/renderer.h index 56c84c8..f239910 100755 --- a/src/include/renderer.h +++ b/src/include/renderer.h @@ -20,6 +20,7 @@ #include #include #include +#include #include "buffer.hpp" #include "light.h" @@ -38,6 +39,10 @@ enum class RenderingMode { DEFERRED // 延迟渲染模式 - 经典GPU管线教学模拟 }; +// RenderingMode辅助函数声明 +std::string RenderingModeToString(RenderingMode mode); +std::string RenderingModeToDetailedString(RenderingMode mode); + // SoA 版 tile 列表中的三角形引用(仅存索引与材质指针) struct TriangleRef { @@ -94,11 +99,15 @@ class SimpleRenderer { const size_t width_; LogSystem log_system_; RenderingMode current_mode_; // 当前渲染模式 - bool early_z_enabled_; // Early-Z优化开关 + bool is_early_z_enabled_; // Early-Z优化开关 std::shared_ptr shader_; std::shared_ptr rasterizer_; + // Rendering constants + static constexpr float kMinWValue = 1e-6f; // W分量检查阈值(避免除零) + static constexpr size_t kDefaultTileSize = 64; // 默认Tile大小(64x64像素) + /** * 执行绘制管线 * @param model 模型 @@ -125,13 +134,6 @@ class SimpleRenderer { const std::vector &processedVertices, uint32_t *buffer); - /** - * Tile-based光栅化渲染 - * @param model 模型 - * @param processedVertices 已处理的顶点 - * @param buffer 输出缓冲区 - * @return 渲染统计信息 - */ struct TileRenderStats { double setup_ms; double binning_ms; @@ -167,10 +169,8 @@ class SimpleRenderer { const std::vector &processedVertices, uint32_t *buffer); - private: - // SoA 版本的 Triangle-Tile binning(两遍计数 + reserve) void TriangleTileBinning( const Model &model, @@ -191,7 +191,6 @@ class SimpleRenderer { bool use_early_z = false, std::vector* scratch_fragments = nullptr); - /** * 透视除法 - 将裁剪空间坐标转换为归一化设备坐标(NDC) * @param vertex 裁剪空间坐标的顶点 @@ -206,6 +205,21 @@ class SimpleRenderer { */ Vertex ViewportTransformation(const Vertex &vertex); + /** + * 打印传统渲染性能统计信息 + */ + void PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const; + + /** + * 打印基于Tile渲染性能统计信息 + */ + void PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const; + + /** + * 打印延迟渲染性能统计信息 + */ + void PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const; + }; } // namespace simple_renderer diff --git a/src/include/vertex.hpp b/src/include/vertex.hpp index bff0680..b00f648 100644 --- a/src/include/vertex.hpp +++ b/src/include/vertex.hpp @@ -1,6 +1,9 @@ #ifndef SIMPLERENDER_SRC_INCLUDE_VERTEX_HPP_ #define SIMPLERENDER_SRC_INCLUDE_VERTEX_HPP_ +#include +#include + #include #include "color.h" @@ -31,18 +34,13 @@ class Vertex { // 析构函数 ~Vertex() = default; - // Constructor with parameters 带参数的构造函数 - explicit Vertex(const Vector4f& pos, const Vector3f& norm, - const Vector2f& tex, const Color& color_) - : position_(pos), normal_(norm), texCoords_(tex), color_(color_), - clip_position_(pos), has_clip_position_(false) {} - - // 扩展构造函数:包含裁剪空间坐标 + // Constructor with parameters: optional clip space coordinate + // 带参数的构造函数:可选的裁剪空间坐标 explicit Vertex(const Vector4f& pos, const Vector3f& norm, const Vector2f& tex, const Color& color_, - const Vector4f& clip_pos) + std::optional clip_pos = std::nullopt) : position_(pos), normal_(norm), texCoords_(tex), color_(color_), - clip_position_(clip_pos), has_clip_position_(true) {} + clip_position_(clip_pos) {} // Transform the vertex with a matrix 使用矩阵变换顶点 void transform(const Matrix4f& matrix) { position_ = matrix * position_; } @@ -55,8 +53,8 @@ class Vertex { [[nodiscard]] inline Color GetColor() const { return color_; } // 扩展坐标访问 - [[nodiscard]] inline Vector4f GetClipPosition() const { return clip_position_; } - [[nodiscard]] inline bool HasClipPosition() const { return has_clip_position_; } + [[nodiscard]] inline std::optional GetClipPosition() const { return clip_position_; } + [[nodiscard]] inline bool HasClipPosition() const { return clip_position_.has_value(); } private: Vector4f position_; // 3D position, 3D顶点坐标 @@ -65,8 +63,7 @@ class Vertex { Color color_; // 扩展坐标用于裁剪优化 - Vector4f clip_position_; // 裁剪空间坐标 (用于视锥体裁剪) - bool has_clip_position_; // 是否包含裁剪坐标 + std::optional clip_position_; // 裁剪空间坐标 (用于视锥体裁剪) }; inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) { @@ -75,6 +72,26 @@ inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) { vertex.GetColor()); } +// Minimal SoA layout for TBR pipeline +struct VertexSoA { + // 屏幕空间坐标(视口变换后) + std::vector pos_screen; // screen space position (x,y,z,w) + // 裁剪空间坐标(用于视锥体剔除):clip = MVP * pos + std::vector pos_clip; + std::vector normal; + std::vector uv; + std::vector color; + + inline size_t size() const { return pos_screen.size(); } + inline void resize(size_t n) { + pos_screen.resize(n); + pos_clip.resize(n); + normal.resize(n); + uv.resize(n); + color.resize(n); + } +}; + } // namespace simple_renderer #endif \ No newline at end of file diff --git a/src/include/vertex_soa.hpp b/src/include/vertex_soa.hpp deleted file mode 100644 index 4c5806a..0000000 --- a/src/include/vertex_soa.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// Minimal SoA layout for TBR pipeline (Phase 1) -#ifndef SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_ -#define SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_ - -#include - -#include "math.hpp" -#include "color.h" - -namespace simple_renderer { - -struct VertexSoA { - // 屏幕空间坐标(视口变换后) - std::vector pos_screen; // screen space position (x,y,z,w) - // 裁剪空间坐标(用于视锥体剔除):clip = MVP * pos - std::vector pos_clip; - std::vector normal; - std::vector uv; - std::vector color; - - inline size_t size() const { return pos_screen.size(); } - inline void resize(size_t n) { - pos_screen.resize(n); - pos_clip.resize(n); - normal.resize(n); - uv.resize(n); - color.resize(n); - } -}; - -} // namespace simple_renderer - -#endif // SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_ diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index f4e251c..a30101b 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -50,24 +50,13 @@ std::vector Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1, } // 透视矫正插值 - // 1. 获取三个顶点的1/w值 - float w0_inv = v0.GetPosition().w; - float w1_inv = v1.GetPosition().w; - float w2_inv = v2.GetPosition().w; + auto perspective_result = PerformPerspectiveCorrection( + v0.GetPosition().w, v1.GetPosition().w, v2.GetPosition().w, + v0.GetPosition().z, v1.GetPosition().z, v2.GetPosition().z, + barycentric_coord); - // 2. 插值1/w - float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord); - - // 3. 计算透视矫正的重心坐标 - Vector3f corrected_bary( - barycentric_coord.x * w0_inv / w_inv_interpolated, - barycentric_coord.y * w1_inv / w_inv_interpolated, - barycentric_coord.z * w2_inv / w_inv_interpolated - ); - - // 4. 使用矫正的重心坐标进行插值 - auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z, - v2.GetPosition().z, corrected_bary); + const Vector3f& corrected_bary = perspective_result.corrected_barycentric; + float z = perspective_result.interpolated_z; Fragment fragment; @@ -114,17 +103,12 @@ void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t float maxy = std::min(float(height_ - 1), bboxMax.y); // 与外部提供的裁剪区域相交(半开区间) -> 闭区间扫描 - int sx = std::max(x0, int(std::floor(minx))); - int sy = std::max(y0, int(std::floor(miny))); - int ex = std::min(x1 - 1, int(std::floor(maxx))); - int ey = std::min(y1 - 1, int(std::floor(maxy))); + int sx = std::max(x0, static_cast(std::floor(minx))); + int sy = std::max(y0, static_cast(std::floor(miny))); + int ex = std::min(x1 - 1, static_cast(std::floor(maxx))); + int ey = std::min(y1 - 1, static_cast(std::floor(maxy))); if (sx > ex || sy > ey) return; - // 透视矫正插值依赖 w - float w0_inv = p0.w; - float w1_inv = p1.w; - float w2_inv = p2.w; - for (int x = sx; x <= ex; ++x) { for (int y = sy; y <= ey; ++y) { auto [is_inside, bary] = GetBarycentricCoord( @@ -132,21 +116,21 @@ void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t Vector3f(static_cast(x), static_cast(y), 0)); if (!is_inside) continue; - float w_inv_interp = Interpolate(w0_inv, w1_inv, w2_inv, bary); - Vector3f cb( - bary.x * w0_inv / w_inv_interp, - bary.y * w1_inv / w_inv_interp, - bary.z * w2_inv / w_inv_interp); + // 透视矫正插值 + auto perspective_result = PerformPerspectiveCorrection( + p0.w, p1.w, p2.w, + p0.z, p1.z, p2.z, + bary); + + const Vector3f& corrected_bary = perspective_result.corrected_barycentric; + float z = perspective_result.interpolated_z; - float z = Interpolate(p0.z, p1.z, p2.z, cb); - - Fragment frag; + Fragment frag; // Note: material 指针由调用方填写 frag.screen_coord = {x, y}; - frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], cb); - frag.uv = Interpolate(soa.uv[i0], soa.uv[i1], soa.uv[i2], cb); - frag.color = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], cb); + frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], corrected_bary); + frag.uv = Interpolate(soa.uv[i0], soa.uv[i1], soa.uv[i2], corrected_bary); + frag.color = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], corrected_bary); frag.depth = z; - // material 指针由调用方填写 out.push_back(frag); } @@ -182,14 +166,14 @@ std::pair Rasterizer::GetBarycentricCoord(const Vector3f& p0, template T Rasterizer::Interpolate(const T& v0, const T& v1, const T& v2, - const Vector3f& barycentric_coord) { + const Vector3f& barycentric_coord) const { return v0 * barycentric_coord.x + v1 * barycentric_coord.y + v2 * barycentric_coord.z; } Color Rasterizer::InterpolateColor(const Color& color0, const Color& color1, const Color& color2, - const Vector3f& barycentric_coord) { + const Vector3f& barycentric_coord) const { auto color_r = FloatToUint8_t( static_cast(color0[Color::kColorIndexRed]) * barycentric_coord.x + static_cast(color1[Color::kColorIndexRed]) * barycentric_coord.y + @@ -208,6 +192,31 @@ Color Rasterizer::InterpolateColor(const Color& color0, const Color& color1, return Color(color_r, color_g, color_b); } +// 透视矫正helper函数:在透视投影下,1/w 在屏幕空间中是线性的// 因此需要先对 1/w 进行插值,再用结果矫正其他属性 +Rasterizer::PerspectiveCorrectionResult Rasterizer::PerformPerspectiveCorrection( + float w0, float w1, float w2, + float z0, float z1, float z2, + const Vector3f& original_barycentric) const { + + // 1. 插值 1/w (注意:这里传入的w0,w1,w2是原始的w值,需要先求倒数) + float w0_inv = 1.0f / w0; + float w1_inv = 1.0f / w1; + float w2_inv = 1.0f / w2; + float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, original_barycentric); + + // 2. 计算透视矫正的重心坐标 + Vector3f corrected_barycentric( + original_barycentric.x * w0_inv / w_inv_interpolated, + original_barycentric.y * w1_inv / w_inv_interpolated, + original_barycentric.z * w2_inv / w_inv_interpolated + ); + + // 3. 使用矫正的重心坐标插值深度值 + float interpolated_z = Interpolate(z0, z1, z2, corrected_barycentric); + + return {corrected_barycentric, interpolated_z}; +} + // Calculate the normal vector based on the vertices // 根据顶点计算法向量 Vector3f Rasterizer::CalculateNormal(const Vector3f& v0, const Vector3f& v1, diff --git a/src/renderer.cpp b/src/renderer.cpp index b449b93..f6e5984 100755 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -33,12 +33,23 @@ namespace simple_renderer { +// RenderingMode到字符串转换函数 +std::string RenderingModeToString(RenderingMode mode) { + switch(mode) { + case RenderingMode::TRADITIONAL: + return "TRADITIONAL"; + case RenderingMode::TILE_BASED: + return "TILE_BASED"; + case RenderingMode::DEFERRED: + return "DEFERRED"; + } +} SimpleRenderer::SimpleRenderer(size_t width, size_t height) : height_(height), width_(width), log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)), current_mode_(RenderingMode::TILE_BASED), - early_z_enabled_(true) { + is_early_z_enabled_(true) { rasterizer_ = std::make_shared(width, height); } @@ -52,19 +63,7 @@ bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, void SimpleRenderer::SetRenderingMode(RenderingMode mode) { current_mode_ = mode; - std::string mode_name; - switch(mode) { - case RenderingMode::TRADITIONAL: - mode_name = "TRADITIONAL"; - break; - case RenderingMode::TILE_BASED: - mode_name = "TILE_BASED"; - break; - case RenderingMode::DEFERRED: - mode_name = "DEFERRED"; - break; - } - SPDLOG_INFO("rendering mode set to: {}", mode_name); + SPDLOG_INFO("rendering mode set to: {}", RenderingModeToString(mode)); } RenderingMode SimpleRenderer::GetRenderingMode() const { @@ -79,19 +78,8 @@ fragments—resulting in faster rendering. 通过在光栅化过程中执行深度测试,仅保留每个像素的深度值最近的片段,避免存储所有片段,从而优化性能,实现更快的渲染。 */ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { - std::string mode_name; - switch(current_mode_) { - case RenderingMode::TRADITIONAL: - mode_name = "TRADITIONAL"; - break; - case RenderingMode::TILE_BASED: - mode_name = "TILE_BASED"; - break; - case RenderingMode::DEFERRED: - mode_name = "DEFERRED"; - break; - } - SPDLOG_INFO("execute draw pipeline for {} using {} mode", model.GetModelPath(), mode_name); + SPDLOG_INFO("execute draw pipeline for {} using {} mode", + model.GetModelPath(), RenderingModeToString(current_mode_)); if (!shader_) { SPDLOG_ERROR("No shader set for DrawModel, cannot render"); @@ -147,47 +135,19 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { switch (current_mode_) { case RenderingMode::TRADITIONAL: { auto stats = ExecuteTraditionalPipeline(model, processedVertices, buffer); - double total_ms = vertex_ms + stats.total_ms; - - SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ==="); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); - SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); - SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); - SPDLOG_INFO("Total: {:8.3f} ms", total_ms); - SPDLOG_INFO("=========================================="); + PrintTraditionalStats(vertex_ms, stats); break; } case RenderingMode::TILE_BASED: { auto stats = ExecuteTileBasedPipeline(model, processedSoA, buffer); - double total_ms = vertex_ms + stats.total_ms; - - SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ==="); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); - SPDLOG_INFO("Setup: {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100); - SPDLOG_INFO("Binning: {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); - SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); - SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); - SPDLOG_INFO("Total: {:8.3f} ms", total_ms); - SPDLOG_INFO("=========================================="); + PrintTileBasedStats(vertex_ms, stats); break; } case RenderingMode::DEFERRED: { auto stats = ExecuteDeferredPipeline(model, processedVertices, buffer); - double total_ms = vertex_ms + stats.total_ms; - - SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ==="); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); - SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); - SPDLOG_INFO("Fragment Collection: {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100); - SPDLOG_INFO("Fragment Merge: {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100); - SPDLOG_INFO("Deferred Shading: {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100); - SPDLOG_INFO("Total: {:8.3f} ms", total_ms); - SPDLOG_INFO("========================================="); + PrintDeferredStats(vertex_ms, stats); break; } } @@ -224,7 +184,7 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( std::vector material_cache; material_cache.reserve(model.GetFaces().size()); for (const auto &f : model.GetFaces()) { - material_cache.push_back(f.GetMaterial()); // 值拷贝 + material_cache.emplace_back(f.GetMaterial()); // 值拷贝 } auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); auto buffer_alloc_duration = std::chrono::duration_cast( @@ -234,8 +194,8 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( /* * * Rasterization * * */ auto rasterization_start_time = std::chrono::high_resolution_clock::now(); #pragma omp parallel num_threads(kNProc) default(none) \ - shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \ - height_, material_cache) firstprivate(model) + shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \ + height_, material_cache, model) { int thread_id = omp_get_thread_num(); auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id]; @@ -349,7 +309,7 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { Vector4f position = vertex.GetPosition(); // 检查w分量,避免除零和负数问题 - if (position.w <= 1e-6f) { + if (position.w <= kMinWValue) { Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f); return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); } @@ -370,11 +330,7 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f); // 创建新的顶点,保持其他属性和裁剪空间坐标不变 - if (vertex.HasClipPosition()) { - return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition()); - } else { - return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); - } + return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition()); } Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) { @@ -629,8 +585,7 @@ SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline( auto raster_start_time = std::chrono::high_resolution_clock::now(); #pragma omp parallel num_threads(kNProc) default(none) \ shared(processedVertices, rasterizer_, shader_, width_, height_, \ - depthBuffer_all_thread, colorBuffer_all_thread) \ - firstprivate(model) + depthBuffer_all_thread, colorBuffer_all_thread, model) { int thread_id = omp_get_thread_num(); auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; @@ -739,7 +694,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( // 1. Setup阶段 auto setup_start_time = std::chrono::high_resolution_clock::now(); - const size_t TILE_SIZE = 64; // 64x64 pixels per tile + const size_t TILE_SIZE = kDefaultTileSize; // Default tile size per tile const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE; const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE; const size_t total_tiles = tiles_x * tiles_y; @@ -774,7 +729,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( #pragma omp parallel num_threads(kNProc) default(none) \ shared(tile_triangles, rasterizer_, shader_, width_, height_, \ depthBuffer, colorBuffer, tiles_x, tiles_y, total_tiles, \ - early_z_enabled_, soa) + is_early_z_enabled_, soa) { int thread_id = omp_get_thread_num(); @@ -796,7 +751,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( tiles_x, tiles_y, TILE_SIZE, tile_depth_buffer.get(), tile_color_buffer.get(), depthBuffer, colorBuffer, - soa, early_z_enabled_, &scratch_fragments); + soa, is_early_z_enabled_, &scratch_fragments); } } auto rasterization_end_time = std::chrono::high_resolution_clock::now(); @@ -826,4 +781,44 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( return stats; } +void SimpleRenderer::PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const { + double total_ms = vertex_ms + stats.total_ms; + + SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ==="); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); + SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); + SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); + SPDLOG_INFO("Total: {:8.3f} ms", total_ms); + SPDLOG_INFO("=========================================="); +} + +void SimpleRenderer::PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const { + double total_ms = vertex_ms + stats.total_ms; + + SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ==="); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); + SPDLOG_INFO("Setup: {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100); + SPDLOG_INFO("Binning: {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); + SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); + SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); + SPDLOG_INFO("Total: {:8.3f} ms", total_ms); + SPDLOG_INFO("=========================================="); +} + +void SimpleRenderer::PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const { + double total_ms = vertex_ms + stats.total_ms; + + SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ==="); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); + SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); + SPDLOG_INFO("Fragment Collection: {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100); + SPDLOG_INFO("Fragment Merge: {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100); + SPDLOG_INFO("Deferred Shading: {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100); + SPDLOG_INFO("Total: {:8.3f} ms", total_ms); + SPDLOG_INFO("========================================="); +} + } // namespace simple_renderer diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp index 0f222b5..9725181 100755 --- a/test/system_test/main.cpp +++ b/test/system_test/main.cpp @@ -84,18 +84,8 @@ int main(int argc, char **argv) { simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED); // 输出当前渲染模式 - std::string current_mode_name; - switch(simple_renderer.GetRenderingMode()) { - case simple_renderer::RenderingMode::TRADITIONAL: - current_mode_name = "TRADITIONAL (传统光栅化)"; - break; - case simple_renderer::RenderingMode::TILE_BASED: - current_mode_name = "TILE_BASED (基于Tile光栅化)"; - break; - case simple_renderer::RenderingMode::DEFERRED: - current_mode_name = "DEFERRED (模仿GPU的延迟渲染)"; - break; - } + std::string current_mode_name = simple_renderer::RenderingModeToString( + simple_renderer.GetRenderingMode()); SPDLOG_INFO("当前渲染模式: {}", current_mode_name); auto display = Display(kWidth, kHeight); From 957c9b0e10cc93f83f931ec29f08fdd0d014489d Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Fri, 12 Sep 2025 23:24:44 +0800 Subject: [PATCH 16/24] Refactor: Extract pipeline into standalone class; rename TraditionalPipeline to PerTriangle for consistency with TileBased. Switch core function comments to Doxygen style. Signed-off-by: ZhouFANG --- src/include/face.hpp | 2 +- src/include/rasterizer.hpp | 43 +- src/include/renderer.h | 215 ++--- src/include/renderers/deferred_renderer.hpp | 31 + .../renderers/per_triangle_renderer.hpp | 28 + src/include/renderers/renderer_base.hpp | 66 ++ src/include/renderers/tile_based_renderer.hpp | 110 +++ src/rasterizer.cpp | 56 ++ src/renderer.cpp | 821 +----------------- src/renderers/deferred_renderer.cpp | 146 ++++ src/renderers/per_triangle_renderer.cpp | 172 ++++ src/renderers/renderer_base.cpp | 44 + src/renderers/tile_based_renderer.cpp | 366 ++++++++ test/system_test/main.cpp | 2 +- 14 files changed, 1151 insertions(+), 951 deletions(-) create mode 100644 src/include/renderers/deferred_renderer.hpp create mode 100644 src/include/renderers/per_triangle_renderer.hpp create mode 100644 src/include/renderers/renderer_base.hpp create mode 100644 src/include/renderers/tile_based_renderer.hpp mode change 100755 => 100644 src/renderer.cpp create mode 100644 src/renderers/deferred_renderer.cpp create mode 100644 src/renderers/per_triangle_renderer.cpp create mode 100644 src/renderers/renderer_base.cpp create mode 100644 src/renderers/tile_based_renderer.cpp diff --git a/src/include/face.hpp b/src/include/face.hpp index 28a5b30..49f0754 100644 --- a/src/include/face.hpp +++ b/src/include/face.hpp @@ -40,7 +40,7 @@ class Face { // Get functions // 获取函数 inline const std::array& GetIndices() const { return indices_; } - inline const size_t GetIndex(size_t index) const { return indices_[index]; } + inline size_t GetIndex(size_t index) const { return indices_[index]; } inline const Material& GetMaterial() const { return material_; } private: diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp index c389f49..cd0b349 100644 --- a/src/include/rasterizer.hpp +++ b/src/include/rasterizer.hpp @@ -16,19 +16,54 @@ class Rasterizer { auto operator=(Rasterizer&& rasterizer) -> Rasterizer& = default; ~Rasterizer() = default; + /** + * @brief 构造具有指定尺寸的光栅化器 + * @param width 光栅化器宽度 + * @param height 光栅化器高度 + */ Rasterizer(size_t width, size_t height); + /** + * @brief 光栅化三角形,生成片段列表 + * @param v0 三角形第一个顶点 + * @param v1 三角形第二个顶点 + * @param v2 三角形第三个顶点 + * @return 生成的片段向量 + */ std::vector Rasterize(const Vertex& v0, const Vertex& v1, const Vertex& v2); - // 非分配版本:将片段直接写入调用方提供的容器 - // 可选的裁剪区域为半开区间 [x0, x1) × [y0, y1) - // 用于 TBR:将光栅化限制在 tile 边界内,便于复用外部 scratch 容器 + /** + * @brief 非分配版本:将片段直接写入调用方提供的容器 + * + * 可选的裁剪区域为半开区间 [x0, x1) × [y0, y1) + * 用于 TBR:将光栅化限制在 tile 边界内,便于复用外部 scratch 容器 + * + * @param v0 三角形第一个顶点 + * @param v1 三角形第二个顶点 + * @param v2 三角形第三个顶点 + * @param x0 裁剪区域左边界(包含) + * @param y0 裁剪区域上边界(包含) + * @param x1 裁剪区域右边界(不包含) + * @param y1 裁剪区域下边界(不包含) + * @param out 输出片段容器 + */ void RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2, int x0, int y0, int x1, int y1, std::vector& out); - // SoA 版本:按顶点索引从 SoA 读取三角形三顶点 + /** + * @brief SoA 版本:按顶点索引从 SoA 读取三角形三顶点 + * @param soa 结构体数组格式的顶点数据 + * @param i0 三角形第一个顶点索引 + * @param i1 三角形第二个顶点索引 + * @param i2 三角形第三个顶点索引 + * @param x0 裁剪区域左边界(包含) + * @param y0 裁剪区域上边界(包含) + * @param x1 裁剪区域右边界(不包含) + * @param y1 裁剪区域下边界(不包含) + * @param out 输出片段容器 + */ void RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2, int x0, int y0, int x1, int y1, std::vector& out); diff --git a/src/include/renderer.h b/src/include/renderer.h index f239910..e11c93f 100755 --- a/src/include/renderer.h +++ b/src/include/renderer.h @@ -18,208 +18,95 @@ #define SIMPLERENDER_SRC_INCLUDE_RENDERER_H_ #include -#include -#include +#include #include -#include "buffer.hpp" -#include "light.h" #include "log_system.h" -#include "math.hpp" #include "model.hpp" -#include "rasterizer.hpp" #include "shader.hpp" +#include "renderers/renderer_base.hpp" namespace simple_renderer { // 渲染模式枚举 +/** + * @brief 渲染模式 + * - PER_TRIANGLE: 逐三角形(triangle-major)前向渲染 + * - TILE_BASED: 基于 tile(tile-major)前向渲染 + * - DEFERRED: 延迟渲染(片段收集后再着色) + */ enum class RenderingMode { - TRADITIONAL, // 传统光栅化模式 - 立即深度测试 - TILE_BASED, // Tile-based光栅化模式 - 移动GPU架构 - DEFERRED // 延迟渲染模式 - 经典GPU管线教学模拟 + PER_TRIANGLE, //!< 逐三角形(triangle-major) + TILE_BASED, //!< 基于 tile(tile-major) + DEFERRED //!< 延迟渲染 }; -// RenderingMode辅助函数声明 +/** + * @brief 将渲染模式枚举转为可读字符串 + * @param mode 渲染模式 + * @return 可读字符串(PER_TRIANGLE/TILE_BASED/DEFERRED) + */ std::string RenderingModeToString(RenderingMode mode); -std::string RenderingModeToDetailedString(RenderingMode mode); - - -// SoA 版 tile 列表中的三角形引用(仅存索引与材质指针) -struct TriangleRef { - size_t i0, i1, i2; - const Material* material = nullptr; - size_t face_index = 0; -}; +/** + * @brief 渲染门面(Facade) + * + * 职责: + * - 仅作为模式选择与调用入口; + * - 根据 `RenderingMode` 构造并持有具体渲染器; + * - 对外暴露统一的 `DrawModel` 接口。 + */ class SimpleRenderer { public: /** - * 构造函数 - * @param width - * @param height - * @param buffer 要进行绘制的内存区域,大小为 width*height*sizeof(uint32_t) - * @param + * @brief 构造渲染器门面 + * @param width 画布宽度(像素) + * @param height 画布高度(像素) */ SimpleRenderer(size_t width, size_t height); - - /// @name 默认构造/析构函数 - /// @{ - SimpleRenderer(const SimpleRenderer &_simplerenderer) = default; - SimpleRenderer(SimpleRenderer &&_simplerenderer) = default; - auto operator=(const SimpleRenderer &_simplerenderer) -> SimpleRenderer & = - default; - auto operator=(SimpleRenderer &&_simplerenderer) -> SimpleRenderer & = - default; - virtual ~SimpleRenderer() = default; - /// @} + ~SimpleRenderer() = default; /** - * 绘制单个模型 - * @param model 要绘制的模型 - * @param shader 用于渲染的着色器 - * @param buffer 输出缓冲区 - * @return 绘制是否成功 + * @brief 绘制单个模型 + * @param model 模型 + * @param shader 着色器(含 uniform) + * @param buffer 输出颜色缓冲(width*height) + * @return 是否成功 */ bool DrawModel(const Model &model, const Shader &shader, uint32_t *buffer); /** - * 设置渲染模式 - * @param mode 渲染模式(传统或基于Tile) + * @brief 设置渲染模式 */ void SetRenderingMode(RenderingMode mode); - /** - * 获取当前渲染模式 - * @return 当前渲染模式 + * @brief 获取当前渲染模式 */ RenderingMode GetRenderingMode() const; - private: - const size_t height_; - const size_t width_; - LogSystem log_system_; - RenderingMode current_mode_; // 当前渲染模式 - bool is_early_z_enabled_; // Early-Z优化开关 - - std::shared_ptr shader_; - std::shared_ptr rasterizer_; - - // Rendering constants - static constexpr float kMinWValue = 1e-6f; // W分量检查阈值(避免除零) - static constexpr size_t kDefaultTileSize = 64; // 默认Tile大小(64x64像素) - + // 可选:配置参数(仅对 TileBasedRenderer 生效;运行中修改将重建 TBR 实例) /** - * 执行绘制管线 - * @param model 模型 - * @param buffer 输出缓冲区 + * @brief 启用或禁用 Early‑Z(仅 TBR 有效) */ - void ExecuteDrawPipeline(const Model &model, uint32_t *buffer); - - + void SetEarlyZEnabled(bool enabled); /** - * 传统光栅化渲染 - * @param model 模型 - * @param processedVertices 已处理的顶点 - * @param buffer 输出缓冲区 - * @return 渲染统计信息 + * @brief 设置 Tile 大小(仅 TBR 有效) */ - struct RenderStats { - double buffer_alloc_ms; - double rasterization_ms; - double merge_ms; - double total_ms; - }; - - RenderStats ExecuteTraditionalPipeline(const Model &model, - const std::vector &processedVertices, - uint32_t *buffer); + void SetTileSize(size_t tile_size); - struct TileRenderStats { - double setup_ms; - double binning_ms; - double buffer_alloc_ms; - double rasterization_ms; - double merge_ms; - double total_ms; - }; - - /** - * 延迟渲染统计信息 - */ - struct DeferredRenderStats { - double buffer_alloc_ms; - double rasterization_ms; - double fragment_collection_ms; - double fragment_merge_ms; - double deferred_shading_ms; - double total_ms; - }; - TileRenderStats ExecuteTileBasedPipeline(const Model &model, - const VertexSoA &soa, - uint32_t *buffer); - - /** - * 延迟渲染管线 - * @param model 模型 - * @param processedVertices 已处理的顶点 - * @param buffer 输出缓冲区 - * @return 渲染统计信息 - */ - DeferredRenderStats ExecuteDeferredPipeline(const Model &model, - const std::vector &processedVertices, - uint32_t *buffer); - -private: - - // SoA 版本的 Triangle-Tile binning(两遍计数 + reserve) - void TriangleTileBinning( - const Model &model, - const VertexSoA &soa, - std::vector> &tile_triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size); - - - // SoA 版本的 tile 光栅化 - void RasterizeTile( - size_t tile_id, - const std::vector &triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size, - float* tile_depth_buffer, uint32_t* tile_color_buffer, - std::unique_ptr &global_depth_buffer, - std::unique_ptr &global_color_buffer, - const VertexSoA &soa, - bool use_early_z = false, - std::vector* scratch_fragments = nullptr); + private: + void EnsureRenderer(); - /** - * 透视除法 - 将裁剪空间坐标转换为归一化设备坐标(NDC) - * @param vertex 裁剪空间坐标的顶点 - * @return 转换后的顶点(NDC坐标) - */ - Vertex PerspectiveDivision(const Vertex &vertex); + private: + const size_t height_; + const size_t width_; + LogSystem log_system_; + RenderingMode current_mode_; + std::unique_ptr renderer_; - /** - * 视口变换 - 将NDC坐标转换为屏幕坐标 - * @param vertex NDC坐标的顶点 - * @return 转换后的顶点(屏幕坐标) - */ - Vertex ViewportTransformation(const Vertex &vertex); - - /** - * 打印传统渲染性能统计信息 - */ - void PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const; - - /** - * 打印基于Tile渲染性能统计信息 - */ - void PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const; - - /** - * 打印延迟渲染性能统计信息 - */ - void PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const; - + // TBR 配置缓存:在创建 TileBasedRenderer 时下发 + bool tbr_early_z_ = true; + size_t tbr_tile_size_ = 64; }; } // namespace simple_renderer diff --git a/src/include/renderers/deferred_renderer.hpp b/src/include/renderers/deferred_renderer.hpp new file mode 100644 index 0000000..245f5f8 --- /dev/null +++ b/src/include/renderers/deferred_renderer.hpp @@ -0,0 +1,31 @@ +#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_ +#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_ + +#include "renderers/renderer_base.hpp" + +namespace simple_renderer { + +/** + * @brief 延迟渲染器(Deferred) + * + * 组织处理方式模拟 OpenGL 在 GPU上的工作原理,模仿 GPU管线。 + * 但相比于另外两个前向渲染实现,导致内存使用增加和渲染速度变慢。 + * + * 特点: + * - AoS 顶点路径; + * - 首先按像素收集所有片段并选择最近深度; + * - 再对选择的片段执行片段着色(模拟经典 GPU 管线的一种教学实现)。 + * - + */ +class DeferredRenderer final : public RendererBase { + public: + using RendererBase::RendererBase; + /** + * @copydoc RendererBase::Render + */ + bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override; +}; + +} // namespace simple_renderer + +#endif // SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_ diff --git a/src/include/renderers/per_triangle_renderer.hpp b/src/include/renderers/per_triangle_renderer.hpp new file mode 100644 index 0000000..e2cee62 --- /dev/null +++ b/src/include/renderers/per_triangle_renderer.hpp @@ -0,0 +1,28 @@ +#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_ +#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_ + +#include "renderers/renderer_base.hpp" + +namespace simple_renderer { + +/** + * @brief 逐三角形渲染器(Triangle‑Major) + * + * 特点: + * - AoS 顶点路径; + * - 每线程本地 framebuffer(depth/color)合并; + * - 背面剔除在屏幕空间完成; + * - 接近“传统”栈式前向渲染教学实现。 + */ +class PerTriangleRenderer final : public RendererBase { + public: + using RendererBase::RendererBase; + /** + * @copydoc RendererBase::Render + */ + bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override; +}; + +} // namespace simple_renderer + +#endif // SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_ diff --git a/src/include/renderers/renderer_base.hpp b/src/include/renderers/renderer_base.hpp new file mode 100644 index 0000000..ad09ac7 --- /dev/null +++ b/src/include/renderers/renderer_base.hpp @@ -0,0 +1,66 @@ +// Renderer base and options +#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_ +#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_ + +#include +#include + +#include "rasterizer.hpp" +#include "vertex.hpp" +#include "model.hpp" +#include "shader.hpp" + +namespace simple_renderer { + + +/** + * @brief 渲染器抽象基类 + * + * 约定: + * - Render 负责完成完整的渲染过程(顶点变换 + 光栅化 + 着色 + 写入输出缓冲)。 + * - 子类选择不同的“组织单元”:(按照并行组织单元)逐三角形、按 tile、或延迟管线。 + * - 公共的透视除法与视口变换在此提供,子类按需复用。 + */ +class RendererBase { + public: + RendererBase(size_t width, size_t height) + : width_(width), height_(height), rasterizer_(std::make_shared(width, height)) {} + virtual ~RendererBase() = default; + + RendererBase(const RendererBase&) = delete; + RendererBase& operator=(const RendererBase&) = delete; + + /** + * @brief 执行一次渲染 + * @param model 模型数据 + * @param shader 着色器(包含材质/光照/矩阵等 uniform) + * @param out_color 输出颜色缓冲(大小为 width*height) + * @return 是否渲染成功 + */ + virtual bool Render(const Model& model, const Shader& shader, uint32_t* out_color) = 0; + + protected: + /** + * @brief 透视除法:裁剪空间 -> NDC + * @param vertex 裁剪空间顶点 + * @return NDC 顶点(保留 1/w 以供透视校正) + */ + Vertex PerspectiveDivision(const Vertex& vertex); + /** + * @brief 视口变换:NDC -> 屏幕坐标 + * @param vertex NDC 顶点 + * @return 屏幕空间顶点 + */ + Vertex ViewportTransformation(const Vertex& vertex); + + protected: + size_t width_; + size_t height_; + std::shared_ptr rasterizer_; + + static constexpr float kMinWValue = 1e-6f; +}; + +} // namespace simple_renderer + +#endif // SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_ diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp new file mode 100644 index 0000000..e3ecb89 --- /dev/null +++ b/src/include/renderers/tile_based_renderer.hpp @@ -0,0 +1,110 @@ +#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_ +#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_ + +#include "renderers/renderer_base.hpp" + +namespace simple_renderer { + +/** + * @brief Tile 中的三角形轻量引用(SoA 索引 + 材质指针) + */ +struct TileTriangleRef { + size_t i0, i1, i2; + const Material* material = nullptr; + size_t face_index = 0; +}; + +/** + * @brief 基于 Tile 的渲染器(Tile‑Major) + * + * 特点: + * - SoA 顶点布局; + * - 三角形按 tile 分箱(binning),每 tile 内局部 Early‑Z; + * - 单份全局 framebuffer,按 tile 覆盖范围直接拷贝回写; + * - 通过构造参数 early_z 与 tile_size 控制行为。 + */ +class TileBasedRenderer final : public RendererBase { + public: + /** + * @brief 构造函数 + * @param width 画布宽度 + * @param height 画布高度 + * @param early_z 是否启用 Early‑Z(默认启用) + * @param tile_size Tile 像素尺寸(默认 64) + */ + TileBasedRenderer(size_t width, size_t height, bool early_z = true, size_t tile_size = 64) + : RendererBase(width, height), early_z_(early_z), tile_size_(tile_size) {} + /** + * @copydoc RendererBase::Render + */ + bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override; + + private: + /** + * @brief 将三角形按屏幕空间包围盒映射到 tile 网格 + * @param model 模型(提供面/材质) + * @param soa 经过变换后的 SoA 顶点数据 + * @param tile_triangles 输出:每个 tile 的三角形引用列表 + * @param tiles_x 水平 tile 数 + * @param tiles_y 垂直 tile 数 + * @param tile_size tile 像素尺寸 + */ + void TriangleTileBinning(const Model &model, + const VertexSoA &soa, + std::vector> &tile_triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size); + + /** + * @brief 处理单个三角形的 tile binning 逻辑 + * @param tri_idx 三角形索引 + * @param count_only 是否仅进行计数(true=计数模式,false=填充模式) + * @param model 模型数据 + * @param soa 经过变换后的 SoA 顶点数据 + * @param tiles_x 水平 tile 数 + * @param tiles_y 垂直 tile 数 + * @param tile_size tile 像素尺寸 + * @param tile_counts tile 计数数组的引用(计数模式时使用) + * @param tile_triangles tile 三角形引用列表(填充模式时使用) + */ + void ProcessTriangleForTileBinning( + size_t tri_idx, bool count_only, + const Model& model, const VertexSoA& soa, + size_t tiles_x, size_t tiles_y, size_t tile_size, + std::vector& tile_counts, + std::vector>& tile_triangles); + + /** + * @brief 光栅化单个 tile,并将结果写回全局 framebuffer + * @param tile_id tile 序号 + * @param triangles 该 tile 覆盖的三角形引用 + * @param tiles_x 水平 tile 数 + * @param tiles_y 垂直 tile 数 + * @param tile_size tile 像素尺寸 + * @param tile_depth_buffer tile 局部深度缓冲(由调用方提供/复用) + * @param tile_color_buffer tile 局部颜色缓冲(由调用方提供/复用) + * @param global_depth_buffer 全局深度缓冲(单份) + * @param global_color_buffer 全局颜色缓冲(单份) + * @param soa 经过变换后的 SoA 顶点数据 + * @param shader 着色器 + * @param use_early_z 是否启用 Early‑Z + * @param scratch_fragments 可复用片段临时容器 + */ + void RasterizeTile(size_t tile_id, + const std::vector &triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size, + float* tile_depth_buffer, uint32_t* tile_color_buffer, + std::unique_ptr &global_depth_buffer, + std::unique_ptr &global_color_buffer, + const VertexSoA &soa, + const Shader& shader, + bool use_early_z, + std::vector* scratch_fragments); + + private: + const bool early_z_; + const size_t tile_size_; +}; + +} // namespace simple_renderer + +#endif // SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_ diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index a30101b..84cbc83 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -81,6 +81,62 @@ std::vector Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1, return fragments; } +void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2, + int x0, int y0, int x1, int y1, + std::vector& out) { + // 获取三角形的最小 box(屏幕空间) + const Vector4f p0 = v0.GetPosition(); + const Vector4f p1 = v1.GetPosition(); + const Vector4f p2 = v2.GetPosition(); + + Vector2f a(p0.x, p0.y); + Vector2f b(p1.x, p1.y); + Vector2f c(p2.x, p2.y); + + Vector2f bboxMin = Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})}; + Vector2f bboxMax = Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})}; + + // Clamp 到屏幕尺寸 + float minx = std::max(0.0f, bboxMin.x); + float miny = std::max(0.0f, bboxMin.y); + float maxx = std::min(float(width_ - 1), bboxMax.x); + float maxy = std::min(float(height_ - 1), bboxMax.y); + + // 与外部提供的裁剪区域相交(半开区间) -> 闭区间扫描 + int sx = std::max(x0, static_cast(std::floor(minx))); + int sy = std::max(y0, static_cast(std::floor(miny))); + int ex = std::min(x1 - 1, static_cast(std::floor(maxx))); + int ey = std::min(y1 - 1, static_cast(std::floor(maxy))); + if (sx > ex || sy > ey) return; + + for (int x = sx; x <= ex; ++x) { + for (int y = sy; y <= ey; ++y) { + auto [is_inside, bary] = GetBarycentricCoord( + Vector3f(p0.x, p0.y, p0.z), Vector3f(p1.x, p1.y, p1.z), Vector3f(p2.x, p2.y, p2.z), + Vector3f(static_cast(x), static_cast(y), 0)); + if (!is_inside) continue; + + // 透视矫正插值 + auto perspective_result = PerformPerspectiveCorrection( + p0.w, p1.w, p2.w, + p0.z, p1.z, p2.z, + bary); + + const Vector3f& corrected_bary = perspective_result.corrected_barycentric; + float z = perspective_result.interpolated_z; + + Fragment frag; // material 指针由调用方填写 + frag.screen_coord = {x, y}; + frag.normal = Interpolate(v0.GetNormal(), v1.GetNormal(), v2.GetNormal(), corrected_bary); + frag.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(), v2.GetTexCoords(), corrected_bary); + frag.color = InterpolateColor(v0.GetColor(), v1.GetColor(), v2.GetColor(), corrected_bary); + frag.depth = z; + + out.push_back(frag); + } + } +} + void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2, int x0, int y0, int x1, int y1, std::vector& out) { diff --git a/src/renderer.cpp b/src/renderer.cpp old mode 100755 new mode 100644 index f6e5984..4319066 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -1,824 +1,83 @@ - -/** - * @file simple_renderer.cpp - * @brief SimpleRenderer 实现 - * @author Zone.N (Zone.Niuzh@hotmail.com) - * @version 1.0 - * @date 2023-10-23 - * @copyright MIT LICENSE - * https://github.com/Simple-XX/SimpleRenderer - * @par change log: - * - *
DateAuthorDescription - *
2023-10-23Zone.N创建文件 - *
- */ - #include "renderer.h" -#include - -#include -#include -#include -#include -#include -#include -#include +#include #include "config.h" -#include "light.h" -#include "log_system.h" -#include "model.hpp" +#include "renderers/per_triangle_renderer.hpp" +#include "renderers/tile_based_renderer.hpp" +#include "renderers/deferred_renderer.hpp" namespace simple_renderer { -// RenderingMode到字符串转换函数 std::string RenderingModeToString(RenderingMode mode) { switch(mode) { - case RenderingMode::TRADITIONAL: - return "TRADITIONAL"; - case RenderingMode::TILE_BASED: - return "TILE_BASED"; - case RenderingMode::DEFERRED: - return "DEFERRED"; + case RenderingMode::PER_TRIANGLE: return "PER_TRIANGLE"; + case RenderingMode::TILE_BASED: return "TILE_BASED"; + case RenderingMode::DEFERRED: return "DEFERRED"; } + return "PER_TRIANGLE"; } + SimpleRenderer::SimpleRenderer(size_t width, size_t height) : height_(height), width_(width), log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)), - current_mode_(RenderingMode::TILE_BASED), - is_early_z_enabled_(true) { - rasterizer_ = std::make_shared(width, height); + current_mode_(RenderingMode::TILE_BASED) { + tbr_early_z_ = true; + tbr_tile_size_ = 64; + EnsureRenderer(); } -bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, - uint32_t *buffer) { +bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, uint32_t *buffer) { + EnsureRenderer(); // 确保渲染器实例存在 SPDLOG_INFO("draw model: {}", model.GetModelPath()); - shader_ = std::make_shared(shader); - ExecuteDrawPipeline(model, buffer); - return true; + return renderer_->Render(model, shader, buffer); } void SimpleRenderer::SetRenderingMode(RenderingMode mode) { current_mode_ = mode; SPDLOG_INFO("rendering mode set to: {}", RenderingModeToString(mode)); + renderer_.reset(); + EnsureRenderer(); } -RenderingMode SimpleRenderer::GetRenderingMode() const { - return current_mode_; -} - -/* -Optimizes performance by performing depth testing during rasterization, keeping -only the closest fragment per pixel, and avoiding storing all -fragments—resulting in faster rendering. +RenderingMode SimpleRenderer::GetRenderingMode() const { return current_mode_; } -通过在光栅化过程中执行深度测试,仅保留每个像素的深度值最近的片段,避免存储所有片段,从而优化性能,实现更快的渲染。 -*/ -void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) { - SPDLOG_INFO("execute draw pipeline for {} using {} mode", - model.GetModelPath(), RenderingModeToString(current_mode_)); - - if (!shader_) { - SPDLOG_ERROR("No shader set for DrawModel, cannot render"); - return; +void SimpleRenderer::SetEarlyZEnabled(bool enabled) { + tbr_early_z_ = enabled; + if (current_mode_ == RenderingMode::TILE_BASED) { + renderer_.reset(); + EnsureRenderer(); } - - /* * * Vertex Transformation * * */ - auto vertex_shader_start_time = std::chrono::high_resolution_clock::now(); - const auto &input_vertices = model.GetVertices(); - std::vector processedVertices; // 非 TBR - VertexSoA processedSoA; // TBR 专用 +} +void SimpleRenderer::SetTileSize(size_t tile_size) { + tbr_tile_size_ = tile_size; if (current_mode_ == RenderingMode::TILE_BASED) { - processedSoA.resize(input_vertices.size()); - // schedule(static)使并行过程保持连续分块,避免 false sharing -#pragma omp parallel for num_threads(kNProc) schedule(static) \ - shared(shader_, processedSoA, input_vertices) - for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理 - const auto &v = input_vertices[i]; - // 顶点着色器:世界坐标 -> 裁剪坐标 - auto clipSpaceVertex = shader_->VertexShader(v); - // 保存裁剪空间坐标用于后续视锥体裁剪 - processedSoA.pos_clip[i] = clipSpaceVertex.GetPosition(); - auto ndcVertex = PerspectiveDivision(clipSpaceVertex); - auto screenSpaceVertex = ViewportTransformation(ndcVertex); - - // 填充为SoA数据结构,用于优化缓存局部性 - processedSoA.pos_screen[i] = screenSpaceVertex.GetPosition(); - processedSoA.normal[i] = screenSpaceVertex.GetNormal(); - processedSoA.uv[i] = screenSpaceVertex.GetTexCoords(); - processedSoA.color[i] = screenSpaceVertex.GetColor(); - } - } else { // Tradition或Deffer管线 - processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配 - // 并行过程保持连续分块,避免false sharing -#pragma omp parallel for num_threads(kNProc) schedule(static) \ - shared(shader_, processedVertices, input_vertices) - for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理 - const auto &v = input_vertices[i]; - auto clipSpaceVertex = shader_->VertexShader(v); - auto ndcVertex = PerspectiveDivision(clipSpaceVertex); - auto screenSpaceVertex = ViewportTransformation(ndcVertex); - processedVertices[i] = screenSpaceVertex; - } + renderer_.reset(); + EnsureRenderer(); } - auto vertex_shader_end_time = std::chrono::high_resolution_clock::now(); - auto vertex_shader_duration = std::chrono::duration_cast( - vertex_shader_end_time - vertex_shader_start_time); +} - // 根据当前设置的模式选择不同的渲染管线 - double vertex_ms = vertex_shader_duration.count() / 1000.0; - - switch (current_mode_) { - case RenderingMode::TRADITIONAL: { - auto stats = ExecuteTraditionalPipeline(model, processedVertices, buffer); - PrintTraditionalStats(vertex_ms, stats); +void SimpleRenderer::EnsureRenderer() { + if (renderer_) return; + switch (current_mode_) { // 延迟初始化,根据模式创建相应实例 + case RenderingMode::PER_TRIANGLE: { + auto r = std::make_unique(width_, height_); + renderer_ = std::move(r); break; } - case RenderingMode::TILE_BASED: { - auto stats = ExecuteTileBasedPipeline(model, processedSoA, buffer); - PrintTileBasedStats(vertex_ms, stats); + auto r = std::make_unique(width_, height_, tbr_early_z_, tbr_tile_size_); + renderer_ = std::move(r); break; } - case RenderingMode::DEFERRED: { - auto stats = ExecuteDeferredPipeline(model, processedVertices, buffer); - PrintDeferredStats(vertex_ms, stats); + auto r = std::make_unique(width_, height_); + renderer_ = std::move(r); break; } } } - -/* -Organizes processing to simulate how OpenGL works with GPUs by collecting all -fragments per pixel before processing, closely mimicking the GPU pipeline but -leading to increased memory usage and slower performance. - -组织处理方式模拟 OpenGL 在 GPU -上的工作原理,先收集每个像素的所有片段再并行处理屏幕上的每个像素,模仿 GPU -管线,但导致内存使用增加和渲染速度变慢。 - -现在作为延迟渲染管线的一部分,用于教学演示经典GPU管线概念。 -*/ -SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline( - const Model &model, - const std::vector &processedVertices, - uint32_t *buffer) { - - DeferredRenderStats stats; - auto total_start_time = std::chrono::high_resolution_clock::now(); - SPDLOG_INFO("execute deferred pipeline for {}", model.GetModelPath()); - /* * * * * * * */ - - /* * * Buffer Allocation * * */ - auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); - std::vector>> fragmentsBuffer_all_thread( - kNProc, std::vector>(width_ * height_)); - - // 预先缓存所有Material数据,避免指针悬垂问题 - std::vector material_cache; - material_cache.reserve(model.GetFaces().size()); - for (const auto &f : model.GetFaces()) { - material_cache.emplace_back(f.GetMaterial()); // 值拷贝 - } - auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); - auto buffer_alloc_duration = std::chrono::duration_cast( - buffer_alloc_end_time - buffer_alloc_start_time); - SPDLOG_INFO("cached {} materials for deferred rendering", material_cache.size()); - - /* * * Rasterization * * */ - auto rasterization_start_time = std::chrono::high_resolution_clock::now(); -#pragma omp parallel num_threads(kNProc) default(none) \ - shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \ - height_, material_cache, model) - { - int thread_id = omp_get_thread_num(); - auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id]; - -#pragma omp for - for (size_t face_idx = 0; face_idx < model.GetFaces().size(); ++face_idx) { - const auto &f = model.GetFaces()[face_idx]; - auto v0 = processedVertices[f.GetIndex(0)]; - auto v1 = processedVertices[f.GetIndex(1)]; - auto v2 = processedVertices[f.GetIndex(2)]; - - const Material *material = &material_cache[face_idx]; // 使用缓存的Material - - auto fragments = rasterizer_->Rasterize(v0, v1, v2); - - for (auto &fragment : fragments) { - fragment.material = material; - - size_t x = fragment.screen_coord[0]; - size_t y = fragment.screen_coord[1]; - - if (x >= width_ || y >= height_) { - continue; - } - - size_t index = x + y * width_; - fragmentsBuffer_per_thread[index].push_back(fragment); - } - } - } - auto rasterization_end_time = std::chrono::high_resolution_clock::now(); - auto rasterization_duration = std::chrono::duration_cast( - rasterization_end_time - rasterization_start_time); - /* * * * * * * */ - - /* * * Fragment Collection * * */ - auto fragment_collection_start_time = std::chrono::high_resolution_clock::now(); - std::vector> fragmentsBuffer(width_ * height_); - for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) { - for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) { - fragmentsBuffer[i].insert(fragmentsBuffer[i].end(), - fragmentsBuffer_per_thread[i].begin(), - fragmentsBuffer_per_thread[i].end()); - } - } - auto fragment_collection_end_time = std::chrono::high_resolution_clock::now(); - auto fragment_collection_duration = std::chrono::duration_cast( - fragment_collection_end_time - fragment_collection_start_time); - /* * * * * * * */ - - /* * * Fragment Merge & Deferred Shading * * */ - auto fragment_merge_start_time = std::chrono::high_resolution_clock::now(); - - // Fragment Merge阶段:深度测试选择最近片段 - std::vector selected_fragments(width_ * height_, nullptr); - #pragma omp parallel for - for (size_t i = 0; i < fragmentsBuffer.size(); i++) { - const auto &fragments = fragmentsBuffer[i]; - if (fragments.empty()) { - continue; - } - - const Fragment *renderFragment = nullptr; - for (const auto &fragment : fragments) { - if (!renderFragment || fragment.depth < renderFragment->depth) { - renderFragment = &fragment; - } - } - selected_fragments[i] = renderFragment; - } - auto fragment_merge_end_time = std::chrono::high_resolution_clock::now(); - auto fragment_merge_duration = std::chrono::duration_cast( - fragment_merge_end_time - fragment_merge_start_time); - - // Deferred Shading阶段:执行片段着色器 - auto deferred_shading_start_time = std::chrono::high_resolution_clock::now(); -#pragma omp parallel for - for (size_t i = 0; i < selected_fragments.size(); i++) { - const Fragment *renderFragment = selected_fragments[i]; - if (renderFragment) { - // 添加Material指针有效性检查 - if (renderFragment->material == nullptr) { - SPDLOG_ERROR("Fragment material is nullptr at pixel {}", i); - continue; - } - auto color = shader_->FragmentShader(*renderFragment); - buffer[i] = uint32_t(color); - } - } - auto deferred_shading_end_time = std::chrono::high_resolution_clock::now(); - auto deferred_shading_duration = std::chrono::duration_cast( - deferred_shading_end_time - deferred_shading_start_time); - /* * * * * * * */ - - auto total_end_time = std::chrono::high_resolution_clock::now(); - auto total_duration = std::chrono::duration_cast( - total_end_time - total_start_time); - - // 填充统计信息 - stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; - stats.rasterization_ms = rasterization_duration.count() / 1000.0; - stats.fragment_collection_ms = fragment_collection_duration.count() / 1000.0; - stats.fragment_merge_ms = fragment_merge_duration.count() / 1000.0; - stats.deferred_shading_ms = deferred_shading_duration.count() / 1000.0; - stats.total_ms = total_duration.count() / 1000.0; - - return stats; -} - -Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) { - Vector4f position = vertex.GetPosition(); - - // 检查w分量,避免除零和负数问题 - if (position.w <= kMinWValue) { - Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f); - return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); - } - - // 保存原始w分量用于透视矫正插值 - float original_w = position.w; - - // 执行透视除法:(x, y, z, w) -> (x/w, y/w, z/w, 1/w) - Vector4f ndcPosition( - position.x / position.w, // x_ndc = x_clip / w_clip - position.y / position.w, // y_ndc = y_clip / w_clip - position.z / position.w, // z_ndc = z_clip / w_clip - 1.0f / original_w // 保存1/w用于透视矫正插值 - ); - - // 只对Z坐标进行深度范围限制,X和Y允许超出以支持屏幕外三角形 - // 这些坐标在后续的视口变换和裁剪阶段会被正确处理 - ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f); - - // 创建新的顶点,保持其他属性和裁剪空间坐标不变 - return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition()); -} - -Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) { - Vector4f ndcPosition = vertex.GetPosition(); - - // 视口变换:将NDC坐标[-1,1]转换为屏幕坐标[0,width]x[0,height] - float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f; - float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f; - - Vector4f screenPosition( - screen_x, // x: 屏幕坐标 - screen_y, // y: 屏幕坐标 - ndcPosition.z, // z: NDC坐标用于深度测试 - ndcPosition.w // w: 保持1/w用于透视矫正插值 - ); - - return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); -} - - - -// SoA优化的Binning:两遍计数 + 预留 + 填充 TriangleRef -void SimpleRenderer::TriangleTileBinning( - const Model &model, - const VertexSoA &soa, - std::vector> &tile_triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size) { - const size_t total_triangles = model.GetFaces().size(); - - SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles", total_triangles); - SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", - width_, height_, tile_size, tiles_x, tiles_y); - - std::vector tile_counts(tiles_x * tiles_y, 0); - - auto process_triangle = [&](size_t tri_idx, bool count_only) { - const auto &f = model.GetFaces()[tri_idx]; - size_t i0 = f.GetIndex(0); - size_t i1 = f.GetIndex(1); - size_t i2 = f.GetIndex(2); - - // 视锥体裁剪 (裁剪空间) - // 保守视锥体裁剪:只有当整个三角形都在视锥体外同一侧时才裁剪 - const Vector4f &c0 = soa.pos_clip[i0]; - const Vector4f &c1 = soa.pos_clip[i1]; - const Vector4f &c2 = soa.pos_clip[i2]; - bool frustum_cull = - (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || // 右平面外 - (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) || // 左平面外 - (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) || // 上平面外 - (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) || // 下平面外 - (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || // 远平面外 - (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w); // 近平面外 - if (frustum_cull) { - return; - } - - const Vector4f &pos0 = soa.pos_screen[i0]; - const Vector4f &pos1 = soa.pos_screen[i1]; - const Vector4f &pos2 = soa.pos_screen[i2]; - - // 背面剔除(屏幕空间) - // NDC空间中叉积为负表示顺时针,即背面。 - // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 - - Vector2f screen0(pos0.x, pos0.y); - Vector2f screen1(pos1.x, pos1.y); - Vector2f screen2(pos2.x, pos2.y); - Vector2f edge1 = screen1 - screen0; - Vector2f edge2 = screen2 - screen0; - float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; - if (cross_product > 0.0f) return; - - float screen_x0 = pos0.x; - float screen_y0 = pos0.y; - float screen_x1 = pos1.x; - float screen_y1 = pos1.y; - float screen_x2 = pos2.x; - float screen_y2 = pos2.y; - - // 计算屏幕bbox,用于后续tile划分 - float min_x = std::min({screen_x0, screen_x1, screen_x2}); - float max_x = std::max({screen_x0, screen_x1, screen_x2}); - float min_y = std::min({screen_y0, screen_y1, screen_y2}); - float max_y = std::max({screen_y0, screen_y1, screen_y2}); - - int start_tile_x = std::max(0, static_cast(min_x) / static_cast(tile_size)); - int end_tile_x = std::min(static_cast(tiles_x - 1), static_cast(max_x) / static_cast(tile_size)); - int start_tile_y = std::max(0, static_cast(min_y) / static_cast(tile_size)); - int end_tile_y = std::min(static_cast(tiles_y - 1), static_cast(max_y) / static_cast(tile_size)); - if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) return; // 如果bbox不在任何tile内,直接返回 - - if (count_only) { // 第一遍计数,只统计tile内三角形数量 - for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { - for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { - size_t tile_id = ty * tiles_x + tx; - tile_counts[tile_id]++; - } - } - } else { // 第二遍填充,填充TriangleRef - TriangleRef tri_ref{ i0, i1, i2, &f.GetMaterial(), tri_idx }; - for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { - for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { - size_t tile_id = ty * tiles_x + tx; - tile_triangles[tile_id].push_back(tri_ref); - } - } - } - }; - - // 第一遍(count only):计算每个tile需要容纳多少三角形 - for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { - process_triangle(tri_idx, true); - } - - // 预分配,避免动态扩容 - for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) { - if (tile_counts[tile_id] > 0) tile_triangles[tile_id].reserve(tile_counts[tile_id]); - } - - // 第二遍(fill):按范围填充TriangleRef - for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { - process_triangle(tri_idx, false); - } - - size_t total_triangle_refs = 0; - size_t non_empty_tiles = 0; - for (const auto& tile : tile_triangles) { - total_triangle_refs += tile.size(); - if (!tile.empty()) non_empty_tiles++; - } - SPDLOG_INFO(" (SoA) Total triangle references: {}", total_triangle_refs); - SPDLOG_INFO(" (SoA) Non-empty tiles: {}", non_empty_tiles); - SPDLOG_INFO(" (SoA) Average triangles per tile: {:.2f}", - total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f); -} - -// SoA 版:单个 tile 光栅化 -void SimpleRenderer::RasterizeTile( - size_t tile_id, - const std::vector &triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size, - float* tile_depth_buffer, uint32_t* tile_color_buffer, - std::unique_ptr &global_depth_buffer, - std::unique_ptr &global_color_buffer, - const VertexSoA &soa, - bool use_early_z, - std::vector* scratch_fragments) { - (void)tiles_y; - // 计算 tile 屏幕范围 - size_t tile_x = tile_id % tiles_x; - size_t tile_y = tile_id / tiles_x; - size_t screen_x_start = tile_x * tile_size; - size_t screen_y_start = tile_y * tile_size; - size_t screen_x_end = std::min(screen_x_start + tile_size, width_); - size_t screen_y_end = std::min(screen_y_start + tile_size, height_); - - // 初始化 tile 局部缓冲 - size_t tile_width = screen_x_end - screen_x_start; - size_t tile_height = screen_y_end - screen_y_start; - std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f); - std::fill_n(tile_color_buffer, tile_width * tile_height, 0); // 默认背景色为0/黑色 - - for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况 - // 始终走 SoA + 限制矩形的光栅化路径;如未提供 scratch,则使用函数内局部容器 - std::vector local_out; - std::vector &out = scratch_fragments ? *scratch_fragments : local_out; - - out.clear(); - if (out.capacity() < tile_width * tile_height) { - out.reserve(tile_width * tile_height); - } - - rasterizer_->RasterizeTo(soa, tri.i0, tri.i1, tri.i2, - static_cast(screen_x_start), static_cast(screen_y_start), - static_cast(screen_x_end), static_cast(screen_y_end), - out); - - for (auto &fragment : out) { - fragment.material = tri.material; - size_t sx = fragment.screen_coord[0]; - size_t sy = fragment.screen_coord[1]; - if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start && sy < screen_y_end) { - size_t local_x = sx - screen_x_start; - size_t local_y = sy - screen_y_start; - size_t idx = local_x + local_y * tile_width; - if (use_early_z) { - if (fragment.depth < tile_depth_buffer[idx]) { - auto color = shader_->FragmentShader(fragment); - tile_depth_buffer[idx] = fragment.depth; - tile_color_buffer[idx] = uint32_t(color); - } - } else { - auto color = shader_->FragmentShader(fragment); - if (fragment.depth < tile_depth_buffer[idx]) { - tile_depth_buffer[idx] = fragment.depth; - tile_color_buffer[idx] = uint32_t(color); - } - } - } - } - } - - // 写回全局缓冲 - // TBR 下不同 tile 覆盖的屏幕区域互不重叠,且在 tile 内部已通过 Early‑Z - // 得出每个像素的最终值。因此可以直接将 tile 行数据拷贝到全局缓冲 - for (size_t y = 0; y < tile_height; y++) { - const size_t tile_row_off = y * tile_width; - const size_t global_row_off = (screen_y_start + y) * width_ + screen_x_start; - - // 拷贝本行 color 到全局 color - std::memcpy(global_color_buffer.get() + global_row_off, - tile_color_buffer + tile_row_off, - tile_width * sizeof(uint32_t)); - - // 拷贝本行 depth 到全局 depth - std::memcpy(global_depth_buffer.get() + global_row_off, - tile_depth_buffer + tile_row_off, - tile_width * sizeof(float)); - } -} - -// 基础光栅化管线实现 -SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline( - const Model &model, - const std::vector &processedVertices, - uint32_t *buffer) { - - RenderStats stats; - auto total_start_time = std::chrono::high_resolution_clock::now(); - - // 1. 为每个线程创建framebuffer - auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); - std::vector> depthBuffer_all_thread(kNProc); - std::vector> colorBuffer_all_thread(kNProc); - - for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { - depthBuffer_all_thread[thread_id] = - std::make_unique(width_ * height_); - colorBuffer_all_thread[thread_id] = - std::make_unique(width_ * height_); - - std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_, - std::numeric_limits::infinity()); - std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0); - } - auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); - auto buffer_alloc_duration = std::chrono::duration_cast( - buffer_alloc_end_time - buffer_alloc_start_time); - - // 2. 并行光栅化 - auto raster_start_time = std::chrono::high_resolution_clock::now(); -#pragma omp parallel num_threads(kNProc) default(none) \ - shared(processedVertices, rasterizer_, shader_, width_, height_, \ - depthBuffer_all_thread, colorBuffer_all_thread, model) - { - int thread_id = omp_get_thread_num(); - auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; - auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id]; - -#pragma omp for - for (const auto &f : model.GetFaces()) { - auto v0 = processedVertices[f.GetIndex(0)]; - auto v1 = processedVertices[f.GetIndex(1)]; - auto v2 = processedVertices[f.GetIndex(2)]; - - // 获取屏幕空间坐标 - Vector2f screen0(v0.GetPosition().x, v0.GetPosition().y); - Vector2f screen1(v1.GetPosition().x, v1.GetPosition().y); - Vector2f screen2(v2.GetPosition().x, v2.GetPosition().y); - - // 计算屏幕空间叉积判断朝向 - Vector2f edge1 = screen1 - screen0; - Vector2f edge2 = screen2 - screen0; - float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; - - // 背面剔除:NDC空间中叉积为负表示顺时针,即背面。 - // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 - if (cross_product > 0.0f) { - continue; - } - - const Material *material = &f.GetMaterial(); - auto fragments = rasterizer_->Rasterize(v0, v1, v2); - - for (auto &fragment : fragments) { - fragment.material = material; - size_t x = fragment.screen_coord[0]; - size_t y = fragment.screen_coord[1]; - - if (x >= width_ || y >= height_) { - continue; - } - - size_t index = x + y * width_; - if (fragment.depth < depthBuffer_per_thread[index]) { - depthBuffer_per_thread[index] = fragment.depth; - auto color = shader_->FragmentShader(fragment); - colorBuffer_per_thread[index] = uint32_t(color); - } - } - } - } - auto raster_end_time = std::chrono::high_resolution_clock::now(); - auto raster_duration = std::chrono::duration_cast( - raster_end_time - raster_start_time); - - // 3. 合并结果 - auto merge_start_time = std::chrono::high_resolution_clock::now(); - std::unique_ptr depthBuffer = - std::make_unique(width_ * height_); - std::unique_ptr colorBuffer = - std::make_unique(width_ * height_); - - std::fill_n(depthBuffer.get(), width_ * height_, - std::numeric_limits::infinity()); - std::fill_n(colorBuffer.get(), width_ * height_, 0); - -#pragma omp parallel for - for (size_t i = 0; i < width_ * height_; i++) { - float min_depth = std::numeric_limits::infinity(); - uint32_t color = 0; - - for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { - float depth = depthBuffer_all_thread[thread_id][i]; - if (depth < min_depth) { - min_depth = depth; - color = colorBuffer_all_thread[thread_id][i]; - } - } - depthBuffer[i] = min_depth; - colorBuffer[i] = color; - } - - std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); - auto merge_end_time = std::chrono::high_resolution_clock::now(); - auto merge_duration = std::chrono::duration_cast( - merge_end_time - merge_start_time); - - auto total_end_time = std::chrono::high_resolution_clock::now(); - auto total_duration = std::chrono::duration_cast( - total_end_time - total_start_time); - - // 填充统计信息 - stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; - stats.rasterization_ms = raster_duration.count() / 1000.0; - stats.merge_ms = merge_duration.count() / 1000.0; - stats.total_ms = total_duration.count() / 1000.0; - - return stats; -} - - -// Tile-based光栅化管线实现(SoA 直连版本,避免 AoS->SoA 拷贝) -SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline( - const Model &model, - const VertexSoA &soa, - uint32_t *buffer) { - TileRenderStats stats; - auto total_start_time = std::chrono::high_resolution_clock::now(); - - // 1. Setup阶段 - auto setup_start_time = std::chrono::high_resolution_clock::now(); - const size_t TILE_SIZE = kDefaultTileSize; // Default tile size per tile - const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE; - const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE; - const size_t total_tiles = tiles_x * tiles_y; - - // 为每个tile创建三角形列表(SoA 引用) - std::vector> tile_triangles(total_tiles); - auto setup_end_time = std::chrono::high_resolution_clock::now(); - auto setup_duration = std::chrono::duration_cast( - setup_end_time - setup_start_time); - - // 2. Triangle-Tile binning阶段(SoA) - auto binning_start_time = std::chrono::high_resolution_clock::now(); - TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE); - auto binning_end_time = std::chrono::high_resolution_clock::now(); - auto binning_duration = std::chrono::duration_cast( - binning_end_time - binning_start_time); - - // 3. 全局 framebuffer(单份) - // 直接让每个 tile 写入这份全局缓冲区,避免末端 O(W*H*kNProc) 合并开销 - auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now(); - std::unique_ptr depthBuffer = std::make_unique(width_ * height_); - std::unique_ptr colorBuffer = std::make_unique(width_ * height_); - // 深度初始化为最远值,颜色清零 - std::fill_n(depthBuffer.get(), width_ * height_, std::numeric_limits::infinity()); - std::fill_n(colorBuffer.get(), width_ * height_, 0); - auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now(); - auto buffer_alloc_duration = std::chrono::duration_cast( - buffer_alloc_end_time - buffer_alloc_start_time); - - // 4. 并行处理每个tile(SoA) - auto rasterization_start_time = std::chrono::high_resolution_clock::now(); -#pragma omp parallel num_threads(kNProc) default(none) \ - shared(tile_triangles, rasterizer_, shader_, width_, height_, \ - depthBuffer, colorBuffer, tiles_x, tiles_y, total_tiles, \ - is_early_z_enabled_, soa) - { - int thread_id = omp_get_thread_num(); - - // 为当前线程创建 tile 局部缓冲区(避免在全局缓冲上直接逐像素竞争) - std::unique_ptr tile_depth_buffer = - std::make_unique(TILE_SIZE * TILE_SIZE); - std::unique_ptr tile_color_buffer = - std::make_unique(TILE_SIZE * TILE_SIZE); - - // 线程本地片段 scratch 容器(复用),容量按单 tile 上限预估 - std::vector scratch_fragments; - scratch_fragments.reserve(TILE_SIZE * TILE_SIZE); - -#pragma omp for - for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) { - // 按照 tile 进行光栅化(SoA) - // 直接写入单份全局 framebuffer;不同 tile 不重叠,无需加锁 - RasterizeTile(tile_id, tile_triangles[tile_id], - tiles_x, tiles_y, TILE_SIZE, - tile_depth_buffer.get(), tile_color_buffer.get(), - depthBuffer, colorBuffer, - soa, is_early_z_enabled_, &scratch_fragments); - } - } - auto rasterization_end_time = std::chrono::high_resolution_clock::now(); - auto rasterization_duration = std::chrono::duration_cast( - rasterization_end_time - rasterization_start_time); - - // 5. 直接将单份全局 colorBuffer 拷贝到输出 - auto present_start_time = std::chrono::high_resolution_clock::now(); - std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); - auto present_end_time = std::chrono::high_resolution_clock::now(); - auto present_duration = std::chrono::duration_cast( - present_end_time - present_start_time); - - auto total_end_time = std::chrono::high_resolution_clock::now(); - auto total_duration = std::chrono::duration_cast( - total_end_time - total_start_time); - - // 填充统计信息 - stats.setup_ms = setup_duration.count() / 1000.0; - stats.binning_ms = binning_duration.count() / 1000.0; - stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0; - stats.rasterization_ms = rasterization_duration.count() / 1000.0; - // 合并阶段已被消除,仅为拷贝开销 - stats.merge_ms = present_duration.count() / 1000.0; - stats.total_ms = total_duration.count() / 1000.0; - - return stats; -} - -void SimpleRenderer::PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const { - double total_ms = vertex_ms + stats.total_ms; - - SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ==="); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); - SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); - SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); - SPDLOG_INFO("Total: {:8.3f} ms", total_ms); - SPDLOG_INFO("=========================================="); -} - -void SimpleRenderer::PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const { - double total_ms = vertex_ms + stats.total_ms; - - SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ==="); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); - SPDLOG_INFO("Setup: {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100); - SPDLOG_INFO("Binning: {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); - SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); - SPDLOG_INFO("Merge: {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100); - SPDLOG_INFO("Total: {:8.3f} ms", total_ms); - SPDLOG_INFO("=========================================="); -} - -void SimpleRenderer::PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const { - double total_ms = vertex_ms + stats.total_ms; - - SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ==="); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100); - SPDLOG_INFO("Rasterization: {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100); - SPDLOG_INFO("Fragment Collection: {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100); - SPDLOG_INFO("Fragment Merge: {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100); - SPDLOG_INFO("Deferred Shading: {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100); - SPDLOG_INFO("Total: {:8.3f} ms", total_ms); - SPDLOG_INFO("========================================="); -} - } // namespace simple_renderer diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp new file mode 100644 index 0000000..6931812 --- /dev/null +++ b/src/renderers/deferred_renderer.cpp @@ -0,0 +1,146 @@ +#include "renderers/deferred_renderer.hpp" + +#include +#include +#include + +#include "config.h" +#include "log_system.h" + +namespace simple_renderer { + +bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint32_t* buffer) { + auto total_start_time = std::chrono::high_resolution_clock::now(); + auto shader = std::make_shared(shader_in); + + // 顶点变换(AoS) + auto vertex_start = std::chrono::high_resolution_clock::now(); + const auto &input_vertices = model.GetVertices(); + std::vector processedVertices(input_vertices.size()); +#pragma omp parallel for num_threads(kNProc) schedule(static) \ + shared(shader, processedVertices, input_vertices) + for (size_t i = 0; i < input_vertices.size(); ++i) { + const auto &v = input_vertices[i]; + auto clipSpaceVertex = shader->VertexShader(v); + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + auto screenSpaceVertex = ViewportTransformation(ndcVertex); + processedVertices[i] = screenSpaceVertex; + } + auto vertex_end = std::chrono::high_resolution_clock::now(); + auto vertex_ms = std::chrono::duration_cast(vertex_end - vertex_start).count() / 1000.0; + + // Buffer allocation + auto buffer_alloc_start = std::chrono::high_resolution_clock::now(); + std::vector>> fragmentsBuffer_all_thread( + kNProc, std::vector>(width_ * height_)); + + std::vector material_cache; + material_cache.reserve(model.GetFaces().size()); + for (const auto &f : model.GetFaces()) { + material_cache.emplace_back(f.GetMaterial()); + } + auto buffer_alloc_end = std::chrono::high_resolution_clock::now(); + auto buffer_alloc_ms = std::chrono::duration_cast(buffer_alloc_end - buffer_alloc_start).count() / 1000.0; + + // Rasterization: collect fragments per pixel per thread + auto raster_start = std::chrono::high_resolution_clock::now(); +#pragma omp parallel num_threads(kNProc) default(none) \ + shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \ + height_, material_cache, model) + { + int thread_id = omp_get_thread_num(); + auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id]; + +#pragma omp for + for (size_t face_idx = 0; face_idx < model.GetFaces().size(); ++face_idx) { + const auto &f = model.GetFaces()[face_idx]; + auto v0 = processedVertices[f.GetIndex(0)]; + auto v1 = processedVertices[f.GetIndex(1)]; + auto v2 = processedVertices[f.GetIndex(2)]; + + const Material *material = &material_cache[face_idx]; // 使用缓存的Material + auto fragments = rasterizer_->Rasterize(v0, v1, v2); + + for (auto &fragment : fragments) { + fragment.material = material; + size_t x = fragment.screen_coord[0]; + size_t y = fragment.screen_coord[1]; + + if (x >= width_ || y >= height_) continue; + size_t index = x + y * width_; + fragmentsBuffer_per_thread[index].push_back(fragment); + } + } + } + auto raster_end = std::chrono::high_resolution_clock::now(); + auto raster_ms = std::chrono::duration_cast(raster_end - raster_start).count() / 1000.0; + + /* * * Fragment Collection * * */ + auto collect_start = std::chrono::high_resolution_clock::now(); + std::vector> fragmentsBuffer(width_ * height_); + for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) { + for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) { + fragmentsBuffer[i].insert(fragmentsBuffer[i].end(), + fragmentsBuffer_per_thread[i].begin(), + fragmentsBuffer_per_thread[i].end()); + } + } + auto collect_end = std::chrono::high_resolution_clock::now(); + auto collect_ms = std::chrono::duration_cast(collect_end - collect_start).count() / 1000.0; + + /* * * Fragment Merge & Deferred Shading * * */ + auto merge_start = std::chrono::high_resolution_clock::now(); + + // Fragment Merge阶段:深度测试选择最近片段 + std::vector selected_fragments(width_ * height_, nullptr); +#pragma omp parallel for + for (size_t i = 0; i < fragmentsBuffer.size(); i++) { + const auto &fragments = fragmentsBuffer[i]; + if (fragments.empty()) continue; + const Fragment *renderFragment = nullptr; + for (const auto &fragment : fragments) { + if (!renderFragment || fragment.depth < renderFragment->depth) { + renderFragment = &fragment; + } + } + selected_fragments[i] = renderFragment; + } + auto merge_end = std::chrono::high_resolution_clock::now(); + auto merge_ms = std::chrono::duration_cast(merge_end - merge_start).count() / 1000.0; + + // Deferred Shading阶段:对选择的片段执行片段着色 + auto shade_start = std::chrono::high_resolution_clock::now(); +#pragma omp parallel for + for (size_t i = 0; i < selected_fragments.size(); i++) { + const Fragment *renderFragment = selected_fragments[i]; + if (renderFragment) { + // 添加Material指针有效性检查 + if (renderFragment->material == nullptr) { + SPDLOG_ERROR("Fragment material is nullptr at pixel {}", i); + continue; + } + auto color = shader->FragmentShader(*renderFragment); + buffer[i] = uint32_t(color); + } + } + auto shade_end = std::chrono::high_resolution_clock::now(); + auto shade_ms = std::chrono::duration_cast(shade_end - shade_start).count() / 1000.0; + + auto total_end_time = std::chrono::high_resolution_clock::now(); + double total_ms = std::chrono::duration_cast(total_end_time - total_start_time).count() / 1000.0; + + SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ==="); + double sum_ms = vertex_ms + (total_ms - vertex_ms); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/sum_ms*100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); + SPDLOG_INFO("Rasterization: {:8.3f} ms", raster_ms); + SPDLOG_INFO("Fragment Collection: {:8.3f} ms", collect_ms); + SPDLOG_INFO("Fragment Merge: {:8.3f} ms", merge_ms); + SPDLOG_INFO("Deferred Shading: {:8.3f} ms", shade_ms); + SPDLOG_INFO("Total: {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + collect_ms + merge_ms + shade_ms)); + SPDLOG_INFO("========================================="); + + return true; +} + +} // namespace simple_renderer diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp new file mode 100644 index 0000000..9e3167c --- /dev/null +++ b/src/renderers/per_triangle_renderer.cpp @@ -0,0 +1,172 @@ +#include "renderers/per_triangle_renderer.hpp" + +#include + +#include +#include +#include +#include +#include + +#include "config.h" +#include "log_system.h" + +namespace simple_renderer { + +bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in, + uint32_t *buffer) { + auto total_start_time = std::chrono::high_resolution_clock::now(); + + // 复制 shader 以便在多线程中共享 + auto shader = std::make_shared(shader_in); + + // 顶点变换(AoS) + auto vertex_start = std::chrono::high_resolution_clock::now(); + const auto &input_vertices = model.GetVertices(); + std::vector processedVertices(input_vertices.size()); + +#pragma omp parallel for num_threads(kNProc) schedule(static) \ + shared(shader, processedVertices, input_vertices) + for (size_t i = 0; i < input_vertices.size(); ++i) { + const auto &v = input_vertices[i]; + auto clipSpaceVertex = shader->VertexShader(v); + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + auto screenSpaceVertex = ViewportTransformation(ndcVertex); + processedVertices[i] = screenSpaceVertex; + } + auto vertex_end = std::chrono::high_resolution_clock::now(); + auto vertex_ms = std::chrono::duration_cast( + vertex_end - vertex_start) + .count() / + 1000.0; + + // 1. 为每个线程创建framebuffer + auto buffer_alloc_start = std::chrono::high_resolution_clock::now(); + std::vector> depthBuffer_all_thread(kNProc); + std::vector> colorBuffer_all_thread(kNProc); + + for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { + depthBuffer_all_thread[thread_id] = + std::make_unique(width_ * height_); + colorBuffer_all_thread[thread_id] = + std::make_unique(width_ * height_); + std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_, + std::numeric_limits::infinity()); + std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0); + } + auto buffer_alloc_end = std::chrono::high_resolution_clock::now(); + auto buffer_alloc_ms = std::chrono::duration_cast( + buffer_alloc_end - buffer_alloc_start) + .count() / + 1000.0; + + // 2. 并行光栅化 + auto raster_start = std::chrono::high_resolution_clock::now(); +#pragma omp parallel num_threads(kNProc) default(none) \ + shared(processedVertices, shader, rasterizer_, width_, height_, \ + depthBuffer_all_thread, colorBuffer_all_thread, model) + { + int thread_id = omp_get_thread_num(); + auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id]; + auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id]; + +#pragma omp for + for (const auto &f : model.GetFaces()) { + auto v0 = processedVertices[f.GetIndex(0)]; + auto v1 = processedVertices[f.GetIndex(1)]; + auto v2 = processedVertices[f.GetIndex(2)]; + + // 背面剔除(屏幕空间叉积) + Vector2f screen0(v0.GetPosition().x, v0.GetPosition().y); + Vector2f screen1(v1.GetPosition().x, v1.GetPosition().y); + Vector2f screen2(v2.GetPosition().x, v2.GetPosition().y); + + // 计算屏幕空间叉积判断朝向 + Vector2f edge1 = screen1 - screen0; + Vector2f edge2 = screen2 - screen0; + + // 背面剔除:NDC空间中叉积为负表示顺时针,即背面。 + // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 + float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; + if (cross_product > 0.0f) { + continue; // 背面 + } + + const Material *material = &f.GetMaterial(); + auto fragments = rasterizer_->Rasterize(v0, v1, v2); + + for (auto &fragment : fragments) { + fragment.material = material; + size_t x = fragment.screen_coord[0]; + size_t y = fragment.screen_coord[1]; + if (x >= width_ || y >= height_) { + continue; + } + size_t index = x + y * width_; + if (fragment.depth < depthBuffer_per_thread[index]) { + depthBuffer_per_thread[index] = fragment.depth; + auto color = shader->FragmentShader(fragment); + colorBuffer_per_thread[index] = uint32_t(color); + } + } + } + } + auto raster_end = std::chrono::high_resolution_clock::now(); + auto raster_ms = std::chrono::duration_cast( + raster_end - raster_start) + .count() / + 1000.0; + + // 3. 合并结果 + auto merge_start = std::chrono::high_resolution_clock::now(); + std::unique_ptr depthBuffer = + std::make_unique(width_ * height_); + std::unique_ptr colorBuffer = + std::make_unique(width_ * height_); + std::fill_n(depthBuffer.get(), width_ * height_, + std::numeric_limits::infinity()); + std::fill_n(colorBuffer.get(), width_ * height_, 0); + +#pragma omp parallel for + for (size_t i = 0; i < width_ * height_; i++) { + float min_depth = std::numeric_limits::infinity(); + uint32_t color = 0; + for (size_t thread_id = 0; thread_id < kNProc; thread_id++) { + float depth = depthBuffer_all_thread[thread_id][i]; + if (depth < min_depth) { + min_depth = depth; + color = colorBuffer_all_thread[thread_id][i]; + } + } + depthBuffer[i] = min_depth; + colorBuffer[i] = color; + } + + std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); + auto merge_end = std::chrono::high_resolution_clock::now(); + auto merge_ms = std::chrono::duration_cast( + merge_end - merge_start) + .count() / + 1000.0; + + auto total_end_time = std::chrono::high_resolution_clock::now(); + auto total_ms = std::chrono::duration_cast( + total_end_time - total_start_time) + .count() / + 1000.0; + + SPDLOG_INFO("=== PER-TRIANGLE RENDERING PERFORMANCE ==="); + double sum_ms = vertex_ms + (total_ms - vertex_ms); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, + vertex_ms / sum_ms * 100); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); + SPDLOG_INFO("Rasterization: {:8.3f} ms", raster_ms); + SPDLOG_INFO("Merge: {:8.3f} ms", merge_ms); + SPDLOG_INFO("Total: {:8.3f} ms", + vertex_ms + (buffer_alloc_ms + raster_ms + merge_ms)); + SPDLOG_INFO("=========================================="); + + return true; +} + +} // namespace simple_renderer diff --git a/src/renderers/renderer_base.cpp b/src/renderers/renderer_base.cpp new file mode 100644 index 0000000..5a82e5a --- /dev/null +++ b/src/renderers/renderer_base.cpp @@ -0,0 +1,44 @@ +#include "renderers/renderer_base.hpp" + +#include + +namespace simple_renderer { + +Vertex RendererBase::PerspectiveDivision(const Vertex &vertex) { + Vector4f position = vertex.GetPosition(); + + if (position.w <= kMinWValue) { + Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f); + return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); + } + + float original_w = position.w; + Vector4f ndcPosition( + position.x / position.w, // x_ndc = x_clip / w_clip + position.y / position.w, // y_ndc = y_clip / w_clip + position.z / position.w, // z_ndc = z_clip / w_clip + 1.0f / original_w // 保存1/w用于透视矫正插值 + ); + + ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f); + return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition()); +} + +Vertex RendererBase::ViewportTransformation(const Vertex &vertex) { + Vector4f ndcPosition = vertex.GetPosition(); + + // 视口变换:将NDC坐标[-1,1]转换为屏幕坐标[0,width]x[0,height] + float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f; + float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f; + + Vector4f screenPosition( + screen_x, + screen_y, + ndcPosition.z, + ndcPosition.w); + + return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor()); +} + +} // namespace simple_renderer + diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp new file mode 100644 index 0000000..1ad3db7 --- /dev/null +++ b/src/renderers/tile_based_renderer.cpp @@ -0,0 +1,366 @@ +#include "renderers/tile_based_renderer.hpp" + +#include + +#include +#include +#include +#include + +#include "config.h" +#include "log_system.h" + +namespace simple_renderer { + +bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, + uint32_t *buffer) { + auto total_start_time = std::chrono::high_resolution_clock::now(); + auto shader = std::make_shared(shader_in); + + // 顶点变换(SoA) + auto vertex_start = std::chrono::high_resolution_clock::now(); + const auto &input_vertices = model.GetVertices(); + VertexSoA soa; + soa.resize(input_vertices.size()); + +#pragma omp parallel for num_threads(kNProc) schedule(static) \ + shared(shader, soa, input_vertices) + for (size_t i = 0; i < input_vertices.size(); ++i) { + const auto &v = input_vertices[i]; + auto clipSpaceVertex = shader->VertexShader(v); + soa.pos_clip[i] = clipSpaceVertex.GetPosition(); + auto ndcVertex = PerspectiveDivision(clipSpaceVertex); + auto screenSpaceVertex = ViewportTransformation(ndcVertex); + soa.pos_screen[i] = screenSpaceVertex.GetPosition(); + soa.normal[i] = screenSpaceVertex.GetNormal(); + soa.uv[i] = screenSpaceVertex.GetTexCoords(); + soa.color[i] = screenSpaceVertex.GetColor(); + } + auto vertex_end = std::chrono::high_resolution_clock::now(); + auto vertex_ms = std::chrono::duration_cast( + vertex_end - vertex_start) + .count() / + 1000.0; + + // 1. Setup + auto setup_start = std::chrono::high_resolution_clock::now(); + const size_t TILE_SIZE = tile_size_ > 0 ? tile_size_ : 64; + const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE; + const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE; + const size_t total_tiles = tiles_x * tiles_y; + + // 为每个tile创建三角形列表(SoA 引用) + std::vector> tile_triangles(total_tiles); + auto setup_end = std::chrono::high_resolution_clock::now(); + auto setup_ms = std::chrono::duration_cast( + setup_end - setup_start) + .count() / + 1000.0; + + // 2. Binning + auto binning_start = std::chrono::high_resolution_clock::now(); + TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE); + auto binning_end = std::chrono::high_resolution_clock::now(); + auto binning_ms = std::chrono::duration_cast( + binning_end - binning_start) + .count() / + 1000.0; + + // 3. 单份全局 framebuffer + // 直接让每个 tile 写入这份全局缓冲区,避免末端 O(W*H*kNProc) 合并开销 + + auto buffer_alloc_start = std::chrono::high_resolution_clock::now(); + std::unique_ptr depthBuffer = + std::make_unique(width_ * height_); + std::unique_ptr colorBuffer = + std::make_unique(width_ * height_); + // 深度初始化为最远值,颜色清零 + + std::fill_n(depthBuffer.get(), width_ * height_, + std::numeric_limits::infinity()); + std::fill_n(colorBuffer.get(), width_ * height_, 0); + auto buffer_alloc_end = std::chrono::high_resolution_clock::now(); + auto buffer_alloc_ms = std::chrono::duration_cast( + buffer_alloc_end - buffer_alloc_start) + .count() / + 1000.0; + + // 4. 并行光栅化每个 tile(SoA + early-z) + auto raster_start = std::chrono::high_resolution_clock::now(); +#pragma omp parallel num_threads(kNProc) default(none) \ + shared(tile_triangles, rasterizer_, shader, width_, height_, depthBuffer, \ + colorBuffer, tiles_x, tiles_y, total_tiles, soa, TILE_SIZE) + { + // 为每个 tile 分配局部深度和颜色缓冲 + std::unique_ptr tile_depth_buffer = + std::make_unique(TILE_SIZE * TILE_SIZE); + std::unique_ptr tile_color_buffer = + std::make_unique(TILE_SIZE * TILE_SIZE); + + // 为每个 tile 分配可复用片段临时容器,容量按单 tile 上限预估 + std::vector scratch_fragments; + scratch_fragments.reserve(TILE_SIZE * TILE_SIZE); + +#pragma omp for schedule(static) + for (size_t tile_id = 0; tile_id < total_tiles; ++tile_id) { + // 按照 tile 进行光栅化(SoA) + // 直接写入单份全局 framebuffer;不同 tile 不重叠,无需加锁 + RasterizeTile(tile_id, tile_triangles[tile_id], tiles_x, tiles_y, + TILE_SIZE, tile_depth_buffer.get(), tile_color_buffer.get(), + depthBuffer, colorBuffer, soa, *shader, early_z_, + &scratch_fragments); + } + } + auto raster_end = std::chrono::high_resolution_clock::now(); + auto raster_ms = std::chrono::duration_cast( + raster_end - raster_start) + .count() / + 1000.0; + + // 5. 直接将单份全局 colorBuffer 拷贝到输出 + auto present_start = std::chrono::high_resolution_clock::now(); + std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); + auto present_end = std::chrono::high_resolution_clock::now(); + auto present_ms = std::chrono::duration_cast( + present_end - present_start) + .count() / + 1000.0; + + auto total_end_time = std::chrono::high_resolution_clock::now(); + double total_ms = std::chrono::duration_cast( + total_end_time - total_start_time) + .count() / + 1000.0; + + SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ==="); + double sum_ms = vertex_ms + (total_ms - vertex_ms); + SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, + vertex_ms / sum_ms * 100); + SPDLOG_INFO("Setup: {:8.3f} ms", setup_ms); + SPDLOG_INFO("Binning: {:8.3f} ms", binning_ms); + SPDLOG_INFO("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); + SPDLOG_INFO("Rasterization: {:8.3f} ms", raster_ms); + SPDLOG_INFO("Copy: {:8.3f} ms", present_ms); + SPDLOG_INFO("Total: {:8.3f} ms", + vertex_ms + (setup_ms + binning_ms + buffer_alloc_ms + raster_ms + + present_ms)); + SPDLOG_INFO("=========================================="); + + return true; +} + +void TileBasedRenderer::TriangleTileBinning( + const Model &model, const VertexSoA &soa, + std::vector> &tile_triangles, size_t tiles_x, + size_t tiles_y, size_t tile_size) { + const size_t total_triangles = model.GetFaces().size(); + + SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles", + total_triangles); + SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, + height_, tile_size, tiles_x, tiles_y); + + std::vector tile_counts(tiles_x * tiles_y, 0); + + // 第一遍(count only):计算每个tile需要容纳多少三角形 + for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { + ProcessTriangleForTileBinning(tri_idx, true, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles); + } + + // 预分配,避免动态扩容 + for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) { + if (tile_counts[tile_id] > 0) + tile_triangles[tile_id].reserve(tile_counts[tile_id]); + } + + // 第二遍(fill):按范围填充TriangleRef + for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { + ProcessTriangleForTileBinning(tri_idx, false, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles); + } + + size_t total_triangle_refs = 0; + size_t non_empty_tiles = 0; + for (const auto &tile : tile_triangles) { + total_triangle_refs += tile.size(); + if (!tile.empty()) non_empty_tiles++; + } + SPDLOG_INFO(" (SoA) Total triangle references: {}", total_triangle_refs); + SPDLOG_INFO(" (SoA) Non-empty tiles: {}", non_empty_tiles); + SPDLOG_INFO(" (SoA) Average triangles per tile: {:.2f}", + total_triangle_refs > 0 + ? float(total_triangle_refs) / tile_triangles.size() + : 0.0f); +} + +void TileBasedRenderer::RasterizeTile( + size_t tile_id, const std::vector &triangles, + size_t tiles_x, size_t tiles_y, size_t tile_size, float *tile_depth_buffer, + uint32_t *tile_color_buffer, std::unique_ptr &global_depth_buffer, + std::unique_ptr &global_color_buffer, const VertexSoA &soa, + const Shader &shader, bool use_early_z, + std::vector *scratch_fragments) { + (void)tiles_y; + // 计算 tile 屏幕范围 + size_t tile_x = tile_id % tiles_x; + size_t tile_y = tile_id / tiles_x; + size_t screen_x_start = tile_x * tile_size; + size_t screen_y_start = tile_y * tile_size; + size_t screen_x_end = std::min(screen_x_start + tile_size, width_); + size_t screen_y_end = std::min(screen_y_start + tile_size, height_); + + // 初始化 tile 局部缓冲 + size_t tile_width = screen_x_end - screen_x_start; + size_t tile_height = screen_y_end - screen_y_start; + std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f); + std::fill_n(tile_color_buffer, tile_width * tile_height, 0); + + // 只有当调用方没有提供 scratch 时,才启用本地容器并且只构造一次 + const bool use_internal_scratch = (scratch_fragments == nullptr); + std::vector internal_out; + if (use_internal_scratch) internal_out.reserve(tile_width * tile_height); + + for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况 + // 始终走 SoA + 限制矩形的光栅化路径;如未提供 scratch,则使用函数内局部容器 + std::vector &out = + use_internal_scratch ? internal_out : *scratch_fragments; + out.clear(); + if (out.capacity() < tile_width * tile_height) + out.reserve(tile_width * tile_height); + + rasterizer_->RasterizeTo( + soa, tri.i0, tri.i1, tri.i2, static_cast(screen_x_start), + static_cast(screen_y_start), static_cast(screen_x_end), + static_cast(screen_y_end), out); + + for (auto &fragment : out) { + fragment.material = tri.material; + size_t sx = fragment.screen_coord[0]; + size_t sy = fragment.screen_coord[1]; + if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start && + sy < screen_y_end) { + size_t local_x = sx - screen_x_start; + size_t local_y = sy - screen_y_start; + size_t idx = local_x + local_y * tile_width; + if (use_early_z) { + if (fragment.depth < tile_depth_buffer[idx]) { + auto color = shader.FragmentShader(fragment); + tile_depth_buffer[idx] = fragment.depth; + tile_color_buffer[idx] = uint32_t(color); + } + } else { + auto color = shader.FragmentShader(fragment); + if (fragment.depth < tile_depth_buffer[idx]) { + tile_depth_buffer[idx] = fragment.depth; + tile_color_buffer[idx] = uint32_t(color); + } + } + } + } + } + + // 写回全局缓冲 + // TBR 下不同 tile 覆盖的屏幕区域互不重叠,且在 tile 内部已通过 Early‑Z + // 得出每个像素的最终值。因此可以直接将 tile 行数据拷贝到全局缓冲 + for (size_t y = 0; y < tile_height; y++) { + const size_t tile_row_off = y * tile_width; + const size_t global_row_off = + (screen_y_start + y) * width_ + screen_x_start; + + // 拷贝本行 color 到全局 color + std::memcpy(global_color_buffer.get() + global_row_off, + tile_color_buffer + tile_row_off, + tile_width * sizeof(uint32_t)); + + // 拷贝本行 depth 到全局 depth + std::memcpy(global_depth_buffer.get() + global_row_off, + tile_depth_buffer + tile_row_off, tile_width * sizeof(float)); + } +} + +void TileBasedRenderer::ProcessTriangleForTileBinning( + size_t tri_idx, bool count_only, + const Model& model, const VertexSoA& soa, + size_t tiles_x, size_t tiles_y, size_t tile_size, + std::vector& tile_counts, + std::vector>& tile_triangles) { + const auto &f = model.GetFaces()[tri_idx]; + size_t i0 = f.GetIndex(0); + size_t i1 = f.GetIndex(1); + size_t i2 = f.GetIndex(2); + + // 视锥体裁剪 (裁剪空间) + // 保守视锥体裁剪:只有当整个三角形都在视锥体外同一侧时才裁剪 + const Vector4f &c0 = soa.pos_clip[i0]; + const Vector4f &c1 = soa.pos_clip[i1]; + const Vector4f &c2 = soa.pos_clip[i2]; + bool frustum_cull = + (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || // 右平面外 + (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) || // 左平面外 + (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) || // 上平面外 + (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) || // 下平面外 + (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) || // 远平面外 + (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w); // 近平面外 + if (frustum_cull) { + return; + } + + const Vector4f &pos0 = soa.pos_screen[i0]; + const Vector4f &pos1 = soa.pos_screen[i1]; + const Vector4f &pos2 = soa.pos_screen[i2]; + + // 背面剔除(屏幕空间) + // NDC空间中叉积为负表示顺时针,即背面。 + // 从NDC到屏幕空间中,会发生Y轴翻转,对应叉积应为正。 + Vector2f screen0(pos0.x, pos0.y); + Vector2f screen1(pos1.x, pos1.y); + Vector2f screen2(pos2.x, pos2.y); + Vector2f edge1 = screen1 - screen0; + Vector2f edge2 = screen2 - screen0; + float cross_product = edge1.x * edge2.y - edge1.y * edge2.x; + if (cross_product > 0.0f) return; + + float screen_x0 = pos0.x; + float screen_y0 = pos0.y; + float screen_x1 = pos1.x; + float screen_y1 = pos1.y; + float screen_x2 = pos2.x; + float screen_y2 = pos2.y; + + // 计算屏幕bbox,用于后续tile划分 + float min_x = std::min({screen_x0, screen_x1, screen_x2}); + float max_x = std::max({screen_x0, screen_x1, screen_x2}); + float min_y = std::min({screen_y0, screen_y1, screen_y2}); + float max_y = std::max({screen_y0, screen_y1, screen_y2}); + + int start_tile_x = + std::max(0, static_cast(min_x) / static_cast(tile_size)); + int end_tile_x = + std::min(static_cast(tiles_x - 1), + static_cast(max_x) / static_cast(tile_size)); + int start_tile_y = + std::max(0, static_cast(min_y) / static_cast(tile_size)); + int end_tile_y = + std::min(static_cast(tiles_y - 1), + static_cast(max_y) / static_cast(tile_size)); + if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) + return; // 如果bbox不在任何tile内,直接返回 + + if (count_only) { // 第一遍计数,只统计tile内三角形数量 + for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { + for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { + size_t tile_id = ty * tiles_x + tx; + tile_counts[tile_id]++; + } + } + } else { // 第二遍填充,填充TriangleRef + TileTriangleRef tri_ref{i0, i1, i2, &f.GetMaterial(), tri_idx}; + for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { + for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { + size_t tile_id = ty * tiles_x + tx; + tile_triangles[tile_id].push_back(tri_ref); + } + } + } +} + +} // namespace simple_renderer diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp index 9725181..d6491d9 100755 --- a/test/system_test/main.cpp +++ b/test/system_test/main.cpp @@ -80,7 +80,7 @@ int main(int argc, char **argv) { simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f)); - // 设置渲染模式(可选:TRADITIONAL、TILE_BASED 或 DEFERRED) + // 设置渲染模式(可选:PER_TRIANGLE、TILE_BASED 或 DEFERRED) simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED); // 输出当前渲染模式 From d6e3b4002ac1a1f9ef45c7f7d02f426cb86953bd Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Sun, 14 Sep 2025 23:34:26 +0800 Subject: [PATCH 17/24] TBR: Replace barycentric coordinate computation with half-space testing to enable SIMD-friendly rasterization; use relative-coordinate cross products to ensure numerical stability. Signed-off-by: ZhouFANG --- src/rasterizer.cpp | 87 ++++++++++++++++++++++++++++++++++------------ 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index 84cbc83..9b8558a 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -145,41 +145,82 @@ void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t const Vector4f& p1 = soa.pos_screen[i1]; const Vector4f& p2 = soa.pos_screen[i2]; - Vector2f a = Vector2f(p0.x, p0.y); - Vector2f b = Vector2f(p1.x, p1.y); - Vector2f c = Vector2f(p2.x, p2.y); + // 为BarycentricCoord预构造Vec3f,避免循环内重复构造 + const Vector3f sp0(p0.x, p0.y, p0.z); + const Vector3f sp1(p1.x, p1.y, p1.z); + const Vector3f sp2(p2.x, p2.y, p2.z); - Vector2f bboxMin = Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})}; - Vector2f bboxMax = Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})}; - - // Clamp 到屏幕尺寸 - float minx = std::max(0.0f, bboxMin.x); - float miny = std::max(0.0f, bboxMin.y); - float maxx = std::min(float(width_ - 1), bboxMax.x); - float maxy = std::min(float(height_ - 1), bboxMax.y); + // 计算屏幕空间AABB包围盒 + const float minx_f = std::max(0.0f, std::min({p0.x, p1.x, p2.x})); + const float miny_f = std::max(0.0f, std::min({p0.y, p1.y, p2.y})); + const float maxx_f = std::min(float(width_ - 1), std::max({p0.x, p1.x, p2.x})); + const float maxy_f = std::min(float(height_ - 1), std::max({p0.y, p1.y, p2.y})); // 与外部提供的裁剪区域相交(半开区间) -> 闭区间扫描 - int sx = std::max(x0, static_cast(std::floor(minx))); - int sy = std::max(y0, static_cast(std::floor(miny))); - int ex = std::min(x1 - 1, static_cast(std::floor(maxx))); - int ey = std::min(y1 - 1, static_cast(std::floor(maxy))); + int sx = std::max(x0, static_cast(std::floor(minx_f))); + int sy = std::max(y0, static_cast(std::floor(miny_f))); + int ex = std::min(x1 - 1, static_cast(std::floor(maxx_f))); + int ey = std::min(y1 - 1, static_cast(std::floor(maxy_f))); if (sx > ex || sy > ey) return; - for (int x = sx; x <= ex; ++x) { - for (int y = sy; y <= ey; ++y) { - auto [is_inside, bary] = GetBarycentricCoord( - Vector3f(p0.x, p0.y, p0.z), Vector3f(p1.x, p1.y, p1.z), Vector3f(p2.x, p2.y, p2.z), - Vector3f(static_cast(x), static_cast(y), 0)); - if (!is_inside) continue; + // 预计算边函数系数:E(x,y) = A*x + B*y + C + // 使用相对坐标的边函数定义,避免大常数项导致的数值不稳定 + // 如使用绝对形式Ax+By+C会由于常数C的量级过大,造成浮点抵消,有效位丢失不稳定 + auto cross2 = [](float ax, float ay, float bx, float by) { + return ax * by - ay * bx; + }; + // 边向量 + const float e01x = p1.x - p0.x, e01y = p1.y - p0.y; // (p0->p1) + const float e12x = p2.x - p1.x, e12y = p2.y - p1.y; // (p1->p2) + const float e20x = p0.x - p2.x, e20y = p0.y - p2.y; // (p2->p0) + + // 有向面积(两倍),用相对面积定义:area2 = cross(p1 - p0, p2 - p0) + float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y); + if (std::abs(area2) < 1e-6f) return; // 退化三角形 + const float inv_area2 = 1.0f / area2; + const bool positive = (area2 > 0.0f); + + // 行优先遍历:有利于 cache 与向量化 + #pragma omp simd + for (int y = sy; y <= ey; ++y) { + const float yf = static_cast(y); + + // 注意:此处存在对 out.push_back 的写入,属于有副作用操作,不适合使用 + // omp simd 进行强制向量化,否则可能导致不符合预期的行为(如周期性伪影)。 + // 先保持标量内层,后续如切换为“直写像素回调”再考虑安全的 SIMD 化。 + for (int x = sx; x <= ex; ++x) { + const float xf = static_cast(x); + + // 相对坐标边函数: + // E01(p) = cross(p1 - p0, p - p0) + // E12(p) = cross(p2 - p1, p - p1) + // E20(p) = cross(p0 - p2, p - p2) + const float E01 = cross2(e01x, e01y, xf - p0.x, yf - p0.y); + const float E12 = cross2(e12x, e12y, xf - p1.x, yf - p1.y); + const float E20 = cross2(e20x, e20y, xf - p2.x, yf - p2.y); + + // 半空间测试(根据朝向选择符号) + const bool inside = positive ? (E01 >= 0.0f && E12 >= 0.0f && E20 >= 0.0f) + : (E01 <= 0.0f && E12 <= 0.0f && E20 <= 0.0f); + if (!inside) continue; + + // 重心权重映射: + // b0 对应 v0,取与对边 (v1,v2) 的子面积 → E12 + // b1 对应 v1 → E20 + // b2 对应 v2 → E01 + const float b0 = E12 * inv_area2; + const float b1 = E20 * inv_area2; + const float b2 = E01 * inv_area2; + const Vector3f bary(b0, b1, b2); // 透视矫正插值 auto perspective_result = PerformPerspectiveCorrection( p0.w, p1.w, p2.w, p0.z, p1.z, p2.z, bary); - + const Vector3f& corrected_bary = perspective_result.corrected_barycentric; - float z = perspective_result.interpolated_z; + const float z = perspective_result.interpolated_z; Fragment frag; // Note: material 指针由调用方填写 frag.screen_coord = {x, y}; From 30038efb17648268eeb760e8f67baf3aa928ac84 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Mon, 15 Sep 2025 13:38:26 +0800 Subject: [PATCH 18/24] DR: Optimize fragment collection(pre-reserve per bucket, move-insert, and per-bucket parallel merge) Signed-off-by: ZhouFANG --- src/renderers/deferred_renderer.cpp | 42 ++++++++++++++++++++++++----- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp index 6931812..c93c9ff 100644 --- a/src/renderers/deferred_renderer.cpp +++ b/src/renderers/deferred_renderer.cpp @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include "config.h" #include "log_system.h" @@ -77,12 +79,40 @@ bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint3 /* * * Fragment Collection * * */ auto collect_start = std::chrono::high_resolution_clock::now(); - std::vector> fragmentsBuffer(width_ * height_); - for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) { - for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) { - fragmentsBuffer[i].insert(fragmentsBuffer[i].end(), - fragmentsBuffer_per_thread[i].begin(), - fragmentsBuffer_per_thread[i].end()); + + const size_t pixel_count = static_cast(width_) * static_cast(height_); + +#ifndef NDEBUG + for (const auto &tb : fragmentsBuffer_all_thread) { + // 断言避免越界,确保固定维度 + assert(tb.size() == pixel_count && "thread buffer size mismatch"); + } +#endif + + // Pass 1: 统计每个像素桶的总片元数 + std::vector bucket_total(pixel_count, 0); + for (const auto &tb : fragmentsBuffer_all_thread) { + for (size_t i = 0; i < pixel_count; ++i) { + bucket_total[i] += tb[i].size(); + } + } + + // Pass 2: 统一预分配 + std::vector> fragmentsBuffer(pixel_count); + for (size_t i = 0; i < pixel_count; ++i) { + if (bucket_total[i] > 0) fragmentsBuffer[i].reserve(bucket_total[i]); + } + + // Pass 3: 按桶并行合并(每个桶内部保持按线程序的插入顺序) +#pragma omp parallel for num_threads(kNProc) schedule(static) + for (long long i = 0; i < static_cast(pixel_count); ++i) { + auto &dst = fragmentsBuffer[static_cast(i)]; + for (size_t t = 0; t < fragmentsBuffer_all_thread.size(); ++t) { + auto &src = fragmentsBuffer_all_thread[t][static_cast(i)]; + dst.insert(dst.end(), + std::make_move_iterator(src.begin()), + std::make_move_iterator(src.end())); + src.clear(); } } auto collect_end = std::chrono::high_resolution_clock::now(); From 61e75a8d7486d29ca9338332453cade1ef3ac738 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Mon, 15 Sep 2025 15:48:25 +0800 Subject: [PATCH 19/24] Refactor: Modify the triangle binning logic in TBR to use the TileGridContext structure. Replacing hard-coded values with constants. Signed-off-by: ZhouFANG --- src/include/renderers/tile_based_renderer.hpp | 42 ++++--- src/renderers/tile_based_renderer.cpp | 105 +++++++++--------- 2 files changed, 79 insertions(+), 68 deletions(-) diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp index e3ecb89..f524fb6 100644 --- a/src/include/renderers/tile_based_renderer.hpp +++ b/src/include/renderers/tile_based_renderer.hpp @@ -14,6 +14,16 @@ struct TileTriangleRef { size_t face_index = 0; }; +/** + * @brief Tile 网格上下文(供 binning 和 raster 共享的网格/几何信息) + */ +struct TileGridContext { + const VertexSoA& soa; + size_t tiles_x; + size_t tiles_y; + size_t tile_size; +}; + /** * @brief 基于 Tile 的渲染器(Tile‑Major) * @@ -49,10 +59,9 @@ class TileBasedRenderer final : public RendererBase { * @param tiles_y 垂直 tile 数 * @param tile_size tile 像素尺寸 */ - void TriangleTileBinning(const Model &model, - const VertexSoA &soa, - std::vector> &tile_triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size); + void TriangleTileBinning(const Model& model, + const TileGridContext& grid, + std::vector> &tile_triangles); /** * @brief 处理单个三角形的 tile binning 逻辑 @@ -68,8 +77,8 @@ class TileBasedRenderer final : public RendererBase { */ void ProcessTriangleForTileBinning( size_t tri_idx, bool count_only, - const Model& model, const VertexSoA& soa, - size_t tiles_x, size_t tiles_y, size_t tile_size, + const Model& model, + const TileGridContext& grid, std::vector& tile_counts, std::vector>& tile_triangles); @@ -90,17 +99,20 @@ class TileBasedRenderer final : public RendererBase { * @param scratch_fragments 可复用片段临时容器 */ void RasterizeTile(size_t tile_id, - const std::vector &triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size, - float* tile_depth_buffer, uint32_t* tile_color_buffer, - std::unique_ptr &global_depth_buffer, - std::unique_ptr &global_color_buffer, - const VertexSoA &soa, - const Shader& shader, - bool use_early_z, - std::vector* scratch_fragments); + const std::vector &triangles, + const TileGridContext& grid, + float* tile_depth_buffer, uint32_t* tile_color_buffer, + std::unique_ptr &global_depth_buffer, + std::unique_ptr &global_color_buffer, + const Shader& shader, + bool use_early_z, + std::vector* scratch_fragments); private: + // 深度和颜色的默认值,同时用于tile级和全局级buffers的初始化 + static constexpr float kDepthClear = 1.0f; // 默认为最远值,用于Early-Z + static constexpr uint32_t kColorClear = 0u; // 默认为黑色 + const bool early_z_; const size_t tile_size_; }; diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp index 1ad3db7..c88f114 100644 --- a/src/renderers/tile_based_renderer.cpp +++ b/src/renderers/tile_based_renderer.cpp @@ -59,7 +59,8 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, // 2. Binning auto binning_start = std::chrono::high_resolution_clock::now(); - TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE); + TileGridContext grid_ctx{soa, tiles_x, tiles_y, TILE_SIZE}; + TriangleTileBinning(model, grid_ctx, tile_triangles); auto binning_end = std::chrono::high_resolution_clock::now(); auto binning_ms = std::chrono::duration_cast( binning_end - binning_start) @@ -74,11 +75,10 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, std::make_unique(width_ * height_); std::unique_ptr colorBuffer = std::make_unique(width_ * height_); - // 深度初始化为最远值,颜色清零 - std::fill_n(depthBuffer.get(), width_ * height_, - std::numeric_limits::infinity()); - std::fill_n(colorBuffer.get(), width_ * height_, 0); + // 深度初始化为最远值,颜色清零 + std::fill_n(depthBuffer.get(), width_ * height_, kDepthClear); + std::fill_n(colorBuffer.get(), width_ * height_, kColorClear); auto buffer_alloc_end = std::chrono::high_resolution_clock::now(); auto buffer_alloc_ms = std::chrono::duration_cast( buffer_alloc_end - buffer_alloc_start) @@ -88,26 +88,26 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, // 4. 并行光栅化每个 tile(SoA + early-z) auto raster_start = std::chrono::high_resolution_clock::now(); #pragma omp parallel num_threads(kNProc) default(none) \ - shared(tile_triangles, rasterizer_, shader, width_, height_, depthBuffer, \ - colorBuffer, tiles_x, tiles_y, total_tiles, soa, TILE_SIZE) + shared(tile_triangles, shader, depthBuffer, colorBuffer, total_tiles, \ + grid_ctx, early_z_) { // 为每个 tile 分配局部深度和颜色缓冲 std::unique_ptr tile_depth_buffer = - std::make_unique(TILE_SIZE * TILE_SIZE); + std::make_unique(grid_ctx.tile_size * grid_ctx.tile_size); std::unique_ptr tile_color_buffer = - std::make_unique(TILE_SIZE * TILE_SIZE); + std::make_unique(grid_ctx.tile_size * grid_ctx.tile_size); // 为每个 tile 分配可复用片段临时容器,容量按单 tile 上限预估 std::vector scratch_fragments; - scratch_fragments.reserve(TILE_SIZE * TILE_SIZE); + scratch_fragments.reserve(grid_ctx.tile_size * grid_ctx.tile_size); #pragma omp for schedule(static) for (size_t tile_id = 0; tile_id < total_tiles; ++tile_id) { // 按照 tile 进行光栅化(SoA) // 直接写入单份全局 framebuffer;不同 tile 不重叠,无需加锁 - RasterizeTile(tile_id, tile_triangles[tile_id], tiles_x, tiles_y, - TILE_SIZE, tile_depth_buffer.get(), tile_color_buffer.get(), - depthBuffer, colorBuffer, soa, *shader, early_z_, + RasterizeTile(tile_id, tile_triangles[tile_id], grid_ctx, + tile_depth_buffer.get(), tile_color_buffer.get(), + depthBuffer, colorBuffer, *shader, early_z_, &scratch_fragments); } } @@ -150,21 +150,22 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, } void TileBasedRenderer::TriangleTileBinning( - const Model &model, const VertexSoA &soa, - std::vector> &tile_triangles, size_t tiles_x, - size_t tiles_y, size_t tile_size) { + const Model& model, + const TileGridContext& grid, + std::vector> &tile_triangles) { const size_t total_triangles = model.GetFaces().size(); SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles", total_triangles); SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, - height_, tile_size, tiles_x, tiles_y); + height_, grid.tile_size, grid.tiles_x, grid.tiles_y); - std::vector tile_counts(tiles_x * tiles_y, 0); + std::vector tile_counts(grid.tiles_x * grid.tiles_y, 0); // 第一遍(count only):计算每个tile需要容纳多少三角形 for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { - ProcessTriangleForTileBinning(tri_idx, true, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles); + ProcessTriangleForTileBinning(tri_idx, true, model, grid, + tile_counts, tile_triangles); } // 预分配,避免动态扩容 @@ -175,7 +176,8 @@ void TileBasedRenderer::TriangleTileBinning( // 第二遍(fill):按范围填充TriangleRef for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) { - ProcessTriangleForTileBinning(tri_idx, false, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles); + ProcessTriangleForTileBinning(tri_idx, false, model, grid, + tile_counts, tile_triangles); } size_t total_triangle_refs = 0; @@ -194,25 +196,24 @@ void TileBasedRenderer::TriangleTileBinning( void TileBasedRenderer::RasterizeTile( size_t tile_id, const std::vector &triangles, - size_t tiles_x, size_t tiles_y, size_t tile_size, float *tile_depth_buffer, + const TileGridContext& grid, float *tile_depth_buffer, uint32_t *tile_color_buffer, std::unique_ptr &global_depth_buffer, - std::unique_ptr &global_color_buffer, const VertexSoA &soa, + std::unique_ptr &global_color_buffer, const Shader &shader, bool use_early_z, std::vector *scratch_fragments) { - (void)tiles_y; // 计算 tile 屏幕范围 - size_t tile_x = tile_id % tiles_x; - size_t tile_y = tile_id / tiles_x; - size_t screen_x_start = tile_x * tile_size; - size_t screen_y_start = tile_y * tile_size; - size_t screen_x_end = std::min(screen_x_start + tile_size, width_); - size_t screen_y_end = std::min(screen_y_start + tile_size, height_); + size_t tile_x = tile_id % grid.tiles_x; + size_t tile_y = tile_id / grid.tiles_x; + size_t screen_x_start = tile_x * grid.tile_size; + size_t screen_y_start = tile_y * grid.tile_size; + size_t screen_x_end = std::min(screen_x_start + grid.tile_size, width_); + size_t screen_y_end = std::min(screen_y_start + grid.tile_size, height_); // 初始化 tile 局部缓冲 size_t tile_width = screen_x_end - screen_x_start; size_t tile_height = screen_y_end - screen_y_start; - std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f); - std::fill_n(tile_color_buffer, tile_width * tile_height, 0); + std::fill_n(tile_depth_buffer, tile_width * tile_height, kDepthClear); + std::fill_n(tile_color_buffer, tile_width * tile_height, kColorClear); // 只有当调用方没有提供 scratch 时,才启用本地容器并且只构造一次 const bool use_internal_scratch = (scratch_fragments == nullptr); @@ -228,7 +229,7 @@ void TileBasedRenderer::RasterizeTile( out.reserve(tile_width * tile_height); rasterizer_->RasterizeTo( - soa, tri.i0, tri.i1, tri.i2, static_cast(screen_x_start), + grid.soa, tri.i0, tri.i1, tri.i2, static_cast(screen_x_start), static_cast(screen_y_start), static_cast(screen_x_end), static_cast(screen_y_end), out); @@ -278,11 +279,9 @@ void TileBasedRenderer::RasterizeTile( } void TileBasedRenderer::ProcessTriangleForTileBinning( - size_t tri_idx, bool count_only, - const Model& model, const VertexSoA& soa, - size_t tiles_x, size_t tiles_y, size_t tile_size, - std::vector& tile_counts, - std::vector>& tile_triangles) { + size_t tri_idx, bool count_only, const Model &model, + const TileGridContext &grid, std::vector &tile_counts, + std::vector> &tile_triangles) { const auto &f = model.GetFaces()[tri_idx]; size_t i0 = f.GetIndex(0); size_t i1 = f.GetIndex(1); @@ -290,9 +289,9 @@ void TileBasedRenderer::ProcessTriangleForTileBinning( // 视锥体裁剪 (裁剪空间) // 保守视锥体裁剪:只有当整个三角形都在视锥体外同一侧时才裁剪 - const Vector4f &c0 = soa.pos_clip[i0]; - const Vector4f &c1 = soa.pos_clip[i1]; - const Vector4f &c2 = soa.pos_clip[i2]; + const Vector4f &c0 = grid.soa.pos_clip[i0]; + const Vector4f &c1 = grid.soa.pos_clip[i1]; + const Vector4f &c2 = grid.soa.pos_clip[i2]; bool frustum_cull = (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) || // 右平面外 (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) || // 左平面外 @@ -304,9 +303,9 @@ void TileBasedRenderer::ProcessTriangleForTileBinning( return; } - const Vector4f &pos0 = soa.pos_screen[i0]; - const Vector4f &pos1 = soa.pos_screen[i1]; - const Vector4f &pos2 = soa.pos_screen[i2]; + const Vector4f &pos0 = grid.soa.pos_screen[i0]; + const Vector4f &pos1 = grid.soa.pos_screen[i1]; + const Vector4f &pos2 = grid.soa.pos_screen[i2]; // 背面剔除(屏幕空间) // NDC空间中叉积为负表示顺时针,即背面。 @@ -332,23 +331,23 @@ void TileBasedRenderer::ProcessTriangleForTileBinning( float min_y = std::min({screen_y0, screen_y1, screen_y2}); float max_y = std::max({screen_y0, screen_y1, screen_y2}); - int start_tile_x = - std::max(0, static_cast(min_x) / static_cast(tile_size)); + int start_tile_x = std::max(0, static_cast(min_x) / + static_cast(grid.tile_size)); int end_tile_x = - std::min(static_cast(tiles_x - 1), - static_cast(max_x) / static_cast(tile_size)); - int start_tile_y = - std::max(0, static_cast(min_y) / static_cast(tile_size)); + std::min(static_cast(grid.tiles_x - 1), + static_cast(max_x) / static_cast(grid.tile_size)); + int start_tile_y = std::max(0, static_cast(min_y) / + static_cast(grid.tile_size)); int end_tile_y = - std::min(static_cast(tiles_y - 1), - static_cast(max_y) / static_cast(tile_size)); + std::min(static_cast(grid.tiles_y - 1), + static_cast(max_y) / static_cast(grid.tile_size)); if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) return; // 如果bbox不在任何tile内,直接返回 if (count_only) { // 第一遍计数,只统计tile内三角形数量 for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { - size_t tile_id = ty * tiles_x + tx; + size_t tile_id = ty * grid.tiles_x + tx; tile_counts[tile_id]++; } } @@ -356,7 +355,7 @@ void TileBasedRenderer::ProcessTriangleForTileBinning( TileTriangleRef tri_ref{i0, i1, i2, &f.GetMaterial(), tri_idx}; for (int ty = start_tile_y; ty <= end_tile_y; ++ty) { for (int tx = start_tile_x; tx <= end_tile_x; ++tx) { - size_t tile_id = ty * tiles_x + tx; + size_t tile_id = ty * grid.tiles_x + tx; tile_triangles[tile_id].push_back(tri_ref); } } From 86d06adfda6567c68c5fdb86c7cbb46416d60b11 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Mon, 15 Sep 2025 16:14:23 +0800 Subject: [PATCH 20/24] Change timing-related debug messages from SPDLOG_INFO to SPDLOG_DEBUG, set the default log level to INFO Signed-off-by: ZhouFANG --- src/include/log_system.h | 3 +++ src/light.cpp | 2 +- src/rasterizer.cpp | 2 +- src/renderer.cpp | 2 +- src/renderers/deferred_renderer.cpp | 18 +++++++-------- src/renderers/per_triangle_renderer.cpp | 14 ++++++------ src/renderers/tile_based_renderer.cpp | 30 ++++++++++++------------- 7 files changed, 37 insertions(+), 34 deletions(-) diff --git a/src/include/log_system.h b/src/include/log_system.h index a1f2903..2f8d9c4 100755 --- a/src/include/log_system.h +++ b/src/include/log_system.h @@ -17,6 +17,9 @@ #ifndef SIMPLERENDER_SRC_INCLUDE_LOG_SYSTEM_H_ #define SIMPLERENDER_SRC_INCLUDE_LOG_SYSTEM_H_ +#ifndef SPDLOG_ACTIVE_LEVEL +#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO +#endif #include namespace simple_renderer { diff --git a/src/light.cpp b/src/light.cpp index f25fb4c..ae3a51d 100644 --- a/src/light.cpp +++ b/src/light.cpp @@ -27,7 +27,7 @@ const Vector3f Light::kDefaultDir = Vector3f(0, 0, -1); const Color Light::kDefaultColor = Color::kWhite; Light::Light(const std::string &name) : name_(name) { - SPDLOG_INFO("Light: {}", name_); + SPDLOG_DEBUG("Light: {}", name_); } } // namespace simple_renderer diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp index 9b8558a..04aa6b1 100644 --- a/src/rasterizer.cpp +++ b/src/rasterizer.cpp @@ -8,7 +8,7 @@ namespace simple_renderer { Rasterizer::Rasterizer(size_t width, size_t height) : width_(width), height_(height) { - SPDLOG_INFO("Rasterizer init with {}, {}", width, height); + SPDLOG_DEBUG("Rasterizer init with {}, {}", width, height); } std::vector Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1, diff --git a/src/renderer.cpp b/src/renderer.cpp index 4319066..0939cf5 100644 --- a/src/renderer.cpp +++ b/src/renderer.cpp @@ -30,7 +30,7 @@ SimpleRenderer::SimpleRenderer(size_t width, size_t height) bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, uint32_t *buffer) { EnsureRenderer(); // 确保渲染器实例存在 - SPDLOG_INFO("draw model: {}", model.GetModelPath()); + SPDLOG_DEBUG("draw model: {}", model.GetModelPath()); return renderer_->Render(model, shader, buffer); } diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp index c93c9ff..a86d41f 100644 --- a/src/renderers/deferred_renderer.cpp +++ b/src/renderers/deferred_renderer.cpp @@ -159,16 +159,16 @@ bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint3 auto total_end_time = std::chrono::high_resolution_clock::now(); double total_ms = std::chrono::duration_cast(total_end_time - total_start_time).count() / 1000.0; - SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ==="); + SPDLOG_DEBUG("=== DEFERRED RENDERING PERFORMANCE ==="); double sum_ms = vertex_ms + (total_ms - vertex_ms); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/sum_ms*100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); - SPDLOG_INFO("Rasterization: {:8.3f} ms", raster_ms); - SPDLOG_INFO("Fragment Collection: {:8.3f} ms", collect_ms); - SPDLOG_INFO("Fragment Merge: {:8.3f} ms", merge_ms); - SPDLOG_INFO("Deferred Shading: {:8.3f} ms", shade_ms); - SPDLOG_INFO("Total: {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + collect_ms + merge_ms + shade_ms)); - SPDLOG_INFO("========================================="); + SPDLOG_DEBUG("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/sum_ms*100); + SPDLOG_DEBUG("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); + SPDLOG_DEBUG("Rasterization: {:8.3f} ms", raster_ms); + SPDLOG_DEBUG("Fragment Collection: {:8.3f} ms", collect_ms); + SPDLOG_DEBUG("Fragment Merge: {:8.3f} ms", merge_ms); + SPDLOG_DEBUG("Deferred Shading: {:8.3f} ms", shade_ms); + SPDLOG_DEBUG("Total: {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + collect_ms + merge_ms + shade_ms)); + SPDLOG_DEBUG("========================================="); return true; } diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp index 9e3167c..369f282 100644 --- a/src/renderers/per_triangle_renderer.cpp +++ b/src/renderers/per_triangle_renderer.cpp @@ -155,16 +155,16 @@ bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in, .count() / 1000.0; - SPDLOG_INFO("=== PER-TRIANGLE RENDERING PERFORMANCE ==="); + SPDLOG_DEBUG("=== PER-TRIANGLE RENDERING PERFORMANCE ==="); double sum_ms = vertex_ms + (total_ms - vertex_ms); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, + SPDLOG_DEBUG("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms / sum_ms * 100); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); - SPDLOG_INFO("Rasterization: {:8.3f} ms", raster_ms); - SPDLOG_INFO("Merge: {:8.3f} ms", merge_ms); - SPDLOG_INFO("Total: {:8.3f} ms", + SPDLOG_DEBUG("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); + SPDLOG_DEBUG("Rasterization: {:8.3f} ms", raster_ms); + SPDLOG_DEBUG("Merge: {:8.3f} ms", merge_ms); + SPDLOG_DEBUG("Total: {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + merge_ms)); - SPDLOG_INFO("=========================================="); + SPDLOG_DEBUG("=========================================="); return true; } diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp index c88f114..e0df64d 100644 --- a/src/renderers/tile_based_renderer.cpp +++ b/src/renderers/tile_based_renderer.cpp @@ -132,19 +132,19 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, .count() / 1000.0; - SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ==="); + SPDLOG_DEBUG("=== TILE-BASED RENDERING PERFORMANCE ==="); double sum_ms = vertex_ms + (total_ms - vertex_ms); - SPDLOG_INFO("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, + SPDLOG_DEBUG("Vertex Shader: {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms / sum_ms * 100); - SPDLOG_INFO("Setup: {:8.3f} ms", setup_ms); - SPDLOG_INFO("Binning: {:8.3f} ms", binning_ms); - SPDLOG_INFO("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); - SPDLOG_INFO("Rasterization: {:8.3f} ms", raster_ms); - SPDLOG_INFO("Copy: {:8.3f} ms", present_ms); - SPDLOG_INFO("Total: {:8.3f} ms", + SPDLOG_DEBUG("Setup: {:8.3f} ms", setup_ms); + SPDLOG_DEBUG("Binning: {:8.3f} ms", binning_ms); + SPDLOG_DEBUG("Buffer Alloc: {:8.3f} ms", buffer_alloc_ms); + SPDLOG_DEBUG("Rasterization: {:8.3f} ms", raster_ms); + SPDLOG_DEBUG("Copy: {:8.3f} ms", present_ms); + SPDLOG_DEBUG("Total: {:8.3f} ms", vertex_ms + (setup_ms + binning_ms + buffer_alloc_ms + raster_ms + - present_ms)); - SPDLOG_INFO("=========================================="); + present_ms)); + SPDLOG_DEBUG("=========================================="); return true; } @@ -155,9 +155,9 @@ void TileBasedRenderer::TriangleTileBinning( std::vector> &tile_triangles) { const size_t total_triangles = model.GetFaces().size(); - SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles", + SPDLOG_DEBUG("Starting triangle-tile binning (SoA) for {} triangles", total_triangles); - SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, + SPDLOG_DEBUG("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, height_, grid.tile_size, grid.tiles_x, grid.tiles_y); std::vector tile_counts(grid.tiles_x * grid.tiles_y, 0); @@ -186,9 +186,9 @@ void TileBasedRenderer::TriangleTileBinning( total_triangle_refs += tile.size(); if (!tile.empty()) non_empty_tiles++; } - SPDLOG_INFO(" (SoA) Total triangle references: {}", total_triangle_refs); - SPDLOG_INFO(" (SoA) Non-empty tiles: {}", non_empty_tiles); - SPDLOG_INFO(" (SoA) Average triangles per tile: {:.2f}", + SPDLOG_DEBUG(" (SoA) Total triangle references: {}", total_triangle_refs); + SPDLOG_DEBUG(" (SoA) Non-empty tiles: {}", non_empty_tiles); + SPDLOG_DEBUG(" (SoA) Average triangles per tile: {:.2f}", total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f); From 0ea7f223235fba7dbc7854b1ee4e6fb7cac5b43d Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Tue, 16 Sep 2025 13:41:08 +0800 Subject: [PATCH 21/24] TBR: Perform mask-based computation for TBR rasterization to achieve SIMD-friendly rasterization, and add corresponding mask statistics output. Signed-off-by: ZhouFANG --- src/include/renderers/tile_based_renderer.hpp | 10 +- src/renderers/tile_based_renderer.cpp | 250 +++++++++++++++--- 2 files changed, 219 insertions(+), 41 deletions(-) diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp index f524fb6..da7970c 100644 --- a/src/include/renderers/tile_based_renderer.hpp +++ b/src/include/renderers/tile_based_renderer.hpp @@ -14,6 +14,13 @@ struct TileTriangleRef { size_t face_index = 0; }; +struct TileMaskStats { + uint64_t tested = 0; // 遍历检测像素总数 + uint64_t covered = 0; // 三角形内覆盖测试通过像素数(通过边函数做内点测试成功) + uint64_t zpass = 0; // 通过early-z测试像素数(深度值小于tile局部深度缓冲) + uint64_t shaded = 0; // 实际着色并写回像素数(同时通过early-z或late-z测试) +}; + /** * @brief Tile 网格上下文(供 binning 和 raster 共享的网格/几何信息) */ @@ -106,7 +113,8 @@ class TileBasedRenderer final : public RendererBase { std::unique_ptr &global_color_buffer, const Shader& shader, bool use_early_z, - std::vector* scratch_fragments); + std::vector* scratch_fragments, + TileMaskStats* out_stats); private: // 深度和颜色的默认值,同时用于tile级和全局级buffers的初始化 diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp index e0df64d..39ad7fa 100644 --- a/src/renderers/tile_based_renderer.cpp +++ b/src/renderers/tile_based_renderer.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "config.h" #include "log_system.h" @@ -16,6 +17,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, uint32_t *buffer) { auto total_start_time = std::chrono::high_resolution_clock::now(); auto shader = std::make_shared(shader_in); + shader->PrepareVertexUniforms(); // 顶点变换(SoA) auto vertex_start = std::chrono::high_resolution_clock::now(); @@ -87,9 +89,10 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, // 4. 并行光栅化每个 tile(SoA + early-z) auto raster_start = std::chrono::high_resolution_clock::now(); + std::vector tile_stats(total_tiles); #pragma omp parallel num_threads(kNProc) default(none) \ shared(tile_triangles, shader, depthBuffer, colorBuffer, total_tiles, \ - grid_ctx, early_z_) + grid_ctx, early_z_, tile_stats) { // 为每个 tile 分配局部深度和颜色缓冲 std::unique_ptr tile_depth_buffer = @@ -108,7 +111,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, RasterizeTile(tile_id, tile_triangles[tile_id], grid_ctx, tile_depth_buffer.get(), tile_color_buffer.get(), depthBuffer, colorBuffer, *shader, early_z_, - &scratch_fragments); + &scratch_fragments, &tile_stats[tile_id]); } } auto raster_end = std::chrono::high_resolution_clock::now(); @@ -117,6 +120,23 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, .count() / 1000.0; + // 汇总并打印掩码收益统计 + uint64_t sum_tested = 0, sum_covered = 0, sum_zpass = 0, sum_shaded = 0; + for (const auto& s : tile_stats) { + sum_tested += s.tested; + sum_covered += s.covered; + sum_zpass += s.zpass; + sum_shaded += s.shaded; + } + auto rate = [](uint64_t num, uint64_t den) -> double { + if (den == 0) return 0.0; return double(num) / double(den) * 100.0; + }; + SPDLOG_DEBUG( + "TBR Mask Stats: tested={}, covered={} ({:.1f}%), zpass={} ({:.1f}%), shaded={} ({:.1f}%)", + sum_tested, sum_covered, rate(sum_covered, sum_tested), + sum_zpass, rate(sum_zpass, sum_covered), + sum_shaded, rate(sum_shaded, sum_covered)); + // 5. 直接将单份全局 colorBuffer 拷贝到输出 auto present_start = std::chrono::high_resolution_clock::now(); std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t)); @@ -200,7 +220,8 @@ void TileBasedRenderer::RasterizeTile( uint32_t *tile_color_buffer, std::unique_ptr &global_depth_buffer, std::unique_ptr &global_color_buffer, const Shader &shader, bool use_early_z, - std::vector *scratch_fragments) { + std::vector *scratch_fragments, + TileMaskStats* out_stats) { // 计算 tile 屏幕范围 size_t tile_x = tile_id % grid.tiles_x; size_t tile_y = tile_id / grid.tiles_x; @@ -215,50 +236,199 @@ void TileBasedRenderer::RasterizeTile( std::fill_n(tile_depth_buffer, tile_width * tile_height, kDepthClear); std::fill_n(tile_color_buffer, tile_width * tile_height, kColorClear); - // 只有当调用方没有提供 scratch 时,才启用本地容器并且只构造一次 - const bool use_internal_scratch = (scratch_fragments == nullptr); - std::vector internal_out; - if (use_internal_scratch) internal_out.reserve(tile_width * tile_height); - - for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况 - // 始终走 SoA + 限制矩形的光栅化路径;如未提供 scratch,则使用函数内局部容器 - std::vector &out = - use_internal_scratch ? internal_out : *scratch_fragments; - out.clear(); - if (out.capacity() < tile_width * tile_height) - out.reserve(tile_width * tile_height); - - rasterizer_->RasterizeTo( - grid.soa, tri.i0, tri.i1, tri.i2, static_cast(screen_x_start), - static_cast(screen_y_start), static_cast(screen_x_end), - static_cast(screen_y_end), out); - - for (auto &fragment : out) { - fragment.material = tri.material; - size_t sx = fragment.screen_coord[0]; - size_t sy = fragment.screen_coord[1]; - if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start && - sy < screen_y_end) { - size_t local_x = sx - screen_x_start; - size_t local_y = sy - screen_y_start; - size_t idx = local_x + local_y * tile_width; - if (use_early_z) { - if (fragment.depth < tile_depth_buffer[idx]) { - auto color = shader.FragmentShader(fragment); - tile_depth_buffer[idx] = fragment.depth; - tile_color_buffer[idx] = uint32_t(color); + // 掩码化扫描:按三角形直接写入 tile 局部缓冲,避免中间片段向量 + constexpr int kLane = 8; // 横向处理的像素个数(便于编译器自动向量化) + + // 轻量统计:用于评估掩码收益(仅对少量tile打印DEBUG) + uint64_t tested_pixels = 0; + uint64_t covered_pixels = 0; + uint64_t zpass_pixels = 0; + uint64_t shaded_pixels = 0; + + auto cross2 = [](float ax, float ay, float bx, float by) { + return ax * by - ay * bx; + }; + + for (const auto &tri : triangles) { + const auto i0 = tri.i0, i1 = tri.i1, i2 = tri.i2; + + // 顶点屏幕坐标 + const Vector4f &p0 = grid.soa.pos_screen[i0]; + const Vector4f &p1 = grid.soa.pos_screen[i1]; + const Vector4f &p2 = grid.soa.pos_screen[i2]; + + // 三角形屏幕空间 AABB,与 tile 矩形求交 + const float tri_minx = std::min({p0.x, p1.x, p2.x}); + const float tri_miny = std::min({p0.y, p1.y, p2.y}); + const float tri_maxx = std::max({p0.x, p1.x, p2.x}); + const float tri_maxy = std::max({p0.y, p1.y, p2.y}); + + int sx = std::max(static_cast(screen_x_start), + static_cast(std::floor(std::max(0.0f, tri_minx)))); + int sy = std::max(static_cast(screen_y_start), + static_cast(std::floor(std::max(0.0f, tri_miny)))); + int ex = std::min(static_cast(screen_x_end - 1), + static_cast(std::floor(std::min(width_ - 1, tri_maxx)))); + int ey = std::min(static_cast(screen_y_end - 1), + static_cast(std::floor(std::min(height_ - 1, tri_maxy)))); + if (sx > ex || sy > ey) continue; + + // 边向量与有向面积 + const float e01x = p1.x - p0.x, e01y = p1.y - p0.y; + const float e12x = p2.x - p1.x, e12y = p2.y - p1.y; + const float e20x = p0.x - p2.x, e20y = p0.y - p2.y; + const float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y); + if (std::abs(area2) < 1e-6f) continue; // 退化三角形 + const bool positive = (area2 > 0.0f); + + // z 与 1/w 的平面插值准备 + const float z0 = p0.z, z1 = p1.z, z2 = p2.z; + const float w0_inv = 1.0f / p0.w, w1_inv = 1.0f / p1.w, w2_inv = 1.0f / p2.w; + + // 行扫描 + for (int y = sy; y <= ey; ++y) { // 行优先遍历:有利于 cache 与向量化 + const float yf = static_cast(y); + for (int xb = sx; xb <= ex; xb += kLane) { // 每次处理kLane个像素 + const int lane = std::min(kLane, ex - xb + 1); // 当前需要处理的像素个数 + const float x0f = static_cast(xb); // 本块起点的x坐标 + + // 计算本块起点的三个边函数值与横向步长(dE/dx) + float E01_base = cross2(e01x, e01y, x0f - p0.x, yf - p0.y); + float E12_base = cross2(e12x, e12y, x0f - p1.x, yf - p1.y); + float E20_base = cross2(e20x, e20y, x0f - p2.x, yf - p2.y); + const float dE01dx = -e01y; + const float dE12dx = -e12y; + const float dE20dx = -e20y; + + // ============== 构造覆盖掩码 cover mask ============== + unsigned mask_cover = 0u; + int cover_count = 0; + float E01[kLane], E12[kLane], E20[kLane]; + #pragma omp simd + for (int j = 0; j < lane; ++j) { + E01[j] = E01_base + dE01dx * static_cast(j); + E12[j] = E12_base + dE12dx * static_cast(j); + E20[j] = E20_base + dE20dx * static_cast(j); + } + for (int j = 0; j < lane; ++j) { // 内点测试,如果三角形在像素内,则将该像素加入覆盖掩码 + bool inside = positive ? (E01[j] >= 0.0f && E12[j] >= 0.0f && E20[j] >= 0.0f) + : (E01[j] <= 0.0f && E12[j] <= 0.0f && E20[j] <= 0.0f); + if (inside) { + mask_cover |= (1u << j); + cover_count++; + } + } + tested_pixels += static_cast(lane); + covered_pixels += static_cast(cover_count); + if (mask_cover == 0u) continue; + + // ============== 计算 z,进行early-z掩码 ============== + unsigned mask_zpass = 0u; + float zvals[kLane]; + // 缓存校正后的重心坐标,避免着色阶段重复计算 + float b0c_arr[kLane]; + float b1c_arr[kLane]; + float b2c_arr[kLane]; + int zpass_count = 0; + for (int j = 0; j < lane; ++j) { + if (((mask_cover >> j) & 1u) == 0u) { continue; } // 如果该像素不在覆盖掩码内,则跳过 + const float b0 = E12[j] / area2; + const float b1 = E20[j] / area2; + const float b2 = E01[j] / area2; + const float w_inv = b0 * w0_inv + b1 * w1_inv + b2 * w2_inv; // 透视矫正 + const float b0c = (b0 * w0_inv) / w_inv; + const float b1c = (b1 * w1_inv) / w_inv; + const float b2c = (b2 * w2_inv) / w_inv; + b0c_arr[j] = b0c; b1c_arr[j] = b1c; b2c_arr[j] = b2c; + const float z = z0 * b0c + z1 * b1c + z2 * b2c; + zvals[j] = z; + + const int sx_pix = xb + j; + const int local_x = sx_pix - static_cast(screen_x_start); + const int local_y = y - static_cast(screen_y_start); + const size_t idx = static_cast(local_x + local_y * static_cast(tile_width)); + if (z < tile_depth_buffer[idx]) { + mask_zpass |= (1u << j); + zpass_count++; } - } else { - auto color = shader.FragmentShader(fragment); - if (fragment.depth < tile_depth_buffer[idx]) { - tile_depth_buffer[idx] = fragment.depth; - tile_color_buffer[idx] = uint32_t(color); + } + zpass_pixels += static_cast(zpass_count); + + // ============== 构造最终掩码 ============== + unsigned mask_final = use_early_z ? (mask_cover & mask_zpass) : mask_cover; + if (mask_final == 0u && use_early_z) continue; + + // 对掩码内像素着色并写回(非 early-z 时,先着色,再按 z 测试写入) + for (int j = 0; j < lane; ++j) { + if (((mask_final >> j) & 1u) == 0u && use_early_z) continue; + const int sx_pix = xb + j; + const int local_x = sx_pix - static_cast(screen_x_start); + const int local_y = y - static_cast(screen_y_start); + const size_t idx = static_cast(local_x + local_y * static_cast(tile_width)); + + // 计算插值属性 + const float b0c = b0c_arr[j]; + const float b1c = b1c_arr[j]; + const float b2c = b2c_arr[j]; + + Fragment frag; + frag.screen_coord = {sx_pix, y}; + frag.depth = zvals[j]; + frag.material = tri.material; + + // 法向量插值 + const Vector3f &n0 = grid.soa.normal[i0]; + const Vector3f &n1 = grid.soa.normal[i1]; + const Vector3f &n2 = grid.soa.normal[i2]; + frag.normal = n0 * b0c + n1 * b1c + n2 * b2c; + + // 纹理坐标插值 + const Vector2f &uv0 = grid.soa.uv[i0]; + const Vector2f &uv1 = grid.soa.uv[i1]; + const Vector2f &uv2 = grid.soa.uv[i2]; + frag.uv = uv0 * b0c + uv1 * b1c + uv2 * b2c; + + // 颜色插值 + const Color &c0 = grid.soa.color[i0]; + const Color &c1 = grid.soa.color[i1]; + const Color &c2 = grid.soa.color[i2]; + auto color_r = FloatToUint8_t(static_cast(c0[Color::kColorIndexRed]) * b0c + + static_cast(c1[Color::kColorIndexRed]) * b1c + + static_cast(c2[Color::kColorIndexRed]) * b2c); + auto color_g = FloatToUint8_t(static_cast(c0[Color::kColorIndexGreen]) * b0c + + static_cast(c1[Color::kColorIndexGreen]) * b1c + + static_cast(c2[Color::kColorIndexGreen]) * b2c); + auto color_b = FloatToUint8_t(static_cast(c0[Color::kColorIndexBlue]) * b0c + + static_cast(c1[Color::kColorIndexBlue]) * b1c + + static_cast(c2[Color::kColorIndexBlue]) * b2c); + frag.color = Color(color_r, color_g, color_b); + + if (use_early_z) { // 开启时,仅对mask中通过early-z的像素进行着色和写回 + auto out_color = shader.FragmentShader(frag); + tile_depth_buffer[idx] = frag.depth; + tile_color_buffer[idx] = uint32_t(out_color); + shaded_pixels++; + } else { + // 关闭时,先着色,再按z测试写入 + auto out_color = shader.FragmentShader(frag); + if (frag.depth < tile_depth_buffer[idx]) { // late-z + tile_depth_buffer[idx] = frag.depth; + tile_color_buffer[idx] = uint32_t(out_color); + shaded_pixels++; + } } } } } } + if (out_stats) { + out_stats->tested = tested_pixels; + out_stats->covered = covered_pixels; + out_stats->zpass = zpass_pixels; + out_stats->shaded = shaded_pixels; + } + // 写回全局缓冲 // TBR 下不同 tile 覆盖的屏幕区域互不重叠,且在 tile 内部已通过 Early‑Z // 得出每个像素的最终值。因此可以直接将 tile 行数据拷贝到全局缓冲 From b659f57137978941da8e83b907ff5f9f253236ff Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Tue, 16 Sep 2025 14:16:51 +0800 Subject: [PATCH 22/24] VS: Optimize the vertex matrix caching in shaders by adding cache preparation and update functionality to reduce redundant computations. Signed-off-by: ZhouFANG --- src/include/shader.hpp | 24 +++++- src/renderers/deferred_renderer.cpp | 1 + src/renderers/per_triangle_renderer.cpp | 1 + src/shader.cpp | 108 +++++++++++++++++++++--- 4 files changed, 119 insertions(+), 15 deletions(-) diff --git a/src/include/shader.hpp b/src/include/shader.hpp index ed08998..5c02a7a 100644 --- a/src/include/shader.hpp +++ b/src/include/shader.hpp @@ -63,6 +63,19 @@ struct SharedDataInShader { Vector3f fragPos_varying = Vector3f(0.0f); }; +struct VertexUniformCache { + Matrix4f model = Matrix4f(1.0f); + Matrix4f view = Matrix4f(1.0f); + Matrix4f projection = Matrix4f(1.0f); + Matrix4f model_view = Matrix4f(1.0f); + Matrix4f mvp = Matrix4f(1.0f); + Matrix3f normal = Matrix3f(1.0f); + bool has_model = false; + bool has_view = false; + bool has_projection = false; + bool derived_valid = false; +}; + /** * @brief Shader Class 着色器类 * @@ -85,8 +98,13 @@ class Shader { template void SetUniform(const std::string &name, const T &value) { uniformbuffer_.SetUniform(name, value); + if constexpr (std::is_same_v) { + UpdateMatrixCache(name, value); + } } + void PrepareVertexUniforms(); + private: // UniformBuffer UniformBuffer uniformbuffer_; @@ -94,6 +112,10 @@ class Shader { // Shared Variables // 共享变量 SharedDataInShader sharedDataInShader_; + VertexUniformCache vertex_uniform_cache_; + + void UpdateMatrixCache(const std::string &name, const Matrix4f &value); + void RecalculateDerivedMatrices(); Color SampleTexture(const Texture &texture, const Vector2f &uv) const; Color ClampColor(const Color color) const; @@ -103,4 +125,4 @@ uint8_t FloatToUint8_t(float val); } // namespace simple_renderer -#endif /* SIMPLERENDER_SRC_INCLUDE_SHADER_H_ */ \ No newline at end of file +#endif /* SIMPLERENDER_SRC_INCLUDE_SHADER_H_ */ diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp index a86d41f..9d8f7c2 100644 --- a/src/renderers/deferred_renderer.cpp +++ b/src/renderers/deferred_renderer.cpp @@ -14,6 +14,7 @@ namespace simple_renderer { bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint32_t* buffer) { auto total_start_time = std::chrono::high_resolution_clock::now(); auto shader = std::make_shared(shader_in); + shader->PrepareVertexUniforms(); // 顶点变换(AoS) auto vertex_start = std::chrono::high_resolution_clock::now(); diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp index 369f282..8a3b4cb 100644 --- a/src/renderers/per_triangle_renderer.cpp +++ b/src/renderers/per_triangle_renderer.cpp @@ -19,6 +19,7 @@ bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in, // 复制 shader 以便在多线程中共享 auto shader = std::make_shared(shader_in); + shader->PrepareVertexUniforms(); // 顶点变换(AoS) auto vertex_start = std::chrono::high_resolution_clock::now(); diff --git a/src/shader.cpp b/src/shader.cpp index 7b8eeae..4441eed 100644 --- a/src/shader.cpp +++ b/src/shader.cpp @@ -3,29 +3,109 @@ namespace simple_renderer { Vertex Shader::VertexShader(const Vertex& vertex) { - Matrix4f model_matrix = uniformbuffer_.GetUniform("modelMatrix"); - Matrix4f view_matrix = uniformbuffer_.GetUniform("viewMatrix"); - Matrix4f projection_matrix = - uniformbuffer_.GetUniform("projectionMatrix"); - - Matrix4f mvp_matrix = projection_matrix * view_matrix * model_matrix; - - Matrix3f normal_matrix = glm::transpose(glm::inverse(Matrix3f(model_matrix))); + const bool cache_ready = vertex_uniform_cache_.derived_valid; + + const Matrix4f* model_ptr = nullptr; + const Matrix4f* mvp_ptr = nullptr; + const Matrix3f* normal_ptr = nullptr; + + Matrix4f fallback_model; + Matrix4f fallback_mvp; + Matrix3f fallback_normal; + + if (cache_ready) { // 如果所有派生矩阵已预计算并可直接复用 + // 直接复用缓存矩阵,避免逐顶点哈希查询 + model_ptr = &vertex_uniform_cache_.model; + mvp_ptr = &vertex_uniform_cache_.mvp; + normal_ptr = &vertex_uniform_cache_.normal; + } else { // 如果缓存尚未建立 + fallback_model = uniformbuffer_.GetUniform("modelMatrix"); + Matrix4f view_matrix = uniformbuffer_.GetUniform("viewMatrix"); + Matrix4f projection_matrix = + uniformbuffer_.GetUniform("projectionMatrix"); + fallback_mvp = projection_matrix * view_matrix * fallback_model; + fallback_normal = + glm::transpose(glm::inverse(Matrix3f(fallback_model))); + model_ptr = &fallback_model; + mvp_ptr = &fallback_mvp; + normal_ptr = &fallback_normal; + } + + const Matrix4f& model_matrix = *model_ptr; + const Matrix4f& mvp_matrix = *mvp_ptr; + const Matrix3f& normal_matrix = *normal_ptr; + + const Vector4f position = vertex.GetPosition(); + Vector4f world_position = model_matrix * position; Vector3f transformed_normal = normal_matrix * vertex.GetNormal(); - sharedDataInShader_.fragPos_varying = Vector3f(model_matrix * vertex.GetPosition()); + // 将世界空间位置写入共享数据供片元阶段使用 + sharedDataInShader_.fragPos_varying = Vector3f(world_position); // 计算裁剪空间坐标 - Vector4f clip_position = mvp_matrix * vertex.GetPosition(); - + Vector4f clip_position = mvp_matrix * position; + // 返回变换后的顶点(包含变换后的法向量和裁剪坐标) - return Vertex(clip_position, - transformed_normal, - vertex.GetTexCoords(), + return Vertex(clip_position, transformed_normal, vertex.GetTexCoords(), vertex.GetColor(), clip_position); // 同时保存裁剪空间坐标用于后续裁剪 } +void Shader::PrepareVertexUniforms() { + if (vertex_uniform_cache_.derived_valid) { + return; + } + // 在进入顶点阶段前一次性取出常用矩阵并填充缓存 + if (uniformbuffer_.HasUniform("modelMatrix") && + uniformbuffer_.HasUniform("viewMatrix") && + uniformbuffer_.HasUniform("projectionMatrix")) { + vertex_uniform_cache_.model = + uniformbuffer_.GetUniform("modelMatrix"); + vertex_uniform_cache_.view = + uniformbuffer_.GetUniform("viewMatrix"); + vertex_uniform_cache_.projection = + uniformbuffer_.GetUniform("projectionMatrix"); + vertex_uniform_cache_.has_model = true; + vertex_uniform_cache_.has_view = true; + vertex_uniform_cache_.has_projection = true; + RecalculateDerivedMatrices(); + } +} + +void Shader::UpdateMatrixCache(const std::string& name, + const Matrix4f& value) { + if (name == "modelMatrix") { + vertex_uniform_cache_.model = value; + vertex_uniform_cache_.has_model = true; + } else if (name == "viewMatrix") { + vertex_uniform_cache_.view = value; + vertex_uniform_cache_.has_view = true; + } else if (name == "projectionMatrix") { + vertex_uniform_cache_.projection = value; + vertex_uniform_cache_.has_projection = true; + } else { + return; + } + + // 任一基础矩阵更新后,标记派生矩阵失效等待重算 + vertex_uniform_cache_.derived_valid = false; + if (vertex_uniform_cache_.has_model && vertex_uniform_cache_.has_view && + vertex_uniform_cache_.has_projection) { + RecalculateDerivedMatrices(); + } +} + +void Shader::RecalculateDerivedMatrices() { + // 预计算 Model-View、MVP 以及法线矩阵,供顶点着色器复用 + vertex_uniform_cache_.model_view = + vertex_uniform_cache_.view * vertex_uniform_cache_.model; + vertex_uniform_cache_.mvp = vertex_uniform_cache_.projection * + vertex_uniform_cache_.model_view; + vertex_uniform_cache_.normal = glm::transpose(glm::inverse( + Matrix3f(vertex_uniform_cache_.model))); + vertex_uniform_cache_.derived_valid = true; +} + Color Shader::FragmentShader(const Fragment& fragment) const { // interpolate Normal, Color and UV Color interpolateColor = fragment.color; From e81bcffd32be4dcf69b49c8b507aa646ddc8cf5f Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Tue, 16 Sep 2025 15:14:39 +0800 Subject: [PATCH 23/24] FS: Cache vectors and matrices to avoid redundant computations. Signed-off-by: ZhouFANG --- src/include/shader.hpp | 21 ++++- src/renderers/deferred_renderer.cpp | 2 +- src/renderers/per_triangle_renderer.cpp | 2 +- src/renderers/tile_based_renderer.cpp | 2 +- src/shader.cpp | 114 ++++++++++++++++++------ 5 files changed, 111 insertions(+), 30 deletions(-) diff --git a/src/include/shader.hpp b/src/include/shader.hpp index 5c02a7a..097214c 100644 --- a/src/include/shader.hpp +++ b/src/include/shader.hpp @@ -76,6 +76,15 @@ struct VertexUniformCache { bool derived_valid = false; }; +struct FragmentUniformCache { + Light light{}; + Vector3f camera_pos = Vector3f(0.0f); + Vector3f light_dir_normalized = Vector3f(0.0f); + bool has_light = false; + bool has_camera = false; + bool derived_valid = false; +}; + /** * @brief Shader Class 着色器类 * @@ -100,10 +109,14 @@ class Shader { uniformbuffer_.SetUniform(name, value); if constexpr (std::is_same_v) { UpdateMatrixCache(name, value); + } else if constexpr (std::is_same_v) { + UpdateFragmentCache(name, value); + } else if constexpr (std::is_same_v) { + UpdateFragmentCache(name, value); } } - void PrepareVertexUniforms(); + void PrepareUniformCaches(); private: // UniformBuffer @@ -113,9 +126,15 @@ class Shader { // 共享变量 SharedDataInShader sharedDataInShader_; VertexUniformCache vertex_uniform_cache_; + FragmentUniformCache fragment_uniform_cache_; void UpdateMatrixCache(const std::string &name, const Matrix4f &value); + void UpdateFragmentCache(const std::string &name, const Light &value); + void UpdateFragmentCache(const std::string &name, const Vector3f &value); void RecalculateDerivedMatrices(); + void RecalculateFragmentDerived(); + void PrepareVertexUniformCache(); + void PrepareFragmentUniformCache(); Color SampleTexture(const Texture &texture, const Vector2f &uv) const; Color ClampColor(const Color color) const; diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp index 9d8f7c2..523fe20 100644 --- a/src/renderers/deferred_renderer.cpp +++ b/src/renderers/deferred_renderer.cpp @@ -14,7 +14,7 @@ namespace simple_renderer { bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint32_t* buffer) { auto total_start_time = std::chrono::high_resolution_clock::now(); auto shader = std::make_shared(shader_in); - shader->PrepareVertexUniforms(); + shader->PrepareUniformCaches(); // 顶点变换(AoS) auto vertex_start = std::chrono::high_resolution_clock::now(); diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp index 8a3b4cb..9348594 100644 --- a/src/renderers/per_triangle_renderer.cpp +++ b/src/renderers/per_triangle_renderer.cpp @@ -19,7 +19,7 @@ bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in, // 复制 shader 以便在多线程中共享 auto shader = std::make_shared(shader_in); - shader->PrepareVertexUniforms(); + shader->PrepareUniformCaches(); // 顶点变换(AoS) auto vertex_start = std::chrono::high_resolution_clock::now(); diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp index 39ad7fa..e39526e 100644 --- a/src/renderers/tile_based_renderer.cpp +++ b/src/renderers/tile_based_renderer.cpp @@ -17,7 +17,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in, uint32_t *buffer) { auto total_start_time = std::chrono::high_resolution_clock::now(); auto shader = std::make_shared(shader_in); - shader->PrepareVertexUniforms(); + shader->PrepareUniformCaches(); // 顶点变换(SoA) auto vertex_start = std::chrono::high_resolution_clock::now(); diff --git a/src/shader.cpp b/src/shader.cpp index 4441eed..e01e9b1 100644 --- a/src/shader.cpp +++ b/src/shader.cpp @@ -51,27 +51,6 @@ Vertex Shader::VertexShader(const Vertex& vertex) { clip_position); // 同时保存裁剪空间坐标用于后续裁剪 } -void Shader::PrepareVertexUniforms() { - if (vertex_uniform_cache_.derived_valid) { - return; - } - // 在进入顶点阶段前一次性取出常用矩阵并填充缓存 - if (uniformbuffer_.HasUniform("modelMatrix") && - uniformbuffer_.HasUniform("viewMatrix") && - uniformbuffer_.HasUniform("projectionMatrix")) { - vertex_uniform_cache_.model = - uniformbuffer_.GetUniform("modelMatrix"); - vertex_uniform_cache_.view = - uniformbuffer_.GetUniform("viewMatrix"); - vertex_uniform_cache_.projection = - uniformbuffer_.GetUniform("projectionMatrix"); - vertex_uniform_cache_.has_model = true; - vertex_uniform_cache_.has_view = true; - vertex_uniform_cache_.has_projection = true; - RecalculateDerivedMatrices(); - } -} - void Shader::UpdateMatrixCache(const std::string& name, const Matrix4f& value) { if (name == "modelMatrix") { @@ -106,6 +85,80 @@ void Shader::RecalculateDerivedMatrices() { vertex_uniform_cache_.derived_valid = true; } +void Shader::UpdateFragmentCache(const std::string& name, + const Light& value) { + if (name != "light") { + return; + } + fragment_uniform_cache_.light = value; + fragment_uniform_cache_.has_light = true; + fragment_uniform_cache_.derived_valid = false; + if (fragment_uniform_cache_.has_light && fragment_uniform_cache_.has_camera) { + RecalculateFragmentDerived(); + } +} + +void Shader::UpdateFragmentCache(const std::string& name, + const Vector3f& value) { + if (name != "cameraPos") { + return; + } + fragment_uniform_cache_.camera_pos = value; + fragment_uniform_cache_.has_camera = true; + fragment_uniform_cache_.derived_valid = false; + if (fragment_uniform_cache_.has_light && fragment_uniform_cache_.has_camera) { + RecalculateFragmentDerived(); + } +} + +void Shader::RecalculateFragmentDerived() { + fragment_uniform_cache_.light_dir_normalized = + glm::normalize(fragment_uniform_cache_.light.dir); + fragment_uniform_cache_.derived_valid = true; +} + +void Shader::PrepareUniformCaches() { + PrepareVertexUniformCache(); + PrepareFragmentUniformCache(); +} + +void Shader::PrepareVertexUniformCache() { + if (vertex_uniform_cache_.derived_valid) { + return; + } + // 在进入渲染阶段前一次性取出常用矩阵并填充缓存 + if (uniformbuffer_.HasUniform("modelMatrix") && + uniformbuffer_.HasUniform("viewMatrix") && + uniformbuffer_.HasUniform("projectionMatrix")) { + vertex_uniform_cache_.model = + uniformbuffer_.GetUniform("modelMatrix"); + vertex_uniform_cache_.view = + uniformbuffer_.GetUniform("viewMatrix"); + vertex_uniform_cache_.projection = + uniformbuffer_.GetUniform("projectionMatrix"); + vertex_uniform_cache_.has_model = true; + vertex_uniform_cache_.has_view = true; + vertex_uniform_cache_.has_projection = true; + RecalculateDerivedMatrices(); + } +} + +void Shader::PrepareFragmentUniformCache() { + if (fragment_uniform_cache_.derived_valid) { + return; + } + if (uniformbuffer_.HasUniform("light") && + uniformbuffer_.HasUniform("cameraPos")) { + fragment_uniform_cache_.light = + uniformbuffer_.GetUniform("light"); + fragment_uniform_cache_.camera_pos = + uniformbuffer_.GetUniform("cameraPos"); + fragment_uniform_cache_.has_light = true; + fragment_uniform_cache_.has_camera = true; + RecalculateFragmentDerived(); + } +} + Color Shader::FragmentShader(const Fragment& fragment) const { // interpolate Normal, Color and UV Color interpolateColor = fragment.color; @@ -113,14 +166,23 @@ Color Shader::FragmentShader(const Fragment& fragment) const { Vector2f uv = fragment.uv; // uniform - Light light = uniformbuffer_.GetUniform("light"); + Light light; + Vector3f light_dir; + Vector3f camera_pos; + if (fragment_uniform_cache_.derived_valid) { + light = fragment_uniform_cache_.light; + light_dir = fragment_uniform_cache_.light_dir_normalized; + camera_pos = fragment_uniform_cache_.camera_pos; + } else { + light = uniformbuffer_.GetUniform("light"); + camera_pos = uniformbuffer_.GetUniform("cameraPos"); + light_dir = glm::normalize(light.dir); + } Material material = *fragment.material; // view direction Vector3f view_dir = - glm::normalize(sharedDataInShader_.fragPos_varying - - uniformbuffer_.GetUniform("cameraPos")); - Vector3f light_dir = glm::normalize(light.dir); + glm::normalize(sharedDataInShader_.fragPos_varying - camera_pos); auto intensity = std::max(glm::dot(normal, light_dir), 0.0f); // texture color @@ -197,4 +259,4 @@ Color Shader::ClampColor(const Color color) const { return Color(red, green, blue, alpha); } -} // namespace simple_renderer \ No newline at end of file +} // namespace simple_renderer From b84cfd2afe4ad3f6c73bbc85ca0c914a4c6c8454 Mon Sep 17 00:00:00 2001 From: ZhouFANG Date: Tue, 16 Sep 2025 15:48:47 +0800 Subject: [PATCH 24/24] Enhanced shader class with LUT caching for specular reflection to optimize computation and eliminate redundancy. Added copy/move constructors for thread safety Signed-off-by: ZhouFANG --- src/include/shader.hpp | 25 ++++++++-- src/shader.cpp | 103 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 122 insertions(+), 6 deletions(-) diff --git a/src/include/shader.hpp b/src/include/shader.hpp index 097214c..8314f55 100644 --- a/src/include/shader.hpp +++ b/src/include/shader.hpp @@ -1,6 +1,10 @@ #ifndef SIMPLERENDER_SRC_INCLUDE_SHADER_HPP_ #define SIMPLERENDER_SRC_INCLUDE_SHADER_HPP_ +#include +#include +#include +#include #include #include "light.h" @@ -12,6 +16,8 @@ namespace simple_renderer { using UniformValue = std::variant; +inline constexpr size_t kSpecularLutResolution = 256; + class UniformBuffer { public: template @@ -85,6 +91,10 @@ struct FragmentUniformCache { bool derived_valid = false; }; +struct SpecularLUT { + std::array values{}; +}; + /** * @brief Shader Class 着色器类 * @@ -92,10 +102,10 @@ struct FragmentUniformCache { class Shader { public: Shader() = default; - Shader(const Shader &shader) = default; - Shader(Shader &&shader) = default; - auto operator=(const Shader &shader) -> Shader & = default; - auto operator=(Shader &&shader) -> Shader & = default; + Shader(const Shader &shader); + Shader(Shader &&shader) noexcept; + auto operator=(const Shader &shader) -> Shader &; + auto operator=(Shader &&shader) noexcept -> Shader &; virtual ~Shader() = default; // Input Data -> Vertex Shader -> Screen Space Coordiante @@ -127,6 +137,8 @@ class Shader { SharedDataInShader sharedDataInShader_; VertexUniformCache vertex_uniform_cache_; FragmentUniformCache fragment_uniform_cache_; + mutable std::unordered_map specular_lut_cache_; + mutable std::shared_mutex specular_cache_mutex_; void UpdateMatrixCache(const std::string &name, const Matrix4f &value); void UpdateFragmentCache(const std::string &name, const Light &value); @@ -136,6 +148,11 @@ class Shader { void PrepareVertexUniformCache(); void PrepareFragmentUniformCache(); + // LUT相关 + [[nodiscard]] auto BuildSpecularLUT(float shininess) const -> SpecularLUT; + [[nodiscard]] auto GetSpecularLUT(float shininess) const -> const SpecularLUT &; + [[nodiscard]] auto EvaluateSpecular(float cos_theta, float shininess) const -> float; + Color SampleTexture(const Texture &texture, const Vector2f &uv) const; Color ClampColor(const Color color) const; }; diff --git a/src/shader.cpp b/src/shader.cpp index e01e9b1..06ab241 100644 --- a/src/shader.cpp +++ b/src/shader.cpp @@ -1,7 +1,56 @@ #include "shader.hpp" +#include +#include +#include +#include + namespace simple_renderer { +Shader::Shader(const Shader& shader) { + std::shared_lock lock(shader.specular_cache_mutex_); + uniformbuffer_ = shader.uniformbuffer_; + sharedDataInShader_ = shader.sharedDataInShader_; + vertex_uniform_cache_ = shader.vertex_uniform_cache_; + fragment_uniform_cache_ = shader.fragment_uniform_cache_; + specular_lut_cache_ = shader.specular_lut_cache_; +} + +Shader::Shader(Shader&& shader) noexcept { + std::unique_lock lock(shader.specular_cache_mutex_); + uniformbuffer_ = std::move(shader.uniformbuffer_); + sharedDataInShader_ = shader.sharedDataInShader_; + vertex_uniform_cache_ = shader.vertex_uniform_cache_; + fragment_uniform_cache_ = shader.fragment_uniform_cache_; + specular_lut_cache_ = std::move(shader.specular_lut_cache_); +} + +auto Shader::operator=(const Shader& shader) -> Shader& { + if (this == &shader) { + return *this; + } + std::shared_lock lock(shader.specular_cache_mutex_); + uniformbuffer_ = shader.uniformbuffer_; + sharedDataInShader_ = shader.sharedDataInShader_; + vertex_uniform_cache_ = shader.vertex_uniform_cache_; + fragment_uniform_cache_ = shader.fragment_uniform_cache_; + specular_lut_cache_ = shader.specular_lut_cache_; + return *this; +} + +auto Shader::operator=(Shader&& shader) noexcept -> Shader& { + if (this == &shader) { + return *this; + } + std::unique_lock lock(shader.specular_cache_mutex_); + uniformbuffer_ = std::move(shader.uniformbuffer_); + sharedDataInShader_ = shader.sharedDataInShader_; + vertex_uniform_cache_ = shader.vertex_uniform_cache_; + fragment_uniform_cache_ = shader.fragment_uniform_cache_; + specular_lut_cache_ = std::move(shader.specular_lut_cache_); + return *this; +} + Vertex Shader::VertexShader(const Vertex& vertex) { const bool cache_ready = vertex_uniform_cache_.derived_valid; @@ -159,6 +208,56 @@ void Shader::PrepareFragmentUniformCache() { } } +auto Shader::BuildSpecularLUT(float shininess) const -> SpecularLUT { + SpecularLUT lut; + if (shininess <= 0.0f) { + lut.values.fill(1.0f); + return lut; + } + + for (size_t i = 0; i < kSpecularLutResolution; ++i) { + float cos_theta = static_cast(i) / + static_cast(kSpecularLutResolution - 1); + lut.values[i] = cos_theta <= 0.0f ? 0.0f : std::pow(cos_theta, shininess); + } + return lut; +} + +auto Shader::GetSpecularLUT(float shininess) const -> const SpecularLUT& { + uint32_t key = std::bit_cast(shininess); + { + std::shared_lock lock(specular_cache_mutex_); + auto it = specular_lut_cache_.find(key); + if (it != specular_lut_cache_.end()) { + return it->second; + } + } + + SpecularLUT lut = BuildSpecularLUT(shininess); + std::unique_lock lock(specular_cache_mutex_); + auto [it, inserted] = specular_lut_cache_.emplace(key, std::move(lut)); + return it->second; +} + +auto Shader::EvaluateSpecular(float cos_theta, float shininess) const -> float { + cos_theta = std::clamp(cos_theta, 0.0f, 1.0f); + if (shininess <= 0.0f) { + return 1.0f; + } + if (cos_theta <= 0.0f) { + return 0.0f; + } + + const auto& lut = GetSpecularLUT(shininess); + float scaled = cos_theta * static_cast(kSpecularLutResolution - 1); + size_t index = static_cast(scaled); + float frac = scaled - static_cast(index); + + const float v0 = lut.values[index]; + const float v1 = lut.values[std::min(index + 1, kSpecularLutResolution - 1)]; + return v0 + (v1 - v0) * frac; +} + Color Shader::FragmentShader(const Fragment& fragment) const { // interpolate Normal, Color and UV Color interpolateColor = fragment.color; @@ -202,8 +301,8 @@ Color Shader::FragmentShader(const Fragment& fragment) const { } Vector3f halfVector = glm::normalize(light_dir + view_dir); - float spec = std::pow(std::max(glm::dot(normal, halfVector), 0.0f), - material.shininess); + float cos_theta = std::max(glm::dot(normal, halfVector), 0.0f); + float spec = EvaluateSpecular(cos_theta, material.shininess); if (material.has_specular_texture) { Color texture_color = SampleTexture(material.specular_texture, uv); specular_color = texture_color * spec;