From 979a251cff70f0db6a4c8be4168eee624438090c Mon Sep 17 00:00:00 2001
From: indevn <indevn@outlook.com>
Date: Tue, 27 May 2025 18:06:12 +0800
Subject: [PATCH 01/24] fix typo

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 README-cn.md | 2 +-
 README.md    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/README-cn.md b/README-cn.md
index 5404fda..01ae8c6 100644
--- a/README-cn.md
+++ b/README-cn.md
@@ -85,7 +85,7 @@ cmake --build build-macos --target all
 #### 3. 运行示例应用程序
 
 ```bash
-./build/bin/system_test ../obj
+./build/bin/system_test ./obj
 ```
 
 ---
diff --git a/README.md b/README.md
index 95981fd..fab00dc 100755
--- a/README.md
+++ b/README.md
@@ -86,7 +86,7 @@ cmake --build build-macos --target all
 #### 3. Run the Example Application
 
 ```bash
-./build/bin/system_test ../obj
+./build/bin/system_test ./obj
 ```
 
 ---

From b9f2ae84177945d728c2a8a696f4c0537fbb29ea Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Mon, 4 Aug 2025 13:05:34 +0800
Subject: [PATCH 02/24] Implement perspective division and viewport
 transformation with perspective-correct interpolation

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/renderer.h    | 15 ++++++++
 src/rasterizer.cpp        | 30 ++++++++++++----
 src/renderer.cpp          | 75 ++++++++++++++++++++++++++++++++++++---
 src/shader.cpp            | 13 ++++---
 test/system_test/main.cpp |  9 +++--
 5 files changed, 123 insertions(+), 19 deletions(-)

diff --git a/src/include/renderer.h b/src/include/renderer.h
index bcc136f..456010b 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -69,6 +69,21 @@ class SimpleRenderer {
    */
   void DrawModel(const Model &model, uint32_t *buffer);
   void DrawModelSlower(const Model &model, uint32_t *buffer);
+
+  
+  /**
+   * 透视除法 - 将裁剪空间坐标转换为归一化设备坐标(NDC)
+   * @param vertex 裁剪空间坐标的顶点
+   * @return 转换后的顶点(NDC坐标)
+   */
+  Vertex PerspectiveDivision(const Vertex &vertex);
+
+  /**
+   * 视口变换 - 将NDC坐标转换为屏幕坐标
+   * @param vertex NDC坐标的顶点
+   * @return 转换后的顶点(屏幕坐标)
+   */
+  Vertex ViewportTransformation(const Vertex &vertex);
 };
 }  // namespace simple_renderer
 
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 8bf2d34..0712f4a 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -46,18 +46,36 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
         if (!is_inside) {
           continue;
         }
-        // 计算该点的深度，通过重心坐标插值计算
+
+        // 透视矫正插值
+        // 1. 获取三个顶点的1/w值
+        float w0_inv = v0.GetPosition().w;
+        float w1_inv = v1.GetPosition().w;  
+        float w2_inv = v2.GetPosition().w;
+        
+        // 2. 插值1/w
+        float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord);
+        
+        // 3. 计算透视矫正的重心坐标
+        Vector3f corrected_bary(
+          barycentric_coord.x * w0_inv / w_inv_interpolated,
+          barycentric_coord.y * w1_inv / w_inv_interpolated,
+          barycentric_coord.z * w2_inv / w_inv_interpolated
+        );
+        
+        // 4. 使用矫正的重心坐标进行插值
         auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z,
-                             v2.GetPosition().z, barycentric_coord);
+                             v2.GetPosition().z, corrected_bary);
+
 
         Fragment fragment;
         fragment.screen_coord = {x, y};
-        fragment.normal = CalculateNormal(v0.GetPosition(), v1.GetPosition(),
-                                          v2.GetPosition());
+        fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(),
+                                      v2.GetNormal(), corrected_bary);
         fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(),
-                                  v2.GetTexCoords(), barycentric_coord);
+                                  v2.GetTexCoords(), corrected_bary);
         fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(),
-                                          v2.GetColor(), barycentric_coord);
+                                          v2.GetColor(), corrected_bary);
         fragment.depth = z;
 
         local_fragments.push_back(fragment);
diff --git a/src/renderer.cpp b/src/renderer.cpp
index c7a5769..d433ff1 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -69,8 +69,16 @@ void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) {
 
 #pragma omp for
     for (const auto &v : model.GetVertices()) {
-      auto vertex = shader_->VertexShader(v);
-      processedVertices_per_thread.push_back(vertex);
+      // 顶点着色器：世界坐标 -> 裁剪坐标
+      auto clipSpaceVertex = shader_->VertexShader(v);
+      
+      // 透视除法：裁剪坐标 -> NDC坐标
+      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+      
+      // 视口变换：NDC坐标 -> 屏幕坐标
+      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+      
+      processedVertices_per_thread.push_back(screenSpaceVertex);
     }
   }
 
@@ -192,8 +200,16 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) {
 #pragma omp for
     for (const auto &v : model.GetVertices()) {
       /* * * Vertex Shader * *  */
-      auto vertex = shader_->VertexShader(v);
-      local_vertices.push_back(vertex);
+      // 顶点着色器：世界坐标 -> 裁剪坐标
+      auto clipSpaceVertex = shader_->VertexShader(v);
+      
+      // 透视除法：裁剪坐标 -> NDC坐标
+      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+      
+      // 视口变换：NDC坐标 -> 屏幕坐标
+      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+      
+      local_vertices.push_back(screenSpaceVertex);
     }
   }
 
@@ -274,4 +290,55 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) {
   /*  *  *  *  *  *  *  */
 }
 
+Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
+  Vector4f position = vertex.GetPosition();
+  
+  // 检查w分量，避免除零和负数问题
+  if (position.w <= 1e-6f) {
+    SPDLOG_DEBUG("PerspectiveDivision: w <= 1e-6f");
+    Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f);
+    return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
+  }
+  
+  // 保存原始w分量用于透视矫正插值
+  float original_w = position.w;
+  
+  // 执行透视除法：(x, y, z, w) -> (x/w, y/w, z/w, 1/w)
+  Vector4f ndcPosition(
+    position.x / position.w,  // x_ndc = x_clip / w_clip
+    position.y / position.w,  // y_ndc = y_clip / w_clip  
+    position.z / position.w,  // z_ndc = z_clip / w_clip
+    1.0f / original_w         // 保存1/w用于透视矫正插值
+  );
+  
+  // 严格限制NDC坐标在标准范围内
+  ndcPosition.x = std::clamp(ndcPosition.x, -1.0f, 1.0f);
+  ndcPosition.y = std::clamp(ndcPosition.y, -1.0f, 1.0f);
+  ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f);
+  
+  // 创建新的顶点，保持其他属性不变
+  return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
+}
+
+Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) {
+  Vector4f ndcPosition = vertex.GetPosition();
+  
+  // 视口变换：将NDC坐标[-1,1]转换为屏幕坐标[0,width]x[0,height]
+  float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f;
+  float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f;
+  
+  // 额外的屏幕坐标边界保护
+  screen_x = std::clamp(screen_x, 0.0f, static_cast<float>(width_ - 1));
+  screen_y = std::clamp(screen_y, 0.0f, static_cast<float>(height_ - 1));
+  
+  Vector4f screenPosition(
+    screen_x,                    // x: 屏幕坐标
+    screen_y,                    // y: 屏幕坐标
+    ndcPosition.z,               // z: NDC坐标用于深度测试
+    ndcPosition.w                // w: 保持1/w用于透视矫正插值
+  );
+  
+  return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
+}
+
 }  // namespace simple_renderer
diff --git a/src/shader.cpp b/src/shader.cpp
index 3438627..087cca5 100644
--- a/src/shader.cpp
+++ b/src/shader.cpp
@@ -9,12 +9,17 @@ Vertex Shader::VertexShader(const Vertex& vertex) {
       uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
 
   Matrix4f mvp_matrix = projection_matrix * view_matrix * model_matrix;
-  // auto normal_matrix = model_matrix.inverse().transpose();
+  
+  Matrix3f normal_matrix = glm::transpose(glm::inverse(Matrix3f(model_matrix)));
+  Vector3f transformed_normal = normal_matrix * vertex.GetNormal();
 
-  sharedDataInShader_.fragPos_varying =
-      Vector3f(model_matrix * vertex.GetPosition());
+  sharedDataInShader_.fragPos_varying = Vector3f(model_matrix * vertex.GetPosition());
 
-  return mvp_matrix * vertex;
+  // 返回变换后的顶点（包含变换后的法向量）
+  return Vertex(mvp_matrix * vertex.GetPosition(), 
+                transformed_normal, 
+                vertex.GetTexCoords(), 
+                vertex.GetColor());
 }
 
 Color Shader::FragmentShader(const Fragment& fragment) const {
diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index f75b29c..a844aa7 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -58,15 +58,14 @@ int main(int argc, char **argv) {
   auto modelMatrix = simple_renderer::Matrix4f(1.0f);
   simple_renderer::Matrix4f scale_matrix =
       glm::scale(simple_renderer::Matrix4f(1.0f),
-                 simple_renderer::Vector3f(7.0f, 7.0f, 7.0f));
+                 simple_renderer::Vector3f(.02f, .02f, .02f));
 
-  // Translation matrix
   simple_renderer::Matrix4f translation_matrix =
       glm::translate(simple_renderer::Matrix4f(1.0f),
-                     simple_renderer::Vector3f(30.0f, 30.0f, 0.0f));
+                     simple_renderer::Vector3f(0.0f, -5.0f, 0.0f));
 
   simple_renderer::Matrix4f rotation_matrix =
-      glm::rotate(simple_renderer::Matrix4f(1.0f), 90.0f,
+      glm::rotate(simple_renderer::Matrix4f(1.0f), glm::radians(-105.0f),
                   simple_renderer::Vector3f(1.0f, 0.0f, 0.0f));
 
   // Combined transformation matrix
@@ -90,7 +89,7 @@ int main(int argc, char **argv) {
     shader.SetUniform("cameraPos", camera.GetPosition());
     shader.SetUniform("viewMatrix", camera.GetViewMatrix());
     shader.SetUniform("projectionMatrix",
-                      camera.GetProjectionMatrix(60.0f, 1.0f, 0.1f, 100.0f));
+                      camera.GetProjectionMatrix(60.0f, float(kWidth)/float(kHeight), 0.1f, 100.0f));
 
     buffer.ClearDrawBuffer(simple_renderer::Color::kBlack);
     for (auto &model : models) {

From 7093d822d7c908d9989be838ac2d03fc15f8cd8d Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Thu, 17 Jul 2025 18:28:58 +0800
Subject: [PATCH 03/24] implement tile-based rasterizer and refactor the
 pipeline to support multi-rendering-mode

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/renderer.h    | 148 ++++++-
 src/rasterizer.cpp        |   2 +-
 src/renderer.cpp          | 818 +++++++++++++++++++++++++++++++-------
 test/system_test/main.cpp |  30 +-
 4 files changed, 851 insertions(+), 147 deletions(-)

diff --git a/src/include/renderer.h b/src/include/renderer.h
index 456010b..2464c19 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -31,6 +31,27 @@
 
 namespace simple_renderer {
 
+// 渲染模式枚举
+enum class RenderingMode {
+  TRADITIONAL,  // 传统光栅化模式 - 立即深度测试
+  TILE_BASED,   // Tile-based光栅化模式 - 移动GPU架构
+  DEFERRED      // 延迟渲染模式 - 经典GPU管线教学模拟
+};
+
+// Face 只包含顶点索引，不包含实际的顶点数据;
+// Vertex 包含3D坐标，但没有屏幕坐标
+// Fragment 包含屏幕坐标，但它是光栅化的结果，不是输入
+struct TriangleInfo {
+  Vertex v0, v1, v2;
+  const Material *material;
+  size_t face_index;
+  TriangleInfo(const Vertex& vertex0, const Vertex& vertex1, const Vertex& vertex2,
+             const Material* mat, size_t face_idx = 0)
+    : v0(vertex0), v1(vertex1), v2(vertex2), material(mat), face_index(face_idx) {}
+    
+  TriangleInfo() = default;
+};
+
 class SimpleRenderer {
  public:
   /**
@@ -53,22 +74,122 @@ class SimpleRenderer {
   virtual ~SimpleRenderer() = default;
   /// @}
 
-  bool Render(const Model &model, const Shader &shader, uint32_t *buffer);
+  /**
+   * 绘制单个模型
+   * @param model 要绘制的模型
+   * @param shader 用于渲染的着色器
+   * @param buffer 输出缓冲区
+   * @return 绘制是否成功
+   */
+  bool DrawModel(const Model &model, const Shader &shader, uint32_t *buffer);
+
+  /**
+   * 设置渲染模式
+   * @param mode 渲染模式（传统或基于Tile）
+   */
+  void SetRenderingMode(RenderingMode mode);
+
+  /**
+   * 获取当前渲染模式
+   * @return 当前渲染模式
+   */
+  RenderingMode GetRenderingMode() const;
 
  private:
   const size_t height_;
   const size_t width_;
   LogSystem log_system_;
+  RenderingMode current_mode_;  // 当前渲染模式
 
   std::shared_ptr<Shader> shader_;
   std::shared_ptr<Rasterizer> rasterizer_;
 
   /**
-   * 绘制模型
+   * 执行绘制管线
    * @param model 模型
+   * @param buffer 输出缓冲区
    */
-  void DrawModel(const Model &model, uint32_t *buffer);
-  void DrawModelSlower(const Model &model, uint32_t *buffer);
+  void ExecuteDrawPipeline(const Model &model, uint32_t *buffer);
+  
+
+  /**
+   * 传统光栅化渲染
+   * @param model 模型
+   * @param processedVertices 已处理的顶点
+   * @param buffer 输出缓冲区
+   * @return 渲染统计信息
+   */
+  struct RenderStats {
+    double buffer_alloc_ms;
+    double rasterization_ms;
+    double merge_ms;
+    double total_ms;
+  };
+  
+  RenderStats ExecuteTraditionalPipeline(const Model &model, 
+                                        const std::vector<Vertex> &processedVertices,
+                                        uint32_t *buffer);
+
+  /**
+   * Tile-based光栅化渲染
+   * @param model 模型
+   * @param processedVertices 已处理的顶点
+   * @param buffer 输出缓冲区
+   * @return 渲染统计信息
+   */
+  struct TileRenderStats {
+    double setup_ms;
+    double binning_ms;
+    double buffer_alloc_ms;
+    double rasterization_ms;
+    double merge_ms;
+    double visualization_ms;
+    double total_ms;
+  };
+  
+  /**
+   * 延迟渲染统计信息
+   */
+  struct DeferredRenderStats {
+    double buffer_alloc_ms;
+    double rasterization_ms;
+    double fragment_collection_ms;
+    double fragment_merge_ms;
+    double deferred_shading_ms;
+    double total_ms;
+  };
+  
+  TileRenderStats ExecuteTileBasedPipeline(const Model &model,
+                                          const std::vector<Vertex> &processedVertices,
+                                          uint32_t *buffer);
+
+  /**
+   * 延迟渲染管线
+   * @param model 模型
+   * @param processedVertices 已处理的顶点
+   * @param buffer 输出缓冲区
+   * @return 渲染统计信息
+   */
+  DeferredRenderStats ExecuteDeferredPipeline(const Model &model,
+                                             const std::vector<Vertex> &processedVertices,
+                                             uint32_t *buffer);
+
+  
+private:
+
+  void TriangleTileBinning(
+    const Model &model, 
+    const std::vector<Vertex> &screenVertices,
+    std::vector<std::vector<TriangleInfo>> &tile_triangles,
+    size_t tiles_x, size_t tiles_y, size_t tile_size);
+
+  void RasterizeTile(
+    size_t tile_id,
+    const std::vector<TriangleInfo> &triangles,
+    size_t tiles_x, size_t tiles_y, size_t tile_size,
+    float* tile_depth_buffer, uint32_t* tile_color_buffer,
+    std::unique_ptr<float[]> &global_depth_buffer,
+    std::unique_ptr<uint32_t[]> &global_color_buffer);
 
   
   /**
@@ -84,6 +205,25 @@ class SimpleRenderer {
    * @return 转换后的顶点(屏幕坐标)
    */
   Vertex ViewportTransformation(const Vertex &vertex);
+  /**
+   * Tile可视化调试函数 - 在渲染结果上绘制tile网格和状态
+   * @param buffer 渲染结果缓冲区
+   * @param tile_triangles 每个tile包含的三角形列表
+   * @param tiles_x X方向tile数量
+   * @param tiles_y Y方向tile数量 
+   * @param tile_size 单个tile的像素大小
+   */
+  void DrawTileVisualization(uint32_t* buffer, 
+      const std::vector<std::vector<TriangleInfo>>& tile_triangles, 
+      size_t tiles_x, size_t tiles_y, size_t tile_size);
+
+  /**
+   * 颜色混合函数 - 用于半透明效果
+   * @param base 基础颜色
+   * @param overlay 叠加颜色(包含alpha通道)
+   * @return 混合后的颜色
+   */
+  uint32_t BlendColors(uint32_t base, uint32_t overlay);
 };
 }  // namespace simple_renderer
 
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 0712f4a..7a8c602 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -116,7 +116,7 @@ std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
 
   return std::pair<bool, const Vector3f>{true, Vector3f(x, y, z)};
 }
-
+ 
 template <typename T>
 T Rasterizer::Interpolate(const T& v0, const T& v1, const T& v2,
                           const Vector3f& barycentric_coord) {
diff --git a/src/renderer.cpp b/src/renderer.cpp
index d433ff1..34866c2 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -19,6 +19,7 @@
 #include <omp.h>
 
 #include <array>
+#include <chrono>
 #include <cstdint>
 #include <limits>
 #include <span>
@@ -35,18 +36,40 @@ namespace simple_renderer {
 SimpleRenderer::SimpleRenderer(size_t width, size_t height)
     : height_(height),
       width_(width),
-      log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)) {
+      log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)),
+      current_mode_(RenderingMode::TRADITIONAL) {  // 默认使用传统渲染模式
   rasterizer_ = std::make_shared<Rasterizer>(width, height);
 }
 
-bool SimpleRenderer::Render(const Model &model, const Shader &shader,
-                            uint32_t *buffer) {
-  SPDLOG_INFO("render model: {}", model.GetModelPath());
+bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader,
+                               uint32_t *buffer) {
+  SPDLOG_INFO("draw model: {}", model.GetModelPath());
   shader_ = std::make_shared<Shader>(shader);
-  DrawModel(model, buffer);
+  ExecuteDrawPipeline(model, buffer);
   return true;
 }
 
+void SimpleRenderer::SetRenderingMode(RenderingMode mode) {
+  current_mode_ = mode;
+  std::string mode_name;
+  switch(mode) {
+    case RenderingMode::TRADITIONAL:
+      mode_name = "TRADITIONAL";
+      break;
+    case RenderingMode::TILE_BASED:
+      mode_name = "TILE_BASED";
+      break;
+    case RenderingMode::DEFERRED:
+      mode_name = "DEFERRED";
+      break;
+  }
+  SPDLOG_INFO("rendering mode set to: {}", mode_name);
+}
+
+RenderingMode SimpleRenderer::GetRenderingMode() const {
+  return current_mode_;
+}
+
 /*
 Optimizes performance by performing depth testing during rasterization, keeping
 only the closest fragment per pixel, and avoiding storing all
@@ -54,10 +77,31 @@ fragments—resulting in faster rendering.
 
 通过在光栅化过程中执行深度测试，仅保留每个像素的深度值最近的片段，避免存储所有片段，从而优化性能，实现更快的渲染。
 */
-void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) {
-  SPDLOG_INFO("draw {}", model.GetModelPath());
+void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
+  std::string mode_name;
+  switch(current_mode_) {
+    case RenderingMode::TRADITIONAL:
+      mode_name = "TRADITIONAL";
+      break;
+    case RenderingMode::TILE_BASED:
+      mode_name = "TILE_BASED";
+      break;
+    case RenderingMode::DEFERRED:
+      mode_name = "DEFERRED";
+      break;
+  }
+  SPDLOG_INFO("execute draw pipeline for {} using {} mode", model.GetModelPath(), mode_name);
+  
+  if (!shader_) {
+    SPDLOG_ERROR("No shader set for DrawModel, cannot render");
+    return;
+  }
+  
+  // === PERFORMANCE TIMING ===
+  auto total_start_time = std::chrono::high_resolution_clock::now();
 
   /* * * Vertex Shader * * */
+  auto vertex_shader_start_time = std::chrono::high_resolution_clock::now();
   std::vector<Vertex> processedVertices;
   std::vector<std::vector<Vertex>> processed_vertices_all_thread(kNProc);
 #pragma omp parallel num_threads(kNProc) default(none) \
@@ -88,93 +132,64 @@ void SimpleRenderer::DrawModel(const Model &model, uint32_t *buffer) {
                              processedVertices_per_thread.begin(),
                              processedVertices_per_thread.end());
   }
-  /*  *  *  *  *  *  *  */
-
-  /* * * Rasterization * * */
-  std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
-  std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
-
-  for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-    depthBuffer_all_thread[thread_id] =
-        std::make_unique<float[]>(width_ * height_);
-    colorBuffer_all_thread[thread_id] =
-        std::make_unique<uint32_t[]>(width_ * height_);
-
-    std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
-                std::numeric_limits<float>::infinity());
-    std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
-  }
-
-#pragma omp parallel num_threads(kNProc) default(none) \ 
-  shared(processedVertices, rasterizer_, shader_, width_, height_, \
-             depthBuffer_all_thread, colorBuffer_all_thread)       \
-    firstprivate(model)
-  {
-    int thread_id = omp_get_thread_num();
-    auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
-    auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
-#pragma omp for
-    for (const auto &f : model.GetFaces()) {
-      auto v0 = processedVertices[f.GetIndex(0)];
-      auto v1 = processedVertices[f.GetIndex(1)];
-      auto v2 = processedVertices[f.GetIndex(2)];
-
-      const Material *material = &f.GetMaterial();
-
-      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
-
-      for (auto &fragment : fragments) {
-        fragment.material = material;
-
-        size_t x = fragment.screen_coord[0];
-        size_t y = fragment.screen_coord[1];
-
-        if (x >= width_ || y >= height_) {
-          continue;
-        }
-
-        size_t index = x + y * width_;
-
-        if (fragment.depth < depthBuffer_per_thread[index]) {
-          depthBuffer_per_thread[index] = fragment.depth;
+  auto vertex_shader_end_time = std::chrono::high_resolution_clock::now();
+  auto vertex_shader_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      vertex_shader_end_time - vertex_shader_start_time);
 
-          /* * * Fragment Shader * * */
-          auto color = shader_->FragmentShader(fragment);
-          colorBuffer_per_thread[index] = uint32_t(color);
-        }
-      }
+  // 根据当前设置的模式选择不同的渲染管线
+  double vertex_ms = vertex_shader_duration.count() / 1000.0;
+  
+  switch (current_mode_) {
+    case RenderingMode::TRADITIONAL: {
+      auto stats = ExecuteTraditionalPipeline(model, processedVertices, buffer);
+      double total_ms = vertex_ms + stats.total_ms;
+      
+      SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ===");
+      SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
+      SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
+      SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
+      SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
+      SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
+      SPDLOG_INFO("==========================================");
+      break;
     }
-  }
-
-  // Merge
-  std::unique_ptr<float[]> depthBuffer =
-      std::make_unique<float[]>(width_ * height_);
-  std::unique_ptr<uint32_t[]> colorBuffer =
-      std::make_unique<uint32_t[]>(width_ * height_);
-
-  std::fill_n(depthBuffer.get(), width_ * height_,
-              std::numeric_limits<float>::infinity());
-  std::fill_n(colorBuffer.get(), width_ * height_, 0);
-
-#pragma omp parallel for
-  for (size_t i = 0; i < width_ * height_; i++) {
-    float min_depth = std::numeric_limits<float>::infinity();
-    uint32_t color = 0;
-
-    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-      float depth = depthBuffer_all_thread[thread_id][i];
-      if (depth < min_depth) {
-        min_depth = depth;
-        color = colorBuffer_all_thread[thread_id][i];
-      }
+    
+    case RenderingMode::TILE_BASED: {
+      auto stats = ExecuteTileBasedPipeline(model, processedVertices, buffer);
+      double total_ms = vertex_ms + stats.total_ms;
+      
+      SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ===");
+      SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
+      SPDLOG_INFO("Setup:            {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100);
+      SPDLOG_INFO("Binning:          {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100);
+      SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
+      SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
+      SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
+      SPDLOG_INFO("Visualization:    {:8.3f} ms ({:5.1f}%)", stats.visualization_ms, stats.visualization_ms/total_ms*100);
+      SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
+      SPDLOG_INFO("==========================================");
+      break;
+    }
+    
+    case RenderingMode::DEFERRED: {
+      auto stats = ExecuteDeferredPipeline(model, processedVertices, buffer);
+      double total_ms = vertex_ms + stats.total_ms;
+      
+      SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ===");
+      SPDLOG_INFO("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
+      SPDLOG_INFO("Buffer Alloc:         {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
+      SPDLOG_INFO("Rasterization:        {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
+      SPDLOG_INFO("Fragment Collection:  {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100);
+      SPDLOG_INFO("Fragment Merge:       {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100);
+      SPDLOG_INFO("Deferred Shading:     {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100);
+      SPDLOG_INFO("Total:                {:8.3f} ms", total_ms);
+      SPDLOG_INFO("=========================================");
+      break;
     }
-    depthBuffer[i] = min_depth;
-    colorBuffer[i] = color;
   }
-
-  std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
 }
 
+
 /*
 Organizes processing to simulate how OpenGL works with GPUs by collecting all
 fragments per pixel before processing, closely mimicking the GPU pipeline but
@@ -182,61 +197,46 @@ leading to increased memory usage and slower performance.
 
 组织处理方式模拟 OpenGL 在 GPU
 上的工作原理，先收集每个像素的所有片段再并行处理屏幕上的每个像素，模仿 GPU
-管线，但导致内存使用增加和渲染速度变慢
-*/
-void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) {
-  SPDLOG_INFO("draw {}", model.GetModelPath());
-
-  /* * * Vertex Shader * * */
-  std::vector<Vertex> processedVertex;
-  std::vector<std::vector<Vertex>> processed_vertices_per_thread(kNProc);
-#pragma omp parallel num_threads(kNProc) default(none) \
-    shared(shader_, processed_vertices_per_thread) firstprivate(model)
-  {
-    int thread_id = omp_get_thread_num();
-    std::vector<Vertex> &local_vertices =
-        processed_vertices_per_thread[thread_id];
-
-#pragma omp for
-    for (const auto &v : model.GetVertices()) {
-      /* * * Vertex Shader * *  */
-      // 顶点着色器：世界坐标 -> 裁剪坐标
-      auto clipSpaceVertex = shader_->VertexShader(v);
-      
-      // 透视除法：裁剪坐标 -> NDC坐标
-      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
-      
-      // 视口变换：NDC坐标 -> 屏幕坐标
-      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
-      
-      local_vertices.push_back(screenSpaceVertex);
-    }
-  }
+管线，但导致内存使用增加和渲染速度变慢。
 
-  for (const auto &local_vertices : processed_vertices_per_thread) {
-    processedVertex.insert(processedVertex.end(), local_vertices.begin(),
-                           local_vertices.end());
-  }
+现在作为延迟渲染管线的一部分，用于教学演示经典GPU管线概念。
+*/
+SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
+    const Model &model,
+    const std::vector<Vertex> &processedVertices,
+    uint32_t *buffer) {
+    
+  DeferredRenderStats stats;
+  SPDLOG_INFO("execute deferred pipeline for {}", model.GetModelPath());
   /*  *  *  *  *  *  *  */
 
   /* * * Rasterization * * */
   std::vector<std::vector<std::vector<Fragment>>> fragmentsBuffer_all_thread(
       kNProc, std::vector<std::vector<Fragment>>(width_ * height_));
 
+  // 预先缓存所有Material数据，避免指针悬垂问题
+  std::vector<Material> material_cache;
+  material_cache.reserve(model.GetFaces().size());
+  for (const auto &f : model.GetFaces()) {
+    material_cache.push_back(f.GetMaterial()); // 值拷贝
+  }
+  SPDLOG_INFO("cached {} materials for deferred rendering", material_cache.size());
+
 #pragma omp parallel num_threads(kNProc) default(none)                       \
-    shared(processedVertex, fragmentsBuffer_all_thread, rasterizer_, width_, \
-               height_) firstprivate(model)
+    shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \
+               height_, material_cache) firstprivate(model)
   {
     int thread_id = omp_get_thread_num();
     auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id];
 
 #pragma omp for
-    for (const auto &f : model.GetFaces()) {
-      auto v0 = processedVertex[f.GetIndex(0)];
-      auto v1 = processedVertex[f.GetIndex(1)];
-      auto v2 = processedVertex[f.GetIndex(2)];
+    for (size_t face_idx = 0; face_idx < model.GetFaces().size(); ++face_idx) {
+      const auto &f = model.GetFaces()[face_idx];
+      auto v0 = processedVertices[f.GetIndex(0)];
+      auto v1 = processedVertices[f.GetIndex(1)];
+      auto v2 = processedVertices[f.GetIndex(2)];
 
-      const Material *material = &f.GetMaterial();
+      const Material *material = &material_cache[face_idx]; // 使用缓存的Material
 
       auto fragments = rasterizer_->Rasterize(v0, v1, v2);
 
@@ -268,7 +268,7 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) {
 /*  *  *  *  *  *  *  */
 
 /* * * Fragment Shader * * */
-#pragma omp parallel for
+// #pragma omp parallel for
   for (size_t i = 0; i < fragmentsBuffer.size(); i++) {
     const auto &fragments = fragmentsBuffer[i];
     if (fragments.empty()) {
@@ -283,11 +283,26 @@ void SimpleRenderer::DrawModelSlower(const Model &model, uint32_t *buffer) {
     }
 
     if (renderFragment) {
+      // 添加Material指针有效性检查
+      if (renderFragment->material == nullptr) {
+        SPDLOG_ERROR("Fragment material is nullptr at pixel {}", i);
+        continue;
+      }
       auto color = shader_->FragmentShader(*renderFragment);
       buffer[i] = uint32_t(color);
     }
   }
   /*  *  *  *  *  *  *  */
+  
+  // 填充基本统计信息（延迟渲染模式主要用于教学演示）
+  stats.buffer_alloc_ms = 0.0;
+  stats.rasterization_ms = 0.0;
+  stats.fragment_collection_ms = 0.0;
+  stats.fragment_merge_ms = 0.0;
+  stats.deferred_shading_ms = 0.0;
+  stats.total_ms = 0.0;
+  
+  return stats;
 }
 
 Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
@@ -295,7 +310,6 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
   
   // 检查w分量，避免除零和负数问题
   if (position.w <= 1e-6f) {
-    SPDLOG_DEBUG("PerspectiveDivision: w <= 1e-6f");
     Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f);
     return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
   }
@@ -311,9 +325,8 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
     1.0f / original_w         // 保存1/w用于透视矫正插值
   );
   
-  // 严格限制NDC坐标在标准范围内
-  ndcPosition.x = std::clamp(ndcPosition.x, -1.0f, 1.0f);
-  ndcPosition.y = std::clamp(ndcPosition.y, -1.0f, 1.0f);
+  // 只对Z坐标进行深度范围限制，X和Y允许超出以支持屏幕外三角形
+  // 这些坐标在后续的视口变换和裁剪阶段会被正确处理
   ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f);
   
   // 创建新的顶点，保持其他属性不变
@@ -327,10 +340,6 @@ Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) {
   float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f;
   float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f;
   
-  // 额外的屏幕坐标边界保护
-  screen_x = std::clamp(screen_x, 0.0f, static_cast<float>(width_ - 1));
-  screen_y = std::clamp(screen_y, 0.0f, static_cast<float>(height_ - 1));
-  
   Vector4f screenPosition(
     screen_x,                    // x: 屏幕坐标
     screen_y,                    // y: 屏幕坐标
@@ -341,4 +350,537 @@ Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) {
   return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
 }
 
+
+
+
+// Triangle-Tile binning函数 - 修正版本
+void SimpleRenderer::TriangleTileBinning(
+    const Model &model, 
+    const std::vector<Vertex> &screenVertices,
+    std::vector<std::vector<TriangleInfo>> &tile_triangles,
+    size_t tiles_x, size_t tiles_y, size_t tile_size) {
+    
+    size_t total_triangles = model.GetFaces().size();
+    size_t processed_triangles = 0;
+    size_t clipped_triangles = 0;
+    size_t triangles_with_clipped_vertices = 0;
+    
+    SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles);
+    SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", 
+                width_, height_, tile_size, tiles_x, tiles_y);
+    
+    for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) {
+        const auto &f = model.GetFaces()[tri_idx];
+        auto v0 = screenVertices[f.GetIndex(0)];
+        auto v1 = screenVertices[f.GetIndex(1)];
+        auto v2 = screenVertices[f.GetIndex(2)];
+        
+        // 获取屏幕空间坐标（现在已经是屏幕坐标了）
+        Vector4f pos0 = v0.GetPosition();
+        Vector4f pos1 = v1.GetPosition();
+        Vector4f pos2 = v2.GetPosition();
+        
+        // 检查三角形是否有被裁剪的顶点（坐标为-1000的表示被裁剪）
+        bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f);
+        
+        if (has_clipped_vertex) {
+            triangles_with_clipped_vertices++;
+            if (triangles_with_clipped_vertices <= 3) {
+                SPDLOG_INFO("Triangle {} has clipped vertices:", tri_idx);
+                SPDLOG_INFO("  V0: ({:.1f},{:.1f}) V1: ({:.1f},{:.1f}) V2: ({:.1f},{:.1f})", 
+                           pos0.x, pos0.y, pos1.x, pos1.y, pos2.x, pos2.y);
+            }
+            continue;
+        }
+        
+        // 直接使用屏幕空间坐标
+        float screen_x0 = pos0.x;
+        float screen_y0 = pos0.y;
+        float screen_x1 = pos1.x;
+        float screen_y1 = pos1.y;
+        float screen_x2 = pos2.x;
+        float screen_y2 = pos2.y;
+        
+        // 计算bounding box
+        float min_x = std::min({screen_x0, screen_x1, screen_x2});
+        float max_x = std::max({screen_x0, screen_x1, screen_x2});
+        float min_y = std::min({screen_y0, screen_y1, screen_y2});
+        float max_y = std::max({screen_y0, screen_y1, screen_y2});
+        
+        // 调试前几个有效三角形的坐标范围
+        if (processed_triangles < 3) {
+            SPDLOG_INFO("Triangle {} coordinates:", tri_idx);
+            SPDLOG_INFO("  Screen coords: ({:.1f},{:.1f}) ({:.1f},{:.1f}) ({:.1f},{:.1f})", 
+                       screen_x0, screen_y0, screen_x1, screen_y1, screen_x2, screen_y2);
+            SPDLOG_INFO("  BBox: min({:.1f},{:.1f}) max({:.1f},{:.1f})", 
+                       min_x, min_y, max_x, max_y);
+        }
+        
+        // 临时：大幅放宽屏幕边界检查，让超出屏幕的三角形也能处理
+        if (max_x < -5000.0f || min_x >= width_ + 5000.0f || 
+            max_y < -5000.0f || min_y >= height_ + 5000.0f) {
+            clipped_triangles++;
+            if (processed_triangles < 3) {
+                SPDLOG_INFO("  -> CLIPPED by screen bounds");
+            }
+            continue;
+        }
+        
+        // 计算影响的tile范围
+        int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
+        int end_tile_x = std::min(static_cast<int>(tiles_x - 1), 
+                                 static_cast<int>(max_x) / static_cast<int>(tile_size));
+        int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
+        int end_tile_y = std::min(static_cast<int>(tiles_y - 1), 
+                                 static_cast<int>(max_y) / static_cast<int>(tile_size));
+        
+        // 添加三角形到相关tiles（多个三角形可能会映射到同一个tile当中，所以谨慎并行化）
+        if (start_tile_x <= end_tile_x && start_tile_y <= end_tile_y) {
+            TriangleInfo triangle_info = {v0, v1, v2, &f.GetMaterial(), processed_triangles};
+            
+            for (int ty = start_tile_y; ty <= end_tile_y; ty++) {
+                for (int tx = start_tile_x; tx <= end_tile_x; tx++) {
+                    size_t tile_id = ty * tiles_x + tx;
+                    tile_triangles[tile_id].push_back(triangle_info); // 可能多个线程同时pushback的话有风险
+                }
+            }
+            processed_triangles++;
+            
+            // 输出前几个成功添加的三角形信息
+            if (processed_triangles <= 3) {
+                SPDLOG_INFO("  -> SUCCESSFULLY ADDED to tiles x[{}..{}] y[{}..{}]", 
+                           start_tile_x, end_tile_x, start_tile_y, end_tile_y);
+            }
+        } else {
+            if (processed_triangles < 3) {
+                SPDLOG_INFO("  -> FAILED tile calculation: x[{}..{}] y[{}..{}]", 
+                           start_tile_x, end_tile_x, start_tile_y, end_tile_y);
+            }
+        }
+    }
+    
+    // 输出统计信息
+    SPDLOG_INFO("Triangle-Tile binning completed:");
+    SPDLOG_INFO("  Total triangles: {}", total_triangles);
+    SPDLOG_INFO("  Triangles with clipped vertices: {}", triangles_with_clipped_vertices);
+    SPDLOG_INFO("  Processed triangles: {}", processed_triangles);
+    SPDLOG_INFO("  Clipped by screen bounds: {}", clipped_triangles);
+    
+    size_t total_triangle_refs = 0;
+    size_t non_empty_tiles = 0;
+    for (const auto& tile : tile_triangles) {
+        total_triangle_refs += tile.size();
+        if (!tile.empty()) non_empty_tiles++;
+    }
+    
+    SPDLOG_INFO("  Total triangle references: {}", total_triangle_refs);
+    SPDLOG_INFO("  Non-empty tiles: {}", non_empty_tiles);
+    SPDLOG_INFO("  Average triangles per tile: {:.2f}", 
+                total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f);
+}
+
+// 单个tile光栅化函数
+void SimpleRenderer::RasterizeTile(
+    size_t tile_id,
+    const std::vector<TriangleInfo> &triangles,
+    size_t tiles_x, size_t tiles_y, size_t tile_size,
+    float* tile_depth_buffer, uint32_t* tile_color_buffer,
+    std::unique_ptr<float[]> &global_depth_buffer,
+    std::unique_ptr<uint32_t[]> &global_color_buffer) {
+  // 计算tile在屏幕空间的范围
+  size_t tile_x = tile_id % tiles_x;
+  size_t tile_y = tile_id / tiles_x;
+  size_t screen_x_start = tile_x * tile_size;
+  size_t screen_y_start = tile_y * tile_size;
+  size_t screen_x_end = std::min(screen_x_start + tile_size, width_);
+  size_t screen_y_end = std::min(screen_y_start + tile_size, height_);
+    
+  // 初始化tile缓冲区
+  size_t tile_width = screen_x_end - screen_x_start;
+  size_t tile_height = screen_y_end - screen_y_start;
+  std::fill_n(tile_depth_buffer, tile_width * tile_height,
+              std::numeric_limits<float>::infinity());
+  std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
+    
+  // 在tile内光栅化所有三角形
+  for (const auto &triangle : triangles) {
+    auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2);
+        
+    for (auto &fragment : fragments) {
+      fragment.material = triangle.material;
+            
+      size_t screen_x = fragment.screen_coord[0];
+      size_t screen_y = fragment.screen_coord[1];
+            
+      // 检查fragment是否在当前tile内
+      if (screen_x >= screen_x_start && screen_x < screen_x_end &&
+          screen_y >= screen_y_start && screen_y < screen_y_end) {
+                
+        size_t tile_local_x = screen_x - screen_x_start;
+        size_t tile_local_y = screen_y - screen_y_start;
+        size_t tile_index = tile_local_x + tile_local_y * tile_width;
+                
+        // tile内深度测试
+        if (fragment.depth < tile_depth_buffer[tile_index]) {
+          tile_depth_buffer[tile_index] = fragment.depth;
+                    
+          auto color = shader_->FragmentShader(fragment);
+          tile_color_buffer[tile_index] = uint32_t(color);
+        }
+          }
+    }
+  }
+    
+  // 将tile结果写入全局缓冲区
+  for (size_t y = 0; y < tile_height; y++) {
+    for (size_t x = 0; x < tile_width; x++) {
+      size_t tile_index = x + y * tile_width;
+      size_t global_index = (screen_x_start + x) + (screen_y_start + y) * width_;
+            
+      if (tile_depth_buffer[tile_index] < global_depth_buffer[global_index]) {
+        global_depth_buffer[global_index] = tile_depth_buffer[tile_index];
+        global_color_buffer[global_index] = tile_color_buffer[tile_index];
+      }
+    }
+  }
+}
+
+// Tile可视化调试函数，这里用于固定大小的tiles
+void SimpleRenderer::DrawTileVisualization(uint32_t* buffer, 
+    const std::vector<std::vector<TriangleInfo>>& tile_triangles, 
+    size_t tiles_x, size_t tiles_y, size_t tile_size) {
+    
+    SPDLOG_INFO("=== TILE VISUALIZATION DEBUG ===");
+    SPDLOG_INFO("Drawing tile grid overlay for debugging");
+    
+    // 颜色定义 (ABGR格式)
+    const uint32_t GRID_COLOR = 0xFF00FF00;      // 绿色网格线
+    const uint32_t NONEMPTY_COLOR = 0x4000FFFF;  // 半透明黄色背景 (非空tile)
+    const uint32_t EMPTY_COLOR = 0x20FF0000;     // 半透明蓝色背景 (空tile)
+    
+    // 1. 为非空tiles添加背景色
+    for (size_t tile_y = 0; tile_y < tiles_y; tile_y++) {
+        for (size_t tile_x = 0; tile_x < tiles_x; tile_x++) {
+            size_t tile_id = tile_y * tiles_x + tile_x;
+            bool is_empty = tile_triangles[tile_id].empty();
+            
+            // 计算tile在屏幕上的像素范围
+            size_t pixel_start_x = tile_x * tile_size;
+            size_t pixel_end_x = std::min(pixel_start_x + tile_size, static_cast<size_t>(width_));
+            size_t pixel_start_y = tile_y * tile_size;
+            size_t pixel_end_y = std::min(pixel_start_y + tile_size, static_cast<size_t>(height_));
+            
+            uint32_t bg_color = is_empty ? EMPTY_COLOR : NONEMPTY_COLOR;
+            
+            // 给tile添加半透明背景
+            for (size_t y = pixel_start_y; y < pixel_end_y; y++) {
+                for (size_t x = pixel_start_x; x < pixel_end_x; x++) {
+                    size_t pixel_idx = y * static_cast<size_t>(width_) + x;
+                    // 简单的alpha混合：将背景色与原色混合
+                    uint32_t original = buffer[pixel_idx];
+                    buffer[pixel_idx] = BlendColors(original, bg_color);
+                }
+            }
+            
+            // 记录非空tile的信息
+            if (!is_empty) {
+                SPDLOG_INFO("Non-empty Tile[{},{}] (ID:{}): {} triangles", 
+                           tile_x, tile_y, tile_id, tile_triangles[tile_id].size());
+            }
+        }
+    }
+    
+    // 2. 绘制网格线
+    // 垂直线
+    for (size_t tile_x = 0; tile_x <= tiles_x; tile_x++) {
+        size_t pixel_x = tile_x * tile_size;
+        if (pixel_x < static_cast<size_t>(width_)) {
+            for (size_t y = 0; y < static_cast<size_t>(height_); y++) {
+                buffer[y * static_cast<size_t>(width_) + pixel_x] = GRID_COLOR;
+            }
+        }
+    }
+    
+    // 水平线
+    for (size_t tile_y = 0; tile_y <= tiles_y; tile_y++) {
+        size_t pixel_y = tile_y * tile_size;
+        if (pixel_y < static_cast<size_t>(height_)) {
+            for (size_t x = 0; x < static_cast<size_t>(width_); x++) {
+                buffer[pixel_y * static_cast<size_t>(width_) + x] = GRID_COLOR;
+            }
+        }
+    }
+    
+    SPDLOG_INFO("Tile visualization completed - Green:Grid, Yellow:NonEmpty, Blue:Empty");
+    SPDLOG_INFO("=====================================");
+}
+
+// 简单的颜色混合函数 (alpha blending)
+uint32_t SimpleRenderer::BlendColors(uint32_t base, uint32_t overlay) {
+    // 提取RGBA通道 (假设是ABGR格式)
+    uint8_t base_r = (base >> 16) & 0xFF;
+    uint8_t base_g = (base >> 8) & 0xFF;
+    uint8_t base_b = base & 0xFF;
+    
+    uint8_t overlay_r = (overlay >> 16) & 0xFF;
+    uint8_t overlay_g = (overlay >> 8) & 0xFF;
+    uint8_t overlay_b = overlay & 0xFF;
+    uint8_t overlay_a = (overlay >> 24) & 0xFF;
+    
+    // 简单的alpha混合
+    float alpha = overlay_a / 255.0f;
+    uint8_t result_r = (uint8_t)(base_r * (1.0f - alpha) + overlay_r * alpha);
+    uint8_t result_g = (uint8_t)(base_g * (1.0f - alpha) + overlay_g * alpha);
+    uint8_t result_b = (uint8_t)(base_b * (1.0f - alpha) + overlay_b * alpha);
+    
+    return 0xFF000000 | (result_r << 16) | (result_g << 8) | result_b;
+}
+
+// 传统光栅化管线实现
+SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline(
+    const Model &model, 
+    const std::vector<Vertex> &processedVertices,
+    uint32_t *buffer) {
+    
+    RenderStats stats;
+    auto total_start_time = std::chrono::high_resolution_clock::now();
+    
+    // 1. 为每个线程创建framebuffer
+    auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
+    std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
+    std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
+    
+    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+        depthBuffer_all_thread[thread_id] = 
+            std::make_unique<float[]>(width_ * height_);
+        colorBuffer_all_thread[thread_id] = 
+            std::make_unique<uint32_t[]>(width_ * height_);
+        
+        std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
+                    std::numeric_limits<float>::infinity());
+        std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
+    }
+    auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
+    auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        buffer_alloc_end_time - buffer_alloc_start_time);
+    
+    // 2. 并行光栅化
+    auto raster_start_time = std::chrono::high_resolution_clock::now();
+#pragma omp parallel num_threads(kNProc) default(none) \
+    shared(processedVertices, rasterizer_, shader_, width_, height_, \
+           depthBuffer_all_thread, colorBuffer_all_thread) \
+    firstprivate(model)
+    {
+        int thread_id = omp_get_thread_num();
+        auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
+        auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
+        
+#pragma omp for
+        for (const auto &f : model.GetFaces()) {
+            auto v0 = processedVertices[f.GetIndex(0)];
+            auto v1 = processedVertices[f.GetIndex(1)];
+            auto v2 = processedVertices[f.GetIndex(2)];
+
+            const Material *material = &f.GetMaterial();
+            auto fragments = rasterizer_->Rasterize(v0, v1, v2);
+
+            for (auto &fragment : fragments) {
+                fragment.material = material;
+                size_t x = fragment.screen_coord[0];
+                size_t y = fragment.screen_coord[1];
+
+                if (x >= width_ || y >= height_) {
+                    continue;
+                }
+
+                size_t index = x + y * width_;
+                if (fragment.depth < depthBuffer_per_thread[index]) {
+                    depthBuffer_per_thread[index] = fragment.depth;
+                    auto color = shader_->FragmentShader(fragment);
+                    colorBuffer_per_thread[index] = uint32_t(color);
+                }
+            }
+        }
+    }
+    auto raster_end_time = std::chrono::high_resolution_clock::now();
+    auto raster_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        raster_end_time - raster_start_time);
+    
+    // 3. 合并结果
+    auto merge_start_time = std::chrono::high_resolution_clock::now();
+    std::unique_ptr<float[]> depthBuffer = 
+        std::make_unique<float[]>(width_ * height_);
+    std::unique_ptr<uint32_t[]> colorBuffer = 
+        std::make_unique<uint32_t[]>(width_ * height_);
+
+    std::fill_n(depthBuffer.get(), width_ * height_,
+                std::numeric_limits<float>::infinity());
+    std::fill_n(colorBuffer.get(), width_ * height_, 0);
+
+#pragma omp parallel for
+    for (size_t i = 0; i < width_ * height_; i++) {
+        float min_depth = std::numeric_limits<float>::infinity();
+        uint32_t color = 0;
+
+        for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+            float depth = depthBuffer_all_thread[thread_id][i];
+            if (depth < min_depth) {
+                min_depth = depth;
+                color = colorBuffer_all_thread[thread_id][i];
+            }
+        }
+        depthBuffer[i] = min_depth;
+        colorBuffer[i] = color;
+    }
+
+    std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
+    auto merge_end_time = std::chrono::high_resolution_clock::now();
+    auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        merge_end_time - merge_start_time);
+    
+    auto total_end_time = std::chrono::high_resolution_clock::now();
+    auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        total_end_time - total_start_time);
+    
+    // 填充统计信息
+    stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
+    stats.rasterization_ms = raster_duration.count() / 1000.0;
+    stats.merge_ms = merge_duration.count() / 1000.0;
+    stats.total_ms = total_duration.count() / 1000.0;
+    
+    return stats;
+}
+
+// Tile-based光栅化管线实现
+SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
+    const Model &model,
+    const std::vector<Vertex> &processedVertices,
+    uint32_t *buffer) {
+    
+    TileRenderStats stats;
+    auto total_start_time = std::chrono::high_resolution_clock::now();
+    
+    // 1. Setup阶段
+    auto setup_start_time = std::chrono::high_resolution_clock::now();
+    const size_t TILE_SIZE = 64; // 64x64 pixels per tile
+    const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE;
+    const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE;
+    const size_t total_tiles = tiles_x * tiles_y;
+    
+    // 为每个tile创建三角形列表
+    std::vector<std::vector<TriangleInfo>> tile_triangles(total_tiles);
+    auto setup_end_time = std::chrono::high_resolution_clock::now();
+    auto setup_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        setup_end_time - setup_start_time);
+    
+    // 2. Triangle-Tile binning阶段
+    auto binning_start_time = std::chrono::high_resolution_clock::now();
+    TriangleTileBinning(model, processedVertices, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
+    auto binning_end_time = std::chrono::high_resolution_clock::now();
+    auto binning_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        binning_end_time - binning_start_time);
+    
+    // 3. 为每个线程创建framebuffer
+    auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
+    std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
+    std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
+    
+    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+        depthBuffer_all_thread[thread_id] = 
+            std::make_unique<float[]>(width_ * height_);
+        colorBuffer_all_thread[thread_id] = 
+            std::make_unique<uint32_t[]>(width_ * height_);
+            
+        std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
+                    std::numeric_limits<float>::infinity());
+        std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
+    }
+    auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
+    auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        buffer_alloc_end_time - buffer_alloc_start_time);
+    
+    // 4. 并行处理每个tile
+    auto rasterization_start_time = std::chrono::high_resolution_clock::now();
+#pragma omp parallel num_threads(kNProc) default(none) \
+    shared(tile_triangles, rasterizer_, shader_, width_, height_, \
+           depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles)
+    {
+        int thread_id = omp_get_thread_num();
+        auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
+        auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
+
+        // 为当前线程创建tile局部缓冲区
+        std::unique_ptr<float[]> tile_depth_buffer = 
+            std::make_unique<float[]>(TILE_SIZE * TILE_SIZE);
+        std::unique_ptr<uint32_t[]> tile_color_buffer = 
+            std::make_unique<uint32_t[]>(TILE_SIZE * TILE_SIZE);
+
+#pragma omp for
+        for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) {
+            // 按照tile进行光栅化
+            RasterizeTile(tile_id, tile_triangles[tile_id], 
+                         tiles_x, tiles_y, TILE_SIZE,
+                         tile_depth_buffer.get(), tile_color_buffer.get(),
+                         depthBuffer_per_thread, colorBuffer_per_thread);
+        }
+    }
+    auto rasterization_end_time = std::chrono::high_resolution_clock::now();
+    auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        rasterization_end_time - rasterization_start_time);
+    
+    // 5. 合并所有线程结果
+    auto merge_start_time = std::chrono::high_resolution_clock::now();
+    std::unique_ptr<float[]> depthBuffer = 
+        std::make_unique<float[]>(width_ * height_);
+    std::unique_ptr<uint32_t[]> colorBuffer = 
+        std::make_unique<uint32_t[]>(width_ * height_);
+
+    std::fill_n(depthBuffer.get(), width_ * height_,
+                std::numeric_limits<float>::infinity());
+    std::fill_n(colorBuffer.get(), width_ * height_, 0);
+
+#pragma omp parallel for
+    for (size_t i = 0; i < width_ * height_; i++) {
+        float min_depth = std::numeric_limits<float>::infinity();
+        uint32_t color = 0;
+
+        for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+            float depth = depthBuffer_all_thread[thread_id][i];
+            if (depth < min_depth) {
+                min_depth = depth;
+                color = colorBuffer_all_thread[thread_id][i];
+            }
+        }
+        depthBuffer[i] = min_depth;
+        colorBuffer[i] = color;
+    }
+
+    std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
+    auto merge_end_time = std::chrono::high_resolution_clock::now();
+    auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        merge_end_time - merge_start_time);
+    
+    // 6. Tile可视化调试
+    auto visualization_start_time = std::chrono::high_resolution_clock::now();
+    DrawTileVisualization(buffer, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
+    auto visualization_end_time = std::chrono::high_resolution_clock::now();
+    auto visualization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        visualization_end_time - visualization_start_time);
+    
+    auto total_end_time = std::chrono::high_resolution_clock::now();
+    auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        total_end_time - total_start_time);
+    
+    // 填充统计信息
+    stats.setup_ms = setup_duration.count() / 1000.0;
+    stats.binning_ms = binning_duration.count() / 1000.0;
+    stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
+    stats.rasterization_ms = rasterization_duration.count() / 1000.0;
+    stats.merge_ms = merge_duration.count() / 1000.0;
+    stats.visualization_ms = visualization_duration.count() / 1000.0;
+    stats.total_ms = total_duration.count() / 1000.0;
+    
+    return stats;
+}
+
 }  // namespace simple_renderer
diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index a844aa7..58c70ad 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -56,6 +56,7 @@ int main(int argc, char **argv) {
   }
 
   auto modelMatrix = simple_renderer::Matrix4f(1.0f);
+  
   simple_renderer::Matrix4f scale_matrix =
       glm::scale(simple_renderer::Matrix4f(1.0f),
                  simple_renderer::Vector3f(.02f, .02f, .02f));
@@ -68,8 +69,7 @@ int main(int argc, char **argv) {
       glm::rotate(simple_renderer::Matrix4f(1.0f), glm::radians(-105.0f),
                   simple_renderer::Vector3f(1.0f, 0.0f, 0.0f));
 
-  // Combined transformation matrix
-  modelMatrix = scale_matrix * translation_matrix * rotation_matrix;
+  modelMatrix = scale_matrix* translation_matrix * rotation_matrix ;
 
   simple_renderer::Shader shader;
   shader.SetUniform("modelMatrix", modelMatrix);
@@ -80,6 +80,28 @@ int main(int argc, char **argv) {
 
   simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f));
 
+  // 设置渲染模式（可选：TRADITIONAL、TILE_BASED 或 DEFERRED）
+  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED);
+  
+  // 输出当前渲染模式
+  std::string current_mode_name;
+  switch(simple_renderer.GetRenderingMode()) {
+    case simple_renderer::RenderingMode::TRADITIONAL:
+      current_mode_name = "TRADITIONAL (传统光栅化)";
+      break;
+    case simple_renderer::RenderingMode::TILE_BASED:
+      current_mode_name = "TILE_BASED (基于Tile光栅化)";
+      break;
+    case simple_renderer::RenderingMode::DEFERRED:
+      current_mode_name = "DEFERRED (模仿GPU的延迟渲染)";
+      break;
+  }
+  SPDLOG_INFO("当前渲染模式: {}", current_mode_name);
+  
+  // 可以在这里添加模式切换的示例：
+  // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
+  // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED);
+
   auto display = Display(kWidth, kHeight);
   display.loopBegin();
 
@@ -89,11 +111,11 @@ int main(int argc, char **argv) {
     shader.SetUniform("cameraPos", camera.GetPosition());
     shader.SetUniform("viewMatrix", camera.GetViewMatrix());
     shader.SetUniform("projectionMatrix",
-                      camera.GetProjectionMatrix(60.0f, float(kWidth)/float(kHeight), 0.1f, 100.0f));
+                      camera.GetProjectionMatrix(60.0f, static_cast<float>(kWidth) / static_cast<float>(kHeight), 0.1f, 100.0f));
 
     buffer.ClearDrawBuffer(simple_renderer::Color::kBlack);
     for (auto &model : models) {
-      simple_renderer.Render(model, shader, buffer.GetDrawBuffer());
+      simple_renderer.DrawModel(model, shader, buffer.GetDrawBuffer());
     }
 
     buffer.SwapBuffer();

From 8d58a84c4d5222091d276322ee1343019dbb860e Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Fri, 29 Aug 2025 15:53:55 +0800
Subject: [PATCH 04/24] Add Performance Profiling for Deferred Pipeline. Remove
 detailed debug code for TBR.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/vertex.hpp    |  18 +++++-
 src/renderer.cpp          | 116 ++++++++++++++++++++++++++++++++------
 src/shader.cpp            |  10 +++-
 test/system_test/main.cpp |   6 +-
 4 files changed, 125 insertions(+), 25 deletions(-)

diff --git a/src/include/vertex.hpp b/src/include/vertex.hpp
index 975abd0..bff0680 100644
--- a/src/include/vertex.hpp
+++ b/src/include/vertex.hpp
@@ -34,7 +34,15 @@ class Vertex {
   // Constructor with parameters 带参数的构造函数
   explicit Vertex(const Vector4f& pos, const Vector3f& norm,
                   const Vector2f& tex, const Color& color_)
-      : position_(pos), normal_(norm), texCoords_(tex), color_(color_) {}
+      : position_(pos), normal_(norm), texCoords_(tex), color_(color_),
+        clip_position_(pos), has_clip_position_(false) {}
+        
+  // 扩展构造函数：包含裁剪空间坐标
+  explicit Vertex(const Vector4f& pos, const Vector3f& norm,
+                  const Vector2f& tex, const Color& color_,
+                  const Vector4f& clip_pos)
+      : position_(pos), normal_(norm), texCoords_(tex), color_(color_),
+        clip_position_(clip_pos), has_clip_position_(true) {}
 
   // Transform the vertex with a matrix     使用矩阵变换顶点
   void transform(const Matrix4f& matrix) { position_ = matrix * position_; }
@@ -45,12 +53,20 @@ class Vertex {
   [[nodiscard]] inline Vector3f GetNormal() const { return normal_; }
   [[nodiscard]] inline Vector2f GetTexCoords() const { return texCoords_; }
   [[nodiscard]] inline Color GetColor() const { return color_; }
+  
+  // 扩展坐标访问
+  [[nodiscard]] inline Vector4f GetClipPosition() const { return clip_position_; }
+  [[nodiscard]] inline bool HasClipPosition() const { return has_clip_position_; }
 
  private:
   Vector4f position_;   // 3D position, 3D顶点坐标
   Vector3f normal_;     // Normal vector, 顶点法向量
   Vector2f texCoords_;  // Texture coordinates, 顶点纹理坐标
   Color color_;
+  
+  // 扩展坐标用于裁剪优化
+  Vector4f clip_position_; // 裁剪空间坐标 (用于视锥体裁剪)
+  bool has_clip_position_; // 是否包含裁剪坐标
 };
 
 inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) {
diff --git a/src/renderer.cpp b/src/renderer.cpp
index 34866c2..a7bc226 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -37,7 +37,7 @@ SimpleRenderer::SimpleRenderer(size_t width, size_t height)
     : height_(height),
       width_(width),
       log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)),
-      current_mode_(RenderingMode::TRADITIONAL) {  // 默认使用传统渲染模式
+      current_mode_(RenderingMode::TILE_BASED) {
   rasterizer_ = std::make_shared<Rasterizer>(width, height);
 }
 
@@ -207,10 +207,12 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
     uint32_t *buffer) {
     
   DeferredRenderStats stats;
+  auto total_start_time = std::chrono::high_resolution_clock::now();
   SPDLOG_INFO("execute deferred pipeline for {}", model.GetModelPath());
   /*  *  *  *  *  *  *  */
 
-  /* * * Rasterization * * */
+  /* * * Buffer Allocation * * */
+  auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
   std::vector<std::vector<std::vector<Fragment>>> fragmentsBuffer_all_thread(
       kNProc, std::vector<std::vector<Fragment>>(width_ * height_));
 
@@ -220,8 +222,13 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
   for (const auto &f : model.GetFaces()) {
     material_cache.push_back(f.GetMaterial()); // 值拷贝
   }
+  auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
+  auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      buffer_alloc_end_time - buffer_alloc_start_time);
   SPDLOG_INFO("cached {} materials for deferred rendering", material_cache.size());
 
+  /* * * Rasterization * * */
+  auto rasterization_start_time = std::chrono::high_resolution_clock::now();
 #pragma omp parallel num_threads(kNProc) default(none)                       \
     shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \
                height_, material_cache) firstprivate(model)
@@ -255,8 +262,13 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
       }
     }
   }
+  auto rasterization_end_time = std::chrono::high_resolution_clock::now();
+  auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      rasterization_end_time - rasterization_start_time);
+  /*  *  *  *  *  *  *  */
 
-  // Merge fragments
+  /* * * Fragment Collection * * */
+  auto fragment_collection_start_time = std::chrono::high_resolution_clock::now();
   std::vector<std::vector<Fragment>> fragmentsBuffer(width_ * height_);
   for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) {
     for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) {
@@ -265,10 +277,17 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
                                 fragmentsBuffer_per_thread[i].end());
     }
   }
-/*  *  *  *  *  *  *  */
+  auto fragment_collection_end_time = std::chrono::high_resolution_clock::now();
+  auto fragment_collection_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      fragment_collection_end_time - fragment_collection_start_time);
+  /*  *  *  *  *  *  *  */
 
-/* * * Fragment Shader * * */
-// #pragma omp parallel for
+  /* * * Fragment Merge & Deferred Shading * * */
+  auto fragment_merge_start_time = std::chrono::high_resolution_clock::now();
+  
+  // Fragment Merge阶段：深度测试选择最近片段
+  std::vector<const Fragment*> selected_fragments(width_ * height_, nullptr);
+  #pragma omp parallel for
   for (size_t i = 0; i < fragmentsBuffer.size(); i++) {
     const auto &fragments = fragmentsBuffer[i];
     if (fragments.empty()) {
@@ -281,7 +300,17 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
         renderFragment = &fragment;
       }
     }
-
+    selected_fragments[i] = renderFragment;
+  }
+  auto fragment_merge_end_time = std::chrono::high_resolution_clock::now();
+  auto fragment_merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      fragment_merge_end_time - fragment_merge_start_time);
+  
+  // Deferred Shading阶段：执行片段着色器
+  auto deferred_shading_start_time = std::chrono::high_resolution_clock::now();
+#pragma omp parallel for
+  for (size_t i = 0; i < selected_fragments.size(); i++) {
+    const Fragment *renderFragment = selected_fragments[i];
     if (renderFragment) {
       // 添加Material指针有效性检查
       if (renderFragment->material == nullptr) {
@@ -292,15 +321,22 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
       buffer[i] = uint32_t(color);
     }
   }
+  auto deferred_shading_end_time = std::chrono::high_resolution_clock::now();
+  auto deferred_shading_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      deferred_shading_end_time - deferred_shading_start_time);
   /*  *  *  *  *  *  *  */
   
-  // 填充基本统计信息（延迟渲染模式主要用于教学演示）
-  stats.buffer_alloc_ms = 0.0;
-  stats.rasterization_ms = 0.0;
-  stats.fragment_collection_ms = 0.0;
-  stats.fragment_merge_ms = 0.0;
-  stats.deferred_shading_ms = 0.0;
-  stats.total_ms = 0.0;
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+      total_end_time - total_start_time);
+  
+  // 填充统计信息
+  stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
+  stats.rasterization_ms = rasterization_duration.count() / 1000.0;
+  stats.fragment_collection_ms = fragment_collection_duration.count() / 1000.0;
+  stats.fragment_merge_ms = fragment_merge_duration.count() / 1000.0;
+  stats.deferred_shading_ms = deferred_shading_duration.count() / 1000.0;
+  stats.total_ms = total_duration.count() / 1000.0;
   
   return stats;
 }
@@ -329,8 +365,12 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
   // 这些坐标在后续的视口变换和裁剪阶段会被正确处理
   ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f);
   
-  // 创建新的顶点，保持其他属性不变
-  return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
+  // 创建新的顶点，保持其他属性和裁剪空间坐标不变
+  if (vertex.HasClipPosition()) {
+    return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition());
+  } else {
+    return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
+  }
 }
 
 Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) {
@@ -365,6 +405,10 @@ void SimpleRenderer::TriangleTileBinning(
     size_t clipped_triangles = 0;
     size_t triangles_with_clipped_vertices = 0;
     
+    // 裁剪统计
+    size_t frustum_culled = 0;
+    size_t backface_culled = 0;
+    
     SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles);
     SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", 
                 width_, height_, tile_size, tiles_x, tiles_y);
@@ -375,11 +419,49 @@ void SimpleRenderer::TriangleTileBinning(
         auto v1 = screenVertices[f.GetIndex(1)];
         auto v2 = screenVertices[f.GetIndex(2)];
         
+        // 视锥体裁剪 (裁剪空间)
+        if (v0.HasClipPosition()) {
+            Vector4f c0 = v0.GetClipPosition();
+            Vector4f c1 = v1.GetClipPosition(); 
+            Vector4f c2 = v2.GetClipPosition();
+            
+            // 保守视锥体裁剪：只有当整个三角形都在视锥体外同一侧时才裁剪
+            bool frustum_cull = 
+                (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||  // 右平面外
+                (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) || // 左平面外  
+                (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||  // 上平面外
+                (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) || // 下平面外
+                (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||  // 远平面外
+                (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);  // 近平面外
+                
+            // if (frustum_cull) {
+            //     frustum_culled++;
+            //     continue;
+            // }
+        }
+        
         // 获取屏幕空间坐标（现在已经是屏幕坐标了）
         Vector4f pos0 = v0.GetPosition();
         Vector4f pos1 = v1.GetPosition();
         Vector4f pos2 = v2.GetPosition();
         
+        // 背面剔除 (屏幕空间)
+        Vector2f screen0(pos0.x, pos0.y);
+        Vector2f screen1(pos1.x, pos1.y);  
+        Vector2f screen2(pos2.x, pos2.y);
+        
+        // 计算屏幕空间叉积判断朝向
+        Vector2f edge1 = screen1 - screen0;
+        Vector2f edge2 = screen2 - screen0;
+        float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+        
+        // 背面剔除：NDC空间中叉积为负表示顺时针，即背面。
+        // 从NDC到屏幕空间中，会发生Y轴翻转，对应叉积应为正。
+        if (cross_product > 0.0f) {
+            backface_culled++;
+            continue;
+        }
+        
         // 检查三角形是否有被裁剪的顶点（坐标为-1000的表示被裁剪）
         bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f);
         
@@ -465,6 +547,8 @@ void SimpleRenderer::TriangleTileBinning(
     SPDLOG_INFO("  Triangles with clipped vertices: {}", triangles_with_clipped_vertices);
     SPDLOG_INFO("  Processed triangles: {}", processed_triangles);
     SPDLOG_INFO("  Clipped by screen bounds: {}", clipped_triangles);
+    SPDLOG_INFO("  Frustum culled: {}", frustum_culled);
+    SPDLOG_INFO("  Backface culled: {}", backface_culled);
     
     size_t total_triangle_refs = 0;
     size_t non_empty_tiles = 0;
diff --git a/src/shader.cpp b/src/shader.cpp
index 087cca5..7b8eeae 100644
--- a/src/shader.cpp
+++ b/src/shader.cpp
@@ -15,11 +15,15 @@ Vertex Shader::VertexShader(const Vertex& vertex) {
 
   sharedDataInShader_.fragPos_varying = Vector3f(model_matrix * vertex.GetPosition());
 
-  // 返回变换后的顶点（包含变换后的法向量）
-  return Vertex(mvp_matrix * vertex.GetPosition(), 
+  // 计算裁剪空间坐标
+  Vector4f clip_position = mvp_matrix * vertex.GetPosition();
+  
+  // 返回变换后的顶点（包含变换后的法向量和裁剪坐标）
+  return Vertex(clip_position, 
                 transformed_normal, 
                 vertex.GetTexCoords(), 
-                vertex.GetColor());
+                vertex.GetColor(),
+                clip_position);  // 同时保存裁剪空间坐标用于后续裁剪
 }
 
 Color Shader::FragmentShader(const Fragment& fragment) const {
diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index 58c70ad..0f222b5 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -81,7 +81,7 @@ int main(int argc, char **argv) {
   simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f));
 
   // 设置渲染模式（可选：TRADITIONAL、TILE_BASED 或 DEFERRED）
-  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED);
+  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
   
   // 输出当前渲染模式
   std::string current_mode_name;
@@ -97,10 +97,6 @@ int main(int argc, char **argv) {
       break;
   }
   SPDLOG_INFO("当前渲染模式: {}", current_mode_name);
-  
-  // 可以在这里添加模式切换的示例：
-  // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
-  // simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::DEFERRED);
 
   auto display = Display(kWidth, kHeight);
   display.loopBegin();

From d0ddf62ae6d02126d43438b06f1a52525710eebd Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Fri, 29 Aug 2025 20:14:21 +0800
Subject: [PATCH 05/24] Expand the Vertex data structure, implement frustum
 culling and backface culling for TBR.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/renderer.cpp | 133 +++--------------------------------------------
 1 file changed, 6 insertions(+), 127 deletions(-)

diff --git a/src/renderer.cpp b/src/renderer.cpp
index a7bc226..ae53457 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -97,10 +97,7 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
     return;
   }
   
-  // === PERFORMANCE TIMING ===
-  auto total_start_time = std::chrono::high_resolution_clock::now();
-
-  /* * * Vertex Shader * * */
+  /* * * Vertex Transformation * * */
   auto vertex_shader_start_time = std::chrono::high_resolution_clock::now();
   std::vector<Vertex> processedVertices;
   std::vector<std::vector<Vertex>> processed_vertices_all_thread(kNProc);
@@ -402,7 +399,6 @@ void SimpleRenderer::TriangleTileBinning(
     
     size_t total_triangles = model.GetFaces().size();
     size_t processed_triangles = 0;
-    size_t clipped_triangles = 0;
     size_t triangles_with_clipped_vertices = 0;
     
     // 裁剪统计
@@ -434,10 +430,10 @@ void SimpleRenderer::TriangleTileBinning(
                 (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||  // 远平面外
                 (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);  // 近平面外
                 
-            // if (frustum_cull) {
-            //     frustum_culled++;
-            //     continue;
-            // }
+            if (frustum_cull) {
+                frustum_culled++;
+                continue;
+            }
         }
         
         // 获取屏幕空间坐标（现在已经是屏幕坐标了）
@@ -445,12 +441,10 @@ void SimpleRenderer::TriangleTileBinning(
         Vector4f pos1 = v1.GetPosition();
         Vector4f pos2 = v2.GetPosition();
         
-        // 背面剔除 (屏幕空间)
+        // 计算屏幕空间叉积判断朝向
         Vector2f screen0(pos0.x, pos0.y);
         Vector2f screen1(pos1.x, pos1.y);  
         Vector2f screen2(pos2.x, pos2.y);
-        
-        // 计算屏幕空间叉积判断朝向
         Vector2f edge1 = screen1 - screen0;
         Vector2f edge2 = screen2 - screen0;
         float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
@@ -489,24 +483,6 @@ void SimpleRenderer::TriangleTileBinning(
         float min_y = std::min({screen_y0, screen_y1, screen_y2});
         float max_y = std::max({screen_y0, screen_y1, screen_y2});
         
-        // 调试前几个有效三角形的坐标范围
-        if (processed_triangles < 3) {
-            SPDLOG_INFO("Triangle {} coordinates:", tri_idx);
-            SPDLOG_INFO("  Screen coords: ({:.1f},{:.1f}) ({:.1f},{:.1f}) ({:.1f},{:.1f})", 
-                       screen_x0, screen_y0, screen_x1, screen_y1, screen_x2, screen_y2);
-            SPDLOG_INFO("  BBox: min({:.1f},{:.1f}) max({:.1f},{:.1f})", 
-                       min_x, min_y, max_x, max_y);
-        }
-        
-        // 临时：大幅放宽屏幕边界检查，让超出屏幕的三角形也能处理
-        if (max_x < -5000.0f || min_x >= width_ + 5000.0f || 
-            max_y < -5000.0f || min_y >= height_ + 5000.0f) {
-            clipped_triangles++;
-            if (processed_triangles < 3) {
-                SPDLOG_INFO("  -> CLIPPED by screen bounds");
-            }
-            continue;
-        }
         
         // 计算影响的tile范围
         int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
@@ -528,28 +504,9 @@ void SimpleRenderer::TriangleTileBinning(
             }
             processed_triangles++;
             
-            // 输出前几个成功添加的三角形信息
-            if (processed_triangles <= 3) {
-                SPDLOG_INFO("  -> SUCCESSFULLY ADDED to tiles x[{}..{}] y[{}..{}]", 
-                           start_tile_x, end_tile_x, start_tile_y, end_tile_y);
-            }
-        } else {
-            if (processed_triangles < 3) {
-                SPDLOG_INFO("  -> FAILED tile calculation: x[{}..{}] y[{}..{}]", 
-                           start_tile_x, end_tile_x, start_tile_y, end_tile_y);
-            }
         }
     }
     
-    // 输出统计信息
-    SPDLOG_INFO("Triangle-Tile binning completed:");
-    SPDLOG_INFO("  Total triangles: {}", total_triangles);
-    SPDLOG_INFO("  Triangles with clipped vertices: {}", triangles_with_clipped_vertices);
-    SPDLOG_INFO("  Processed triangles: {}", processed_triangles);
-    SPDLOG_INFO("  Clipped by screen bounds: {}", clipped_triangles);
-    SPDLOG_INFO("  Frustum culled: {}", frustum_culled);
-    SPDLOG_INFO("  Backface culled: {}", backface_culled);
-    
     size_t total_triangle_refs = 0;
     size_t non_empty_tiles = 0;
     for (const auto& tile : tile_triangles) {
@@ -629,76 +586,6 @@ void SimpleRenderer::RasterizeTile(
   }
 }
 
-// Tile可视化调试函数，这里用于固定大小的tiles
-void SimpleRenderer::DrawTileVisualization(uint32_t* buffer, 
-    const std::vector<std::vector<TriangleInfo>>& tile_triangles, 
-    size_t tiles_x, size_t tiles_y, size_t tile_size) {
-    
-    SPDLOG_INFO("=== TILE VISUALIZATION DEBUG ===");
-    SPDLOG_INFO("Drawing tile grid overlay for debugging");
-    
-    // 颜色定义 (ABGR格式)
-    const uint32_t GRID_COLOR = 0xFF00FF00;      // 绿色网格线
-    const uint32_t NONEMPTY_COLOR = 0x4000FFFF;  // 半透明黄色背景 (非空tile)
-    const uint32_t EMPTY_COLOR = 0x20FF0000;     // 半透明蓝色背景 (空tile)
-    
-    // 1. 为非空tiles添加背景色
-    for (size_t tile_y = 0; tile_y < tiles_y; tile_y++) {
-        for (size_t tile_x = 0; tile_x < tiles_x; tile_x++) {
-            size_t tile_id = tile_y * tiles_x + tile_x;
-            bool is_empty = tile_triangles[tile_id].empty();
-            
-            // 计算tile在屏幕上的像素范围
-            size_t pixel_start_x = tile_x * tile_size;
-            size_t pixel_end_x = std::min(pixel_start_x + tile_size, static_cast<size_t>(width_));
-            size_t pixel_start_y = tile_y * tile_size;
-            size_t pixel_end_y = std::min(pixel_start_y + tile_size, static_cast<size_t>(height_));
-            
-            uint32_t bg_color = is_empty ? EMPTY_COLOR : NONEMPTY_COLOR;
-            
-            // 给tile添加半透明背景
-            for (size_t y = pixel_start_y; y < pixel_end_y; y++) {
-                for (size_t x = pixel_start_x; x < pixel_end_x; x++) {
-                    size_t pixel_idx = y * static_cast<size_t>(width_) + x;
-                    // 简单的alpha混合：将背景色与原色混合
-                    uint32_t original = buffer[pixel_idx];
-                    buffer[pixel_idx] = BlendColors(original, bg_color);
-                }
-            }
-            
-            // 记录非空tile的信息
-            if (!is_empty) {
-                SPDLOG_INFO("Non-empty Tile[{},{}] (ID:{}): {} triangles", 
-                           tile_x, tile_y, tile_id, tile_triangles[tile_id].size());
-            }
-        }
-    }
-    
-    // 2. 绘制网格线
-    // 垂直线
-    for (size_t tile_x = 0; tile_x <= tiles_x; tile_x++) {
-        size_t pixel_x = tile_x * tile_size;
-        if (pixel_x < static_cast<size_t>(width_)) {
-            for (size_t y = 0; y < static_cast<size_t>(height_); y++) {
-                buffer[y * static_cast<size_t>(width_) + pixel_x] = GRID_COLOR;
-            }
-        }
-    }
-    
-    // 水平线
-    for (size_t tile_y = 0; tile_y <= tiles_y; tile_y++) {
-        size_t pixel_y = tile_y * tile_size;
-        if (pixel_y < static_cast<size_t>(height_)) {
-            for (size_t x = 0; x < static_cast<size_t>(width_); x++) {
-                buffer[pixel_y * static_cast<size_t>(width_) + x] = GRID_COLOR;
-            }
-        }
-    }
-    
-    SPDLOG_INFO("Tile visualization completed - Green:Grid, Yellow:NonEmpty, Blue:Empty");
-    SPDLOG_INFO("=====================================");
-}
-
 // 简单的颜色混合函数 (alpha blending)
 uint32_t SimpleRenderer::BlendColors(uint32_t base, uint32_t overlay) {
     // 提取RGBA通道 (假设是ABGR格式)
@@ -944,13 +831,6 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         merge_end_time - merge_start_time);
     
-    // 6. Tile可视化调试
-    auto visualization_start_time = std::chrono::high_resolution_clock::now();
-    DrawTileVisualization(buffer, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
-    auto visualization_end_time = std::chrono::high_resolution_clock::now();
-    auto visualization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        visualization_end_time - visualization_start_time);
-    
     auto total_end_time = std::chrono::high_resolution_clock::now();
     auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         total_end_time - total_start_time);
@@ -961,7 +841,6 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
     stats.rasterization_ms = rasterization_duration.count() / 1000.0;
     stats.merge_ms = merge_duration.count() / 1000.0;
-    stats.visualization_ms = visualization_duration.count() / 1000.0;
     stats.total_ms = total_duration.count() / 1000.0;
     
     return stats;

From a4021cdf8a3bfceb08140f896946563110856399 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Sat, 30 Aug 2025 17:27:31 +0800
Subject: [PATCH 06/24] Fix rendering consistency between TRADITIONAL and
 TILE_BASED modes: 1. Add backface culling to TRADITIONAL pipeline to match
 TILE_BASED behavior; 2. Fix depth buffer initialization from infinity to 1.0f
 for standard range

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/renderer.cpp | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/renderer.cpp b/src/renderer.cpp
index ae53457..e4eb01d 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -401,10 +401,6 @@ void SimpleRenderer::TriangleTileBinning(
     size_t processed_triangles = 0;
     size_t triangles_with_clipped_vertices = 0;
     
-    // 裁剪统计
-    size_t frustum_culled = 0;
-    size_t backface_culled = 0;
-    
     SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles);
     SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", 
                 width_, height_, tile_size, tiles_x, tiles_y);
@@ -431,7 +427,6 @@ void SimpleRenderer::TriangleTileBinning(
                 (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);  // 近平面外
                 
             if (frustum_cull) {
-                frustum_culled++;
                 continue;
             }
         }
@@ -452,7 +447,6 @@ void SimpleRenderer::TriangleTileBinning(
         // 背面剔除：NDC空间中叉积为负表示顺时针，即背面。
         // 从NDC到屏幕空间中，会发生Y轴翻转，对应叉积应为正。
         if (cross_product > 0.0f) {
-            backface_culled++;
             continue;
         }
         
@@ -540,7 +534,7 @@ void SimpleRenderer::RasterizeTile(
   size_t tile_width = screen_x_end - screen_x_start;
   size_t tile_height = screen_y_end - screen_y_start;
   std::fill_n(tile_depth_buffer, tile_width * tile_height,
-              std::numeric_limits<float>::infinity());
+              1.0f);  // 初始化为最远深度（标准深度缓冲范围[0,1]）
   std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
     
   // 在tile内光栅化所有三角形
@@ -652,6 +646,22 @@ SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline(
             auto v1 = processedVertices[f.GetIndex(1)];
             auto v2 = processedVertices[f.GetIndex(2)];
 
+            // 获取屏幕空间坐标
+            Vector2f screen0(v0.GetPosition().x, v0.GetPosition().y);
+            Vector2f screen1(v1.GetPosition().x, v1.GetPosition().y);  
+            Vector2f screen2(v2.GetPosition().x, v2.GetPosition().y);
+            
+            // 计算屏幕空间叉积判断朝向
+            Vector2f edge1 = screen1 - screen0;
+            Vector2f edge2 = screen2 - screen0;
+            float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+            
+            // 背面剔除：NDC空间中叉积为负表示顺时针，即背面。
+            // 从NDC到屏幕空间中，会发生Y轴翻转，对应叉积应为正。
+            if (cross_product > 0.0f) {
+                continue;
+            }
+
             const Material *material = &f.GetMaterial();
             auto fragments = rasterizer_->Rasterize(v0, v1, v2);
 

From 70e1581f9daa326c6cb0fd3819718b19797af090 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Sat, 30 Aug 2025 17:27:53 +0800
Subject: [PATCH 07/24] add debug mode

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 test/system_test/main.cpp | 35 ++++++++++++++++++++++++++++++-----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index 0f222b5..383ff83 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -16,10 +16,12 @@
 
 #include <renderer.h>
 
+#include <chrono>
 #include <cstdint>
 #include <iostream>
 #include <span>
 #include <string>
+#include <thread>
 #include <vector>
 
 #include "buffer.hpp"
@@ -81,7 +83,7 @@ int main(int argc, char **argv) {
   simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f));
 
   // 设置渲染模式（可选：TRADITIONAL、TILE_BASED 或 DEFERRED）
-  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
+  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TRADITIONAL);
   
   // 输出当前渲染模式
   std::string current_mode_name;
@@ -101,9 +103,11 @@ int main(int argc, char **argv) {
   auto display = Display(kWidth, kHeight);
   display.loopBegin();
 
-  while (!display.loopShouldClose()) {
-    display.handleEvents(camera);
-
+  // 调试模式：固定相机状态，只渲染一帧
+  bool debug_mode = true;
+  
+  if (debug_mode) {
+    // 固定相机参数进行调试
     shader.SetUniform("cameraPos", camera.GetPosition());
     shader.SetUniform("viewMatrix", camera.GetViewMatrix());
     shader.SetUniform("projectionMatrix",
@@ -115,8 +119,29 @@ int main(int argc, char **argv) {
     }
 
     buffer.SwapBuffer();
-
     display.fill(buffer.GetDisplayBuffer());
+    
+    // 调试模式下等待几秒让我们看到结果
+    std::this_thread::sleep_for(std::chrono::seconds(3));
+  } else {
+    // 正常渲染循环
+    while (!display.loopShouldClose()) {
+      display.handleEvents(camera);
+
+      shader.SetUniform("cameraPos", camera.GetPosition());
+      shader.SetUniform("viewMatrix", camera.GetViewMatrix());
+      shader.SetUniform("projectionMatrix",
+                        camera.GetProjectionMatrix(60.0f, static_cast<float>(kWidth) / static_cast<float>(kHeight), 0.1f, 100.0f));
+
+      buffer.ClearDrawBuffer(simple_renderer::Color::kBlack);
+      for (auto &model : models) {
+        simple_renderer.DrawModel(model, shader, buffer.GetDrawBuffer());
+      }
+
+      buffer.SwapBuffer();
+
+      display.fill(buffer.GetDisplayBuffer());
+    }
   }
 
   return 0;

From 45386674a7e34cc5b090751a652abe8d99387d2c Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Tue, 2 Sep 2025 23:44:02 +0800
Subject: [PATCH 08/24] Revert "add debug mode", which is not necessary for
 rendering tests.

This reverts commit 70e1581f9daa326c6cb0fd3819718b19797af090.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 test/system_test/main.cpp | 35 +++++------------------------------
 1 file changed, 5 insertions(+), 30 deletions(-)

diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index 383ff83..0f222b5 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -16,12 +16,10 @@
 
 #include <renderer.h>
 
-#include <chrono>
 #include <cstdint>
 #include <iostream>
 #include <span>
 #include <string>
-#include <thread>
 #include <vector>
 
 #include "buffer.hpp"
@@ -83,7 +81,7 @@ int main(int argc, char **argv) {
   simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f));
 
   // 设置渲染模式（可选：TRADITIONAL、TILE_BASED 或 DEFERRED）
-  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TRADITIONAL);
+  simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
   
   // 输出当前渲染模式
   std::string current_mode_name;
@@ -103,11 +101,9 @@ int main(int argc, char **argv) {
   auto display = Display(kWidth, kHeight);
   display.loopBegin();
 
-  // 调试模式：固定相机状态，只渲染一帧
-  bool debug_mode = true;
-  
-  if (debug_mode) {
-    // 固定相机参数进行调试
+  while (!display.loopShouldClose()) {
+    display.handleEvents(camera);
+
     shader.SetUniform("cameraPos", camera.GetPosition());
     shader.SetUniform("viewMatrix", camera.GetViewMatrix());
     shader.SetUniform("projectionMatrix",
@@ -119,29 +115,8 @@ int main(int argc, char **argv) {
     }
 
     buffer.SwapBuffer();
+
     display.fill(buffer.GetDisplayBuffer());
-    
-    // 调试模式下等待几秒让我们看到结果
-    std::this_thread::sleep_for(std::chrono::seconds(3));
-  } else {
-    // 正常渲染循环
-    while (!display.loopShouldClose()) {
-      display.handleEvents(camera);
-
-      shader.SetUniform("cameraPos", camera.GetPosition());
-      shader.SetUniform("viewMatrix", camera.GetViewMatrix());
-      shader.SetUniform("projectionMatrix",
-                        camera.GetProjectionMatrix(60.0f, static_cast<float>(kWidth) / static_cast<float>(kHeight), 0.1f, 100.0f));
-
-      buffer.ClearDrawBuffer(simple_renderer::Color::kBlack);
-      for (auto &model : models) {
-        simple_renderer.DrawModel(model, shader, buffer.GetDrawBuffer());
-      }
-
-      buffer.SwapBuffer();
-
-      display.fill(buffer.GetDisplayBuffer());
-    }
   }
 
   return 0;

From b57d9077900eddfdfe9844df1c08b2875d3f5b45 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Thu, 4 Sep 2025 22:16:12 +0800
Subject: [PATCH 09/24] Add Early-Z to TBR. Remove obsolete functions
 previously used for TBR debugging.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/renderer.h | 25 ++++------------------
 src/renderer.cpp       | 48 +++++++++++++++++-------------------------
 2 files changed, 23 insertions(+), 50 deletions(-)

diff --git a/src/include/renderer.h b/src/include/renderer.h
index 2464c19..38f9dd0 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -100,6 +100,7 @@ class SimpleRenderer {
   const size_t width_;
   LogSystem log_system_;
   RenderingMode current_mode_;  // 当前渲染模式
+  bool early_z_enabled_;        // Early-Z优化开关
 
   std::shared_ptr<Shader> shader_;
   std::shared_ptr<Rasterizer> rasterizer_;
@@ -143,7 +144,6 @@ class SimpleRenderer {
     double buffer_alloc_ms;
     double rasterization_ms;
     double merge_ms;
-    double visualization_ms;
     double total_ms;
   };
   
@@ -189,7 +189,8 @@ class SimpleRenderer {
     size_t tiles_x, size_t tiles_y, size_t tile_size,
     float* tile_depth_buffer, uint32_t* tile_color_buffer,
     std::unique_ptr<float[]> &global_depth_buffer,
-    std::unique_ptr<uint32_t[]> &global_color_buffer);
+    std::unique_ptr<uint32_t[]> &global_color_buffer,
+    bool use_early_z = false);
 
   
   /**
@@ -205,25 +206,7 @@ class SimpleRenderer {
    * @return 转换后的顶点(屏幕坐标)
    */
   Vertex ViewportTransformation(const Vertex &vertex);
-  /**
-   * Tile可视化调试函数 - 在渲染结果上绘制tile网格和状态
-   * @param buffer 渲染结果缓冲区
-   * @param tile_triangles 每个tile包含的三角形列表
-   * @param tiles_x X方向tile数量
-   * @param tiles_y Y方向tile数量 
-   * @param tile_size 单个tile的像素大小
-   */
-  void DrawTileVisualization(uint32_t* buffer, 
-      const std::vector<std::vector<TriangleInfo>>& tile_triangles, 
-      size_t tiles_x, size_t tiles_y, size_t tile_size);
-
-  /**
-   * 颜色混合函数 - 用于半透明效果
-   * @param base 基础颜色
-   * @param overlay 叠加颜色(包含alpha通道)
-   * @return 混合后的颜色
-   */
-  uint32_t BlendColors(uint32_t base, uint32_t overlay);
+  
 };
 }  // namespace simple_renderer
 
diff --git a/src/renderer.cpp b/src/renderer.cpp
index e4eb01d..faabc58 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -37,7 +37,8 @@ SimpleRenderer::SimpleRenderer(size_t width, size_t height)
     : height_(height),
       width_(width),
       log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)),
-      current_mode_(RenderingMode::TILE_BASED) {
+      current_mode_(RenderingMode::TILE_BASED),
+      early_z_enabled_(true) {
   rasterizer_ = std::make_shared<Rasterizer>(width, height);
 }
 
@@ -162,7 +163,6 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
       SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
       SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
       SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
-      SPDLOG_INFO("Visualization:    {:8.3f} ms ({:5.1f}%)", stats.visualization_ms, stats.visualization_ms/total_ms*100);
       SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
       SPDLOG_INFO("==========================================");
       break;
@@ -521,7 +521,8 @@ void SimpleRenderer::RasterizeTile(
     size_t tiles_x, size_t tiles_y, size_t tile_size,
     float* tile_depth_buffer, uint32_t* tile_color_buffer,
     std::unique_ptr<float[]> &global_depth_buffer,
-    std::unique_ptr<uint32_t[]> &global_color_buffer) {
+    std::unique_ptr<uint32_t[]> &global_color_buffer,
+    bool use_early_z) {
   // 计算tile在屏幕空间的范围
   size_t tile_x = tile_id % tiles_x;
   size_t tile_y = tile_id / tiles_x;
@@ -556,11 +557,18 @@ void SimpleRenderer::RasterizeTile(
         size_t tile_index = tile_local_x + tile_local_y * tile_width;
                 
         // tile内深度测试
-        if (fragment.depth < tile_depth_buffer[tile_index]) {
-          tile_depth_buffer[tile_index] = fragment.depth;
-                    
+        if (use_early_z) { // Early-Z模式：深度测试在Fragment Shader之前
+          if (fragment.depth < tile_depth_buffer[tile_index]) {
+            auto color = shader_->FragmentShader(fragment);
+            tile_depth_buffer[tile_index] = fragment.depth;
+            tile_color_buffer[tile_index] = uint32_t(color);
+          }
+        } else { // Late-Z模式：Fragment Shader在深度测试之前
           auto color = shader_->FragmentShader(fragment);
-          tile_color_buffer[tile_index] = uint32_t(color);
+          if (fragment.depth < tile_depth_buffer[tile_index]) {
+            tile_depth_buffer[tile_index] = fragment.depth;
+            tile_color_buffer[tile_index] = uint32_t(color);
+          }
         }
           }
     }
@@ -580,26 +588,6 @@ void SimpleRenderer::RasterizeTile(
   }
 }
 
-// 简单的颜色混合函数 (alpha blending)
-uint32_t SimpleRenderer::BlendColors(uint32_t base, uint32_t overlay) {
-    // 提取RGBA通道 (假设是ABGR格式)
-    uint8_t base_r = (base >> 16) & 0xFF;
-    uint8_t base_g = (base >> 8) & 0xFF;
-    uint8_t base_b = base & 0xFF;
-    
-    uint8_t overlay_r = (overlay >> 16) & 0xFF;
-    uint8_t overlay_g = (overlay >> 8) & 0xFF;
-    uint8_t overlay_b = overlay & 0xFF;
-    uint8_t overlay_a = (overlay >> 24) & 0xFF;
-    
-    // 简单的alpha混合
-    float alpha = overlay_a / 255.0f;
-    uint8_t result_r = (uint8_t)(base_r * (1.0f - alpha) + overlay_r * alpha);
-    uint8_t result_g = (uint8_t)(base_g * (1.0f - alpha) + overlay_g * alpha);
-    uint8_t result_b = (uint8_t)(base_b * (1.0f - alpha) + overlay_b * alpha);
-    
-    return 0xFF000000 | (result_r << 16) | (result_g << 8) | result_b;
-}
 
 // 传统光栅化管线实现
 SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline(
@@ -784,7 +772,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     auto rasterization_start_time = std::chrono::high_resolution_clock::now();
 #pragma omp parallel num_threads(kNProc) default(none) \
     shared(tile_triangles, rasterizer_, shader_, width_, height_, \
-           depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles)
+           depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles, \
+           early_z_enabled_)
     {
         int thread_id = omp_get_thread_num();
         auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
@@ -802,7 +791,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
             RasterizeTile(tile_id, tile_triangles[tile_id], 
                          tiles_x, tiles_y, TILE_SIZE,
                          tile_depth_buffer.get(), tile_color_buffer.get(),
-                         depthBuffer_per_thread, colorBuffer_per_thread);
+                         depthBuffer_per_thread, colorBuffer_per_thread,
+                         early_z_enabled_);
         }
     }
     auto rasterization_end_time = std::chrono::high_resolution_clock::now();

From 1d2d9a9b596559f292d642068c44ed73cd158fcc Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Fri, 5 Sep 2025 23:05:05 +0800
Subject: [PATCH 10/24] TBR: Pre-allocate and reuse fragment caches; add
 RasterizeTo; two-pass counting in Binning to eliminate frequent dynamic
 memory reallocations.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/rasterizer.hpp |   7 ++
 src/include/renderer.h     |   3 +-
 src/rasterizer.cpp         |  70 ++++++++++++++-
 src/renderer.cpp           | 176 ++++++++++++++++++++++++++++++-------
 4 files changed, 224 insertions(+), 32 deletions(-)

diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp
index 749aa28..80b5f84 100644
--- a/src/include/rasterizer.hpp
+++ b/src/include/rasterizer.hpp
@@ -20,6 +20,13 @@ class Rasterizer {
   std::vector<Fragment> Rasterize(const Vertex& v0, const Vertex& v1,
                                   const Vertex& v2);
 
+  // 非分配版本：将片段直接写入调用方提供的容器
+  // 可选的裁剪区域为半开区间 [x0, x1) × [y0, y1)
+  // 用于 TBR：将光栅化限制在 tile 边界内，便于复用外部 scratch 容器
+  void RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
+                   int x0, int y0, int x1, int y1,
+                   std::vector<Fragment>& out);
+
  private:
   size_t width_, height_;
 
diff --git a/src/include/renderer.h b/src/include/renderer.h
index 38f9dd0..97c9952 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -190,7 +190,8 @@ class SimpleRenderer {
     float* tile_depth_buffer, uint32_t* tile_color_buffer,
     std::unique_ptr<float[]> &global_depth_buffer,
     std::unique_ptr<uint32_t[]> &global_color_buffer,
-    bool use_early_z = false);
+    bool use_early_z = false,
+    std::vector<Fragment>* scratch_fragments = nullptr);
 
   
   /**
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 7a8c602..2bbe161 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -1,6 +1,8 @@
 #include "rasterizer.hpp"
 
 #include <omp.h>
+#include <algorithm>
+#include <cmath>
 
 namespace simple_renderer {
 
@@ -90,6 +92,72 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
   return fragments;
 }
 
+void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
+                             int x0, int y0, int x1, int y1,
+                             std::vector<Fragment>& out) {
+  // 获取三角形的最小 box（屏幕空间）
+  Vector2f a = Vector2f(v0.GetPosition().x, v0.GetPosition().y);
+  Vector2f b = Vector2f(v1.GetPosition().x, v1.GetPosition().y);
+  Vector2f c = Vector2f(v2.GetPosition().x, v2.GetPosition().y);
+
+  Vector2f bboxMin =
+      Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
+  Vector2f bboxMax =
+      Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
+
+  // Clamp 到屏幕尺寸
+  float minx = std::max(0.0f, bboxMin.x);
+  float miny = std::max(0.0f, bboxMin.y);
+  float maxx = std::min(float(width_ - 1), bboxMax.x);
+  float maxy = std::min(float(height_ - 1), bboxMax.y);
+
+  // 与外部提供的裁剪区域（半开区间）相交，转成闭区间扫描
+  int sx = std::max(x0, int(std::floor(minx)));
+  int sy = std::max(y0, int(std::floor(miny)));
+  int ex = std::min(x1 - 1, int(std::floor(maxx)));
+  int ey = std::min(y1 - 1, int(std::floor(maxy)));
+
+  if (sx > ex || sy > ey) {
+    return;  // 与裁剪区域无交
+  }
+
+  // 透视矫正插值使用与 Rasterize 相同逻辑，但单线程写入 out
+  float w0_inv = v0.GetPosition().w;
+  float w1_inv = v1.GetPosition().w;
+  float w2_inv = v2.GetPosition().w;
+
+  for (int x = sx; x <= ex; ++x) {
+    for (int y = sy; y <= ey; ++y) {
+      auto [is_inside, barycentric_coord] = GetBarycentricCoord(
+          v0.GetPosition(), v1.GetPosition(), v2.GetPosition(),
+          Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
+      if (!is_inside) continue;
+
+      // 插值 1/w 并进行透视矫正
+      float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord);
+      Vector3f corrected_bary(
+          barycentric_coord.x * w0_inv / w_inv_interpolated,
+          barycentric_coord.y * w1_inv / w_inv_interpolated,
+          barycentric_coord.z * w2_inv / w_inv_interpolated);
+
+      auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z,
+                           v2.GetPosition().z, corrected_bary);
+
+      Fragment fragment;
+      fragment.screen_coord = {x, y};
+      fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(),
+                                    v2.GetNormal(), corrected_bary);
+      fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(),
+                                v2.GetTexCoords(), corrected_bary);
+      fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(),
+                                        v2.GetColor(), corrected_bary);
+      fragment.depth = z;
+
+      out.push_back(fragment);
+    }
+  }
+}
+
 std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
                                                           const Vector3f& p1,
                                                           const Vector3f& p2,
@@ -157,4 +225,4 @@ Vector3f Rasterizer::CalculateNormal(const Vector3f& v0, const Vector3f& v1,
       glm::cross(edge1, edge2));
 }
 
-}  // namespace simple_renderer
\ No newline at end of file
+}  // namespace simple_renderer
diff --git a/src/renderer.cpp b/src/renderer.cpp
index faabc58..4d8306f 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -405,6 +405,86 @@ void SimpleRenderer::TriangleTileBinning(
     SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", 
                 width_, height_, tile_size, tiles_x, tiles_y);
     
+    // 第一遍：仅统计每个 tile 的三角形数量以便预分配，避免 push_back 扩容
+    std::vector<size_t> tile_counts(tiles_x * tiles_y, 0);
+    for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) {
+        const auto &f = model.GetFaces()[tri_idx];
+        auto v0 = screenVertices[f.GetIndex(0)];
+        auto v1 = screenVertices[f.GetIndex(1)];
+        auto v2 = screenVertices[f.GetIndex(2)];
+
+        if (v0.HasClipPosition()) {
+            Vector4f c0 = v0.GetClipPosition();
+            Vector4f c1 = v1.GetClipPosition(); 
+            Vector4f c2 = v2.GetClipPosition();
+            bool frustum_cull = 
+                (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||
+                (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) ||
+                (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||
+                (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) ||
+                (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||
+                (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w);
+            if (frustum_cull) {
+                continue;
+            }
+        }
+
+        Vector4f pos0 = v0.GetPosition();
+        Vector4f pos1 = v1.GetPosition();
+        Vector4f pos2 = v2.GetPosition();
+
+        Vector2f screen0(pos0.x, pos0.y);
+        Vector2f screen1(pos1.x, pos1.y);  
+        Vector2f screen2(pos2.x, pos2.y);
+        Vector2f edge1 = screen1 - screen0;
+        Vector2f edge2 = screen2 - screen0;
+        float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+        if (cross_product > 0.0f) {
+            continue;
+        }
+
+        bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f);
+        if (has_clipped_vertex) {
+            continue;
+        }
+
+        float screen_x0 = pos0.x;
+        float screen_y0 = pos0.y;
+        float screen_x1 = pos1.x;
+        float screen_y1 = pos1.y;
+        float screen_x2 = pos2.x;
+        float screen_y2 = pos2.y;
+
+        float min_x = std::min({screen_x0, screen_x1, screen_x2});
+        float max_x = std::max({screen_x0, screen_x1, screen_x2});
+        float min_y = std::min({screen_y0, screen_y1, screen_y2});
+        float max_y = std::max({screen_y0, screen_y1, screen_y2});
+
+        int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
+        int end_tile_x = std::min(static_cast<int>(tiles_x - 1), 
+                                 static_cast<int>(max_x) / static_cast<int>(tile_size));
+        int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
+        int end_tile_y = std::min(static_cast<int>(tiles_y - 1), 
+                                 static_cast<int>(max_y) / static_cast<int>(tile_size));
+
+        if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) {
+            continue;
+        }
+
+        for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
+            for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
+                size_t tile_id = ty * tiles_x + tx;
+                tile_counts[tile_id]++;
+            }
+        }
+    }
+
+    // 依据统计结果进行容量预留
+    for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) {
+        if (tile_counts[tile_id] > 0) {
+            tile_triangles[tile_id].reserve(tile_counts[tile_id]);
+        }
+    }
     for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) {
         const auto &f = model.GetFaces()[tri_idx];
         auto v0 = screenVertices[f.GetIndex(0)];
@@ -522,7 +602,8 @@ void SimpleRenderer::RasterizeTile(
     float* tile_depth_buffer, uint32_t* tile_color_buffer,
     std::unique_ptr<float[]> &global_depth_buffer,
     std::unique_ptr<uint32_t[]> &global_color_buffer,
-    bool use_early_z) {
+    bool use_early_z,
+    std::vector<Fragment>* scratch_fragments) {
   // 计算tile在屏幕空间的范围
   size_t tile_x = tile_id % tiles_x;
   size_t tile_y = tile_id / tiles_x;
@@ -539,38 +620,69 @@ void SimpleRenderer::RasterizeTile(
   std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
     
   // 在tile内光栅化所有三角形
+  (void)tiles_y; // 避免未使用参数告警
   for (const auto &triangle : triangles) {
-    auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2);
-        
-    for (auto &fragment : fragments) {
-      fragment.material = triangle.material;
-            
-      size_t screen_x = fragment.screen_coord[0];
-      size_t screen_y = fragment.screen_coord[1];
-            
-      // 检查fragment是否在当前tile内
-      if (screen_x >= screen_x_start && screen_x < screen_x_end &&
-          screen_y >= screen_y_start && screen_y < screen_y_end) {
-                
-        size_t tile_local_x = screen_x - screen_x_start;
-        size_t tile_local_y = screen_y - screen_y_start;
-        size_t tile_index = tile_local_x + tile_local_y * tile_width;
-                
-        // tile内深度测试
-        if (use_early_z) { // Early-Z模式：深度测试在Fragment Shader之前
-          if (fragment.depth < tile_depth_buffer[tile_index]) {
+    // 复用线程本地 scratch 容器，限制在 tile 边界内栅格化
+    if (scratch_fragments) { // 提供scratch容器
+      scratch_fragments->clear();
+      if (scratch_fragments->capacity() < tile_width * tile_height) { // 二次确认，为日后可能的可变tile进行设计
+        scratch_fragments->reserve(tile_width * tile_height);
+      }
+      rasterizer_->RasterizeTo(triangle.v0, triangle.v1, triangle.v2,
+                               static_cast<int>(screen_x_start), static_cast<int>(screen_y_start),
+                               static_cast<int>(screen_x_end),   static_cast<int>(screen_y_end),
+                               *scratch_fragments);
+
+      for (auto &fragment : *scratch_fragments) {
+        fragment.material = triangle.material;
+        size_t screen_x = fragment.screen_coord[0];
+        size_t screen_y = fragment.screen_coord[1];
+        if (screen_x >= screen_x_start && screen_x < screen_x_end &&
+            screen_y >= screen_y_start && screen_y < screen_y_end) {
+          size_t tile_local_x = screen_x - screen_x_start;
+          size_t tile_local_y = screen_y - screen_y_start;
+          size_t tile_index = tile_local_x + tile_local_y * tile_width;
+          if (use_early_z) {
+            if (fragment.depth < tile_depth_buffer[tile_index]) {
+              auto color = shader_->FragmentShader(fragment);
+              tile_depth_buffer[tile_index] = fragment.depth;
+              tile_color_buffer[tile_index] = uint32_t(color);
+            }
+          } else {
             auto color = shader_->FragmentShader(fragment);
-            tile_depth_buffer[tile_index] = fragment.depth;
-            tile_color_buffer[tile_index] = uint32_t(color);
-          }
-        } else { // Late-Z模式：Fragment Shader在深度测试之前
-          auto color = shader_->FragmentShader(fragment);
-          if (fragment.depth < tile_depth_buffer[tile_index]) {
-            tile_depth_buffer[tile_index] = fragment.depth;
-            tile_color_buffer[tile_index] = uint32_t(color);
+            if (fragment.depth < tile_depth_buffer[tile_index]) {
+              tile_depth_buffer[tile_index] = fragment.depth;
+              tile_color_buffer[tile_index] = uint32_t(color);
+            }
           }
         }
+      }
+    } else { // 不提供scratch容器的版本
+      auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2);
+      for (auto &fragment : fragments) {
+        fragment.material = triangle.material;
+        size_t screen_x = fragment.screen_coord[0];
+        size_t screen_y = fragment.screen_coord[1];
+        if (screen_x >= screen_x_start && screen_x < screen_x_end &&
+            screen_y >= screen_y_start && screen_y < screen_y_end) {
+          size_t tile_local_x = screen_x - screen_x_start;
+          size_t tile_local_y = screen_y - screen_y_start;
+          size_t tile_index = tile_local_x + tile_local_y * tile_width;
+          if (use_early_z) {
+            if (fragment.depth < tile_depth_buffer[tile_index]) {
+              auto color = shader_->FragmentShader(fragment);
+              tile_depth_buffer[tile_index] = fragment.depth;
+              tile_color_buffer[tile_index] = uint32_t(color);
+            }
+          } else {
+            auto color = shader_->FragmentShader(fragment);
+            if (fragment.depth < tile_depth_buffer[tile_index]) {
+              tile_depth_buffer[tile_index] = fragment.depth;
+              tile_color_buffer[tile_index] = uint32_t(color);
+            }
           }
+        }
+      }
     }
   }
     
@@ -785,14 +897,18 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
         std::unique_ptr<uint32_t[]> tile_color_buffer = 
             std::make_unique<uint32_t[]>(TILE_SIZE * TILE_SIZE);
 
+        // 线程本地片段 scratch 容器（复用），容量按单 tile 上限预估
+        std::vector<Fragment> scratch_fragments;
+        scratch_fragments.reserve(TILE_SIZE * TILE_SIZE);
+
 #pragma omp for
         for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) {
-            // 按照tile进行光栅化
+            // 按照tile进行光栅化，每个Tile进行区域限制+scratch复用，区域限制避免了可能的数据竞争
             RasterizeTile(tile_id, tile_triangles[tile_id], 
                          tiles_x, tiles_y, TILE_SIZE,
                          tile_depth_buffer.get(), tile_color_buffer.get(),
                          depthBuffer_per_thread, colorBuffer_per_thread,
-                         early_z_enabled_);
+                         early_z_enabled_, &scratch_fragments);
         }
     }
     auto rasterization_end_time = std::chrono::high_resolution_clock::now();

From 8a743793ca6a867389b8b7956751114fb76668e1 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Sat, 6 Sep 2025 18:21:13 +0800
Subject: [PATCH 11/24] vertex optimization: avoid data movement and
 multi-stage memory reallocation

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/renderer.cpp | 42 ++++++++++++++++--------------------------
 1 file changed, 16 insertions(+), 26 deletions(-)

diff --git a/src/renderer.cpp b/src/renderer.cpp
index 4d8306f..b08c7f2 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -101,34 +101,24 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
   /* * * Vertex Transformation * * */
   auto vertex_shader_start_time = std::chrono::high_resolution_clock::now();
   std::vector<Vertex> processedVertices;
-  std::vector<std::vector<Vertex>> processed_vertices_all_thread(kNProc);
-#pragma omp parallel num_threads(kNProc) default(none) \
-    shared(shader_, processed_vertices_all_thread) firstprivate(model)
-  {
-    int thread_id = omp_get_thread_num();
-    std::vector<Vertex> &processedVertices_per_thread =
-        processed_vertices_all_thread[thread_id];
+  const auto &input_vertices = model.GetVertices();
+  processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配
 
-#pragma omp for
-    for (const auto &v : model.GetVertices()) {
-      // 顶点着色器：世界坐标 -> 裁剪坐标
-      auto clipSpaceVertex = shader_->VertexShader(v);
-      
-      // 透视除法：裁剪坐标 -> NDC坐标
-      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
-      
-      // 视口变换：NDC坐标 -> 屏幕坐标
-      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
-      
-      processedVertices_per_thread.push_back(screenSpaceVertex);
-    }
-  }
+// 并行过程保持连续分块，避免false sharing
+#pragma omp parallel for num_threads(kNProc) schedule(static) \ 
+    shared(shader_, processedVertices, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理
+    const auto &v = input_vertices[i];
+    // 顶点着色器：世界坐标 -> 裁剪坐标
+    auto clipSpaceVertex = shader_->VertexShader(v);
+
+    // 透视除法：裁剪坐标 -> NDC坐标
+    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+
+    // 视口变换：NDC坐标 -> 屏幕坐标
+    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
 
-  for (const auto &processedVertices_per_thread :
-       processed_vertices_all_thread) {
-    processedVertices.insert(processedVertices.end(),
-                             processedVertices_per_thread.begin(),
-                             processedVertices_per_thread.end());
+    processedVertices[i] = screenSpaceVertex;
   }
   auto vertex_shader_end_time = std::chrono::high_resolution_clock::now();
   auto vertex_shader_duration = std::chrono::duration_cast<std::chrono::microseconds>(

From bb5acc1aca13f018a4132da22f81887e8328f6da Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Sat, 6 Sep 2025 23:03:26 +0800
Subject: [PATCH 12/24] TBR: Use SoA vertex layout to improve cache locality

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/rasterizer.hpp |   6 +
 src/include/renderer.h     |  36 ++-
 src/include/vertex_soa.hpp |  33 +++
 src/rasterizer.cpp         |  61 +++++
 src/renderer.cpp           | 522 +++++++++++++++----------------------
 5 files changed, 328 insertions(+), 330 deletions(-)
 create mode 100644 src/include/vertex_soa.hpp

diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp
index 80b5f84..24e4a20 100644
--- a/src/include/rasterizer.hpp
+++ b/src/include/rasterizer.hpp
@@ -3,6 +3,7 @@
 
 #include "config.h"
 #include "shader.hpp"
+#include "vertex_soa.hpp"
 
 namespace simple_renderer {
 
@@ -27,6 +28,11 @@ class Rasterizer {
                    int x0, int y0, int x1, int y1,
                    std::vector<Fragment>& out);
 
+  // SoA 版本：按顶点索引从 SoA 读取三角形三顶点
+  void RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
+                   int x0, int y0, int x1, int y1,
+                   std::vector<Fragment>& out);
+
  private:
   size_t width_, height_;
 
diff --git a/src/include/renderer.h b/src/include/renderer.h
index 97c9952..56c84c8 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -38,18 +38,12 @@ enum class RenderingMode {
   DEFERRED      // 延迟渲染模式 - 经典GPU管线教学模拟
 };
 
-// Face 只包含顶点索引，不包含实际的顶点数据;
-// Vertex 包含3D坐标，但没有屏幕坐标
-// Fragment 包含屏幕坐标，但它是光栅化的结果，不是输入
-struct TriangleInfo {
-  Vertex v0, v1, v2;
-  const Material *material;
-  size_t face_index;
-  TriangleInfo(const Vertex& vertex0, const Vertex& vertex1, const Vertex& vertex2,
-             const Material* mat, size_t face_idx = 0)
-    : v0(vertex0), v1(vertex1), v2(vertex2), material(mat), face_index(face_idx) {}
-    
-  TriangleInfo() = default;
+
+// SoA 版 tile 列表中的三角形引用（仅存索引与材质指针）
+struct TriangleRef {
+  size_t i0, i1, i2;
+  const Material* material = nullptr;
+  size_t face_index = 0;
 };
 
 class SimpleRenderer {
@@ -158,10 +152,9 @@ class SimpleRenderer {
     double deferred_shading_ms;
     double total_ms;
   };
-  
   TileRenderStats ExecuteTileBasedPipeline(const Model &model,
-                                          const std::vector<Vertex> &processedVertices,
-                                          uint32_t *buffer);
+                                              const VertexSoA &soa,
+                                              uint32_t *buffer);
 
   /**
    * 延迟渲染管线
@@ -177,19 +170,24 @@ class SimpleRenderer {
   
 private:
 
+
+  // SoA 版本的 Triangle-Tile binning（两遍计数 + reserve）
   void TriangleTileBinning(
-    const Model &model, 
-    const std::vector<Vertex> &screenVertices,
-    std::vector<std::vector<TriangleInfo>> &tile_triangles,
+    const Model &model,
+    const VertexSoA &soa,
+    std::vector<std::vector<TriangleRef>> &tile_triangles,
     size_t tiles_x, size_t tiles_y, size_t tile_size);
 
+
+  // SoA 版本的 tile 光栅化
   void RasterizeTile(
     size_t tile_id,
-    const std::vector<TriangleInfo> &triangles,
+    const std::vector<TriangleRef> &triangles,
     size_t tiles_x, size_t tiles_y, size_t tile_size,
     float* tile_depth_buffer, uint32_t* tile_color_buffer,
     std::unique_ptr<float[]> &global_depth_buffer,
     std::unique_ptr<uint32_t[]> &global_color_buffer,
+    const VertexSoA &soa,
     bool use_early_z = false,
     std::vector<Fragment>* scratch_fragments = nullptr);
 
diff --git a/src/include/vertex_soa.hpp b/src/include/vertex_soa.hpp
new file mode 100644
index 0000000..4c5806a
--- /dev/null
+++ b/src/include/vertex_soa.hpp
@@ -0,0 +1,33 @@
+// Minimal SoA layout for TBR pipeline (Phase 1)
+#ifndef SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_
+
+#include <vector>
+
+#include "math.hpp"
+#include "color.h"
+
+namespace simple_renderer {
+
+struct VertexSoA {
+  // 屏幕空间坐标（视口变换后）
+  std::vector<Vector4f> pos_screen;  // screen space position (x,y,z,w)
+  // 裁剪空间坐标（用于视锥体剔除）：clip = MVP * pos
+  std::vector<Vector4f> pos_clip;
+  std::vector<Vector3f> normal;
+  std::vector<Vector2f> uv;
+  std::vector<Color>    color;
+
+  inline size_t size() const { return pos_screen.size(); }
+  inline void resize(size_t n) {
+    pos_screen.resize(n);
+    pos_clip.resize(n);
+    normal.resize(n);
+    uv.resize(n);
+    color.resize(n);
+  }
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 2bbe161..1ee2fff 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -158,6 +158,67 @@ void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v
   }
 }
 
+void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
+                             int x0, int y0, int x1, int y1,
+                             std::vector<Fragment>& out) {
+  // 读取三顶点的屏幕空间位置
+  const Vector4f& p0 = soa.pos_screen[i0];
+  const Vector4f& p1 = soa.pos_screen[i1];
+  const Vector4f& p2 = soa.pos_screen[i2];
+
+  Vector2f a = Vector2f(p0.x, p0.y);
+  Vector2f b = Vector2f(p1.x, p1.y);
+  Vector2f c = Vector2f(p2.x, p2.y);
+
+  Vector2f bboxMin = Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
+  Vector2f bboxMax = Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
+
+  // Clamp 到屏幕尺寸
+  float minx = std::max(0.0f, bboxMin.x);
+  float miny = std::max(0.0f, bboxMin.y);
+  float maxx = std::min(float(width_ - 1), bboxMax.x);
+  float maxy = std::min(float(height_ - 1), bboxMax.y);
+
+  // 与外部提供的裁剪区域相交（半开区间） -> 闭区间扫描
+  int sx = std::max(x0, int(std::floor(minx)));
+  int sy = std::max(y0, int(std::floor(miny)));
+  int ex = std::min(x1 - 1, int(std::floor(maxx)));
+  int ey = std::min(y1 - 1, int(std::floor(maxy)));
+  if (sx > ex || sy > ey) return;
+
+  // 透视矫正插值依赖 w
+  float w0_inv = p0.w;
+  float w1_inv = p1.w;
+  float w2_inv = p2.w;
+
+  for (int x = sx; x <= ex; ++x) {
+    for (int y = sy; y <= ey; ++y) {
+      auto [is_inside, bary] = GetBarycentricCoord(
+          Vector3f(p0.x, p0.y, p0.z), Vector3f(p1.x, p1.y, p1.z), Vector3f(p2.x, p2.y, p2.z),
+          Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
+      if (!is_inside) continue;
+
+      float w_inv_interp = Interpolate(w0_inv, w1_inv, w2_inv, bary);
+      Vector3f cb(
+          bary.x * w0_inv / w_inv_interp,
+          bary.y * w1_inv / w_inv_interp,
+          bary.z * w2_inv / w_inv_interp);
+
+      float z = Interpolate(p0.z, p1.z, p2.z, cb);
+
+      Fragment frag;
+      frag.screen_coord = {x, y};
+      frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], cb);
+      frag.uv     = Interpolate(soa.uv[i0],     soa.uv[i1],     soa.uv[i2],     cb);
+      frag.color  = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], cb);
+      frag.depth  = z;
+      // material 指针由调用方填写
+
+      out.push_back(frag);
+    }
+  }
+}
+
 std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
                                                           const Vector3f& p1,
                                                           const Vector3f& p2,
diff --git a/src/renderer.cpp b/src/renderer.cpp
index b08c7f2..3167cf5 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -100,25 +100,42 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
   
   /* * * Vertex Transformation * * */
   auto vertex_shader_start_time = std::chrono::high_resolution_clock::now();
-  std::vector<Vertex> processedVertices;
   const auto &input_vertices = model.GetVertices();
-  processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配
-
-// 并行过程保持连续分块，避免false sharing
-#pragma omp parallel for num_threads(kNProc) schedule(static) \ 
+  std::vector<Vertex> processedVertices;  // 非 TBR
+  VertexSoA processedSoA;                 // TBR 专用
+
+  if (current_mode_ == RenderingMode::TILE_BASED) {
+    processedSoA.resize(input_vertices.size());
+    // schedule(static)使并行过程保持连续分块，避免 false sharing
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
+    shared(shader_, processedSoA, input_vertices)
+    for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理
+      const auto &v = input_vertices[i];
+      // 顶点着色器：世界坐标 -> 裁剪坐标
+      auto clipSpaceVertex = shader_->VertexShader(v);
+      // 保存裁剪空间坐标用于后续视锥体裁剪
+      processedSoA.pos_clip[i] = clipSpaceVertex.GetPosition();
+      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+
+      // 填充为SoA数据结构，用于优化缓存局部性
+      processedSoA.pos_screen[i] = screenSpaceVertex.GetPosition();
+      processedSoA.normal[i]     = screenSpaceVertex.GetNormal();
+      processedSoA.uv[i]         = screenSpaceVertex.GetTexCoords();
+      processedSoA.color[i]      = screenSpaceVertex.GetColor();
+    }
+  } else { // Tradition或Deffer管线
+    processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配
+    // 并行过程保持连续分块，避免false sharing
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
     shared(shader_, processedVertices, input_vertices)
-  for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理
-    const auto &v = input_vertices[i];
-    // 顶点着色器：世界坐标 -> 裁剪坐标
-    auto clipSpaceVertex = shader_->VertexShader(v);
-
-    // 透视除法：裁剪坐标 -> NDC坐标
-    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
-
-    // 视口变换：NDC坐标 -> 屏幕坐标
-    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
-
-    processedVertices[i] = screenSpaceVertex;
+    for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理
+      const auto &v = input_vertices[i];
+      auto clipSpaceVertex = shader_->VertexShader(v);
+      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+      processedVertices[i] = screenSpaceVertex;
+    }
   }
   auto vertex_shader_end_time = std::chrono::high_resolution_clock::now();
   auto vertex_shader_duration = std::chrono::duration_cast<std::chrono::microseconds>(
@@ -143,7 +160,7 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
     }
     
     case RenderingMode::TILE_BASED: {
-      auto stats = ExecuteTileBasedPipeline(model, processedVertices, buffer);
+      auto stats = ExecuteTileBasedPipeline(model, processedSoA, buffer);
       double total_ms = vertex_ms + stats.total_ms;
       
       SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ===");
@@ -379,309 +396,193 @@ Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) {
 
 
 
-
-// Triangle-Tile binning函数 - 修正版本
+// SoA优化的Binning：两遍计数 + 预留 + 填充 TriangleRef
 void SimpleRenderer::TriangleTileBinning(
-    const Model &model, 
-    const std::vector<Vertex> &screenVertices,
-    std::vector<std::vector<TriangleInfo>> &tile_triangles,
+    const Model &model,
+    const VertexSoA &soa,
+    std::vector<std::vector<TriangleRef>> &tile_triangles,
     size_t tiles_x, size_t tiles_y, size_t tile_size) {
-    
-    size_t total_triangles = model.GetFaces().size();
-    size_t processed_triangles = 0;
-    size_t triangles_with_clipped_vertices = 0;
-    
-    SPDLOG_INFO("Starting triangle-tile binning for {} triangles", total_triangles);
-    SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", 
-                width_, height_, tile_size, tiles_x, tiles_y);
-    
-    // 第一遍：仅统计每个 tile 的三角形数量以便预分配，避免 push_back 扩容
-    std::vector<size_t> tile_counts(tiles_x * tiles_y, 0);
-    for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) {
-        const auto &f = model.GetFaces()[tri_idx];
-        auto v0 = screenVertices[f.GetIndex(0)];
-        auto v1 = screenVertices[f.GetIndex(1)];
-        auto v2 = screenVertices[f.GetIndex(2)];
-
-        if (v0.HasClipPosition()) {
-            Vector4f c0 = v0.GetClipPosition();
-            Vector4f c1 = v1.GetClipPosition(); 
-            Vector4f c2 = v2.GetClipPosition();
-            bool frustum_cull = 
-                (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||
-                (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) ||
-                (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||
-                (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) ||
-                (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||
-                (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w);
-            if (frustum_cull) {
-                continue;
-            }
-        }
+  const size_t total_triangles = model.GetFaces().size();
+
+  SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles", total_triangles);
+  SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}",
+              width_, height_, tile_size, tiles_x, tiles_y);
+
+  std::vector<size_t> tile_counts(tiles_x * tiles_y, 0);
+
+  auto process_triangle = [&](size_t tri_idx, bool count_only) {
+    const auto &f = model.GetFaces()[tri_idx];
+    size_t i0 = f.GetIndex(0);
+    size_t i1 = f.GetIndex(1);
+    size_t i2 = f.GetIndex(2);
+
+    // Frustum culling in clip space.
+    // Conservative: cull only when all three vertices lie outside the same plane; each vertex is tested against its own w.
+    const Vector4f &c0 = soa.pos_clip[i0];
+    const Vector4f &c1 = soa.pos_clip[i1];
+    const Vector4f &c2 = soa.pos_clip[i2];
+    bool frustum_cull =
+        (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||     // outside right plane
+        (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) ||  // outside left plane
+        (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||     // outside top plane
+        (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) ||  // outside bottom plane
+        (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||     // outside far plane
+        (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);    // outside near plane
+    if (frustum_cull) {
+      return;
+    }
 
-        Vector4f pos0 = v0.GetPosition();
-        Vector4f pos1 = v1.GetPosition();
-        Vector4f pos2 = v2.GetPosition();
-
-        Vector2f screen0(pos0.x, pos0.y);
-        Vector2f screen1(pos1.x, pos1.y);  
-        Vector2f screen2(pos2.x, pos2.y);
-        Vector2f edge1 = screen1 - screen0;
-        Vector2f edge2 = screen2 - screen0;
-        float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
-        if (cross_product > 0.0f) {
-            continue;
+    const Vector4f &pos0 = soa.pos_screen[i0];
+    const Vector4f &pos1 = soa.pos_screen[i1];
+    const Vector4f &pos2 = soa.pos_screen[i2];
+
+    // 背面剔除（屏幕空间）
+    // NDC空间中叉积为负表示顺时针，即背面。
+    // 从NDC到屏幕空间中，会发生Y轴翻转，对应叉积应为正。
+
+    Vector2f screen0(pos0.x, pos0.y);
+    Vector2f screen1(pos1.x, pos1.y);
+    Vector2f screen2(pos2.x, pos2.y);
+    Vector2f edge1 = screen1 - screen0;
+    Vector2f edge2 = screen2 - screen0;
+    float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+    if (cross_product > 0.0f) return;
+
+    float screen_x0 = pos0.x;
+    float screen_y0 = pos0.y;
+    float screen_x1 = pos1.x;
+    float screen_y1 = pos1.y;
+    float screen_x2 = pos2.x;
+    float screen_y2 = pos2.y;
+
+    // 计算屏幕bbox，用于后续tile划分
+    float min_x = std::min({screen_x0, screen_x1, screen_x2});
+    float max_x = std::max({screen_x0, screen_x1, screen_x2});
+    float min_y = std::min({screen_y0, screen_y1, screen_y2});
+    float max_y = std::max({screen_y0, screen_y1, screen_y2});
+
+    int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
+    int end_tile_x   = std::min(static_cast<int>(tiles_x - 1), static_cast<int>(max_x) / static_cast<int>(tile_size));
+    int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
+    int end_tile_y   = std::min(static_cast<int>(tiles_y - 1), static_cast<int>(max_y) / static_cast<int>(tile_size));
+    if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) return; // 如果bbox不在任何tile内，直接返回
+
+    if (count_only) { // 第一遍计数，只统计tile内三角形数量
+      for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
+        for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
+          size_t tile_id = ty * tiles_x + tx;
+          tile_counts[tile_id]++;
         }
-
-        bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f);
-        if (has_clipped_vertex) {
-            continue;
+      }
+    } else { // 第二遍填充，填充TriangleRef
+      TriangleRef tri_ref{ i0, i1, i2, &f.GetMaterial(), tri_idx };
+      for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
+        for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
+          size_t tile_id = ty * tiles_x + tx;
+          tile_triangles[tile_id].push_back(tri_ref);
         }
+      }
+    }
+  };
 
-        float screen_x0 = pos0.x;
-        float screen_y0 = pos0.y;
-        float screen_x1 = pos1.x;
-        float screen_y1 = pos1.y;
-        float screen_x2 = pos2.x;
-        float screen_y2 = pos2.y;
-
-        float min_x = std::min({screen_x0, screen_x1, screen_x2});
-        float max_x = std::max({screen_x0, screen_x1, screen_x2});
-        float min_y = std::min({screen_y0, screen_y1, screen_y2});
-        float max_y = std::max({screen_y0, screen_y1, screen_y2});
-
-        int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
-        int end_tile_x = std::min(static_cast<int>(tiles_x - 1), 
-                                 static_cast<int>(max_x) / static_cast<int>(tile_size));
-        int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
-        int end_tile_y = std::min(static_cast<int>(tiles_y - 1), 
-                                 static_cast<int>(max_y) / static_cast<int>(tile_size));
-
-        if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) {
-            continue;
-        }
+  // 第一遍（count only）：计算每个tile需要容纳多少三角形
+  for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
+    process_triangle(tri_idx, true);
+  }
 
-        for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
-            for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
-                size_t tile_id = ty * tiles_x + tx;
-                tile_counts[tile_id]++;
-            }
-        }
-    }
+  // 预分配，避免动态扩容
+  for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) {
+    if (tile_counts[tile_id] > 0) tile_triangles[tile_id].reserve(tile_counts[tile_id]);
+  }
 
-    // 依据统计结果进行容量预留
-    for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) {
-        if (tile_counts[tile_id] > 0) {
-            tile_triangles[tile_id].reserve(tile_counts[tile_id]);
-        }
-    }
-    for (size_t tri_idx = 0; tri_idx < model.GetFaces().size(); tri_idx++) {
-        const auto &f = model.GetFaces()[tri_idx];
-        auto v0 = screenVertices[f.GetIndex(0)];
-        auto v1 = screenVertices[f.GetIndex(1)];
-        auto v2 = screenVertices[f.GetIndex(2)];
-        
-        // 视锥体裁剪 (裁剪空间)
-        if (v0.HasClipPosition()) {
-            Vector4f c0 = v0.GetClipPosition();
-            Vector4f c1 = v1.GetClipPosition(); 
-            Vector4f c2 = v2.GetClipPosition();
-            
-            // 保守视锥体裁剪：只有当整个三角形都在视锥体外同一侧时才裁剪
-            bool frustum_cull = 
-                (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||  // 右平面外
-                (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) || // 左平面外  
-                (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||  // 上平面外
-                (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) || // 下平面外
-                (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||  // 远平面外
-                (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);  // 近平面外
-                
-            if (frustum_cull) {
-                continue;
-            }
-        }
-        
-        // 获取屏幕空间坐标（现在已经是屏幕坐标了）
-        Vector4f pos0 = v0.GetPosition();
-        Vector4f pos1 = v1.GetPosition();
-        Vector4f pos2 = v2.GetPosition();
-        
-        // 计算屏幕空间叉积判断朝向
-        Vector2f screen0(pos0.x, pos0.y);
-        Vector2f screen1(pos1.x, pos1.y);  
-        Vector2f screen2(pos2.x, pos2.y);
-        Vector2f edge1 = screen1 - screen0;
-        Vector2f edge2 = screen2 - screen0;
-        float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
-        
-        // 背面剔除：NDC空间中叉积为负表示顺时针，即背面。
-        // 从NDC到屏幕空间中，会发生Y轴翻转，对应叉积应为正。
-        if (cross_product > 0.0f) {
-            continue;
-        }
-        
-        // 检查三角形是否有被裁剪的顶点（坐标为-1000的表示被裁剪）
-        bool has_clipped_vertex = (pos0.x == -1000.0f || pos1.x == -1000.0f || pos2.x == -1000.0f);
-        
-        if (has_clipped_vertex) {
-            triangles_with_clipped_vertices++;
-            if (triangles_with_clipped_vertices <= 3) {
-                SPDLOG_INFO("Triangle {} has clipped vertices:", tri_idx);
-                SPDLOG_INFO("  V0: ({:.1f},{:.1f}) V1: ({:.1f},{:.1f}) V2: ({:.1f},{:.1f})", 
-                           pos0.x, pos0.y, pos1.x, pos1.y, pos2.x, pos2.y);
-            }
-            continue;
-        }
-        
-        // 直接使用屏幕空间坐标
-        float screen_x0 = pos0.x;
-        float screen_y0 = pos0.y;
-        float screen_x1 = pos1.x;
-        float screen_y1 = pos1.y;
-        float screen_x2 = pos2.x;
-        float screen_y2 = pos2.y;
-        
-        // 计算bounding box
-        float min_x = std::min({screen_x0, screen_x1, screen_x2});
-        float max_x = std::max({screen_x0, screen_x1, screen_x2});
-        float min_y = std::min({screen_y0, screen_y1, screen_y2});
-        float max_y = std::max({screen_y0, screen_y1, screen_y2});
-        
-        
-        // 计算影响的tile范围
-        int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
-        int end_tile_x = std::min(static_cast<int>(tiles_x - 1), 
-                                 static_cast<int>(max_x) / static_cast<int>(tile_size));
-        int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
-        int end_tile_y = std::min(static_cast<int>(tiles_y - 1), 
-                                 static_cast<int>(max_y) / static_cast<int>(tile_size));
-        
-        // 添加三角形到相关tiles（多个三角形可能会映射到同一个tile当中，所以谨慎并行化）
-        if (start_tile_x <= end_tile_x && start_tile_y <= end_tile_y) {
-            TriangleInfo triangle_info = {v0, v1, v2, &f.GetMaterial(), processed_triangles};
-            
-            for (int ty = start_tile_y; ty <= end_tile_y; ty++) {
-                for (int tx = start_tile_x; tx <= end_tile_x; tx++) {
-                    size_t tile_id = ty * tiles_x + tx;
-                    tile_triangles[tile_id].push_back(triangle_info); // 可能多个线程同时pushback的话有风险
-                }
-            }
-            processed_triangles++;
-            
-        }
-    }
-    
-    size_t total_triangle_refs = 0;
-    size_t non_empty_tiles = 0;
-    for (const auto& tile : tile_triangles) {
-        total_triangle_refs += tile.size();
-        if (!tile.empty()) non_empty_tiles++;
-    }
-    
-    SPDLOG_INFO("  Total triangle references: {}", total_triangle_refs);
-    SPDLOG_INFO("  Non-empty tiles: {}", non_empty_tiles);
-    SPDLOG_INFO("  Average triangles per tile: {:.2f}", 
-                total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f);
+  // 第二遍（fill）：按范围填充TriangleRef
+  for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
+    process_triangle(tri_idx, false);
+  }
+
+  size_t total_triangle_refs = 0;
+  size_t non_empty_tiles = 0;
+  for (const auto& tile : tile_triangles) {
+    total_triangle_refs += tile.size();
+    if (!tile.empty()) non_empty_tiles++;
+  }
+  SPDLOG_INFO("  (SoA) Total triangle references: {}", total_triangle_refs);
+  SPDLOG_INFO("  (SoA) Non-empty tiles: {}", non_empty_tiles);
+  SPDLOG_INFO("  (SoA) Average triangles per tile: {:.2f}",
+              total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f);
 }
 
-// 单个tile光栅化函数
+// SoA 版：单个 tile 光栅化
 void SimpleRenderer::RasterizeTile(
     size_t tile_id,
-    const std::vector<TriangleInfo> &triangles,
+    const std::vector<TriangleRef> &triangles,
     size_t tiles_x, size_t tiles_y, size_t tile_size,
     float* tile_depth_buffer, uint32_t* tile_color_buffer,
     std::unique_ptr<float[]> &global_depth_buffer,
     std::unique_ptr<uint32_t[]> &global_color_buffer,
+    const VertexSoA &soa,
     bool use_early_z,
     std::vector<Fragment>* scratch_fragments) {
-  // 计算tile在屏幕空间的范围
+  (void)tiles_y;
+  // 计算 tile 屏幕范围
   size_t tile_x = tile_id % tiles_x;
   size_t tile_y = tile_id / tiles_x;
   size_t screen_x_start = tile_x * tile_size;
   size_t screen_y_start = tile_y * tile_size;
   size_t screen_x_end = std::min(screen_x_start + tile_size, width_);
   size_t screen_y_end = std::min(screen_y_start + tile_size, height_);
-    
-  // 初始化tile缓冲区
+
+  // 初始化 tile 局部缓冲
   size_t tile_width = screen_x_end - screen_x_start;
   size_t tile_height = screen_y_end - screen_y_start;
-  std::fill_n(tile_depth_buffer, tile_width * tile_height,
-              1.0f);  // 初始化为最远深度（标准深度缓冲范围[0,1]）
+  std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f);
   std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
-    
-  // 在tile内光栅化所有三角形
-  (void)tiles_y; // 避免未使用参数告警
-  for (const auto &triangle : triangles) {
-    // 复用线程本地 scratch 容器，限制在 tile 边界内栅格化
-    if (scratch_fragments) { // 提供scratch容器
-      scratch_fragments->clear();
-      if (scratch_fragments->capacity() < tile_width * tile_height) { // 二次确认，为日后可能的可变tile进行设计
-        scratch_fragments->reserve(tile_width * tile_height);
-      }
-      rasterizer_->RasterizeTo(triangle.v0, triangle.v1, triangle.v2,
-                               static_cast<int>(screen_x_start), static_cast<int>(screen_y_start),
-                               static_cast<int>(screen_x_end),   static_cast<int>(screen_y_end),
-                               *scratch_fragments);
-
-      for (auto &fragment : *scratch_fragments) {
-        fragment.material = triangle.material;
-        size_t screen_x = fragment.screen_coord[0];
-        size_t screen_y = fragment.screen_coord[1];
-        if (screen_x >= screen_x_start && screen_x < screen_x_end &&
-            screen_y >= screen_y_start && screen_y < screen_y_end) {
-          size_t tile_local_x = screen_x - screen_x_start;
-          size_t tile_local_y = screen_y - screen_y_start;
-          size_t tile_index = tile_local_x + tile_local_y * tile_width;
-          if (use_early_z) {
-            if (fragment.depth < tile_depth_buffer[tile_index]) {
-              auto color = shader_->FragmentShader(fragment);
-              tile_depth_buffer[tile_index] = fragment.depth;
-              tile_color_buffer[tile_index] = uint32_t(color);
-            }
-          } else {
+
+  for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况
+    // 始终走 SoA + 限制矩形的光栅化路径；如未提供 scratch，则使用函数内局部容器
+    std::vector<Fragment> local_out;
+    std::vector<Fragment> &out = scratch_fragments ? *scratch_fragments : local_out;
+
+    out.clear();
+    if (out.capacity() < tile_width * tile_height) {
+      out.reserve(tile_width * tile_height);
+    }
+
+    rasterizer_->RasterizeTo(soa, tri.i0, tri.i1, tri.i2,
+                             static_cast<int>(screen_x_start), static_cast<int>(screen_y_start),
+                             static_cast<int>(screen_x_end),   static_cast<int>(screen_y_end),
+                             out);
+
+    for (auto &fragment : out) {
+      fragment.material = tri.material;
+      size_t sx = fragment.screen_coord[0];
+      size_t sy = fragment.screen_coord[1];
+      if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start && sy < screen_y_end) {
+        size_t local_x = sx - screen_x_start;
+        size_t local_y = sy - screen_y_start;
+        size_t idx = local_x + local_y * tile_width;
+        if (use_early_z) {
+          if (fragment.depth < tile_depth_buffer[idx]) {
             auto color = shader_->FragmentShader(fragment);
-            if (fragment.depth < tile_depth_buffer[tile_index]) {
-              tile_depth_buffer[tile_index] = fragment.depth;
-              tile_color_buffer[tile_index] = uint32_t(color);
-            }
+            tile_depth_buffer[idx] = fragment.depth;
+            tile_color_buffer[idx] = uint32_t(color);
           }
-        }
-      }
-    } else { // 不提供scratch容器的版本
-      auto fragments = rasterizer_->Rasterize(triangle.v0, triangle.v1, triangle.v2);
-      for (auto &fragment : fragments) {
-        fragment.material = triangle.material;
-        size_t screen_x = fragment.screen_coord[0];
-        size_t screen_y = fragment.screen_coord[1];
-        if (screen_x >= screen_x_start && screen_x < screen_x_end &&
-            screen_y >= screen_y_start && screen_y < screen_y_end) {
-          size_t tile_local_x = screen_x - screen_x_start;
-          size_t tile_local_y = screen_y - screen_y_start;
-          size_t tile_index = tile_local_x + tile_local_y * tile_width;
-          if (use_early_z) {
-            if (fragment.depth < tile_depth_buffer[tile_index]) {
-              auto color = shader_->FragmentShader(fragment);
-              tile_depth_buffer[tile_index] = fragment.depth;
-              tile_color_buffer[tile_index] = uint32_t(color);
-            }
-          } else {
-            auto color = shader_->FragmentShader(fragment);
-            if (fragment.depth < tile_depth_buffer[tile_index]) {
-              tile_depth_buffer[tile_index] = fragment.depth;
-              tile_color_buffer[tile_index] = uint32_t(color);
-            }
+        } else {
+          auto color = shader_->FragmentShader(fragment);
+          if (fragment.depth < tile_depth_buffer[idx]) {
+            tile_depth_buffer[idx] = fragment.depth;
+            tile_color_buffer[idx] = uint32_t(color);
           }
         }
       }
     }
   }
-    
-  // 将tile结果写入全局缓冲区
+
+  // 写回全局缓冲
   for (size_t y = 0; y < tile_height; y++) {
     for (size_t x = 0; x < tile_width; x++) {
       size_t tile_index = x + y * tile_width;
       size_t global_index = (screen_x_start + x) + (screen_y_start + y) * width_;
-            
       if (tile_depth_buffer[tile_index] < global_depth_buffer[global_index]) {
         global_depth_buffer[global_index] = tile_depth_buffer[tile_index];
         global_color_buffer[global_index] = tile_color_buffer[tile_index];
@@ -690,8 +591,7 @@ void SimpleRenderer::RasterizeTile(
   }
 }
 
-
-// 传统光栅化管线实现
+// 基础光栅化管线实现
 SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline(
     const Model &model, 
     const std::vector<Vertex> &processedVertices,
@@ -822,46 +722,46 @@ SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline(
     return stats;
 }
 
-// Tile-based光栅化管线实现
+
+// Tile-based光栅化管线实现（SoA 直连版本，避免 AoS->SoA 拷贝）
 SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     const Model &model,
-    const std::vector<Vertex> &processedVertices,
+    const VertexSoA &soa,
     uint32_t *buffer) {
-    
     TileRenderStats stats;
     auto total_start_time = std::chrono::high_resolution_clock::now();
-    
+
     // 1. Setup阶段
     auto setup_start_time = std::chrono::high_resolution_clock::now();
     const size_t TILE_SIZE = 64; // 64x64 pixels per tile
     const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE;
     const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE;
     const size_t total_tiles = tiles_x * tiles_y;
-    
-    // 为每个tile创建三角形列表
-    std::vector<std::vector<TriangleInfo>> tile_triangles(total_tiles);
+
+    // 为每个tile创建三角形列表（SoA 引用）
+    std::vector<std::vector<TriangleRef>> tile_triangles(total_tiles);
     auto setup_end_time = std::chrono::high_resolution_clock::now();
     auto setup_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         setup_end_time - setup_start_time);
-    
-    // 2. Triangle-Tile binning阶段
+
+    // 2. Triangle-Tile binning阶段（SoA）
     auto binning_start_time = std::chrono::high_resolution_clock::now();
-    TriangleTileBinning(model, processedVertices, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
+    TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
     auto binning_end_time = std::chrono::high_resolution_clock::now();
     auto binning_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         binning_end_time - binning_start_time);
-    
+
     // 3. 为每个线程创建framebuffer
     auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
     std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
     std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
-    
+
     for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
         depthBuffer_all_thread[thread_id] = 
             std::make_unique<float[]>(width_ * height_);
         colorBuffer_all_thread[thread_id] = 
             std::make_unique<uint32_t[]>(width_ * height_);
-            
+
         std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
                     std::numeric_limits<float>::infinity());
         std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
@@ -869,13 +769,13 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
     auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         buffer_alloc_end_time - buffer_alloc_start_time);
-    
-    // 4. 并行处理每个tile
+
+    // 4. 并行处理每个tile（SoA）
     auto rasterization_start_time = std::chrono::high_resolution_clock::now();
 #pragma omp parallel num_threads(kNProc) default(none) \
     shared(tile_triangles, rasterizer_, shader_, width_, height_, \
            depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles, \
-           early_z_enabled_)
+           early_z_enabled_, soa)
     {
         int thread_id = omp_get_thread_num();
         auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
@@ -893,18 +793,18 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
 
 #pragma omp for
         for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) {
-            // 按照tile进行光栅化，每个Tile进行区域限制+scratch复用，区域限制避免了可能的数据竞争
-            RasterizeTile(tile_id, tile_triangles[tile_id], 
-                         tiles_x, tiles_y, TILE_SIZE,
-                         tile_depth_buffer.get(), tile_color_buffer.get(),
-                         depthBuffer_per_thread, colorBuffer_per_thread,
-                         early_z_enabled_, &scratch_fragments);
+            // 按照 tile 进行光栅化（SoA）
+            RasterizeTile(tile_id, tile_triangles[tile_id],
+                             tiles_x, tiles_y, TILE_SIZE,
+                             tile_depth_buffer.get(), tile_color_buffer.get(),
+                             depthBuffer_per_thread, colorBuffer_per_thread,
+                             soa, early_z_enabled_, &scratch_fragments);
         }
     }
     auto rasterization_end_time = std::chrono::high_resolution_clock::now();
     auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         rasterization_end_time - rasterization_start_time);
-    
+
     // 5. 合并所有线程结果
     auto merge_start_time = std::chrono::high_resolution_clock::now();
     std::unique_ptr<float[]> depthBuffer = 
@@ -936,11 +836,11 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     auto merge_end_time = std::chrono::high_resolution_clock::now();
     auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         merge_end_time - merge_start_time);
-    
+
     auto total_end_time = std::chrono::high_resolution_clock::now();
     auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         total_end_time - total_start_time);
-    
+
     // 填充统计信息
     stats.setup_ms = setup_duration.count() / 1000.0;
     stats.binning_ms = binning_duration.count() / 1000.0;
@@ -948,7 +848,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     stats.rasterization_ms = rasterization_duration.count() / 1000.0;
     stats.merge_ms = merge_duration.count() / 1000.0;
     stats.total_ms = total_duration.count() / 1000.0;
-    
+
     return stats;
 }
 

From 05492110e6214826c41bfb837e91aaf29b938abc Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Sun, 7 Sep 2025 20:13:37 +0800
Subject: [PATCH 13/24] TBR: Use global framebuffer to avoid merge overhead

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/rasterizer.cpp | 66 -----------------------------------------
 src/renderer.cpp   | 74 +++++++++++++---------------------------------
 2 files changed, 21 insertions(+), 119 deletions(-)

diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 1ee2fff..f4e251c 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -92,72 +92,6 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
   return fragments;
 }
 
-void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
-                             int x0, int y0, int x1, int y1,
-                             std::vector<Fragment>& out) {
-  // 获取三角形的最小 box（屏幕空间）
-  Vector2f a = Vector2f(v0.GetPosition().x, v0.GetPosition().y);
-  Vector2f b = Vector2f(v1.GetPosition().x, v1.GetPosition().y);
-  Vector2f c = Vector2f(v2.GetPosition().x, v2.GetPosition().y);
-
-  Vector2f bboxMin =
-      Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
-  Vector2f bboxMax =
-      Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
-
-  // Clamp 到屏幕尺寸
-  float minx = std::max(0.0f, bboxMin.x);
-  float miny = std::max(0.0f, bboxMin.y);
-  float maxx = std::min(float(width_ - 1), bboxMax.x);
-  float maxy = std::min(float(height_ - 1), bboxMax.y);
-
-  // 与外部提供的裁剪区域（半开区间）相交，转成闭区间扫描
-  int sx = std::max(x0, int(std::floor(minx)));
-  int sy = std::max(y0, int(std::floor(miny)));
-  int ex = std::min(x1 - 1, int(std::floor(maxx)));
-  int ey = std::min(y1 - 1, int(std::floor(maxy)));
-
-  if (sx > ex || sy > ey) {
-    return;  // 与裁剪区域无交
-  }
-
-  // 透视矫正插值使用与 Rasterize 相同逻辑，但单线程写入 out
-  float w0_inv = v0.GetPosition().w;
-  float w1_inv = v1.GetPosition().w;
-  float w2_inv = v2.GetPosition().w;
-
-  for (int x = sx; x <= ex; ++x) {
-    for (int y = sy; y <= ey; ++y) {
-      auto [is_inside, barycentric_coord] = GetBarycentricCoord(
-          v0.GetPosition(), v1.GetPosition(), v2.GetPosition(),
-          Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
-      if (!is_inside) continue;
-
-      // 插值 1/w 并进行透视矫正
-      float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord);
-      Vector3f corrected_bary(
-          barycentric_coord.x * w0_inv / w_inv_interpolated,
-          barycentric_coord.y * w1_inv / w_inv_interpolated,
-          barycentric_coord.z * w2_inv / w_inv_interpolated);
-
-      auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z,
-                           v2.GetPosition().z, corrected_bary);
-
-      Fragment fragment;
-      fragment.screen_coord = {x, y};
-      fragment.normal = Interpolate(v0.GetNormal(), v1.GetNormal(),
-                                    v2.GetNormal(), corrected_bary);
-      fragment.uv = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(),
-                                v2.GetTexCoords(), corrected_bary);
-      fragment.color = InterpolateColor(v0.GetColor(), v1.GetColor(),
-                                        v2.GetColor(), corrected_bary);
-      fragment.depth = z;
-
-      out.push_back(fragment);
-    }
-  }
-}
-
 void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
                              int x0, int y0, int x1, int y1,
                              std::vector<Fragment>& out) {
diff --git a/src/renderer.cpp b/src/renderer.cpp
index 3167cf5..fb57a60 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -751,21 +751,14 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     auto binning_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         binning_end_time - binning_start_time);
 
-    // 3. 为每个线程创建framebuffer
+    // 3. 全局 framebuffer（单份）
+    // 直接让每个 tile 写入这份全局缓冲区，避免末端 O(W*H*kNProc) 合并开销
     auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
-    std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
-    std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
-
-    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-        depthBuffer_all_thread[thread_id] = 
-            std::make_unique<float[]>(width_ * height_);
-        colorBuffer_all_thread[thread_id] = 
-            std::make_unique<uint32_t[]>(width_ * height_);
-
-        std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
-                    std::numeric_limits<float>::infinity());
-        std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
-    }
+    std::unique_ptr<float[]> depthBuffer = std::make_unique<float[]>(width_ * height_);
+    std::unique_ptr<uint32_t[]> colorBuffer = std::make_unique<uint32_t[]>(width_ * height_);
+    // 深度初始化为最远值，颜色清零
+    std::fill_n(depthBuffer.get(), width_ * height_, std::numeric_limits<float>::infinity());
+    std::fill_n(colorBuffer.get(), width_ * height_, 0);
     auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
     auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         buffer_alloc_end_time - buffer_alloc_start_time);
@@ -774,14 +767,12 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     auto rasterization_start_time = std::chrono::high_resolution_clock::now();
 #pragma omp parallel num_threads(kNProc) default(none) \
     shared(tile_triangles, rasterizer_, shader_, width_, height_, \
-           depthBuffer_all_thread, colorBuffer_all_thread, tiles_x, tiles_y, total_tiles, \
+           depthBuffer, colorBuffer, tiles_x, tiles_y, total_tiles, \
            early_z_enabled_, soa)
     {
         int thread_id = omp_get_thread_num();
-        auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
-        auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
 
-        // 为当前线程创建tile局部缓冲区
+        // 为当前线程创建 tile 局部缓冲区（避免在全局缓冲上直接逐像素竞争）
         std::unique_ptr<float[]> tile_depth_buffer = 
             std::make_unique<float[]>(TILE_SIZE * TILE_SIZE);
         std::unique_ptr<uint32_t[]> tile_color_buffer = 
@@ -794,48 +785,24 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
 #pragma omp for
         for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) {
             // 按照 tile 进行光栅化（SoA）
+            // 直接写入单份全局 framebuffer；不同 tile 不重叠，无需加锁
             RasterizeTile(tile_id, tile_triangles[tile_id],
-                             tiles_x, tiles_y, TILE_SIZE,
-                             tile_depth_buffer.get(), tile_color_buffer.get(),
-                             depthBuffer_per_thread, colorBuffer_per_thread,
-                             soa, early_z_enabled_, &scratch_fragments);
+                          tiles_x, tiles_y, TILE_SIZE,
+                          tile_depth_buffer.get(), tile_color_buffer.get(),
+                          depthBuffer, colorBuffer,
+                          soa, early_z_enabled_, &scratch_fragments);
         }
     }
     auto rasterization_end_time = std::chrono::high_resolution_clock::now();
     auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
         rasterization_end_time - rasterization_start_time);
 
-    // 5. 合并所有线程结果
-    auto merge_start_time = std::chrono::high_resolution_clock::now();
-    std::unique_ptr<float[]> depthBuffer = 
-        std::make_unique<float[]>(width_ * height_);
-    std::unique_ptr<uint32_t[]> colorBuffer = 
-        std::make_unique<uint32_t[]>(width_ * height_);
-
-    std::fill_n(depthBuffer.get(), width_ * height_,
-                std::numeric_limits<float>::infinity());
-    std::fill_n(colorBuffer.get(), width_ * height_, 0);
-
-#pragma omp parallel for
-    for (size_t i = 0; i < width_ * height_; i++) {
-        float min_depth = std::numeric_limits<float>::infinity();
-        uint32_t color = 0;
-
-        for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-            float depth = depthBuffer_all_thread[thread_id][i];
-            if (depth < min_depth) {
-                min_depth = depth;
-                color = colorBuffer_all_thread[thread_id][i];
-            }
-        }
-        depthBuffer[i] = min_depth;
-        colorBuffer[i] = color;
-    }
-
+    // 5. 直接将单份全局 colorBuffer 拷贝到输出
+    auto present_start_time = std::chrono::high_resolution_clock::now();
     std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
-    auto merge_end_time = std::chrono::high_resolution_clock::now();
-    auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        merge_end_time - merge_start_time);
+    auto present_end_time = std::chrono::high_resolution_clock::now();
+    auto present_duration = std::chrono::duration_cast<std::chrono::microseconds>(
+        present_end_time - present_start_time);
 
     auto total_end_time = std::chrono::high_resolution_clock::now();
     auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
@@ -846,7 +813,8 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     stats.binning_ms = binning_duration.count() / 1000.0;
     stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
     stats.rasterization_ms = rasterization_duration.count() / 1000.0;
-    stats.merge_ms = merge_duration.count() / 1000.0;
+    // 合并阶段已被消除，仅为拷贝开销
+    stats.merge_ms = present_duration.count() / 1000.0;
     stats.total_ms = total_duration.count() / 1000.0;
 
     return stats;

From 258607acca4879916af23e4431baa8003a340cf9 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Thu, 11 Sep 2025 14:26:35 +0800
Subject: [PATCH 14/24] TBR: Optimize global buffer write-back logic

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/renderer.cpp | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/renderer.cpp b/src/renderer.cpp
index fb57a60..b449b93 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -536,7 +536,7 @@ void SimpleRenderer::RasterizeTile(
   size_t tile_width = screen_x_end - screen_x_start;
   size_t tile_height = screen_y_end - screen_y_start;
   std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f);
-  std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
+  std::fill_n(tile_color_buffer, tile_width * tile_height, 0); // 默认背景色为0/黑色
 
   for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况
     // 始终走 SoA + 限制矩形的光栅化路径；如未提供 scratch，则使用函数内局部容器
@@ -579,15 +579,21 @@ void SimpleRenderer::RasterizeTile(
   }
 
   // 写回全局缓冲
+  // TBR 下不同 tile 覆盖的屏幕区域互不重叠，且在 tile 内部已通过 Early‑Z
+  // 得出每个像素的最终值。因此可以直接将 tile 行数据拷贝到全局缓冲
   for (size_t y = 0; y < tile_height; y++) {
-    for (size_t x = 0; x < tile_width; x++) {
-      size_t tile_index = x + y * tile_width;
-      size_t global_index = (screen_x_start + x) + (screen_y_start + y) * width_;
-      if (tile_depth_buffer[tile_index] < global_depth_buffer[global_index]) {
-        global_depth_buffer[global_index] = tile_depth_buffer[tile_index];
-        global_color_buffer[global_index] = tile_color_buffer[tile_index];
-      }
-    }
+    const size_t tile_row_off   = y * tile_width;
+    const size_t global_row_off = (screen_y_start + y) * width_ + screen_x_start;
+
+    // 拷贝本行 color 到全局 color
+    std::memcpy(global_color_buffer.get() + global_row_off,
+                tile_color_buffer + tile_row_off,
+                tile_width * sizeof(uint32_t));
+
+    // 拷贝本行 depth 到全局 depth
+    std::memcpy(global_depth_buffer.get() + global_row_off,
+                tile_depth_buffer + tile_row_off,
+                tile_width * sizeof(float));
   }
 }
 

From ffe0d756aee01a2356adf4a5ea904b68a0be3445 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Fri, 12 Sep 2025 20:00:42 +0800
Subject: [PATCH 15/24] Optimize perspective correction, add helper func,
 simplify code

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/rasterizer.hpp |  18 ++++-
 src/include/renderer.h     |  36 +++++++---
 src/include/vertex.hpp     |  43 ++++++++----
 src/include/vertex_soa.hpp |  33 ---------
 src/rasterizer.cpp         |  89 +++++++++++++-----------
 src/renderer.cpp           | 139 ++++++++++++++++++-------------------
 test/system_test/main.cpp  |  14 +---
 7 files changed, 188 insertions(+), 184 deletions(-)
 delete mode 100644 src/include/vertex_soa.hpp

diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp
index 24e4a20..c389f49 100644
--- a/src/include/rasterizer.hpp
+++ b/src/include/rasterizer.hpp
@@ -3,7 +3,7 @@
 
 #include "config.h"
 #include "shader.hpp"
-#include "vertex_soa.hpp"
+#include "vertex.hpp"
 
 namespace simple_renderer {
 
@@ -36,13 +36,25 @@ class Rasterizer {
  private:
   size_t width_, height_;
 
+  // Result of perspective correction: the corrected barycentric coordinate and the depth interpolated with it.
+  struct PerspectiveCorrectionResult {
+    Vector3f corrected_barycentric;
+    float interpolated_z;
+  };
+
+  // Perspective-correction helper: takes the w and z of the three vertices plus the original barycentric coordinate.
+  PerspectiveCorrectionResult PerformPerspectiveCorrection(
+      float w0, float w1, float w2,
+      float z0, float z1, float z2,
+      const Vector3f& original_barycentric) const;
+
   template <typename T>
   T Interpolate(const T& v0, const T& v1, const T& v2,
-                const Vector3f& barycentric_coord);
+                const Vector3f& barycentric_coord) const;
 
   Color InterpolateColor(const Color& color0, const Color& color1,
                          const Color& color2,
-                         const Vector3f& barycentric_coord);
+                         const Vector3f& barycentric_coord) const;
 
   std::pair<bool, Vector3f> GetBarycentricCoord(const Vector3f& p0,
                                                 const Vector3f& p1,
diff --git a/src/include/renderer.h b/src/include/renderer.h
index 56c84c8..f239910 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -20,6 +20,7 @@
 #include <cstdint>
 #include <functional>
 #include <span>
+#include <string>
 
 #include "buffer.hpp"
 #include "light.h"
@@ -38,6 +39,10 @@ enum class RenderingMode {
   DEFERRED      // 延迟渲染模式 - 经典GPU管线教学模拟
 };
 
+// RenderingMode helper function declarations.
+std::string RenderingModeToString(RenderingMode mode);
+std::string RenderingModeToDetailedString(RenderingMode mode);  // NOTE(review): no definition visible next to RenderingModeToString - confirm one exists before calling
+
 
 // SoA 版 tile 列表中的三角形引用（仅存索引与材质指针）
 struct TriangleRef {
@@ -94,11 +99,15 @@ class SimpleRenderer {
   const size_t width_;
   LogSystem log_system_;
   RenderingMode current_mode_;  // 当前渲染模式
-  bool early_z_enabled_;        // Early-Z优化开关
+  bool is_early_z_enabled_;        // Early-Z优化开关
 
   std::shared_ptr<Shader> shader_;
   std::shared_ptr<Rasterizer> rasterizer_;
 
+  // Rendering constants
+  static constexpr float kMinWValue = 1e-6f;      // threshold for the w-component check (avoids division by zero)
+  static constexpr size_t kDefaultTileSize = 64;  // default tile size (64x64 pixels)
+
   /**
    * 执行绘制管线
    * @param model 模型
@@ -125,13 +134,6 @@ class SimpleRenderer {
                                         const std::vector<Vertex> &processedVertices,
                                         uint32_t *buffer);
 
-  /**
-   * Tile-based光栅化渲染
-   * @param model 模型
-   * @param processedVertices 已处理的顶点
-   * @param buffer 输出缓冲区
-   * @return 渲染统计信息
-   */
   struct TileRenderStats {
     double setup_ms;
     double binning_ms;
@@ -167,10 +169,8 @@ class SimpleRenderer {
                                              const std::vector<Vertex> &processedVertices,
                                              uint32_t *buffer);
 
-  
 private:
 
-
   // SoA 版本的 Triangle-Tile binning（两遍计数 + reserve）
   void TriangleTileBinning(
     const Model &model,
@@ -191,7 +191,6 @@ class SimpleRenderer {
     bool use_early_z = false,
     std::vector<Fragment>* scratch_fragments = nullptr);
 
-  
   /**
    * 透视除法 - 将裁剪空间坐标转换为归一化设备坐标(NDC)
    * @param vertex 裁剪空间坐标的顶点
@@ -206,6 +205,21 @@ class SimpleRenderer {
    */
   Vertex ViewportTransformation(const Vertex &vertex);
   
+  /**
+   * Log performance statistics for the traditional rendering pipeline.
+   */
+  void PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const;
+  
+  /**
+   * Log performance statistics for the tile-based rendering pipeline.
+   */
+  void PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const;
+  
+  /**
+   * Log performance statistics for the deferred rendering pipeline.
+   */
+  void PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const;
+  
 };
 }  // namespace simple_renderer
 
diff --git a/src/include/vertex.hpp b/src/include/vertex.hpp
index bff0680..b00f648 100644
--- a/src/include/vertex.hpp
+++ b/src/include/vertex.hpp
@@ -1,6 +1,9 @@
 #ifndef SIMPLERENDER_SRC_INCLUDE_VERTEX_HPP_
 #define SIMPLERENDER_SRC_INCLUDE_VERTEX_HPP_
 
+#include <vector>
+#include <optional>
+
 #include <math.hpp>
 
 #include "color.h"
@@ -31,18 +34,13 @@ class Vertex {
   // 析构函数
   ~Vertex() = default;
 
-  // Constructor with parameters 带参数的构造函数
-  explicit Vertex(const Vector4f& pos, const Vector3f& norm,
-                  const Vector2f& tex, const Color& color_)
-      : position_(pos), normal_(norm), texCoords_(tex), color_(color_),
-        clip_position_(pos), has_clip_position_(false) {}
-        
-  // 扩展构造函数：包含裁剪空间坐标
+  // Constructor with parameters: optional clip space coordinate
+  // 带参数的构造函数：可选的裁剪空间坐标
   explicit Vertex(const Vector4f& pos, const Vector3f& norm,
                   const Vector2f& tex, const Color& color_,
-                  const Vector4f& clip_pos)
+                  std::optional<Vector4f> clip_pos = std::nullopt)
       : position_(pos), normal_(norm), texCoords_(tex), color_(color_),
-        clip_position_(clip_pos), has_clip_position_(true) {}
+        clip_position_(clip_pos) {}
 
   // Transform the vertex with a matrix     使用矩阵变换顶点
   void transform(const Matrix4f& matrix) { position_ = matrix * position_; }
@@ -55,8 +53,8 @@ class Vertex {
   [[nodiscard]] inline Color GetColor() const { return color_; }
   
   // 扩展坐标访问
-  [[nodiscard]] inline Vector4f GetClipPosition() const { return clip_position_; }
-  [[nodiscard]] inline bool HasClipPosition() const { return has_clip_position_; }
+  [[nodiscard]] inline std::optional<Vector4f> GetClipPosition() const { return clip_position_; }
+  [[nodiscard]] inline bool HasClipPosition() const { return clip_position_.has_value(); }
 
  private:
   Vector4f position_;   // 3D position, 3D顶点坐标
@@ -65,8 +63,7 @@ class Vertex {
   Color color_;
   
   // 扩展坐标用于裁剪优化
-  Vector4f clip_position_; // 裁剪空间坐标 (用于视锥体裁剪)
-  bool has_clip_position_; // 是否包含裁剪坐标
+  std::optional<Vector4f> clip_position_; // 裁剪空间坐标 (用于视锥体裁剪)
 };
 
 inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) {
@@ -75,6 +72,26 @@ inline Vertex operator*(const Matrix4f& matrix, const Vertex& vertex) {
                 vertex.GetColor());
 }
 
+// Minimal structure-of-arrays vertex layout for the TBR pipeline: one parallel vector per attribute.
+struct VertexSoA {
+  // Screen-space coordinates (after the viewport transform)
+  std::vector<Vector4f> pos_screen;  // screen space position (x,y,z,w)
+  // Clip-space coordinates (used for frustum culling): clip = MVP * pos
+  std::vector<Vector4f> pos_clip;
+  std::vector<Vector3f> normal;
+  std::vector<Vector2f> uv;
+  std::vector<Color>    color;
+
+  inline size_t size() const { return pos_screen.size(); }  // element count is taken from pos_screen
+  inline void resize(size_t n) {  // resizes every attribute vector to n
+    pos_screen.resize(n);
+    pos_clip.resize(n);
+    normal.resize(n);
+    uv.resize(n);
+    color.resize(n);
+  }
+};
+
 }  // namespace simple_renderer
 
 #endif
\ No newline at end of file
diff --git a/src/include/vertex_soa.hpp b/src/include/vertex_soa.hpp
deleted file mode 100644
index 4c5806a..0000000
--- a/src/include/vertex_soa.hpp
+++ /dev/null
@@ -1,33 +0,0 @@
-// Minimal SoA layout for TBR pipeline (Phase 1)
-#ifndef SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_
-#define SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_
-
-#include <vector>
-
-#include "math.hpp"
-#include "color.h"
-
-namespace simple_renderer {
-
-struct VertexSoA {
-  // 屏幕空间坐标（视口变换后）
-  std::vector<Vector4f> pos_screen;  // screen space position (x,y,z,w)
-  // 裁剪空间坐标（用于视锥体剔除）：clip = MVP * pos
-  std::vector<Vector4f> pos_clip;
-  std::vector<Vector3f> normal;
-  std::vector<Vector2f> uv;
-  std::vector<Color>    color;
-
-  inline size_t size() const { return pos_screen.size(); }
-  inline void resize(size_t n) {
-    pos_screen.resize(n);
-    pos_clip.resize(n);
-    normal.resize(n);
-    uv.resize(n);
-    color.resize(n);
-  }
-};
-
-}  // namespace simple_renderer
-
-#endif  // SIMPLERENDER_SRC_INCLUDE_VERTEX_SOA_HPP_
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index f4e251c..a30101b 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -50,24 +50,13 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
         }
 
         // 透视矫正插值
-        // 1. 获取三个顶点的1/w值
-        float w0_inv = v0.GetPosition().w;
-        float w1_inv = v1.GetPosition().w;  
-        float w2_inv = v2.GetPosition().w;
+        auto perspective_result = PerformPerspectiveCorrection(
+            v0.GetPosition().w, v1.GetPosition().w, v2.GetPosition().w,
+            v0.GetPosition().z, v1.GetPosition().z, v2.GetPosition().z,
+            barycentric_coord);
         
-        // 2. 插值1/w
-        float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, barycentric_coord);
-        
-        // 3. 计算透视矫正的重心坐标
-        Vector3f corrected_bary(
-          barycentric_coord.x * w0_inv / w_inv_interpolated,
-          barycentric_coord.y * w1_inv / w_inv_interpolated,
-          barycentric_coord.z * w2_inv / w_inv_interpolated
-        );
-        
-        // 4. 使用矫正的重心坐标进行插值
-        auto z = Interpolate(v0.GetPosition().z, v1.GetPosition().z,
-                             v2.GetPosition().z, corrected_bary);
+        const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
+        float z = perspective_result.interpolated_z;
 
 
         Fragment fragment;
@@ -114,17 +103,12 @@ void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t
   float maxy = std::min(float(height_ - 1), bboxMax.y);
 
   // 与外部提供的裁剪区域相交（半开区间） -> 闭区间扫描
-  int sx = std::max(x0, int(std::floor(minx)));
-  int sy = std::max(y0, int(std::floor(miny)));
-  int ex = std::min(x1 - 1, int(std::floor(maxx)));
-  int ey = std::min(y1 - 1, int(std::floor(maxy)));
+  int sx = std::max(x0, static_cast<int>(std::floor(minx)));
+  int sy = std::max(y0, static_cast<int>(std::floor(miny)));
+  int ex = std::min(x1 - 1, static_cast<int>(std::floor(maxx)));
+  int ey = std::min(y1 - 1, static_cast<int>(std::floor(maxy)));
   if (sx > ex || sy > ey) return;
 
-  // 透视矫正插值依赖 w
-  float w0_inv = p0.w;
-  float w1_inv = p1.w;
-  float w2_inv = p2.w;
-
   for (int x = sx; x <= ex; ++x) {
     for (int y = sy; y <= ey; ++y) {
       auto [is_inside, bary] = GetBarycentricCoord(
@@ -132,21 +116,21 @@ void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t
           Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
       if (!is_inside) continue;
 
-      float w_inv_interp = Interpolate(w0_inv, w1_inv, w2_inv, bary);
-      Vector3f cb(
-          bary.x * w0_inv / w_inv_interp,
-          bary.y * w1_inv / w_inv_interp,
-          bary.z * w2_inv / w_inv_interp);
+      // 透视矫正插值
+      auto perspective_result = PerformPerspectiveCorrection(
+          p0.w, p1.w, p2.w,
+          p0.z, p1.z, p2.z,
+          bary);
+      
+      const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
+      float z = perspective_result.interpolated_z;
 
-      float z = Interpolate(p0.z, p1.z, p2.z, cb);
-
-      Fragment frag;
+      Fragment frag; // Note: material 指针由调用方填写
       frag.screen_coord = {x, y};
-      frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], cb);
-      frag.uv     = Interpolate(soa.uv[i0],     soa.uv[i1],     soa.uv[i2],     cb);
-      frag.color  = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], cb);
+      frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], corrected_bary);
+      frag.uv     = Interpolate(soa.uv[i0],     soa.uv[i1],     soa.uv[i2],     corrected_bary);
+      frag.color  = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], corrected_bary);
       frag.depth  = z;
-      // material 指针由调用方填写
 
       out.push_back(frag);
     }
@@ -182,14 +166,14 @@ std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
  
 template <typename T>
 T Rasterizer::Interpolate(const T& v0, const T& v1, const T& v2,
-                          const Vector3f& barycentric_coord) {
+                          const Vector3f& barycentric_coord) const {
   return v0 * barycentric_coord.x + v1 * barycentric_coord.y +
          v2 * barycentric_coord.z;
 }
 
 Color Rasterizer::InterpolateColor(const Color& color0, const Color& color1,
                                    const Color& color2,
-                                   const Vector3f& barycentric_coord) {
+                                   const Vector3f& barycentric_coord) const {
   auto color_r = FloatToUint8_t(
       static_cast<float>(color0[Color::kColorIndexRed]) * barycentric_coord.x +
       static_cast<float>(color1[Color::kColorIndexRed]) * barycentric_coord.y +
@@ -208,6 +192,31 @@ Color Rasterizer::InterpolateColor(const Color& color0, const Color& color1,
   return Color(color_r, color_g, color_b);
 }
 
+// Perspective-correction helper: under perspective projection 1/w is linear in screen space, so 1/w is interpolated first and the result is used to correct the other attributes.
+Rasterizer::PerspectiveCorrectionResult Rasterizer::PerformPerspectiveCorrection(
+    float w0, float w1, float w2,
+    float z0, float z1, float z2,
+    const Vector3f& original_barycentric) const {
+  // NOTE(review): nothing here guards against w == 0 or a zero interpolated 1/w; both divisions assume non-zero values.
+  // 1. Interpolate 1/w (note: w0, w1, w2 are passed in as raw w values, so their reciprocals are taken first).
+  float w0_inv = 1.0f / w0;
+  float w1_inv = 1.0f / w1;
+  float w2_inv = 1.0f / w2;
+  float w_inv_interpolated = Interpolate(w0_inv, w1_inv, w2_inv, original_barycentric);
+  
+  // 2. Compute the perspective-corrected barycentric coordinate.
+  Vector3f corrected_barycentric(
+      original_barycentric.x * w0_inv / w_inv_interpolated,
+      original_barycentric.y * w1_inv / w_inv_interpolated,
+      original_barycentric.z * w2_inv / w_inv_interpolated
+  );
+  
+  // 3. Interpolate the depth value with the corrected barycentric coordinate.
+  float interpolated_z = Interpolate(z0, z1, z2, corrected_barycentric);
+  
+  return {corrected_barycentric, interpolated_z};
+}
+
 // Calculate the normal vector based on the vertices
 // 根据顶点计算法向量
 Vector3f Rasterizer::CalculateNormal(const Vector3f& v0, const Vector3f& v1,
diff --git a/src/renderer.cpp b/src/renderer.cpp
index b449b93..f6e5984 100755
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -33,12 +33,23 @@
 
 namespace simple_renderer {
 
+// Returns the upper-case name of a rendering mode ("UNKNOWN" for values outside the enum).
+std::string RenderingModeToString(RenderingMode mode) {
+  switch (mode) {
+    case RenderingMode::TRADITIONAL: return "TRADITIONAL";
+    case RenderingMode::TILE_BASED:  return "TILE_BASED";
+    case RenderingMode::DEFERRED:    return "DEFERRED";
+  }
+  // No default label, so the compiler can still warn when an enumerator is added.
+  // This return keeps control from running off the end of a non-void function (UB).
+  return "UNKNOWN";
+}
 SimpleRenderer::SimpleRenderer(size_t width, size_t height)
     : height_(height),
       width_(width),
       log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)),
       current_mode_(RenderingMode::TILE_BASED),
-      early_z_enabled_(true) {
+      is_early_z_enabled_(true) {
   rasterizer_ = std::make_shared<Rasterizer>(width, height);
 }
 
@@ -52,19 +63,7 @@ bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader,
 
 void SimpleRenderer::SetRenderingMode(RenderingMode mode) {
   current_mode_ = mode;
-  std::string mode_name;
-  switch(mode) {
-    case RenderingMode::TRADITIONAL:
-      mode_name = "TRADITIONAL";
-      break;
-    case RenderingMode::TILE_BASED:
-      mode_name = "TILE_BASED";
-      break;
-    case RenderingMode::DEFERRED:
-      mode_name = "DEFERRED";
-      break;
-  }
-  SPDLOG_INFO("rendering mode set to: {}", mode_name);
+  SPDLOG_INFO("rendering mode set to: {}", RenderingModeToString(mode));
 }
 
 RenderingMode SimpleRenderer::GetRenderingMode() const {
@@ -79,19 +78,8 @@ fragments—resulting in faster rendering.
 通过在光栅化过程中执行深度测试，仅保留每个像素的深度值最近的片段，避免存储所有片段，从而优化性能，实现更快的渲染。
 */
 void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
-  std::string mode_name;
-  switch(current_mode_) {
-    case RenderingMode::TRADITIONAL:
-      mode_name = "TRADITIONAL";
-      break;
-    case RenderingMode::TILE_BASED:
-      mode_name = "TILE_BASED";
-      break;
-    case RenderingMode::DEFERRED:
-      mode_name = "DEFERRED";
-      break;
-  }
-  SPDLOG_INFO("execute draw pipeline for {} using {} mode", model.GetModelPath(), mode_name);
+  SPDLOG_INFO("execute draw pipeline for {} using {} mode", 
+              model.GetModelPath(), RenderingModeToString(current_mode_));
   
   if (!shader_) {
     SPDLOG_ERROR("No shader set for DrawModel, cannot render");
@@ -147,47 +135,19 @@ void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
   switch (current_mode_) {
     case RenderingMode::TRADITIONAL: {
       auto stats = ExecuteTraditionalPipeline(model, processedVertices, buffer);
-      double total_ms = vertex_ms + stats.total_ms;
-      
-      SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ===");
-      SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
-      SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
-      SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
-      SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
-      SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
-      SPDLOG_INFO("==========================================");
+      PrintTraditionalStats(vertex_ms, stats);
       break;
     }
     
     case RenderingMode::TILE_BASED: {
       auto stats = ExecuteTileBasedPipeline(model, processedSoA, buffer);
-      double total_ms = vertex_ms + stats.total_ms;
-      
-      SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ===");
-      SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
-      SPDLOG_INFO("Setup:            {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100);
-      SPDLOG_INFO("Binning:          {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100);
-      SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
-      SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
-      SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
-      SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
-      SPDLOG_INFO("==========================================");
+      PrintTileBasedStats(vertex_ms, stats);
       break;
     }
     
     case RenderingMode::DEFERRED: {
       auto stats = ExecuteDeferredPipeline(model, processedVertices, buffer);
-      double total_ms = vertex_ms + stats.total_ms;
-      
-      SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ===");
-      SPDLOG_INFO("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
-      SPDLOG_INFO("Buffer Alloc:         {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
-      SPDLOG_INFO("Rasterization:        {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
-      SPDLOG_INFO("Fragment Collection:  {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100);
-      SPDLOG_INFO("Fragment Merge:       {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100);
-      SPDLOG_INFO("Deferred Shading:     {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100);
-      SPDLOG_INFO("Total:                {:8.3f} ms", total_ms);
-      SPDLOG_INFO("=========================================");
+      PrintDeferredStats(vertex_ms, stats);
       break;
     }
   }
@@ -224,7 +184,7 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
   std::vector<Material> material_cache;
   material_cache.reserve(model.GetFaces().size());
   for (const auto &f : model.GetFaces()) {
-    material_cache.push_back(f.GetMaterial()); // 值拷贝
+    material_cache.emplace_back(f.GetMaterial()); // 值拷贝
   }
   auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
   auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
@@ -234,8 +194,8 @@ SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
   /* * * Rasterization * * */
   auto rasterization_start_time = std::chrono::high_resolution_clock::now();
 #pragma omp parallel num_threads(kNProc) default(none)                       \
-    shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \
-               height_, material_cache) firstprivate(model)
+  shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \
+               height_, material_cache, model)
   {
     int thread_id = omp_get_thread_num();
     auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id];
@@ -349,7 +309,7 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
   Vector4f position = vertex.GetPosition();
   
   // 检查w分量，避免除零和负数问题
-  if (position.w <= 1e-6f) {
+  if (position.w <= kMinWValue) {
     Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f);
     return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
   }
@@ -370,11 +330,7 @@ Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
   ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f);
   
   // 创建新的顶点，保持其他属性和裁剪空间坐标不变
-  if (vertex.HasClipPosition()) {
-    return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition());
-  } else {
-    return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
-  }
+  return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition());
 }
 
 Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) {
@@ -629,8 +585,7 @@ SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline(
     auto raster_start_time = std::chrono::high_resolution_clock::now();
 #pragma omp parallel num_threads(kNProc) default(none) \
     shared(processedVertices, rasterizer_, shader_, width_, height_, \
-           depthBuffer_all_thread, colorBuffer_all_thread) \
-    firstprivate(model)
+           depthBuffer_all_thread, colorBuffer_all_thread, model)
     {
         int thread_id = omp_get_thread_num();
         auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
@@ -739,7 +694,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
 
     // 1. Setup阶段
     auto setup_start_time = std::chrono::high_resolution_clock::now();
-    const size_t TILE_SIZE = 64; // 64x64 pixels per tile
+    const size_t TILE_SIZE = kDefaultTileSize; // Default tile size per tile
     const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE;
     const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE;
     const size_t total_tiles = tiles_x * tiles_y;
@@ -774,7 +729,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
 #pragma omp parallel num_threads(kNProc) default(none) \
     shared(tile_triangles, rasterizer_, shader_, width_, height_, \
            depthBuffer, colorBuffer, tiles_x, tiles_y, total_tiles, \
-           early_z_enabled_, soa)
+           is_early_z_enabled_, soa)
     {
         int thread_id = omp_get_thread_num();
 
@@ -796,7 +751,7 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
                           tiles_x, tiles_y, TILE_SIZE,
                           tile_depth_buffer.get(), tile_color_buffer.get(),
                           depthBuffer, colorBuffer,
-                          soa, early_z_enabled_, &scratch_fragments);
+                          soa, is_early_z_enabled_, &scratch_fragments);
         }
     }
     auto rasterization_end_time = std::chrono::high_resolution_clock::now();
@@ -826,4 +781,44 @@ SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
     return stats;
 }
 
+void SimpleRenderer::PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const {
+  double total_ms = vertex_ms + stats.total_ms;
+  // Log each stage's time and its share of total_ms. NOTE(review): the percentages divide by total_ms; confirm it cannot be 0.
+  SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ===");
+  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
+  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
+  SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
+  SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
+  SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
+  SPDLOG_INFO("==========================================");
+}
+
+void SimpleRenderer::PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const {
+  double total_ms = vertex_ms + stats.total_ms;
+  // Log each stage's time and its share of total_ms. NOTE(review): the percentages divide by total_ms; confirm it cannot be 0.
+  SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ===");
+  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
+  SPDLOG_INFO("Setup:            {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100);
+  SPDLOG_INFO("Binning:          {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100);
+  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
+  SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
+  SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
+  SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
+  SPDLOG_INFO("==========================================");
+}
+
+void SimpleRenderer::PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const {
+  double total_ms = vertex_ms + stats.total_ms;
+  // Log each stage's time and its share of total_ms. NOTE(review): the percentages divide by total_ms; confirm it cannot be 0.
+  SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ===");
+  SPDLOG_INFO("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
+  SPDLOG_INFO("Buffer Alloc:         {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
+  SPDLOG_INFO("Rasterization:        {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
+  SPDLOG_INFO("Fragment Collection:  {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100);
+  SPDLOG_INFO("Fragment Merge:       {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100);
+  SPDLOG_INFO("Deferred Shading:     {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100);
+  SPDLOG_INFO("Total:                {:8.3f} ms", total_ms);
+  SPDLOG_INFO("=========================================");
+}
+
 }  // namespace simple_renderer
diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index 0f222b5..9725181 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -84,18 +84,8 @@ int main(int argc, char **argv) {
   simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
   
   // 输出当前渲染模式
-  std::string current_mode_name;
-  switch(simple_renderer.GetRenderingMode()) {
-    case simple_renderer::RenderingMode::TRADITIONAL:
-      current_mode_name = "TRADITIONAL (传统光栅化)";
-      break;
-    case simple_renderer::RenderingMode::TILE_BASED:
-      current_mode_name = "TILE_BASED (基于Tile光栅化)";
-      break;
-    case simple_renderer::RenderingMode::DEFERRED:
-      current_mode_name = "DEFERRED (模仿GPU的延迟渲染)";
-      break;
-  }
+  std::string current_mode_name = simple_renderer::RenderingModeToString(
+      simple_renderer.GetRenderingMode());
   SPDLOG_INFO("当前渲染模式: {}", current_mode_name);
 
   auto display = Display(kWidth, kHeight);

From 957c9b0e10cc93f83f931ec29f08fdd0d014489d Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Fri, 12 Sep 2025 23:24:44 +0800
Subject: [PATCH 16/24] Refactor: Extract pipeline into standalone class;
 rename TraditionalPipeline to PerTriangle for consistency with TileBased.
 Switch core function comments to Doxygen style.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/face.hpp                          |   2 +-
 src/include/rasterizer.hpp                    |  43 +-
 src/include/renderer.h                        | 215 ++---
 src/include/renderers/deferred_renderer.hpp   |  31 +
 .../renderers/per_triangle_renderer.hpp       |  28 +
 src/include/renderers/renderer_base.hpp       |  66 ++
 src/include/renderers/tile_based_renderer.hpp | 110 +++
 src/rasterizer.cpp                            |  56 ++
 src/renderer.cpp                              | 821 +-----------------
 src/renderers/deferred_renderer.cpp           | 146 ++++
 src/renderers/per_triangle_renderer.cpp       | 172 ++++
 src/renderers/renderer_base.cpp               |  44 +
 src/renderers/tile_based_renderer.cpp         | 366 ++++++++
 test/system_test/main.cpp                     |   2 +-
 14 files changed, 1151 insertions(+), 951 deletions(-)
 create mode 100644 src/include/renderers/deferred_renderer.hpp
 create mode 100644 src/include/renderers/per_triangle_renderer.hpp
 create mode 100644 src/include/renderers/renderer_base.hpp
 create mode 100644 src/include/renderers/tile_based_renderer.hpp
 mode change 100755 => 100644 src/renderer.cpp
 create mode 100644 src/renderers/deferred_renderer.cpp
 create mode 100644 src/renderers/per_triangle_renderer.cpp
 create mode 100644 src/renderers/renderer_base.cpp
 create mode 100644 src/renderers/tile_based_renderer.cpp

diff --git a/src/include/face.hpp b/src/include/face.hpp
index 28a5b30..49f0754 100644
--- a/src/include/face.hpp
+++ b/src/include/face.hpp
@@ -40,7 +40,7 @@ class Face {
   // Get functions
   // 获取函数
   inline const std::array<size_t, 3>& GetIndices() const { return indices_; }
-  inline const size_t GetIndex(size_t index) const { return indices_[index]; }
+  inline size_t GetIndex(size_t index) const { return indices_[index]; }
   inline const Material& GetMaterial() const { return material_; }
 
  private:
diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp
index c389f49..cd0b349 100644
--- a/src/include/rasterizer.hpp
+++ b/src/include/rasterizer.hpp
@@ -16,19 +16,54 @@ class Rasterizer {
   auto operator=(Rasterizer&& rasterizer) -> Rasterizer& = default;
   ~Rasterizer() = default;
 
+  /**
+   * @brief Construct a rasterizer with the given dimensions
+   * @param width Rasterizer width
+   * @param height Rasterizer height
+   */
   Rasterizer(size_t width, size_t height);
 
+  /**
+   * @brief Rasterize a triangle and produce a list of fragments
+   * @param v0 First vertex of the triangle
+   * @param v1 Second vertex of the triangle
+   * @param v2 Third vertex of the triangle
+   * @return Vector of generated fragments
+   */
   std::vector<Fragment> Rasterize(const Vertex& v0, const Vertex& v1,
                                   const Vertex& v2);
 
-  // 非分配版本：将片段直接写入调用方提供的容器
-  // 可选的裁剪区域为半开区间 [x0, x1) × [y0, y1)
-  // 用于 TBR：将光栅化限制在 tile 边界内，便于复用外部 scratch 容器
+  /**
+   * @brief Non-allocating variant: writes fragments directly into a caller-provided container
+   *
+   * The clip region is the half-open range [x0, x1) × [y0, y1).
+   * Used by TBR: restricts rasterization to the tile bounds so an external scratch container can be reused.
+   *
+   * @param v0 First vertex of the triangle
+   * @param v1 Second vertex of the triangle
+   * @param v2 Third vertex of the triangle
+   * @param x0 Left edge of the clip region (inclusive)
+   * @param y0 Top edge of the clip region (inclusive)
+   * @param x1 Right edge of the clip region (exclusive)
+   * @param y1 Bottom edge of the clip region (exclusive)
+   * @param out Output fragment container
+   */
   void RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
                    int x0, int y0, int x1, int y1,
                    std::vector<Fragment>& out);
 
-  // SoA 版本：按顶点索引从 SoA 读取三角形三顶点
+  /**
+   * @brief SoA variant: reads the triangle's three vertices from the SoA by vertex index
+   * @param soa Vertex data in structure-of-arrays (SoA) layout
+   * @param i0 Index of the triangle's first vertex
+   * @param i1 Index of the triangle's second vertex
+   * @param i2 Index of the triangle's third vertex
+   * @param x0 Left edge of the clip region (inclusive)
+   * @param y0 Top edge of the clip region (inclusive)
+   * @param x1 Right edge of the clip region (exclusive)
+   * @param y1 Bottom edge of the clip region (exclusive)
+   * @param out Output fragment container
+   */
   void RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
                    int x0, int y0, int x1, int y1,
                    std::vector<Fragment>& out);
diff --git a/src/include/renderer.h b/src/include/renderer.h
index f239910..e11c93f 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -18,208 +18,95 @@
 #define SIMPLERENDER_SRC_INCLUDE_RENDERER_H_
 
 #include <cstdint>
-#include <functional>
-#include <span>
+#include <memory>
 #include <string>
 
-#include "buffer.hpp"
-#include "light.h"
 #include "log_system.h"
-#include "math.hpp"
 #include "model.hpp"
-#include "rasterizer.hpp"
 #include "shader.hpp"
+#include "renderers/renderer_base.hpp"
 
 namespace simple_renderer {
 
 // 渲染模式枚举
+/**
+ * @brief Rendering mode
+ * - PER_TRIANGLE: per-triangle (triangle-major) forward rendering
+ * - TILE_BASED: tile-based (tile-major) forward rendering
+ * - DEFERRED: deferred rendering (fragments are collected first, then shaded)
+ */
 enum class RenderingMode {
-  TRADITIONAL,  // 传统光栅化模式 - 立即深度测试
-  TILE_BASED,   // Tile-based光栅化模式 - 移动GPU架构
-  DEFERRED      // 延迟渲染模式 - 经典GPU管线教学模拟
+  PER_TRIANGLE,  //!< per-triangle (triangle-major)
+  TILE_BASED,    //!< tile-based (tile-major)
+  DEFERRED       //!< deferred rendering
 };
 
-// RenderingMode辅助函数声明
+/**
+ * @brief Convert a rendering-mode enum value to a readable string
+ * @param mode Rendering mode
+ * @return Readable string (PER_TRIANGLE/TILE_BASED/DEFERRED)
+ */
 std::string RenderingModeToString(RenderingMode mode);
-std::string RenderingModeToDetailedString(RenderingMode mode);
-
-
-// SoA 版 tile 列表中的三角形引用（仅存索引与材质指针）
-struct TriangleRef {
-  size_t i0, i1, i2;
-  const Material* material = nullptr;
-  size_t face_index = 0;
-};
 
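+/**
+ * @brief Rendering facade
+ *
+ * Responsibilities:
+ * - acts only as the mode selector and call entry point;
+ * - constructs and owns the concrete renderer according to `RenderingMode`;
+ * - exposes a single unified `DrawModel` interface.
+ */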
 class SimpleRenderer {
  public:
   /**
-   * 构造函数
-   * @param width
-   * @param height
-   * @param buffer 要进行绘制的内存区域，大小为 width*height*sizeof(uint32_t)
-   * @param
+   * @brief Construct the renderer facade
+   * @param width Canvas width (pixels)
+   * @param height Canvas height (pixels)
    */
   SimpleRenderer(size_t width, size_t height);
-
-  /// @name 默认构造/析构函数
-  /// @{
-  SimpleRenderer(const SimpleRenderer &_simplerenderer) = default;
-  SimpleRenderer(SimpleRenderer &&_simplerenderer) = default;
-  auto operator=(const SimpleRenderer &_simplerenderer) -> SimpleRenderer & =
-                                                               default;
-  auto operator=(SimpleRenderer &&_simplerenderer) -> SimpleRenderer & =
-                                                          default;
-  virtual ~SimpleRenderer() = default;
-  /// @}
+  ~SimpleRenderer() = default;
 
   /**
-   * 绘制单个模型
-   * @param model 要绘制的模型
-   * @param shader 用于渲染的着色器
-   * @param buffer 输出缓冲区
-   * @return 绘制是否成功
+   * @brief Draw a single model
+   * @param model Model
+   * @param shader Shader (including uniforms)
+   * @param buffer Output color buffer (width*height)
+   * @return Whether drawing succeeded
    */
   bool DrawModel(const Model &model, const Shader &shader, uint32_t *buffer);
 
   /**
-   * 设置渲染模式
-   * @param mode 渲染模式（传统或基于Tile）
+   * @brief Set the rendering mode
    */
   void SetRenderingMode(RenderingMode mode);
-
   /**
-   * 获取当前渲染模式
-   * @return 当前渲染模式
+   * @brief Get the current rendering mode
    */
   RenderingMode GetRenderingMode() const;
 
- private:
-  const size_t height_;
-  const size_t width_;
-  LogSystem log_system_;
-  RenderingMode current_mode_;  // 当前渲染模式
-  bool is_early_z_enabled_;        // Early-Z优化开关
-
-  std::shared_ptr<Shader> shader_;
-  std::shared_ptr<Rasterizer> rasterizer_;
-
-  // Rendering constants
-  static constexpr float kMinWValue = 1e-6f;      // W分量检查阈值（避免除零）
-  static constexpr size_t kDefaultTileSize = 64;  // 默认Tile大小（64x64像素）
-
+  // Optional configuration (only affects TileBasedRenderer; changing it at runtime rebuilds the TBR instance)
   /**
-   * 执行绘制管线
-   * @param model 模型
-   * @param buffer 输出缓冲区
+   * @brief Enable or disable Early-Z (TBR only)
    */
-  void ExecuteDrawPipeline(const Model &model, uint32_t *buffer);
-  
-
+  void SetEarlyZEnabled(bool enabled);
   /**
-   * 传统光栅化渲染
-   * @param model 模型
-   * @param processedVertices 已处理的顶点
-   * @param buffer 输出缓冲区
-   * @return 渲染统计信息
+   * @brief Set the tile size (TBR only)
    */
-  struct RenderStats {
-    double buffer_alloc_ms;
-    double rasterization_ms;
-    double merge_ms;
-    double total_ms;
-  };
-  
-  RenderStats ExecuteTraditionalPipeline(const Model &model, 
-                                        const std::vector<Vertex> &processedVertices,
-                                        uint32_t *buffer);
+  void SetTileSize(size_t tile_size);
 
-  struct TileRenderStats {
-    double setup_ms;
-    double binning_ms;
-    double buffer_alloc_ms;
-    double rasterization_ms;
-    double merge_ms;
-    double total_ms;
-  };
-  
-  /**
-   * 延迟渲染统计信息
-   */
-  struct DeferredRenderStats {
-    double buffer_alloc_ms;
-    double rasterization_ms;
-    double fragment_collection_ms;
-    double fragment_merge_ms;
-    double deferred_shading_ms;
-    double total_ms;
-  };
-  TileRenderStats ExecuteTileBasedPipeline(const Model &model,
-                                              const VertexSoA &soa,
-                                              uint32_t *buffer);
-
-  /**
-   * 延迟渲染管线
-   * @param model 模型
-   * @param processedVertices 已处理的顶点
-   * @param buffer 输出缓冲区
-   * @return 渲染统计信息
-   */
-  DeferredRenderStats ExecuteDeferredPipeline(const Model &model,
-                                             const std::vector<Vertex> &processedVertices,
-                                             uint32_t *buffer);
-
-private:
-
-  // SoA 版本的 Triangle-Tile binning（两遍计数 + reserve）
-  void TriangleTileBinning(
-    const Model &model,
-    const VertexSoA &soa,
-    std::vector<std::vector<TriangleRef>> &tile_triangles,
-    size_t tiles_x, size_t tiles_y, size_t tile_size);
-
-
-  // SoA 版本的 tile 光栅化
-  void RasterizeTile(
-    size_t tile_id,
-    const std::vector<TriangleRef> &triangles,
-    size_t tiles_x, size_t tiles_y, size_t tile_size,
-    float* tile_depth_buffer, uint32_t* tile_color_buffer,
-    std::unique_ptr<float[]> &global_depth_buffer,
-    std::unique_ptr<uint32_t[]> &global_color_buffer,
-    const VertexSoA &soa,
-    bool use_early_z = false,
-    std::vector<Fragment>* scratch_fragments = nullptr);
+ private:
+  void EnsureRenderer();
 
-  /**
-   * 透视除法 - 将裁剪空间坐标转换为归一化设备坐标(NDC)
-   * @param vertex 裁剪空间坐标的顶点
-   * @return 转换后的顶点(NDC坐标)
-   */
-  Vertex PerspectiveDivision(const Vertex &vertex);
+ private:
+  const size_t height_;
+  const size_t width_;
+  LogSystem log_system_;
+  RenderingMode current_mode_;
+  std::unique_ptr<RendererBase> renderer_;
 
-  /**
-   * 视口变换 - 将NDC坐标转换为屏幕坐标
-   * @param vertex NDC坐标的顶点
-   * @return 转换后的顶点(屏幕坐标)
-   */
-  Vertex ViewportTransformation(const Vertex &vertex);
-  
-  /**
-   * 打印传统渲染性能统计信息
-   */
-  void PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const;
-  
-  /**
-   * 打印基于Tile渲染性能统计信息
-   */
-  void PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const;
-  
-  /**
-   * 打印延迟渲染性能统计信息
-   */
-  void PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const;
-  
+  // Cached TBR configuration: handed to TileBasedRenderer when it is created
+  bool tbr_early_z_ = true;
+  size_t tbr_tile_size_ = 64;
 };
 }  // namespace simple_renderer
 
diff --git a/src/include/renderers/deferred_renderer.hpp b/src/include/renderers/deferred_renderer.hpp
new file mode 100644
index 0000000..245f5f8
--- /dev/null
+++ b/src/include/renderers/deferred_renderer.hpp
@@ -0,0 +1,31 @@
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_
+
+#include "renderers/renderer_base.hpp"
+
+namespace simple_renderer {
+
+/**
+ * @brief Deferred renderer
+ *
+ * Organizes the work to mimic how OpenGL operates on a GPU, imitating the GPU pipeline.
+ * Compared with the two forward renderers, this costs more memory and renders more slowly.
+ *
+ * Characteristics:
+ * - AoS vertex path;
+ * - first collects all fragments per pixel and selects the nearest depth;
+ * - then runs fragment shading on the selected fragments (a teaching implementation
+ *   modelled on the classic GPU pipeline).
+ */
+class DeferredRenderer final : public RendererBase {
+ public:
+  using RendererBase::RendererBase;
+  /**
+   * @copydoc RendererBase::Render
+   */
+  bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_DEFERRED_RENDERER_HPP_
diff --git a/src/include/renderers/per_triangle_renderer.hpp b/src/include/renderers/per_triangle_renderer.hpp
new file mode 100644
index 0000000..e2cee62
--- /dev/null
+++ b/src/include/renderers/per_triangle_renderer.hpp
@@ -0,0 +1,28 @@
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_
+
+#include "renderers/renderer_base.hpp"
+
+namespace simple_renderer {
+
+/**
+ * @brief Per-triangle renderer (triangle-major)
+ *
+ * Characteristics:
+ * - AoS vertex path;
+ * - per-thread local framebuffers (depth/color) that are merged;
+ * - back-face culling is done in screen space;
+ * - close to a "traditional" stack-style forward-rendering teaching implementation.
+ */
+class PerTriangleRenderer final : public RendererBase {
+ public:
+  using RendererBase::RendererBase;
+  /**
+   * @copydoc RendererBase::Render
+   */
+  bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_PER_TRIANGLE_RENDERER_HPP_
diff --git a/src/include/renderers/renderer_base.hpp b/src/include/renderers/renderer_base.hpp
new file mode 100644
index 0000000..ad09ac7
--- /dev/null
+++ b/src/include/renderers/renderer_base.hpp
@@ -0,0 +1,66 @@
+// Renderer base and options
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_
+
+#include <cstdint>
+#include <memory>
+
+#include "rasterizer.hpp"
+#include "vertex.hpp"
+#include "model.hpp"
+#include "shader.hpp"
+
+namespace simple_renderer {
+
+
+/**
+ * @brief Abstract base class for renderers
+ *
+ * Conventions:
+ * - Render performs the complete rendering pass (vertex transform + rasterization + shading + writing the output buffer).
+ * - Subclasses choose a different "unit of organization" (the unit of parallel work): per triangle, per tile, or a deferred pipeline.
+ * - The shared perspective division and viewport transform are provided here for subclasses to reuse as needed.
+ */
+class RendererBase {
+ public:
+  RendererBase(size_t width, size_t height)
+      : width_(width), height_(height), rasterizer_(std::make_shared<Rasterizer>(width, height)) {}
+  virtual ~RendererBase() = default;
+
+  RendererBase(const RendererBase&) = delete;
+  RendererBase& operator=(const RendererBase&) = delete;
+
+  /**
+   * @brief Perform one render
+   * @param model Model data
+   * @param shader Shader (carries uniforms such as material/lighting/matrices)
+   * @param out_color Output color buffer (size width*height)
+   * @return Whether rendering succeeded
+   */
+  virtual bool Render(const Model& model, const Shader& shader, uint32_t* out_color) = 0;
+
+ protected:
+  /**
+   * @brief Perspective division: clip space -> NDC
+   * @param vertex Clip-space vertex
+   * @return NDC vertex (1/w is kept for perspective correction)
+   */
+  Vertex PerspectiveDivision(const Vertex& vertex);
+  /**
+   * @brief Viewport transform: NDC -> screen coordinates
+   * @param vertex NDC vertex
+   * @return Screen-space vertex
+   */
+  Vertex ViewportTransformation(const Vertex& vertex);
+
+ protected:
+  size_t width_;
+  size_t height_;
+  std::shared_ptr<Rasterizer> rasterizer_;
+
+  static constexpr float kMinWValue = 1e-6f;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_RENDERER_BASE_HPP_
diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp
new file mode 100644
index 0000000..e3ecb89
--- /dev/null
+++ b/src/include/renderers/tile_based_renderer.hpp
@@ -0,0 +1,110 @@
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_
+
+#include "renderers/renderer_base.hpp"
+
+namespace simple_renderer {
+
+/**
+ * @brief Lightweight reference to a triangle within a tile (SoA indices + material pointer)
+ */
+struct TileTriangleRef {
+  size_t i0, i1, i2;
+  const Material* material = nullptr;
+  size_t face_index = 0;
+};
+
+/**
+ * @brief Tile-based renderer (tile-major)
+ *
+ * Characteristics:
+ * - SoA vertex layout;
+ * - triangles are binned per tile, with local Early-Z inside each tile;
+ * - a single global framebuffer; each tile's covered region is copied straight back into it;
+ * - behavior is controlled by the early_z and tile_size constructor arguments.
+ */
+class TileBasedRenderer final : public RendererBase {
+ public:
+  /**
+   * @brief Constructor
+   * @param width Canvas width
+   * @param height Canvas height
+   * @param early_z Whether to enable Early-Z (enabled by default)
+   * @param tile_size Tile size in pixels (default 64)
+   */
+  TileBasedRenderer(size_t width, size_t height, bool early_z = true, size_t tile_size = 64)
+      : RendererBase(width, height), early_z_(early_z), tile_size_(tile_size) {}
+  /**
+   * @copydoc RendererBase::Render
+   */
+  bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override;
+
+ private:
+  /**
+   * @brief Map triangles onto the tile grid by their screen-space bounding boxes
+   * @param model Model (provides faces/materials)
+   * @param soa Transformed vertex data in SoA form
+   * @param tile_triangles Output: list of triangle references for each tile
+   * @param tiles_x Number of tiles horizontally
+   * @param tiles_y Number of tiles vertically
+   * @param tile_size Tile size in pixels
+   */
+  void TriangleTileBinning(const Model &model,
+                           const VertexSoA &soa,
+                           std::vector<std::vector<TileTriangleRef>> &tile_triangles,
+                           size_t tiles_x, size_t tiles_y, size_t tile_size);
+
+  /**
+   * @brief Tile-binning logic for a single triangle
+   * @param tri_idx Triangle index
+   * @param count_only Whether to only count (true = counting mode, false = fill mode)
+   * @param model Model data
+   * @param soa Transformed vertex data in SoA form
+   * @param tiles_x Number of tiles horizontally
+   * @param tiles_y Number of tiles vertically
+   * @param tile_size Tile size in pixels
+   * @param tile_counts Reference to the per-tile count array (used in counting mode)
+   * @param tile_triangles Per-tile triangle reference lists (used in fill mode)
+   */
+  void ProcessTriangleForTileBinning(
+      size_t tri_idx, bool count_only,
+      const Model& model, const VertexSoA& soa,
+      size_t tiles_x, size_t tiles_y, size_t tile_size,
+      std::vector<size_t>& tile_counts,
+      std::vector<std::vector<TileTriangleRef>>& tile_triangles);
+
+  /**
+   * @brief Rasterize a single tile and write the result back to the global framebuffer
+   * @param tile_id Tile index
+   * @param triangles Triangle references covering this tile
+   * @param tiles_x Number of tiles horizontally
+   * @param tiles_y Number of tiles vertically
+   * @param tile_size Tile size in pixels
+   * @param tile_depth_buffer Tile-local depth buffer (provided/reused by the caller)
+   * @param tile_color_buffer Tile-local color buffer (provided/reused by the caller)
+   * @param global_depth_buffer Global depth buffer (single copy)
+   * @param global_color_buffer Global color buffer (single copy)
+   * @param soa Transformed vertex data in SoA form
+   * @param shader Shader
+   * @param use_early_z Whether to enable Early-Z
+   * @param scratch_fragments Reusable temporary fragment container
+   */
+  void RasterizeTile(size_t tile_id,
+                      const std::vector<TileTriangleRef> &triangles,
+                      size_t tiles_x, size_t tiles_y, size_t tile_size,
+                      float* tile_depth_buffer, uint32_t* tile_color_buffer,
+                      std::unique_ptr<float[]> &global_depth_buffer,
+                      std::unique_ptr<uint32_t[]> &global_color_buffer,
+                      const VertexSoA &soa,
+                      const Shader& shader,
+                      bool use_early_z,
+                      std::vector<Fragment>* scratch_fragments);
+
+ private:
+  const bool early_z_;
+  const size_t tile_size_;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_RENDERER_HPP_
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index a30101b..84cbc83 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -81,6 +81,62 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
   return fragments;
 }
 
+void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
+                             int x0, int y0, int x1, int y1,
+                             std::vector<Fragment>& out) {
+  // Compute the triangle's bounding box (screen space)
+  const Vector4f p0 = v0.GetPosition();
+  const Vector4f p1 = v1.GetPosition();
+  const Vector4f p2 = v2.GetPosition();
+
+  Vector2f a(p0.x, p0.y);
+  Vector2f b(p1.x, p1.y);
+  Vector2f c(p2.x, p2.y);
+
+  Vector2f bboxMin = Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
+  Vector2f bboxMax = Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
+
+  // Clamp to the screen dimensions
+  float minx = std::max(0.0f, bboxMin.x);
+  float miny = std::max(0.0f, bboxMin.y);
+  float maxx = std::min(float(width_ - 1), bboxMax.x);
+  float maxy = std::min(float(height_ - 1), bboxMax.y);
+
+  // Intersect with the caller-supplied clip region (half-open) -> closed scan range
+  int sx = std::max(x0, static_cast<int>(std::floor(minx)));
+  int sy = std::max(y0, static_cast<int>(std::floor(miny)));
+  int ex = std::min(x1 - 1, static_cast<int>(std::floor(maxx)));
+  int ey = std::min(y1 - 1, static_cast<int>(std::floor(maxy)));
+  if (sx > ex || sy > ey) return;
+
+  for (int x = sx; x <= ex; ++x) {
+    for (int y = sy; y <= ey; ++y) {
+      auto [is_inside, bary] = GetBarycentricCoord(
+          Vector3f(p0.x, p0.y, p0.z), Vector3f(p1.x, p1.y, p1.z), Vector3f(p2.x, p2.y, p2.z),
+          Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
+      if (!is_inside) continue;
+
+      // Perspective-correct interpolation
+      auto perspective_result = PerformPerspectiveCorrection(
+          p0.w, p1.w, p2.w,
+          p0.z, p1.z, p2.z,
+          bary);
+
+      const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
+      float z = perspective_result.interpolated_z;
+
+      Fragment frag; // the material pointer is filled in by the caller
+      frag.screen_coord = {x, y};
+      frag.normal = Interpolate(v0.GetNormal(), v1.GetNormal(), v2.GetNormal(), corrected_bary);
+      frag.uv     = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(), v2.GetTexCoords(), corrected_bary);
+      frag.color  = InterpolateColor(v0.GetColor(), v1.GetColor(), v2.GetColor(), corrected_bary);
+      frag.depth  = z;
+
+      out.push_back(frag);
+    }
+  }
+}
+
 void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
                              int x0, int y0, int x1, int y1,
                              std::vector<Fragment>& out) {
diff --git a/src/renderer.cpp b/src/renderer.cpp
old mode 100755
new mode 100644
index f6e5984..4319066
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -1,824 +1,83 @@
-
-/**
- * @file simple_renderer.cpp
- * @brief SimpleRenderer 实现
- * @author Zone.N (Zone.Niuzh@hotmail.com)
- * @version 1.0
- * @date 2023-10-23
- * @copyright MIT LICENSE
- * https://github.com/Simple-XX/SimpleRenderer
- * @par change log:
- * <table>
- * <tr><th>Date<th>Author<th>Description
- * <tr><td>2023-10-23<td>Zone.N<td>创建文件
- * </table>
- */
-
 #include "renderer.h"
 
-#include <omp.h>
-
-#include <array>
-#include <chrono>
-#include <cstdint>
-#include <limits>
-#include <span>
-#include <string_view>
-#include <vector>
+#include <string>
 
 #include "config.h"
-#include "light.h"
-#include "log_system.h"
-#include "model.hpp"
+#include "renderers/per_triangle_renderer.hpp"
+#include "renderers/tile_based_renderer.hpp"
+#include "renderers/deferred_renderer.hpp"
 
 namespace simple_renderer {
 
-// RenderingMode到字符串转换函数
 std::string RenderingModeToString(RenderingMode mode) {
   switch(mode) {
-    case RenderingMode::TRADITIONAL:
-      return "TRADITIONAL";
-    case RenderingMode::TILE_BASED:
-      return "TILE_BASED";
-    case RenderingMode::DEFERRED:
-      return "DEFERRED";
+    case RenderingMode::PER_TRIANGLE: return "PER_TRIANGLE";
+    case RenderingMode::TILE_BASED:  return "TILE_BASED";
+    case RenderingMode::DEFERRED:    return "DEFERRED";
   }
+  return "PER_TRIANGLE";
 }
+
 SimpleRenderer::SimpleRenderer(size_t width, size_t height)
     : height_(height),
       width_(width),
       log_system_(LogSystem(kLogFilePath, kLogFileMaxSize, kLogFileMaxCount)),
-      current_mode_(RenderingMode::TILE_BASED),
-      is_early_z_enabled_(true) {
-  rasterizer_ = std::make_shared<Rasterizer>(width, height);
+      current_mode_(RenderingMode::TILE_BASED) {
+  tbr_early_z_ = true;
+  tbr_tile_size_ = 64;
+  EnsureRenderer();
 }
 
-bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader,
-                               uint32_t *buffer) {
+bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, uint32_t *buffer) {
+  EnsureRenderer(); // 确保渲染器实例存在
   SPDLOG_INFO("draw model: {}", model.GetModelPath());
-  shader_ = std::make_shared<Shader>(shader);
-  ExecuteDrawPipeline(model, buffer);
-  return true;
+  return renderer_->Render(model, shader, buffer);
 }
 
 void SimpleRenderer::SetRenderingMode(RenderingMode mode) {
   current_mode_ = mode;
   SPDLOG_INFO("rendering mode set to: {}", RenderingModeToString(mode));
+  renderer_.reset();
+  EnsureRenderer();
 }
 
-RenderingMode SimpleRenderer::GetRenderingMode() const {
-  return current_mode_;
-}
-
-/*
-Optimizes performance by performing depth testing during rasterization, keeping
-only the closest fragment per pixel, and avoiding storing all
-fragments—resulting in faster rendering.
+RenderingMode SimpleRenderer::GetRenderingMode() const { return current_mode_; }
 
-通过在光栅化过程中执行深度测试，仅保留每个像素的深度值最近的片段，避免存储所有片段，从而优化性能，实现更快的渲染。
-*/
-void SimpleRenderer::ExecuteDrawPipeline(const Model &model, uint32_t *buffer) {
-  SPDLOG_INFO("execute draw pipeline for {} using {} mode", 
-              model.GetModelPath(), RenderingModeToString(current_mode_));
-  
-  if (!shader_) {
-    SPDLOG_ERROR("No shader set for DrawModel, cannot render");
-    return;
+void SimpleRenderer::SetEarlyZEnabled(bool enabled) {
+  tbr_early_z_ = enabled;
+  if (current_mode_ == RenderingMode::TILE_BASED) {
+    renderer_.reset();
+    EnsureRenderer();
   }
-  
-  /* * * Vertex Transformation * * */
-  auto vertex_shader_start_time = std::chrono::high_resolution_clock::now();
-  const auto &input_vertices = model.GetVertices();
-  std::vector<Vertex> processedVertices;  // 非 TBR
-  VertexSoA processedSoA;                 // TBR 专用
+}
 
+void SimpleRenderer::SetTileSize(size_t tile_size) {
+  tbr_tile_size_ = tile_size;
   if (current_mode_ == RenderingMode::TILE_BASED) {
-    processedSoA.resize(input_vertices.size());
-    // schedule(static)使并行过程保持连续分块，避免 false sharing
-#pragma omp parallel for num_threads(kNProc) schedule(static) \
-    shared(shader_, processedSoA, input_vertices)
-    for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理
-      const auto &v = input_vertices[i];
-      // 顶点着色器：世界坐标 -> 裁剪坐标
-      auto clipSpaceVertex = shader_->VertexShader(v);
-      // 保存裁剪空间坐标用于后续视锥体裁剪
-      processedSoA.pos_clip[i] = clipSpaceVertex.GetPosition();
-      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
-      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
-
-      // 填充为SoA数据结构，用于优化缓存局部性
-      processedSoA.pos_screen[i] = screenSpaceVertex.GetPosition();
-      processedSoA.normal[i]     = screenSpaceVertex.GetNormal();
-      processedSoA.uv[i]         = screenSpaceVertex.GetTexCoords();
-      processedSoA.color[i]      = screenSpaceVertex.GetColor();
-    }
-  } else { // Tradition或Deffer管线
-    processedVertices.resize(input_vertices.size()); // 根据顶点总数量进行预分配
-    // 并行过程保持连续分块，避免false sharing
-#pragma omp parallel for num_threads(kNProc) schedule(static) \
-    shared(shader_, processedVertices, input_vertices)
-    for (size_t i = 0; i < input_vertices.size(); ++i) { // 按索引并行处理
-      const auto &v = input_vertices[i];
-      auto clipSpaceVertex = shader_->VertexShader(v);
-      auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
-      auto screenSpaceVertex = ViewportTransformation(ndcVertex);
-      processedVertices[i] = screenSpaceVertex;
-    }
+    renderer_.reset();
+    EnsureRenderer();
   }
-  auto vertex_shader_end_time = std::chrono::high_resolution_clock::now();
-  auto vertex_shader_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-      vertex_shader_end_time - vertex_shader_start_time);
+}
 
-  // 根据当前设置的模式选择不同的渲染管线
-  double vertex_ms = vertex_shader_duration.count() / 1000.0;
-  
-  switch (current_mode_) {
-    case RenderingMode::TRADITIONAL: {
-      auto stats = ExecuteTraditionalPipeline(model, processedVertices, buffer);
-      PrintTraditionalStats(vertex_ms, stats);
+void SimpleRenderer::EnsureRenderer() {
+  if (renderer_) return;
+  switch (current_mode_) { // 延迟初始化，根据模式创建相应实例
+    case RenderingMode::PER_TRIANGLE: {
+      auto r = std::make_unique<PerTriangleRenderer>(width_, height_);
+      renderer_ = std::move(r);
       break;
     }
-    
     case RenderingMode::TILE_BASED: {
-      auto stats = ExecuteTileBasedPipeline(model, processedSoA, buffer);
-      PrintTileBasedStats(vertex_ms, stats);
+      auto r = std::make_unique<TileBasedRenderer>(width_, height_, tbr_early_z_, tbr_tile_size_);
+      renderer_ = std::move(r);
       break;
     }
-    
     case RenderingMode::DEFERRED: {
-      auto stats = ExecuteDeferredPipeline(model, processedVertices, buffer);
-      PrintDeferredStats(vertex_ms, stats);
+      auto r = std::make_unique<DeferredRenderer>(width_, height_);
+      renderer_ = std::move(r);
       break;
     }
   }
 }
 
-
-/*
-Organizes processing to simulate how OpenGL works with GPUs by collecting all
-fragments per pixel before processing, closely mimicking the GPU pipeline but
-leading to increased memory usage and slower performance.
-
-组织处理方式模拟 OpenGL 在 GPU
-上的工作原理，先收集每个像素的所有片段再并行处理屏幕上的每个像素，模仿 GPU
-管线，但导致内存使用增加和渲染速度变慢。
-
-现在作为延迟渲染管线的一部分，用于教学演示经典GPU管线概念。
-*/
-SimpleRenderer::DeferredRenderStats SimpleRenderer::ExecuteDeferredPipeline(
-    const Model &model,
-    const std::vector<Vertex> &processedVertices,
-    uint32_t *buffer) {
-    
-  DeferredRenderStats stats;
-  auto total_start_time = std::chrono::high_resolution_clock::now();
-  SPDLOG_INFO("execute deferred pipeline for {}", model.GetModelPath());
-  /*  *  *  *  *  *  *  */
-
-  /* * * Buffer Allocation * * */
-  auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
-  std::vector<std::vector<std::vector<Fragment>>> fragmentsBuffer_all_thread(
-      kNProc, std::vector<std::vector<Fragment>>(width_ * height_));
-
-  // 预先缓存所有Material数据，避免指针悬垂问题
-  std::vector<Material> material_cache;
-  material_cache.reserve(model.GetFaces().size());
-  for (const auto &f : model.GetFaces()) {
-    material_cache.emplace_back(f.GetMaterial()); // 值拷贝
-  }
-  auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
-  auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-      buffer_alloc_end_time - buffer_alloc_start_time);
-  SPDLOG_INFO("cached {} materials for deferred rendering", material_cache.size());
-
-  /* * * Rasterization * * */
-  auto rasterization_start_time = std::chrono::high_resolution_clock::now();
-#pragma omp parallel num_threads(kNProc) default(none)                       \
-  shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \
-               height_, material_cache, model)
-  {
-    int thread_id = omp_get_thread_num();
-    auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id];
-
-#pragma omp for
-    for (size_t face_idx = 0; face_idx < model.GetFaces().size(); ++face_idx) {
-      const auto &f = model.GetFaces()[face_idx];
-      auto v0 = processedVertices[f.GetIndex(0)];
-      auto v1 = processedVertices[f.GetIndex(1)];
-      auto v2 = processedVertices[f.GetIndex(2)];
-
-      const Material *material = &material_cache[face_idx]; // 使用缓存的Material
-
-      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
-
-      for (auto &fragment : fragments) {
-        fragment.material = material;
-
-        size_t x = fragment.screen_coord[0];
-        size_t y = fragment.screen_coord[1];
-
-        if (x >= width_ || y >= height_) {
-          continue;
-        }
-
-        size_t index = x + y * width_;
-        fragmentsBuffer_per_thread[index].push_back(fragment);
-      }
-    }
-  }
-  auto rasterization_end_time = std::chrono::high_resolution_clock::now();
-  auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-      rasterization_end_time - rasterization_start_time);
-  /*  *  *  *  *  *  *  */
-
-  /* * * Fragment Collection * * */
-  auto fragment_collection_start_time = std::chrono::high_resolution_clock::now();
-  std::vector<std::vector<Fragment>> fragmentsBuffer(width_ * height_);
-  for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) {
-    for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) {
-      fragmentsBuffer[i].insert(fragmentsBuffer[i].end(),
-                                fragmentsBuffer_per_thread[i].begin(),
-                                fragmentsBuffer_per_thread[i].end());
-    }
-  }
-  auto fragment_collection_end_time = std::chrono::high_resolution_clock::now();
-  auto fragment_collection_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-      fragment_collection_end_time - fragment_collection_start_time);
-  /*  *  *  *  *  *  *  */
-
-  /* * * Fragment Merge & Deferred Shading * * */
-  auto fragment_merge_start_time = std::chrono::high_resolution_clock::now();
-  
-  // Fragment Merge阶段：深度测试选择最近片段
-  std::vector<const Fragment*> selected_fragments(width_ * height_, nullptr);
-  #pragma omp parallel for
-  for (size_t i = 0; i < fragmentsBuffer.size(); i++) {
-    const auto &fragments = fragmentsBuffer[i];
-    if (fragments.empty()) {
-      continue;
-    }
-
-    const Fragment *renderFragment = nullptr;
-    for (const auto &fragment : fragments) {
-      if (!renderFragment || fragment.depth < renderFragment->depth) {
-        renderFragment = &fragment;
-      }
-    }
-    selected_fragments[i] = renderFragment;
-  }
-  auto fragment_merge_end_time = std::chrono::high_resolution_clock::now();
-  auto fragment_merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-      fragment_merge_end_time - fragment_merge_start_time);
-  
-  // Deferred Shading阶段：执行片段着色器
-  auto deferred_shading_start_time = std::chrono::high_resolution_clock::now();
-#pragma omp parallel for
-  for (size_t i = 0; i < selected_fragments.size(); i++) {
-    const Fragment *renderFragment = selected_fragments[i];
-    if (renderFragment) {
-      // 添加Material指针有效性检查
-      if (renderFragment->material == nullptr) {
-        SPDLOG_ERROR("Fragment material is nullptr at pixel {}", i);
-        continue;
-      }
-      auto color = shader_->FragmentShader(*renderFragment);
-      buffer[i] = uint32_t(color);
-    }
-  }
-  auto deferred_shading_end_time = std::chrono::high_resolution_clock::now();
-  auto deferred_shading_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-      deferred_shading_end_time - deferred_shading_start_time);
-  /*  *  *  *  *  *  *  */
-  
-  auto total_end_time = std::chrono::high_resolution_clock::now();
-  auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-      total_end_time - total_start_time);
-  
-  // 填充统计信息
-  stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
-  stats.rasterization_ms = rasterization_duration.count() / 1000.0;
-  stats.fragment_collection_ms = fragment_collection_duration.count() / 1000.0;
-  stats.fragment_merge_ms = fragment_merge_duration.count() / 1000.0;
-  stats.deferred_shading_ms = deferred_shading_duration.count() / 1000.0;
-  stats.total_ms = total_duration.count() / 1000.0;
-  
-  return stats;
-}
-
-Vertex SimpleRenderer::PerspectiveDivision(const Vertex &vertex) {
-  Vector4f position = vertex.GetPosition();
-  
-  // 检查w分量，避免除零和负数问题
-  if (position.w <= kMinWValue) {
-    Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f);
-    return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
-  }
-  
-  // 保存原始w分量用于透视矫正插值
-  float original_w = position.w;
-  
-  // 执行透视除法：(x, y, z, w) -> (x/w, y/w, z/w, 1/w)
-  Vector4f ndcPosition(
-    position.x / position.w,  // x_ndc = x_clip / w_clip
-    position.y / position.w,  // y_ndc = y_clip / w_clip  
-    position.z / position.w,  // z_ndc = z_clip / w_clip
-    1.0f / original_w         // 保存1/w用于透视矫正插值
-  );
-  
-  // 只对Z坐标进行深度范围限制，X和Y允许超出以支持屏幕外三角形
-  // 这些坐标在后续的视口变换和裁剪阶段会被正确处理
-  ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f);
-  
-  // 创建新的顶点，保持其他属性和裁剪空间坐标不变
-  return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition());
-}
-
-Vertex SimpleRenderer::ViewportTransformation(const Vertex &vertex) {
-  Vector4f ndcPosition = vertex.GetPosition();
-  
-  // 视口变换：将NDC坐标[-1,1]转换为屏幕坐标[0,width]x[0,height]
-  float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f;
-  float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f;
-  
-  Vector4f screenPosition(
-    screen_x,                    // x: 屏幕坐标
-    screen_y,                    // y: 屏幕坐标
-    ndcPosition.z,               // z: NDC坐标用于深度测试
-    ndcPosition.w                // w: 保持1/w用于透视矫正插值
-  );
-  
-  return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
-}
-
-
-
-// SoA优化的Binning：两遍计数 + 预留 + 填充 TriangleRef
-void SimpleRenderer::TriangleTileBinning(
-    const Model &model,
-    const VertexSoA &soa,
-    std::vector<std::vector<TriangleRef>> &tile_triangles,
-    size_t tiles_x, size_t tiles_y, size_t tile_size) {
-  const size_t total_triangles = model.GetFaces().size();
-
-  SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles", total_triangles);
-  SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}",
-              width_, height_, tile_size, tiles_x, tiles_y);
-
-  std::vector<size_t> tile_counts(tiles_x * tiles_y, 0);
-
-  auto process_triangle = [&](size_t tri_idx, bool count_only) {
-    const auto &f = model.GetFaces()[tri_idx];
-    size_t i0 = f.GetIndex(0);
-    size_t i1 = f.GetIndex(1);
-    size_t i2 = f.GetIndex(2);
-
-    // 视锥体裁剪 (裁剪空间)
-    // 保守视锥体裁剪：只有当整个三角形都在视锥体外同一侧时才裁剪
-    const Vector4f &c0 = soa.pos_clip[i0];
-    const Vector4f &c1 = soa.pos_clip[i1];
-    const Vector4f &c2 = soa.pos_clip[i2];
-    bool frustum_cull =
-        (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||  // 右平面外
-        (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) || // 左平面外
-        (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||  // 上平面外
-        (c0.y < -c0.w && c1.y < -c0.w && c2.y < -c0.w) || // 下平面外
-        (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||  // 远平面外
-        (c0.z < -c0.w && c1.z < -c0.w && c2.z < -c0.w);  // 近平面外
-    if (frustum_cull) {
-      return;
-    }
-
-    const Vector4f &pos0 = soa.pos_screen[i0];
-    const Vector4f &pos1 = soa.pos_screen[i1];
-    const Vector4f &pos2 = soa.pos_screen[i2];
-
-    // 背面剔除（屏幕空间）
-    // NDC空间中叉积为负表示顺时针，即背面。
-    // 从NDC到屏幕空间中，会发生Y轴翻转，对应叉积应为正。
-
-    Vector2f screen0(pos0.x, pos0.y);
-    Vector2f screen1(pos1.x, pos1.y);
-    Vector2f screen2(pos2.x, pos2.y);
-    Vector2f edge1 = screen1 - screen0;
-    Vector2f edge2 = screen2 - screen0;
-    float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
-    if (cross_product > 0.0f) return;
-
-    float screen_x0 = pos0.x;
-    float screen_y0 = pos0.y;
-    float screen_x1 = pos1.x;
-    float screen_y1 = pos1.y;
-    float screen_x2 = pos2.x;
-    float screen_y2 = pos2.y;
-
-    // 计算屏幕bbox，用于后续tile划分
-    float min_x = std::min({screen_x0, screen_x1, screen_x2});
-    float max_x = std::max({screen_x0, screen_x1, screen_x2});
-    float min_y = std::min({screen_y0, screen_y1, screen_y2});
-    float max_y = std::max({screen_y0, screen_y1, screen_y2});
-
-    int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
-    int end_tile_x   = std::min(static_cast<int>(tiles_x - 1), static_cast<int>(max_x) / static_cast<int>(tile_size));
-    int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
-    int end_tile_y   = std::min(static_cast<int>(tiles_y - 1), static_cast<int>(max_y) / static_cast<int>(tile_size));
-    if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) return; // 如果bbox不在任何tile内，直接返回
-
-    if (count_only) { // 第一遍计数，只统计tile内三角形数量
-      for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
-        for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
-          size_t tile_id = ty * tiles_x + tx;
-          tile_counts[tile_id]++;
-        }
-      }
-    } else { // 第二遍填充，填充TriangleRef
-      TriangleRef tri_ref{ i0, i1, i2, &f.GetMaterial(), tri_idx };
-      for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
-        for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
-          size_t tile_id = ty * tiles_x + tx;
-          tile_triangles[tile_id].push_back(tri_ref);
-        }
-      }
-    }
-  };
-
-  // 第一遍（count only）：计算每个tile需要容纳多少三角形
-  for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
-    process_triangle(tri_idx, true);
-  }
-
-  // 预分配，避免动态扩容
-  for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) {
-    if (tile_counts[tile_id] > 0) tile_triangles[tile_id].reserve(tile_counts[tile_id]);
-  }
-
-  // 第二遍（fill）：按范围填充TriangleRef
-  for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
-    process_triangle(tri_idx, false);
-  }
-
-  size_t total_triangle_refs = 0;
-  size_t non_empty_tiles = 0;
-  for (const auto& tile : tile_triangles) {
-    total_triangle_refs += tile.size();
-    if (!tile.empty()) non_empty_tiles++;
-  }
-  SPDLOG_INFO("  (SoA) Total triangle references: {}", total_triangle_refs);
-  SPDLOG_INFO("  (SoA) Non-empty tiles: {}", non_empty_tiles);
-  SPDLOG_INFO("  (SoA) Average triangles per tile: {:.2f}",
-              total_triangle_refs > 0 ? float(total_triangle_refs) / tile_triangles.size() : 0.0f);
-}
-
-// SoA 版：单个 tile 光栅化
-void SimpleRenderer::RasterizeTile(
-    size_t tile_id,
-    const std::vector<TriangleRef> &triangles,
-    size_t tiles_x, size_t tiles_y, size_t tile_size,
-    float* tile_depth_buffer, uint32_t* tile_color_buffer,
-    std::unique_ptr<float[]> &global_depth_buffer,
-    std::unique_ptr<uint32_t[]> &global_color_buffer,
-    const VertexSoA &soa,
-    bool use_early_z,
-    std::vector<Fragment>* scratch_fragments) {
-  (void)tiles_y;
-  // 计算 tile 屏幕范围
-  size_t tile_x = tile_id % tiles_x;
-  size_t tile_y = tile_id / tiles_x;
-  size_t screen_x_start = tile_x * tile_size;
-  size_t screen_y_start = tile_y * tile_size;
-  size_t screen_x_end = std::min(screen_x_start + tile_size, width_);
-  size_t screen_y_end = std::min(screen_y_start + tile_size, height_);
-
-  // 初始化 tile 局部缓冲
-  size_t tile_width = screen_x_end - screen_x_start;
-  size_t tile_height = screen_y_end - screen_y_start;
-  std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f);
-  std::fill_n(tile_color_buffer, tile_width * tile_height, 0); // 默认背景色为0/黑色
-
-  for (const auto &tri : triangles) { // 用来应对scratch传入nullptr的情况
-    // 始终走 SoA + 限制矩形的光栅化路径；如未提供 scratch，则使用函数内局部容器
-    std::vector<Fragment> local_out;
-    std::vector<Fragment> &out = scratch_fragments ? *scratch_fragments : local_out;
-
-    out.clear();
-    if (out.capacity() < tile_width * tile_height) {
-      out.reserve(tile_width * tile_height);
-    }
-
-    rasterizer_->RasterizeTo(soa, tri.i0, tri.i1, tri.i2,
-                             static_cast<int>(screen_x_start), static_cast<int>(screen_y_start),
-                             static_cast<int>(screen_x_end),   static_cast<int>(screen_y_end),
-                             out);
-
-    for (auto &fragment : out) {
-      fragment.material = tri.material;
-      size_t sx = fragment.screen_coord[0];
-      size_t sy = fragment.screen_coord[1];
-      if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start && sy < screen_y_end) {
-        size_t local_x = sx - screen_x_start;
-        size_t local_y = sy - screen_y_start;
-        size_t idx = local_x + local_y * tile_width;
-        if (use_early_z) {
-          if (fragment.depth < tile_depth_buffer[idx]) {
-            auto color = shader_->FragmentShader(fragment);
-            tile_depth_buffer[idx] = fragment.depth;
-            tile_color_buffer[idx] = uint32_t(color);
-          }
-        } else {
-          auto color = shader_->FragmentShader(fragment);
-          if (fragment.depth < tile_depth_buffer[idx]) {
-            tile_depth_buffer[idx] = fragment.depth;
-            tile_color_buffer[idx] = uint32_t(color);
-          }
-        }
-      }
-    }
-  }
-
-  // 写回全局缓冲
-  // TBR 下不同 tile 覆盖的屏幕区域互不重叠，且在 tile 内部已通过 Early‑Z
-  // 得出每个像素的最终值。因此可以直接将 tile 行数据拷贝到全局缓冲
-  for (size_t y = 0; y < tile_height; y++) {
-    const size_t tile_row_off   = y * tile_width;
-    const size_t global_row_off = (screen_y_start + y) * width_ + screen_x_start;
-
-    // 拷贝本行 color 到全局 color
-    std::memcpy(global_color_buffer.get() + global_row_off,
-                tile_color_buffer + tile_row_off,
-                tile_width * sizeof(uint32_t));
-
-    // 拷贝本行 depth 到全局 depth
-    std::memcpy(global_depth_buffer.get() + global_row_off,
-                tile_depth_buffer + tile_row_off,
-                tile_width * sizeof(float));
-  }
-}
-
-// 基础光栅化管线实现
-SimpleRenderer::RenderStats SimpleRenderer::ExecuteTraditionalPipeline(
-    const Model &model, 
-    const std::vector<Vertex> &processedVertices,
-    uint32_t *buffer) {
-    
-    RenderStats stats;
-    auto total_start_time = std::chrono::high_resolution_clock::now();
-    
-    // 1. 为每个线程创建framebuffer
-    auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
-    std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
-    std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
-    
-    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-        depthBuffer_all_thread[thread_id] = 
-            std::make_unique<float[]>(width_ * height_);
-        colorBuffer_all_thread[thread_id] = 
-            std::make_unique<uint32_t[]>(width_ * height_);
-        
-        std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
-                    std::numeric_limits<float>::infinity());
-        std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
-    }
-    auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
-    auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        buffer_alloc_end_time - buffer_alloc_start_time);
-    
-    // 2. 并行光栅化
-    auto raster_start_time = std::chrono::high_resolution_clock::now();
-#pragma omp parallel num_threads(kNProc) default(none) \
-    shared(processedVertices, rasterizer_, shader_, width_, height_, \
-           depthBuffer_all_thread, colorBuffer_all_thread, model)
-    {
-        int thread_id = omp_get_thread_num();
-        auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
-        auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
-        
-#pragma omp for
-        for (const auto &f : model.GetFaces()) {
-            auto v0 = processedVertices[f.GetIndex(0)];
-            auto v1 = processedVertices[f.GetIndex(1)];
-            auto v2 = processedVertices[f.GetIndex(2)];
-
-            // 获取屏幕空间坐标
-            Vector2f screen0(v0.GetPosition().x, v0.GetPosition().y);
-            Vector2f screen1(v1.GetPosition().x, v1.GetPosition().y);  
-            Vector2f screen2(v2.GetPosition().x, v2.GetPosition().y);
-            
-            // 计算屏幕空间叉积判断朝向
-            Vector2f edge1 = screen1 - screen0;
-            Vector2f edge2 = screen2 - screen0;
-            float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
-            
-            // 背面剔除：NDC空间中叉积为负表示顺时针，即背面。
-            // 从NDC到屏幕空间中，会发生Y轴翻转，对应叉积应为正。
-            if (cross_product > 0.0f) {
-                continue;
-            }
-
-            const Material *material = &f.GetMaterial();
-            auto fragments = rasterizer_->Rasterize(v0, v1, v2);
-
-            for (auto &fragment : fragments) {
-                fragment.material = material;
-                size_t x = fragment.screen_coord[0];
-                size_t y = fragment.screen_coord[1];
-
-                if (x >= width_ || y >= height_) {
-                    continue;
-                }
-
-                size_t index = x + y * width_;
-                if (fragment.depth < depthBuffer_per_thread[index]) {
-                    depthBuffer_per_thread[index] = fragment.depth;
-                    auto color = shader_->FragmentShader(fragment);
-                    colorBuffer_per_thread[index] = uint32_t(color);
-                }
-            }
-        }
-    }
-    auto raster_end_time = std::chrono::high_resolution_clock::now();
-    auto raster_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        raster_end_time - raster_start_time);
-    
-    // 3. 合并结果
-    auto merge_start_time = std::chrono::high_resolution_clock::now();
-    std::unique_ptr<float[]> depthBuffer = 
-        std::make_unique<float[]>(width_ * height_);
-    std::unique_ptr<uint32_t[]> colorBuffer = 
-        std::make_unique<uint32_t[]>(width_ * height_);
-
-    std::fill_n(depthBuffer.get(), width_ * height_,
-                std::numeric_limits<float>::infinity());
-    std::fill_n(colorBuffer.get(), width_ * height_, 0);
-
-#pragma omp parallel for
-    for (size_t i = 0; i < width_ * height_; i++) {
-        float min_depth = std::numeric_limits<float>::infinity();
-        uint32_t color = 0;
-
-        for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
-            float depth = depthBuffer_all_thread[thread_id][i];
-            if (depth < min_depth) {
-                min_depth = depth;
-                color = colorBuffer_all_thread[thread_id][i];
-            }
-        }
-        depthBuffer[i] = min_depth;
-        colorBuffer[i] = color;
-    }
-
-    std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
-    auto merge_end_time = std::chrono::high_resolution_clock::now();
-    auto merge_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        merge_end_time - merge_start_time);
-    
-    auto total_end_time = std::chrono::high_resolution_clock::now();
-    auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        total_end_time - total_start_time);
-    
-    // 填充统计信息
-    stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
-    stats.rasterization_ms = raster_duration.count() / 1000.0;
-    stats.merge_ms = merge_duration.count() / 1000.0;
-    stats.total_ms = total_duration.count() / 1000.0;
-    
-    return stats;
-}
-
-
-// Tile-based光栅化管线实现（SoA 直连版本，避免 AoS->SoA 拷贝）
-SimpleRenderer::TileRenderStats SimpleRenderer::ExecuteTileBasedPipeline(
-    const Model &model,
-    const VertexSoA &soa,
-    uint32_t *buffer) {
-    TileRenderStats stats;
-    auto total_start_time = std::chrono::high_resolution_clock::now();
-
-    // 1. Setup阶段
-    auto setup_start_time = std::chrono::high_resolution_clock::now();
-    const size_t TILE_SIZE = kDefaultTileSize; // Default tile size per tile
-    const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE;
-    const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE;
-    const size_t total_tiles = tiles_x * tiles_y;
-
-    // 为每个tile创建三角形列表（SoA 引用）
-    std::vector<std::vector<TriangleRef>> tile_triangles(total_tiles);
-    auto setup_end_time = std::chrono::high_resolution_clock::now();
-    auto setup_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        setup_end_time - setup_start_time);
-
-    // 2. Triangle-Tile binning阶段（SoA）
-    auto binning_start_time = std::chrono::high_resolution_clock::now();
-    TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
-    auto binning_end_time = std::chrono::high_resolution_clock::now();
-    auto binning_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        binning_end_time - binning_start_time);
-
-    // 3. 全局 framebuffer（单份）
-    // 直接让每个 tile 写入这份全局缓冲区，避免末端 O(W*H*kNProc) 合并开销
-    auto buffer_alloc_start_time = std::chrono::high_resolution_clock::now();
-    std::unique_ptr<float[]> depthBuffer = std::make_unique<float[]>(width_ * height_);
-    std::unique_ptr<uint32_t[]> colorBuffer = std::make_unique<uint32_t[]>(width_ * height_);
-    // 深度初始化为最远值，颜色清零
-    std::fill_n(depthBuffer.get(), width_ * height_, std::numeric_limits<float>::infinity());
-    std::fill_n(colorBuffer.get(), width_ * height_, 0);
-    auto buffer_alloc_end_time = std::chrono::high_resolution_clock::now();
-    auto buffer_alloc_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        buffer_alloc_end_time - buffer_alloc_start_time);
-
-    // 4. 并行处理每个tile（SoA）
-    auto rasterization_start_time = std::chrono::high_resolution_clock::now();
-#pragma omp parallel num_threads(kNProc) default(none) \
-    shared(tile_triangles, rasterizer_, shader_, width_, height_, \
-           depthBuffer, colorBuffer, tiles_x, tiles_y, total_tiles, \
-           is_early_z_enabled_, soa)
-    {
-        int thread_id = omp_get_thread_num();
-
-        // 为当前线程创建 tile 局部缓冲区（避免在全局缓冲上直接逐像素竞争）
-        std::unique_ptr<float[]> tile_depth_buffer = 
-            std::make_unique<float[]>(TILE_SIZE * TILE_SIZE);
-        std::unique_ptr<uint32_t[]> tile_color_buffer = 
-            std::make_unique<uint32_t[]>(TILE_SIZE * TILE_SIZE);
-
-        // 线程本地片段 scratch 容器（复用），容量按单 tile 上限预估
-        std::vector<Fragment> scratch_fragments;
-        scratch_fragments.reserve(TILE_SIZE * TILE_SIZE);
-
-#pragma omp for
-        for (size_t tile_id = 0; tile_id < total_tiles; tile_id++) {
-            // 按照 tile 进行光栅化（SoA）
-            // 直接写入单份全局 framebuffer；不同 tile 不重叠，无需加锁
-            RasterizeTile(tile_id, tile_triangles[tile_id],
-                          tiles_x, tiles_y, TILE_SIZE,
-                          tile_depth_buffer.get(), tile_color_buffer.get(),
-                          depthBuffer, colorBuffer,
-                          soa, is_early_z_enabled_, &scratch_fragments);
-        }
-    }
-    auto rasterization_end_time = std::chrono::high_resolution_clock::now();
-    auto rasterization_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        rasterization_end_time - rasterization_start_time);
-
-    // 5. 直接将单份全局 colorBuffer 拷贝到输出
-    auto present_start_time = std::chrono::high_resolution_clock::now();
-    std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
-    auto present_end_time = std::chrono::high_resolution_clock::now();
-    auto present_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        present_end_time - present_start_time);
-
-    auto total_end_time = std::chrono::high_resolution_clock::now();
-    auto total_duration = std::chrono::duration_cast<std::chrono::microseconds>(
-        total_end_time - total_start_time);
-
-    // 填充统计信息
-    stats.setup_ms = setup_duration.count() / 1000.0;
-    stats.binning_ms = binning_duration.count() / 1000.0;
-    stats.buffer_alloc_ms = buffer_alloc_duration.count() / 1000.0;
-    stats.rasterization_ms = rasterization_duration.count() / 1000.0;
-    // 合并阶段已被消除，仅为拷贝开销
-    stats.merge_ms = present_duration.count() / 1000.0;
-    stats.total_ms = total_duration.count() / 1000.0;
-
-    return stats;
-}
-
-void SimpleRenderer::PrintTraditionalStats(double vertex_ms, const RenderStats& stats) const {
-  double total_ms = vertex_ms + stats.total_ms;
-  
-  SPDLOG_INFO("=== TRADITIONAL RENDERING PERFORMANCE ===");
-  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
-  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
-  SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
-  SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
-  SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
-  SPDLOG_INFO("==========================================");
-}
-
-void SimpleRenderer::PrintTileBasedStats(double vertex_ms, const TileRenderStats& stats) const {
-  double total_ms = vertex_ms + stats.total_ms;
-  
-  SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ===");
-  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
-  SPDLOG_INFO("Setup:            {:8.3f} ms ({:5.1f}%)", stats.setup_ms, stats.setup_ms/total_ms*100);
-  SPDLOG_INFO("Binning:          {:8.3f} ms ({:5.1f}%)", stats.binning_ms, stats.binning_ms/total_ms*100);
-  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
-  SPDLOG_INFO("Rasterization:    {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
-  SPDLOG_INFO("Merge:            {:8.3f} ms ({:5.1f}%)", stats.merge_ms, stats.merge_ms/total_ms*100);
-  SPDLOG_INFO("Total:            {:8.3f} ms", total_ms);
-  SPDLOG_INFO("==========================================");
-}
-
-void SimpleRenderer::PrintDeferredStats(double vertex_ms, const DeferredRenderStats& stats) const {
-  double total_ms = vertex_ms + stats.total_ms;
-  
-  SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ===");
-  SPDLOG_INFO("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/total_ms*100);
-  SPDLOG_INFO("Buffer Alloc:         {:8.3f} ms ({:5.1f}%)", stats.buffer_alloc_ms, stats.buffer_alloc_ms/total_ms*100);
-  SPDLOG_INFO("Rasterization:        {:8.3f} ms ({:5.1f}%)", stats.rasterization_ms, stats.rasterization_ms/total_ms*100);
-  SPDLOG_INFO("Fragment Collection:  {:8.3f} ms ({:5.1f}%)", stats.fragment_collection_ms, stats.fragment_collection_ms/total_ms*100);
-  SPDLOG_INFO("Fragment Merge:       {:8.3f} ms ({:5.1f}%)", stats.fragment_merge_ms, stats.fragment_merge_ms/total_ms*100);
-  SPDLOG_INFO("Deferred Shading:     {:8.3f} ms ({:5.1f}%)", stats.deferred_shading_ms, stats.deferred_shading_ms/total_ms*100);
-  SPDLOG_INFO("Total:                {:8.3f} ms", total_ms);
-  SPDLOG_INFO("=========================================");
-}
-
 }  // namespace simple_renderer
diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp
new file mode 100644
index 0000000..6931812
--- /dev/null
+++ b/src/renderers/deferred_renderer.cpp
@@ -0,0 +1,146 @@
+#include "renderers/deferred_renderer.hpp"
+
+#include <omp.h>
+#include <algorithm>
+#include <chrono>
+
+#include "config.h"
+#include "log_system.h"
+
+namespace simple_renderer {
+
+bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint32_t* buffer) {
+  auto total_start_time = std::chrono::high_resolution_clock::now();
+  auto shader = std::make_shared<Shader>(shader_in);
+
+  // Vertex transform (AoS): vertex shader -> perspective division -> viewport transform
+  auto vertex_start = std::chrono::high_resolution_clock::now();
+  const auto &input_vertices = model.GetVertices();
+  std::vector<Vertex> processedVertices(input_vertices.size());
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
+    shared(shader, processedVertices, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) {
+    const auto &v = input_vertices[i];
+    auto clipSpaceVertex = shader->VertexShader(v);
+    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+    processedVertices[i] = screenSpaceVertex;
+  }
+  auto vertex_end = std::chrono::high_resolution_clock::now();
+  auto vertex_ms = std::chrono::duration_cast<std::chrono::microseconds>(vertex_end - vertex_start).count() / 1000.0;
+
+  // Buffer allocation: per-thread, per-pixel fragment lists plus a by-value copy of every face's Material
+  auto buffer_alloc_start = std::chrono::high_resolution_clock::now();
+  std::vector<std::vector<std::vector<Fragment>>> fragmentsBuffer_all_thread(
+      kNProc, std::vector<std::vector<Fragment>>(width_ * height_));
+
+  std::vector<Material> material_cache;
+  material_cache.reserve(model.GetFaces().size());
+  for (const auto &f : model.GetFaces()) {
+    material_cache.emplace_back(f.GetMaterial());
+  }
+  auto buffer_alloc_end = std::chrono::high_resolution_clock::now();
+  auto buffer_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(buffer_alloc_end - buffer_alloc_start).count() / 1000.0;
+
+  // Rasterization: collect fragments per pixel per thread
+  auto raster_start = std::chrono::high_resolution_clock::now();
+#pragma omp parallel num_threads(kNProc) default(none)                       \
+  shared(processedVertices, fragmentsBuffer_all_thread, rasterizer_, width_, \
+               height_, material_cache, model)
+  {
+    int thread_id = omp_get_thread_num();
+    auto &fragmentsBuffer_per_thread = fragmentsBuffer_all_thread[thread_id];
+
+#pragma omp for
+    for (size_t face_idx = 0; face_idx < model.GetFaces().size(); ++face_idx) {
+      const auto &f = model.GetFaces()[face_idx];
+      auto v0 = processedVertices[f.GetIndex(0)];
+      auto v1 = processedVertices[f.GetIndex(1)];
+      auto v2 = processedVertices[f.GetIndex(2)];
+
+      const Material *material = &material_cache[face_idx]; // point at the cached Material copy for this face
+      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
+
+      for (auto &fragment : fragments) {
+        fragment.material = material;
+        size_t x = fragment.screen_coord[0];
+        size_t y = fragment.screen_coord[1];
+
+        if (x >= width_ || y >= height_) continue;
+        size_t index = x + y * width_;
+        fragmentsBuffer_per_thread[index].push_back(fragment);
+      }
+    }
+  }
+  auto raster_end = std::chrono::high_resolution_clock::now();
+  auto raster_ms = std::chrono::duration_cast<std::chrono::microseconds>(raster_end - raster_start).count() / 1000.0;
+
+  /* * * Fragment Collection * * */
+  auto collect_start = std::chrono::high_resolution_clock::now();
+  std::vector<std::vector<Fragment>> fragmentsBuffer(width_ * height_);
+  for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) {
+    for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) {
+      fragmentsBuffer[i].insert(fragmentsBuffer[i].end(),
+                                fragmentsBuffer_per_thread[i].begin(),
+                                fragmentsBuffer_per_thread[i].end());
+    }
+  }
+  auto collect_end = std::chrono::high_resolution_clock::now();
+  auto collect_ms = std::chrono::duration_cast<std::chrono::microseconds>(collect_end - collect_start).count() / 1000.0;
+
+  /* * * Fragment Merge & Deferred Shading * * */
+  auto merge_start = std::chrono::high_resolution_clock::now();
+
+  // Fragment Merge stage: depth test keeps, for each pixel, the fragment with the smallest depth
+  std::vector<const Fragment*> selected_fragments(width_ * height_, nullptr);
+#pragma omp parallel for
+  for (size_t i = 0; i < fragmentsBuffer.size(); i++) {
+    const auto &fragments = fragmentsBuffer[i];
+    if (fragments.empty()) continue;
+    const Fragment *renderFragment = nullptr;
+    for (const auto &fragment : fragments) {
+      if (!renderFragment || fragment.depth < renderFragment->depth) {
+        renderFragment = &fragment;
+      }
+    }
+    selected_fragments[i] = renderFragment;
+  }
+  auto merge_end = std::chrono::high_resolution_clock::now();
+  auto merge_ms = std::chrono::duration_cast<std::chrono::microseconds>(merge_end - merge_start).count() / 1000.0;
+
+  // Deferred Shading stage: run the fragment shader on each selected fragment
+  auto shade_start = std::chrono::high_resolution_clock::now();
+#pragma omp parallel for
+  for (size_t i = 0; i < selected_fragments.size(); i++) {
+    const Fragment *renderFragment = selected_fragments[i];
+    if (renderFragment) {
+      // Skip (and log) fragments whose Material pointer is null
+      if (renderFragment->material == nullptr) {
+        SPDLOG_ERROR("Fragment material is nullptr at pixel {}", i);
+        continue;
+      }
+      auto color = shader->FragmentShader(*renderFragment);
+      buffer[i] = uint32_t(color);
+    }
+  }
+  auto shade_end = std::chrono::high_resolution_clock::now();
+  auto shade_ms = std::chrono::duration_cast<std::chrono::microseconds>(shade_end - shade_start).count() / 1000.0;
+
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  double total_ms = std::chrono::duration_cast<std::chrono::microseconds>(total_end_time - total_start_time).count() / 1000.0;
+
+  SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ===");
+  double sum_ms = vertex_ms + (total_ms - vertex_ms);
+  SPDLOG_INFO("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/sum_ms*100);
+  SPDLOG_INFO("Buffer Alloc:         {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_INFO("Rasterization:        {:8.3f} ms", raster_ms);
+  SPDLOG_INFO("Fragment Collection:  {:8.3f} ms", collect_ms);
+  SPDLOG_INFO("Fragment Merge:       {:8.3f} ms", merge_ms);
+  SPDLOG_INFO("Deferred Shading:     {:8.3f} ms", shade_ms);
+  SPDLOG_INFO("Total:                {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + collect_ms + merge_ms + shade_ms));
+  SPDLOG_INFO("=========================================");
+
+  return true;
+}
+
+}  // namespace simple_renderer
diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp
new file mode 100644
index 0000000..9e3167c
--- /dev/null
+++ b/src/renderers/per_triangle_renderer.cpp
@@ -0,0 +1,172 @@
+#include "renderers/per_triangle_renderer.hpp"
+
+#include <omp.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstring>
+#include <limits>
+#include <memory>
+
+#include "config.h"
+#include "log_system.h"
+
+namespace simple_renderer {
+// Renders `model` into `buffer`: per-thread framebuffers are filled in parallel, then merged by nearest depth.
+bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in,
+                                 uint32_t *buffer) {
+  auto total_start_time = std::chrono::high_resolution_clock::now();
+
+  // Copy the shader so the worker threads share a single instance.
+  auto shader = std::make_shared<Shader>(shader_in);
+
+  // Vertex transform (AoS): vertex shader -> perspective divide -> viewport.
+  auto vertex_start = std::chrono::high_resolution_clock::now();
+  const auto &input_vertices = model.GetVertices();
+  std::vector<Vertex> processedVertices(input_vertices.size());
+
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
+    shared(shader, processedVertices, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) {
+    const auto &v = input_vertices[i];
+    auto clipSpaceVertex = shader->VertexShader(v);
+    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+    processedVertices[i] = screenSpaceVertex;
+  }
+  auto vertex_end = std::chrono::high_resolution_clock::now();
+  auto vertex_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       vertex_end - vertex_start)
+                       .count() /
+                   1000.0;
+
+  // 1. Allocate one depth/color framebuffer per thread.
+  auto buffer_alloc_start = std::chrono::high_resolution_clock::now();
+  std::vector<std::unique_ptr<float[]>> depthBuffer_all_thread(kNProc);
+  std::vector<std::unique_ptr<uint32_t[]>> colorBuffer_all_thread(kNProc);
+
+  for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+    depthBuffer_all_thread[thread_id] =
+        std::make_unique<float[]>(width_ * height_);
+    colorBuffer_all_thread[thread_id] =
+        std::make_unique<uint32_t[]>(width_ * height_);
+    std::fill_n(depthBuffer_all_thread[thread_id].get(), width_ * height_,
+                std::numeric_limits<float>::infinity());
+    std::fill_n(colorBuffer_all_thread[thread_id].get(), width_ * height_, 0);
+  }
+  auto buffer_alloc_end = std::chrono::high_resolution_clock::now();
+  auto buffer_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                             buffer_alloc_end - buffer_alloc_start)
+                             .count() /
+                         1000.0;
+
+  // 2. Parallel rasterization; each thread writes only to its own buffers.
+  auto raster_start = std::chrono::high_resolution_clock::now();
+#pragma omp parallel num_threads(kNProc) default(none)              \
+    shared(processedVertices, shader, rasterizer_, width_, height_, \
+               depthBuffer_all_thread, colorBuffer_all_thread, model)
+  {
+    int thread_id = omp_get_thread_num();
+    auto &depthBuffer_per_thread = depthBuffer_all_thread[thread_id];
+    auto &colorBuffer_per_thread = colorBuffer_all_thread[thread_id];
+
+#pragma omp for
+    for (const auto &f : model.GetFaces()) {
+      auto v0 = processedVertices[f.GetIndex(0)];
+      auto v1 = processedVertices[f.GetIndex(1)];
+      auto v2 = processedVertices[f.GetIndex(2)];
+
+      // Back-face culling via the screen-space cross product.
+      Vector2f screen0(v0.GetPosition().x, v0.GetPosition().y);
+      Vector2f screen1(v1.GetPosition().x, v1.GetPosition().y);
+      Vector2f screen2(v2.GetPosition().x, v2.GetPosition().y);
+
+      // Two triangle edges; the sign of their cross product gives the winding.
+      Vector2f edge1 = screen1 - screen0;
+      Vector2f edge2 = screen2 - screen0;
+
+      // In NDC a negative cross product means clockwise, i.e. back-facing.
+      // The viewport transform flips Y, so in screen space it is positive.
+      float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+      if (cross_product > 0.0f) {
+        continue;  // back-facing
+      }
+
+      const Material *material = &f.GetMaterial();
+      auto fragments = rasterizer_->Rasterize(v0, v1, v2);
+
+      for (auto &fragment : fragments) {
+        fragment.material = material;
+        size_t x = fragment.screen_coord[0];
+        size_t y = fragment.screen_coord[1];
+        if (x >= width_ || y >= height_) {
+          continue;
+        }
+        size_t index = x + y * width_;
+        if (fragment.depth < depthBuffer_per_thread[index]) {
+          depthBuffer_per_thread[index] = fragment.depth;
+          auto color = shader->FragmentShader(fragment);
+          colorBuffer_per_thread[index] = uint32_t(color);
+        }
+      }
+    }
+  }
+  auto raster_end = std::chrono::high_resolution_clock::now();
+  auto raster_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       raster_end - raster_start)
+                       .count() /
+                   1000.0;
+
+  // 3. Merge: per pixel, keep the color whose depth is smallest across threads.
+  auto merge_start = std::chrono::high_resolution_clock::now();
+  std::unique_ptr<float[]> depthBuffer =
+      std::make_unique<float[]>(width_ * height_);
+  std::unique_ptr<uint32_t[]> colorBuffer =
+      std::make_unique<uint32_t[]>(width_ * height_);
+  std::fill_n(depthBuffer.get(), width_ * height_,
+              std::numeric_limits<float>::infinity());
+  std::fill_n(colorBuffer.get(), width_ * height_, 0);
+
+#pragma omp parallel for
+  for (size_t i = 0; i < width_ * height_; i++) {
+    float min_depth = std::numeric_limits<float>::infinity();
+    uint32_t color = 0;
+    for (size_t thread_id = 0; thread_id < kNProc; thread_id++) {
+      float depth = depthBuffer_all_thread[thread_id][i];
+      if (depth < min_depth) {
+        min_depth = depth;
+        color = colorBuffer_all_thread[thread_id][i];
+      }
+    }
+    depthBuffer[i] = min_depth;
+    colorBuffer[i] = color;
+  }
+
+  std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
+  auto merge_end = std::chrono::high_resolution_clock::now();
+  auto merge_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                      merge_end - merge_start)
+                      .count() /
+                  1000.0;
+
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  auto total_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                      total_end_time - total_start_time)
+                      .count() /
+                  1000.0;
+
+  SPDLOG_INFO("=== PER-TRIANGLE RENDERING PERFORMANCE ===");
+  double sum_ms = vertex_ms + (total_ms - vertex_ms);  // == total_ms
+  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
+              vertex_ms / sum_ms * 100);
+  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_INFO("Rasterization:    {:8.3f} ms", raster_ms);
+  SPDLOG_INFO("Merge:            {:8.3f} ms", merge_ms);
+  SPDLOG_INFO("Total:            {:8.3f} ms",
+              vertex_ms + (buffer_alloc_ms + raster_ms + merge_ms));
+  SPDLOG_INFO("==========================================");
+
+  return true;
+}
+
+}  // namespace simple_renderer
diff --git a/src/renderers/renderer_base.cpp b/src/renderers/renderer_base.cpp
new file mode 100644
index 0000000..5a82e5a
--- /dev/null
+++ b/src/renderers/renderer_base.cpp
@@ -0,0 +1,44 @@
+#include "renderers/renderer_base.hpp"
+
+#include <algorithm>
+
+namespace simple_renderer {
+// Perspective divide (clip space -> NDC); on the normal path w is replaced by 1/w_clip for later interpolation.
+Vertex RendererBase::PerspectiveDivision(const Vertex &vertex) {
+  Vector4f position = vertex.GetPosition();
+
+  if (position.w <= kMinWValue) {  // w at or below the threshold: skip the divide
+    Vector4f farPosition(0.0f, 0.0f, 1.0f, 1.0f);
+    return Vertex(farPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
+  }
+
+  float original_w = position.w;
+  Vector4f ndcPosition(
+      position.x / position.w,  // x_ndc = x_clip / w_clip
+      position.y / position.w,  // y_ndc = y_clip / w_clip
+      position.z / position.w,  // z_ndc = z_clip / w_clip
+      1.0f / original_w         // keep 1/w for perspective-correct interpolation
+  );
+
+  ndcPosition.z = std::clamp(ndcPosition.z, -1.0f, 1.0f);
+  return Vertex(ndcPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor(), vertex.GetClipPosition());
+}
+// Viewport transform: maps NDC x/y to screen pixels (Y flipped); z and w pass through unchanged.
+Vertex RendererBase::ViewportTransformation(const Vertex &vertex) {
+  Vector4f ndcPosition = vertex.GetPosition();
+
+  // Map NDC [-1,1] to screen coordinates [0,width] x [0,height].
+  float screen_x = (ndcPosition.x + 1.0f) * width_ / 2.0f;
+  float screen_y = (1.0f - ndcPosition.y) * height_ / 2.0f;
+
+  Vector4f screenPosition(
+      screen_x,
+      screen_y,
+      ndcPosition.z,
+      ndcPosition.w);
+
+  return Vertex(screenPosition, vertex.GetNormal(), vertex.GetTexCoords(), vertex.GetColor());
+}
+
+}  // namespace simple_renderer
+
diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp
new file mode 100644
index 0000000..1ad3db7
--- /dev/null
+++ b/src/renderers/tile_based_renderer.cpp
@@ -0,0 +1,366 @@
+#include "renderers/tile_based_renderer.hpp"
+
+#include <omp.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cstring>
+#include <limits>
+
+#include "config.h"
+#include "log_system.h"
+
+namespace simple_renderer {
+// Renders `model` into `buffer` tile by tile: transform vertices, bin triangles to tiles, rasterize tiles in parallel.
+bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
+                               uint32_t *buffer) {
+  auto total_start_time = std::chrono::high_resolution_clock::now();
+  auto shader = std::make_shared<Shader>(shader_in);
+
+  // Vertex transform (SoA): clip and screen positions are both kept.
+  auto vertex_start = std::chrono::high_resolution_clock::now();
+  const auto &input_vertices = model.GetVertices();
+  VertexSoA soa;
+  soa.resize(input_vertices.size());
+
+#pragma omp parallel for num_threads(kNProc) schedule(static) \
+    shared(shader, soa, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) {
+    const auto &v = input_vertices[i];
+    auto clipSpaceVertex = shader->VertexShader(v);
+    soa.pos_clip[i] = clipSpaceVertex.GetPosition();
+    auto ndcVertex = PerspectiveDivision(clipSpaceVertex);
+    auto screenSpaceVertex = ViewportTransformation(ndcVertex);
+    soa.pos_screen[i] = screenSpaceVertex.GetPosition();
+    soa.normal[i] = screenSpaceVertex.GetNormal();
+    soa.uv[i] = screenSpaceVertex.GetTexCoords();
+    soa.color[i] = screenSpaceVertex.GetColor();
+  }
+  auto vertex_end = std::chrono::high_resolution_clock::now();
+  auto vertex_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       vertex_end - vertex_start)
+                       .count() /
+                   1000.0;
+
+  // 1. Setup
+  auto setup_start = std::chrono::high_resolution_clock::now();
+  const size_t TILE_SIZE = tile_size_ > 0 ? tile_size_ : 64;  // 64 if tile_size_ is not positive
+  const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE;
+  const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE;
+  const size_t total_tiles = tiles_x * tiles_y;
+
+  // One triangle list per tile (entries reference the SoA data).
+  std::vector<std::vector<TileTriangleRef>> tile_triangles(total_tiles);
+  auto setup_end = std::chrono::high_resolution_clock::now();
+  auto setup_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                      setup_end - setup_start)
+                      .count() /
+                  1000.0;
+
+  // 2. Binning
+  auto binning_start = std::chrono::high_resolution_clock::now();
+  TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
+  auto binning_end = std::chrono::high_resolution_clock::now();
+  auto binning_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                        binning_end - binning_start)
+                        .count() /
+                    1000.0;
+
+  // 3. Single global framebuffer.
+  // Tiles write straight into it, avoiding a final O(W*H*kNProc) merge.
+
+  auto buffer_alloc_start = std::chrono::high_resolution_clock::now();
+  std::unique_ptr<float[]> depthBuffer =
+      std::make_unique<float[]>(width_ * height_);
+  std::unique_ptr<uint32_t[]> colorBuffer =
+      std::make_unique<uint32_t[]>(width_ * height_);
+  // Depth starts at the farthest value; color is cleared to zero.
+
+  std::fill_n(depthBuffer.get(), width_ * height_,
+              std::numeric_limits<float>::infinity());
+  std::fill_n(colorBuffer.get(), width_ * height_, 0);
+  auto buffer_alloc_end = std::chrono::high_resolution_clock::now();
+  auto buffer_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                             buffer_alloc_end - buffer_alloc_start)
+                             .count() /
+                         1000.0;
+
+  // 4. Rasterize the tiles in parallel (SoA + early-z).
+  auto raster_start = std::chrono::high_resolution_clock::now();
+#pragma omp parallel num_threads(kNProc) default(none)                        \
+    shared(tile_triangles, rasterizer_, shader, width_, height_, depthBuffer, \
+               colorBuffer, tiles_x, tiles_y, total_tiles, soa, TILE_SIZE)
+  {
+    // Per-thread tile-local depth/color buffers, reused for every tile this thread handles.
+    std::unique_ptr<float[]> tile_depth_buffer =
+        std::make_unique<float[]>(TILE_SIZE * TILE_SIZE);
+    std::unique_ptr<uint32_t[]> tile_color_buffer =
+        std::make_unique<uint32_t[]>(TILE_SIZE * TILE_SIZE);
+
+    // Per-thread reusable fragment scratch, with capacity for one full tile.
+    std::vector<Fragment> scratch_fragments;
+    scratch_fragments.reserve(TILE_SIZE * TILE_SIZE);
+
+#pragma omp for schedule(static)
+    for (size_t tile_id = 0; tile_id < total_tiles; ++tile_id) {
+      // Rasterize one tile (SoA path).
+      // It writes into the single global framebuffer; tiles do not overlap, so no locking.
+      RasterizeTile(tile_id, tile_triangles[tile_id], tiles_x, tiles_y,
+                    TILE_SIZE, tile_depth_buffer.get(), tile_color_buffer.get(),
+                    depthBuffer, colorBuffer, soa, *shader, early_z_,
+                    &scratch_fragments);
+    }
+  }
+  auto raster_end = std::chrono::high_resolution_clock::now();
+  auto raster_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                       raster_end - raster_start)
+                       .count() /
+                   1000.0;
+
+  // 5. Copy the single global colorBuffer to the output.
+  auto present_start = std::chrono::high_resolution_clock::now();
+  std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
+  auto present_end = std::chrono::high_resolution_clock::now();
+  auto present_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                        present_end - present_start)
+                        .count() /
+                    1000.0;
+
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  double total_ms = std::chrono::duration_cast<std::chrono::microseconds>(
+                        total_end_time - total_start_time)
+                        .count() /
+                    1000.0;
+
+  SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ===");
+  double sum_ms = vertex_ms + (total_ms - vertex_ms);  // == total_ms
+  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
+              vertex_ms / sum_ms * 100);
+  SPDLOG_INFO("Setup:            {:8.3f} ms", setup_ms);
+  SPDLOG_INFO("Binning:          {:8.3f} ms", binning_ms);
+  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_INFO("Rasterization:    {:8.3f} ms", raster_ms);
+  SPDLOG_INFO("Copy:             {:8.3f} ms", present_ms);
+  SPDLOG_INFO("Total:            {:8.3f} ms",
+              vertex_ms + (setup_ms + binning_ms + buffer_alloc_ms + raster_ms +
+                           present_ms));
+  SPDLOG_INFO("==========================================");
+
+  return true;
+}
+// Bins every face of `model` into `tile_triangles` in two passes (count, then fill) and logs binning statistics.
+void TileBasedRenderer::TriangleTileBinning(
+    const Model &model, const VertexSoA &soa,
+    std::vector<std::vector<TileTriangleRef>> &tile_triangles, size_t tiles_x,
+    size_t tiles_y, size_t tile_size) {
+  const size_t total_triangles = model.GetFaces().size();
+
+  SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles",
+              total_triangles);
+  SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_,
+              height_, tile_size, tiles_x, tiles_y);
+
+  std::vector<size_t> tile_counts(tiles_x * tiles_y, 0);
+
+  // Pass 1 (count only): how many triangles each tile has to hold.
+  for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
+    ProcessTriangleForTileBinning(tri_idx, true, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles);
+  }
+
+  // Reserve up front so the fill pass does not reallocate.
+  for (size_t tile_id = 0; tile_id < tile_triangles.size(); ++tile_id) {
+    if (tile_counts[tile_id] > 0)
+      tile_triangles[tile_id].reserve(tile_counts[tile_id]);
+  }
+
+  // Pass 2 (fill): append a TileTriangleRef to every tile the triangle covers.
+  for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
+    ProcessTriangleForTileBinning(tri_idx, false, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles);
+  }
+
+  size_t total_triangle_refs = 0;
+  size_t non_empty_tiles = 0;
+  for (const auto &tile : tile_triangles) {
+    total_triangle_refs += tile.size();
+    if (!tile.empty()) non_empty_tiles++;
+  }
+  SPDLOG_INFO("  (SoA) Total triangle references: {}", total_triangle_refs);
+  SPDLOG_INFO("  (SoA) Non-empty tiles: {}", non_empty_tiles);
+  SPDLOG_INFO("  (SoA) Average triangles per tile: {:.2f}",
+              total_triangle_refs > 0
+                  ? float(total_triangle_refs) / tile_triangles.size()
+                  : 0.0f);
+}
+// Rasterizes one tile's triangles into the tile-local buffers, then copies the tile rows into the global buffers.
+void TileBasedRenderer::RasterizeTile(
+    size_t tile_id, const std::vector<TileTriangleRef> &triangles,
+    size_t tiles_x, size_t tiles_y, size_t tile_size, float *tile_depth_buffer,
+    uint32_t *tile_color_buffer, std::unique_ptr<float[]> &global_depth_buffer,
+    std::unique_ptr<uint32_t[]> &global_color_buffer, const VertexSoA &soa,
+    const Shader &shader, bool use_early_z,
+    std::vector<Fragment> *scratch_fragments) {
+  (void)tiles_y;  // unused in this function
+  // Screen-space extent of this tile, clamped to the framebuffer size.
+  size_t tile_x = tile_id % tiles_x;
+  size_t tile_y = tile_id / tiles_x;
+  size_t screen_x_start = tile_x * tile_size;
+  size_t screen_y_start = tile_y * tile_size;
+  size_t screen_x_end = std::min(screen_x_start + tile_size, width_);
+  size_t screen_y_end = std::min(screen_y_start + tile_size, height_);
+
+  // Reset tile-local buffers. NOTE(review): depth clears to 1.0f here, but Render() clears the global buffer to +inf.
+  size_t tile_width = screen_x_end - screen_x_start;
+  size_t tile_height = screen_y_end - screen_y_start;
+  std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f);
+  std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
+
+  // Use a function-local container only when the caller passed no scratch; it is constructed once.
+  const bool use_internal_scratch = (scratch_fragments == nullptr);
+  std::vector<Fragment> internal_out;
+  if (use_internal_scratch) internal_out.reserve(tile_width * tile_height);
+
+  for (const auto &tri : triangles) {  // `out` below covers the scratch == nullptr case
+    // Always take the SoA + clip-rectangle path; without a caller scratch, use the local container.
+    std::vector<Fragment> &out =
+        use_internal_scratch ? internal_out : *scratch_fragments;
+    out.clear();
+    if (out.capacity() < tile_width * tile_height)
+      out.reserve(tile_width * tile_height);
+
+    rasterizer_->RasterizeTo(
+        soa, tri.i0, tri.i1, tri.i2, static_cast<int>(screen_x_start),
+        static_cast<int>(screen_y_start), static_cast<int>(screen_x_end),
+        static_cast<int>(screen_y_end), out);
+
+    for (auto &fragment : out) {
+      fragment.material = tri.material;
+      size_t sx = fragment.screen_coord[0];
+      size_t sy = fragment.screen_coord[1];
+      if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start &&
+          sy < screen_y_end) {
+        size_t local_x = sx - screen_x_start;
+        size_t local_y = sy - screen_y_start;
+        size_t idx = local_x + local_y * tile_width;
+        if (use_early_z) {
+          if (fragment.depth < tile_depth_buffer[idx]) {
+            auto color = shader.FragmentShader(fragment);
+            tile_depth_buffer[idx] = fragment.depth;
+            tile_color_buffer[idx] = uint32_t(color);
+          }
+        } else {
+          auto color = shader.FragmentShader(fragment);
+          if (fragment.depth < tile_depth_buffer[idx]) {
+            tile_depth_buffer[idx] = fragment.depth;
+            tile_color_buffer[idx] = uint32_t(color);
+          }
+        }
+      }
+    }
+  }
+
+  // Write back to the global buffers.
+  // Tiles cover disjoint screen regions, and the depth test inside the tile has
+  // already settled each pixel's final value, so rows can be copied verbatim.
+  for (size_t y = 0; y < tile_height; y++) {
+    const size_t tile_row_off = y * tile_width;
+    const size_t global_row_off =
+        (screen_y_start + y) * width_ + screen_x_start;
+
+    // Copy this row of color into the global color buffer.
+    std::memcpy(global_color_buffer.get() + global_row_off,
+                tile_color_buffer + tile_row_off,
+                tile_width * sizeof(uint32_t));
+
+    // Copy this row of depth into the global depth buffer.
+    std::memcpy(global_depth_buffer.get() + global_row_off,
+                tile_depth_buffer + tile_row_off, tile_width * sizeof(float));
+  }
+}
+// Bins one face: skips it if frustum-culled or back-facing, else counts (pass 1) or records (pass 2) it per covered tile.
+void TileBasedRenderer::ProcessTriangleForTileBinning(
+    size_t tri_idx, bool count_only,
+    const Model& model, const VertexSoA& soa,
+    size_t tiles_x, size_t tiles_y, size_t tile_size,
+    std::vector<size_t>& tile_counts,
+    std::vector<std::vector<TileTriangleRef>>& tile_triangles) {
+  const auto &f = model.GetFaces()[tri_idx];
+  size_t i0 = f.GetIndex(0);
+  size_t i1 = f.GetIndex(1);
+  size_t i2 = f.GetIndex(2);
+
+  // Frustum culling (clip space).
+  // Conservative: cull only when all three vertices are outside the same plane, each tested against its OWN w.
+  const Vector4f &c0 = soa.pos_clip[i0];
+  const Vector4f &c1 = soa.pos_clip[i1];
+  const Vector4f &c2 = soa.pos_clip[i2];
+  bool frustum_cull =
+      (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||     // beyond right plane
+      (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) ||  // beyond left plane
+      (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||     // beyond top plane
+      (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) ||  // beyond bottom plane
+      (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||     // beyond far plane
+      (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);    // beyond near plane
+  if (frustum_cull) {
+    return;
+  }
+
+  const Vector4f &pos0 = soa.pos_screen[i0];
+  const Vector4f &pos1 = soa.pos_screen[i1];
+  const Vector4f &pos2 = soa.pos_screen[i2];
+
+  // Back-face culling (screen space).
+  // In NDC a negative cross product means clockwise, i.e. back-facing.
+  // The viewport transform flips Y, so in screen space the sign is positive.
+  Vector2f screen0(pos0.x, pos0.y);
+  Vector2f screen1(pos1.x, pos1.y);
+  Vector2f screen2(pos2.x, pos2.y);
+  Vector2f edge1 = screen1 - screen0;
+  Vector2f edge2 = screen2 - screen0;
+  float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+  if (cross_product > 0.0f) return;
+
+  float screen_x0 = pos0.x;
+  float screen_y0 = pos0.y;
+  float screen_x1 = pos1.x;
+  float screen_y1 = pos1.y;
+  float screen_x2 = pos2.x;
+  float screen_y2 = pos2.y;
+
+  // Screen-space bounding box, used to pick the covered tile range.
+  float min_x = std::min({screen_x0, screen_x1, screen_x2});
+  float max_x = std::max({screen_x0, screen_x1, screen_x2});
+  float min_y = std::min({screen_y0, screen_y1, screen_y2});
+  float max_y = std::max({screen_y0, screen_y1, screen_y2});
+
+  int start_tile_x =
+      std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
+  int end_tile_x =
+      std::min(static_cast<int>(tiles_x - 1),
+               static_cast<int>(max_x) / static_cast<int>(tile_size));
+  int start_tile_y =
+      std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
+  int end_tile_y =
+      std::min(static_cast<int>(tiles_y - 1),
+               static_cast<int>(max_y) / static_cast<int>(tile_size));
+  if (start_tile_x > end_tile_x || start_tile_y > end_tile_y)
+    return;  // bbox overlaps no tile
+
+  if (count_only) {  // pass 1: only count triangles per tile
+    for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
+      for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
+        size_t tile_id = ty * tiles_x + tx;
+        tile_counts[tile_id]++;
+      }
+    }
+  } else {  // pass 2: append the TileTriangleRef to each covered tile
+    TileTriangleRef tri_ref{i0, i1, i2, &f.GetMaterial(), tri_idx};
+    for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
+      for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
+        size_t tile_id = ty * tiles_x + tx;
+        tile_triangles[tile_id].push_back(tri_ref);
+      }
+    }
+  }
+}
+
+}  // namespace simple_renderer
diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index 9725181..d6491d9 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -80,7 +80,7 @@ int main(int argc, char **argv) {
 
   simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f));
 
-  // 设置渲染模式（可选：TRADITIONAL、TILE_BASED 或 DEFERRED）
+  // 设置渲染模式（可选：PER_TRIANGLE、TILE_BASED 或 DEFERRED）
   simple_renderer.SetRenderingMode(simple_renderer::RenderingMode::TILE_BASED);
   
   // 输出当前渲染模式

From d6e3b4002ac1a1f9ef45c7f7d02f426cb86953bd Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Sun, 14 Sep 2025 23:34:26 +0800
Subject: [PATCH 17/24] TBR: Replace barycentric coordinate computation with
 half-space testing to enable SIMD-friendly rasterization; use
 relative-coordinate cross products to ensure numerical stability.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/rasterizer.cpp | 87 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 64 insertions(+), 23 deletions(-)

diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 84cbc83..9b8558a 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -145,41 +145,82 @@ void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t
   const Vector4f& p1 = soa.pos_screen[i1];
   const Vector4f& p2 = soa.pos_screen[i2];
 
-  Vector2f a = Vector2f(p0.x, p0.y);
-  Vector2f b = Vector2f(p1.x, p1.y);
-  Vector2f c = Vector2f(p2.x, p2.y);
+  // 为BarycentricCoord预构造Vec3f，避免循环内重复构造
+  const Vector3f sp0(p0.x, p0.y, p0.z);
+  const Vector3f sp1(p1.x, p1.y, p1.z);
+  const Vector3f sp2(p2.x, p2.y, p2.z);
 
-  Vector2f bboxMin = Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
-  Vector2f bboxMax = Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
-
-  // Clamp 到屏幕尺寸
-  float minx = std::max(0.0f, bboxMin.x);
-  float miny = std::max(0.0f, bboxMin.y);
-  float maxx = std::min(float(width_ - 1), bboxMax.x);
-  float maxy = std::min(float(height_ - 1), bboxMax.y);
+  // 计算屏幕空间AABB包围盒
+  const float minx_f = std::max(0.0f, std::min({p0.x, p1.x, p2.x}));
+  const float miny_f = std::max(0.0f, std::min({p0.y, p1.y, p2.y}));
+  const float maxx_f = std::min(float(width_  - 1), std::max({p0.x, p1.x, p2.x}));
+  const float maxy_f = std::min(float(height_ - 1), std::max({p0.y, p1.y, p2.y}));
 
   // 与外部提供的裁剪区域相交（半开区间） -> 闭区间扫描
-  int sx = std::max(x0, static_cast<int>(std::floor(minx)));
-  int sy = std::max(y0, static_cast<int>(std::floor(miny)));
-  int ex = std::min(x1 - 1, static_cast<int>(std::floor(maxx)));
-  int ey = std::min(y1 - 1, static_cast<int>(std::floor(maxy)));
+  int sx = std::max(x0, static_cast<int>(std::floor(minx_f)));
+  int sy = std::max(y0, static_cast<int>(std::floor(miny_f)));
+  int ex = std::min(x1 - 1, static_cast<int>(std::floor(maxx_f)));
+  int ey = std::min(y1 - 1, static_cast<int>(std::floor(maxy_f)));
   if (sx > ex || sy > ey) return;
 
-  for (int x = sx; x <= ex; ++x) {
-    for (int y = sy; y <= ey; ++y) {
-      auto [is_inside, bary] = GetBarycentricCoord(
-          Vector3f(p0.x, p0.y, p0.z), Vector3f(p1.x, p1.y, p1.z), Vector3f(p2.x, p2.y, p2.z),
-          Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
-      if (!is_inside) continue;
+  // 预计算边函数系数：E(x,y) = A*x + B*y + C
+  // 使用相对坐标的边函数定义，避免大常数项导致的数值不稳定
+  // 如使用绝对形式Ax+By+C会由于常数C的量级过大，造成浮点抵消，有效位丢失不稳定
+  auto cross2 = [](float ax, float ay, float bx, float by) {
+    return ax * by - ay * bx;
+  };
+  // 边向量
+  const float e01x = p1.x - p0.x, e01y = p1.y - p0.y; // (p0->p1)
+  const float e12x = p2.x - p1.x, e12y = p2.y - p1.y; // (p1->p2)
+  const float e20x = p0.x - p2.x, e20y = p0.y - p2.y; // (p2->p0)
+
+  // 有向面积（两倍），用相对面积定义：area2 = cross(p1 - p0, p2 - p0)
+  float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y);
+  if (std::abs(area2) < 1e-6f) return; // 退化三角形
+  const float inv_area2 = 1.0f / area2;
+  const bool positive = (area2 > 0.0f);
+
+  // 行优先遍历：有利于 cache 与向量化
+  #pragma omp simd
+  for (int y = sy; y <= ey; ++y) {
+    const float yf = static_cast<float>(y);
+
+    // 注意：此处存在对 out.push_back 的写入，属于有副作用操作，不适合使用
+    // omp simd 进行强制向量化，否则可能导致不符合预期的行为（如周期性伪影）。
+    // 先保持标量内层，后续如切换为“直写像素回调”再考虑安全的 SIMD 化。
+    for (int x = sx; x <= ex; ++x) {
+      const float xf = static_cast<float>(x);
+
+      // 相对坐标边函数：
+      // E01(p) = cross(p1 - p0, p - p0)
+      // E12(p) = cross(p2 - p1, p - p1)
+      // E20(p) = cross(p0 - p2, p - p2)
+      const float E01 = cross2(e01x, e01y, xf - p0.x, yf - p0.y);
+      const float E12 = cross2(e12x, e12y, xf - p1.x, yf - p1.y);
+      const float E20 = cross2(e20x, e20y, xf - p2.x, yf - p2.y);
+
+      // 半空间测试（根据朝向选择符号）
+      const bool inside = positive ? (E01 >= 0.0f && E12 >= 0.0f && E20 >= 0.0f)
+                                   : (E01 <= 0.0f && E12 <= 0.0f && E20 <= 0.0f);
+      if (!inside) continue;
+
+      // 重心权重映射：
+      // b0 对应 v0，取与对边 (v1,v2) 的子面积 → E12
+      // b1 对应 v1 → E20
+      // b2 对应 v2 → E01
+      const float b0 = E12 * inv_area2;
+      const float b1 = E20 * inv_area2;
+      const float b2 = E01 * inv_area2;
+      const Vector3f bary(b0, b1, b2);
 
       // 透视矫正插值
       auto perspective_result = PerformPerspectiveCorrection(
           p0.w, p1.w, p2.w,
           p0.z, p1.z, p2.z,
           bary);
-      
+
       const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
-      float z = perspective_result.interpolated_z;
+      const float z = perspective_result.interpolated_z;
 
       Fragment frag; // Note: material 指针由调用方填写
       frag.screen_coord = {x, y};

From 30038efb17648268eeb760e8f67baf3aa928ac84 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Mon, 15 Sep 2025 13:38:26 +0800
Subject: [PATCH 18/24] DR: Optimize fragment collection(pre-reserve per
 bucket, move-insert, and per-bucket parallel merge)

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/renderers/deferred_renderer.cpp | 42 ++++++++++++++++++++++++-----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp
index 6931812..c93c9ff 100644
--- a/src/renderers/deferred_renderer.cpp
+++ b/src/renderers/deferred_renderer.cpp
@@ -3,6 +3,8 @@
 #include <omp.h>
 #include <algorithm>
 #include <chrono>
+#include <cassert>
+#include <iterator>
 
 #include "config.h"
 #include "log_system.h"
@@ -77,12 +79,40 @@ bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint3
 
   /* * * Fragment Collection * * */
   auto collect_start = std::chrono::high_resolution_clock::now();
-  std::vector<std::vector<Fragment>> fragmentsBuffer(width_ * height_);
-  for (const auto &fragmentsBuffer_per_thread : fragmentsBuffer_all_thread) {
-    for (size_t i = 0; i < fragmentsBuffer_per_thread.size(); i++) {
-      fragmentsBuffer[i].insert(fragmentsBuffer[i].end(),
-                                fragmentsBuffer_per_thread[i].begin(),
-                                fragmentsBuffer_per_thread[i].end());
+
+  const size_t pixel_count = static_cast<size_t>(width_) * static_cast<size_t>(height_);
+
+#ifndef NDEBUG
+  for (const auto &tb : fragmentsBuffer_all_thread) {
+    // 断言避免越界，确保固定维度
+    assert(tb.size() == pixel_count && "thread buffer size mismatch");
+  }
+#endif
+
+  // Pass 1: 统计每个像素桶的总片元数
+  std::vector<size_t> bucket_total(pixel_count, 0);
+  for (const auto &tb : fragmentsBuffer_all_thread) {
+    for (size_t i = 0; i < pixel_count; ++i) {
+      bucket_total[i] += tb[i].size();
+    }
+  }
+
+  // Pass 2: 统一预分配
+  std::vector<std::vector<Fragment>> fragmentsBuffer(pixel_count);
+  for (size_t i = 0; i < pixel_count; ++i) {
+    if (bucket_total[i] > 0) fragmentsBuffer[i].reserve(bucket_total[i]);
+  }
+
+  // Pass 3: 按桶并行合并（每个桶内部保持按线程序的插入顺序）
+#pragma omp parallel for num_threads(kNProc) schedule(static)
+  for (long long i = 0; i < static_cast<long long>(pixel_count); ++i) {
+    auto &dst = fragmentsBuffer[static_cast<size_t>(i)];
+    for (size_t t = 0; t < fragmentsBuffer_all_thread.size(); ++t) {
+      auto &src = fragmentsBuffer_all_thread[t][static_cast<size_t>(i)];
+      dst.insert(dst.end(),
+                 std::make_move_iterator(src.begin()),
+                 std::make_move_iterator(src.end()));
+      src.clear();
     }
   }
   auto collect_end = std::chrono::high_resolution_clock::now();

From 61e75a8d7486d29ca9338332453cade1ef3ac738 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Mon, 15 Sep 2025 15:48:25 +0800
Subject: [PATCH 19/24] Refactor: Modify the triangle binning logic in TBR to
 use the TileGridContext structure. Replacing hard-coded values with
 constants.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/renderers/tile_based_renderer.hpp |  42 ++++---
 src/renderers/tile_based_renderer.cpp         | 105 +++++++++---------
 2 files changed, 79 insertions(+), 68 deletions(-)

diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp
index e3ecb89..f524fb6 100644
--- a/src/include/renderers/tile_based_renderer.hpp
+++ b/src/include/renderers/tile_based_renderer.hpp
@@ -14,6 +14,16 @@ struct TileTriangleRef {
   size_t face_index = 0;
 };
 
+/**
+ * @brief Tile 网格上下文（供 binning 和 raster 共享的网格/几何信息）
+ */
+struct TileGridContext {
+  const VertexSoA& soa;
+  size_t tiles_x;
+  size_t tiles_y;
+  size_t tile_size;
+};
+
 /**
  * @brief 基于 Tile 的渲染器（Tile‑Major）
  *
@@ -49,10 +59,9 @@ class TileBasedRenderer final : public RendererBase {
    * @param tiles_y 垂直 tile 数
    * @param tile_size tile 像素尺寸
    */
-  void TriangleTileBinning(const Model &model,
-                           const VertexSoA &soa,
-                           std::vector<std::vector<TileTriangleRef>> &tile_triangles,
-                           size_t tiles_x, size_t tiles_y, size_t tile_size);
+  void TriangleTileBinning(const Model& model,
+                           const TileGridContext& grid,
+                           std::vector<std::vector<TileTriangleRef>> &tile_triangles);
 
   /**
    * @brief 处理单个三角形的 tile binning 逻辑
@@ -68,8 +77,8 @@ class TileBasedRenderer final : public RendererBase {
    */
   void ProcessTriangleForTileBinning(
       size_t tri_idx, bool count_only,
-      const Model& model, const VertexSoA& soa,
-      size_t tiles_x, size_t tiles_y, size_t tile_size,
+      const Model& model,
+      const TileGridContext& grid,
       std::vector<size_t>& tile_counts,
       std::vector<std::vector<TileTriangleRef>>& tile_triangles);
 
@@ -90,17 +99,20 @@ class TileBasedRenderer final : public RendererBase {
    * @param scratch_fragments 可复用片段临时容器
    */
   void RasterizeTile(size_t tile_id,
-                      const std::vector<TileTriangleRef> &triangles,
-                      size_t tiles_x, size_t tiles_y, size_t tile_size,
-                      float* tile_depth_buffer, uint32_t* tile_color_buffer,
-                      std::unique_ptr<float[]> &global_depth_buffer,
-                      std::unique_ptr<uint32_t[]> &global_color_buffer,
-                      const VertexSoA &soa,
-                      const Shader& shader,
-                      bool use_early_z,
-                      std::vector<Fragment>* scratch_fragments);
+                     const std::vector<TileTriangleRef> &triangles,
+                     const TileGridContext& grid,
+                     float* tile_depth_buffer, uint32_t* tile_color_buffer,
+                     std::unique_ptr<float[]> &global_depth_buffer,
+                     std::unique_ptr<uint32_t[]> &global_color_buffer,
+                     const Shader& shader,
+                     bool use_early_z,
+                     std::vector<Fragment>* scratch_fragments);
 
  private:
+  // 深度和颜色的默认值，同时用于tile级和全局级buffers的初始化
+  static constexpr float kDepthClear = 1.0f; // 默认为最远值，用于Early-Z
+  static constexpr uint32_t kColorClear = 0u; // 默认为黑色
+
   const bool early_z_;
   const size_t tile_size_;
 };
diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp
index 1ad3db7..c88f114 100644
--- a/src/renderers/tile_based_renderer.cpp
+++ b/src/renderers/tile_based_renderer.cpp
@@ -59,7 +59,8 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
 
   // 2. Binning
   auto binning_start = std::chrono::high_resolution_clock::now();
-  TriangleTileBinning(model, soa, tile_triangles, tiles_x, tiles_y, TILE_SIZE);
+  TileGridContext grid_ctx{soa, tiles_x, tiles_y, TILE_SIZE};
+  TriangleTileBinning(model, grid_ctx, tile_triangles);
   auto binning_end = std::chrono::high_resolution_clock::now();
   auto binning_ms = std::chrono::duration_cast<std::chrono::microseconds>(
                         binning_end - binning_start)
@@ -74,11 +75,10 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
       std::make_unique<float[]>(width_ * height_);
   std::unique_ptr<uint32_t[]> colorBuffer =
       std::make_unique<uint32_t[]>(width_ * height_);
-  // 深度初始化为最远值，颜色清零
 
-  std::fill_n(depthBuffer.get(), width_ * height_,
-              std::numeric_limits<float>::infinity());
-  std::fill_n(colorBuffer.get(), width_ * height_, 0);
+  // 深度初始化为最远值，颜色清零
+  std::fill_n(depthBuffer.get(), width_ * height_, kDepthClear);
+  std::fill_n(colorBuffer.get(), width_ * height_, kColorClear);
   auto buffer_alloc_end = std::chrono::high_resolution_clock::now();
   auto buffer_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(
                              buffer_alloc_end - buffer_alloc_start)
@@ -88,26 +88,26 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
   // 4. 并行光栅化每个 tile（SoA + early-z）
   auto raster_start = std::chrono::high_resolution_clock::now();
 #pragma omp parallel num_threads(kNProc) default(none)                        \
-    shared(tile_triangles, rasterizer_, shader, width_, height_, depthBuffer, \
-               colorBuffer, tiles_x, tiles_y, total_tiles, soa, TILE_SIZE)
+    shared(tile_triangles, shader, depthBuffer, colorBuffer, total_tiles,     \
+               grid_ctx, early_z_)
   {
     // 为每个 tile 分配局部深度和颜色缓冲
     std::unique_ptr<float[]> tile_depth_buffer =
-        std::make_unique<float[]>(TILE_SIZE * TILE_SIZE);
+        std::make_unique<float[]>(grid_ctx.tile_size * grid_ctx.tile_size);
     std::unique_ptr<uint32_t[]> tile_color_buffer =
-        std::make_unique<uint32_t[]>(TILE_SIZE * TILE_SIZE);
+        std::make_unique<uint32_t[]>(grid_ctx.tile_size * grid_ctx.tile_size);
 
     // 为每个 tile 分配可复用片段临时容器，容量按单 tile 上限预估
     std::vector<Fragment> scratch_fragments;
-    scratch_fragments.reserve(TILE_SIZE * TILE_SIZE);
+    scratch_fragments.reserve(grid_ctx.tile_size * grid_ctx.tile_size);
 
 #pragma omp for schedule(static)
     for (size_t tile_id = 0; tile_id < total_tiles; ++tile_id) {
       // 按照 tile 进行光栅化（SoA）
       // 直接写入单份全局 framebuffer；不同 tile 不重叠，无需加锁
-      RasterizeTile(tile_id, tile_triangles[tile_id], tiles_x, tiles_y,
-                    TILE_SIZE, tile_depth_buffer.get(), tile_color_buffer.get(),
-                    depthBuffer, colorBuffer, soa, *shader, early_z_,
+      RasterizeTile(tile_id, tile_triangles[tile_id], grid_ctx,
+                    tile_depth_buffer.get(), tile_color_buffer.get(),
+                    depthBuffer, colorBuffer, *shader, early_z_,
                     &scratch_fragments);
     }
   }
@@ -150,21 +150,22 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
 }
 
 void TileBasedRenderer::TriangleTileBinning(
-    const Model &model, const VertexSoA &soa,
-    std::vector<std::vector<TileTriangleRef>> &tile_triangles, size_t tiles_x,
-    size_t tiles_y, size_t tile_size) {
+    const Model& model,
+    const TileGridContext& grid,
+    std::vector<std::vector<TileTriangleRef>> &tile_triangles) {
   const size_t total_triangles = model.GetFaces().size();
 
   SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles",
               total_triangles);
   SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_,
-              height_, tile_size, tiles_x, tiles_y);
+              height_, grid.tile_size, grid.tiles_x, grid.tiles_y);
 
-  std::vector<size_t> tile_counts(tiles_x * tiles_y, 0);
+  std::vector<size_t> tile_counts(grid.tiles_x * grid.tiles_y, 0);
 
   // 第一遍（count only）：计算每个tile需要容纳多少三角形
   for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
-    ProcessTriangleForTileBinning(tri_idx, true, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles);
+    ProcessTriangleForTileBinning(tri_idx, true, model, grid,
+                                  tile_counts, tile_triangles);
   }
 
   // 预分配，避免动态扩容
@@ -175,7 +176,8 @@ void TileBasedRenderer::TriangleTileBinning(
 
   // 第二遍（fill）：按范围填充TriangleRef
   for (size_t tri_idx = 0; tri_idx < total_triangles; ++tri_idx) {
-    ProcessTriangleForTileBinning(tri_idx, false, model, soa, tiles_x, tiles_y, tile_size, tile_counts, tile_triangles);
+    ProcessTriangleForTileBinning(tri_idx, false, model, grid,
+                                  tile_counts, tile_triangles);
   }
 
   size_t total_triangle_refs = 0;
@@ -194,25 +196,24 @@ void TileBasedRenderer::TriangleTileBinning(
 
 void TileBasedRenderer::RasterizeTile(
     size_t tile_id, const std::vector<TileTriangleRef> &triangles,
-    size_t tiles_x, size_t tiles_y, size_t tile_size, float *tile_depth_buffer,
+    const TileGridContext& grid, float *tile_depth_buffer,
     uint32_t *tile_color_buffer, std::unique_ptr<float[]> &global_depth_buffer,
-    std::unique_ptr<uint32_t[]> &global_color_buffer, const VertexSoA &soa,
+    std::unique_ptr<uint32_t[]> &global_color_buffer,
     const Shader &shader, bool use_early_z,
     std::vector<Fragment> *scratch_fragments) {
-  (void)tiles_y;
   // 计算 tile 屏幕范围
-  size_t tile_x = tile_id % tiles_x;
-  size_t tile_y = tile_id / tiles_x;
-  size_t screen_x_start = tile_x * tile_size;
-  size_t screen_y_start = tile_y * tile_size;
-  size_t screen_x_end = std::min(screen_x_start + tile_size, width_);
-  size_t screen_y_end = std::min(screen_y_start + tile_size, height_);
+  size_t tile_x = tile_id % grid.tiles_x;
+  size_t tile_y = tile_id / grid.tiles_x;
+  size_t screen_x_start = tile_x * grid.tile_size;
+  size_t screen_y_start = tile_y * grid.tile_size;
+  size_t screen_x_end = std::min(screen_x_start + grid.tile_size, width_);
+  size_t screen_y_end = std::min(screen_y_start + grid.tile_size, height_);
 
   // 初始化 tile 局部缓冲
   size_t tile_width = screen_x_end - screen_x_start;
   size_t tile_height = screen_y_end - screen_y_start;
-  std::fill_n(tile_depth_buffer, tile_width * tile_height, 1.0f);
-  std::fill_n(tile_color_buffer, tile_width * tile_height, 0);
+  std::fill_n(tile_depth_buffer, tile_width * tile_height, kDepthClear);
+  std::fill_n(tile_color_buffer, tile_width * tile_height, kColorClear);
 
   // 只有当调用方没有提供 scratch 时，才启用本地容器并且只构造一次
   const bool use_internal_scratch = (scratch_fragments == nullptr);
@@ -228,7 +229,7 @@ void TileBasedRenderer::RasterizeTile(
       out.reserve(tile_width * tile_height);
 
     rasterizer_->RasterizeTo(
-        soa, tri.i0, tri.i1, tri.i2, static_cast<int>(screen_x_start),
+        grid.soa, tri.i0, tri.i1, tri.i2, static_cast<int>(screen_x_start),
         static_cast<int>(screen_y_start), static_cast<int>(screen_x_end),
         static_cast<int>(screen_y_end), out);
 
@@ -278,11 +279,9 @@ void TileBasedRenderer::RasterizeTile(
 }
 
 void TileBasedRenderer::ProcessTriangleForTileBinning(
-    size_t tri_idx, bool count_only,
-    const Model& model, const VertexSoA& soa,
-    size_t tiles_x, size_t tiles_y, size_t tile_size,
-    std::vector<size_t>& tile_counts,
-    std::vector<std::vector<TileTriangleRef>>& tile_triangles) {
+    size_t tri_idx, bool count_only, const Model &model,
+    const TileGridContext &grid, std::vector<size_t> &tile_counts,
+    std::vector<std::vector<TileTriangleRef>> &tile_triangles) {
   const auto &f = model.GetFaces()[tri_idx];
   size_t i0 = f.GetIndex(0);
   size_t i1 = f.GetIndex(1);
@@ -290,9 +289,9 @@ void TileBasedRenderer::ProcessTriangleForTileBinning(
 
   // 视锥体裁剪 (裁剪空间)
   // 保守视锥体裁剪：只有当整个三角形都在视锥体外同一侧时才裁剪
-  const Vector4f &c0 = soa.pos_clip[i0];
-  const Vector4f &c1 = soa.pos_clip[i1];
-  const Vector4f &c2 = soa.pos_clip[i2];
+  const Vector4f &c0 = grid.soa.pos_clip[i0];
+  const Vector4f &c1 = grid.soa.pos_clip[i1];
+  const Vector4f &c2 = grid.soa.pos_clip[i2];
   bool frustum_cull =
       (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||     // 右平面外
       (c0.x < -c0.w && c1.x < -c0.w && c2.x < -c0.w) ||  // 左平面外
@@ -304,9 +303,9 @@ void TileBasedRenderer::ProcessTriangleForTileBinning(
     return;
   }
 
-  const Vector4f &pos0 = soa.pos_screen[i0];
-  const Vector4f &pos1 = soa.pos_screen[i1];
-  const Vector4f &pos2 = soa.pos_screen[i2];
+  const Vector4f &pos0 = grid.soa.pos_screen[i0];
+  const Vector4f &pos1 = grid.soa.pos_screen[i1];
+  const Vector4f &pos2 = grid.soa.pos_screen[i2];
 
   // 背面剔除（屏幕空间）
   // NDC空间中叉积为负表示顺时针，即背面。
@@ -332,23 +331,23 @@ void TileBasedRenderer::ProcessTriangleForTileBinning(
   float min_y = std::min({screen_y0, screen_y1, screen_y2});
   float max_y = std::max({screen_y0, screen_y1, screen_y2});
 
-  int start_tile_x =
-      std::max(0, static_cast<int>(min_x) / static_cast<int>(tile_size));
+  int start_tile_x = std::max(0, static_cast<int>(min_x) /
+                                     static_cast<int>(grid.tile_size));
   int end_tile_x =
-      std::min(static_cast<int>(tiles_x - 1),
-               static_cast<int>(max_x) / static_cast<int>(tile_size));
-  int start_tile_y =
-      std::max(0, static_cast<int>(min_y) / static_cast<int>(tile_size));
+      std::min(static_cast<int>(grid.tiles_x - 1),
+               static_cast<int>(max_x) / static_cast<int>(grid.tile_size));
+  int start_tile_y = std::max(0, static_cast<int>(min_y) /
+                                     static_cast<int>(grid.tile_size));
   int end_tile_y =
-      std::min(static_cast<int>(tiles_y - 1),
-               static_cast<int>(max_y) / static_cast<int>(tile_size));
+      std::min(static_cast<int>(grid.tiles_y - 1),
+               static_cast<int>(max_y) / static_cast<int>(grid.tile_size));
   if (start_tile_x > end_tile_x || start_tile_y > end_tile_y)
     return;  // 如果bbox不在任何tile内，直接返回
 
   if (count_only) {  // 第一遍计数，只统计tile内三角形数量
     for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
       for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
-        size_t tile_id = ty * tiles_x + tx;
+        size_t tile_id = ty * grid.tiles_x + tx;
         tile_counts[tile_id]++;
       }
     }
@@ -356,7 +355,7 @@ void TileBasedRenderer::ProcessTriangleForTileBinning(
     TileTriangleRef tri_ref{i0, i1, i2, &f.GetMaterial(), tri_idx};
     for (int ty = start_tile_y; ty <= end_tile_y; ++ty) {
       for (int tx = start_tile_x; tx <= end_tile_x; ++tx) {
-        size_t tile_id = ty * tiles_x + tx;
+        size_t tile_id = ty * grid.tiles_x + tx;
         tile_triangles[tile_id].push_back(tri_ref);
       }
     }

From 86d06adfda6567c68c5fdb86c7cbb46416d60b11 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Mon, 15 Sep 2025 16:14:23 +0800
Subject: [PATCH 20/24] Change timing-related debug messages from SPDLOG_INFO
 to SPDLOG_DEBUG, and set the default compile-time log level to INFO

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/log_system.h                |  3 +++
 src/light.cpp                           |  2 +-
 src/rasterizer.cpp                      |  2 +-
 src/renderer.cpp                        |  2 +-
 src/renderers/deferred_renderer.cpp     | 18 +++++++--------
 src/renderers/per_triangle_renderer.cpp | 14 ++++++------
 src/renderers/tile_based_renderer.cpp   | 30 ++++++++++++-------------
 7 files changed, 37 insertions(+), 34 deletions(-)

diff --git a/src/include/log_system.h b/src/include/log_system.h
index a1f2903..2f8d9c4 100755
--- a/src/include/log_system.h
+++ b/src/include/log_system.h
@@ -17,6 +17,9 @@
 #ifndef SIMPLERENDER_SRC_INCLUDE_LOG_SYSTEM_H_
 #define SIMPLERENDER_SRC_INCLUDE_LOG_SYSTEM_H_
 
+#ifndef SPDLOG_ACTIVE_LEVEL
+#define SPDLOG_ACTIVE_LEVEL SPDLOG_LEVEL_INFO
+#endif
 #include <spdlog/spdlog.h>
 
 namespace simple_renderer {
diff --git a/src/light.cpp b/src/light.cpp
index f25fb4c..ae3a51d 100644
--- a/src/light.cpp
+++ b/src/light.cpp
@@ -27,7 +27,7 @@ const Vector3f Light::kDefaultDir = Vector3f(0, 0, -1);
 const Color Light::kDefaultColor = Color::kWhite;
 
 Light::Light(const std::string &name) : name_(name) {
-  SPDLOG_INFO("Light: {}", name_);
+  SPDLOG_DEBUG("Light: {}", name_);
 }
 
 }  // namespace simple_renderer
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 9b8558a..04aa6b1 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -8,7 +8,7 @@ namespace simple_renderer {
 
 Rasterizer::Rasterizer(size_t width, size_t height)
     : width_(width), height_(height) {
-  SPDLOG_INFO("Rasterizer init with {}, {}", width, height);
+  SPDLOG_DEBUG("Rasterizer init with {}, {}", width, height);
 }
 
 std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
diff --git a/src/renderer.cpp b/src/renderer.cpp
index 4319066..0939cf5 100644
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -30,7 +30,7 @@ SimpleRenderer::SimpleRenderer(size_t width, size_t height)
 
 bool SimpleRenderer::DrawModel(const Model &model, const Shader &shader, uint32_t *buffer) {
   EnsureRenderer(); // 确保渲染器实例存在
-  SPDLOG_INFO("draw model: {}", model.GetModelPath());
+  SPDLOG_DEBUG("draw model: {}", model.GetModelPath());
   return renderer_->Render(model, shader, buffer);
 }
 
diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp
index c93c9ff..a86d41f 100644
--- a/src/renderers/deferred_renderer.cpp
+++ b/src/renderers/deferred_renderer.cpp
@@ -159,16 +159,16 @@ bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint3
   auto total_end_time = std::chrono::high_resolution_clock::now();
   double total_ms = std::chrono::duration_cast<std::chrono::microseconds>(total_end_time - total_start_time).count() / 1000.0;
 
-  SPDLOG_INFO("=== DEFERRED RENDERING PERFORMANCE ===");
+  SPDLOG_DEBUG("=== DEFERRED RENDERING PERFORMANCE ===");
   double sum_ms = vertex_ms + (total_ms - vertex_ms);
-  SPDLOG_INFO("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/sum_ms*100);
-  SPDLOG_INFO("Buffer Alloc:         {:8.3f} ms", buffer_alloc_ms);
-  SPDLOG_INFO("Rasterization:        {:8.3f} ms", raster_ms);
-  SPDLOG_INFO("Fragment Collection:  {:8.3f} ms", collect_ms);
-  SPDLOG_INFO("Fragment Merge:       {:8.3f} ms", merge_ms);
-  SPDLOG_INFO("Deferred Shading:     {:8.3f} ms", shade_ms);
-  SPDLOG_INFO("Total:                {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + collect_ms + merge_ms + shade_ms));
-  SPDLOG_INFO("=========================================");
+  SPDLOG_DEBUG("Vertex Shader:        {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms/sum_ms*100);
+  SPDLOG_DEBUG("Buffer Alloc:         {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_DEBUG("Rasterization:        {:8.3f} ms", raster_ms);
+  SPDLOG_DEBUG("Fragment Collection:  {:8.3f} ms", collect_ms);
+  SPDLOG_DEBUG("Fragment Merge:       {:8.3f} ms", merge_ms);
+  SPDLOG_DEBUG("Deferred Shading:     {:8.3f} ms", shade_ms);
+  SPDLOG_DEBUG("Total:                {:8.3f} ms", vertex_ms + (buffer_alloc_ms + raster_ms + collect_ms + merge_ms + shade_ms));
+  SPDLOG_DEBUG("=========================================");
 
   return true;
 }
diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp
index 9e3167c..369f282 100644
--- a/src/renderers/per_triangle_renderer.cpp
+++ b/src/renderers/per_triangle_renderer.cpp
@@ -155,16 +155,16 @@ bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in,
                       .count() /
                   1000.0;
 
-  SPDLOG_INFO("=== PER-TRIANGLE RENDERING PERFORMANCE ===");
+  SPDLOG_DEBUG("=== PER-TRIANGLE RENDERING PERFORMANCE ===");
   double sum_ms = vertex_ms + (total_ms - vertex_ms);
-  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
+  SPDLOG_DEBUG("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
               vertex_ms / sum_ms * 100);
-  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
-  SPDLOG_INFO("Rasterization:    {:8.3f} ms", raster_ms);
-  SPDLOG_INFO("Merge:            {:8.3f} ms", merge_ms);
-  SPDLOG_INFO("Total:            {:8.3f} ms",
+  SPDLOG_DEBUG("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_DEBUG("Rasterization:    {:8.3f} ms", raster_ms);
+  SPDLOG_DEBUG("Merge:            {:8.3f} ms", merge_ms);
+  SPDLOG_DEBUG("Total:            {:8.3f} ms",
               vertex_ms + (buffer_alloc_ms + raster_ms + merge_ms));
-  SPDLOG_INFO("==========================================");
+  SPDLOG_DEBUG("==========================================");
 
   return true;
 }
diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp
index c88f114..e0df64d 100644
--- a/src/renderers/tile_based_renderer.cpp
+++ b/src/renderers/tile_based_renderer.cpp
@@ -132,19 +132,19 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
                         .count() /
                     1000.0;
 
-  SPDLOG_INFO("=== TILE-BASED RENDERING PERFORMANCE ===");
+  SPDLOG_DEBUG("=== TILE-BASED RENDERING PERFORMANCE ===");
   double sum_ms = vertex_ms + (total_ms - vertex_ms);
-  SPDLOG_INFO("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
+  SPDLOG_DEBUG("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms,
               vertex_ms / sum_ms * 100);
-  SPDLOG_INFO("Setup:            {:8.3f} ms", setup_ms);
-  SPDLOG_INFO("Binning:          {:8.3f} ms", binning_ms);
-  SPDLOG_INFO("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
-  SPDLOG_INFO("Rasterization:    {:8.3f} ms", raster_ms);
-  SPDLOG_INFO("Copy:             {:8.3f} ms", present_ms);
-  SPDLOG_INFO("Total:            {:8.3f} ms",
+  SPDLOG_DEBUG("Setup:            {:8.3f} ms", setup_ms);
+  SPDLOG_DEBUG("Binning:          {:8.3f} ms", binning_ms);
+  SPDLOG_DEBUG("Buffer Alloc:     {:8.3f} ms", buffer_alloc_ms);
+  SPDLOG_DEBUG("Rasterization:    {:8.3f} ms", raster_ms);
+  SPDLOG_DEBUG("Copy:             {:8.3f} ms", present_ms);
+  SPDLOG_DEBUG("Total:            {:8.3f} ms",
               vertex_ms + (setup_ms + binning_ms + buffer_alloc_ms + raster_ms +
-                           present_ms));
-  SPDLOG_INFO("==========================================");
+                          present_ms));
+  SPDLOG_DEBUG("==========================================");
 
   return true;
 }
@@ -155,9 +155,9 @@ void TileBasedRenderer::TriangleTileBinning(
     std::vector<std::vector<TileTriangleRef>> &tile_triangles) {
   const size_t total_triangles = model.GetFaces().size();
 
-  SPDLOG_INFO("Starting triangle-tile binning (SoA) for {} triangles",
+  SPDLOG_DEBUG("Starting triangle-tile binning (SoA) for {} triangles",
               total_triangles);
-  SPDLOG_INFO("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_,
+  SPDLOG_DEBUG("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_,
               height_, grid.tile_size, grid.tiles_x, grid.tiles_y);
 
   std::vector<size_t> tile_counts(grid.tiles_x * grid.tiles_y, 0);
@@ -186,9 +186,9 @@ void TileBasedRenderer::TriangleTileBinning(
     total_triangle_refs += tile.size();
     if (!tile.empty()) non_empty_tiles++;
   }
-  SPDLOG_INFO("  (SoA) Total triangle references: {}", total_triangle_refs);
-  SPDLOG_INFO("  (SoA) Non-empty tiles: {}", non_empty_tiles);
-  SPDLOG_INFO("  (SoA) Average triangles per tile: {:.2f}",
+  SPDLOG_DEBUG("  (SoA) Total triangle references: {}", total_triangle_refs);
+  SPDLOG_DEBUG("  (SoA) Non-empty tiles: {}", non_empty_tiles);
+  SPDLOG_DEBUG("  (SoA) Average triangles per tile: {:.2f}",
               total_triangle_refs > 0
                   ? float(total_triangle_refs) / tile_triangles.size()
                   : 0.0f);

From 0ea7f223235fba7dbc7854b1ee4e6fb7cac5b43d Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Tue, 16 Sep 2025 13:41:08 +0800
Subject: [PATCH 21/24] TBR: Perform mask-based computation for TBR
 rasterization to achieve SIMD-friendly rasterization, and add corresponding
 mask statistics output.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/renderers/tile_based_renderer.hpp |  10 +-
 src/renderers/tile_based_renderer.cpp         | 250 +++++++++++++++---
 2 files changed, 219 insertions(+), 41 deletions(-)

diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp
index f524fb6..da7970c 100644
--- a/src/include/renderers/tile_based_renderer.hpp
+++ b/src/include/renderers/tile_based_renderer.hpp
@@ -14,6 +14,13 @@ struct TileTriangleRef {
   size_t face_index = 0;
 };
 
+struct TileMaskStats {
+  uint64_t tested = 0; // 遍历检测像素总数
+  uint64_t covered = 0; // 三角形内覆盖测试通过像素数（通过边函数做内点测试成功）
+  uint64_t zpass = 0; // 通过early-z测试像素数（深度值小于tile局部深度缓冲）
+  uint64_t shaded = 0; // 实际着色并写回像素数（同时通过early-z或late-z测试）
+};
+
 /**
  * @brief Tile 网格上下文（供 binning 和 raster 共享的网格/几何信息）
  */
@@ -106,7 +113,8 @@ class TileBasedRenderer final : public RendererBase {
                      std::unique_ptr<uint32_t[]> &global_color_buffer,
                      const Shader& shader,
                      bool use_early_z,
-                     std::vector<Fragment>* scratch_fragments);
+                     std::vector<Fragment>* scratch_fragments,
+                     TileMaskStats* out_stats);
 
  private:
   // 深度和颜色的默认值，同时用于tile级和全局级buffers的初始化
diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp
index e0df64d..39ad7fa 100644
--- a/src/renderers/tile_based_renderer.cpp
+++ b/src/renderers/tile_based_renderer.cpp
@@ -6,6 +6,7 @@
 #include <chrono>
 #include <cstring>
 #include <limits>
+#include <cmath>
 
 #include "config.h"
 #include "log_system.h"
@@ -16,6 +17,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
                                uint32_t *buffer) {
   auto total_start_time = std::chrono::high_resolution_clock::now();
   auto shader = std::make_shared<Shader>(shader_in);
+  shader->PrepareVertexUniforms();
 
   // 顶点变换（SoA）
   auto vertex_start = std::chrono::high_resolution_clock::now();
@@ -87,9 +89,10 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
 
   // 4. 并行光栅化每个 tile（SoA + early-z）
   auto raster_start = std::chrono::high_resolution_clock::now();
+  std::vector<TileMaskStats> tile_stats(total_tiles);
 #pragma omp parallel num_threads(kNProc) default(none)                        \
     shared(tile_triangles, shader, depthBuffer, colorBuffer, total_tiles,     \
-               grid_ctx, early_z_)
+               grid_ctx, early_z_, tile_stats)
   {
     // 为每个 tile 分配局部深度和颜色缓冲
     std::unique_ptr<float[]> tile_depth_buffer =
@@ -108,7 +111,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
       RasterizeTile(tile_id, tile_triangles[tile_id], grid_ctx,
                     tile_depth_buffer.get(), tile_color_buffer.get(),
                     depthBuffer, colorBuffer, *shader, early_z_,
-                    &scratch_fragments);
+                    &scratch_fragments, &tile_stats[tile_id]);
     }
   }
   auto raster_end = std::chrono::high_resolution_clock::now();
@@ -117,6 +120,23 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
                        .count() /
                    1000.0;
 
+  // 汇总并打印掩码收益统计
+  uint64_t sum_tested = 0, sum_covered = 0, sum_zpass = 0, sum_shaded = 0;
+  for (const auto& s : tile_stats) {
+    sum_tested += s.tested;
+    sum_covered += s.covered;
+    sum_zpass   += s.zpass;
+    sum_shaded  += s.shaded;
+  }
+  auto rate = [](uint64_t num, uint64_t den) -> double {
+    if (den == 0) return 0.0; return double(num) / double(den) * 100.0;
+  };
+  SPDLOG_DEBUG(
+      "TBR Mask Stats: tested={}, covered={} ({:.1f}%), zpass={} ({:.1f}%), shaded={} ({:.1f}%)",
+      sum_tested, sum_covered, rate(sum_covered, sum_tested),
+      sum_zpass, rate(sum_zpass, sum_covered),
+      sum_shaded, rate(sum_shaded, sum_covered));
+
   // 5. 直接将单份全局 colorBuffer 拷贝到输出
   auto present_start = std::chrono::high_resolution_clock::now();
   std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
@@ -200,7 +220,8 @@ void TileBasedRenderer::RasterizeTile(
     uint32_t *tile_color_buffer, std::unique_ptr<float[]> &global_depth_buffer,
     std::unique_ptr<uint32_t[]> &global_color_buffer,
     const Shader &shader, bool use_early_z,
-    std::vector<Fragment> *scratch_fragments) {
+    std::vector<Fragment> *scratch_fragments,
+    TileMaskStats* out_stats) {
   // 计算 tile 屏幕范围
   size_t tile_x = tile_id % grid.tiles_x;
   size_t tile_y = tile_id / grid.tiles_x;
@@ -215,50 +236,199 @@ void TileBasedRenderer::RasterizeTile(
   std::fill_n(tile_depth_buffer, tile_width * tile_height, kDepthClear);
   std::fill_n(tile_color_buffer, tile_width * tile_height, kColorClear);
 
-  // 只有当调用方没有提供 scratch 时，才启用本地容器并且只构造一次
-  const bool use_internal_scratch = (scratch_fragments == nullptr);
-  std::vector<Fragment> internal_out;
-  if (use_internal_scratch) internal_out.reserve(tile_width * tile_height);
-
-  for (const auto &tri : triangles) {  // 用来应对scratch传入nullptr的情况
-    // 始终走 SoA + 限制矩形的光栅化路径；如未提供 scratch，则使用函数内局部容器
-    std::vector<Fragment> &out =
-        use_internal_scratch ? internal_out : *scratch_fragments;
-    out.clear();
-    if (out.capacity() < tile_width * tile_height)
-      out.reserve(tile_width * tile_height);
-
-    rasterizer_->RasterizeTo(
-        grid.soa, tri.i0, tri.i1, tri.i2, static_cast<int>(screen_x_start),
-        static_cast<int>(screen_y_start), static_cast<int>(screen_x_end),
-        static_cast<int>(screen_y_end), out);
-
-    for (auto &fragment : out) {
-      fragment.material = tri.material;
-      size_t sx = fragment.screen_coord[0];
-      size_t sy = fragment.screen_coord[1];
-      if (sx >= screen_x_start && sx < screen_x_end && sy >= screen_y_start &&
-          sy < screen_y_end) {
-        size_t local_x = sx - screen_x_start;
-        size_t local_y = sy - screen_y_start;
-        size_t idx = local_x + local_y * tile_width;
-        if (use_early_z) {
-          if (fragment.depth < tile_depth_buffer[idx]) {
-            auto color = shader.FragmentShader(fragment);
-            tile_depth_buffer[idx] = fragment.depth;
-            tile_color_buffer[idx] = uint32_t(color);
+  // 掩码化扫描：按三角形直接写入 tile 局部缓冲，避免中间片段向量
+  constexpr int kLane = 8;  // 横向处理的像素个数（便于编译器自动向量化）
+
+  // Lightweight per-tile counters for evaluating the benefit of masking; copied into out_stats at the end of this function
+  uint64_t tested_pixels = 0;
+  uint64_t covered_pixels = 0;
+  uint64_t zpass_pixels = 0;
+  uint64_t shaded_pixels = 0;
+
+  auto cross2 = [](float ax, float ay, float bx, float by) {
+    return ax * by - ay * bx;
+  };
+
+  for (const auto &tri : triangles) {
+    const auto i0 = tri.i0, i1 = tri.i1, i2 = tri.i2;
+
+    // 顶点屏幕坐标
+    const Vector4f &p0 = grid.soa.pos_screen[i0];
+    const Vector4f &p1 = grid.soa.pos_screen[i1];
+    const Vector4f &p2 = grid.soa.pos_screen[i2];
+
+    // 三角形屏幕空间 AABB，与 tile 矩形求交
+    const float tri_minx = std::min({p0.x, p1.x, p2.x});
+    const float tri_miny = std::min({p0.y, p1.y, p2.y});
+    const float tri_maxx = std::max({p0.x, p1.x, p2.x});
+    const float tri_maxy = std::max({p0.y, p1.y, p2.y});
+
+    int sx = std::max<int>(static_cast<int>(screen_x_start),
+                           static_cast<int>(std::floor(std::max(0.0f, tri_minx))));
+    int sy = std::max<int>(static_cast<int>(screen_y_start),
+                           static_cast<int>(std::floor(std::max(0.0f, tri_miny))));
+    int ex = std::min<int>(static_cast<int>(screen_x_end - 1),
+                           static_cast<int>(std::floor(std::min<float>(width_ - 1, tri_maxx))));
+    int ey = std::min<int>(static_cast<int>(screen_y_end - 1),
+                           static_cast<int>(std::floor(std::min<float>(height_ - 1, tri_maxy))));
+    if (sx > ex || sy > ey) continue;
+
+    // 边向量与有向面积
+    const float e01x = p1.x - p0.x, e01y = p1.y - p0.y;
+    const float e12x = p2.x - p1.x, e12y = p2.y - p1.y;
+    const float e20x = p0.x - p2.x, e20y = p0.y - p2.y;
+    const float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y);
+    if (std::abs(area2) < 1e-6f) continue;  // 退化三角形
+    const bool positive = (area2 > 0.0f);
+
+    // z 与 1/w 的平面插值准备
+    const float z0 = p0.z, z1 = p1.z, z2 = p2.z;
+    const float w0_inv = 1.0f / p0.w, w1_inv = 1.0f / p1.w, w2_inv = 1.0f / p2.w;
+
+    // 行扫描
+    for (int y = sy; y <= ey; ++y) { // 行优先遍历：有利于 cache 与向量化
+      const float yf = static_cast<float>(y);
+      for (int xb = sx; xb <= ex; xb += kLane) { // 每次处理kLane个像素
+        const int lane = std::min(kLane, ex - xb + 1); // 当前需要处理的像素个数
+        const float x0f = static_cast<float>(xb); // 本块起点的x坐标
+
+        // 计算本块起点的三个边函数值与横向步长（dE/dx）
+        float E01_base = cross2(e01x, e01y, x0f - p0.x, yf - p0.y);
+        float E12_base = cross2(e12x, e12y, x0f - p1.x, yf - p1.y);
+        float E20_base = cross2(e20x, e20y, x0f - p2.x, yf - p2.y);
+        const float dE01dx = -e01y;
+        const float dE12dx = -e12y;
+        const float dE20dx = -e20y;
+
+        // ============== 构造覆盖掩码 cover mask ==============
+        unsigned mask_cover = 0u;
+        int cover_count = 0;
+        float E01[kLane], E12[kLane], E20[kLane];
+        #pragma omp simd
+        for (int j = 0; j < lane; ++j) {
+          E01[j] = E01_base + dE01dx * static_cast<float>(j);
+          E12[j] = E12_base + dE12dx * static_cast<float>(j);
+          E20[j] = E20_base + dE20dx * static_cast<float>(j);
+        }
+        for (int j = 0; j < lane; ++j) { // Inside test: a pixel whose three edge functions all agree with the triangle's orientation sign is added to the coverage mask
+          bool inside = positive ? (E01[j] >= 0.0f && E12[j] >= 0.0f && E20[j] >= 0.0f)
+                                 : (E01[j] <= 0.0f && E12[j] <= 0.0f && E20[j] <= 0.0f);
+          if (inside) {
+            mask_cover |= (1u << j);
+            cover_count++;
+          }
+        }
+        tested_pixels += static_cast<uint64_t>(lane);
+        covered_pixels += static_cast<uint64_t>(cover_count);
+        if (mask_cover == 0u) continue;
+
+        // ============== 计算 z，进行early-z掩码 ==============
+        unsigned mask_zpass = 0u;
+        float zvals[kLane];
+        // 缓存校正后的重心坐标，避免着色阶段重复计算
+        float b0c_arr[kLane];
+        float b1c_arr[kLane];
+        float b2c_arr[kLane];
+        int zpass_count = 0;
+        for (int j = 0; j < lane; ++j) {
+          if (((mask_cover >> j) & 1u) == 0u) { continue; } // 如果该像素不在覆盖掩码内，则跳过
+          const float b0 = E12[j] / area2;
+          const float b1 = E20[j] / area2;
+          const float b2 = E01[j] / area2;
+          const float w_inv = b0 * w0_inv + b1 * w1_inv + b2 * w2_inv; // 透视矫正
+          const float b0c = (b0 * w0_inv) / w_inv;
+          const float b1c = (b1 * w1_inv) / w_inv;
+          const float b2c = (b2 * w2_inv) / w_inv;
+          b0c_arr[j] = b0c; b1c_arr[j] = b1c; b2c_arr[j] = b2c;
+          const float z = z0 * b0c + z1 * b1c + z2 * b2c;
+          zvals[j] = z;
+
+          const int sx_pix = xb + j;
+          const int local_x = sx_pix - static_cast<int>(screen_x_start);
+          const int local_y = y - static_cast<int>(screen_y_start);
+          const size_t idx = static_cast<size_t>(local_x + local_y * static_cast<int>(tile_width));
+          if (z < tile_depth_buffer[idx]) {
+            mask_zpass |= (1u << j);
+            zpass_count++;
           }
-        } else {
-          auto color = shader.FragmentShader(fragment);
-          if (fragment.depth < tile_depth_buffer[idx]) {
-            tile_depth_buffer[idx] = fragment.depth;
-            tile_color_buffer[idx] = uint32_t(color);
+        }
+        zpass_pixels += static_cast<uint64_t>(zpass_count);
+
+        // ============== 构造最终掩码 ==============
+        unsigned mask_final = use_early_z ? (mask_cover & mask_zpass) : mask_cover;
+        if (mask_final == 0u && use_early_z) continue;
+
+        // Shade and write back only the pixels set in mask_final (with early-z off this is the coverage mask: shade first, then depth-test on write)
+        for (int j = 0; j < lane; ++j) {
+          if (((mask_final >> j) & 1u) == 0u) continue;  // lanes outside the mask have no barycentrics/depth computed
+          const int sx_pix = xb + j;
+          const int local_x = sx_pix - static_cast<int>(screen_x_start);
+          const int local_y = y - static_cast<int>(screen_y_start);
+          const size_t idx = static_cast<size_t>(local_x + local_y * static_cast<int>(tile_width));
+
+          // 计算插值属性
+          const float b0c = b0c_arr[j];
+          const float b1c = b1c_arr[j];
+          const float b2c = b2c_arr[j];
+
+          Fragment frag;
+          frag.screen_coord = {sx_pix, y};
+          frag.depth = zvals[j];
+          frag.material = tri.material;
+
+          // 法向量插值
+          const Vector3f &n0 = grid.soa.normal[i0];
+          const Vector3f &n1 = grid.soa.normal[i1];
+          const Vector3f &n2 = grid.soa.normal[i2];
+          frag.normal = n0 * b0c + n1 * b1c + n2 * b2c;
+
+          // 纹理坐标插值
+          const Vector2f &uv0 = grid.soa.uv[i0];
+          const Vector2f &uv1 = grid.soa.uv[i1];
+          const Vector2f &uv2 = grid.soa.uv[i2];
+          frag.uv = uv0 * b0c + uv1 * b1c + uv2 * b2c;
+
+          // 颜色插值
+          const Color &c0 = grid.soa.color[i0];
+          const Color &c1 = grid.soa.color[i1];
+          const Color &c2 = grid.soa.color[i2];
+          auto color_r = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexRed]) * b0c +
+                                        static_cast<float>(c1[Color::kColorIndexRed]) * b1c +
+                                        static_cast<float>(c2[Color::kColorIndexRed]) * b2c);
+          auto color_g = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexGreen]) * b0c +
+                                        static_cast<float>(c1[Color::kColorIndexGreen]) * b1c +
+                                        static_cast<float>(c2[Color::kColorIndexGreen]) * b2c);
+          auto color_b = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexBlue]) * b0c +
+                                        static_cast<float>(c1[Color::kColorIndexBlue]) * b1c +
+                                        static_cast<float>(c2[Color::kColorIndexBlue]) * b2c);
+          frag.color = Color(color_r, color_g, color_b);
+
+          if (use_early_z) { // 开启时，仅对mask中通过early-z的像素进行着色和写回
+            auto out_color = shader.FragmentShader(frag);
+            tile_depth_buffer[idx] = frag.depth;
+            tile_color_buffer[idx] = uint32_t(out_color);
+            shaded_pixels++;
+          } else {
+            // 关闭时，先着色，再按z测试写入
+            auto out_color = shader.FragmentShader(frag);
+            if (frag.depth < tile_depth_buffer[idx]) { // late-z
+              tile_depth_buffer[idx] = frag.depth;
+              tile_color_buffer[idx] = uint32_t(out_color);
+              shaded_pixels++;
+            }
           }
         }
       }
     }
   }
 
+  if (out_stats) {
+    out_stats->tested = tested_pixels;
+    out_stats->covered = covered_pixels;
+    out_stats->zpass = zpass_pixels;
+    out_stats->shaded = shaded_pixels;
+  }
+
   // 写回全局缓冲
   // TBR 下不同 tile 覆盖的屏幕区域互不重叠，且在 tile 内部已通过 Early‑Z
   // 得出每个像素的最终值。因此可以直接将 tile 行数据拷贝到全局缓冲

From b659f57137978941da8e83b907ff5f9f253236ff Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Tue, 16 Sep 2025 14:16:51 +0800
Subject: [PATCH 22/24] VS: Optimize the vertex matrix caching in shaders by
 adding cache preparation and update functionality to reduce redundant
 computations.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/shader.hpp                  |  24 +++++-
 src/renderers/deferred_renderer.cpp     |   1 +
 src/renderers/per_triangle_renderer.cpp |   1 +
 src/shader.cpp                          | 108 +++++++++++++++++++++---
 4 files changed, 119 insertions(+), 15 deletions(-)

diff --git a/src/include/shader.hpp b/src/include/shader.hpp
index ed08998..5c02a7a 100644
--- a/src/include/shader.hpp
+++ b/src/include/shader.hpp
@@ -63,6 +63,19 @@ struct SharedDataInShader {
   Vector3f fragPos_varying = Vector3f(0.0f);
 };
 
+struct VertexUniformCache {
+  Matrix4f model = Matrix4f(1.0f);
+  Matrix4f view = Matrix4f(1.0f);
+  Matrix4f projection = Matrix4f(1.0f);
+  Matrix4f model_view = Matrix4f(1.0f);
+  Matrix4f mvp = Matrix4f(1.0f);
+  Matrix3f normal = Matrix3f(1.0f);
+  bool has_model = false;
+  bool has_view = false;
+  bool has_projection = false;
+  bool derived_valid = false;
+};
+
 /**
  * @brief Shader Class 着色器类
  *
@@ -85,8 +98,13 @@ class Shader {
   template <typename T>
   void SetUniform(const std::string &name, const T &value) {
     uniformbuffer_.SetUniform(name, value);
+    if constexpr (std::is_same_v<T, Matrix4f>) {
+      UpdateMatrixCache(name, value);
+    }
   }
 
+  void PrepareVertexUniforms();
+
  private:
   // UniformBuffer
   UniformBuffer uniformbuffer_;
@@ -94,6 +112,10 @@ class Shader {
   // Shared Variables
   // 共享变量
   SharedDataInShader sharedDataInShader_;
+  VertexUniformCache vertex_uniform_cache_;
+
+  void UpdateMatrixCache(const std::string &name, const Matrix4f &value);
+  void RecalculateDerivedMatrices();
 
   Color SampleTexture(const Texture &texture, const Vector2f &uv) const;
   Color ClampColor(const Color color) const;
@@ -103,4 +125,4 @@ uint8_t FloatToUint8_t(float val);
 
 }  // namespace simple_renderer
 
-#endif /* SIMPLERENDER_SRC_INCLUDE_SHADER_H_ */
\ No newline at end of file
+#endif /* SIMPLERENDER_SRC_INCLUDE_SHADER_H_ */
diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp
index a86d41f..9d8f7c2 100644
--- a/src/renderers/deferred_renderer.cpp
+++ b/src/renderers/deferred_renderer.cpp
@@ -14,6 +14,7 @@ namespace simple_renderer {
 bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint32_t* buffer) {
   auto total_start_time = std::chrono::high_resolution_clock::now();
   auto shader = std::make_shared<Shader>(shader_in);
+  shader->PrepareVertexUniforms();
 
   // 顶点变换（AoS）
   auto vertex_start = std::chrono::high_resolution_clock::now();
diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp
index 369f282..8a3b4cb 100644
--- a/src/renderers/per_triangle_renderer.cpp
+++ b/src/renderers/per_triangle_renderer.cpp
@@ -19,6 +19,7 @@ bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in,
 
   // 复制 shader 以便在多线程中共享
   auto shader = std::make_shared<Shader>(shader_in);
+  shader->PrepareVertexUniforms();
 
   // 顶点变换（AoS）
   auto vertex_start = std::chrono::high_resolution_clock::now();
diff --git a/src/shader.cpp b/src/shader.cpp
index 7b8eeae..4441eed 100644
--- a/src/shader.cpp
+++ b/src/shader.cpp
@@ -3,29 +3,109 @@
 namespace simple_renderer {
 
 Vertex Shader::VertexShader(const Vertex& vertex) {
-  Matrix4f model_matrix = uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
-  Matrix4f view_matrix = uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
-  Matrix4f projection_matrix =
-      uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
-
-  Matrix4f mvp_matrix = projection_matrix * view_matrix * model_matrix;
-  
-  Matrix3f normal_matrix = glm::transpose(glm::inverse(Matrix3f(model_matrix)));
+  const bool cache_ready = vertex_uniform_cache_.derived_valid;
+
+  const Matrix4f* model_ptr = nullptr;
+  const Matrix4f* mvp_ptr = nullptr;
+  const Matrix3f* normal_ptr = nullptr;
+
+  Matrix4f fallback_model;
+  Matrix4f fallback_mvp;
+  Matrix3f fallback_normal;
+
+  if (cache_ready) { // 如果所有派生矩阵已预计算并可直接复用
+    // 直接复用缓存矩阵，避免逐顶点哈希查询
+    model_ptr = &vertex_uniform_cache_.model;
+    mvp_ptr = &vertex_uniform_cache_.mvp;
+    normal_ptr = &vertex_uniform_cache_.normal;
+  } else { // 如果缓存尚未建立
+    fallback_model = uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
+    Matrix4f view_matrix = uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
+    Matrix4f projection_matrix =
+        uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
+    fallback_mvp = projection_matrix * view_matrix * fallback_model;
+    fallback_normal =
+        glm::transpose(glm::inverse(Matrix3f(fallback_model)));
+    model_ptr = &fallback_model;
+    mvp_ptr = &fallback_mvp;
+    normal_ptr = &fallback_normal;
+  }
+
+  const Matrix4f& model_matrix = *model_ptr;
+  const Matrix4f& mvp_matrix = *mvp_ptr;
+  const Matrix3f& normal_matrix = *normal_ptr;
+
+  const Vector4f position = vertex.GetPosition();
+  Vector4f world_position = model_matrix * position;
   Vector3f transformed_normal = normal_matrix * vertex.GetNormal();
 
-  sharedDataInShader_.fragPos_varying = Vector3f(model_matrix * vertex.GetPosition());
+  // 将世界空间位置写入共享数据供片元阶段使用
+  sharedDataInShader_.fragPos_varying = Vector3f(world_position);
 
   // 计算裁剪空间坐标
-  Vector4f clip_position = mvp_matrix * vertex.GetPosition();
-  
+  Vector4f clip_position = mvp_matrix * position;
+
   // 返回变换后的顶点（包含变换后的法向量和裁剪坐标）
-  return Vertex(clip_position, 
-                transformed_normal, 
-                vertex.GetTexCoords(), 
+  return Vertex(clip_position, transformed_normal, vertex.GetTexCoords(),
                 vertex.GetColor(),
                 clip_position);  // 同时保存裁剪空间坐标用于后续裁剪
 }
 
+void Shader::PrepareVertexUniforms() {
+  if (vertex_uniform_cache_.derived_valid) {
+    return;
+  }
+  // 在进入顶点阶段前一次性取出常用矩阵并填充缓存
+  if (uniformbuffer_.HasUniform<Matrix4f>("modelMatrix") &&
+      uniformbuffer_.HasUniform<Matrix4f>("viewMatrix") &&
+      uniformbuffer_.HasUniform<Matrix4f>("projectionMatrix")) {
+    vertex_uniform_cache_.model =
+        uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
+    vertex_uniform_cache_.view =
+        uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
+    vertex_uniform_cache_.projection =
+        uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
+    vertex_uniform_cache_.has_model = true;
+    vertex_uniform_cache_.has_view = true;
+    vertex_uniform_cache_.has_projection = true;
+    RecalculateDerivedMatrices();
+  }
+}
+
+void Shader::UpdateMatrixCache(const std::string& name,
+                               const Matrix4f& value) {
+  if (name == "modelMatrix") {
+    vertex_uniform_cache_.model = value;
+    vertex_uniform_cache_.has_model = true;
+  } else if (name == "viewMatrix") {
+    vertex_uniform_cache_.view = value;
+    vertex_uniform_cache_.has_view = true;
+  } else if (name == "projectionMatrix") {
+    vertex_uniform_cache_.projection = value;
+    vertex_uniform_cache_.has_projection = true;
+  } else {
+    return;
+  }
+
+  // 任一基础矩阵更新后，标记派生矩阵失效等待重算
+  vertex_uniform_cache_.derived_valid = false;
+  if (vertex_uniform_cache_.has_model && vertex_uniform_cache_.has_view &&
+      vertex_uniform_cache_.has_projection) {
+    RecalculateDerivedMatrices();
+  }
+}
+
+void Shader::RecalculateDerivedMatrices() {
+  // 预计算 Model-View、MVP 以及法线矩阵，供顶点着色器复用
+  vertex_uniform_cache_.model_view =
+      vertex_uniform_cache_.view * vertex_uniform_cache_.model;
+  vertex_uniform_cache_.mvp = vertex_uniform_cache_.projection *
+                              vertex_uniform_cache_.model_view;
+  vertex_uniform_cache_.normal = glm::transpose(glm::inverse(
+      Matrix3f(vertex_uniform_cache_.model)));
+  vertex_uniform_cache_.derived_valid = true;
+}
+
 Color Shader::FragmentShader(const Fragment& fragment) const {
   // interpolate Normal, Color and UV
   Color interpolateColor = fragment.color;

From e81bcffd32be4dcf69b49c8b507aa646ddc8cf5f Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Tue, 16 Sep 2025 15:14:39 +0800
Subject: [PATCH 23/24] FS: Cache vectors and matrices to avoid redundant
 computations.

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/shader.hpp                  |  21 ++++-
 src/renderers/deferred_renderer.cpp     |   2 +-
 src/renderers/per_triangle_renderer.cpp |   2 +-
 src/renderers/tile_based_renderer.cpp   |   2 +-
 src/shader.cpp                          | 114 ++++++++++++++++++------
 5 files changed, 111 insertions(+), 30 deletions(-)

diff --git a/src/include/shader.hpp b/src/include/shader.hpp
index 5c02a7a..097214c 100644
--- a/src/include/shader.hpp
+++ b/src/include/shader.hpp
@@ -76,6 +76,15 @@ struct VertexUniformCache {
   bool derived_valid = false;
 };
 
+struct FragmentUniformCache {
+  Light light{};
+  Vector3f camera_pos = Vector3f(0.0f);
+  Vector3f light_dir_normalized = Vector3f(0.0f);
+  bool has_light = false;
+  bool has_camera = false;
+  bool derived_valid = false;
+};
+
 /**
  * @brief Shader Class 着色器类
  *
@@ -100,10 +109,14 @@ class Shader {
     uniformbuffer_.SetUniform(name, value);
     if constexpr (std::is_same_v<T, Matrix4f>) {
       UpdateMatrixCache(name, value);
+    } else if constexpr (std::is_same_v<T, Light>) {
+      UpdateFragmentCache(name, value);
+    } else if constexpr (std::is_same_v<T, Vector3f>) {
+      UpdateFragmentCache(name, value);
     }
   }
 
-  void PrepareVertexUniforms();
+  void PrepareUniformCaches();
 
  private:
   // UniformBuffer
@@ -113,9 +126,15 @@ class Shader {
   // 共享变量
   SharedDataInShader sharedDataInShader_;
   VertexUniformCache vertex_uniform_cache_;
+  FragmentUniformCache fragment_uniform_cache_;
 
   void UpdateMatrixCache(const std::string &name, const Matrix4f &value);
+  void UpdateFragmentCache(const std::string &name, const Light &value);
+  void UpdateFragmentCache(const std::string &name, const Vector3f &value);
   void RecalculateDerivedMatrices();
+  void RecalculateFragmentDerived();
+  void PrepareVertexUniformCache();
+  void PrepareFragmentUniformCache();
 
   Color SampleTexture(const Texture &texture, const Vector2f &uv) const;
   Color ClampColor(const Color color) const;
diff --git a/src/renderers/deferred_renderer.cpp b/src/renderers/deferred_renderer.cpp
index 9d8f7c2..523fe20 100644
--- a/src/renderers/deferred_renderer.cpp
+++ b/src/renderers/deferred_renderer.cpp
@@ -14,7 +14,7 @@ namespace simple_renderer {
 bool DeferredRenderer::Render(const Model& model, const Shader& shader_in, uint32_t* buffer) {
   auto total_start_time = std::chrono::high_resolution_clock::now();
   auto shader = std::make_shared<Shader>(shader_in);
-  shader->PrepareVertexUniforms();
+  shader->PrepareUniformCaches();
 
   // 顶点变换（AoS）
   auto vertex_start = std::chrono::high_resolution_clock::now();
diff --git a/src/renderers/per_triangle_renderer.cpp b/src/renderers/per_triangle_renderer.cpp
index 8a3b4cb..9348594 100644
--- a/src/renderers/per_triangle_renderer.cpp
+++ b/src/renderers/per_triangle_renderer.cpp
@@ -19,7 +19,7 @@ bool PerTriangleRenderer::Render(const Model &model, const Shader &shader_in,
 
   // 复制 shader 以便在多线程中共享
   auto shader = std::make_shared<Shader>(shader_in);
-  shader->PrepareVertexUniforms();
+  shader->PrepareUniformCaches();
 
   // 顶点变换（AoS）
   auto vertex_start = std::chrono::high_resolution_clock::now();
diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp
index 39ad7fa..e39526e 100644
--- a/src/renderers/tile_based_renderer.cpp
+++ b/src/renderers/tile_based_renderer.cpp
@@ -17,7 +17,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
                                uint32_t *buffer) {
   auto total_start_time = std::chrono::high_resolution_clock::now();
   auto shader = std::make_shared<Shader>(shader_in);
-  shader->PrepareVertexUniforms();
+  shader->PrepareUniformCaches();
 
   // 顶点变换（SoA）
   auto vertex_start = std::chrono::high_resolution_clock::now();
diff --git a/src/shader.cpp b/src/shader.cpp
index 4441eed..e01e9b1 100644
--- a/src/shader.cpp
+++ b/src/shader.cpp
@@ -51,27 +51,6 @@ Vertex Shader::VertexShader(const Vertex& vertex) {
                 clip_position);  // 同时保存裁剪空间坐标用于后续裁剪
 }
 
-void Shader::PrepareVertexUniforms() {
-  if (vertex_uniform_cache_.derived_valid) {
-    return;
-  }
-  // 在进入顶点阶段前一次性取出常用矩阵并填充缓存
-  if (uniformbuffer_.HasUniform<Matrix4f>("modelMatrix") &&
-      uniformbuffer_.HasUniform<Matrix4f>("viewMatrix") &&
-      uniformbuffer_.HasUniform<Matrix4f>("projectionMatrix")) {
-    vertex_uniform_cache_.model =
-        uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
-    vertex_uniform_cache_.view =
-        uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
-    vertex_uniform_cache_.projection =
-        uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
-    vertex_uniform_cache_.has_model = true;
-    vertex_uniform_cache_.has_view = true;
-    vertex_uniform_cache_.has_projection = true;
-    RecalculateDerivedMatrices();
-  }
-}
-
 void Shader::UpdateMatrixCache(const std::string& name,
                                const Matrix4f& value) {
   if (name == "modelMatrix") {
@@ -106,6 +85,80 @@ void Shader::RecalculateDerivedMatrices() {
   vertex_uniform_cache_.derived_valid = true;
 }
 
+void Shader::UpdateFragmentCache(const std::string& name,
+                                 const Light& value) {
+  if (name != "light") {
+    return;
+  }
+  fragment_uniform_cache_.light = value;
+  fragment_uniform_cache_.has_light = true;
+  fragment_uniform_cache_.derived_valid = false;
+  if (fragment_uniform_cache_.has_light && fragment_uniform_cache_.has_camera) {
+    RecalculateFragmentDerived();
+  }
+}
+
+void Shader::UpdateFragmentCache(const std::string& name,
+                                 const Vector3f& value) {
+  if (name != "cameraPos") {
+    return;
+  }
+  fragment_uniform_cache_.camera_pos = value;
+  fragment_uniform_cache_.has_camera = true;
+  fragment_uniform_cache_.derived_valid = false;
+  if (fragment_uniform_cache_.has_light && fragment_uniform_cache_.has_camera) {
+    RecalculateFragmentDerived();
+  }
+}
+
+void Shader::RecalculateFragmentDerived() {
+  fragment_uniform_cache_.light_dir_normalized =
+      glm::normalize(fragment_uniform_cache_.light.dir);
+  fragment_uniform_cache_.derived_valid = true;
+}
+
+void Shader::PrepareUniformCaches() {
+  PrepareVertexUniformCache();
+  PrepareFragmentUniformCache();
+}
+
+void Shader::PrepareVertexUniformCache() {
+  if (vertex_uniform_cache_.derived_valid) {
+    return;
+  }
+  // 在进入渲染阶段前一次性取出常用矩阵并填充缓存
+  if (uniformbuffer_.HasUniform<Matrix4f>("modelMatrix") &&
+      uniformbuffer_.HasUniform<Matrix4f>("viewMatrix") &&
+      uniformbuffer_.HasUniform<Matrix4f>("projectionMatrix")) {
+    vertex_uniform_cache_.model =
+        uniformbuffer_.GetUniform<Matrix4f>("modelMatrix");
+    vertex_uniform_cache_.view =
+        uniformbuffer_.GetUniform<Matrix4f>("viewMatrix");
+    vertex_uniform_cache_.projection =
+        uniformbuffer_.GetUniform<Matrix4f>("projectionMatrix");
+    vertex_uniform_cache_.has_model = true;
+    vertex_uniform_cache_.has_view = true;
+    vertex_uniform_cache_.has_projection = true;
+    RecalculateDerivedMatrices();
+  }
+}
+
+void Shader::PrepareFragmentUniformCache() {
+  if (fragment_uniform_cache_.derived_valid) {
+    return;
+  }
+  if (uniformbuffer_.HasUniform<Light>("light") &&
+      uniformbuffer_.HasUniform<Vector3f>("cameraPos")) {
+    fragment_uniform_cache_.light =
+        uniformbuffer_.GetUniform<Light>("light");
+    fragment_uniform_cache_.camera_pos =
+        uniformbuffer_.GetUniform<Vector3f>("cameraPos");
+    fragment_uniform_cache_.has_light = true;
+    fragment_uniform_cache_.has_camera = true;
+    RecalculateFragmentDerived();
+  }
+}
+
 Color Shader::FragmentShader(const Fragment& fragment) const {
   // interpolate Normal, Color and UV
   Color interpolateColor = fragment.color;
@@ -113,14 +166,23 @@ Color Shader::FragmentShader(const Fragment& fragment) const {
   Vector2f uv = fragment.uv;
 
   // uniform
-  Light light = uniformbuffer_.GetUniform<Light>("light");
+  Light light;
+  Vector3f light_dir;
+  Vector3f camera_pos;
+  if (fragment_uniform_cache_.derived_valid) {
+    light = fragment_uniform_cache_.light;
+    light_dir = fragment_uniform_cache_.light_dir_normalized;
+    camera_pos = fragment_uniform_cache_.camera_pos;
+  } else {
+    light = uniformbuffer_.GetUniform<Light>("light");
+    camera_pos = uniformbuffer_.GetUniform<Vector3f>("cameraPos");
+    light_dir = glm::normalize(light.dir);
+  }
   Material material = *fragment.material;
 
   // view direction
   Vector3f view_dir =
-      glm::normalize(sharedDataInShader_.fragPos_varying -
-                     uniformbuffer_.GetUniform<Vector3f>("cameraPos"));
-  Vector3f light_dir = glm::normalize(light.dir);
+      glm::normalize(sharedDataInShader_.fragPos_varying - camera_pos);
 
   auto intensity = std::max(glm::dot(normal, light_dir), 0.0f);
   // texture color
@@ -197,4 +259,4 @@ Color Shader::ClampColor(const Color color) const {
   return Color(red, green, blue, alpha);
 }
 
-}  // namespace simple_renderer
\ No newline at end of file
+}  // namespace simple_renderer

From b84cfd2afe4ad3f6c73bbc85ca0c914a4c6c8454 Mon Sep 17 00:00:00 2001
From: ZhouFANG <indevn@outlook.com>
Date: Tue, 16 Sep 2025 15:48:47 +0800
Subject: [PATCH 24/24] Enhanced shader class with LUT caching for specular
 reflection to optimize computation and eliminate redundancy. Added copy/move
 constructors for thread safety

Signed-off-by: ZhouFANG <indevn@outlook.com>
---
 src/include/shader.hpp |  25 ++++++++--
 src/shader.cpp         | 103 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 122 insertions(+), 6 deletions(-)

diff --git a/src/include/shader.hpp b/src/include/shader.hpp
index 097214c..8314f55 100644
--- a/src/include/shader.hpp
+++ b/src/include/shader.hpp
@@ -1,6 +1,10 @@
 #ifndef SIMPLERENDER_SRC_INCLUDE_SHADER_HPP_
 #define SIMPLERENDER_SRC_INCLUDE_SHADER_HPP_
 
+#include <array>
+#include <bit>
+#include <shared_mutex>
+#include <unordered_map>
 #include <variant>
 
 #include "light.h"
@@ -12,6 +16,8 @@ namespace simple_renderer {
 using UniformValue = std::variant<int, float, Vector2f, Vector3f, Vector4f,
                                   Matrix3f, Matrix4f, Material, Light>;
 
+inline constexpr size_t kSpecularLutResolution = 256;
+
 class UniformBuffer {
  public:
   template <typename T>
@@ -85,6 +91,10 @@ struct FragmentUniformCache {
   bool derived_valid = false;
 };
 
+struct SpecularLUT {
+  std::array<float, kSpecularLutResolution> values{};
+};
+
 /**
  * @brief Shader Class 着色器类
  *
@@ -92,10 +102,10 @@ struct FragmentUniformCache {
 class Shader {
  public:
   Shader() = default;
-  Shader(const Shader &shader) = default;
-  Shader(Shader &&shader) = default;
-  auto operator=(const Shader &shader) -> Shader & = default;
-  auto operator=(Shader &&shader) -> Shader & = default;
+  Shader(const Shader &shader);
+  Shader(Shader &&shader) noexcept;
+  auto operator=(const Shader &shader) -> Shader &;
+  auto operator=(Shader &&shader) noexcept -> Shader &;
   virtual ~Shader() = default;
 
   // Input Data -> Vertex Shader -> Screen Space Coordiante
@@ -127,6 +137,8 @@ class Shader {
   SharedDataInShader sharedDataInShader_;
   VertexUniformCache vertex_uniform_cache_;
   FragmentUniformCache fragment_uniform_cache_;
+  mutable std::unordered_map<uint32_t, SpecularLUT> specular_lut_cache_;
+  mutable std::shared_mutex specular_cache_mutex_;
 
   void UpdateMatrixCache(const std::string &name, const Matrix4f &value);
   void UpdateFragmentCache(const std::string &name, const Light &value);
@@ -136,6 +148,11 @@ class Shader {
   void PrepareVertexUniformCache();
   void PrepareFragmentUniformCache();
 
+  // LUT相关
+  [[nodiscard]] auto BuildSpecularLUT(float shininess) const -> SpecularLUT;
+  [[nodiscard]] auto GetSpecularLUT(float shininess) const -> const SpecularLUT &;
+  [[nodiscard]] auto EvaluateSpecular(float cos_theta, float shininess) const -> float;
+
   Color SampleTexture(const Texture &texture, const Vector2f &uv) const;
   Color ClampColor(const Color color) const;
 };
diff --git a/src/shader.cpp b/src/shader.cpp
index e01e9b1..06ab241 100644
--- a/src/shader.cpp
+++ b/src/shader.cpp
@@ -1,7 +1,56 @@
 #include "shader.hpp"
 
+#include <algorithm>
+#include <cmath>
+#include <mutex>
+#include <shared_mutex>
+
 namespace simple_renderer {
 
+Shader::Shader(const Shader& shader) {  // member-wise copy
+  std::shared_lock source_guard(shader.specular_cache_mutex_);
+  specular_lut_cache_ = shader.specular_lut_cache_;
+  fragment_uniform_cache_ = shader.fragment_uniform_cache_;
+  vertex_uniform_cache_ = shader.vertex_uniform_cache_;
+  sharedDataInShader_ = shader.sharedDataInShader_;
+  uniformbuffer_ = shader.uniformbuffer_;
+}
+
+Shader::Shader(Shader&& shader) noexcept {  // member-wise move
+  std::unique_lock lock(shader.specular_cache_mutex_);  // source is mutated
+  uniformbuffer_ = std::move(shader.uniformbuffer_);
+  sharedDataInShader_ = std::move(shader.sharedDataInShader_);
+  vertex_uniform_cache_ = std::move(shader.vertex_uniform_cache_);
+  fragment_uniform_cache_ = std::move(shader.fragment_uniform_cache_);
+  specular_lut_cache_ = std::move(shader.specular_lut_cache_);
+}
+
+auto Shader::operator=(const Shader& shader) -> Shader& {
+  if (this != &shader) {
+    // Guard both caches: ours is overwritten while the source's is read.
+    std::scoped_lock lock(specular_cache_mutex_, shader.specular_cache_mutex_);
+    uniformbuffer_ = shader.uniformbuffer_;
+    sharedDataInShader_ = shader.sharedDataInShader_;
+    vertex_uniform_cache_ = shader.vertex_uniform_cache_;
+    fragment_uniform_cache_ = shader.fragment_uniform_cache_;
+    specular_lut_cache_ = shader.specular_lut_cache_;
+  }
+  return *this;
+}
+
+auto Shader::operator=(Shader&& shader) noexcept -> Shader& {
+  if (this != &shader) {
+    // Guard both caches: ours is overwritten, the source's is moved from.
+    std::scoped_lock lock(specular_cache_mutex_, shader.specular_cache_mutex_);
+    uniformbuffer_ = std::move(shader.uniformbuffer_);
+    sharedDataInShader_ = std::move(shader.sharedDataInShader_);
+    vertex_uniform_cache_ = std::move(shader.vertex_uniform_cache_);
+    fragment_uniform_cache_ = std::move(shader.fragment_uniform_cache_);
+    specular_lut_cache_ = std::move(shader.specular_lut_cache_);
+  }
+  return *this;
+}
+
 Vertex Shader::VertexShader(const Vertex& vertex) {
   const bool cache_ready = vertex_uniform_cache_.derived_valid;
 
@@ -159,6 +208,56 @@ void Shader::PrepareFragmentUniformCache() {
   }
 }
 
+auto Shader::BuildSpecularLUT(float shininess) const -> SpecularLUT {
+  // Samples pow(x, shininess) at evenly spaced x in [0, 1].
+  constexpr auto kMaxSlot = static_cast<float>(kSpecularLutResolution - 1);
+  SpecularLUT table;
+  if (shininess <= 0.0f) {
+    table.values.fill(1.0f);  // non-positive exponent: constant table of 1
+    return table;
+  }
+  for (size_t slot = 0; slot < kSpecularLutResolution; ++slot) {
+    const float x = static_cast<float>(slot) / kMaxSlot;
+    table.values[slot] = (x > 0.0f) ? std::pow(x, shininess) : 0.0f;
+  }
+  return table;
+}
+
+auto Shader::GetSpecularLUT(float shininess) const -> const SpecularLUT& {
+  // Cache entries are keyed by the raw bit pattern of the exponent.
+  const auto key = std::bit_cast<uint32_t>(shininess);
+  {
+    std::shared_lock read_guard(specular_cache_mutex_);
+    if (const auto hit = specular_lut_cache_.find(key);
+        hit != specular_lut_cache_.end()) {
+      return hit->second;
+    }
+  }
+  // Miss: build the table first, then take the exclusive lock to insert.
+  SpecularLUT fresh = BuildSpecularLUT(shininess);
+  std::unique_lock write_guard(specular_cache_mutex_);
+  return specular_lut_cache_.emplace(key, std::move(fresh)).first->second;
+}
+
+auto Shader::EvaluateSpecular(float cos_theta, float shininess) const -> float {
+  // Approximates pow(cos_theta, shininess) by lerping the cached table.
+  if (shininess <= 0.0f) {
+    return 1.0f;
+  }
+  // Negated test also rejects NaN, whose size_t conversion below would be UB.
+  if (!(cos_theta > 0.0f)) {
+    return 0.0f;
+  }
+  cos_theta = std::min(cos_theta, 1.0f);
+  const auto& lut = GetSpecularLUT(shininess);
+  float scaled = cos_theta * static_cast<float>(kSpecularLutResolution - 1);
+  size_t index = static_cast<size_t>(scaled);
+  float frac = scaled - static_cast<float>(index);
+  const float v0 = lut.values[index];
+  const float v1 = lut.values[std::min(index + 1, kSpecularLutResolution - 1)];
+  return v0 + (v1 - v0) * frac;
+}
+
 Color Shader::FragmentShader(const Fragment& fragment) const {
   // interpolate Normal, Color and UV
   Color interpolateColor = fragment.color;
@@ -202,8 +301,8 @@ Color Shader::FragmentShader(const Fragment& fragment) const {
   }
 
   Vector3f halfVector = glm::normalize(light_dir + view_dir);
-  float spec = std::pow(std::max(glm::dot(normal, halfVector), 0.0f),
-                        material.shininess);
+  float cos_theta = std::max(glm::dot(normal, halfVector), 0.0f);
+  float spec = EvaluateSpecular(cos_theta, material.shininess);
   if (material.has_specular_texture) {
     Color texture_color = SampleTexture(material.specular_texture, uv);
     specular_color = texture_color * spec;