diff --git a/src/include/rasterizer.hpp b/src/include/rasterizer.hpp
index cd0b349..8fa7cf0 100644
--- a/src/include/rasterizer.hpp
+++ b/src/include/rasterizer.hpp
@@ -33,41 +33,6 @@ class Rasterizer {
   std::vector<Fragment> Rasterize(const Vertex& v0, const Vertex& v1,
                                   const Vertex& v2);
 
-  /**
-   * @brief 非分配版本：将片段直接写入调用方提供的容器
-   * 
-   * 可选的裁剪区域为半开区间 [x0, x1) × [y0, y1)
-   * 用于 TBR：将光栅化限制在 tile 边界内，便于复用外部 scratch 容器
-   * 
-   * @param v0 三角形第一个顶点
-   * @param v1 三角形第二个顶点
-   * @param v2 三角形第三个顶点
-   * @param x0 裁剪区域左边界（包含）
-   * @param y0 裁剪区域上边界（包含）
-   * @param x1 裁剪区域右边界（不包含）
-   * @param y1 裁剪区域下边界（不包含）
-   * @param out 输出片段容器
-   */
-  void RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
-                   int x0, int y0, int x1, int y1,
-                   std::vector<Fragment>& out);
-
-  /**
-   * @brief SoA 版本：按顶点索引从 SoA 读取三角形三顶点
-   * @param soa 结构体数组格式的顶点数据
-   * @param i0 三角形第一个顶点索引
-   * @param i1 三角形第二个顶点索引
-   * @param i2 三角形第三个顶点索引
-   * @param x0 裁剪区域左边界（包含）
-   * @param y0 裁剪区域上边界（包含）
-   * @param x1 裁剪区域右边界（不包含）
-   * @param y1 裁剪区域下边界（不包含）
-   * @param out 输出片段容器
-   */
-  void RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
-                   int x0, int y0, int x1, int y1,
-                   std::vector<Fragment>& out);
-
  private:
   size_t width_, height_;
 
diff --git a/src/include/renderer.h b/src/include/renderer.h
index e11c93f..2acb7a9 100755
--- a/src/include/renderer.h
+++ b/src/include/renderer.h
@@ -36,9 +36,10 @@ namespace simple_renderer {
  * - DEFERRED: 延迟渲染（片段收集后再着色）
  */
 enum class RenderingMode {
-  PER_TRIANGLE,  //!< 逐三角形（triangle-major）
-  TILE_BASED,    //!< 基于 tile（tile-major）
-  DEFERRED       //!< 延迟渲染
+  PER_TRIANGLE,        //!< Per-triangle rendering (triangle-major)
+  TILE_BASED,          //!< Tile-based rendering (tile-major)
+  DEFERRED,            //!< Deferred rendering
+  TILE_BASED_DEFERRED  //!< Tile-based deferred shading (TBDR)
 };
 
 /**
diff --git a/src/include/renderers/tile_based_deferred_renderer.hpp b/src/include/renderers/tile_based_deferred_renderer.hpp
new file mode 100644
index 0000000..3db269a
--- /dev/null
+++ b/src/include/renderers/tile_based_deferred_renderer.hpp
@@ -0,0 +1,59 @@
+#ifndef SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_DEFERRED_RENDERER_HPP_
+#define SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_DEFERRED_RENDERER_HPP_
+
+#include "renderers/renderer_base.hpp"
+#include "renderers/tile_based_renderer.hpp"  // 复用 TileTriangleRef / TileGridContext 定义
+
+namespace simple_renderer {
+
+/**
+ * @brief Tile-Based Deferred Renderer (TBDR)
+ *
+ * Design notes:
+ * - SoA vertex layout + triangle binning, same approach as TBR;
+ * - tiles are the unit of parallelism, which avoids cross-tile write conflicts;
+ * - two passes (in the spirit of modern TBDR):
+ *   1) Z pre-pass (depth resolve): only updates the per-pixel minimum depth and winning triangle index, and caches the perspective-corrected barycentrics;
+ *   2) deferred shading: runs the fragment shader once per winning pixel, writes into the tile-local buffer, then copies the whole tile to the global buffer.
+ *
+ * Benefit: far fewer wasted shader invocations in overdraw-heavy scenes (FragmentShader calls are roughly the number of winning pixels).
+ */
+class TileBasedDeferredRenderer final : public RendererBase {
+ public:
+  TileBasedDeferredRenderer(size_t width, size_t height, size_t tile_size = 64)
+      : RendererBase(width, height), tile_size_(tile_size) {}
+  // Renders the model with the given shader and copies the finished frame into out_color.
+  bool Render(const Model& model, const Shader& shader, uint32_t* out_color) override;
+
+ private:
+  void TriangleTileBinning(const Model& model,
+                           const TileGridContext& grid,
+                           std::vector<std::vector<TileTriangleRef>>& tile_triangles);
+  // Handles one triangle during binning: count_only increments tile_counts, otherwise refs are appended.
+  void ProcessTriangleForTileBinning(size_t tri_idx, bool count_only,
+                                     const Model& model,
+                                     const TileGridContext& grid,
+                                     std::vector<size_t>& tile_counts,
+                                     std::vector<std::vector<TileTriangleRef>>& tile_triangles);
+  // Processes one tile: per-pixel depth resolve over its triangles, followed by the shading pass.
+  void RasterizeTileDeferred(size_t tile_id,
+                             const std::vector<TileTriangleRef>& triangles,
+                             const TileGridContext& grid,
+                             float* tile_depth_buffer, uint32_t* tile_color_buffer,
+                             std::unique_ptr<float[]>& global_depth_buffer,
+                             std::unique_ptr<uint32_t[]>& global_color_buffer,
+                             const Shader& shader,
+                             uint64_t* out_tested, uint64_t* out_covered,
+                             uint64_t* out_winners, uint64_t* out_shaded);
+
+ private:
+  // Default clear values for depth and color (kept consistent with TBR)
+  static constexpr float kDepthClear = 1.0f;
+  static constexpr uint32_t kColorClear = 0u;
+
+  const size_t tile_size_;
+};
+
+}  // namespace simple_renderer
+
+#endif  // SIMPLERENDER_SRC_INCLUDE_RENDERERS_TILE_BASED_DEFERRED_RENDERER_HPP_
diff --git a/src/include/renderers/tile_based_renderer.hpp b/src/include/renderers/tile_based_renderer.hpp
index da7970c..7baf2a8 100644
--- a/src/include/renderers/tile_based_renderer.hpp
+++ b/src/include/renderers/tile_based_renderer.hpp
@@ -103,7 +103,6 @@ class TileBasedRenderer final : public RendererBase {
    * @param soa 经过变换后的 SoA 顶点数据
    * @param shader 着色器
    * @param use_early_z 是否启用 Early‑Z
-   * @param scratch_fragments 可复用片段临时容器
    */
   void RasterizeTile(size_t tile_id,
                      const std::vector<TileTriangleRef> &triangles,
@@ -113,7 +112,6 @@ class TileBasedRenderer final : public RendererBase {
                      std::unique_ptr<uint32_t[]> &global_color_buffer,
                      const Shader& shader,
                      bool use_early_z,
-                     std::vector<Fragment>* scratch_fragments,
                      TileMaskStats* out_stats);
 
  private:
diff --git a/src/include/shader.hpp b/src/include/shader.hpp
index 8314f55..8bfe041 100644
--- a/src/include/shader.hpp
+++ b/src/include/shader.hpp
@@ -6,6 +6,7 @@
 #include <shared_mutex>
 #include <unordered_map>
 #include <variant>
+#include <vector>
 
 #include "light.h"
 #include "material.hpp"
@@ -14,7 +15,8 @@
 namespace simple_renderer {
 
 using UniformValue = std::variant<int, float, Vector2f, Vector3f, Vector4f,
-                                  Matrix3f, Matrix4f, Material, Light>;
+                                  Matrix3f, Matrix4f, Material, Light,
+                                  std::vector<Light>>;
 
 inline constexpr size_t kSpecularLutResolution = 256;
 
@@ -27,7 +29,7 @@ class UniformBuffer {
             std::is_same_v<T, Vector2f> || std::is_same_v<T, Vector3f> ||
             std::is_same_v<T, Vector4f> || std::is_same_v<T, Matrix3f> ||
             std::is_same_v<T, Matrix4f> || std::is_same_v<T, Material> ||
-            std::is_same_v<T, Light>,
+            std::is_same_v<T, Light> || std::is_same_v<T, std::vector<Light>>,
         "Type not supported by UniformValue");
     uniforms_[name] = value;
   }
@@ -83,10 +85,10 @@ struct VertexUniformCache {
 };
 
 struct FragmentUniformCache {
-  Light light{};
+  std::vector<Light> lights{}; // Multiple light sources are supported
   Vector3f camera_pos = Vector3f(0.0f);
-  Vector3f light_dir_normalized = Vector3f(0.0f);
-  bool has_light = false;
+  std::vector<Vector3f> light_dirs_normalized{};
+  bool has_lights = false;
   bool has_camera = false;
   bool derived_valid = false;
 };
@@ -142,6 +144,7 @@ class Shader {
 
   void UpdateMatrixCache(const std::string &name, const Matrix4f &value);
   void UpdateFragmentCache(const std::string &name, const Light &value);
+  void UpdateFragmentCache(const std::string &name, const std::vector<Light> &value);
   void UpdateFragmentCache(const std::string &name, const Vector3f &value);
   void RecalculateDerivedMatrices();
   void RecalculateFragmentDerived();
@@ -155,6 +158,10 @@ class Shader {
 
   Color SampleTexture(const Texture &texture, const Vector2f &uv) const;
   Color ClampColor(const Color color) const;
+
+ public:
+  // Convenience wrapper: stores the light list under the "lights" uniform via SetUniform.
+  void SetLights(const std::vector<Light>& lights) { SetUniform("lights", lights); }
 };
 
 uint8_t FloatToUint8_t(float val);
diff --git a/src/rasterizer.cpp b/src/rasterizer.cpp
index 04aa6b1..e241444 100644
--- a/src/rasterizer.cpp
+++ b/src/rasterizer.cpp
@@ -81,159 +81,6 @@ std::vector<Fragment> Rasterizer::Rasterize(const Vertex& v0, const Vertex& v1,
   return fragments;
 }
 
-void Rasterizer::RasterizeTo(const Vertex& v0, const Vertex& v1, const Vertex& v2,
-                             int x0, int y0, int x1, int y1,
-                             std::vector<Fragment>& out) {
-  // 获取三角形的最小 box（屏幕空间）
-  const Vector4f p0 = v0.GetPosition();
-  const Vector4f p1 = v1.GetPosition();
-  const Vector4f p2 = v2.GetPosition();
-
-  Vector2f a(p0.x, p0.y);
-  Vector2f b(p1.x, p1.y);
-  Vector2f c(p2.x, p2.y);
-
-  Vector2f bboxMin = Vector2f{std::min({a.x, b.x, c.x}), std::min({a.y, b.y, c.y})};
-  Vector2f bboxMax = Vector2f{std::max({a.x, b.x, c.x}), std::max({a.y, b.y, c.y})};
-
-  // Clamp 到屏幕尺寸
-  float minx = std::max(0.0f, bboxMin.x);
-  float miny = std::max(0.0f, bboxMin.y);
-  float maxx = std::min(float(width_ - 1), bboxMax.x);
-  float maxy = std::min(float(height_ - 1), bboxMax.y);
-
-  // 与外部提供的裁剪区域相交（半开区间） -> 闭区间扫描
-  int sx = std::max(x0, static_cast<int>(std::floor(minx)));
-  int sy = std::max(y0, static_cast<int>(std::floor(miny)));
-  int ex = std::min(x1 - 1, static_cast<int>(std::floor(maxx)));
-  int ey = std::min(y1 - 1, static_cast<int>(std::floor(maxy)));
-  if (sx > ex || sy > ey) return;
-
-  for (int x = sx; x <= ex; ++x) {
-    for (int y = sy; y <= ey; ++y) {
-      auto [is_inside, bary] = GetBarycentricCoord(
-          Vector3f(p0.x, p0.y, p0.z), Vector3f(p1.x, p1.y, p1.z), Vector3f(p2.x, p2.y, p2.z),
-          Vector3f(static_cast<float>(x), static_cast<float>(y), 0));
-      if (!is_inside) continue;
-
-      // 透视矫正插值
-      auto perspective_result = PerformPerspectiveCorrection(
-          p0.w, p1.w, p2.w,
-          p0.z, p1.z, p2.z,
-          bary);
-
-      const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
-      float z = perspective_result.interpolated_z;
-
-      Fragment frag; // material 指针由调用方填写
-      frag.screen_coord = {x, y};
-      frag.normal = Interpolate(v0.GetNormal(), v1.GetNormal(), v2.GetNormal(), corrected_bary);
-      frag.uv     = Interpolate(v0.GetTexCoords(), v1.GetTexCoords(), v2.GetTexCoords(), corrected_bary);
-      frag.color  = InterpolateColor(v0.GetColor(), v1.GetColor(), v2.GetColor(), corrected_bary);
-      frag.depth  = z;
-
-      out.push_back(frag);
-    }
-  }
-}
-
-void Rasterizer::RasterizeTo(const VertexSoA& soa, size_t i0, size_t i1, size_t i2,
-                             int x0, int y0, int x1, int y1,
-                             std::vector<Fragment>& out) {
-  // 读取三顶点的屏幕空间位置
-  const Vector4f& p0 = soa.pos_screen[i0];
-  const Vector4f& p1 = soa.pos_screen[i1];
-  const Vector4f& p2 = soa.pos_screen[i2];
-
-  // 为BarycentricCoord预构造Vec3f，避免循环内重复构造
-  const Vector3f sp0(p0.x, p0.y, p0.z);
-  const Vector3f sp1(p1.x, p1.y, p1.z);
-  const Vector3f sp2(p2.x, p2.y, p2.z);
-
-  // 计算屏幕空间AABB包围盒
-  const float minx_f = std::max(0.0f, std::min({p0.x, p1.x, p2.x}));
-  const float miny_f = std::max(0.0f, std::min({p0.y, p1.y, p2.y}));
-  const float maxx_f = std::min(float(width_  - 1), std::max({p0.x, p1.x, p2.x}));
-  const float maxy_f = std::min(float(height_ - 1), std::max({p0.y, p1.y, p2.y}));
-
-  // 与外部提供的裁剪区域相交（半开区间） -> 闭区间扫描
-  int sx = std::max(x0, static_cast<int>(std::floor(minx_f)));
-  int sy = std::max(y0, static_cast<int>(std::floor(miny_f)));
-  int ex = std::min(x1 - 1, static_cast<int>(std::floor(maxx_f)));
-  int ey = std::min(y1 - 1, static_cast<int>(std::floor(maxy_f)));
-  if (sx > ex || sy > ey) return;
-
-  // 预计算边函数系数：E(x,y) = A*x + B*y + C
-  // 使用相对坐标的边函数定义，避免大常数项导致的数值不稳定
-  // 如使用绝对形式Ax+By+C会由于常数C的量级过大，造成浮点抵消，有效位丢失不稳定
-  auto cross2 = [](float ax, float ay, float bx, float by) {
-    return ax * by - ay * bx;
-  };
-  // 边向量
-  const float e01x = p1.x - p0.x, e01y = p1.y - p0.y; // (p0->p1)
-  const float e12x = p2.x - p1.x, e12y = p2.y - p1.y; // (p1->p2)
-  const float e20x = p0.x - p2.x, e20y = p0.y - p2.y; // (p2->p0)
-
-  // 有向面积（两倍），用相对面积定义：area2 = cross(p1 - p0, p2 - p0)
-  float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y);
-  if (std::abs(area2) < 1e-6f) return; // 退化三角形
-  const float inv_area2 = 1.0f / area2;
-  const bool positive = (area2 > 0.0f);
-
-  // 行优先遍历：有利于 cache 与向量化
-  #pragma omp simd
-  for (int y = sy; y <= ey; ++y) {
-    const float yf = static_cast<float>(y);
-
-    // 注意：此处存在对 out.push_back 的写入，属于有副作用操作，不适合使用
-    // omp simd 进行强制向量化，否则可能导致不符合预期的行为（如周期性伪影）。
-    // 先保持标量内层，后续如切换为“直写像素回调”再考虑安全的 SIMD 化。
-    for (int x = sx; x <= ex; ++x) {
-      const float xf = static_cast<float>(x);
-
-      // 相对坐标边函数：
-      // E01(p) = cross(p1 - p0, p - p0)
-      // E12(p) = cross(p2 - p1, p - p1)
-      // E20(p) = cross(p0 - p2, p - p2)
-      const float E01 = cross2(e01x, e01y, xf - p0.x, yf - p0.y);
-      const float E12 = cross2(e12x, e12y, xf - p1.x, yf - p1.y);
-      const float E20 = cross2(e20x, e20y, xf - p2.x, yf - p2.y);
-
-      // 半空间测试（根据朝向选择符号）
-      const bool inside = positive ? (E01 >= 0.0f && E12 >= 0.0f && E20 >= 0.0f)
-                                   : (E01 <= 0.0f && E12 <= 0.0f && E20 <= 0.0f);
-      if (!inside) continue;
-
-      // 重心权重映射：
-      // b0 对应 v0，取与对边 (v1,v2) 的子面积 → E12
-      // b1 对应 v1 → E20
-      // b2 对应 v2 → E01
-      const float b0 = E12 * inv_area2;
-      const float b1 = E20 * inv_area2;
-      const float b2 = E01 * inv_area2;
-      const Vector3f bary(b0, b1, b2);
-
-      // 透视矫正插值
-      auto perspective_result = PerformPerspectiveCorrection(
-          p0.w, p1.w, p2.w,
-          p0.z, p1.z, p2.z,
-          bary);
-
-      const Vector3f& corrected_bary = perspective_result.corrected_barycentric;
-      const float z = perspective_result.interpolated_z;
-
-      Fragment frag; // Note: material 指针由调用方填写
-      frag.screen_coord = {x, y};
-      frag.normal = Interpolate(soa.normal[i0], soa.normal[i1], soa.normal[i2], corrected_bary);
-      frag.uv     = Interpolate(soa.uv[i0],     soa.uv[i1],     soa.uv[i2],     corrected_bary);
-      frag.color  = InterpolateColor(soa.color[i0], soa.color[i1], soa.color[i2], corrected_bary);
-      frag.depth  = z;
-
-      out.push_back(frag);
-    }
-  }
-}
-
 std::pair<bool, Vector3f> Rasterizer::GetBarycentricCoord(const Vector3f& p0,
                                                           const Vector3f& p1,
                                                           const Vector3f& p2,
diff --git a/src/renderer.cpp b/src/renderer.cpp
index 0939cf5..d647e93 100644
--- a/src/renderer.cpp
+++ b/src/renderer.cpp
@@ -6,6 +6,7 @@
 #include "renderers/per_triangle_renderer.hpp"
 #include "renderers/tile_based_renderer.hpp"
 #include "renderers/deferred_renderer.hpp"
+#include "renderers/tile_based_deferred_renderer.hpp"
 
 namespace simple_renderer {
 
@@ -14,6 +15,7 @@ std::string RenderingModeToString(RenderingMode mode) {
     case RenderingMode::PER_TRIANGLE: return "PER_TRIANGLE";
     case RenderingMode::TILE_BASED:  return "TILE_BASED";
     case RenderingMode::DEFERRED:    return "DEFERRED";
+    case RenderingMode::TILE_BASED_DEFERRED: return "TILE_BASED_DEFERRED";
   }
   return "PER_TRIANGLE";
 }
@@ -77,6 +79,11 @@ void SimpleRenderer::EnsureRenderer() {
       renderer_ = std::move(r);
       break;
     }
+    case RenderingMode::TILE_BASED_DEFERRED: {
+      auto r = std::make_unique<TileBasedDeferredRenderer>(width_, height_, tbr_tile_size_);
+      renderer_ = std::move(r);
+      break;
+    }
   }
 }
 
diff --git a/src/renderers/tile_based_deferred_renderer.cpp b/src/renderers/tile_based_deferred_renderer.cpp
new file mode 100644
index 0000000..93abf9b
--- /dev/null
+++ b/src/renderers/tile_based_deferred_renderer.cpp
@@ -0,0 +1,435 @@
+//
+// Tile-Based Deferred Renderer (TBDR)
+// -----------------------------------
+// 本文件实现 CPU 侧的基于 Tile 的延迟着色(TBDR)。整体思路与现代 TBDR 硬件类似：
+// 1) 先将三角形按屏幕划分到 Tile（binning）；
+// 2) 对每个 Tile，进行“两阶段”光栅化：
+//    - 阶段A：仅进行深度决胜（Z 预通过）——找出每个像素的“胜出三角形”和其最小深度，并缓存透视矫正后的重心权重；
+//    - 阶段B：仅对胜出的像素执行一次片元着色（FragmentShader），写回 Tile 局部缓冲，然后整 Tile 拷贝到全局。
+//
+// 与现有 TBR（Tile-Based 前向渲染）相比，TBDR 避免了“对被随后覆盖的像素进行无用的着色”，在overdraw较多时显著减少
+// FragmentShader 调用次数；同时保持 Tile‑major 的访问局部性与单份全局 frame buffer 的并发安全写回。
+
+#include "renderers/tile_based_deferred_renderer.hpp"
+
+#include <omp.h>
+
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <limits>
+
+#include "config.h"
+#include "log_system.h"
+
+namespace simple_renderer {
+// Renders model into buffer: SoA vertex stage, tile binning, parallel per-tile depth resolve + shading, one copy out; always returns true.
+bool TileBasedDeferredRenderer::Render(const Model& model, const Shader& shader_in,
+                                       uint32_t* buffer) {
+  auto total_start_time = std::chrono::high_resolution_clock::now();
+  auto shader = std::make_shared<Shader>(shader_in);
+  shader->PrepareUniformCaches();
+
+  // Vertex stage (SoA)
+  // - Runs the vertex shader, perspective division and viewport transform per vertex and stores the results in the SoA.
+  // - NOTE(review): the original comment states this stage is identical to TBR; confirm against tile_based_renderer.
+  auto vertex_start = std::chrono::high_resolution_clock::now();
+  const auto& input_vertices = model.GetVertices();
+  VertexSoA soa; soa.resize(input_vertices.size());
+
+#pragma omp parallel for num_threads(kNProc) schedule(static) shared(shader, soa, input_vertices)
+  for (size_t i = 0; i < input_vertices.size(); ++i) {
+    const auto& v = input_vertices[i];
+    auto clip = shader->VertexShader(v);
+    soa.pos_clip[i] = clip.GetPosition();
+    auto ndc = PerspectiveDivision(clip);
+    auto screen = ViewportTransformation(ndc);
+    soa.pos_screen[i] = screen.GetPosition();
+    soa.normal[i] = screen.GetNormal();
+    soa.uv[i] = screen.GetTexCoords();
+    soa.color[i] = screen.GetColor();
+  }
+  auto vertex_end = std::chrono::high_resolution_clock::now();
+  double vertex_ms = std::chrono::duration_cast<std::chrono::microseconds>(vertex_end - vertex_start).count() / 1000.0;
+
+  // Tile binning
+  // - Maps each triangle onto the tile grid by its screen-space bounding box;
+  // - later stages then use one tile as the unit of parallelism, avoiding cross-tile write conflicts;
+  // - reuses the TBR data structures (TileTriangleRef / TileGridContext); the binning itself is done by TriangleTileBinning.
+  auto setup_start = std::chrono::high_resolution_clock::now();
+  const size_t TILE_SIZE = tile_size_ > 0 ? tile_size_ : 64;
+  const size_t tiles_x = (width_ + TILE_SIZE - 1) / TILE_SIZE;
+  const size_t tiles_y = (height_ + TILE_SIZE - 1) / TILE_SIZE;
+  const size_t total_tiles = tiles_x * tiles_y;
+  std::vector<std::vector<TileTriangleRef>> tile_triangles(total_tiles);
+  auto setup_end = std::chrono::high_resolution_clock::now();
+  double setup_ms = std::chrono::duration_cast<std::chrono::microseconds>(setup_end - setup_start).count() / 1000.0;
+
+  auto bin_start = std::chrono::high_resolution_clock::now();
+  TileGridContext grid_ctx{soa, tiles_x, tiles_y, TILE_SIZE};
+  TriangleTileBinning(model, grid_ctx, tile_triangles);
+  auto bin_end = std::chrono::high_resolution_clock::now();
+  double bin_ms = std::chrono::duration_cast<std::chrono::microseconds>(bin_end - bin_start).count() / 1000.0;
+
+  // Global framebuffer (single copy)
+  // - per the original notes, each finished tile is copied into this buffer row by row;
+  // - tiles do not overlap, so no synchronization or locking is used for those writes.
+  auto buf_alloc_start = std::chrono::high_resolution_clock::now();
+  std::unique_ptr<float[]> depthBuffer = std::make_unique<float[]>(width_ * height_);
+  std::unique_ptr<uint32_t[]> colorBuffer = std::make_unique<uint32_t[]>(width_ * height_);
+  std::fill_n(depthBuffer.get(), width_ * height_, kDepthClear);
+  std::fill_n(colorBuffer.get(), width_ * height_, kColorClear);
+  auto buf_alloc_end = std::chrono::high_resolution_clock::now();
+  double buf_alloc_ms = std::chrono::duration_cast<std::chrono::microseconds>(buf_alloc_end - buf_alloc_start).count() / 1000.0;
+
+  // Render tiles in parallel, two phases per tile (Z resolve -> shading)
+  // - OpenMP distributes whole tiles across threads;
+  // - inside a tile the Z pre-pass (no shading) runs first, then the winning pixels are shaded.
+  auto raster_start = std::chrono::high_resolution_clock::now();
+  std::vector<TileMaskStats> tile_stats(total_tiles);
+
+#pragma omp parallel num_threads(kNProc) default(none) \
+    shared(tile_triangles, grid_ctx, depthBuffer, colorBuffer, shader, total_tiles, tile_stats)
+  {
+    std::unique_ptr<float[]> tile_depth_buffer = std::make_unique<float[]>(grid_ctx.tile_size * grid_ctx.tile_size);
+    std::unique_ptr<uint32_t[]> tile_color_buffer = std::make_unique<uint32_t[]>(grid_ctx.tile_size * grid_ctx.tile_size);
+
+#pragma omp for schedule(static)
+    for (size_t tile_id = 0; tile_id < total_tiles; ++tile_id) {
+      uint64_t tested = 0, covered = 0, winners = 0, shaded = 0;
+      // The two-pass core lives in RasterizeTileDeferred:
+      //   A) compute coverage and depth only, pick the per-pixel winner (triangle index) and cache its perspective-corrected barycentrics;
+      //   B) shade each winning pixel once and write it back, then copy the whole tile to the global buffer (per the design notes).
+      RasterizeTileDeferred(tile_id, tile_triangles[tile_id], grid_ctx,
+                            tile_depth_buffer.get(), tile_color_buffer.get(),
+                            depthBuffer, colorBuffer, *shader,
+                            &tested, &covered, &winners, &shaded);
+      tile_stats[tile_id].tested = tested;
+      tile_stats[tile_id].covered = covered;
+      tile_stats[tile_id].zpass = winners; // in TBDR, zpass is approximately the winner count
+      tile_stats[tile_id].shaded = shaded;
+    }
+  }
+  auto raster_end = std::chrono::high_resolution_clock::now();
+  double raster_ms = std::chrono::duration_cast<std::chrono::microseconds>(raster_end - raster_start).count() / 1000.0;
+
+  // Aggregate the per-tile statistics
+  uint64_t sum_tested = 0, sum_covered = 0, sum_winners = 0, sum_shaded = 0;
+  for (const auto& s : tile_stats) {
+    sum_tested += s.tested;
+    sum_covered += s.covered;
+    sum_winners += s.zpass;
+    sum_shaded += s.shaded;
+  }
+  auto rate = [](uint64_t num, uint64_t den) -> double { return (den == 0) ? 0.0 : double(num) / double(den) * 100.0; };
+  SPDLOG_DEBUG("TBDR Stats: tested={}, covered={} ({:.1f}%), winners={} ({:.1f}%), shaded={} ({:.1f}%)",
+               sum_tested, sum_covered, rate(sum_covered, sum_tested),
+               sum_winners, rate(sum_winners, sum_covered),
+               sum_shaded, rate(sum_shaded, sum_covered));
+
+  // Copy the result to the output buffer
+  auto present_start = std::chrono::high_resolution_clock::now();
+  std::memcpy(buffer, colorBuffer.get(), width_ * height_ * sizeof(uint32_t));
+  auto present_end = std::chrono::high_resolution_clock::now();
+  double present_ms = std::chrono::duration_cast<std::chrono::microseconds>(present_end - present_start).count() / 1000.0;
+
+  auto total_end_time = std::chrono::high_resolution_clock::now();
+  double total_ms = std::chrono::duration_cast<std::chrono::microseconds>(total_end_time - total_start_time).count() / 1000.0;
+
+  SPDLOG_DEBUG("=== TILE-BASED DEFERRED RENDERING PERFORMANCE ===");
+  double sum_ms = vertex_ms + (total_ms - vertex_ms);  // NOTE(review): algebraically this is just total_ms
+  SPDLOG_DEBUG("Vertex Shader:    {:8.3f} ms ({:5.1f}%)", vertex_ms, vertex_ms / sum_ms * 100);
+  SPDLOG_DEBUG("Setup:            {:8.3f} ms", setup_ms);
+  SPDLOG_DEBUG("Binning:          {:8.3f} ms", bin_ms);
+  SPDLOG_DEBUG("Buffer Alloc:     {:8.3f} ms", buf_alloc_ms);
+  SPDLOG_DEBUG("Tile Raster:      {:8.3f} ms", raster_ms);
+  SPDLOG_DEBUG("Copy:             {:8.3f} ms", present_ms);
+  SPDLOG_DEBUG("Total:            {:8.3f} ms", vertex_ms + (setup_ms + bin_ms + buf_alloc_ms + raster_ms + present_ms));
+  SPDLOG_DEBUG("===============================================");
+
+  return true;
+}
+
+// Two-pass binning: count refs per tile, reserve exact capacity, then append the refs.
+void TileBasedDeferredRenderer::TriangleTileBinning(
+    const Model& model, const TileGridContext& grid,
+    std::vector<std::vector<TileTriangleRef>>& tile_triangles) {
+  const size_t face_count = model.GetFaces().size();
+  SPDLOG_DEBUG("Starting triangle-tile binning (SoA) for {} triangles", face_count);
+  SPDLOG_DEBUG("Screen dimensions: {}x{}, Tile size: {}, Tiles: {}x{}", width_, height_, grid.tile_size, grid.tiles_x, grid.tiles_y);
+  std::vector<size_t> refs_per_tile(grid.tiles_x * grid.tiles_y, 0);
+  // One sweep over all faces in index order; count_only selects counting vs. appending.
+  const auto sweep_faces = [&](bool count_only) {
+    for (size_t face = 0; face < face_count; ++face)
+      ProcessTriangleForTileBinning(face, count_only, model, grid, refs_per_tile, tile_triangles);
+  };
+  sweep_faces(true);
+  for (size_t slot = 0; slot < tile_triangles.size(); ++slot)
+    if (refs_per_tile[slot] > 0) tile_triangles[slot].reserve(refs_per_tile[slot]);
+  sweep_faces(false);
+  // The totals below only feed the debug log.
+  size_t ref_total = 0, occupied_tiles = 0;
+  for (const auto& bucket : tile_triangles) { ref_total += bucket.size(); occupied_tiles += bucket.empty() ? 0 : 1; }
+  SPDLOG_DEBUG("  (SoA) Total triangle references: {}", ref_total);
+  SPDLOG_DEBUG("  (SoA) Non-empty tiles: {}", occupied_tiles);
+  SPDLOG_DEBUG("  (SoA) Average triangles per tile: {:.2f}", ref_total > 0 ? float(ref_total) / tile_triangles.size() : 0.0f);
+}
+
+// Bins one triangle: skips it when frustum-culled or back-facing, otherwise visits every tile its AABB touches.
+// count_only == true only increments tile_counts; false appends a TileTriangleRef to tile_triangles.
+void TileBasedDeferredRenderer::ProcessTriangleForTileBinning(
+    size_t tri_idx, bool count_only, const Model& model, const TileGridContext& grid,
+    std::vector<size_t>& tile_counts,
+    std::vector<std::vector<TileTriangleRef>>& tile_triangles) {
+  const auto& f = model.GetFaces()[tri_idx];
+  size_t i0 = f.GetIndex(0), i1 = f.GetIndex(1), i2 = f.GetIndex(2);
+
+  // Conservative frustum culling in clip space: reject only when all three vertices lie outside the same plane.
+  // Every vertex is compared against its own w; mixing in another vertex's w gives wrong results when w differs.
+  const Vector4f &c0 = grid.soa.pos_clip[i0];
+  const Vector4f &c1 = grid.soa.pos_clip[i1];
+  const Vector4f &c2 = grid.soa.pos_clip[i2];
+  const bool frustum_cull =
+      (c0.x > c0.w && c1.x > c1.w && c2.x > c2.w) ||
+      (c0.x < -c0.w && c1.x < -c1.w && c2.x < -c2.w) ||
+      (c0.y > c0.w && c1.y > c1.w && c2.y > c2.w) ||
+      (c0.y < -c0.w && c1.y < -c1.w && c2.y < -c2.w) ||
+      (c0.z > c0.w && c1.z > c1.w && c2.z > c2.w) ||
+      (c0.z < -c0.w && c1.z < -c1.w && c2.z < -c2.w);
+  if (frustum_cull) return;
+
+  const Vector4f &pos0 = grid.soa.pos_screen[i0];
+  const Vector4f &pos1 = grid.soa.pos_screen[i1];
+  const Vector4f &pos2 = grid.soa.pos_screen[i2];
+  // Back-face culling via the screen-space cross product: a positive value is rejected.
+  Vector2f screen0(pos0.x, pos0.y), screen1(pos1.x, pos1.y), screen2(pos2.x, pos2.y);
+  Vector2f edge1 = screen1 - screen0, edge2 = screen2 - screen0;
+  float cross_product = edge1.x * edge2.y - edge1.y * edge2.x;
+  if (cross_product > 0.0f) return;
+
+  // Inclusive tile range covered by the triangle's screen-space bounding box, clamped to the grid.
+  float min_x = std::min({pos0.x, pos1.x, pos2.x});
+  float max_x = std::max({pos0.x, pos1.x, pos2.x});
+  float min_y = std::min({pos0.y, pos1.y, pos2.y});
+  float max_y = std::max({pos0.y, pos1.y, pos2.y});
+  int start_tile_x = std::max(0, static_cast<int>(min_x) / static_cast<int>(grid.tile_size));
+  int end_tile_x   = std::min(static_cast<int>(grid.tiles_x - 1), static_cast<int>(max_x) / static_cast<int>(grid.tile_size));
+  int start_tile_y = std::max(0, static_cast<int>(min_y) / static_cast<int>(grid.tile_size));
+  int end_tile_y   = std::min(static_cast<int>(grid.tiles_y - 1), static_cast<int>(max_y) / static_cast<int>(grid.tile_size));
+  if (start_tile_x > end_tile_x || start_tile_y > end_tile_y) return;
+  if (count_only) {
+    for (int ty = start_tile_y; ty <= end_tile_y; ++ty)
+      for (int tx = start_tile_x; tx <= end_tile_x; ++tx)
+        tile_counts[ty * grid.tiles_x + tx]++;
+  } else {
+    TileTriangleRef tri_ref{i0, i1, i2, &f.GetMaterial(), tri_idx};
+    for (int ty = start_tile_y; ty <= end_tile_y; ++ty)
+      for (int tx = start_tile_x; tx <= end_tile_x; ++tx)
+        tile_triangles[ty * grid.tiles_x + tx].push_back(tri_ref);
+  }
+}
+
+void TileBasedDeferredRenderer::RasterizeTileDeferred(
+    size_t tile_id, const std::vector<TileTriangleRef>& triangles,
+    const TileGridContext& grid, float* tile_depth_buffer, uint32_t* tile_color_buffer,
+    std::unique_ptr<float[]>& global_depth_buffer, std::unique_ptr<uint32_t[]>& global_color_buffer,
+    const Shader& shader, uint64_t* out_tested, uint64_t* out_covered, uint64_t* out_winners, uint64_t* out_shaded) {
+  // 计算本 Tile 覆盖的屏幕区域（半开区间对齐到闭区间扫描）
+  size_t tile_x = tile_id % grid.tiles_x;
+  size_t tile_y = tile_id / grid.tiles_x;
+  size_t screen_x_start = tile_x * grid.tile_size;
+  size_t screen_y_start = tile_y * grid.tile_size;
+  size_t screen_x_end = std::min(screen_x_start + grid.tile_size, width_);
+  size_t screen_y_end = std::min(screen_y_start + grid.tile_size, height_);
+
+  size_t tile_width = screen_x_end - screen_x_start;
+  size_t tile_height = screen_y_end - screen_y_start;
+
+  // 阶段缓冲：Z 最小、胜者三角形索引、重心缓存（b0c/b1c）
+  // - zmin：本 Tile 每像素的当前最小深度；
+  // - winner：本 Tile 每像素的“胜出三角形”的局部索引（-1 表示尚未命中任何三角形）；
+  // - b0c/b1c：缓存透视矫正后的重心权重（b2c = 1 - b0c - b1c），用于阶段B避免重复计算。
+  std::vector<float> zmin(tile_width * tile_height, kDepthClear);
+  std::vector<int32_t> winner(tile_width * tile_height, -1);
+  std::vector<float> b0c(tile_width * tile_height, 0.0f);
+  std::vector<float> b1c(tile_width * tile_height, 0.0f);
+
+  // 初始化 tile 局部 color/depth 缓冲
+  std::fill_n(tile_depth_buffer, tile_width * tile_height, kDepthClear);
+  std::fill_n(tile_color_buffer, tile_width * tile_height, kColorClear);
+
+  constexpr int kLane = 8;
+  uint64_t tested_pixels = 0, covered_pixels = 0, winner_pixels = 0, shaded_pixels = 0;
+
+  auto cross2 = [](float ax, float ay, float bx, float by) { return ax * by - ay * bx; };
+
+  // 阶段 A：Z 决胜（仅更新 zmin / winner / b0c/b1c）
+  // - 使用边函数进行半空间内点测试，行优先 + kLane 批处理，利于 cache 与自动向量化；
+  // - 对覆盖像素进行透视矫正重心计算（先插 1/w，再还原权重），并据此插值 z；
+  // - 若 z 更小，则更新该像素的胜者信息与缓存的重心；此阶段不执行着色。
+  for (const auto& tri : triangles) {
+    const size_t i0 = tri.i0, i1 = tri.i1, i2 = tri.i2;
+    const Vector4f &p0 = grid.soa.pos_screen[i0];
+    const Vector4f &p1 = grid.soa.pos_screen[i1];
+    const Vector4f &p2 = grid.soa.pos_screen[i2];
+
+    // 屏幕空间 AABB 与 tile 相交
+    const float tri_minx = std::min({p0.x, p1.x, p2.x});
+    const float tri_miny = std::min({p0.y, p1.y, p2.y});
+    const float tri_maxx = std::max({p0.x, p1.x, p2.x});
+    const float tri_maxy = std::max({p0.y, p1.y, p2.y});
+
+    int sx = std::max<int>(static_cast<int>(screen_x_start), static_cast<int>(std::floor(std::max(0.0f, tri_minx))));
+    int sy = std::max<int>(static_cast<int>(screen_y_start), static_cast<int>(std::floor(std::max(0.0f, tri_miny))));
+    int ex = std::min<int>(static_cast<int>(screen_x_end - 1), static_cast<int>(std::floor(std::min<float>(width_ - 1, tri_maxx))));
+    int ey = std::min<int>(static_cast<int>(screen_y_end - 1), static_cast<int>(std::floor(std::min<float>(height_ - 1, tri_maxy))));
+    if (sx > ex || sy > ey) continue;
+
+    // 边向量、面积及朝向
+    const float e01x = p1.x - p0.x, e01y = p1.y - p0.y;
+    const float e12x = p2.x - p1.x, e12y = p2.y - p1.y;
+    const float e20x = p0.x - p2.x, e20y = p0.y - p2.y;
+    // 面积 area2 = cross(p1 - p0, p2 - p0)；用于重心计算与正负朝向判别。
+    const float area2 = cross2(e01x, e01y, p2.x - p0.x, p2.y - p0.y);
+    if (std::abs(area2) < 1e-6f) continue;
+    const bool positive = (area2 > 0.0f);
+
+    // 深度与 1/w 插值准备
+    // 透视校正思路：在屏幕空间中 1/w 线性，先插值 1/w，再将各顶点属性乘以 1/w 并归一。
+    const float z0 = p0.z, z1 = p1.z, z2 = p2.z;
+    const float w0_inv = 1.0f / p0.w, w1_inv = 1.0f / p1.w, w2_inv = 1.0f / p2.w;
+
+    for (int y = sy; y <= ey; ++y) {
+      const float yf = static_cast<float>(y);
+      float E01_base = cross2(e01x, e01y, static_cast<float>(sx) - p0.x, yf - p0.y);
+      float E12_base = cross2(e12x, e12y, static_cast<float>(sx) - p1.x, yf - p1.y);
+      float E20_base = cross2(e20x, e20y, static_cast<float>(sx) - p2.x, yf - p2.y);
+      const float dE01dx = -e01y;
+      const float dE12dx = -e12y;
+      const float dE20dx = -e20y;
+
+      // 行扫描 + kLane 批处理：利于 cache 与自动向量化
+      for (int xb = sx; xb <= ex; xb += kLane) {
+        const int lane = std::min(kLane, ex - xb + 1);
+        float E01[kLane], E12[kLane], E20[kLane];
+#pragma omp simd
+        for (int j = 0; j < lane; ++j) {
+          E01[j] = E01_base + dE01dx * static_cast<float>(xb - sx + j);
+          E12[j] = E12_base + dE12dx * static_cast<float>(xb - sx + j);
+          E20[j] = E20_base + dE20dx * static_cast<float>(xb - sx + j);
+        }
+
+        // 内点测试，如果三角形在像素内，则将该像素加入覆盖掩码
+        unsigned mask_cover = 0u; int cover_count = 0;
+        for (int j = 0; j < lane; ++j) {
+          bool inside = positive ? (E01[j] >= 0.0f && E12[j] >= 0.0f && E20[j] >= 0.0f)
+                                 : (E01[j] <= 0.0f && E12[j] <= 0.0f && E20[j] <= 0.0f);
+          if (inside) { mask_cover |= (1u << j); ++cover_count; }
+        }
+        tested_pixels += static_cast<uint64_t>(lane);
+        covered_pixels += static_cast<uint64_t>(cover_count);
+        if (mask_cover == 0u) continue;
+
+        for (int j = 0; j < lane; ++j) {
+          if (((mask_cover >> j) & 1u) == 0u) continue;
+          const float b0 = E12[j] / area2;
+          const float b1 = E20[j] / area2;
+          const float b2 = E01[j] / area2;
+          const float w_inv = b0 * w0_inv + b1 * w1_inv + b2 * w2_inv; // 透视校正
+          const float b0c_ = (b0 * w0_inv) / w_inv;
+          const float b1c_ = (b1 * w1_inv) / w_inv;
+          const float b2c_ = (b2 * w2_inv) / w_inv;
+          const float z = z0 * b0c_ + z1 * b1c_ + z2 * b2c_;
+
+          const int sx_pix = xb + j;
+          const int local_x = sx_pix - static_cast<int>(screen_x_start);
+          const int local_y = y - static_cast<int>(screen_y_start);
+          const size_t idx = static_cast<size_t>(local_x + local_y * static_cast<int>(tile_width));
+          // 用极小 epsilon 防止抖动
+          if (z < zmin[idx] - 1e-8f) {
+            if (winner[idx] < 0) winner_pixels++;
+            zmin[idx] = z;
+            // 记录本 Tile 内的“局部三角形索引”，便于阶段B无需再次查找
+            winner[idx] = static_cast<int32_t>(&tri - &triangles[0]);
+            b0c[idx] = b0c_;
+            b1c[idx] = b1c_;
+          }
+        }
+      }
+    }
+  }
+
+  // 阶段 B：仅对胜者像素着色并写入 tile 局部缓冲
+  // - 对于 winner[idx] >= 0 的像素，从 SoA 插值 normal/uv/color，构造 Fragment；
+  // - 每像素仅进行一次 FragmentShader 调用，随后写回 tile 局部 color/depth。
+  for (size_t y = 0; y < tile_height; ++y) {
+    for (size_t x = 0; x < tile_width; ++x) {
+      const size_t idx = x + y * tile_width;
+      int32_t win = winner[idx];
+      if (win < 0) continue;
+
+      const auto& tri = triangles[static_cast<size_t>(win)];
+      const size_t i0 = tri.i0, i1 = tri.i1, i2 = tri.i2;
+      const float b0c_ = b0c[idx];
+      const float b1c_ = b1c[idx];
+      const float b2c_ = 1.0f - b0c_ - b1c_;
+
+      Fragment frag;
+      frag.screen_coord = {static_cast<int32_t>(screen_x_start + x), static_cast<int32_t>(screen_y_start + y)};
+      frag.depth = zmin[idx];
+      frag.material = tri.material;
+
+      // 插值属性
+      const Vector3f &n0 = grid.soa.normal[i0];
+      const Vector3f &n1 = grid.soa.normal[i1];
+      const Vector3f &n2 = grid.soa.normal[i2];
+      frag.normal = n0 * b0c_ + n1 * b1c_ + n2 * b2c_;
+
+      const Vector2f &uv0 = grid.soa.uv[i0];
+      const Vector2f &uv1 = grid.soa.uv[i1];
+      const Vector2f &uv2 = grid.soa.uv[i2];
+      frag.uv = uv0 * b0c_ + uv1 * b1c_ + uv2 * b2c_;
+
+      const Color &c0 = grid.soa.color[i0];
+      const Color &c1 = grid.soa.color[i1];
+      const Color &c2 = grid.soa.color[i2];
+      auto color_r = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexRed]) * b0c_ +
+                                    static_cast<float>(c1[Color::kColorIndexRed]) * b1c_ +
+                                    static_cast<float>(c2[Color::kColorIndexRed]) * b2c_);
+      auto color_g = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexGreen]) * b0c_ +
+                                    static_cast<float>(c1[Color::kColorIndexGreen]) * b1c_ +
+                                    static_cast<float>(c2[Color::kColorIndexGreen]) * b2c_);
+      auto color_b = FloatToUint8_t(static_cast<float>(c0[Color::kColorIndexBlue]) * b0c_ +
+                                    static_cast<float>(c1[Color::kColorIndexBlue]) * b1c_ +
+                                    static_cast<float>(c2[Color::kColorIndexBlue]) * b2c_);
+      frag.color = Color(color_r, color_g, color_b);
+
+      auto out_color = shader.FragmentShader(frag);
+      tile_depth_buffer[idx] = frag.depth;
+      tile_color_buffer[idx] = uint32_t(out_color);
+      shaded_pixels++;
+    }
+  }
+
+  // 写回全局缓冲（tile 行拷贝）
+  // 不同 Tile 区域不重叠，行拷贝无需锁
+  for (size_t y = 0; y < tile_height; ++y) {
+    const size_t tile_row_off = y * tile_width;
+    const size_t global_row_off = (screen_y_start + y) * width_ + screen_x_start;
+    // 将局部 tile 的 color/depth 复制到全局 framebuffer 中对应位置。
+    std::memcpy(global_color_buffer.get() + global_row_off,
+                tile_color_buffer + tile_row_off,
+                tile_width * sizeof(uint32_t));
+    std::memcpy(global_depth_buffer.get() + global_row_off,
+                tile_depth_buffer + tile_row_off,
+                tile_width * sizeof(float));
+  }
+
+  if (out_tested) *out_tested = tested_pixels;
+  if (out_covered) *out_covered = covered_pixels;
+  if (out_winners) *out_winners = winner_pixels;
+  if (out_shaded) *out_shaded = shaded_pixels;
+}
+
+}  // namespace simple_renderer
diff --git a/src/renderers/tile_based_renderer.cpp b/src/renderers/tile_based_renderer.cpp
index e39526e..e1c63e7 100644
--- a/src/renderers/tile_based_renderer.cpp
+++ b/src/renderers/tile_based_renderer.cpp
@@ -100,10 +100,6 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
     std::unique_ptr<uint32_t[]> tile_color_buffer =
         std::make_unique<uint32_t[]>(grid_ctx.tile_size * grid_ctx.tile_size);
 
-    // 为每个 tile 分配可复用片段临时容器，容量按单 tile 上限预估
-    std::vector<Fragment> scratch_fragments;
-    scratch_fragments.reserve(grid_ctx.tile_size * grid_ctx.tile_size);
-
 #pragma omp for schedule(static)
     for (size_t tile_id = 0; tile_id < total_tiles; ++tile_id) {
       // 按照 tile 进行光栅化（SoA）
@@ -111,7 +107,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
       RasterizeTile(tile_id, tile_triangles[tile_id], grid_ctx,
                     tile_depth_buffer.get(), tile_color_buffer.get(),
                     depthBuffer, colorBuffer, *shader, early_z_,
-                    &scratch_fragments, &tile_stats[tile_id]);
+                    &tile_stats[tile_id]);
     }
   }
   auto raster_end = std::chrono::high_resolution_clock::now();
@@ -129,7 +125,7 @@ bool TileBasedRenderer::Render(const Model &model, const Shader &shader_in,
     sum_shaded  += s.shaded;
   }
   auto rate = [](uint64_t num, uint64_t den) -> double {
-    if (den == 0) return 0.0; return double(num) / double(den) * 100.0;
+    return den == 0 ? 0.0 : static_cast<double>(num) / static_cast<double>(den) * 100.0;
   };
   SPDLOG_DEBUG(
       "TBR Mask Stats: tested={}, covered={} ({:.1f}%), zpass={} ({:.1f}%), shaded={} ({:.1f}%)",
@@ -220,7 +216,6 @@ void TileBasedRenderer::RasterizeTile(
     uint32_t *tile_color_buffer, std::unique_ptr<float[]> &global_depth_buffer,
     std::unique_ptr<uint32_t[]> &global_color_buffer,
     const Shader &shader, bool use_early_z,
-    std::vector<Fragment> *scratch_fragments,
     TileMaskStats* out_stats) {
   // 计算 tile 屏幕范围
   size_t tile_x = tile_id % grid.tiles_x;
diff --git a/src/shader.cpp b/src/shader.cpp
index 06ab241..708b419 100644
--- a/src/shader.cpp
+++ b/src/shader.cpp
@@ -136,13 +136,23 @@ void Shader::RecalculateDerivedMatrices() {
 
 void Shader::UpdateFragmentCache(const std::string& name,
                                  const Light& value) {
-  if (name != "light") {
-    return;
+  if (name != "light") return;  // this overload only serves the "light" uniform
+  auto& cache = fragment_uniform_cache_;
+  cache.lights.assign(1, value);  // the lone light replaces any cached list
+  cache.has_lights = true;
+  cache.derived_valid = false;  // cached unit directions are stale from here on
+  if (cache.has_lights && cache.has_camera) {
+    RecalculateFragmentDerived();
   }
-  fragment_uniform_cache_.light = value;
-  fragment_uniform_cache_.has_light = true;
+}
+
+void Shader::UpdateFragmentCache(const std::string& name,
+                                 const std::vector<Light>& value) {  // multi-light overload
+  if (name != "lights") { return; }  // only the "lights" uniform is handled here
+  fragment_uniform_cache_.lights = value;  // replace the cached list with a copy
+  fragment_uniform_cache_.has_lights = true;
   fragment_uniform_cache_.derived_valid = false;
-  if (fragment_uniform_cache_.has_light && fragment_uniform_cache_.has_camera) {
+  if (fragment_uniform_cache_.has_lights && fragment_uniform_cache_.has_camera) {  // rebuild needs the camera too
     RecalculateFragmentDerived();
   }
 }
@@ -155,14 +165,18 @@ void Shader::UpdateFragmentCache(const std::string& name,
   fragment_uniform_cache_.camera_pos = value;
   fragment_uniform_cache_.has_camera = true;
   fragment_uniform_cache_.derived_valid = false;
-  if (fragment_uniform_cache_.has_light && fragment_uniform_cache_.has_camera) {
+  if (fragment_uniform_cache_.has_lights && fragment_uniform_cache_.has_camera) {
     RecalculateFragmentDerived();
   }
 }
 
 void Shader::RecalculateFragmentDerived() {
-  fragment_uniform_cache_.light_dir_normalized =
-      glm::normalize(fragment_uniform_cache_.light.dir);
+  // Rebuild the normalized direction list, one entry per cached light.
+  auto& dirs = fragment_uniform_cache_.light_dirs_normalized;
+  dirs.clear();
+  dirs.reserve(fragment_uniform_cache_.lights.size());
+  for (const auto& light : fragment_uniform_cache_.lights)
+    dirs.push_back(glm::normalize(light.dir));
   fragment_uniform_cache_.derived_valid = true;
 }
 
@@ -196,13 +210,27 @@ void Shader::PrepareFragmentUniformCache() {
   if (fragment_uniform_cache_.derived_valid) {
     return;
   }
+  // 优先多光源
+  if (uniformbuffer_.HasUniform<std::vector<Light>>("lights") &&
+      uniformbuffer_.HasUniform<Vector3f>("cameraPos")) {
+    fragment_uniform_cache_.lights =
+        uniformbuffer_.GetUniform<std::vector<Light>>("lights");
+    fragment_uniform_cache_.has_lights = true;
+    fragment_uniform_cache_.camera_pos =
+        uniformbuffer_.GetUniform<Vector3f>("cameraPos");
+    fragment_uniform_cache_.has_camera = true;
+    RecalculateFragmentDerived();
+    return;
+  }
+  // 兼容单光源
   if (uniformbuffer_.HasUniform<Light>("light") &&
       uniformbuffer_.HasUniform<Vector3f>("cameraPos")) {
-    fragment_uniform_cache_.light =
-        uniformbuffer_.GetUniform<Light>("light");
+    fragment_uniform_cache_.lights.clear();
+    fragment_uniform_cache_.lights.push_back(
+        uniformbuffer_.GetUniform<Light>("light"));
     fragment_uniform_cache_.camera_pos =
         uniformbuffer_.GetUniform<Vector3f>("cameraPos");
-    fragment_uniform_cache_.has_light = true;
+    fragment_uniform_cache_.has_lights = true;
     fragment_uniform_cache_.has_camera = true;
     RecalculateFragmentDerived();
   }
@@ -259,59 +287,89 @@ auto Shader::EvaluateSpecular(float cos_theta, float shininess) const -> float {
 }
 
+// Shades one fragment: 0.1 * ambient + diffuse + 0.2 * specular, where the
+// diffuse and specular terms are summed over every light direction. The math
+// is done on normalized floats and each channel is clamped to [0, 1] at the
+// end. Dereferences fragment.material without a null check.
 Color Shader::FragmentShader(const Fragment& fragment) const {
-  // interpolate Normal, Color and UV
-  Color interpolateColor = fragment.color;
+  // Helper: convert a Color to a normalized [0,1] vector (channels scaled by 1/255)
+  auto color_to_vec = [](const Color& c) -> Vector3f {
+    constexpr float inv255 = 1.0f / 255.0f;
+    return Vector3f(static_cast<float>(c[Color::kColorIndexRed]) * inv255,
+                    static_cast<float>(c[Color::kColorIndexGreen]) * inv255,
+                    static_cast<float>(c[Color::kColorIndexBlue]) * inv255);
+  };
+
+  // Interpolated input attributes
+  Vector3f base_color = color_to_vec(fragment.color);
   Vector3f normal = glm::normalize(fragment.normal);
   Vector2f uv = fragment.uv;
 
-  // uniform
-  Light light;
-  Vector3f light_dir;
+  // Uniforms (cache preferred). The cached direction list is borrowed, not
+  // copied: this function is called per shaded pixel, and copying a non-empty
+  // std::vector on every call would cost a heap allocation per fragment.
+  std::vector<Vector3f> fallback_dirs;  // filled only when the cache is stale
+  const std::vector<Vector3f>* light_dirs = &fallback_dirs;
   Vector3f camera_pos;
   if (fragment_uniform_cache_.derived_valid) {
-    light = fragment_uniform_cache_.light;
-    light_dir = fragment_uniform_cache_.light_dir_normalized;
+    light_dirs = &fragment_uniform_cache_.light_dirs_normalized;
     camera_pos = fragment_uniform_cache_.camera_pos;
   } else {
-    light = uniformbuffer_.GetUniform<Light>("light");
+    if (uniformbuffer_.HasUniform<std::vector<Light>>("lights")) {
+      const std::vector<Light> lights =
+          uniformbuffer_.GetUniform<std::vector<Light>>("lights");
+      fallback_dirs.reserve(lights.size());
+      for (const auto& l : lights) fallback_dirs.push_back(glm::normalize(l.dir));
+    } else if (uniformbuffer_.HasUniform<Light>("light")) {
+      fallback_dirs.push_back(
+          glm::normalize(uniformbuffer_.GetUniform<Light>("light").dir));
+    }
     camera_pos = uniformbuffer_.GetUniform<Vector3f>("cameraPos");
-    light_dir = glm::normalize(light.dir);
   }
+
   Material material = *fragment.material;
 
-  // view direction
-  Vector3f view_dir =
-      glm::normalize(sharedDataInShader_.fragPos_varying - camera_pos);
+  // View direction
+  Vector3f view_dir = glm::normalize(sharedDataInShader_.fragPos_varying - camera_pos);
 
-  auto intensity = std::max(glm::dot(normal, light_dir), 0.0f);
-  // texture color
-  Color ambient_color, diffuse_color, specular_color;
+  // Ambient: evaluated once, from the texture or the interpolated vertex color
+  Vector3f ambient_rgb;
   if (material.has_ambient_texture) {
-    Color texture_color = SampleTexture(material.ambient_texture, uv);
-    ambient_color = texture_color;
+    ambient_rgb = color_to_vec(SampleTexture(material.ambient_texture, uv));
   } else {
-    ambient_color = interpolateColor;
+    ambient_rgb = base_color;
   }
 
-  if (material.has_diffuse_texture) {
-    Color texture_color = SampleTexture(material.diffuse_texture, uv);
-    diffuse_color = texture_color * intensity;
-  } else {
-    diffuse_color = interpolateColor * intensity;
-  }
-
-  Vector3f halfVector = glm::normalize(light_dir + view_dir);
-  float cos_theta = std::max(glm::dot(normal, halfVector), 0.0f);
-  float spec = EvaluateSpecular(cos_theta, material.shininess);
-  if (material.has_specular_texture) {
-    Color texture_color = SampleTexture(material.specular_texture, uv);
-    specular_color = texture_color * spec;
-  } else {
-    specular_color = Color(1.0f, 1.0f, 1.0f) * spec;
+  // The diffuse/specular coefficients depend only on material and uv, not on
+  // the light, so the textures are sampled once here instead of once per light.
+  const Vector3f kd = material.has_diffuse_texture
+                          ? color_to_vec(SampleTexture(material.diffuse_texture, uv))
+                          : base_color;
+  const Vector3f ks = material.has_specular_texture
+                          ? color_to_vec(SampleTexture(material.specular_texture, uv))
+                          : Vector3f(1.0f);
+
+  // Accumulate diffuse/specular in normalized float space, which avoids 8-bit
+  // overflow and truncation when several lights contribute.
+  Vector3f diffuse_accum(0.0f);
+  Vector3f specular_accum(0.0f);
+  for (const Vector3f& ldir : *light_dirs) {
+    // diffuse
+    const float intensity = std::max(glm::dot(normal, ldir), 0.0f);
+    diffuse_accum += kd * intensity;
+
+    // specular
+    const Vector3f halfVector = glm::normalize(ldir + view_dir);
+    const float cos_theta = std::max(glm::dot(normal, halfVector), 0.0f);
+    const float spec = EvaluateSpecular(cos_theta, material.shininess);
+    specular_accum += ks * spec;
   }
 
-  return ClampColor(ambient_color * 0.1f + diffuse_color +
-                    specular_color * 0.2f);
+  Vector3f out_rgb = ambient_rgb * 0.1f + diffuse_accum + specular_accum * 0.2f;
+  // Clamp to [0,1]
+  out_rgb.x = std::clamp(out_rgb.x, 0.0f, 1.0f);
+  out_rgb.y = std::clamp(out_rgb.y, 0.0f, 1.0f);
+  out_rgb.z = std::clamp(out_rgb.z, 0.0f, 1.0f);
+  return Color(out_rgb.x, out_rgb.y, out_rgb.z, 1.0f);
 }
 
 // 将浮点数转换为 uint8_t
diff --git a/test/system_test/main.cpp b/test/system_test/main.cpp
index d6491d9..6ec3502 100755
--- a/test/system_test/main.cpp
+++ b/test/system_test/main.cpp
@@ -74,9 +74,14 @@ int main(int argc, char **argv) {
   simple_renderer::Shader shader;
   shader.SetUniform("modelMatrix", modelMatrix);
 
-  simple_renderer::Light light;
-  light.dir = simple_renderer::Vector3f(1.0f, 5.0f, 1.0f);
-  shader.SetUniform("light", light);
+  // 多光源
+  std::vector<simple_renderer::Light> lights;
+  {
+    simple_renderer::Light l0; l0.dir = simple_renderer::Vector3f( 1.0f,  5.0f,  1.0f); lights.push_back(l0);
+    simple_renderer::Light l1; l1.dir = simple_renderer::Vector3f(-3.0f, -2.0f,  2.0f); lights.push_back(l1);
+    simple_renderer::Light l2; l2.dir = simple_renderer::Vector3f( 2.0f,  1.0f, -1.0f); lights.push_back(l2);
+  }
+  shader.SetLights(lights);
 
   simple_renderer::Camera camera(simple_renderer::Vector3f(0.0f, 0.0f, 1.0f));