benchmark_crew/make_react_gif.py at main · Dynamite2003/benchmark_crew · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
"""
Generate an animated GIF of the ReAct process.
Each agent's Thought → Action → Observation → Answer loop is shown as a card.
"""
import os

import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.animation import FuncAnimation, PillowWriter
from matplotlib import rcParams

# Font setup.  The card text contains CJK characters, so a CJK-capable font is
# needed.  "PingFang HK" is a macOS system font; the remaining entries are
# fallbacks commonly present on Windows / Linux so the text does not render as
# empty boxes there.  Order matters: the first available family is preferred,
# so output on macOS is unchanged.
rcParams["font.family"] = [
    "PingFang HK",
    "Microsoft YaHei",
    "Noto Sans CJK SC",
    "Noto Sans CJK JP",
    "WenQuanYi Micro Hei",
    "sans-serif",
]
# Use the ASCII hyphen for minus signs instead of U+2212, which some CJK fonts
# do not provide.
rcParams["axes.unicode_minus"] = False

# Destination of the rendered animation, relative to the working directory.
OUT_GIF = "outputs/react_process.gif"

# ── Real ReAct traces (extracted from the logs) ──────────────────────────────
# One entry per agent, in display order.  "steps" is the ordered list of
# (label, text) pairs that are revealed one at a time on the agent's card;
# the label selects the colours from STEP_COLORS when the step is drawn.
AGENTS_DATA = [
    dict(
        name="Benchmark Runner",
        color="#80CBC4",
        icon="▶",
        goal="确保推理结果文件就绪",
        steps=[
            ("THOUGHT", "需要检查 data/results.jsonl 是否存在"),
            ("ACTION", "benchmark_runner_tool(dummy='')"),
            ("OBSERVE", "results.jsonl 已存在，20 条记录，准确率 50.0%，无需重新推理"),
            ("ANSWER", "推理文件已就绪，可进行后续分析"),
        ],
    ),
    dict(
        name="Benchmark Data Parser",
        color="#4FC3F7",
        icon="📋",
        goal="解析结果文件结构与数据分布",
        steps=[
            ("THOUGHT", "需要读取 JSONL 文件，提取字段和统计信息"),
            ("ACTION", "json_parse_tool(file_path='data/results.jsonl')"),
            ("OBSERVE", "总记录数: 20 | 字段: 10项 | score均值: 0.5000 | 平均延迟: 1.8s"),
            ("ANSWER", "数据结构清晰，20条样本，单一category，score分布0/1二值"),
        ],
    ),
    dict(
        name="Statistical Analyst",
        color="#81C784",
        icon="📊",
        goal="计算 accuracy / pass@1 / 错误分布",
        steps=[
            ("THOUGHT", "需要按 category 分组计算统计指标"),
            ("ACTION", "stats_tool(file_path='data/results.jsonl')"),
            ("OBSERVE", "[causal_judgment] n=20 | mean=0.5000 | std=0.5000 | pass@1=0.5000 | 错误: 10/20"),
            ("ANSWER", "准确率 50%，与随机基线持平，pass@1=50%，答错10题"),
        ],
    ),
    dict(
        name="QA Detail Reviewer",
        color="#FFD54F",
        icon="🔍",
        goal="逐题分析模型错误模式",
        steps=[
            ("THOUGHT", "需要加载原始 BBH 数据集，与推理结果逐条对照"),
            ("ACTION", "qa_detail_tool(results_file='data/results.jsonl')"),
            ("OBSERVE", "写入 qa_detail.md | 错误10/20 | Yes误答4题 | No误答6题 | No偏向"),
            ("ANSWER", "模型存在 No 偏向；对反事实因果和侧面效应场景理解薄弱"),
        ],
    ),
    dict(
        name="Comparative Research Analyst",
        color="#FFB74D",
        icon="🔬",
        goal="与公开 baseline 横向对比",
        steps=[
            ("THOUGHT", "需要查找 BIG-Bench causal_judgement 任务的公开模型分数"),
            ("ACTION", "big_bench_baseline_tool(task_name='causal_judgement')"),
            ("OBSERVE", "GPT-4: 67% | GPT-3.5: 56% | PaLM2: 59% | Human: 67% | Random: 50%"),
            ("ANSWER", "llama3.1:8b (50%) 与随机基线持平，显著低于 GPT-4 (67%)"),
        ],
    ),
    dict(
        name="Report Publisher",
        color="#CE93D8",
        icon="✍",
        goal="生成报告 Markdown + Pipeline GIF",
        steps=[
            ("THOUGHT", "需要整合所有分析结论，先生成 GIF，再写入 Markdown 报告"),
            ("ACTION", "visualize_tool(log_file='outputs/pipeline_log.jsonl')"),
            ("OBSERVE", "GIF 已生成: pipeline_flow.gif (644 KB)"),
            ("ACTION", "file_writer_tool → outputs/llama3.1_8b_causal_judgment_report.md"),
            ("ANSWER", "报告与可视化已写入 outputs/，pipeline 全流程完成"),
        ],
    ),
]

# ── Colours & styling ────────────────────────────────────────────────────────
# Shared palette (hex RGB strings).
BG, CARD_BG, BORDER = "#0F1117", "#1E2130", "#2E3250"
FG, GRAY = "#E2E8F0", "#64748B"

# Step label -> (background colour, text colour).
STEP_COLORS = dict(
    THOUGHT=("#3B4F8C", "#93C5FD"),
    ACTION=("#1E4D3B", "#6EE7B7"),
    OBSERVE=("#4A3820", "#FCD34D"),
    ANSWER=("#3B1F4E", "#C084FC"),
)

# Figure size (passed as figsize), save resolution and animation frame rate.
FIG_W = 18
FIG_H = 10
DPI = 150
FPS = 5

# ── Frame plan: per agent, every newly revealed step is shown for 2 frames,
#    the final step for 3 frames, followed by 4 extra hold frames ─────────────
def build_frames(agents=None, step_frames=2, final_frames=3, pause_frames=4):
    """Build the animation schedule.

    Steps of each agent are revealed cumulatively: first one step is visible,
    then two, and so on until all of the agent's steps are shown.

    Args:
        agents: Sequence of agent dicts, each with a ``"steps"`` sequence.
            Defaults to the module-level ``AGENTS_DATA``.
        step_frames: Frames spent on each reveal except the last one.
        final_frames: Frames spent on the reveal that shows the last step.
        pause_frames: Extra frames holding the complete card before the next
            agent starts.

    Returns:
        A list of ``(agent_idx, visible_steps)`` tuples, one per frame.  An
        agent with no steps contributes only its hold frames, with
        ``visible_steps == 0``.
    """
    if agents is None:
        agents = AGENTS_DATA

    frames = []
    for agent_idx, agent in enumerate(agents):
        total = len(agent["steps"])
        # Cumulative reveal: 1 step visible, then 2, ... up to all of them.
        for visible_steps in range(1, total + 1):
            repeat = final_frames if visible_steps == total else step_frames
            frames.extend([(agent_idx, visible_steps)] * repeat)
        # Hold the finished card so it can be read before switching agents.
        frames.extend([(agent_idx, total)] * pause_frames)
    return frames

# Frame schedule, computed once before any drawing happens.
ALL_FRAMES = build_frames()

# One figure with a single axes used as a blank unit-square canvas:
# both data ranges are fixed to [0, 1] and the axis decorations are hidden.
fig = plt.figure(figsize=(FIG_W, FIG_H), facecolor=BG)
ax = fig.subplots()
ax.set_facecolor(BG)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_axis_off()

# ── Left-hand agent list (static, drawn once) ────────────────────────────────
LIST_X, LIST_W = 0.01, 0.18
for i, ag in enumerate(AGENTS_DATA):
    # Rows are laid out top-down, 0.135 apart, starting at y = 0.88.
    y = 0.88 - i * 0.135
    # Rounded box behind the agent name.
    ax.add_patch(
        mpatches.FancyBboxPatch(
            xy=(LIST_X, y - 0.03),
            width=LIST_W,
            height=0.055,
            boxstyle="round,pad=0.008",
            facecolor=CARD_BG,
            edgecolor=BORDER,
            linewidth=0.8,
            transform=ax.transAxes,
            zorder=2,
        )
    )
    # Agent name in the neutral (inactive) colour.
    ax.text(
        LIST_X + 0.01,
        y + 0.005,
        ag["name"],
        transform=ax.transAxes,
        color=GRAY,
        fontsize=7.5,
        va="center",
        fontweight="bold",
    )

# Dynamic elements (rebuilt on every frame)
active_patch = None    # highlight patch over the current agent's list entry
card_elements = []     # artists that make up the card currently on screen

# Card rectangle in axes coordinates: lower-left corner, then width / height.
CARD_X, CARD_Y = 0.21, 0.05
CARD_W, CARD_H = 0.77, 0.90

def clear_card():
    """Remove the previous card's artists and reset the registry.

    Calls ``remove()`` on every entry of the module-level ``card_elements``
    list, in order, and then rebinds ``card_elements`` to a new empty list.
    """
    global card_elements
    stale = card_elements
    for artist in stale:
        artist.remove()
    card_elements = list()

def draw_card(agent_idx, visible_steps):
    """Redraw the list highlight and the detail card for one animation frame.

    Args:
        agent_idx: Index into ``AGENTS_DATA`` of the agent to display.
        visible_steps: Number of that agent's steps to show, counted from the
            first one.

    Side effects:
        Removes the artists of the previously drawn card (via ``clear_card``)
        and the previous highlight patch, then adds the new artists to ``ax``.
        The new card artists are recorded in the module-level
        ``card_elements`` list and the highlight in ``active_patch``.
    """
    global active_patch, card_elements
    clear_card()

    ag = AGENTS_DATA[agent_idx]
    total_steps = len(ag["steps"])
    steps = ag["steps"][:visible_steps]
    col = ag["color"]

    # Highlight the current agent in the left-hand list.  The previous
    # highlight is dropped first so that only one entry is lit at a time.
    if active_patch is not None:
        active_patch.remove()
    # Same row formula as the static list, so the highlight sits on its entry.
    y_active = 0.88 - agent_idx * 0.135
    active_patch = mpatches.FancyBboxPatch(
        (LIST_X, y_active - 0.03), LIST_W, 0.055,
        boxstyle="round,pad=0.008",
        # Appending "33" turns the #RRGGBB colour into #RRGGBBAA (translucent).
        facecolor=col + "33", edgecolor=col, linewidth=1.5,
        transform=ax.transAxes, zorder=3,
    )
    ax.add_patch(active_patch)

    # Main card background
    card_bg = mpatches.FancyBboxPatch(
        (CARD_X, CARD_Y), CARD_W, CARD_H,
        boxstyle="round,pad=0.012",
        facecolor=CARD_BG, edgecolor=col, linewidth=2.0,
        transform=ax.transAxes, zorder=2,
    )
    ax.add_patch(card_bg)
    card_elements.append(card_bg)

    # Agent name + goal
    t1 = ax.text(CARD_X + 0.02, CARD_Y + CARD_H - 0.04,
                 f"{ag['name']}",
                 transform=ax.transAxes,
                 color=col, fontsize=14, fontweight="bold", va="top", zorder=5)
    t2 = ax.text(CARD_X + 0.02, CARD_Y + CARD_H - 0.085,
                 f"Goal: {ag['goal']}",
                 transform=ax.transAxes,
                 color=GRAY, fontsize=9, va="top", zorder=5)
    card_elements += [t1, t2]

    # Separator line under the header (ax.plot returns a list of lines)
    sep = ax.plot([CARD_X + 0.01, CARD_X + CARD_W - 0.01],
                  [CARD_Y + CARD_H - 0.105, CARD_Y + CARD_H - 0.105],
                  color=BORDER, linewidth=1.0, transform=ax.transAxes, zorder=4)
    card_elements += sep

    # ReAct steps, stacked downwards; step_y is the top edge of the next row
    step_y = CARD_Y + CARD_H - 0.14
    step_h = 0.115
    for label, content in steps:
        # Unknown labels fall back to a neutral colour pair.
        bg_col, txt_col = STEP_COLORS.get(label, ("#2D3748", "#E2E8F0"))

        # Row background
        s_bg = mpatches.FancyBboxPatch(
            (CARD_X + 0.015, step_y - step_h + 0.01), CARD_W - 0.03, step_h - 0.015,
            boxstyle="round,pad=0.006",
            facecolor=bg_col, edgecolor="none",
            transform=ax.transAxes, zorder=4,
        )
        ax.add_patch(s_bg)
        card_elements.append(s_bg)

        # Label badge
        badge = mpatches.FancyBboxPatch(
            (CARD_X + 0.025, step_y - step_h + 0.038), 0.075, 0.038,
            boxstyle="round,pad=0.004",
            facecolor=txt_col + "33", edgecolor=txt_col, linewidth=0.8,
            transform=ax.transAxes, zorder=5,
        )
        ax.add_patch(badge)
        card_elements.append(badge)

        tl = ax.text(CARD_X + 0.063, step_y - step_h + 0.057, label,
                     transform=ax.transAxes,
                     color=txt_col, fontsize=8, fontweight="bold",
                     va="center", ha="center", zorder=6)
        # Step content
        tc = ax.text(CARD_X + 0.11, step_y - step_h + 0.057, content,
                     transform=ax.transAxes,
                     color=FG, fontsize=9, va="center", zorder=6)
        card_elements += [tl, tc]

        step_y -= step_h

    # Progress indicator (bottom right)
    prog_total = len(AGENTS_DATA)
    prog_txt = ax.text(CARD_X + CARD_W - 0.02, CARD_Y + 0.025,
                       f"Agent {agent_idx + 1}/{prog_total}",
                       transform=ax.transAxes,
                       color=GRAY, fontsize=8, ha="right", va="bottom", zorder=5)
    card_elements.append(prog_txt)

    # Progress bar: completed agents plus the fraction of the current agent's
    # steps already shown.  An agent without steps counts as finished, which
    # also avoids dividing by zero.
    step_fraction = visible_steps / total_steps if total_steps else 1.0
    bar_w = (CARD_W - 0.03) * (agent_idx + step_fraction) / prog_total
    prog_bg = mpatches.FancyBboxPatch(
        (CARD_X + 0.015, CARD_Y + 0.01), CARD_W - 0.03, 0.012,
        boxstyle="square,pad=0",
        facecolor=BORDER, edgecolor="none",
        transform=ax.transAxes, zorder=4,
    )
    prog_bar = mpatches.FancyBboxPatch(
        (CARD_X + 0.015, CARD_Y + 0.01), bar_w, 0.012,
        boxstyle="square,pad=0",
        facecolor=col, edgecolor="none", alpha=0.85,
        transform=ax.transAxes, zorder=5,
    )
    ax.add_patch(prog_bg)
    ax.add_patch(prog_bar)
    card_elements += [prog_bg, prog_bar]


# Figure title, centred at the top of the axes
ax.text(
    x=0.5,
    y=0.985,
    s="ReAct Process  —  CrewAI Pipeline  ×  llama3.1:8b",
    transform=ax.transAxes,
    ha="center",
    va="top",
    color=FG,
    fontsize=13,
    fontweight="bold",
)

# Heading above the left-hand agent list, centred over the list column
ax.text(
    x=LIST_X + LIST_W / 2,
    y=0.965,
    s="Agents",
    transform=ax.transAxes,
    ha="center",
    va="top",
    color=GRAY,
    fontsize=9,
)

def update(frame_idx):
    """Animation callback: draw the card scheduled for ``frame_idx``.

    Looks up ``(agent_idx, visible_steps)`` in ``ALL_FRAMES``, redraws the card
    with ``draw_card`` and returns the artists drawn for this frame: the card
    elements followed by the highlight patch when one is set.
    """
    shown_agent, shown_steps = ALL_FRAMES[frame_idx]
    draw_card(shown_agent, shown_steps)
    artists = card_elements + []
    if active_patch:
        artists = artists + [active_patch]
    return artists

# Make sure the destination directory exists so that saving does not fail
# with FileNotFoundError when "outputs/" has not been created yet.
os.makedirs(os.path.dirname(OUT_GIF) or ".", exist_ok=True)

# Render every scheduled frame and encode the result as a GIF.
# The interval is the per-frame delay in milliseconds, derived from FPS.
anim = FuncAnimation(fig, update, frames=len(ALL_FRAMES),
                     interval=1000 // FPS, blit=False)
anim.save(OUT_GIF, writer=PillowWriter(fps=FPS), dpi=DPI)
plt.close(fig)
print(f"ReAct GIF saved: {OUT_GIF}")