EvoGraph/run_all_evals.py at main · AnnaSuSu/EvoGraph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
"""
批量评估 - 仅运行 evaluate，不重新导入
适用于已经通过 batch_import.py 导入数据的情况
"""

import subprocess
import sys
from pathlib import Path

# Conversation IDs passed to evaluate.py, one evaluation run per entry.
SPLITS = [
    # Batch 1 - already completed:
    # "conv-26", "conv-30", "conv-41", "conv-42", "conv-43",
    # Batch 2
    "conv-44",
    "conv-47",
    "conv-48",
    "conv-49",
    "conv-50",
]


def run_command(cmd: list) -> bool:
    """Run *cmd* as a child process and report whether it succeeded.

    The child's output is not captured, so it streams straight to this
    process's stdout/stderr while the command runs.

    Args:
        cmd: Argument vector handed to ``subprocess.run`` (program first).

    Returns:
        True when the child exits with status 0. False on a non-zero exit
        status or when launching the command raises; in the latter case the
        exception is printed rather than propagated.
    """
    try:
        # check=False: a non-zero exit is reported via the return value
        # instead of raising CalledProcessError.
        completed = subprocess.run(
            cmd,
            text=True,
            capture_output=False,
            check=False,
        )
    except Exception as exc:
        print(f"[ERROR] Command failed: {exc}")
        return False

    return completed.returncode == 0


def main() -> int:
    """Evaluate every entry of ``SPLITS`` in order and print a summary.

    For each conversation ID this runs ``evaluate.py gen -c <conv_id>`` with
    the current interpreter via ``run_command``. A failed evaluation is
    recorded and the loop continues, so one failure does not abort the batch.

    Returns:
        0 if every evaluation succeeded, 1 if at least one failed. The value
        is meant to be used as the process exit status so that shells, CI
        jobs and wrapper scripts can detect a failed batch.
    """
    banner = "=" * 60
    total = len(SPLITS)

    print(banner)
    print("Running evaluations (without import)")
    print(banner)
    print()

    failed_splits = []

    for i, conv_id in enumerate(SPLITS, 1):
        print()
        print(banner)
        print(f"[{i}/{total}] Evaluating {conv_id}")
        print(banner)
        print()

        eval_success = run_command(
            [sys.executable, "evaluate.py", "gen", "-c", conv_id]
        )

        if not eval_success:
            print(f"[ERROR] Evaluation failed for {conv_id}")
            failed_splits.append(conv_id)
        else:
            print()
            print(f"[Done] {conv_id} completed ✓")

        print()

    # Summary
    print()
    print(banner)
    print("Summary")
    print(banner)
    # Every split either succeeded or was appended to failed_splits, so the
    # success count is derived instead of tracked with a separate counter.
    print(f"Successful: {total - len(failed_splits)}/{total}")

    if failed_splits:
        print("\nFailed splits:")
        for split in failed_splits:
            print(f"  - {split}")
        return 1

    print("\nAll evaluations completed successfully! ✓")
    return 0


if __name__ == "__main__":
    # Propagate the batch result as the exit status (non-zero on any failure).
    sys.exit(main())