EvoGraph/evaluate.py at main · AnnaSuSu/EvoGraph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
"""
V4 评估脚本 - 简化版 (w/o LineNode)
评估检索和问答性能
消融实验：移除 LineNode 演化链
"""

import json
import time
import re
import math
import string
from pathlib import Path
from datetime import datetime
from typing import List, Dict
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from evograph_nolinenode import EvoGraphNoLineNode as EvoGraphV4
from config import DATA_PATHS, STM_CONFIG, LLM_CONFIG, LLM_PROVIDERS, get_neo4j_config


_ARTICLES_RE = re.compile(r"\b(a|an|the)\b", re.IGNORECASE)
_CATEGORY_TO_PAPER_TYPE = {
    1: "single_hop",   # dataset: Single-fact
    2: "temporal",
    3: "open_domain",  # dataset: Reasoning (paper groups differ; treat as Open Domain)
    4: "multi_hop",
    # category 5 (adversarial) excluded per Mem0 paper methodology
}
_PAPER_TYPE_ORDER = ["single_hop", "multi_hop", "temporal", "open_domain"]
_PAPER_TYPE_LABEL = {
    "single_hop": "Single Hop",
    "multi_hop": "Multi Hop",
    "temporal": "Temporal",
    "open_domain": "Open Domain",
    "unknown": "Unknown",
}


def _paper_type(category: int) -> str:
    try:
        return _CATEGORY_TO_PAPER_TYPE.get(int(category), "unknown")
    except (TypeError, ValueError):
        return "unknown"


def _normalize_answer(text: str) -> str:
    """
    SQuAD-style normalization:
    - lowercase
    - remove punctuation
    - remove articles (a/an/the)
    - collapse whitespace
    """
    text = (text or "").lower()
    text = _ARTICLES_RE.sub(" ", text)
    text = "".join(ch for ch in text if ch not in string.punctuation)
    return " ".join(text.split())


def _tokenize(text: str) -> List[str]:
    return _normalize_answer(text).split()


def _f1_score(pred: str, gold: str) -> float:
    pred_tokens = _tokenize(pred)
    gold_tokens = _tokenize(gold)
    if not pred_tokens or not gold_tokens:
        return 0.0

    pred_counts = Counter(pred_tokens)
    gold_counts = Counter(gold_tokens)
    overlap = sum((pred_counts & gold_counts).values())

    precision = overlap / len(pred_tokens)
    recall = overlap / len(gold_tokens)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)


def _bleu1_from_counts(overlap: int, cand_len: int, ref_len: int) -> float:
    if cand_len <= 0 or ref_len <= 0:
        return 0.0
    precision = overlap / cand_len
    bp = 1.0 if cand_len > ref_len else math.exp(1.0 - (ref_len / cand_len))
    return bp * precision


def compute_metrics(results: List[dict], evaluate_answers: bool = False) -> dict:
    """计算评估指标"""
    metrics = {
        "retrieval": {
            "overall": {"hit": 0, "recall": 0, "precision": 0, "mrr": 0, "count": 0},
            "by_type": {}
        },
    }

    if evaluate_answers:
        metrics["answer"] = {
            "overall": {"f1": 0, "bleu1": 0, "count": 0},
            "by_type": {}
        }
        bleu1_overall = {"overlap": 0, "cand_len": 0, "ref_len": 0}
        bleu1_by_type: Dict[str, dict] = {}

    for r in results:
        gold = set(r.get("gold_notes", []))
        retrieved = r.get("retrieved_indices", [])
        category = r.get("category", 0)

        # Skip adversarial (category 5) per Mem0 paper methodology
        if category == 5:
            continue

        cat_key = _paper_type(category)

        if not gold:
            continue

        # 计算指标
        retrieved_set = set(retrieved)
        hits = gold & retrieved_set
        hit = 1 if hits else 0
        recall = len(hits) / len(gold) if gold else 0
        precision = len(hits) / len(retrieved) if retrieved else 0

        # MRR
        mrr = 0
        for i, rid in enumerate(retrieved):
            if rid in gold:
                mrr = 1 / (i + 1)
                break

        # 更新总体指标
        metrics["retrieval"]["overall"]["hit"] += hit
        metrics["retrieval"]["overall"]["recall"] += recall
        metrics["retrieval"]["overall"]["precision"] += precision
        metrics["retrieval"]["overall"]["mrr"] += mrr
        metrics["retrieval"]["overall"]["count"] += 1

        # 更新分类指标
        if cat_key not in metrics["retrieval"]["by_type"]:
            metrics["retrieval"]["by_type"][cat_key] = {"hit": 0, "recall": 0, "count": 0}
        metrics["retrieval"]["by_type"][cat_key]["hit"] += hit
        metrics["retrieval"]["by_type"][cat_key]["recall"] += recall
        metrics["retrieval"]["by_type"][cat_key]["count"] += 1

        if evaluate_answers:
            pred_answer = r.get("answer", "")
            gold_answer = r.get("gold_answer", "")
            if gold_answer:
                f1 = _f1_score(pred_answer, gold_answer)
                pred_tokens = _tokenize(pred_answer)
                gold_tokens = _tokenize(gold_answer)
                overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())

                metrics["answer"]["overall"]["f1"] += f1
                metrics["answer"]["overall"]["count"] += 1
                bleu1_overall["overlap"] += overlap
                bleu1_overall["cand_len"] += len(pred_tokens)
                bleu1_overall["ref_len"] += len(gold_tokens)

                if cat_key not in metrics["answer"]["by_type"]:
                    metrics["answer"]["by_type"][cat_key] = {"f1": 0, "bleu1": 0, "count": 0}
                    bleu1_by_type[cat_key] = {"overlap": 0, "cand_len": 0, "ref_len": 0}
                metrics["answer"]["by_type"][cat_key]["f1"] += f1
                metrics["answer"]["by_type"][cat_key]["count"] += 1
                bleu1_by_type[cat_key]["overlap"] += overlap
                bleu1_by_type[cat_key]["cand_len"] += len(pred_tokens)
                bleu1_by_type[cat_key]["ref_len"] += len(gold_tokens)

    # 计算平均值
    count = metrics["retrieval"]["overall"]["count"]
    if count > 0:
        for key in ["hit", "recall", "precision", "mrr"]:
            metrics["retrieval"]["overall"][key] /= count

    for cat_key, cat_metrics in metrics["retrieval"]["by_type"].items():
        cat_count = cat_metrics["count"]
        if cat_count > 0:
            cat_metrics["hit"] /= cat_count
            cat_metrics["recall"] /= cat_count

    if evaluate_answers:
        a_count = metrics["answer"]["overall"]["count"]
        if a_count > 0:
            metrics["answer"]["overall"]["f1"] /= a_count
        metrics["answer"]["overall"]["bleu1"] = _bleu1_from_counts(
            bleu1_overall["overlap"],
            bleu1_overall["cand_len"],
            bleu1_overall["ref_len"],
        )
        for cat_key, cat_metrics in metrics["answer"]["by_type"].items():
            cat_count = cat_metrics["count"]
            if cat_count > 0:
                cat_metrics["f1"] /= cat_count
            bleu_counts = bleu1_by_type.get(cat_key, {"overlap": 0, "cand_len": 0, "ref_len": 0})
            cat_metrics["bleu1"] = _bleu1_from_counts(
                bleu_counts["overlap"],
                bleu_counts["cand_len"],
                bleu_counts["ref_len"],
            )

    return metrics


def evaluate(num_queries: int = None, generate_answer: bool = False, conversation_id: str = "conv-26") -> dict:
    """
    运行评估

    Args:
        num_queries: 评估的查询数量（None 表示全部）
        generate_answer: 是否生成答案
        conversation_id: 对话ID（例如 conv-26, conv-30）
    """
    print("=" * 60)
    print(f"EvoGraph V2 Evaluation - {conversation_id}")
    print("=" * 60)

    # 计算文件名（和 eval_results 一致）
    script_dir = Path(__file__).parent
    provider = LLM_CONFIG["active_provider"]
    model_name = LLM_PROVIDERS[provider]["model"].split("/")[-1]  # Qwen/Qwen3-8B -> Qwen3-8B
    temp_str = str(LLM_CONFIG["temperature"]).replace(".", "_")
    date_str = datetime.now().strftime("%m%d")  # 月日，不含年份

    # 找同日期同模型同温度最大 id（和 eval_results 同步）
    import glob
    output_dir = script_dir / "eval_results"
    output_dir.mkdir(exist_ok=True)
    pattern = str(output_dir / f"{date_str}_{model_name}_{temp_str}_*.json")
    existing = glob.glob(pattern)
    max_id = 0
    for f in existing:
        try:
            file_id = int(Path(f).stem.split("_")[-1])
            max_id = max(max_id, file_id)
        except ValueError:
            pass
    next_id = max_id + 1

    # 创建对应的 debug 日志文件（放在 eval_results 目录）
    debug_log = output_dir / f"debug_log_{date_str}_{model_name}_{temp_str}_{next_id:03d}.txt"
    debug_log.write_text("", encoding="utf-8")
    print(f"[Eval] Debug log: {debug_log}")

    # 加载 construction token usage（如果存在）
    construction_usage_file = script_dir / f"construction_usage_{conversation_id}.json"
    construction_usage = None
    construction_cost = None
    if construction_usage_file.exists():
        with open(construction_usage_file, "r", encoding="utf-8") as f:
            construction_data = json.load(f)
            construction_usage = construction_data.get("token_usage", {})
            construction_cost = construction_data.get("cost", {})
        print(f"[Eval] Loaded construction usage from {construction_usage_file}")
    else:
        print(f"[Eval] No construction usage file found (expected: {construction_usage_file})")

    # 加载 benchmark
    benchmark_file = script_dir / f"../../datasets/locomo10_split/locomo_{conversation_id}_benchmark.json"

    if not benchmark_file.exists():
        print(f"[ERROR] Benchmark file not found: {benchmark_file}")
        print(f"[TIP] Available conversations: conv-26, conv-30, conv-41, conv-42, conv-43, conv-44, conv-47, conv-48, conv-49, conv-50")
        return {}

    with open(benchmark_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    queries = data.get("queries", [])

    # Filter out category 5 (adversarial) before evaluation to save costs
    original_count = len(queries)
    queries = [q for q in queries if q.get("category") != 5]
    filtered_count = original_count - len(queries)
    if filtered_count > 0:
        print(f"[Eval] Filtered out {filtered_count} category 5 (adversarial) queries")

    if num_queries:
        queries = queries[:num_queries]

    print(f"\n[Eval] Evaluating {len(queries)} queries...")
    if not queries:
        print("[Eval][ERROR] No queries found in benchmark file (expected key: 'queries').")
        return {}

    # 获取对应的数据库配置
    db_config = get_neo4j_config(conversation_id)
    print(f"[Eval] Using database: {db_config['uri']}")

    # 获取图谱统计信息（用一个临时实例，不记录debug）
    temp_evo = EvoGraphV4(debug_log_path=None, neo4j_config=db_config)
    stats = temp_evo.get_stats()
    print(f"[Eval] Graph stats: {stats}")
    temp_evo.close()

    # 设置并发数（默认6个线程）
    max_workers = 6
    print(f"[Eval] Using {max_workers} parallel workers")

    # 在主线程创建共享的 EmbeddingManager（避免多线程同时加载模型冲突）
    from embedding_manager import EmbeddingManager
    from config import EMBEDDING_CONFIG

    shared_embedding_manager = None
    if EMBEDDING_CONFIG.get("use_embedding", True):
        print("[Eval] Loading shared embedding model...")
        shared_embedding_manager = EmbeddingManager(
            model_path=EMBEDDING_CONFIG["model_path"],
            similarity_threshold=EMBEDDING_CONFIG["similarity_threshold"]
        )
        print("[Eval] Shared embedding model loaded ✓")

    # 并发处理函数
    def process_query(i: int, q: dict):
        """处理单个查询（每个线程独立创建 EvoGraph 实例）"""
        query_id = q.get("id", f"Q{i+1}")
        query_text = q.get("query", "")
        gold_notes = q.get("gold_notes", [])
        gold_answer = q.get("gold_answer", "")
        category = q.get("category", 0)

        # 每个线程独立创建 EvoGraph 实例和debug文件（避免并发冲突）
        # 但共享 embedding_manager（避免重复加载模型）
        thread_log = output_dir / f"debug_{next_id:03d}_thread{i:03d}.txt"
        evo = EvoGraphV4(
            debug_log_path=thread_log,
            embedding_manager=shared_embedding_manager,
            neo4j_config=db_config
        )

        start = time.time()
        result = evo.answer(query_text, generate=generate_answer)
        elapsed = time.time() - start

        evo.close()

        return {
            "index": i,
            "id": query_id,
            "query": query_text,
            "category": category,
            "gold_notes": gold_notes,
            "gold_answer": gold_answer,
            "retrieved_indices": result["retrieved_ids"],
            "answer": result.get("answer", ""),
            "elapsed": elapsed
        }

    # 并发执行
    results = []
    total_time = 0
    completed_count = 0

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # 提交所有任务
        future_to_query = {executor.submit(process_query, i, q): i for i, q in enumerate(queries)}

        # 收集结果
        for future in as_completed(future_to_query):
            result = future.result()
            results.append(result)
            total_time += result["elapsed"]
            completed_count += 1

            if completed_count % 10 == 0 or completed_count == len(queries):
                print(f"[Eval] Completed {completed_count}/{len(queries)} queries...")

    # 按原始顺序排序结果
    results.sort(key=lambda x: x["index"])

    # 移除 index 字段（仅用于排序）
    for r in results:
        del r["index"]

    # 合并所有线程的debug日志
    print(f"\n[Eval] Merging debug logs...")
    try:
        with open(debug_log, "w", encoding="utf-8") as outfile:
            for i in range(len(queries)):
                thread_log = output_dir / f"debug_{next_id:03d}_thread{i:03d}.txt"
                if thread_log.exists():
                    with open(thread_log, "r", encoding="utf-8") as infile:
                        outfile.write(infile.read())
                    thread_log.unlink()  # 删除临时文件
        print(f"[Eval] Debug logs merged to {debug_log}")
    except Exception as e:
        print(f"[Eval] Warning: Failed to merge debug logs: {e}")

    # 获取 token 使用统计（从 LLM 单例获取）
    from llm_client import get_llm_client
    llm = get_llm_client()
    answer_usage = llm.get_answer_usage()
    total_usage = llm.get_total_usage()

    # 计算指标
    metrics = compute_metrics(results, evaluate_answers=generate_answer)

    # 输出结果
    print("\n" + "=" * 60)
    print("Results")
    print("=" * 60)
    print(f"\nOverall Retrieval Metrics:")
    print(f"  Hit@{STM_CONFIG['top_k']}: {metrics['retrieval']['overall']['hit']:.4f}")
    print(f"  Recall@{STM_CONFIG['top_k']}: {metrics['retrieval']['overall']['recall']:.4f}")
    print(f"  Precision@{STM_CONFIG['top_k']}: {metrics['retrieval']['overall']['precision']:.4f}")
    print(f"  MRR: {metrics['retrieval']['overall']['mrr']:.4f}")

    if generate_answer and "answer" in metrics:
        print("\nOverall Answer Metrics:")
        print(f"  F1: {metrics['answer']['overall']['f1']:.4f}")
        print(f"  BLEU-1: {metrics['answer']['overall']['bleu1']:.4f}")

    print("\nBy Type (paper grouping):")
    for cat in _PAPER_TYPE_ORDER + ["unknown"]:
        if cat not in metrics["retrieval"]["by_type"]:
            continue
        cat_metrics = metrics["retrieval"]["by_type"][cat]
        label = _PAPER_TYPE_LABEL.get(cat, cat)
        line = f"  {label}: Hit={cat_metrics['hit']:.4f}, Recall={cat_metrics['recall']:.4f}, Count={cat_metrics['count']}"
        if generate_answer and "answer" in metrics and cat in metrics["answer"]["by_type"]:
            a = metrics["answer"]["by_type"][cat]
            line += f", F1={a['f1']:.4f}, BLEU-1={a['bleu1']:.4f}"
        print(line)

    print(f"\nTotal time: {total_time:.2f}s")
    print(f"Avg time per query: {total_time/len(queries):.2f}s")

    # 成本计算（基于所有 LLM 调用）
    query_cost_data = None
    total_cost_data = None

    if total_usage["call_count"] > 0:
        from llm_client import TOKEN_PRICES

        # Query Cost (仅评估阶段，不含构建)
        query_input_cost = total_usage['prompt_tokens'] / 1_000_000 * TOKEN_PRICES["input"]
        query_output_cost = total_usage['completion_tokens'] / 1_000_000 * TOKEN_PRICES["output"]
        query_total_cost = query_input_cost + query_output_cost

        query_cost_data = {
            "input": query_input_cost,
            "output": query_output_cost,
            "total": query_total_cost,
            "per_1k_queries": query_total_cost / len(queries) * 1000
        }

        print(f"\n" + "=" * 60)
        print("Cost Analysis (gpt-4o-mini)")
        print("=" * 60)
        print(f"\n1. Query Cost (Rerank + Answer Generation)")
        print(f"   Input:  ${query_input_cost:.4f}")
        print(f"   Output: ${query_output_cost:.4f}")
        print(f"   Total:  ${query_total_cost:.4f}")
        print(f"   Per 1k queries: ${query_cost_data['per_1k_queries']:.4f}")
        print(f"   [Compare with SYNAPSE Table 4: $0.24/1k]")

        # Total Cost (构建 + 评估，按本次 benchmark queries 摊销)
        if construction_cost:
            total_input_cost = construction_cost["input"] + query_input_cost
            total_output_cost = construction_cost["output"] + query_output_cost
            total_combined_cost = total_input_cost + total_output_cost

            total_cost_data = {
                "construction": construction_cost["total"],
                "query": query_total_cost,
                "total": total_combined_cost,
                "per_1k_queries": total_combined_cost / len(queries) * 1000
            }

            print(f"\n2. Total Cost (Construction + Query, amortized over {len(queries)} queries)")
            print(f"   Construction: ${construction_cost['total']:.4f}")
            print(f"   Query:        ${query_total_cost:.4f}")
            print(f"   Total:        ${total_combined_cost:.4f}")
            print(f"   Per 1k queries: ${total_cost_data['per_1k_queries']:.4f}")
            print(f"   [Note: Construction cost amortized over benchmark queries only]")
            print(f"   [In production, construction cost would be negligible per-query]")
        else:
            print(f"\n2. Total Cost: Not available (no construction usage file)")

        print("=" * 60)

    # 保存结果（使用前面计算的 next_id）
    output = {
        "config": {
            "num_queries": len(queries),
            "generate_answer": generate_answer,
            "top_k": STM_CONFIG["top_k"],
            "graph_stats": stats,
            "llm_provider": provider,
            "model": LLM_PROVIDERS[provider]["model"],
            "temperature": LLM_CONFIG["temperature"],
            "benchmark_file": str(benchmark_file.name),
            "conversation_id": conversation_id,
        },
        "metrics": metrics,
        "total_time": total_time,
        "token_usage": {
            "total": total_usage,
            "answer_only": answer_usage,
            "construction": construction_usage,
        },
        "cost_analysis": {
            "query_cost": query_cost_data,
            "total_cost": total_cost_data,
        },
        "results": results
    }

    output_file = output_dir / f"{date_str}_{model_name}_{temp_str}_{next_id:03d}.json"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(output, f, indent=2, ensure_ascii=False)
    print(f"\n[Eval] Results saved to {output_file}")

    return output


def analyze_errors(results_file: str):
    """分析错误案例"""
    with open(results_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    print("=" * 60)
    print("Error Analysis")
    print("=" * 60)

    results = data.get("results", [])
    errors = []

    for r in results:
        gold = set(r.get("gold_notes", []))
        retrieved = set(r.get("retrieved_indices", []))
        if not gold & retrieved:
            errors.append(r)

    print(f"\nTotal errors (no hit): {len(errors)}/{len(results)}")

    for e in errors[:10]:
        print(f"\n[{e['id']}] Category {e['category']}")
        print(f"  Query: {e['query']}")
        print(f"  Gold: {e['gold_notes']}")
        print(f"  Retrieved: {e['retrieved_indices']}")


if __name__ == "__main__":
    import sys
    import argparse

    if len(sys.argv) > 1 and sys.argv[1] == "analyze":
        if len(sys.argv) > 2:
            analyze_errors(sys.argv[2])
        else:
            print("Usage: python evaluate.py analyze <results_file.json>")
    else:
        parser = argparse.ArgumentParser(description="Evaluate EvoGraphV4 on LoCoMo benchmark")
        parser.add_argument("action", nargs="?", default="eval",
                            help="Action: 'gen' to generate answers, or omit for retrieval only")
        parser.add_argument("--conversation", "-c", type=str, default="conv-26",
                            help="Conversation ID (e.g., conv-26, conv-30)")
        parser.add_argument("--num-queries", "-n", type=int, default=None,
                            help="Number of queries to evaluate (default: all)")

        args = parser.parse_args()

        generate_answer = args.action in {"gen", "generate"}
        evaluate(num_queries=args.num_queries, generate_answer=generate_answer,
                conversation_id=args.conversation)