-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevals.py
More file actions
86 lines (67 loc) · 2.7 KB
/
evals.py
File metadata and controls
86 lines (67 loc) · 2.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""Evaluation suite for parallel research agents.
Three evaluators:
- coverage: Did the synthesizer mention required terms?
- source_diversity: Is the fan-out actually working?
- quality: LLM-as-judge for completeness, accuracy, and tone.
"""
from langsmith import Client, evaluate
from openevals.llm import create_llm_as_judge
from parallel_agents import graph
ls_client = Client()

# --- Dataset ---
DATASET_NAME = "research-agent-evals"

# Seed examples: each input question is paired with the terms a good
# synthesized answer is expected to mention (consumed by the `coverage`
# evaluator as `must_mention`).
_SEED_INPUTS = [
    {"question": "What is our refund policy for enterprise clients?"},
    {"question": "How does GDPR affect our data pipeline architecture?"},
    {"question": "What competitors launched AI features last quarter?"},
]
_SEED_OUTPUTS = [
    {"must_mention": ["refund", "enterprise", "policy"]},
    {"must_mention": ["GDPR", "data", "compliance"]},
    {"must_mention": ["competitor", "AI", "feature"]},
]

# Create the dataset only on first run; later runs find it and reuse it.
if not ls_client.has_dataset(dataset_name=DATASET_NAME):
    new_dataset = ls_client.create_dataset(
        dataset_name=DATASET_NAME,
        description="Parallel research agent evaluation dataset",
    )
    ls_client.create_examples(
        dataset_id=new_dataset.id,
        inputs=_SEED_INPUTS,
        outputs=_SEED_OUTPUTS,
    )
# --- Evaluators ---
# Judge prompt: {inputs}/{outputs} are single-brace placeholders filled in
# by openevals at evaluation time; the doubled braces on the last line
# escape to literal braces, so the model sees a JSON template to imitate.
QUALITY_PROMPT = """\
Customer query: {inputs}
AI response: {outputs}
Rate 0.0-1.0 on completeness, accuracy, and tone.
Return ONLY: {{"score": <float>, "reasoning": "<explanation>"}}"""
# LLM-as-judge evaluator; scores land in LangSmith under the "quality" key.
quality_judge = create_llm_as_judge(
    prompt=QUALITY_PROMPT,
    model="anthropic:claude-sonnet-4-5-20250929",
    feedback_key="quality",
    # continuous=True: emit any float in [0, 1] rather than binary pass/fail.
    continuous=True,
)
def coverage(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Score the fraction of required terms present in the final response.

    Terms come from the example's `must_mention` list; matching is
    case-insensitive substring containment. An empty requirement list
    scores a perfect 1.0.
    """
    response = outputs.get("final_response", "").lower()
    required = reference_outputs.get("must_mention", [])
    if not required:
        return {"key": "coverage", "score": 1.0}
    matched = [term for term in required if term.lower() in response]
    return {"key": "coverage", "score": len(matched) / len(required)}
def source_diversity(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Is the fan-out actually working, or did it silently degrade?

    Counts distinct `source` values across the research results and scores
    them against an expected minimum of two sources, capped at 1.0.
    """
    results = outputs.get("research_results", [])
    # Use .get() so a malformed result dict (missing "source") lowers the
    # score instead of raising KeyError and killing the whole eval run —
    # consistent with the existing isinstance guard for non-dict entries.
    sources = {r.get("source") for r in results if isinstance(r, dict)} - {None}
    return {"key": "source_diversity", "score": min(len(sources) / 2.0, 1.0)}
def target(inputs: dict) -> dict:
    """Run one dataset example through the research graph and return its state."""
    payload = {"question": inputs["question"]}
    return graph.invoke(payload)
if __name__ == "__main__":
    # Run the experiment: every dataset example flows through `target`,
    # then each evaluator scores the output; results land in LangSmith.
    # The return value was previously bound to an unused `results` local;
    # the dead binding is dropped.
    evaluate(
        target,
        data=DATASET_NAME,
        evaluators=[quality_judge, coverage, source_diversity],
        experiment_prefix="parallel-research-v1",
        # Fan out up to 4 examples at a time.
        max_concurrency=4,
    )
    print("\nEvaluation complete. Check LangSmith for results.")