-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathevals.py
More file actions
101 lines (82 loc) · 3.53 KB
/
evals.py
File metadata and controls
101 lines (82 loc) · 3.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""Evals for the streaming research agent."""
from langsmith import Client, evaluate
from openevals.llm import create_llm_as_judge
from streaming_agent import graph
# LangSmith client used for dataset management below.
ls_client = Client()

DATASET_NAME = "streaming-agent-evals"

# Seed the evaluation dataset exactly once; later runs find it and skip.
if not ls_client.has_dataset(dataset_name=DATASET_NAME):
    eval_dataset = ls_client.create_dataset(
        dataset_name=DATASET_NAME,
        description="Streaming research agent evaluation dataset",
    )
    questions = [
        "What are the tradeoffs between REST and GraphQL?",
        "How do vector databases enable semantic search?",
        "What is retrieval-augmented generation?",
    ]
    required_terms = [
        ["REST", "GraphQL", "tradeoff"],
        ["vector", "embedding", "similarity"],
        ["retrieval", "generation", "context"],
    ]
    # Pair each question with the terms its answer must mention (used by
    # the `coverage` evaluator as reference outputs).
    ls_client.create_examples(
        dataset_id=eval_dataset.id,
        inputs=[{"question": q} for q in questions],
        outputs=[{"must_mention": terms} for terms in required_terms],
    )
# Prompt template for the LLM judge. LangSmith substitutes the example inputs
# and the target's outputs into {inputs} / {outputs}; the doubled braces render
# as a literal JSON object in the final prompt.
QUALITY_PROMPT = """\
User question: {inputs}
Agent response: {outputs}
Rate 0.0-1.0 on completeness, accuracy, and clarity.
Return ONLY: {{"score": <float>, "reasoning": "<explanation>"}}"""

# LLM-as-judge evaluator emitting a continuous 0.0-1.0 score under the
# "quality" feedback key in LangSmith.
quality_judge = create_llm_as_judge(
    prompt=QUALITY_PROMPT,
    model="anthropic:claude-sonnet-4-5-20250929",
    feedback_key="quality",
    continuous=True,
)
def coverage(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Score the fraction of required terms present in the answer.

    Performs a case-insensitive substring check of each term from the
    reference's ``must_mention`` list against ``outputs["answer"]``.
    Returns a LangSmith feedback dict keyed ``"coverage"``; an empty
    requirement list scores 1.0.
    """
    answer = outputs.get("answer", "").lower()
    required = reference_outputs.get("must_mention", [])
    if not required:
        return {"key": "coverage", "score": 1.0}
    found = [term for term in required if term.lower() in answer]
    return {"key": "coverage", "score": len(found) / len(required)}
def stream_completeness(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Measure word-level overlap between the streamed answer and a fresh invoke.

    Re-runs the graph non-streaming on the same question, then computes what
    fraction of the baseline answer's vocabulary (lowercased, whitespace-split)
    also occurs in the streamed answer. An empty baseline scores 1.0.

    NOTE(review): the fresh invoke is a second model call, so this comparison
    is only as stable as the model's determinism — confirm that is acceptable.
    """
    streamed_text = outputs.get("answer", "")
    baseline_text = graph.invoke({"question": inputs["question"]}).get("answer", "")
    baseline_vocab = set(baseline_text.lower().split())
    if not baseline_vocab:
        # Nothing to compare against; treat the stream as fully complete.
        return {"key": "stream_completeness", "score": 1.0}
    streamed_vocab = set(streamed_text.lower().split())
    ratio = len(streamed_vocab & baseline_vocab) / len(baseline_vocab)
    return {"key": "stream_completeness", "score": min(ratio, 1.0)}
def custom_events_emitted(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Verify the agent emitted custom progress events for both pipeline steps.

    Scores the fraction of the expected step names ("research", "synthesize")
    found in the ``step`` field of the collected custom-event dicts; non-dict
    events are ignored.
    """
    expected = {"research", "synthesize"}
    observed = set()
    for event in outputs.get("custom_events", []):
        if isinstance(event, dict):
            observed.add(event.get("step"))
    return {"key": "custom_events", "score": len(observed & expected) / len(expected)}
def target(inputs: dict) -> dict:
    """Run the agent in streaming mode and collect outputs for evaluation.

    Streams the graph over the example's question, gathering:
      - ``"answer"``: concatenation of text streamed in "messages" mode
      - ``"custom_events"``: every payload emitted in "custom" mode

    Returns a dict with those two keys, consumed by the evaluators above.
    """
    custom_events: list = []
    answer_chunks: list[str] = []
    # "updates" mode was removed from the original request: its chunks were
    # never consumed, so requesting it only added streaming overhead.
    for mode, chunk in graph.stream(
        {"question": inputs["question"]},
        stream_mode=["custom", "messages"],
    ):
        if mode == "custom":
            custom_events.append(chunk)
        elif mode == "messages":
            message_chunk, _metadata = chunk
            content = getattr(message_chunk, "content", None)
            if not content:
                continue
            if isinstance(content, str):
                answer_chunks.append(content)
            else:
                # Chat models (e.g. Anthropic) may emit content as a list of
                # blocks; appending the list raw would make "".join raise
                # TypeError. Keep only the text portions.
                for part in content:
                    if isinstance(part, str):
                        answer_chunks.append(part)
                    elif isinstance(part, dict) and part.get("type") == "text":
                        answer_chunks.append(part.get("text", ""))
    return {
        # "".join([]) is already "", so no empty-list guard is needed.
        "answer": "".join(answer_chunks),
        "custom_events": custom_events,
    }
if __name__ == "__main__":
    # Run the experiment: each dataset example is fed through `target`, then
    # scored by the four evaluators; results land in LangSmith under the
    # "streaming-agent-v1" experiment prefix, up to 4 examples concurrently.
    results = evaluate(
        target,
        data=DATASET_NAME,
        evaluators=[quality_judge, coverage, stream_completeness, custom_events_emitted],
        experiment_prefix="streaming-agent-v1",
        max_concurrency=4,
    )
    print("\nEvaluation complete. Check LangSmith for results.")