"""Evals for the human-in-the-loop complaint handler."""
from langsmith import Client, evaluate
from openevals.llm import create_llm_as_judge
from complaint_handler import graph
ls_client = Client()
DATASET_NAME = "complaint-handler-evals"
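
# Seed the dataset once; subsequent runs reuse the existing examples.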
if not ls_client.has_dataset(dataset_name=DATASET_NAME):
    dataset = ls_client.create_dataset(
        dataset_name=DATASET_NAME,
        description="Human-in-the-loop complaint handler evaluation dataset",
    )
    ls_client.create_examples(
        dataset_id=dataset.id,
        inputs=[
            {
                "complaint": "Charged twice for order #A-1234. Want a refund.",
                "customer_id": "cust_001",
            },
            {
                "complaint": "App crashes every time I open the settings page.",
                "customer_id": "cust_002",
            },
            {
                "complaint": "Your CEO's tweet was offensive. Cancelling my account.",
                "customer_id": "cust_003",
            },
        ],
        outputs=[
            {
                "must_mention": ["refund", "order", "A-1234"],
                "risk": "high",
            },
            {
                "must_mention": ["crash", "settings", "investigating"],
                "risk": "medium",
            },
            {
                "must_mention": ["feedback", "understand", "account"],
                "risk": "high",
            },
        ],
    )
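
# Rubric for the LLM-as-judge evaluator. `continuous=True` asks the judge
# for a float score rather than a pass/fail grade.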
DRAFT_QUALITY_PROMPT = """\
Customer complaint: {inputs}
AI draft response: {outputs}
Rate 0.0-1.0 on empathy, accuracy, and professionalism.
Deduct points if the draft promises specific remedies (refunds, credits)
without explicit authorization.
Return ONLY: {{"score": <float>, "reasoning": "<explanation>"}}"""

draft_judge = create_llm_as_judge(
    prompt=DRAFT_QUALITY_PROMPT,
    model="anthropic:claude-sonnet-4-5-20250929",
    feedback_key="draft_quality",
    continuous=True,
)
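

# Deterministic code evaluators that complement the LLM judge with exact checks.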
def coverage(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Did the draft actually address the complaint specifics?"""
    text = outputs.get("draft_response", "").lower()
    must_mention = reference_outputs.get("must_mention", [])
    hits = sum(1 for t in must_mention if t.lower() in text)
    return {"key": "coverage", "score": hits / len(must_mention) if must_mention else 1.0}


def no_unauthorized_promises(inputs: dict, outputs: dict, reference_outputs: dict) -> dict:
    """Did the draft promise refunds or credits without authorization?"""
    text = outputs.get("draft_response", "").lower()
    dangerous_phrases = ["refund has been", "credit has been", "we will refund",
                         "we will credit", "compensation of"]
    violations = sum(1 for p in dangerous_phrases if p in text)
    return {"key": "no_unauthorized_promises", "score": 1.0 if violations == 0 else 0.0}


def target(inputs: dict) -> dict:
    """Run the graph until the interrupt (draft phase only)."""
    config = {"configurable": {"thread_id": f"eval-{inputs['customer_id']}"}}
    result = graph.invoke(inputs, config=config)
    return {"draft_response": result.get("draft_response", "")}
if __name__ == "__main__":
    results = evaluate(
        target,
        data=DATASET_NAME,
        evaluators=[draft_judge, coverage, no_unauthorized_promises],
        experiment_prefix="complaint-handler-v1",
        max_concurrency=4,
    )
    print("\nEvaluation complete. Check LangSmith for results.")