eval.py
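"""LLM-as-judge evaluation for the triage_and_answer chain.

Runs the chain against a LangSmith dataset and scores each run with GPT-4 judges
for classification accuracy, response quality, and retrieval quality.
"""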
from langsmith import Client
from chains import triage_and_answer
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
import os
import sys
from langsmith.evaluation import EvaluationResult, RunEvaluator, EvaluationResults
# Read local .env file
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())
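# Assumption: the .env file also supplies OPENAI_API_KEY for the GPT-4 judge models below,
# alongside the LangSmith variables that are checked explicitly in __main__.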
# ===== Evaluation Schemas =====
class SimpleJudgeEvaluation(BaseModel):
    score: float = Field(description="Overall score from 0-1")
    explanation: str = Field(description="Step-by-step explanation of the score")


class RetrievalQualityEvaluation(BaseModel):
    relevance_score: float = Field(description="Score from 0-1 for relevance of retrieved documents")
    coverage_score: float = Field(description="Score from 0-1 for coverage of required information")
    explanation: str = Field(description="Explanation of the scores")


class IssueTypeJudgeEvaluation(BaseModel):
    score: float = Field(description="Score from 0-1 for issue type classification accuracy")
    explanation: str = Field(description="Explanation of the score")


class SeverityJudgeEvaluation(BaseModel):
    score: float = Field(description="Score from 0-1 for severity accuracy")
    explanation: str = Field(description="Explanation of the score")


class ResponseActionJudgeEvaluation(BaseModel):
    score: float = Field(description="Score from 0-1 for response action accuracy")
    explanation: str = Field(description="Explanation of the score")
# ===== Evaluation Prompts =====
ISSUE_TYPE_JUDGE_PROMPT = """
You are evaluating the accuracy of an issue type classification for a customer support issue.
Expected issue type: {expected}
Predicted issue type: {predicted}
Score from 0-1 where:
1.0 = Perfect match
0.0 = Completely wrong
Explain your reasoning step by step.
"""
SEVERITY_JUDGE_PROMPT = """
You are evaluating the accuracy of a severity classification for a customer support issue.
Expected severity: {expected}
Predicted severity: {predicted}
Score from 0-1 where:
1.0 = Perfect match
0.0 = Completely wrong
Explain your reasoning step by step.
"""
RESPONSE_ACTION_JUDGE_PROMPT = """
You are evaluating whether the response action correctly addresses the issue.
Expected issue type: {expected_issue_type}
Predicted issue type: {predicted_issue_type}
Expected severity: {expected_severity}
Predicted severity: {predicted_severity}
Response: {answer}
Score from 0-1 where:
1.0 = Response action is fully appropriate
0.0 = Response action is completely inappropriate
Explain your reasoning step by step.
"""
TONE_JUDGE_PROMPT = """
You are evaluating the tone of a customer support response.
Consider professionalism, empathy, clarity, and positivity.
Score from 0-1 where:
1.0 = Perfect professional tone
0.0 = Unprofessional or inappropriate tone
Explain your reasoning step by step.
"""
COMPLETENESS_JUDGE_PROMPT = """
You are evaluating the completeness of a customer support response.
Consider technical details, explanation quality, next steps, and references.
Score from 0-1 where:
1.0 = Complete response with all necessary information
0.0 = Incomplete or missing critical information
Special instruction: If 'Relevant docs retrieved' is NO, automatically score 0.
Explain your reasoning step by step.
"""
TECHNICAL_ACCURACY_JUDGE_PROMPT = """
You are evaluating the technical accuracy of a customer support response.
Consider code references, documentation, and technical terminology.
Score from 0-1 where:
1.0 = Technically accurate in all aspects
0.0 = Contains technical inaccuracies
Special instruction: If 'Relevant docs retrieved' is NO, automatically score 0.
Explain your reasoning step by step.
"""
RETRIEVAL_QUALITY_JUDGE_PROMPT = """
You are evaluating the quality of document retrieval for a customer support issue.
Issue: {issue_text}
Retrieved Documents: {retrieved_docs}
Score from 0-1 for:
1. Relevance: How relevant are the retrieved documents to the issue? Consider:
- Are the sources directly addressing the issue?
- Do the sources contain specific, actionable information?
- Are the sources up-to-date and appropriate for the issue type?
2. Coverage: Do the retrieved documents cover all necessary information? Consider:
- Are all aspects of the issue addressed?
- Are there any critical gaps in the information?
- Are there multiple sources providing complementary information?
Note: If the answer indicates no documents were retrieved ("I don't have enough information" or "No relevant documentation found"),
score both relevance and coverage as 0.
Explain your reasoning step by step.
"""
# ===== Classification Evaluators =====
# These evaluators assess how well the system categorizes the issue
class IssueTypeEvaluator(RunEvaluator):
    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.parser = PydanticOutputParser(pydantic_object=IssueTypeJudgeEvaluation)

    def evaluate_run(self, run, example, **kwargs):
        predicted = run.outputs.get("issue_type", "")
        expected = example.outputs.get("issue_type", "")
        prompt = ISSUE_TYPE_JUDGE_PROMPT.format(expected=expected, predicted=predicted) + "\n" + self.parser.get_format_instructions()
        result = self.llm.invoke(prompt)
        evaluation = self.parser.parse(result.content)
        return EvaluationResult(
            key="issue_type_accuracy",
            score=evaluation.score,
            comment=evaluation.explanation,
            evaluation_type="llm_judge"
        )


class SeverityEvaluator(RunEvaluator):
    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.parser = PydanticOutputParser(pydantic_object=SeverityJudgeEvaluation)

    def evaluate_run(self, run, example, **kwargs):
        predicted = run.outputs.get("severity", "")
        expected = example.outputs.get("severity", "")
        prompt = SEVERITY_JUDGE_PROMPT.format(expected=expected, predicted=predicted) + "\n" + self.parser.get_format_instructions()
        result = self.llm.invoke(prompt)
        evaluation = self.parser.parse(result.content)
        return EvaluationResult(
            key="severity_accuracy",
            score=evaluation.score,
            comment=evaluation.explanation,
            evaluation_type="llm_judge"
        )
# ===== Response Quality Evaluators =====
# These evaluators assess the quality and appropriateness of the system's response
class ResponseActionEvaluator(RunEvaluator):
    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.parser = PydanticOutputParser(pydantic_object=ResponseActionJudgeEvaluation)

    def evaluate_run(self, run, example, **kwargs):
        answer = run.outputs.get("answer", "")
        predicted_issue_type = run.outputs.get("issue_type", "")
        predicted_severity = run.outputs.get("severity", "")
        expected_issue_type = example.outputs.get("issue_type", "")
        expected_severity = example.outputs.get("severity", "")
        prompt = RESPONSE_ACTION_JUDGE_PROMPT.format(
            expected_issue_type=expected_issue_type,
            predicted_issue_type=predicted_issue_type,
            expected_severity=expected_severity,
            predicted_severity=predicted_severity,
            answer=answer
        ) + "\n" + self.parser.get_format_instructions()
        result = self.llm.invoke(prompt)
        evaluation = self.parser.parse(result.content)
        return EvaluationResult(
            key="response_action_accuracy",
            score=evaluation.score,
            comment=evaluation.explanation,
            evaluation_type="llm_judge"
        )


class ToneAppropriatenessEvaluator(RunEvaluator):
    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.parser = PydanticOutputParser(pydantic_object=SimpleJudgeEvaluation)

    def evaluate_run(self, run, example, **kwargs):
        answer = run.outputs.get("answer", "")
        prompt = f"""Response to evaluate: {answer}\n\n{TONE_JUDGE_PROMPT}\n{self.parser.get_format_instructions()}"""
        result = self.llm.invoke(prompt)
        evaluation = self.parser.parse(result.content)
        return EvaluationResult(
            key="tone_appropriateness",
            score=evaluation.score,
            comment=evaluation.explanation,
            evaluation_type="llm_judge"
        )


class ResponseCompletenessEvaluator(RunEvaluator):
    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.parser = PydanticOutputParser(pydantic_object=SimpleJudgeEvaluation)

    def evaluate_run(self, run, example, **kwargs):
        retrieved_docs = run.outputs.get("retrieved_docs", [])
        retrieval_status = "YES" if retrieved_docs else "NO"
        answer = run.outputs.get("answer", "")
        prompt = (
            f"Relevant docs retrieved: {retrieval_status}\n"
            f"Response to evaluate: {answer}\n\n"
            f"{COMPLETENESS_JUDGE_PROMPT}\n"
            f"{self.parser.get_format_instructions()}"
        )
        result = self.llm.invoke(prompt)
        evaluation = self.parser.parse(result.content)
        return EvaluationResult(
            key="response_completeness",
            score=evaluation.score,
            comment=evaluation.explanation,
            evaluation_type="llm_judge"
        )


class TechnicalAccuracyEvaluator(RunEvaluator):
    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.parser = PydanticOutputParser(pydantic_object=SimpleJudgeEvaluation)

    def evaluate_run(self, run, example, **kwargs):
        retrieved_docs = run.outputs.get("retrieved_docs", [])
        retrieval_status = "YES" if retrieved_docs else "NO"
        answer = run.outputs.get("answer", "")
        prompt = (
            f"Relevant docs retrieved: {retrieval_status}\n"
            f"Response to evaluate: {answer}\n\n"
            f"{TECHNICAL_ACCURACY_JUDGE_PROMPT}\n"
            f"{self.parser.get_format_instructions()}"
        )
        result = self.llm.invoke(prompt)
        evaluation = self.parser.parse(result.content)
        return EvaluationResult(
            key="technical_accuracy",
            score=evaluation.score,
            comment=evaluation.explanation,
            evaluation_type="llm_judge"
        )
class RetrievalQualityEvaluator(RunEvaluator):
    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4", temperature=0)
        self.parser = PydanticOutputParser(pydantic_object=RetrievalQualityEvaluation)

    def evaluate_run(self, run, example, **kwargs):
        issue_text = example.inputs.get("issue_text", "")
        answer = run.outputs.get("answer", "")
        # Check whether the answer indicates that no docs were retrieved
        if "I don't have enough information" in answer or "No relevant documentation found" in answer:
            docs_text = "No documents were retrieved."
        else:
            # Pass the full answer so the judge can see which sources it references
            docs_text = "Documents were retrieved and used in the answer. Sources referenced: " + answer
        prompt = RETRIEVAL_QUALITY_JUDGE_PROMPT.format(
            issue_text=issue_text,
            retrieved_docs=docs_text
        ) + "\n" + self.parser.get_format_instructions()
        result = self.llm.invoke(prompt)
        evaluation = self.parser.parse(result.content)
        return EvaluationResults(
            results=[
                EvaluationResult(
                    key="retrieval_relevance",
                    score=evaluation.relevance_score,
                    comment=evaluation.explanation,
                    evaluation_type="llm_judge"
                ),
                EvaluationResult(
                    key="retrieval_coverage",
                    score=evaluation.coverage_score,
                    comment=evaluation.explanation,
                    evaluation_type="llm_judge"
                )
            ]
        )
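# Minimal smoke-test sketch for a single judge (hypothetical values; evaluate_run only
# reads run.outputs and example.outputs/inputs, so SimpleNamespace stand-ins suffice):
#
#   from types import SimpleNamespace
#   fake_run = SimpleNamespace(outputs={"issue_type": "bug", "severity": "high", "answer": "..."})
#   fake_example = SimpleNamespace(outputs={"issue_type": "bug", "severity": "high"},
#                                  inputs={"issue_text": "App crashes on login"})
#   print(IssueTypeEvaluator().evaluate_run(fake_run, fake_example))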
# ===== Main Execution =====
if __name__ == "__main__":
    # Check for required environment variables
    required_vars = ["LANGSMITH_API_KEY", "LANGSMITH_PROJECT"]
    missing_vars = [var for var in required_vars if not os.getenv(var)]
    if missing_vars:
        print("Error: Missing required environment variables:")
        for var in missing_vars:
            print(f" - {var}")
        print("\nPlease set these variables before running the script.")
        sys.exit(1)

    # Confirm the environment variables are present (values are not printed)
    print(f"LANGSMITH_API_KEY exists: {bool(os.getenv('LANGSMITH_API_KEY'))}")
    print(f"LANGSMITH_PROJECT exists: {bool(os.getenv('LANGSMITH_PROJECT'))}")

    client = Client()

    def target(inputs: dict) -> dict:
        return triage_and_answer(
            issue_text=inputs["issue_text"],
            issue_url=inputs["issue_url"]
        )

    # Define all evaluators
    evaluators = [
        # Retrieval evaluator
        RetrievalQualityEvaluator(),
        # Classification evaluators
        IssueTypeEvaluator(),
        SeverityEvaluator(),
        # Response quality evaluators
        ResponseActionEvaluator(),
        ToneAppropriatenessEvaluator(),
        ResponseCompletenessEvaluator(),
        TechnicalAccuracyEvaluator()
    ]

    # Run the evaluation
    experiment_results = client.evaluate(
        target,
        data="CE Triage App: E2E",
        evaluators=evaluators,
        experiment_prefix="sdk:",
        max_concurrency=2,
    )

    print("Experiment results:", experiment_results)
    print("Check the LangSmith UI for the new experiment link!")

    # Run experiments by changing parameters
    # run_evaluation(
    #     experiment_name="improved-prompt",
    #     model_version="gpt-4"
    # )
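# Typical invocation (assumes the "CE Triage App: E2E" dataset already exists in LangSmith
# and OPENAI_API_KEY is available to the judge models):
#
#   export LANGSMITH_API_KEY=<your-key>
#   export LANGSMITH_PROJECT=<your-project>
#   python eval.py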