-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy patheval.py
More file actions
91 lines (66 loc) · 3.32 KB
/
eval.py
File metadata and controls
91 lines (66 loc) · 3.32 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import asyncio
import time
from typing import Dict
from groq import Groq
from Chat.chat import ChatSession, Message
from Configs.prompts import ONLINE_PROMPT
from Services.LLMService import LLMService
# Groq API client used as the "judge" model inside evaluation().
client = Groq()
# Prompt sent to the agent under evaluation (runtime string — intentionally unchanged).
EVALUATION_INPUT = 'Schedule me a appointment on July 10th, 2025'
# Reference answer that the judge model compares the agent's output against.
EXPECTED_OUTPUT = "This is the expected output: I’d be happy to help set up that appointment for July 10, 2025. Could you please provide your name so I can schedule this for you?"
# Service that produces the agent's response; the argument is presumably the
# model identifier passed through to the backend — confirm against LLMService.
llm_service = LLMService('openai/gpt-oss-20b')
# System prompt instructing the judge model how to grade the comparison.
EVAL_PROMPT = "You are an expert judge tasked with comparing the quality of an AI Agent’s output to a user-provided expected output. You must assume the expected_output is correct - even if you personally disagree."
# Chat-session identifier shared by the helpers below.
session_id = 'evaluation'
async def evaluation():
    """Evaluate the agent's answer to EVALUATION_INPUT with an LLM judge.

    First obtains the agent's response via ``llm_service``, then sends the
    evaluation input, the expected output, and the agent's output to a second
    (judge) model through the Groq client, and prints both the agent output
    and the judge's verdict.
    """
    # Build a one-off chat session seeded with the system prompt and the
    # evaluation input. The module-level `session_id` constant is used
    # directly; the original redundantly shadowed it with an identical local.
    chat_sessions: Dict[str, ChatSession] = {session_id: ChatSession(id=session_id)}
    chat_sessions[session_id].messages.append(Message(role='system', content=ONLINE_PROMPT))
    chat_sessions[session_id].messages.append(Message(role='user', content=EVALUATION_INPUT))
    response = await llm_service.generate_response(chat_sessions, session_id)

    # Ask the judge model to compare the agent's output with the expected one.
    completion = client.chat.completions.create(
        model="openai/gpt-oss-20b",
        messages=[
            {"role": "system", "content": EVAL_PROMPT},
            {"role": "system", "content": EVALUATION_INPUT},
            {"role": "user", "content": EXPECTED_OUTPUT},
            {"role": "user", "content": f"This is the AI agent's output: {response.choices[0].message.content}"},
        ],
    )
    print(f"{EXPECTED_OUTPUT}\n\nThis is the AI agent's output: {response.choices[0].message.content}\n\n")
    print(completion.choices[0].message.content)
async def average_response_time(loops):
    """Measure and print the average LLM response time over ``loops`` requests.

    Each iteration builds a fresh chat session with the standard system
    prompt and evaluation input, then awaits one response from ``llm_service``.

    Raises:
        ValueError: if ``loops`` is not a positive count (the original code
            would instead divide by zero for loops == 0).
    """
    if loops <= 0:
        raise ValueError("loops must be a positive integer")
    # perf_counter is monotonic and high-resolution, so the measurement is
    # not skewed by system clock adjustments the way time.time() can be.
    start = time.perf_counter()
    for _ in range(loops):
        chat_sessions: Dict[str, ChatSession] = {session_id: ChatSession(id=session_id)}
        chat_sessions[session_id].messages.append(Message(role='system', content=ONLINE_PROMPT))
        chat_sessions[session_id].messages.append(Message(role='user', content=EVALUATION_INPUT))
        await llm_service.generate_response(chat_sessions, session_id)
    elapsed = time.perf_counter() - start
    print(f"Average time per response: {elapsed / loops}")
# Entry point: benchmark average response latency over 10 requests
# (throughput can be capped by the provider's API rate limit).
# To run the judge-based evaluation instead, replace the body with:
#     asyncio.run(evaluation())
if __name__ == "__main__":
    asyncio.run(average_response_time(10))