# llm_as_a_judge.py — forked from openai/openai-agents-python
# (original listing: 89 lines, 67 loc, 2.8 KB)
from __future__ import annotations
import asyncio
from dataclasses import dataclass
from typing import Literal
from agents import Agent, ItemHelpers, Runner, TResponseInputItem, trace
from examples.auto_mode import input_with_fallback, is_auto_mode
"""
This example shows the LLM as a judge pattern. The first agent generates an outline for a story.
The second agent judges the outline and provides feedback. We loop until the judge is satisfied
with the outline.
"""
# Prompt for the outline-writing agent, kept as a named constant so the
# Agent construction below reads declaratively.
_GENERATOR_INSTRUCTIONS = (
    "You generate a very short story outline based on the user's input. "
    "If there is any feedback provided, use it to improve the outline."
)

# Agent that drafts the outline and, on later rounds, revises it using feedback.
story_outline_generator = Agent(
    name="story_outline_generator",
    instructions=_GENERATOR_INSTRUCTIONS,
)
@dataclass
class EvaluationFeedback:
    """Structured verdict emitted by the evaluator agent."""

    # Free-form guidance on what the outline should improve next round.
    feedback: str
    # "pass" ends the loop; the other two values trigger another revision round.
    score: Literal["pass", "needs_improvement", "fail"]
# Prompt for the judging agent. It is deliberately told to reject the first
# attempt and to settle for "good enough" after five tries.
_EVALUATOR_INSTRUCTIONS = (
    "You evaluate a story outline and decide if it's good enough. "
    "If it's not good enough, you provide feedback on what needs to be improved. "
    "Never give it a pass on the first try. After 5 attempts, you can give it a pass if the story outline is good enough - do not go for perfection"
)

# Agent that judges the outline; its output is parsed into EvaluationFeedback.
evaluator = Agent[None](
    name="evaluator",
    instructions=_EVALUATOR_INSTRUCTIONS,
    output_type=EvaluationFeedback,
)
async def main() -> None:
    """Generate a story outline, then loop generate → judge until the judge passes it."""
    user_prompt = input_with_fallback(
        "What kind of story would you like to hear? ",
        "A detective story in space.",
    )
    # Running conversation shared by both agents; it grows with each round.
    conversation: list[TResponseInputItem] = [{"content": user_prompt, "role": "user"}]
    current_outline: str | None = None
    auto = is_auto_mode()
    # Safety valve: cap the number of revision rounds when running unattended.
    round_limit = 3 if auto else None
    completed_rounds = 0
    # A single trace spans the entire generate/judge workflow.
    with trace("LLM as a judge"):
        while True:
            draft = await Runner.run(
                story_outline_generator,
                conversation,
            )
            conversation = draft.to_input_list()
            current_outline = ItemHelpers.text_message_outputs(draft.new_items)
            print("Story outline generated")

            verdict_run = await Runner.run(evaluator, conversation)
            verdict: EvaluationFeedback = verdict_run.final_output
            print(f"Evaluator score: {verdict.score}")

            # Judge is satisfied — stop iterating.
            if verdict.score == "pass":
                print("Story outline is good enough, exiting.")
                break

            # Unattended runs bail out after round_limit failed attempts.
            if auto:
                completed_rounds += 1
                if round_limit is not None and completed_rounds >= round_limit:
                    print("Auto mode: stopping after limited rounds.")
                    break

            print("Re-running with feedback")
            conversation.append({"content": f"Feedback: {verdict.feedback}", "role": "user"})
    print(f"Final story outline: {current_outline}")
# Script entry point: run the async judge loop inside a fresh asyncio event loop.
if __name__ == "__main__":
    asyncio.run(main())