diff --git a/.gitignore b/.gitignore index a9729050..a867929d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,7 @@ __pycache__/ .DS_Store .venv/ -venv/* +venv/ *venv/* *.venv/* diff --git a/README.md b/README.md index 27b82ca8..2ce854cc 100644 --- a/README.md +++ b/README.md @@ -284,6 +284,55 @@ run = run.score([check_keywords], expected_keywords="expected_keywords") --- +## Saving Completion Feedback + +Track user feedback on prompt completions to improve your prompts with DSPy optimization: + +```python +import zeroeval as ze + +# Initialize client +ze.init() + +# Send positive feedback +feedback = ze.send_feedback( + prompt_slug="customer-support", + completion_id="completion-uuid-123", + thumbs_up=True, + reason="Excellent response, very helpful" +) + +# Send negative feedback with expected output +feedback = ze.send_feedback( + prompt_slug="customer-support", + completion_id="completion-uuid-456", + thumbs_up=False, + reason="Response was too formal", + expected_output="Should be more casual and friendly", + metadata={"user_id": "user-789", "source": "production"} +) +``` + +### Parameters + +- **prompt_slug** _(str, required)_ – The slug of the prompt +- **completion_id** _(str, required)_ – UUID of the completion to provide feedback on +- **thumbs_up** _(bool, required)_ – True for positive feedback, False for negative +- **reason** _(str, optional)_ – Explanation of the feedback +- **expected_output** _(str, optional)_ – Description of what the expected output should be. This field is automatically used by ZeroEval for **tuning datasets and DSPy prompt optimization** to create stronger training examples. +- **metadata** _(dict, optional)_ – Additional metadata to attach to the feedback + +### Integration with Prompt Tuning + +Feedback submitted via `send_feedback` is automatically linked to the prompt version used for the completion. When you provide both `reason` and `expected_output`, ZeroEval creates stronger training examples for DSPy optimization: + +- **`reason`** helps the optimizer understand what makes a response good or bad +- **`expected_output`** provides a concrete example of the ideal response, which DSPy uses to generate improved prompts + +If the completion was traced with a `span_id`, the feedback is mirrored to your tuning datasets automatically, making it available for prompt optimization runs in the ZeroEval platform. + +--- + ## Streaming & tracing • **Streaming responses** – streaming guide: https://docs.zeroeval.com/streaming (coming soon) diff --git a/examples_v2/README.md b/examples_v2/README.md index 4d0b745a..c9dc71bc 100644 --- a/examples_v2/README.md +++ b/examples_v2/README.md @@ -14,6 +14,10 @@ This directory contains organized, focused examples for ZeroEval SDK features. - Weighted variant selection - Automatic choice tracking +- **`tuning/`** - Examples for Prompt Tuning and Optimization + - Customer support agent with feedback loop + - Prompt versioning with ze.prompt() + ## Getting Started 1. **Install dependencies**: diff --git a/examples_v2/tuning/README.md b/examples_v2/tuning/README.md new file mode 100644 index 00000000..cf7427a0 --- /dev/null +++ b/examples_v2/tuning/README.md @@ -0,0 +1,60 @@ +# Prompt Tuning Examples + +This directory contains examples demonstrating ZeroEval's prompt tuning and optimization features. + +## Core Concepts + +Prompt tuning in ZeroEval works through a feedback loop: + +1. **Define Prompt**: Use `ze.prompt()` to register a prompt and bind variables. +2. 
**Trace Execution**: Run your agent; the SDK automatically traces the inputs and outputs. +3. **Send Feedback**: Use `ze.send_feedback()` (or the direct API) to signal what was good or bad about the completion. +4. **Optimize**: ZeroEval (and integrated optimizers like DSPy) uses this feedback to generate better prompt versions. + +## Examples + +### 1. Customer Support Agent (`customer_support_agent.py`) + +A simple example of a support agent that uses `ze.prompt()` for versioned, managed prompts. This demonstrates the basic setup without the automated feedback loop. + +### 2. Customer Support Agent with SDK Feedback (`bookstore_agent_with_feedback.py`) + +An advanced example that implements a complete automated feedback loop using the ZeroEval SDK. + +**Key Features:** + +- **Automated Evaluator**: Uses a powerful model (GPT-4o) to grade the agent's responses. +- **Feedback Submission**: Uses `ze.send_feedback()` to programmatically submit the evaluator's scores (thumbs up/down) and reasoning. +- **Metadata Tracking**: Attaches metadata (like scores and evaluator model) to the feedback. + +**Run it:** + +```bash +python tuning/bookstore_agent_with_feedback.py +``` + +### 3. Customer Support Agent with API Feedback (`bookstore_agent_with_api_feedback.py`) + +Demonstrates how to submit feedback using direct HTTP calls to the ZeroEval API, bypassing the SDK's `ze.send_feedback` helper. This is useful for frontend applications or systems where the SDK cannot be installed. + +**Key Features:** + +- **Direct API Integration**: Uses `requests` to hit the `/v1/prompts/{slug}/completions/{id}/feedback` endpoint. +- **Payload Structure**: Shows exactly what JSON payload the backend expects. +- **Flexible Integration**: Ideal for custom pipelines or non-Python environments. + +**Run it:** + +```bash +python tuning/bookstore_agent_with_api_feedback.py +``` + +## Setup + +Ensure you have your `.env` file set up in the parent directory with: + +- `ZEROEVAL_API_KEY`: Your ZeroEval API key (required, starts with `sk_ze_...`) +- `OPENAI_API_KEY`: Your OpenAI API key (required) +- `ZEROEVAL_API_URL`: (Optional) URL of your ZeroEval instance (default: `http://localhost:8000`) + +**Important**: All examples now pull credentials from environment variables. Never commit hardcoded API keys to version control. diff --git a/examples_v2/tuning/bookstore_agent.py b/examples_v2/tuning/bookstore_agent.py new file mode 100644 index 00000000..127b3af4 --- /dev/null +++ b/examples_v2/tuning/bookstore_agent.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +""" +Customer Support Agent with Tuning +================================= + +This example demonstrates how to build a customer support agent using ZeroEval's +tuning features. It uses `ze.prompt()` to manage the prompt and `ze.send_feedback()` +to provide signals for optimization. + +Key concepts: +1. `ze.prompt()`: Defines the prompt and binds variables for interpolation +2. Automatic Tracing: The SDK automatically traces OpenAI calls +3. Interactive Mode: You can chat with the agent and see how it responds +""" + +import os +import uuid +from pathlib import Path + +from dotenv import load_dotenv + +# Load environment variables BEFORE importing zeroeval +env_path = Path(__file__).parent.parent / ".env" +load_dotenv(env_path) + +import openai +import zeroeval as ze + +# 1. 
Initialize ZeroEval +# Ensure you have ZEROEVAL_API_KEY and ZEROEVAL_API_URL set in your environment +ze.init( + api_key=os.getenv("ZEROEVAL_API_KEY"), + api_url=os.getenv("ZEROEVAL_API_URL", "http://localhost:8000"), +) + +def customer_support_agent(user_query: str, user_context: dict = None, conversation_history: list = None): + """ + A simple customer support agent that uses a managed prompt and maintains conversation history. + """ + if user_context is None: + user_context = {} + if conversation_history is None: + conversation_history = [] + + # 2. Define the prompt using ze.prompt() + # This registers the prompt with ZeroEval (if not exists) and allows for versioning. + # The 'content' is your base prompt. You can use {{variable}} syntax. + # 'variables' are passed for interpolation and tracking. + + prompt_name = "bookstore-support-agent" + + system_instruction = ze.prompt( + name=prompt_name, + content="""You are Elena, a passionate book enthusiast and customer support specialist at Bibliophile Books. You've worked in the bookstore for 5 years and genuinely love helping people discover their next great read. + +Your personality: +- Warm and personable, like chatting with a knowledgeable friend at a bookshop +- Enthusiastic about books and reading +- Patient and empathetic when customers have issues +- Professional but not overly formal +- You use the customer's name naturally in conversation + +Customer Information: +- Name: {{user_name}} +- Membership Level: {{membership}} + +Guidelines: +1. Address {{user_name}} directly and warmly (but don't say "Hi {{user_name}}" in every message if you're in an ongoing conversation) +2. For Gold members: Remember they have free shipping, priority support, and 15% off all purchases +3. For Standard members: Offer helpful service while mentioning Gold membership benefits when relevant +4. Keep responses concise but friendly (2-4 sentences for simple queries) +5. If you don't know something or can't help, offer to connect them with a specialist +6. Never use placeholder text like "[Your Name]" - you are Elena +7. End naturally without formal sign-offs unless it's clearly the end of the conversation +8. IMPORTANT: Remember information from the conversation history and don't ask for things the customer already told you + +Respond directly to their query in a helpful, personable way.""", + variables={ + "user_name": user_context.get("name", "there"), + "membership": user_context.get("membership", "Standard") + } + ) + + # Initialize OpenAI client (ZeroEval automatically instruments this) + client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + + print(f"\n--- Sending Request to AI ({prompt_name}) ---") + + # Build messages with conversation history + messages = [{"role": "system", "content": system_instruction}] + messages.extend(conversation_history) + messages.append({"role": "user", "content": user_query}) + + # 3. 
Call the Model + # The SDK intercepts this call: + # - Detects the metadata from ze.prompt() + # - Interpolates variables into the content + # - Traces the execution + response = client.chat.completions.create( + model="gpt-4o-mini", # Use a cost-effective model + messages=messages, + temperature=0.7 + ) + + completion_text = response.choices[0].message.content + completion_id = response.id + + return completion_text, completion_id, prompt_name + +def main(): + # Example interaction + print("\n=== Bookstore Support Agent (Type 'exit' to quit) ===") + + # We'll assume a fixed user context for this session + user_context = { + "name": "Alice", + "membership": "Gold" # VIP customer + } + print(f"Context: User={user_context['name']}, Membership={user_context['membership']}\n") + + # Initialize conversation history + conversation_history = [] + + # Agent introduces itself + intro_query = "Hello! Please introduce yourself and ask how you can help me today." + response_text, _, _ = customer_support_agent(intro_query, user_context, conversation_history) + print(f"Elena: {response_text}\n") + + # Add intro to history + conversation_history.append({"role": "user", "content": intro_query}) + conversation_history.append({"role": "assistant", "content": response_text}) + + while True: + try: + user_query = input("\nEnter your query: ").strip() + if not user_query: + continue + + if user_query.lower() in ('exit', 'quit'): + print("Goodbye!") + break + + response_text, completion_id, prompt_slug = customer_support_agent(user_query, user_context, conversation_history) + + print(f"\nElena: {response_text}") + + # Add to conversation history + conversation_history.append({"role": "user", "content": user_query}) + conversation_history.append({"role": "assistant", "content": response_text}) + + except KeyboardInterrupt: + print("\nGoodbye!") + break + except Exception as e: + print(f"\nError: {e}") + print("Check your ZEROEVAL_API_KEY and OPENAI_API_KEY.") + break + +if __name__ == "__main__": + main() diff --git a/examples_v2/tuning/bookstore_agent_with_api_feedback.py b/examples_v2/tuning/bookstore_agent_with_api_feedback.py new file mode 100644 index 00000000..c73d3878 --- /dev/null +++ b/examples_v2/tuning/bookstore_agent_with_api_feedback.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +""" +Customer Support Agent with API Feedback Loop +=================================================== + +This example demonstrates how to submit feedback using the ZeroEval API directly, +bypassing the SDK's `ze.send_feedback` helper. This is useful for: +1. Frontend applications calling the backend directly +2. Systems where the SDK is not installed +3. Custom integrations + +Key concepts: +- `POST /v1/prompts/{slug}/completions/{id}/feedback`: The feedback endpoint +- Direct API interaction +""" + +import os +import json +import requests +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables BEFORE importing zeroeval +env_path = Path(__file__).parent.parent / ".env" +load_dotenv(env_path) + +import openai +import zeroeval as ze + +# Configuration +API_URL = os.getenv("ZEROEVAL_API_URL", "http://localhost:8000") +API_KEY = os.getenv("ZEROEVAL_API_KEY") # Use your ZeroEval API Key + +# 1. 
Initialize ZeroEval +ze.init( + api_key=API_KEY, + api_url=API_URL, +) + +# Initialize OpenAI client +client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def customer_support_agent(user_query: str, user_context: dict = None, conversation_history: list = None): + """ + A simple customer support agent that uses a managed prompt and maintains conversation history. + """ + if user_context is None: + user_context = {} + if conversation_history is None: + conversation_history = [] + + # 2. Define the prompt using ze.prompt() + prompt_name = "bookstore-support-agent-with-api-feedback" + + system_instruction = ze.prompt( + name=prompt_name, + content="""You are Elena, a passionate book enthusiast and customer support specialist at Bibliophile Books. You've worked in the bookstore for 5 years and genuinely love helping people discover their next great read. + +Your personality: +- Warm and personable, like chatting with a knowledgeable friend at a bookshop +- Enthusiastic about books and reading +- Patient and empathetic when customers have issues +- Professional but not overly formal +- You use the customer's name naturally in conversation + +Customer Information: +- Name: {{user_name}} +- Membership Level: {{membership}} + +Guidelines: +- Address {{user_name}} directly and warmly +- For Gold members: Remember they have free shipping, priority support, and 15% off all purchases +- For Standard members: Offer helpful service while mentioning Gold membership benefits when relevant +- Keep responses concise but friendly +- If you don't know something or can't help, offer to connect them with a specialist +- Never use placeholder text like "[Your Name]" - you are Elena + +Respond directly to their query in a helpful, personable way.""", + variables={ + "user_name": user_context.get("name", "there"), + "membership": user_context.get("membership", "Standard") + } + ) + + print(f"\n--- Sending Request to AI ({prompt_name}) ---") + + # Build messages with conversation history + messages = [{"role": "system", "content": system_instruction}] + messages.extend(conversation_history) + messages.append({"role": "user", "content": user_query}) + + # 3. Call the Model + # The SDK intercepts this call and tracks the completion_id + response = client.chat.completions.create( + model="gpt-4o-mini", # Use a cost-effective model for the agent + messages=messages, + temperature=0.7 + ) + + completion_text = response.choices[0].message.content + completion_id = response.id + + return completion_text, completion_id, prompt_name + +def evaluate_response(user_query: str, agent_response: str): + """ + Uses a powerful model (Evaluator) to grade the agent's response. + Returns (is_good: bool, reason: str) + """ + print("\n--- Running Evaluator (GPT-4o) ---") + + eval_prompt = f"""You are an expert customer support quality assurance specialist. + Your job is to evaluate a customer support response. + + User Query: "{user_query}" + Agent Response: "{agent_response}" + + Criteria: + 1. Is the tone warm and professional? + 2. Is the information accurate and helpful? + 3. Does it address the user's specific query? 
+ + Output strictly in JSON format with these fields: + - "score": 1 to 5 (5 being perfect) + - "reason": A brief explanation of the score + - "thumbs_up": true if score >= 4, else false + """ + + response = client.chat.completions.create( + model="gpt-4o", # Use a powerful model for evaluation + messages=[{"role": "user", "content": eval_prompt}], + temperature=0, + response_format={"type": "json_object"} + ) + + try: + result = json.loads(response.choices[0].message.content) + return result + except Exception as e: + print(f"Error parsing evaluation: {e}") + return {"thumbs_up": True, "reason": "Failed to parse evaluation", "score": 5} + +def send_feedback_via_api(prompt_slug, completion_id, thumbs_up, reason=None, expected_output=None, metadata=None): + """ + Sends feedback directly using requests.post to the ZeroEval API. + """ + url = f"{API_URL}/v1/prompts/{prompt_slug}/completions/{completion_id}/feedback" + + payload = { + "thumbs_up": thumbs_up, + "reason": reason, + "expected_output": expected_output, + "metadata": metadata or {} + } + + headers = { + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json" + } + + try: + print(f"\n[API] POST {url}") + resp = requests.post(url, json=payload, headers=headers) + resp.raise_for_status() + print("✓ API Feedback submitted successfully") + return resp.json() + except requests.exceptions.HTTPError as e: + print(f"❌ API Request failed: {e}") + print(f"Response: {e.response.text}") + return None + except Exception as e: + print(f"❌ Error sending feedback: {e}") + return None + +def main(): + # Example interaction + print("\n=== Bookstore Support Agent with API Feedback (Type 'exit' to quit) ===") + + user_context = { + "name": "Alice", + "membership": "Gold" # VIP customer + } + print(f"Context: User={user_context['name']}, Membership={user_context['membership']}\n") + + conversation_history = [] + + while True: + try: + user_query = input("\nEnter your query: ").strip() + if not user_query: + continue + + if user_query.lower() in ('exit', 'quit'): + print("Goodbye!") + break + + # 1. Get response from the agent + response_text, completion_id, prompt_slug = customer_support_agent( + user_query, + user_context, + conversation_history + ) + + print(f"\nElena: {response_text}") + print(f"\n[DEBUG] OpenAI completion_id: {completion_id}") + print(f"[DEBUG] Prompt slug: {prompt_slug}") + + # 2. Generate feedback using a powerful model + # In a real system, this might happen asynchronously or be sampled + eval_result = evaluate_response(user_query, response_text) + + print(f"\n[Evaluator] Score: {eval_result.get('score')}/5") + print(f"[Evaluator] Reason: {eval_result.get('reason')}") + print(f"[Evaluator] Verdict: {'👍 Thumbs Up' if eval_result.get('thumbs_up') else '👎 Thumbs Down'}") + + # 3. 
Submit feedback via API directly + send_feedback_via_api( + prompt_slug=prompt_slug, + completion_id=completion_id, + thumbs_up=eval_result.get("thumbs_up", True), + reason=eval_result.get("reason"), + metadata={ + "score": eval_result.get("score"), + "evaluator_model": "gpt-4o", + "source": "direct_api" + } + ) + + # Add to conversation history + conversation_history.append({"role": "user", "content": user_query}) + conversation_history.append({"role": "assistant", "content": response_text}) + + except KeyboardInterrupt: + print("\nGoodbye!") + break + except Exception as e: + print(f"\nError: {e}") + import traceback + traceback.print_exc() + break + +if __name__ == "__main__": + main() + diff --git a/examples_v2/tuning/bookstore_agent_with_feedback.py b/examples_v2/tuning/bookstore_agent_with_feedback.py new file mode 100644 index 00000000..8452650b --- /dev/null +++ b/examples_v2/tuning/bookstore_agent_with_feedback.py @@ -0,0 +1,212 @@ +#!/usr/bin/env python3 +""" +Customer Support Agent with Tuning and Feedback Loop +=================================================== + +This example is an enhanced version of `customer_support_agent.py` that adds an +automated feedback loop. It demonstrates how to: + +1. Use `ze.prompt()` to manage prompts +2. Automatically trace OpenAI calls +3. **New**: Use a powerful model (evaluator) to critique the agent's responses +4. **New**: Submit this feedback using `ze.send_feedback()` to improve the prompt over time + +Key concepts: +- `ze.send_feedback()`: Submits programmatic feedback (thumbs up/down, reason) associated with a completion +- Automated Evaluation: Using a stronger model to grade a faster/cheaper model +""" + +import os +from pathlib import Path +import json + +from dotenv import load_dotenv + +# Load environment variables BEFORE importing zeroeval +env_path = Path(__file__).parent.parent / ".env" +load_dotenv(env_path) + +import openai +import zeroeval as ze + +# 1. Initialize ZeroEval +# Ensure you have ZEROEVAL_API_KEY and ZEROEVAL_API_URL set in your environment +ze.init( + api_key=os.getenv("ZEROEVAL_API_KEY"), + api_url=os.getenv("ZEROEVAL_API_URL", "http://localhost:8000"), +) + +# Initialize OpenAI client +client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def customer_support_agent(user_query: str, user_context: dict = None, conversation_history: list = None): + """ + A simple customer support agent that uses a managed prompt and maintains conversation history. + """ + if user_context is None: + user_context = {} + if conversation_history is None: + conversation_history = [] + + # 2. Define the prompt using ze.prompt() + prompt_name = "bookstore-support-agent-with-sdk-feedback" + + system_instruction = ze.prompt( + name=prompt_name, + content="""You are Elena, a passionate book enthusiast and customer support specialist at Bibliophile Books. You've worked in the bookstore for 5 years and genuinely love helping people discover their next great read. 
+ +Your personality: +- Warm and personable, like chatting with a knowledgeable friend at a bookshop +- Enthusiastic about books and reading +- Patient and empathetic when customers have issues +- Professional but not overly formal +- You use the customer's name naturally in conversation + +Customer Information: +- Name: {{user_name}} +- Membership Level: {{membership}} + +Guidelines: +- Address {{user_name}} directly and warmly +- For Gold members: Remember they have free shipping, priority support, and 15% off all purchases +- For Standard members: Offer helpful service while mentioning Gold membership benefits when relevant +- Keep responses concise but friendly +- If you don't know something or can't help, offer to connect them with a specialist +- Never use placeholder text like "[Your Name]" - you are Elena + +Respond directly to their query in a helpful, personable way.""", + variables={ + "user_name": user_context.get("name", "there"), + "membership": user_context.get("membership", "Standard") + } + ) + + print(f"\n--- Sending Request to AI ({prompt_name}) ---") + + # Build messages with conversation history + messages = [{"role": "system", "content": system_instruction}] + messages.extend(conversation_history) + messages.append({"role": "user", "content": user_query}) + + # 3. Call the Model + # The SDK intercepts this call and tracks the completion_id + response = client.chat.completions.create( + model="gpt-4o-mini", # Use a cost-effective model for the agent + messages=messages, + temperature=0.7 + ) + + completion_text = response.choices[0].message.content + completion_id = response.id + + return completion_text, completion_id, prompt_name + +def evaluate_response(user_query: str, agent_response: str): + """ + Uses a powerful model (Evaluator) to grade the agent's response. + Returns (is_good: bool, reason: str) + """ + print("\n--- Running Evaluator (GPT-4o) ---") + + eval_prompt = f"""You are an expert customer support quality assurance specialist. + Your job is to evaluate a customer support response. + + User Query: "{user_query}" + Agent Response: "{agent_response}" + + Criteria: + 1. Is the tone warm and professional? + 2. Is the information accurate and helpful? + 3. Does it address the user's specific query? + + Output strictly in JSON format with these fields: + - "score": 1 to 5 (5 being perfect) + - "reason": A brief explanation of the score + - "thumbs_up": true if score >= 4, else false + """ + + response = client.chat.completions.create( + model="gpt-4o", # Use a powerful model for evaluation + messages=[{"role": "user", "content": eval_prompt}], + temperature=0, + response_format={"type": "json_object"} + ) + + try: + result = json.loads(response.choices[0].message.content) + return result + except Exception as e: + print(f"Error parsing evaluation: {e}") + return {"thumbs_up": True, "reason": "Failed to parse evaluation", "score": 5} + +def main(): + # Example interaction + print("\n=== Bookstore Support Agent with Feedback Loop (Type 'exit' to quit) ===") + + user_context = { + "name": "Alice", + "membership": "Gold" # VIP customer + } + print(f"Context: User={user_context['name']}, Membership={user_context['membership']}\n") + + conversation_history = [] + + while True: + try: + user_query = input("\nEnter your query: ").strip() + if not user_query: + continue + + if user_query.lower() in ('exit', 'quit'): + print("Goodbye!") + break + + # 1. 
Get response from the agent + response_text, completion_id, prompt_slug = customer_support_agent( + user_query, + user_context, + conversation_history + ) + + print(f"\nElena: {response_text}") + print(f"\n[DEBUG] OpenAI completion_id: {completion_id}") + print(f"[DEBUG] Prompt slug: {prompt_slug}") + + # 2. Generate feedback using a powerful model + # In a real system, this might happen asynchronously or be sampled + eval_result = evaluate_response(user_query, response_text) + + print(f"\n[Evaluator] Score: {eval_result.get('score')}/5") + print(f"[Evaluator] Reason: {eval_result.get('reason')}") + print(f"[Evaluator] Verdict: {'👍 Thumbs Up' if eval_result.get('thumbs_up') else '👎 Thumbs Down'}") + + # 3. Submit feedback to ZeroEval + # This signals to the optimizer which responses were good/bad + ze.send_feedback( + prompt_slug=prompt_slug, + completion_id=completion_id, + thumbs_up=eval_result.get("thumbs_up", True), + reason=eval_result.get("reason"), + metadata={ + "score": eval_result.get("score"), + "evaluator_model": "gpt-4o" + } + ) + print("✓ Feedback submitted to ZeroEval") + + # Add to conversation history + conversation_history.append({"role": "user", "content": user_query}) + conversation_history.append({"role": "assistant", "content": response_text}) + + except KeyboardInterrupt: + print("\nGoodbye!") + break + except Exception as e: + print(f"\nError: {e}") + import traceback + traceback.print_exc() + break + +if __name__ == "__main__": + main() + diff --git a/pyproject.toml b/pyproject.toml index 6961d648..224e382d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "zeroeval" -version = "0.6.121" +version = "0.6.122" description = "ZeroEval SDK" readme = "README.md" authors = [ diff --git a/src/zeroeval/__init__.py b/src/zeroeval/__init__.py index 3204c464..8a56f08d 100644 --- a/src/zeroeval/__init__.py +++ b/src/zeroeval/__init__.py @@ -180,6 +180,75 @@ def get(self, slug: str, **kwargs): prompts = _PromptsNamespace() + +def log_completion( + *, + prompt_slug: str, + prompt_id: str, + prompt_version_id: str, + messages: list, + input_text: Optional[str] = None, + output_text: Optional[str] = None, + model_id: Optional[str] = None, + metadata: Optional[dict] = None, + duration_ms: Optional[float] = None, + prompt_tokens: Optional[int] = None, + completion_tokens: Optional[int] = None, + total_tokens: Optional[int] = None, + cost: Optional[float] = None, + has_error: bool = False, + error_message: Optional[str] = None, + span_id: Optional[str] = None, +): + """ + Log a completion for a specific prompt. + + This automatically tracks prompt usage without requiring manual wrapping. + """ + client = _ensure_prompt_client() + return client.log_completion( + prompt_slug=prompt_slug, + prompt_id=prompt_id, + prompt_version_id=prompt_version_id, + messages=messages, + input_text=input_text, + output_text=output_text, + model_id=model_id, + metadata=metadata, + duration_ms=duration_ms, + prompt_tokens=prompt_tokens, + completion_tokens=completion_tokens, + total_tokens=total_tokens, + cost=cost, + has_error=has_error, + error_message=error_message, + span_id=span_id, + ) + + +def send_feedback( + *, + prompt_slug: str, + completion_id: str, + thumbs_up: bool, + reason: Optional[str] = None, + expected_output: Optional[str] = None, + metadata: Optional[dict] = None, +): + """ + Send feedback for a specific completion. 
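+
+    Illustrative usage (the slug and completion ID below are placeholder
+    values, not real records):
+
+        import zeroeval as ze
+
+        ze.init()
+        ze.send_feedback(
+            prompt_slug="customer-support",
+            completion_id="completion-uuid-123",
+            thumbs_up=True,
+            reason="Helpful and concise",
+        )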
+ """ + client = _ensure_prompt_client() + return client.send_feedback( + prompt_slug=prompt_slug, + completion_id=completion_id, + thumbs_up=thumbs_up, + reason=reason, + expected_output=expected_output, + metadata=metadata, + ) + + # Define what's exported __all__ = [ # Core functionality @@ -192,7 +261,6 @@ def get(self, slug: str, **kwargs): # Providers "ZeroEvalOTLPProvider", "SingleProcessorProvider", - # Observability "tracer", "span", @@ -211,6 +279,9 @@ def get(self, slug: str, **kwargs): "PromptClient", "get_prompt", "prompts", + # Completion logging and feedback + "log_completion", + "send_feedback", ] # Version info diff --git a/src/zeroeval/client.py b/src/zeroeval/client.py index 5b8c6600..72d2a88b 100644 --- a/src/zeroeval/client.py +++ b/src/zeroeval/client.py @@ -1,5 +1,6 @@ from __future__ import annotations +import logging import os import re from typing import Any, Dict, Optional, Tuple @@ -17,6 +18,8 @@ _SLUG_RE = re.compile(r"^[a-z0-9-]+$") _TAG_RE = re.compile(r"^[a-z0-9-]+$") +logger = logging.getLogger(__name__) + class ZeroEval: def __init__( @@ -322,4 +325,146 @@ def _post_process( return decorated return prompt + # ---- New Prompt Completion and Feedback API ---- + + def log_completion( + self, + *, + prompt_slug: str, + prompt_id: str, + prompt_version_id: str, + messages: list[dict[str, Any]], + input_text: Optional[str] = None, + output_text: Optional[str] = None, + model_id: Optional[str] = None, + metadata: Optional[dict[str, Any]] = None, + duration_ms: Optional[float] = None, + prompt_tokens: Optional[int] = None, + completion_tokens: Optional[int] = None, + total_tokens: Optional[int] = None, + cost: Optional[float] = None, + has_error: bool = False, + error_message: Optional[str] = None, + span_id: Optional[str] = None, + ) -> dict[str, Any]: + """ + Log a completion for a specific prompt and version. + This is used to track prompt usage automatically. 
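+
+        Illustrative call (the IDs below are placeholder values):
+
+            client.log_completion(
+                prompt_slug="customer-support",
+                prompt_id="prompt-uuid",
+                prompt_version_id="version-uuid",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": "Where is my order?"},
+                ],
+                output_text="Your order shipped yesterday.",
+                model_id="gpt-4o-mini",
+                total_tokens=120,
+            )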
+ + Args: + prompt_slug: The slug of the prompt + prompt_id: UUID of the prompt + prompt_version_id: UUID of the prompt version + messages: Array of message objects in OpenAI format + input_text: Optional text representation of input + output_text: Optional text representation of output + model_id: Optional model identifier used + metadata: Optional additional metadata + duration_ms: Optional execution duration in milliseconds + prompt_tokens: Optional number of prompt tokens + completion_tokens: Optional number of completion tokens + total_tokens: Optional total token count + cost: Optional cost in USD + has_error: Whether the completion had an error + error_message: Optional error message + span_id: Optional span ID for trace linking + + Returns: + The created completion record + """ + # Extract project_id from API key context (handled by backend) + url = f"{self._base_url}/projects/{{project_id}}/prompts/{prompt_slug}/completions" + + payload = { + "prompt_id": prompt_id, + "prompt_version_id": prompt_version_id, + "model_id": model_id, + "messages": messages, + "input_text": input_text, + "output_text": output_text, + "metadata": metadata or {}, + "duration_ms": duration_ms, + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "cost": cost, + "has_error": has_error, + "error_message": error_message, + "span_id": span_id, + } + + # Remove None values + payload = {k: v for k, v in payload.items() if v is not None} + + resp = requests.post(url, headers=self._headers(), json=payload, timeout=self._timeout) + if resp.status_code >= 400: + raise PromptRequestError( + f"log_completion failed: {resp.text}", status=resp.status_code + ) + return resp.json() + + def send_feedback( + self, + *, + prompt_slug: str, + completion_id: str, + thumbs_up: bool, + reason: Optional[str] = None, + expected_output: Optional[str] = None, + metadata: Optional[dict[str, Any]] = None, + ) -> dict[str, Any]: + """ + Send feedback for a specific completion. 
+ + Args: + prompt_slug: The slug of the prompt + completion_id: UUID of the completion to provide feedback on + thumbs_up: True for positive feedback, False for negative + reason: Optional explanation of the feedback + expected_output: Optional description of what the expected output should be + metadata: Optional additional metadata + + Returns: + The created feedback record + """ + url = f"{self._base_url}/v1/prompts/{prompt_slug}/completions/{completion_id}/feedback" + + logger.debug( + f"[SDK] Sending feedback for completion_id={completion_id}, prompt_slug={prompt_slug}", + extra={ + "completion_id": completion_id, + "prompt_slug": prompt_slug, + "thumbs_up": thumbs_up, + "url": url + } + ) + + payload = { + "thumbs_up": thumbs_up, + } + + # Add optional fields only if provided + if reason is not None: + payload["reason"] = reason + if expected_output is not None: + payload["expected_output"] = expected_output + if metadata is not None: + payload["metadata"] = metadata + + resp = requests.post(url, headers=self._headers(), json=payload, timeout=self._timeout) + + logger.debug( + f"[SDK] Feedback response status={resp.status_code}", + extra={ + "status_code": resp.status_code, + "response_text": resp.text[:500] if resp.text else None + } + ) + + if resp.status_code >= 400: + raise PromptRequestError( + f"send_feedback failed: {resp.text}", status=resp.status_code + ) + return resp.json() + diff --git a/src/zeroeval/observability/integrations/openai/integration.py b/src/zeroeval/observability/integrations/openai/integration.py index ba886a6f..dbead92d 100644 --- a/src/zeroeval/observability/integrations/openai/integration.py +++ b/src/zeroeval/observability/integrations/openai/integration.py @@ -67,9 +67,10 @@ def zeroeval_prompt( """ Helper function to create a prompt with zeroeval metadata for tracing and observability. - IMPORTANT: This function does NOT create or update tasks in ZeroEval. It only adds - metadata to OpenAI API calls for tracing purposes. Tasks must be created separately - using Dataset.run() or Experiment.run(). + When this prompt is used in an OpenAI API call, ZeroEval will automatically: + 1. Extract the task metadata from the prompt + 2. Link the span to the specified task + 3. Create the task automatically if it doesn't exist yet Args: name: Required task identifier for this prompt @@ -80,7 +81,6 @@ def zeroeval_prompt( A string with the format: {JSON}content Example: - >>> # This adds metadata but does NOT create a task >>> zeroeval_prompt( ... name="custom-bot-5", ... content="You are an assistant that helps users with {{task}}. Be {{tone}} in your responses.", @@ -92,9 +92,8 @@ def zeroeval_prompt( '{"task": "custom-bot-5", "variables": {"task": "coding questions", "tone": "helpful and concise"}}You are an assistant that helps users with {{task}}. Be {{tone}} in your responses.' Note: - - The 'name' parameter is for linking OpenAI calls to existing tasks - - Tasks are created through Dataset.run() or Experiment.run() - Variables will be interpolated in the prompt when the OpenAI API is called + - The task will be automatically created in ZeroEval if it doesn't exist """ metadata = {"task": name} @@ -348,8 +347,8 @@ def _log_task_metadata(self, task_id: Optional[str], zeroeval_metadata: dict[str if task_id: logger.info( f"{context}: Task ID '{task_id}' added to span attributes. " - f"This enables tracing but does NOT create/update tasks. " - f"Ensure the task exists or will be created through Dataset/Experiment.run()." 
+ f"The task will be automatically created if it doesn't exist yet, " + f"and this span will be linked to it for tracing and tuning." ) logger.debug(f"{context}: Full zeroeval metadata added to span: {zeroeval_metadata}") @@ -391,13 +390,12 @@ def _process_messages_with_zeroeval(self, messages: Optional[list[dict[str, Any] task_id = metadata.get('task') logger.info(f"_process_messages_with_zeroeval: Successfully extracted metadata - task: '{task_id}', variables: {list(variables.keys()) if variables else 'none'}") - # Important warning for users + # Log task linkage info if task_id: - logger.warning( + logger.info( f"_process_messages_with_zeroeval: Task ID '{task_id}' found in zeroeval_prompt. " - f"Note: zeroeval_prompt does NOT automatically create or update tasks. " - f"Tasks must be created separately using Dataset.run() or Experiment.run(). " - f"This metadata is only used for tracing and observability." + f"This span will be automatically linked to the task and the task will be " + f"created if it doesn't exist yet." ) else: logger.debug("_process_messages_with_zeroeval: No zeroeval metadata found in system message") diff --git a/src/zeroeval/pyproject.toml b/src/zeroeval/pyproject.toml index ae45880e..a913a9c3 100644 --- a/src/zeroeval/pyproject.toml +++ b/src/zeroeval/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "zeroeval" -version = "0.6.121" +version = "0.6.122" description = "ZeroEval SDK" [project.scripts] diff --git a/test_sampling_correct.py b/test_sampling_correct.py deleted file mode 100644 index 1318dd81..00000000 --- a/test_sampling_correct.py +++ /dev/null @@ -1,225 +0,0 @@ -#!/usr/bin/env python3 -"""Test script to verify sampling functionality works correctly.""" - -import os -import sys -import random -from pathlib import Path - -# Add the src directory to the path -sys.path.insert(0, str(Path(__file__).parent / 'src')) - -# Set seed for reproducible tests -random.seed(42) - -def test_sampling_rate(rate, num_traces=1000): - """Test sampling at a specific rate.""" - print(f"\n=== Testing Sampling Rate: {rate*100}% ({num_traces} traces) ===") - - # Set environment variable before importing - os.environ["ZEROEVAL_SAMPLING_RATE"] = str(rate) - - # Force reimport to get fresh tracer instance - import importlib - import zeroeval - from zeroeval.observability.tracer import Tracer - - # Reset singleton - Tracer._instance = None - - # Reimport and initialize - importlib.reload(zeroeval) - zeroeval.init(api_key="test_key", debug=False) - - # Track sampled traces - sampled_count = 0 - total_count = num_traces - - for i in range(total_count): - # Create a new trace each time - span = zeroeval.tracer.start_span(f"test_trace_{i}", is_new_trace=True) - trace_id = span.trace_id - - # Check if this trace is sampled BEFORE ending the span - # (since the trace gets cleaned up after the last span ends) - if trace_id in zeroeval.tracer._traces: - if zeroeval.tracer._traces[trace_id].is_sampled: - sampled_count += 1 - - # End the span properly to clean up - zeroeval.tracer.end_span(span) - - # After all traces, check that spans were buffered for sampled traces - # The actual number of spans in buffer depends on flushing behavior - spans_buffered = len(zeroeval.tracer._spans) - - # Calculate actual sampling rate - actual_rate = sampled_count / total_count - expected_rate = rate - - # Allow for some statistical variance (±5% absolute difference for small samples) - # For larger samples, use tighter bounds - if num_traces >= 500: - tolerance = 0.05 - else: - tolerance = 0.1 # More 
tolerance for smaller samples - - is_within_tolerance = abs(actual_rate - expected_rate) <= tolerance - - print(f" Expected rate: {expected_rate*100:.1f}%") - print(f" Actual rate: {actual_rate*100:.1f}% ({sampled_count}/{total_count} traces)") - print(f" Within tolerance (±{tolerance*100}%): {'✅ YES' if is_within_tolerance else '❌ NO'}") - - # Check for memory leaks - all traces should be cleaned up - remaining_traces = len(zeroeval.tracer._traces) - print(f" Memory check - remaining traces: {remaining_traces} {'✅' if remaining_traces == 0 else '❌ MEMORY LEAK!'}") - - # For sampled traces, we should have spans buffered (unless auto-flushed) - if expected_rate > 0: - print(f" Spans buffered: {spans_buffered} (may vary due to auto-flush)") - - # Force flush to clear buffer for next test - zeroeval.tracer.flush() - - return is_within_tolerance, actual_rate, remaining_traces == 0 - - -def test_trace_completeness(): - """Test that all spans in a trace follow the same sampling decision.""" - print("\n=== Testing Trace Completeness ===") - - os.environ["ZEROEVAL_SAMPLING_RATE"] = "0.5" # 50% sampling - - import importlib - import zeroeval - from zeroeval.observability.tracer import Tracer - - # Reset singleton - Tracer._instance = None - importlib.reload(zeroeval) - zeroeval.init(api_key="test_key", debug=False) - - # Test 10 traces with multiple spans each - traces_sampled = [] - - for i in range(10): - # Start root span (new trace) - root = zeroeval.tracer.start_span(f"root_{i}", is_new_trace=True) - trace_id = root.trace_id - - # Check sampling decision while trace is active - is_sampled = zeroeval.tracer._traces[trace_id].is_sampled - traces_sampled.append(is_sampled) - - # Create child spans in same trace - child1 = zeroeval.tracer.start_span(f"child1_{i}") - child2 = zeroeval.tracer.start_span(f"child2_{i}") - - # All spans in trace should have same sampling decision - assert child1.trace_id == trace_id, "Child should have same trace ID" - assert child2.trace_id == trace_id, "Child should have same trace ID" - assert zeroeval.tracer._traces[trace_id].is_sampled == is_sampled, "Sampling decision changed!" - assert zeroeval.tracer._traces[trace_id].ref_count == 3, f"Expected ref_count=3, got {zeroeval.tracer._traces[trace_id].ref_count}" - - # End spans in reverse order (LIFO) - zeroeval.tracer.end_span(child2) - assert zeroeval.tracer._traces[trace_id].ref_count == 2, "ref_count should be 2 after ending one child" - - zeroeval.tracer.end_span(child1) - assert zeroeval.tracer._traces[trace_id].ref_count == 1, "ref_count should be 1 after ending both children" - - zeroeval.tracer.end_span(root) - - # After ending all spans, trace should be cleaned up - assert trace_id not in zeroeval.tracer._traces, f"Trace {trace_id} not cleaned up!" 
- - sampled = sum(traces_sampled) - print(f" Traces sampled: {sampled}/10") - print(f" All spans in each trace had consistent sampling: ✅") - print(f" Reference counting worked correctly: ✅") - print(f" All traces cleaned up after completion: ✅") - - # Force flush - zeroeval.tracer.flush() - - -def test_nested_spans_cleanup(): - """Test that nested spans are properly cleaned up even when unsampled.""" - print("\n=== Testing Nested Spans Cleanup ===") - - os.environ["ZEROEVAL_SAMPLING_RATE"] = "0" # Sample nothing - - import importlib - import zeroeval - from zeroeval.observability.tracer import Tracer - - # Reset singleton - Tracer._instance = None - importlib.reload(zeroeval) - zeroeval.init(api_key="test_key", debug=False) - - # Create deeply nested spans - spans = [] - for i in range(5): - span = zeroeval.tracer.start_span(f"level_{i}", is_new_trace=(i == 0)) - spans.append(span) - - trace_id = spans[0].trace_id - - # Verify trace is not sampled - assert not zeroeval.tracer._traces[trace_id].is_sampled, "Trace should not be sampled with rate=0" - assert zeroeval.tracer._traces[trace_id].ref_count == 5, "Should have ref_count=5 for 5 active spans" - - # Check active spans stack has all 5 spans - stack = zeroeval.tracer._active_spans_ctx.get() - assert len(stack) == 5, f"Expected 5 spans in stack, got {len(stack)}" - - # End all spans in reverse order - for span in reversed(spans): - zeroeval.tracer.end_span(span) - - # Verify stack is empty - stack = zeroeval.tracer._active_spans_ctx.get() - assert len(stack) == 0, f"Stack should be empty, but has {len(stack)} spans" - - # Verify trace is cleaned up - assert trace_id not in zeroeval.tracer._traces, "Unsampled trace not cleaned up" - assert len(zeroeval.tracer._traces) == 0, f"Memory leak: {len(zeroeval.tracer._traces)} traces remain" - assert len(zeroeval.tracer._spans) == 0, f"No spans should be buffered for unsampled traces, but found {len(zeroeval.tracer._spans)}" - - print(f" Unsampled nested spans: Created 5 levels") - print(f" Stack properly cleaned: ✅") - print(f" Reference counting correct: ✅") - print(f" Trace properly cleaned: ✅") - print(f" No spans buffered: ✅") - - -def main(): - print("Testing ZeroEval Sampling Functionality") - print("="*50) - - all_passed = True - - # Test different sampling rates - rates_to_test = [0.0, 0.1, 0.25, 0.5, 0.75, 1.0] - for rate in rates_to_test: - passed, actual, no_leak = test_sampling_rate(rate, num_traces=500) - all_passed = all_passed and passed and no_leak - - # Test trace completeness - test_trace_completeness() - - # Test cleanup of unsampled spans - test_nested_spans_cleanup() - - print("\n" + "="*50) - if all_passed: - print("✅ All sampling tests PASSED!") - else: - print("❌ Some sampling tests FAILED - check the output above") - - return 0 if all_passed else 1 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/test_sampling_debug.py b/test_sampling_debug.py deleted file mode 100644 index 064eae7e..00000000 --- a/test_sampling_debug.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python3 -"""Debug script to understand sampling issue.""" - -import os -import sys -from pathlib import Path - -# Add the src directory to the path -sys.path.insert(0, str(Path(__file__).parent / 'src')) - -def test_single_trace(): - """Test a single trace to debug the issue.""" - - # Set environment variable before importing - os.environ["ZEROEVAL_SAMPLING_RATE"] = "1.0" - - # Import fresh - import zeroeval - from zeroeval.observability.tracer import Tracer - - # Reset singleton - 
Tracer._instance = None - - # Initialize - zeroeval.init(api_key="test_key", debug=True) - - print(f"Tracer sampling rate: {zeroeval.tracer._sampling_rate}") - print(f"Tracer instance: {zeroeval.tracer}") - - # Create a single span/trace - span = zeroeval.tracer.start_span("test_trace", is_new_trace=True) - trace_id = span.trace_id - - print(f"Trace ID: {trace_id}") - print(f"Traces registry: {zeroeval.tracer._traces}") - - if trace_id in zeroeval.tracer._traces: - trace_info = zeroeval.tracer._traces[trace_id] - print(f"Trace is_sampled: {trace_info.is_sampled}") - print(f"Trace ref_count: {trace_info.ref_count}") - else: - print("ERROR: Trace not in registry!") - - # End the span - zeroeval.tracer.end_span(span) - - print(f"After ending span, traces registry: {zeroeval.tracer._traces}") - print(f"Buffered spans: {len(zeroeval.tracer._spans)}") - - -if __name__ == "__main__": - test_single_trace() diff --git a/test_sampling_fix.py b/test_sampling_fix.py deleted file mode 100644 index a9fa7e2f..00000000 --- a/test_sampling_fix.py +++ /dev/null @@ -1,195 +0,0 @@ -#!/usr/bin/env python3 -"""Test script to verify sampling functionality works correctly.""" - -import os -import sys -import random -from pathlib import Path - -# Add the src directory to the path -sys.path.insert(0, str(Path(__file__).parent / 'src')) - -# Set seed for reproducible tests -random.seed(42) - -def test_sampling_rate(rate, num_traces=1000): - """Test sampling at a specific rate.""" - print(f"\n=== Testing Sampling Rate: {rate*100}% ({num_traces} traces) ===") - - # Set environment variable before importing - os.environ["ZEROEVAL_SAMPLING_RATE"] = str(rate) - - # Force reimport to get fresh tracer instance - import importlib - import zeroeval - from zeroeval.observability.tracer import Tracer - - # Reset singleton - Tracer._instance = None - - # Reimport and initialize - importlib.reload(zeroeval) - zeroeval.init(api_key="test_key", debug=False) - - # Track sampled traces - sampled_count = 0 - total_count = num_traces - - for i in range(total_count): - # Create a new trace each time - span = zeroeval.tracer.start_span(f"test_trace_{i}", is_new_trace=True) - trace_id = span.trace_id - - # Check if this trace is sampled - if trace_id in zeroeval.tracer._traces: - if zeroeval.tracer._traces[trace_id].is_sampled: - sampled_count += 1 - - # End the span properly to clean up - zeroeval.tracer.end_span(span) - - # Calculate actual sampling rate - actual_rate = sampled_count / total_count - expected_rate = rate - - # Allow for some statistical variance (±5% absolute difference) - tolerance = 0.05 - is_within_tolerance = abs(actual_rate - expected_rate) <= tolerance - - print(f" Expected rate: {expected_rate*100:.1f}%") - print(f" Actual rate: {actual_rate*100:.1f}% ({sampled_count}/{total_count} traces)") - print(f" Within tolerance (±{tolerance*100}%): {'✅ YES' if is_within_tolerance else '❌ NO'}") - - # Check for memory leaks - all traces should be cleaned up - remaining_traces = len(zeroeval.tracer._traces) - print(f" Memory check - remaining traces: {remaining_traces} {'✅' if remaining_traces == 0 else '❌ MEMORY LEAK!'}") - - return is_within_tolerance, actual_rate, remaining_traces == 0 - - -def test_trace_completeness(): - """Test that all spans in a trace follow the same sampling decision.""" - print("\n=== Testing Trace Completeness ===") - - os.environ["ZEROEVAL_SAMPLING_RATE"] = "0.5" # 50% sampling - - import importlib - import zeroeval - from zeroeval.observability.tracer import Tracer - - # Reset singleton - 
Tracer._instance = None - importlib.reload(zeroeval) - zeroeval.init(api_key="test_key", debug=False) - - # Test 10 traces with multiple spans each - traces_sampled = [] - - for i in range(10): - # Start root span (new trace) - root = zeroeval.tracer.start_span(f"root_{i}", is_new_trace=True) - trace_id = root.trace_id - is_sampled = zeroeval.tracer._traces[trace_id].is_sampled - traces_sampled.append(is_sampled) - - # Create child spans in same trace - child1 = zeroeval.tracer.start_span(f"child1_{i}") - child2 = zeroeval.tracer.start_span(f"child2_{i}") - - # All spans in trace should have same sampling decision - assert child1.trace_id == trace_id, "Child should have same trace ID" - assert child2.trace_id == trace_id, "Child should have same trace ID" - assert zeroeval.tracer._traces[trace_id].is_sampled == is_sampled, "Sampling decision changed!" - - # End spans in reverse order (LIFO) - zeroeval.tracer.end_span(child2) - zeroeval.tracer.end_span(child1) - zeroeval.tracer.end_span(root) - - # After ending all spans, trace should be cleaned up - assert trace_id not in zeroeval.tracer._traces, f"Trace {trace_id} not cleaned up!" - - sampled = sum(traces_sampled) - print(f" Traces sampled: {sampled}/10") - print(f" All spans in each trace had consistent sampling: ✅") - print(f" All traces cleaned up after completion: ✅") - - -def test_nested_spans_cleanup(): - """Test that nested spans are properly cleaned up even when unsampled.""" - print("\n=== Testing Nested Spans Cleanup ===") - - os.environ["ZEROEVAL_SAMPLING_RATE"] = "0" # Sample nothing - - import importlib - import zeroeval - from zeroeval.observability.tracer import Tracer - - # Reset singleton - Tracer._instance = None - importlib.reload(zeroeval) - zeroeval.init(api_key="test_key", debug=False) - - # Create deeply nested spans - spans = [] - for i in range(5): - span = zeroeval.tracer.start_span(f"level_{i}", is_new_trace=(i == 0)) - spans.append(span) - - trace_id = spans[0].trace_id - - # Verify trace is not sampled - assert not zeroeval.tracer._traces[trace_id].is_sampled, "Trace should not be sampled with rate=0" - - # Check active spans stack has all 5 spans - stack = zeroeval.tracer._active_spans_ctx.get() - assert len(stack) == 5, f"Expected 5 spans in stack, got {len(stack)}" - - # End all spans in reverse order - for span in reversed(spans): - zeroeval.tracer.end_span(span) - - # Verify stack is empty - stack = zeroeval.tracer._active_spans_ctx.get() - assert len(stack) == 0, f"Stack should be empty, but has {len(stack)} spans" - - # Verify trace is cleaned up - assert trace_id not in zeroeval.tracer._traces, "Unsampled trace not cleaned up" - assert len(zeroeval.tracer._traces) == 0, f"Memory leak: {len(zeroeval.tracer._traces)} traces remain" - assert len(zeroeval.tracer._spans) == 0, f"No spans should be buffered for unsampled traces, but found {len(zeroeval.tracer._spans)}" - - print(f" Unsampled nested spans: Created 5 levels") - print(f" Stack properly cleaned: ✅") - print(f" Trace properly cleaned: ✅") - print(f" No spans buffered: ✅") - - -def main(): - print("Testing ZeroEval Sampling Functionality") - print("="*50) - - all_passed = True - - # Test different sampling rates - rates_to_test = [0.0, 0.1, 0.25, 0.5, 0.75, 1.0] - for rate in rates_to_test: - passed, actual, no_leak = test_sampling_rate(rate, num_traces=500) - all_passed = all_passed and passed and no_leak - - # Test trace completeness - test_trace_completeness() - - # Test cleanup of unsampled spans - test_nested_spans_cleanup() - - print("\n" + 
"="*50) - if all_passed: - print("✅ All sampling tests PASSED!") - else: - print("❌ Some sampling tests FAILED - check the output above") - - return 0 if all_passed else 1 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/tests/test_client_feedback.py b/tests/test_client_feedback.py new file mode 100644 index 00000000..ff5eb6fe --- /dev/null +++ b/tests/test_client_feedback.py @@ -0,0 +1,203 @@ +"""Tests for ZeroEval client feedback functionality.""" + +import json +from unittest.mock import Mock, patch + +import pytest + +from zeroeval.client import ZeroEval +from zeroeval.errors import PromptRequestError + + +@pytest.fixture +def client(): + """Create a ZeroEval client for testing.""" + return ZeroEval(api_key="test-api-key", base_url="https://api.test.com") + + +@patch("zeroeval.client.requests.post") +def test_send_feedback_success(mock_post, client): + """Test successful feedback submission.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "id": "feedback-123", + "completion_id": "completion-456", + "prompt_id": "prompt-789", + "prompt_version_id": "version-abc", + "project_id": "project-def", + "thumbs_up": True, + "reason": "Great response", + "expected_output": None, + "metadata": {}, + "created_by": "user-123", + "created_at": "2025-01-01T00:00:00Z", + "updated_at": "2025-01-01T00:00:00Z", + } + mock_post.return_value = mock_response + + result = client.send_feedback( + prompt_slug="test-prompt", + completion_id="completion-456", + thumbs_up=True, + reason="Great response", + ) + + # Verify the request was made correctly + mock_post.assert_called_once() + call_args = mock_post.call_args + + # Check URL + assert call_args[0][0] == "https://api.test.com/v1/prompts/test-prompt/completions/completion-456/feedback" + + # Check headers + headers = call_args[1]["headers"] + assert headers["Authorization"] == "Bearer test-api-key" + assert headers["Content-Type"] == "application/json" + + # Check payload + payload = call_args[1]["json"] + assert payload["thumbs_up"] is True + assert payload["reason"] == "Great response" + assert "expected_output" not in payload # Not included when None + assert "metadata" not in payload # Not included when None + + # Check response + assert result["id"] == "feedback-123" + assert result["thumbs_up"] is True + + +@patch("zeroeval.client.requests.post") +def test_send_feedback_negative_with_expected_output(mock_post, client): + """Test negative feedback with expected output.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "id": "feedback-456", + "completion_id": "completion-789", + "thumbs_up": False, + "reason": "Incorrect format", + "expected_output": "Should be JSON", + } + mock_post.return_value = mock_response + + result = client.send_feedback( + prompt_slug="test-prompt", + completion_id="completion-789", + thumbs_up=False, + reason="Incorrect format", + expected_output="Should be JSON", + ) + + # Check payload includes all fields + payload = mock_post.call_args[1]["json"] + assert payload["thumbs_up"] is False + assert payload["reason"] == "Incorrect format" + assert payload["expected_output"] == "Should be JSON" + + assert result["id"] == "feedback-456" + + +@patch("zeroeval.client.requests.post") +def test_send_feedback_with_metadata(mock_post, client): + """Test feedback submission with custom metadata.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "id": "feedback-789", + 
"thumbs_up": True, + "metadata": {"source": "automated", "version": "1.0"}, + } + mock_post.return_value = mock_response + + result = client.send_feedback( + prompt_slug="test-prompt", + completion_id="completion-abc", + thumbs_up=True, + metadata={"source": "automated", "version": "1.0"}, + ) + + # Check metadata is included + payload = mock_post.call_args[1]["json"] + assert payload["metadata"] == {"source": "automated", "version": "1.0"} + + assert result["metadata"]["source"] == "automated" + + +@patch("zeroeval.client.requests.post") +def test_send_feedback_minimal(mock_post, client): + """Test feedback with only required fields.""" + mock_response = Mock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "id": "feedback-minimal", + "thumbs_up": True, + } + mock_post.return_value = mock_response + + result = client.send_feedback( + prompt_slug="test-prompt", + completion_id="completion-xyz", + thumbs_up=True, + ) + + # Check only thumbs_up is in payload + payload = mock_post.call_args[1]["json"] + assert payload == {"thumbs_up": True} + + assert result["id"] == "feedback-minimal" + + +@patch("zeroeval.client.requests.post") +def test_send_feedback_404_error(mock_post, client): + """Test feedback submission when completion not found.""" + mock_response = Mock() + mock_response.status_code = 404 + mock_response.text = "Completion not found" + mock_post.return_value = mock_response + + with pytest.raises(PromptRequestError) as exc_info: + client.send_feedback( + prompt_slug="test-prompt", + completion_id="nonexistent", + thumbs_up=True, + ) + + assert "send_feedback failed" in str(exc_info.value) + assert "404" in str(exc_info.value.status) + + +@patch("zeroeval.client.requests.post") +def test_send_feedback_500_error(mock_post, client): + """Test feedback submission with server error.""" + mock_response = Mock() + mock_response.status_code = 500 + mock_response.text = "Internal server error" + mock_post.return_value = mock_response + + with pytest.raises(PromptRequestError) as exc_info: + client.send_feedback( + prompt_slug="test-prompt", + completion_id="completion-123", + thumbs_up=False, + reason="Test", + ) + + assert "send_feedback failed" in str(exc_info.value) + assert "500" in str(exc_info.value.status) + + +@patch("zeroeval.client.requests.post") +def test_send_feedback_timeout(mock_post, client): + """Test feedback submission handles timeout correctly.""" + mock_post.side_effect = Exception("Connection timeout") + + with pytest.raises(Exception) as exc_info: + client.send_feedback( + prompt_slug="test-prompt", + completion_id="completion-123", + thumbs_up=True, + ) + + assert "timeout" in str(exc_info.value).lower() +