diff --git a/.gitignore b/.gitignore index efb5264..cf5ea7e 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,6 @@ get_helm.sh .env .ruff_cache data/minio -eval.txt \ No newline at end of file +eval.txt +services/AgentService/prompts/tests +.DS_Store \ No newline at end of file diff --git a/services/AgentService/eval-harness/Makefile b/services/AgentService/eval-harness/Makefile deleted file mode 100644 index 7ba5229..0000000 --- a/services/AgentService/eval-harness/Makefile +++ /dev/null @@ -1,15 +0,0 @@ -promptfoo: - @echo "$(GREEN)Checking npx installation...$(NC)" - @if ! command -v npx >/dev/null 2>&1; then \ - echo "$(RED)Error: npx is not installed. Please install Node.js and npm first.$(NC)"; \ - exit 1; \ - fi - @echo "$(GREEN)Checking promptfoo installation...$(NC)" - @if ! npx promptfoo@latest --version >/dev/null 2>&1; then \ - echo "$(GREEN)Installing promptfoo...$(NC)"; \ - npx promptfoo@latest --version >/dev/null 2>&1 || true; \ - fi - @echo "$(GREEN)Running promptfoo in AgentService prompts directory...$(NC)" - npx promptfoo@latest eval - -.PHONY: promptfoo \ No newline at end of file diff --git a/services/AgentService/eval-harness/__init__.py b/services/AgentService/eval-harness/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/services/AgentService/eval-harness/promptfooconfig.yaml b/services/AgentService/eval-harness/promptfooconfig.yaml deleted file mode 100644 index 41c47f1..0000000 --- a/services/AgentService/eval-harness/promptfooconfig.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# yaml-language-server: $schema=https://promptfoo.dev/config-schema.json - -# Learn more about building a configuration: https://promptfoo.dev/docs/configuration/guide - -description: "Podcast Prompts" - -prompts: - - file://./prompts_iter.py:PodcastPrompts.raw_outline_prompt - -providers: - - id: "https://integrate.api.nvidia.com/v1/chat/completions" - config: - method: "POST" - headers: - "Content-Type": "application/json" - "Authorization": 
"Bearer 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" - body: - model: "meta/llama-3.1-405b-instruct" - messages: - - role: "user" - content: "{{prompt}}" - responseParser: "json.choices[0].message.content" - -tests: - - vars: - duration: 10 - text: file://./eval.txt diff --git a/services/AgentService/eval-harness/tests/schemas/conversation.json b/services/AgentService/eval-harness/tests/schemas/conversation.json deleted file mode 100644 index 24c0484..0000000 --- a/services/AgentService/eval-harness/tests/schemas/conversation.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "$defs": { - "DialogueEntry": { - "properties": { - "text": { - "title": "Text", - "type": "string" - }, - "speaker": { - "enum": [ - "speaker-1", - "speaker-2" - ], - "title": "Speaker", - "type": "string" - } - }, - "required": [ - "text", - "speaker" - ], - "title": "DialogueEntry", - "type": "object" - } - }, - "properties": { - "scratchpad": { - "title": "Scratchpad", - "type": "string" - }, - "dialogue": { - "items": { - "$ref": "#/$defs/DialogueEntry" - }, - "title": "Dialogue", - "type": "array" - } - }, - "required": [ - "scratchpad", - "dialogue" - ], - "title": "Conversation", - "type": "object" -} \ No newline at end of file diff --git a/services/AgentService/eval-harness/tests/schemas/podcast_outline.json b/services/AgentService/eval-harness/tests/schemas/podcast_outline.json deleted file mode 100644 index 7a07b89..0000000 --- a/services/AgentService/eval-harness/tests/schemas/podcast_outline.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "$defs": { - "PodcastSegment": { - "properties": { - "section": { - "title": "Section", - "type": "string" - }, - "descriptions": { - "items": { - "type": "string" - }, - "title": "Descriptions", - "type": "array" - }, - "duration": { - "title": "Duration", - "type": "integer" - } - }, - "required": [ - "section", - "descriptions", - "duration" - ], - "title": "PodcastSegment", - "type": "object" - } - }, - "properties": { - "title": { 
- "title": "Title", - "type": "string" - }, - "segments": { - "items": { - "$ref": "#/$defs/PodcastSegment" - }, - "title": "Segments", - "type": "array" - } - }, - "required": [ - "title", - "segments" - ], - "title": "PodcastOutline", - "type": "object" -} \ No newline at end of file diff --git a/services/AgentService/main.py b/services/AgentService/main.py index cd3540a..289c33a 100644 --- a/services/AgentService/main.py +++ b/services/AgentService/main.py @@ -1,5 +1,5 @@ from fastapi import FastAPI, BackgroundTasks, HTTPException -from shared.shared_types import ServiceType, JobStatus, Conversation +from shared.shared_types import ServiceType, JobStatus, Conversation, PodcastOutline from shared.storage import StorageManager from shared.job import JobStatusManager from shared.otel import OpenTelemetryInstrumentation, OpenTelemetryConfig @@ -33,17 +33,6 @@ # Data Models -class PodcastSegment(BaseModel): - section: str - descriptions: List[str] - duration: int - - -class PodcastOutline(BaseModel): - title: str - segments: List[PodcastSegment] - - class TranscriptionRequest(BaseModel): markdown: str duration: int = 20 diff --git a/services/AgentService/prompts/Makefile b/services/AgentService/prompts/Makefile new file mode 100644 index 0000000..6327b6d --- /dev/null +++ b/services/AgentService/prompts/Makefile @@ -0,0 +1,56 @@ +# Colors for better visibility +GREEN=\033[0;32m +RED=\033[0;31m +NC=\033[0m + +# Directory structure +CONFIG_DIR=configs +TEST_DIR=tests +OUTPUTS_DIR=$(TEST_DIR)/outputs +SCHEMAS_DIR=$(TEST_DIR)/schemas +INPUTS_DIR=$(TEST_DIR)/inputs + +# Ensure promptfoo is installed +ensure-promptfoo: + @echo "$(GREEN)Checking promptfoo installation...$(NC)" + @if ! command -v promptfoo >/dev/null 2>&1; then \ + echo "$(RED)Error: promptfoo is not installed. 
Installing via brew...$(NC)"; \ + brew install promptfoo || { echo "$(RED)Failed to install promptfoo$(NC)"; exit 1; }; \ + fi + +# Generate JSON schemas +generate-schemas: + @echo "$(GREEN)Generating JSON schemas from Pydantic models...$(NC)" + @python scripts/generate_schemas.py $(SCHEMAS_DIR) + +# Setup test environment +setup-test: ensure-promptfoo + @echo "$(GREEN)Setting up test environment...$(NC)" + @mkdir -p $(INPUTS_DIR) $(OUTPUTS_DIR) $(SCHEMAS_DIR) + @make generate-schemas + +# Run all prompt tests +test-prompts: setup-test + @echo "$(GREEN)Running all prompt tests...$(NC)" + @cd $(CONFIG_DIR) && python ../scripts/run_tests.py + +# Run up to a specific stage +test-upto: setup-test + @if [ -z "$(stage)" ]; then \ + echo "$(RED)Error: Please specify a stage number with 'stage=N'$(NC)"; \ + exit 1; \ + fi + @echo "$(GREEN)Running prompt tests up to stage $(stage)...$(NC)" + @cd $(CONFIG_DIR) && python ../scripts/run_tests.py --up-to $(stage) + +# List all available test stages +test-list: + @echo "$(GREEN)Listing available test stages...$(NC)" + @python scripts/run_tests.py --list + +# Clean test outputs +clean: + @echo "$(GREEN)Cleaning test outputs...$(NC)" + rm -rf $(TEST_DIR) + +.PHONY: ensure-promptfoo generate-schemas setup-test test-prompts test-upto clean test-list \ No newline at end of file diff --git a/services/AgentService/eval-harness/README.md b/services/AgentService/prompts/README.md similarity index 100% rename from services/AgentService/eval-harness/README.md rename to services/AgentService/prompts/README.md diff --git a/services/AgentService/prompts/__init__.py b/services/AgentService/prompts/__init__.py new file mode 100644 index 0000000..3865200 --- /dev/null +++ b/services/AgentService/prompts/__init__.py @@ -0,0 +1,21 @@ +from .prompts import ( + RAW_OUTLINE_PROMPT, + OUTLINE_PROMPT, + SEGMENT_TRANSCRIPT_PROMPT, + DEEP_DIVE_PROMPT, + RAW_PODCAST_DIALOGUE_PROMPT_v2, + FUSE_OUTLINE_PROMPT, + REVISE_PROMPT, + PODCAST_DIALOGUE_PROMPT, +) 
+ +__all__ = [ + 'RAW_OUTLINE_PROMPT', + 'OUTLINE_PROMPT', + 'SEGMENT_TRANSCRIPT_PROMPT', + 'DEEP_DIVE_PROMPT', + 'RAW_PODCAST_DIALOGUE_PROMPT_v2', + 'FUSE_OUTLINE_PROMPT', + 'REVISE_PROMPT', + 'PODCAST_DIALOGUE_PROMPT', +] \ No newline at end of file diff --git a/services/AgentService/prompts/configs/01_raw_outline.yaml b/services/AgentService/prompts/configs/01_raw_outline.yaml new file mode 100644 index 0000000..3d0a7cd --- /dev/null +++ b/services/AgentService/prompts/configs/01_raw_outline.yaml @@ -0,0 +1,28 @@ +description: "01 Raw Outline Generation" +evaluateOptions: + maxConcurrency: 1 + showProgressBar: true + +prompts: + - "file://../prompts.py:PodcastPrompts.raw_outline_prompt" + +providers: + - id: "file://../providers/nim-405b.py" + label: "405b" + +tests: + - description: "Raw Outline Generation" + vars: + text: file://../data/eval.txt + duration: 15 + storeOutputAs: raw_outline_output + assert: + - type: llm-rubric + value: | + Evaluate if the outline: + 1. Has clear sections for background, innovation, impact, and future work + 2. Makes innovation the focus + 3. Allocates time appropriately for {{duration}} minutes + 4. 
Maintains technical accuracy while being accessible + Pass if all scores >= 0.95 + provider: "file://../scripts/nim-provider.py" diff --git a/services/AgentService/prompts/configs/02_outline_json.yaml b/services/AgentService/prompts/configs/02_outline_json.yaml new file mode 100644 index 0000000..b6a6a4d --- /dev/null +++ b/services/AgentService/prompts/configs/02_outline_json.yaml @@ -0,0 +1,25 @@ +description: "02 Outline JSON Generation" +evaluateOptions: + maxConcurrency: 1 + showProgressBar: true + +prompts: + - "file://../prompts.py:PodcastPrompts.outline_prompt" + +providers: + - id: "file://../providers/nim-8b.py" + label: "8b" + config: + json_schema: file://../tests/schemas/podcast_outline.json + +tests: + - description: "Outline JSON Generation" + vars: + text: file://../tests/outputs/01_raw_outline_results.json + schema: file://../tests/schemas/podcast_outline.json + options: + transformVars: file://../scripts/get-output.py:get_transform + storeOutputAs: outline_json_output + assert: + - type: is-json + value: file://../tests/schemas/podcast_outline.json diff --git a/services/AgentService/prompts.py b/services/AgentService/prompts/prompts.py similarity index 100% rename from services/AgentService/prompts.py rename to services/AgentService/prompts/prompts.py diff --git a/services/AgentService/prompts/providers/nim-405b.py b/services/AgentService/prompts/providers/nim-405b.py new file mode 100644 index 0000000..c34b4b9 --- /dev/null +++ b/services/AgentService/prompts/providers/nim-405b.py @@ -0,0 +1,79 @@ +# chat_provider.py +import requests +import json +from typing import Dict, Any + + +def call_api( + prompt: str, options: Dict[str, Any], context: Dict[str, Any] +) -> Dict[str, Any]: + """ + Custom provider for chat completions using your existing infrastructure. 
+ + Args: + prompt: The prompt text or JSON string of messages + options: Configuration options from the YAML file + context: Test context including variables used + + Returns: + Dict containing output or error + """ + try: + # Get configuration from options + config = options.get("config", {}) + api_base = config.get("api_base", "https://youngthug.demoz.io") + api_key = config.get("api_key") + model = config.get("model", "meta/llama-3.1-405b-instruct") + temperature = config.get("temperature", 0.7) + max_tokens = config.get("max_tokens", 1000) + + # Setup headers + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + # Handle different prompt formats + try: + # Check if prompt is a JSON string containing messages + messages = json.loads(prompt) + if isinstance(messages, list): + chat_messages = messages + else: + chat_messages = [{"role": "user", "content": prompt}] + except json.JSONDecodeError: + # If not JSON, treat as regular prompt + chat_messages = [{"role": "user", "content": prompt}] + + # Prepare payload + payload = { + "model": model, + "messages": chat_messages, + "temperature": temperature, + "max_tokens": max_tokens, + "stream": False, + } + + # Make request + response = requests.post( + f"{api_base}/v1/chat/completions", headers=headers, json=payload + ) + response.raise_for_status() + result = response.json() + + if "choices" in result and len(result["choices"]) > 0: + output = result["choices"][0]["message"]["content"] + return { + "output": output, + "tokenUsage": result.get( + "usage", {"total": 0, "prompt": 0, "completion": 0} + ), + } + else: + return {"error": "No choices in response"} + + except requests.exceptions.RequestException as e: + return { + "error": f"Request error: {str(e)}\nResponse: {response.text if 'response' in locals() else 'No response'}" + } + except Exception as e: + return {"error": f"Unexpected error: {str(e)}"} diff --git 
a/services/AgentService/prompts/providers/nim-70b.py b/services/AgentService/prompts/providers/nim-70b.py new file mode 100644 index 0000000..b0b70bd --- /dev/null +++ b/services/AgentService/prompts/providers/nim-70b.py @@ -0,0 +1,79 @@ +# chat_provider.py +import requests +import json +from typing import Dict, Any + + +def call_api( + prompt: str, options: Dict[str, Any], context: Dict[str, Any] +) -> Dict[str, Any]: + """ + Custom provider for chat completions using your existing infrastructure. + + Args: + prompt: The prompt text or JSON string of messages + options: Configuration options from the YAML file + context: Test context including variables used + + Returns: + Dict containing output or error + """ + try: + # Get configuration from options + config = options.get("config", {}) + api_base = config.get("api_base", "https://small-nim-pc8kmx5ae.brevlab.com/") + api_key = config.get("api_key") + model = config.get("model", "meta/llama-3.1-70b-instruct") + temperature = config.get("temperature", 0.7) + max_tokens = config.get("max_tokens", 1000) + + # Setup headers + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + # Handle different prompt formats + try: + # Check if prompt is a JSON string containing messages + messages = json.loads(prompt) + if isinstance(messages, list): + chat_messages = messages + else: + chat_messages = [{"role": "user", "content": prompt}] + except json.JSONDecodeError: + # If not JSON, treat as regular prompt + chat_messages = [{"role": "user", "content": prompt}] + + # Prepare payload + payload = { + "model": model, + "messages": chat_messages, + "temperature": temperature, + "max_tokens": max_tokens, + "stream": False, + } + + # Make request + response = requests.post( + f"{api_base}/v1/chat/completions", headers=headers, json=payload + ) + response.raise_for_status() + result = response.json() + + if "choices" in result and len(result["choices"]) > 0: + output = 
result["choices"][0]["message"]["content"] + return { + "output": output, + "tokenUsage": result.get( + "usage", {"total": 0, "prompt": 0, "completion": 0} + ), + } + else: + return {"error": "No choices in response"} + + except requests.exceptions.RequestException as e: + return { + "error": f"Request error: {str(e)}\nResponse: {response.text if 'response' in locals() else 'No response'}" + } + except Exception as e: + return {"error": f"Unexpected error: {str(e)}"} diff --git a/services/AgentService/prompts/providers/nim-8b.py b/services/AgentService/prompts/providers/nim-8b.py new file mode 100644 index 0000000..97538c4 --- /dev/null +++ b/services/AgentService/prompts/providers/nim-8b.py @@ -0,0 +1,89 @@ +# Note we use 8b in json mode + +import requests +import json +from typing import Dict, Any + + +def call_api( + prompt: str, options: Dict[str, Any], context: Dict[str, Any] +) -> Dict[str, Any]: + """ + Custom provider for chat completions using your existing infrastructure. + + Args: + prompt: The prompt text or JSON string of messages + options: Configuration options from the YAML file + context: Test context including variables used + + Returns: + Dict containing output or error + """ + try: + # Get configuration from options + config = options.get("config", {}) + api_base = config.get("api_base", "https://nim-pc8kmx5ae.brevlab.com") + api_key = config.get("api_key") + model = config.get("model", "meta/llama-3.1-8b-instruct") + temperature = config.get("temperature", 0.7) + max_tokens = config.get("max_tokens", 1000) + json_schema = config.get("json_schema", "") + + # Setup headers + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + # Handle different prompt formats + try: + # Check if prompt is a JSON string containing messages + messages = json.loads(prompt) + if isinstance(messages, list): + chat_messages = messages + else: + chat_messages = [{"role": "user", "content": prompt}] + except 
json.JSONDecodeError: + # If not JSON, treat as regular prompt + chat_messages = [{"role": "user", "content": prompt}] + + # Prepare payload + payload = { + "model": model, + "messages": chat_messages, + "temperature": temperature, + "max_tokens": max_tokens, + "stream": False, + } + + if json_schema != "": + # Resolve the schema path from config, stripping any 'file://' prefix + # (was previously a machine-specific hard-coded absolute path) + json_schema_path = json_schema.replace("file://", "") + with open(json_schema_path, "r") as file: + json_schema_content = file.read() + payload["nvext"] = {"guided_json": json_schema_content} + + # Make request + response = requests.post( + f"{api_base}/v1/chat/completions", headers=headers, json=payload + ) + response.raise_for_status() + result = response.json() + + if "choices" in result and len(result["choices"]) > 0: + output = result["choices"][0]["message"]["content"] + return { + "output": output, + "tokenUsage": result.get( + "usage", {"total": 0, "prompt": 0, "completion": 0} + ), + } + else: + return {"error": "No choices in response"} + + except requests.exceptions.RequestException as e: + return { + "error": f"Request error: {str(e)}\nResponse: {response.text if 'response' in locals() else 'No response'}" + } + except Exception as e: + return {"error": f"Unexpected error: {str(e)}"} diff --git a/services/AgentService/prompts/scripts/generate_schemas.py b/services/AgentService/prompts/scripts/generate_schemas.py new file mode 100644 index 0000000..ab6784e --- /dev/null +++ b/services/AgentService/prompts/scripts/generate_schemas.py @@ -0,0 +1,36 @@ +# tests/generate_schemas.py +import json +import sys +from pathlib import Path + +# Make the repo root importable; this must run BEFORE the shared import below +root_dir = Path(__file__).resolve().parents[4] +sys.path.append(str(root_dir)) +from shared.shared.shared_types import Conversation, PodcastOutline + + +def generate_schemas(output_dir: Path): + """Generate JSON schemas from Pydantic models."""
+ # Ensure output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + # Generate and save PodcastOutline schema + podcast_schema = PodcastOutline.model_json_schema() + with open(output_dir / "podcast_outline.json", "w") as f: + json.dump(podcast_schema, f, indent=2) + print("Generated podcast_outline.json") + + # Generate and save Conversation schema + conversation_schema = Conversation.model_json_schema() + with open(output_dir / "conversation.json", "w") as f: + json.dump(conversation_schema, f, indent=2) + print("Generated conversation.json") + + +if __name__ == "__main__": + if len(sys.argv) > 1: + output_dir = Path(sys.argv[1]) + else: + output_dir = Path(__file__).parent / "schemas" + + generate_schemas(output_dir) diff --git a/services/AgentService/prompts/scripts/get-output.py b/services/AgentService/prompts/scripts/get-output.py new file mode 100644 index 0000000..a9bc483 --- /dev/null +++ b/services/AgentService/prompts/scripts/get-output.py @@ -0,0 +1,29 @@ +import json + + +def get_transform(vars, context): + """ + Transform function for promptfoo that extracts the output field from evaluation results JSON + while preserving other variables. 
+ + Args: + vars (dict): Variables passed from promptfoo config + context (dict): Additional context from promptfoo + + Returns: + dict: Transformed variables including the extracted output + """ + try: + # Remove 'file://' prefix if present and get absolute path + file_path = vars["text"].replace("file://", "") + + # Read and parse the JSON file directly without joining paths + with open(file_path, "r") as f: + data = json.load(f) + + # Extract the output and return all vars with transformed text + return {**vars, "text": data["results"]["results"][0]["response"]["output"]} + + except Exception as e: + print(f"Error transforming variables: {e}") + return {**vars, "error": f"Failed to transform variables: {str(e)}"} diff --git a/services/AgentService/prompts/scripts/nim-provider.py b/services/AgentService/prompts/scripts/nim-provider.py new file mode 100644 index 0000000..c34b4b9 --- /dev/null +++ b/services/AgentService/prompts/scripts/nim-provider.py @@ -0,0 +1,79 @@ +# chat_provider.py +import requests +import json +from typing import Dict, Any + + +def call_api( + prompt: str, options: Dict[str, Any], context: Dict[str, Any] +) -> Dict[str, Any]: + """ + Custom provider for chat completions using your existing infrastructure. 
+ + Args: + prompt: The prompt text or JSON string of messages + options: Configuration options from the YAML file + context: Test context including variables used + + Returns: + Dict containing output or error + """ + try: + # Get configuration from options + config = options.get("config", {}) + api_base = config.get("api_base", "https://youngthug.demoz.io") + api_key = config.get("api_key") + model = config.get("model", "meta/llama-3.1-405b-instruct") + temperature = config.get("temperature", 0.7) + max_tokens = config.get("max_tokens", 1000) + + # Setup headers + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + # Handle different prompt formats + try: + # Check if prompt is a JSON string containing messages + messages = json.loads(prompt) + if isinstance(messages, list): + chat_messages = messages + else: + chat_messages = [{"role": "user", "content": prompt}] + except json.JSONDecodeError: + # If not JSON, treat as regular prompt + chat_messages = [{"role": "user", "content": prompt}] + + # Prepare payload + payload = { + "model": model, + "messages": chat_messages, + "temperature": temperature, + "max_tokens": max_tokens, + "stream": False, + } + + # Make request + response = requests.post( + f"{api_base}/v1/chat/completions", headers=headers, json=payload + ) + response.raise_for_status() + result = response.json() + + if "choices" in result and len(result["choices"]) > 0: + output = result["choices"][0]["message"]["content"] + return { + "output": output, + "tokenUsage": result.get( + "usage", {"total": 0, "prompt": 0, "completion": 0} + ), + } + else: + return {"error": "No choices in response"} + + except requests.exceptions.RequestException as e: + return { + "error": f"Request error: {str(e)}\nResponse: {response.text if 'response' in locals() else 'No response'}" + } + except Exception as e: + return {"error": f"Unexpected error: {str(e)}"} diff --git 
a/services/AgentService/prompts/scripts/run_tests.py b/services/AgentService/prompts/scripts/run_tests.py new file mode 100644 index 0000000..c728349 --- /dev/null +++ b/services/AgentService/prompts/scripts/run_tests.py @@ -0,0 +1,78 @@ +# prompts/scripts/run_tests.py +import argparse +import subprocess +from pathlib import Path +from typing import List + + +class PromptTestRunner: + def __init__(self, config_dir: str = "configs"): + self.base_dir = Path(__file__).parent.parent # prompts directory + self.config_dir = self.base_dir / config_dir + self.outputs_dir = self.base_dir / "tests/outputs" + + def get_stage_configs(self) -> List[Path]: + """Get all numbered configuration files in order.""" + return sorted(self.config_dir.glob("[0-9][0-9]_*.yaml")) + + def run_stage(self, config_path: Path) -> bool: + """Run a single test stage using promptfoo.""" + print(f"\n=== Running stage: {config_path.stem} ===") + + # Create output path for this stage + output_path = self.outputs_dir / f"{config_path.stem}_results.json" + + result = subprocess.run( + ["promptfoo", "eval", "-c", str(config_path), "--output", str(output_path)], + capture_output=True, + text=True, + ) + + # Print the output regardless of success/failure + if result.stdout: + print(result.stdout) + if result.returncode != 0: + print(f"Error: {result.stderr}") + + return result.returncode == 0 + + def run_all_stages(self) -> None: + """Run all test stages in order.""" + for config in self.get_stage_configs(): + if not self.run_stage(config): + print(f"\nStage {config.stem} failed. Stopping pipeline.") + break + + def run_up_to_stage(self, target_stage: int) -> None: + """Run all stages up to and including the target stage number.""" + for config in self.get_stage_configs(): + stage_num = int(config.stem.split("_")[0]) + if stage_num > target_stage: + break + if not self.run_stage(config): + print(f"\nStage {config.stem} failed. 
Stopping pipeline.") + break + + +def main(): + parser = argparse.ArgumentParser(description="Run prompt tests in stages") + parser.add_argument("--up-to", type=int, help="Run all stages up to this number") + parser.add_argument( + "--list", action="store_true", help="List all available test stages" + ) + + args = parser.parse_args() + runner = PromptTestRunner() + + if args.list: + print("Available test stages:") + for config in runner.get_stage_configs(): + print(f" - {config.stem}") + elif args.up_to is not None: + runner.run_up_to_stage(args.up_to) + else: + runner.run_all_stages() + + +if __name__ == "__main__": + main() diff --git a/shared/shared/shared_types.py b/shared/shared/shared_types.py index 352b452..2f3bce7 100644 --- a/shared/shared/shared_types.py +++ b/shared/shared/shared_types.py @@ -82,3 +82,14 @@ class ProcessingStep(BaseModel): class PromptTracker(BaseModel): steps: List[ProcessingStep] + + +class PodcastSegment(BaseModel): + section: str + descriptions: List[str] + duration: int + + +class PodcastOutline(BaseModel): + title: str + segments: List[PodcastSegment]