Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ get_helm.sh
.env
.ruff_cache
data/minio
eval.txt
eval.txt
services/AgentService/prompts/tests
.DS_Store
15 changes: 0 additions & 15 deletions services/AgentService/eval-harness/Makefile

This file was deleted.

Empty file.
27 changes: 0 additions & 27 deletions services/AgentService/eval-harness/promptfooconfig.yaml

This file was deleted.

45 changes: 0 additions & 45 deletions services/AgentService/eval-harness/tests/schemas/conversation.json

This file was deleted.

This file was deleted.

13 changes: 1 addition & 12 deletions services/AgentService/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from fastapi import FastAPI, BackgroundTasks, HTTPException
from shared.shared_types import ServiceType, JobStatus, Conversation
from shared.shared_types import ServiceType, JobStatus, Conversation, PodcastOutline
from shared.storage import StorageManager
from shared.job import JobStatusManager
from shared.otel import OpenTelemetryInstrumentation, OpenTelemetryConfig
Expand Down Expand Up @@ -33,17 +33,6 @@
# Data Models


class PodcastSegment(BaseModel):
section: str
descriptions: List[str]
duration: int


class PodcastOutline(BaseModel):
title: str
segments: List[PodcastSegment]


class TranscriptionRequest(BaseModel):
markdown: str
duration: int = 20
Expand Down
56 changes: 56 additions & 0 deletions services/AgentService/prompts/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Colors for better visibility in terminal output
GREEN:=\033[0;32m
RED:=\033[0;31m
NC:=\033[0m

# Directory structure (tests/ is fully generated — see .gitignore)
CONFIG_DIR:=configs
TEST_DIR:=tests
OUTPUTS_DIR:=$(TEST_DIR)/outputs
SCHEMAS_DIR:=$(TEST_DIR)/schemas
INPUTS_DIR:=$(TEST_DIR)/inputs

# Delete a half-written target when its recipe fails, so it never
# masquerades as up to date on the next run.
.DELETE_ON_ERROR:

# Ensure promptfoo is installed (attempts a Homebrew install if missing)
ensure-promptfoo:
	@echo "$(GREEN)Checking promptfoo installation...$(NC)"
	@if ! command -v promptfoo >/dev/null 2>&1; then \
		echo "$(RED)Error: promptfoo is not installed. Installing via brew...$(NC)"; \
		brew install promptfoo || { echo "$(RED)Failed to install promptfoo$(NC)"; exit 1; }; \
	fi

# Generate JSON schemas from the Pydantic models into $(SCHEMAS_DIR)
generate-schemas:
	@echo "$(GREEN)Generating JSON schemas from Pydantic models...$(NC)"
	@python scripts/generate_schemas.py $(SCHEMAS_DIR)

# Set up the test environment: create directories, then generate schemas.
# Uses $(MAKE) (not a literal `make`) so flags and the jobserver propagate.
setup-test: ensure-promptfoo
	@echo "$(GREEN)Setting up test environment...$(NC)"
	@mkdir -p $(INPUTS_DIR) $(OUTPUTS_DIR) $(SCHEMAS_DIR)
	@$(MAKE) generate-schemas

# Run all prompt tests
test-prompts: setup-test
	@echo "$(GREEN)Running all prompt tests...$(NC)"
	@cd $(CONFIG_DIR) && python ../scripts/run_tests.py

# Run tests up to a specific stage, e.g. `make test-upto stage=3`
test-upto: setup-test
	@if [ -z "$(stage)" ]; then \
		echo "$(RED)Error: Please specify a stage number with 'stage=N'$(NC)"; \
		exit 1; \
	fi
	@echo "$(GREEN)Running prompt tests up to stage $(stage)...$(NC)"
	@cd $(CONFIG_DIR) && python ../scripts/run_tests.py --up-to $(stage)

# List all available test stages
test-list:
	@echo "$(GREEN)Listing available test stages...$(NC)"
	@python scripts/run_tests.py --list

# Clean generated test artifacts (the whole tests/ tree is regenerated by
# setup-test). Guard against an empty TEST_DIR before the destructive rm.
clean:
	@echo "$(GREEN)Cleaning test outputs...$(NC)"
	$(if $(strip $(TEST_DIR)),,$(error TEST_DIR is empty; refusing to rm -rf))
	rm -rf $(TEST_DIR)

.PHONY: ensure-promptfoo generate-schemas setup-test test-prompts test-upto clean test-list
21 changes: 21 additions & 0 deletions services/AgentService/prompts/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Public re-exports for the AgentService prompt templates.

All prompt constants live in `prompts.py`; this package initializer simply
surfaces them so callers can `from prompts import OUTLINE_PROMPT` etc.
"""

from .prompts import (
    DEEP_DIVE_PROMPT,
    FUSE_OUTLINE_PROMPT,
    OUTLINE_PROMPT,
    PODCAST_DIALOGUE_PROMPT,
    RAW_OUTLINE_PROMPT,
    RAW_PODCAST_DIALOGUE_PROMPT_v2,
    REVISE_PROMPT,
    SEGMENT_TRANSCRIPT_PROMPT,
)

# Names exported on `from prompts import *`.
__all__ = [
    "DEEP_DIVE_PROMPT",
    "FUSE_OUTLINE_PROMPT",
    "OUTLINE_PROMPT",
    "PODCAST_DIALOGUE_PROMPT",
    "RAW_OUTLINE_PROMPT",
    "RAW_PODCAST_DIALOGUE_PROMPT_v2",
    "REVISE_PROMPT",
    "SEGMENT_TRANSCRIPT_PROMPT",
]
28 changes: 28 additions & 0 deletions services/AgentService/prompts/configs/01_raw_outline.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Stage 01: generate a free-form podcast outline from raw source text.
# The result is stored as `raw_outline_output` for the next stage
# (02_outline_json.yaml) to consume.
description: "01 Raw Outline Generation"
evaluateOptions:
  # Run serially with a progress bar.
  maxConcurrency: 1
  showProgressBar: true

prompts:
  - "file://../prompts.py:PodcastPrompts.raw_outline_prompt"

providers:
  - id: "file://../providers/nim-405b.py"
    label: "405b"

tests:
  - description: "Raw Outline Generation"
    vars:
      text: file://../data/eval.txt
      duration: 15
    storeOutputAs: raw_outline_output
    assert:
      # Graded by an LLM rubric rather than an exact-match assertion.
      - type: llm-rubric
        value: |
          Evaluate if the outline:
          1. Has clear sections for background, innovation, impact, and future work
          2. Makes innovation the focus
          3. Allocates time appropriately for {{duration}} minutes
          4. Maintains technical accuracy while being accessible
          Pass if all scores >= 0.95
        # NOTE(review): grading provider lives under scripts/ while the test
        # provider lives under providers/ — confirm this path is intentional.
        provider: "file://../scripts/nim-provider.py"
25 changes: 25 additions & 0 deletions services/AgentService/prompts/configs/02_outline_json.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Stage 02: convert the stage-01 raw outline into structured JSON that
# must validate against the generated podcast_outline schema.
description: "02 Outline JSON Generation"
evaluateOptions:
  # Run serially with a progress bar.
  maxConcurrency: 1
  showProgressBar: true

prompts:
  - "file://../prompts.py:PodcastPrompts.outline_prompt"

providers:
  - id: "file://../providers/nim-8b.py"
    label: "8b"
    config:
      # Constrains provider output to the generated Pydantic schema.
      json_schema: file://../tests/schemas/podcast_outline.json

tests:
  - description: "Outline JSON Generation"
    vars:
      # Input is the stored output of stage 01 — run stages in order.
      text: file://../tests/outputs/01_raw_outline_results.json
      schema: file://../tests/schemas/podcast_outline.json
    options:
      transformVars: file://../scripts/get-output.py:get_transform
    storeOutputAs: outline_json_output
    assert:
      # Output must be valid JSON conforming to the outline schema.
      - type: is-json
        value: file://../tests/schemas/podcast_outline.json
79 changes: 79 additions & 0 deletions services/AgentService/prompts/providers/nim-405b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# chat_provider.py
import requests
import json
from typing import Dict, Any


def call_api(
    prompt: str, options: Dict[str, Any], context: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Custom promptfoo provider: OpenAI-style chat completion over HTTP.

    Args:
        prompt: Plain prompt text, or a JSON string encoding a list of chat
            messages ([{"role": ..., "content": ...}, ...]).
        options: Provider options from the YAML config. Recognized ``config``
            keys: api_base, api_key, model, temperature, max_tokens, timeout.
        context: Test context including variables used (not read here).

    Returns:
        Dict with "output" and "tokenUsage" on success, or "error" on failure
        (promptfoo's expected provider result shape).
    """
    # Initialized up front so the except block can safely report the
    # response body (replaces the fragile `'response' in locals()` check).
    response = None
    try:
        # Get configuration from options, with defaults for the NIM endpoint
        config = options.get("config", {})
        api_base = config.get("api_base", "https://youngthug.demoz.io")
        api_key = config.get("api_key")
        model = config.get("model", "meta/llama-3.1-405b-instruct")
        temperature = config.get("temperature", 0.7)
        max_tokens = config.get("max_tokens", 1000)
        # Bound the request so a hung endpoint cannot stall the eval run.
        timeout = config.get("timeout", 120)

        # Setup headers; only attach auth when a key is configured
        headers = {"Content-Type": "application/json"}
        if api_key:
            headers["Authorization"] = f"Bearer {api_key}"

        # Accept either a plain string prompt or a JSON-encoded message list
        try:
            messages = json.loads(prompt)
            if isinstance(messages, list):
                chat_messages = messages
            else:
                # Valid JSON but not a message list — treat as plain text
                chat_messages = [{"role": "user", "content": prompt}]
        except json.JSONDecodeError:
            # Not JSON at all — wrap as a single user message
            chat_messages = [{"role": "user", "content": prompt}]

        # Prepare OpenAI-compatible chat completion payload
        payload = {
            "model": model,
            "messages": chat_messages,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "stream": False,
        }

        # Make request (timeout added; raise_for_status surfaces HTTP errors)
        response = requests.post(
            f"{api_base}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=timeout,
        )
        response.raise_for_status()
        result = response.json()

        if "choices" in result and len(result["choices"]) > 0:
            output = result["choices"][0]["message"]["content"]
            return {
                "output": output,
                "tokenUsage": result.get(
                    "usage", {"total": 0, "prompt": 0, "completion": 0}
                ),
            }
        else:
            return {"error": "No choices in response"}

    except requests.exceptions.RequestException as e:
        # Include the response body when the server answered before failing
        body = response.text if response is not None else "No response"
        return {"error": f"Request error: {str(e)}\nResponse: {body}"}
    except Exception as e:
        return {"error": f"Unexpected error: {str(e)}"}
Loading