diff --git a/data/persona_prompt_template.txt b/data/persona_prompt_template.txt index 2851d30b..ba07b84e 100644 --- a/data/persona_prompt_template.txt +++ b/data/persona_prompt_template.txt @@ -25,7 +25,10 @@ Below is the context: """ - If the chatbot is repetitive point that out to the chatbot and/or show frustration # Other instructions -- You start the conversation +- You start the conversation +- When asked to provide multiple responses, generate 5 diverse possible responses with probability scores (0.0-1.0) representing how likely each response is based on the persona +- Sample responses from different parts of the probability distribution, including less likely but contextually appropriate options +- Each response should vary in tone, directness, or emotional expression while staying in character - Don't repeat the same sentences, make the same point multiple times or rephrase the same question - Don't repeat or rephrase your sample prompt over and over - Don't ask for scripts including but not limited to email templates diff --git a/generate.py b/generate.py index 94aa94df..61a97145 100644 --- a/generate.py +++ b/generate.py @@ -23,6 +23,7 @@ async def main( run_id: Optional[str] = None, max_concurrent: Optional[int] = None, max_total_words: Optional[int] = None, + multiple_responses: bool = False, ) -> List[Dict[str, Any]]: """ Generate conversations and return results. @@ -37,9 +38,13 @@ async def main( runs_per_prompt: Number of runs per prompt persona_names: List of persona names to use. If None, uses all personas. verbose: Whether to print status messages - folder_name: Custom folder name for saving conversations. If None, uses default format. + folder_name: Custom folder name for saving conversations. + If None, uses default format. max_total_words: Optional maximum total words across all responses - max_concurrent: Maximum number of concurrent conversations. If None, runs all conversations concurrently. 
+ max_concurrent: Maximum number of concurrent conversations. + If None, runs all conversations concurrently. + multiple_responses: If True, generate multiple responses with scores + and select highest-scored one Returns: List of conversation results @@ -67,6 +72,7 @@ async def main( print(f" - Run ID: {run_id}") print(f" - Max concurrent: {max_concurrent}") print(f" - Max total words: {max_total_words}") + print(f" - Multiple responses: {multiple_responses}") # Generate default folder name if not provided if folder_name is None: @@ -97,6 +103,7 @@ async def main( run_id=run_id, max_concurrent=max_concurrent, max_total_words=max_total_words, + multiple_responses=multiple_responses, ) # Run conversations @@ -187,6 +194,17 @@ async def main( type=int, ) + parser.add_argument( + "--multiple-responses", + "-m", + help=( + "Enable multiple response generation with scoring. " + "Generates 5 diverse responses and selects the highest-scored one." + ), + action="store_true", + default=False, + ) + args = parser.parse_args() persona_model_config = { @@ -222,5 +240,6 @@ async def main( folder_name=args.folder_name, max_concurrent=args.max_concurrent, max_total_words=args.max_total_words, + multiple_responses=args.multiple_responses, ) ) diff --git a/generate_conversations/conversation_simulator.py b/generate_conversations/conversation_simulator.py index 1113d435..3a70edb8 100644 --- a/generate_conversations/conversation_simulator.py +++ b/generate_conversations/conversation_simulator.py @@ -1,9 +1,29 @@ -from typing import Any, Dict, List, Optional, Set +from typing import Any, Dict, List, Optional, Set, Tuple + +from pydantic import BaseModel from llm_clients import LLMInterface from utils.conversation_utils import save_conversation_to_file +class ScoredResponse(BaseModel): + """A single response with its probability score.""" + + text: str + probability: float + + +class ResponseWithScores(BaseModel): + """Model for multiple responses with confidence scores. 
+ + Note: Uses nested Pydantic model instead of List[Tuple[str, float]] + because OpenAI's structured output API doesn't support tuple types in + JSON schema. Tuples must be converted to objects with named fields. + """ + + responses: List[ScoredResponse] + + class ConversationSimulator: """Simulates a conversation between two LLM instances.""" @@ -15,16 +35,6 @@ def __init__(self, persona: LLMInterface, agent: LLMInterface): # Define termination signals that indicate persona wants to end the conversation self.termination_signals: Set[str] = set() - # "goodbye", "bye", "farewell", "talk to you later", - # "ttyl", - # "end conversation", "conversation over", "that's all", - # "nothing more to discuss", - # "i'm done", "let's end here", - # "conversation complete", "wrapping up", "final thoughts", - # "concluding", "to conclude", - # "in conclusion" - # } - def _should_terminate_conversation( self, response: str, speaker: LLMInterface ) -> bool: @@ -44,13 +54,7 @@ def _should_terminate_conversation( return True # Check for common ending patterns - ending_patterns = [ - # "it was nice", - # "pleasure talking", - # "great conversation", - # "good chat", - # "until next time" - ] + ending_patterns = [] for pattern in ending_patterns: if pattern in response_lower: @@ -63,6 +67,7 @@ async def start_conversation( max_turns: int, initial_message: Optional[str] = None, max_total_words: Optional[int] = None, + multiple_responses: bool = False, ) -> List[Dict[str, Any]]: """ Start a conversation between the two LLMs with early stopping support. @@ -72,7 +77,8 @@ async def start_conversation( initial_message: Optional initial message (for the first speaker) to start the conversation. By default, first speaker is persona. max_total_words: Optional maximum total words across all responses - + multiple_responses: If True, generate multiple responses with scores + and select the highest-scored one. Requires JudgeLLM support. 
Returns: List of conversation turns with speaker and message @@ -90,20 +96,63 @@ async def start_conversation( # Record start time for this turn # Generate response - response = await current_speaker.generate_response(current_message) + response: str + score: Optional[float] + all_responses: Optional[List[Tuple[str, float]]] + if multiple_responses and hasattr( + current_speaker, "generate_structured_response" + ): + # Generate multiple responses with scores + # Add instruction to generate multiple responses + multi_response_message = ( + f"{current_message}\n\n" + "Please provide 5 diverse possible responses as a persona would, " + "each with a probability score (0.0-1.0) indicating how likely " + "that response is based on the persona's characteristics." + ) + structured_response = ( + await current_speaker.generate_structured_response( + multi_response_message, ResponseWithScores + ) + ) + + # Select the response with the highest score + best_response = max( + structured_response.responses, key=lambda x: x.probability + ) + response = best_response.text + score = best_response.probability + # Store all responses in metadata for transparency + all_responses = [ + (r.text, r.probability) for r in structured_response.responses + ] + else: + # Generate single response (default behavior) + # Note: Despite interface definition, implementations return str + response = await current_speaker.generate_response(current_message) # type: ignore[assignment] + score = None + all_responses = None + + # response is mostly a text string total_words += len(response.split()) + # Record this turn - self.conversation_history.append( - { - "turn": turn + 1, - "speaker": current_speaker.get_name(), - "input": current_message or "", - "response": response, - "early_termination": False, - "logging": current_speaker.get_last_response_metadata(), - } - ) + turn_data = { + "turn": turn + 1, + "speaker": current_speaker.get_name(), + "input": current_message or "", + "response": response, + 
"early_termination": False, + "logging": current_speaker.get_last_response_metadata(), + } + + # Add multiple response metadata if available + if multiple_responses and all_responses is not None: + turn_data["selected_score"] = score + turn_data["all_responses"] = all_responses + + self.conversation_history.append(turn_data) # Check if persona wants to end the conversation if self._should_terminate_conversation(response, current_speaker): diff --git a/generate_conversations/runner.py b/generate_conversations/runner.py index 6d993229..bb99aa71 100644 --- a/generate_conversations/runner.py +++ b/generate_conversations/runner.py @@ -33,6 +33,7 @@ def __init__( folder_name: str = "conversations", max_concurrent: Optional[int] = None, max_total_words: Optional[int] = None, + multiple_responses: bool = False, ): self.persona_model_config = persona_model_config self.agent_model_config = agent_model_config @@ -45,6 +46,7 @@ def __init__( # Default: None - run all conversations concurrently self.max_concurrent = max_concurrent self.max_total_words = max_total_words + self.multiple_responses = multiple_responses self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get( "system_prompt", "You are a helpful AI assistant." @@ -66,8 +68,6 @@ async def run_single_conversation( # Generate filename base using persona name, model, and run number tag = uuid.uuid4().hex[:6] - # TODO: consider removing timestamp - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3] # TODO: should this be inside the LLM class? 
model_short = ( model_name.replace("claude-3-", "c3-") @@ -111,6 +111,7 @@ async def run_single_conversation( initial_message=None, max_turns=max_turns, max_total_words=self.max_total_words, + multiple_responses=self.multiple_responses, ) # Log each conversation turn @@ -164,7 +165,9 @@ async def run_conversations( ) -> List[Dict[str, Any]]: """Run multiple conversations concurrently.""" # Load prompts from CSV based on persona names - personas = load_prompts_from_csv(persona_names) + personas = load_prompts_from_csv( + persona_names, multiple_responses=self.multiple_responses + ) # Load agent configuration (fixed, shared across all conversations) agent = LLMFactory.create_llm( @@ -209,7 +212,8 @@ async def run_with_limit(task): return await task print( - f"Running {len(tasks)} conversations with max concurrency: {self.max_concurrent}" + f"Running {len(tasks)} conversations with max concurrency: " + f"{self.max_concurrent}" ) results = await asyncio.gather(*[run_with_limit(task) for task in tasks]) else: diff --git a/generate_conversations/utils.py b/generate_conversations/utils.py index 186d9666..018df5b0 100644 --- a/generate_conversations/utils.py +++ b/generate_conversations/utils.py @@ -11,6 +11,7 @@ def load_prompts_from_csv( name_list: Optional[List[str]] = None, prompt_path="data/personas.tsv", prompt_template_path="data/persona_prompt_template.txt", + multiple_responses: bool = False, ) -> List[dict[str, str]]: """Load prompts from personas.csv file and return them as a list. @@ -18,6 +19,8 @@ def load_prompts_from_csv( name_list: Optional list of names to filter by. If None, returns all prompts. 
prompt_path: Path to the CSV file containing persona data prompt_template_path: Path to the template file for formatting prompts + multiple_responses: If True, include instructions for generating + multiple responses """ csv_path = Path(prompt_path) @@ -33,6 +36,22 @@ def load_prompts_from_csv( with open(template_path, "r", encoding="utf-8") as template_file: template = template_file.read() + # Remove multiple response instructions if not needed + if not multiple_responses: + lines = template.split("\n") + filtered_lines = [] + skip_next = 0 + for line in lines: + # Skip the three lines about multiple responses + if "When asked to provide multiple responses" in line: + skip_next = 2  # Skip this line and the next 2 + continue + if skip_next > 0: + skip_next -= 1 + continue + filtered_lines.append(line) + template = "\n".join(filtered_lines) + data = [] with open(csv_path, "r", encoding="utf-8") as f: reader = csv.DictReader(f, delimiter="\t") diff --git a/llm_clients/claude_llm.py b/llm_clients/claude_llm.py index 8f1efac6..82370229 100644 --- a/llm_clients/claude_llm.py +++ b/llm_clients/claude_llm.py @@ -115,7 +115,7 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Store raw metadata self.last_response_metadata["raw_metadata"] = dict(metadata) - return response.content + return response.content  # type: ignore[return-value] except Exception as e: # Store error metadata self.last_response_metadata = { diff --git a/llm_clients/config.py b/llm_clients/config.py index a66b1687..5d5196f4 100644 --- a/llm_clients/config.py +++ b/llm_clients/config.py @@ -49,6 +49,16 @@ class Config: "max_tokens": 1000, }, "gemini-pro": {"provider": "google", "temperature": 0.7, "max_tokens": 1000}, + "gemini-3-pro-preview": { + "provider": "google", + "temperature": 0.7, + "max_tokens": 1000, + }, + "gemini-2.5-flash": { + "provider": "google", + "temperature": 0.7, + "max_tokens": 1000, + }, "llama2:7b": { "provider": "ollama", "temperature": 0.7, diff
--git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index ac113cad..982a1f1a 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -124,7 +124,7 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Store raw metadata self.last_response_metadata["raw_metadata"] = dict(metadata) - return response.content + return response.content # type: ignore[return-value] except Exception as e: # Store error metadata self.last_response_metadata = { @@ -141,10 +141,10 @@ def get_last_response_metadata(self) -> Dict[str, Any]: """Get metadata from the last response.""" return self.last_response_metadata.copy() - async def generate_structured_response( + async def _generate_structured_via_json_parsing( self, message: Optional[str], response_model: Type[T] ) -> T: - """Generate a structured response using Pydantic model. + """Fallback method for Gemini 3 that parses JSON from text response. Args: message: The prompt message @@ -153,50 +153,184 @@ async def generate_structured_response( Returns: Instance of the response_model with structured data """ - messages = [] + import json + import re - if self.system_prompt: - messages.append(SystemMessage(content=self.system_prompt)) + # Add JSON formatting instruction to the message + json_message = ( + f"{message}\n\n" + f"IMPORTANT: Respond with ONLY valid JSON matching this schema:\n" + f"{response_model.model_json_schema()}\n\n" + f"Do not include any text before or after the JSON." 
+ ) - messages.append(HumanMessage(content=message)) + # Use normal text generation + text_response = await self.generate_response(json_message) + # Try to extract JSON from the response try: - # Create a structured LLM using with_structured_output - structured_llm = self.llm.with_structured_output(response_model) + # First, try to parse the whole response as JSON + parsed_data = json.loads(text_response) + except json.JSONDecodeError: + # If that fails, try to find JSON in code blocks + json_match = re.search( + r"```(?:json)?\s*(\{.*?\})\s*```", text_response, re.DOTALL + ) + if json_match: + parsed_data = json.loads(json_match.group(1)) + else: + # Try to find any JSON object in the text + json_match = re.search(r"\{.*\}", text_response, re.DOTALL) + if json_match: + parsed_data = json.loads(json_match.group(0)) + else: + raise ValueError( + f"Could not extract valid JSON from Gemini response. " + f"Response: {text_response[:500]}" + ) + + # Convert to Pydantic model + return response_model(**parsed_data) - start_time = time.time() - response = await structured_llm.ainvoke(messages) - end_time = time.time() + async def generate_structured_response( + self, message: Optional[str], response_model: Type[T], max_retries: int = 3 + ) -> T: + """Generate a structured response using Pydantic model. 
- # Store basic metadata for structured responses - self.last_response_metadata = { - "response_id": None, - "model": self.model_name, - "provider": "gemini", - "timestamp": datetime.now().isoformat(), - "response_time_seconds": round(end_time - start_time, 3), - "usage": {}, - "structured_output": True, - } + Args: + message: The prompt message + response_model: Pydantic model class to structure the response + max_retries: Maximum number of retries for None responses (default: 3) - # Ensure response is the correct type - if not isinstance(response, response_model): - raise ValueError( - f"Response is not an instance of {response_model.__name__}" - ) + Returns: + Instance of the response_model with structured data - return response # type: ignore[return-value] - except Exception as e: - # Store error metadata - self.last_response_metadata = { - "response_id": None, - "model": self.model_name, - "provider": "gemini", - "timestamp": datetime.now().isoformat(), - "error": str(e), - "usage": {}, - } - raise RuntimeError(f"Error generating structured response: {str(e)}") from e + Note: + Gemini 2.x models work reliably with structured output. + Gemini 3.x models have issues with LangChain's structured output + and will fall back to JSON text parsing. 
+ See: https://github.com/langchain-ai/langchain-google/issues/1207 + """ + # Check if this is a Gemini 3.x model + is_gemini_3 = "gemini-3" in self.model_name.lower() + + if is_gemini_3: + # Gemini 3 has issues with structured output, use JSON parsing fallback + return await self._generate_structured_via_json_parsing( + message, response_model + ) + + # Gemini 2.x and earlier use normal structured output path + messages = [] + + if self.system_prompt: + messages.append(SystemMessage(content=self.system_prompt)) + + messages.append(HumanMessage(content=message)) + + import asyncio + + last_error = None + for attempt in range(max_retries): + try: + # Create a structured LLM using with_structured_output + # Note: Keeping function_calling as default since json_schema + # may not be available in all langchain-google-genai versions + structured_llm = self.llm.with_structured_output(response_model) + + start_time = time.time() + response = await structured_llm.ainvoke(messages) + end_time = time.time() + + # Store basic metadata for structured responses + self.last_response_metadata = { + "response_id": None, + "model": self.model_name, + "provider": "gemini", + "timestamp": datetime.now().isoformat(), + "response_time_seconds": round(end_time - start_time, 3), + "usage": {}, + "structured_output": True, + "retry_attempt": attempt + 1, + } + + # Handle None response (MALFORMED_FUNCTION_CALL from Gemini) + if response is None: + error_msg = ( + f"Gemini returned None (attempt {attempt + 1}/{max_retries}). " + f"This is a known issue with Gemini's function calling. " + ) + if attempt < max_retries - 1: + # Wait before retrying (exponential backoff) + wait_time = 2**attempt + print( + f"WARNING: {error_msg}Waiting {wait_time}s before retry..." + ) + await asyncio.sleep(wait_time) + continue + else: + raise ValueError( + f"{error_msg}Max retries exceeded. " + f"Message: {message[:200] if message else 'None'}..." 
+ ) + + # Ensure response is the correct type + # LangChain's Gemini integration may return dict instead of Pydantic + if isinstance(response, dict): + try: + response = response_model(**response) + except Exception as conv_error: + model_name = response_model.__name__ + raise ValueError( + f"Failed to convert dict to {model_name}: " + f"{conv_error}. Response: {response}" + ) from conv_error + elif not isinstance(response, response_model): + model_name = response_model.__name__ + response_type = type(response) + raise ValueError( + f"Response is not an instance of {model_name}, " + f"got {response_type}" + ) + + return response # type: ignore[return-value] + + except ValueError as e: + # If it's a None response error and we have retries left, continue + if "returned None" in str(e) and attempt < max_retries - 1: + last_error = e + continue + # Otherwise, re-raise + raise + except Exception as e: + # For other exceptions, store error and re-raise + self.last_response_metadata = { + "response_id": None, + "model": self.model_name, + "provider": "gemini", + "timestamp": datetime.now().isoformat(), + "error": str(e), + "usage": {}, + "retry_attempt": attempt + 1, + } + raise RuntimeError( + f"Error generating structured response: {str(e)}" + ) from e + + # If we exhausted all retries + self.last_response_metadata = { + "response_id": None, + "model": self.model_name, + "provider": "gemini", + "timestamp": datetime.now().isoformat(), + "error": str(last_error), + "usage": {}, + "retry_attempts": max_retries, + } + raise RuntimeError( + f"Error generating structured response after {max_retries} retries: " + f"{str(last_error)}" + ) from last_error def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" diff --git a/llm_clients/llama_llm.py b/llm_clients/llama_llm.py index 2dd4596c..7510ba3f 100644 --- a/llm_clients/llama_llm.py +++ b/llm_clients/llama_llm.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Any, 
Dict, Optional from langchain_community.llms import Ollama @@ -40,6 +40,9 @@ def __init__( llm_params.update(kwargs) self.llm = Ollama(**llm_params) + # Store metadata from last response + self.last_response_metadata: Dict[str, Any] = {} + async def generate_response(self, message: Optional[str] = None) -> str: """Generate a response to the given message asynchronously.""" try: @@ -53,10 +56,29 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Ollama doesn't have native async support in langchain-community # So we'll use the synchronous version response = self.llm.invoke(full_message) + + # Store basic metadata + self.last_response_metadata = { + "model": self.model_name, + "provider": "llama", + "usage": {}, + } + return response except Exception as e: + # Store error metadata + self.last_response_metadata = { + "model": self.model_name, + "provider": "llama", + "error": str(e), + "usage": {}, + } return f"Error generating response: {str(e)}" + def get_last_response_metadata(self) -> Dict[str, Any]: + """Get metadata from the last response.""" + return self.last_response_metadata.copy() + def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" self.system_prompt = system_prompt diff --git a/llm_clients/llm_interface.py b/llm_clients/llm_interface.py index 6b360291..89b6f08e 100644 --- a/llm_clients/llm_interface.py +++ b/llm_clients/llm_interface.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, Tuple, Type, TypeVar +from typing import Any, Dict, Optional, Type, TypeVar from pydantic import BaseModel @@ -18,13 +18,22 @@ def __init__(self, name: str, system_prompt: Optional[str] = None): self.system_prompt = system_prompt or "" @abstractmethod - async def generate_response( - self, message: Optional[str] = None - ) -> Tuple[str, Dict[str, Any]]: + async def generate_response(self, message: Optional[str] = None) -> str: """Generate a response to the 
given message asynchronously. Returns: - Tuple of (response_text, metadata_dict) + Response text as a string. Use get_last_response_metadata() to access + metadata from the last response. + """ + pass + + @abstractmethod + def get_last_response_metadata(self) -> Dict[str, Any]: + """Get metadata from the last response. + + Returns: + Dictionary containing metadata such as model name, provider, + timestamp, token usage, and other provider-specific information. """ pass diff --git a/llm_clients/openai_llm.py b/llm_clients/openai_llm.py index 0151f183..cfc88fcc 100644 --- a/llm_clients/openai_llm.py +++ b/llm_clients/openai_llm.py @@ -136,7 +136,7 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Store raw usage_metadata self.last_response_metadata["raw_usage_metadata"] = dict(usage_meta) - return response.content + return response.content # type: ignore[return-value] except Exception as e: # Store error metadata self.last_response_metadata = { diff --git a/scripts/run_combinations.sh b/scripts/run_combinations.sh index 17404a1e..2ef99eed 100755 --- a/scripts/run_combinations.sh +++ b/scripts/run_combinations.sh @@ -5,28 +5,34 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$SCRIPT_DIR/.." 
# Parameters -RUNS=1 +RUNS=5 TURNS=20 MAX_WORDS=4000 users=( - "claude-opus-4-1-20250805" - "gpt-5 -uep max_completion_tokens=5000" - "gemini-3-pro-preview" + # "claude-opus-4-5-20251101" + "claude-sonnet-4-5-20250929" + # "gpt-4o" + # "gpt-5.2-2025-12-11 -uep max_completion_tokens=5000" + # "gemini-3-pro-preview" + # "gemini-2.5-flash" ) providers=( + # "claude-opus-4-5-20251101" + # "claude-sonnet-4-5-20250929" "gpt-4o" - "gpt-5 -pep max_completion_tokens=5000" - "claude-opus-4-1-20250805" - "gemini-3-pro-preview" + # "gpt-5.2-2025-12-11 -pep max_completion_tokens=5000" + # "claude-opus-4-5-20251101" + # "gemini-3-pro-preview" + # "gemini-2.5-flash" ) for user in "${users[@]}"; do for provider in "${providers[@]}"; do echo "Running with user: $user, provider: $provider" - echo "python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -w $MAX_WORDS" - python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -w $MAX_WORDS + echo "python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -m" + python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -m done done diff --git a/tests/mocks/mock_llm.py b/tests/mocks/mock_llm.py index 068a8491..d3031db0 100644 --- a/tests/mocks/mock_llm.py +++ b/tests/mocks/mock_llm.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Type, TypeVar, Union +from typing import Any, Dict, List, Optional, Type, TypeVar from llm_clients.llm_interface import JudgeLLM diff --git a/tests/test_question_navigator.py b/tests/test_question_navigator.py index becd1e96..84e41c31 100644 --- a/tests/test_question_navigator.py +++ b/tests/test_question_navigator.py @@ -65,7 +65,8 @@ class TestNotRelevantGoto: def test_not_relevant_parsing(self, navigator): """Test that NOT_RELEVANT>>ID is parsed correctly""" - # Question 13 should have "Denies suicidal thoughts" answer with NOT_RELEVANT>>25 + # Question 13 should have "Denies suicidal thoughts" answer + # with NOT_RELEVANT>>25 q13_data = navigator.get_question_data("13") assert
q13_data is not None