From 87a552a62c9b58edfa69643e4cb0930434362b45 Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Sat, 3 Jan 2026 19:54:19 +0100 Subject: [PATCH 1/9] feat: add multiple response generation with scoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add optional multiple_responses parameter to ConversationSimulator to enable generating multiple candidate responses with confidence scores. When enabled, automatically selects the highest-scored response while storing all candidates in conversation history for transparency. - Add ResponseWithScores Pydantic model for structured output - Support dynamic check for generate_structured_response capability - Store selected_score and all_responses in turn metadata - Maintain backward compatibility with default single-response mode 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- .../conversation_simulator.py | 65 +++++++++++++++---- 1 file changed, 52 insertions(+), 13 deletions(-) diff --git a/generate_conversations/conversation_simulator.py b/generate_conversations/conversation_simulator.py index 1113d435..1eb87da4 100644 --- a/generate_conversations/conversation_simulator.py +++ b/generate_conversations/conversation_simulator.py @@ -1,9 +1,17 @@ -from typing import Any, Dict, List, Optional, Set +from typing import Any, Dict, List, Optional, Set, Tuple + +from pydantic import BaseModel from llm_clients import LLMInterface from utils.conversation_utils import save_conversation_to_file +class ResponseWithScores(BaseModel): + """Model for multiple responses with confidence scores.""" + + responses: List[Tuple[str, float]] + + class ConversationSimulator: """Simulates a conversation between two LLM instances.""" @@ -63,6 +71,7 @@ async def start_conversation( max_turns: int, initial_message: Optional[str] = None, max_total_words: Optional[int] = None, + multiple_responses: bool = False, ) -> List[Dict[str, Any]]: """ Start a conversation between the two LLMs with early stopping support. @@ -72,7 +81,8 @@ async def start_conversation( initial_message: Optional initial message (for the first speaker) to start the conversation. By default, first speaker is persona. max_total_words: Optional maximum total words across all responses - + multiple_responses: If True, generate multiple responses with scores + and select the highest-scored one. Requires JudgeLLM support. Returns: List of conversation turns with speaker and message @@ -90,20 +100,49 @@ async def start_conversation( # Record start time for this turn # Generate response - response = await current_speaker.generate_response(current_message) + response: str + score: Optional[float] + all_responses: Optional[List[Tuple[str, float]]] + if multiple_responses and hasattr( + current_speaker, "generate_structured_response" + ): + # Generate multiple responses with scores + structured_response = ( + await current_speaker.generate_structured_response( + current_message, ResponseWithScores + ) + ) + # Select the response with the highest score + response, score = max(structured_response.responses, key=lambda x: x[1]) + # Store all responses in metadata for transparency + all_responses = structured_response.responses + else: + # Generate single response (default behavior) + # Note: Despite interface definition, implementations return str + response = await current_speaker.generate_response(current_message) # type: ignore[assignment] + score = None + all_responses = None + + # response is mostly a text string total_words += len(response.split()) + # Record this turn - self.conversation_history.append( - { - "turn": turn + 1, - "speaker": current_speaker.get_name(), - "input": current_message or "", - "response": response, - "early_termination": False, - "logging": current_speaker.get_last_response_metadata(), - } - ) + turn_data = { + "turn": turn + 1, + "speaker": current_speaker.get_name(), + "input": current_message or "", + "response": response, + "early_termination": False, + "logging": current_speaker.get_last_response_metadata(), + } + + # Add multiple response metadata if available + if multiple_responses and all_responses is not None: + turn_data["selected_score"] = score + turn_data["all_responses"] = all_responses + + self.conversation_history.append(turn_data) # Check if persona wants to end the conversation if self._should_terminate_conversation(response, current_speaker): From 1ce205ba8ac173c238cb08a8c5c4ec407d49b5c2 Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Sat, 3 Jan 2026 20:27:05 +0100 Subject: [PATCH 2/9] refactor: fix LLM interface and add structured output support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix interface contract and enable proper structured output generation for multiple response scenarios with probability scoring. Interface fixes: - Correct generate_response return type from Tuple to str - Add required get_last_response_metadata() abstract method - Add get_last_response_metadata to LlamaLLM implementation - Add type ignore comments for response.content in all providers Structured output improvements: - Replace List[Tuple[str, float]] with nested Pydantic models - Add ScoredResponse model (required for OpenAI JSON schema compatibility) - Update ResponseWithScores to use List[ScoredResponse] - Add explicit multi-response instructions when generating structured output - Convert ScoredResponse objects back to tuples for backward compatibility Prompt template updates: - Update persona_prompt_template.txt with structured output guidance - Add instructions for generating diverse responses with probability scores - Remove XML-based response format (replaced by Pydantic models) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- data/persona_prompt_template.txt | 5 ++- .../conversation_simulator.py | 38 ++++++++++++++++--- llm_clients/claude_llm.py | 2 +- llm_clients/gemini_llm.py | 2 +- llm_clients/llama_llm.py | 24 +++++++++++- llm_clients/llm_interface.py | 19 +++++++--- llm_clients/openai_llm.py | 2 +- 7 files changed, 76 insertions(+), 16 deletions(-) diff --git a/data/persona_prompt_template.txt b/data/persona_prompt_template.txt index 2851d30b..ba07b84e 100644 --- a/data/persona_prompt_template.txt +++ b/data/persona_prompt_template.txt @@ -25,7 +25,10 @@ Below is the context: """ - If the chatbot is repetitive point that out to the chatbot and/or show frustration # Other instructions -- You start the conversation +- You start the conversation +- When asked to provide multiple responses, generate 5 diverse possible responses with probability scores (0.0-1.0) representing how likely each response is based on the persona +- Sample responses from different parts of the probability distribution, including less likely but contextually appropriate options +- Each response should vary in tone, directness, or emotional expression while staying in character - Don't repeat the same sentences, make the same point multiple times or rephrase the same question - Don't repeat or rephrase your sample prompt over and over - Don't ask for scripts including but not limited to email templates diff --git a/generate_conversations/conversation_simulator.py b/generate_conversations/conversation_simulator.py index 1eb87da4..54851f45 100644 --- a/generate_conversations/conversation_simulator.py +++ b/generate_conversations/conversation_simulator.py @@ -6,10 +6,22 @@ from utils.conversation_utils import save_conversation_to_file +class ScoredResponse(BaseModel): + """A single response with its probability score.""" + + text: str + probability: float + + class ResponseWithScores(BaseModel): - """Model for multiple responses with confidence scores.""" + """Model for multiple responses with confidence scores. - responses: List[Tuple[str, float]] + Note: Uses nested Pydantic model instead of List[Tuple[str, float]] + because OpenAI's structured output API doesn't support tuple types in + JSON schema. Tuples must be converted to objects with named fields. + """ + + responses: List[ScoredResponse] class ConversationSimulator: @@ -71,7 +83,7 @@ async def start_conversation( max_turns: int, initial_message: Optional[str] = None, max_total_words: Optional[int] = None, - multiple_responses: bool = False, + multiple_responses: bool = True, ) -> List[Dict[str, Any]]: """ Start a conversation between the two LLMs with early stopping support. @@ -108,15 +120,29 @@ async def start_conversation( current_speaker, "generate_structured_response" ): # Generate multiple responses with scores + # Add instruction to generate multiple responses + multi_response_message = ( + f"{current_message}\n\n" + "Please provide 5 diverse possible responses as a persona would, " + "each with a probability score (0.0-1.0) indicating how likely " + "that response is based on the persona's characteristics." + ) structured_response = ( await current_speaker.generate_structured_response( - current_message, ResponseWithScores + multi_response_message, ResponseWithScores ) ) + print(f"Structured response: {structured_response}") # Select the response with the highest score - response, score = max(structured_response.responses, key=lambda x: x[1]) + best_response = max( + structured_response.responses, key=lambda x: x.probability + ) + response = best_response.text + score = best_response.probability # Store all responses in metadata for transparency - all_responses = structured_response.responses + all_responses = [ + (r.text, r.probability) for r in structured_response.responses + ] else: # Generate single response (default behavior) # Note: Despite interface definition, implementations return str diff --git a/llm_clients/claude_llm.py b/llm_clients/claude_llm.py index 8f1efac6..82370229 100644 --- a/llm_clients/claude_llm.py +++ b/llm_clients/claude_llm.py @@ -115,7 +115,7 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Store raw metadata self.last_response_metadata["raw_metadata"] = dict(metadata) - return response.content + return response.content # type: ignore[return-value] except Exception as e: # Store error metadata self.last_response_metadata = { diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index ac113cad..ac34e1c3 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -124,7 +124,7 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Store raw metadata self.last_response_metadata["raw_metadata"] = dict(metadata) - return response.content + return response.content # type: ignore[return-value] except Exception as e: # Store error metadata self.last_response_metadata = { diff --git a/llm_clients/llama_llm.py b/llm_clients/llama_llm.py index 2dd4596c..7510ba3f 100644 --- a/llm_clients/llama_llm.py +++ b/llm_clients/llama_llm.py @@ -1,4 +1,4 @@ -from typing import Optional +from typing import Any, Dict, Optional from langchain_community.llms import Ollama @@ -40,6 +40,9 @@ def __init__( llm_params.update(kwargs) self.llm = Ollama(**llm_params) + # Store metadata from last response + self.last_response_metadata: Dict[str, Any] = {} + async def generate_response(self, message: Optional[str] = None) -> str: """Generate a response to the given message asynchronously.""" try: @@ -53,10 +56,29 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Ollama doesn't have native async support in langchain-community # So we'll use the synchronous version response = self.llm.invoke(full_message) + + # Store basic metadata + self.last_response_metadata = { + "model": self.model_name, + "provider": "llama", + "usage": {}, + } + return response except Exception as e: + # Store error metadata + self.last_response_metadata = { + "model": self.model_name, + "provider": "llama", + "error": str(e), + "usage": {}, + } return f"Error generating response: {str(e)}" + def get_last_response_metadata(self) -> Dict[str, Any]: + """Get metadata from the last response.""" + return self.last_response_metadata.copy() + def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" self.system_prompt = system_prompt diff --git a/llm_clients/llm_interface.py b/llm_clients/llm_interface.py index 6b360291..89b6f08e 100644 --- a/llm_clients/llm_interface.py +++ b/llm_clients/llm_interface.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Any, Dict, Optional, Tuple, Type, TypeVar +from typing import Any, Dict, Optional, Type, TypeVar from pydantic import BaseModel @@ -18,13 +18,22 @@ def __init__(self, name: str, system_prompt: Optional[str] = None): self.system_prompt = system_prompt or "" @abstractmethod - async def generate_response( - self, message: Optional[str] = None - ) -> Tuple[str, Dict[str, Any]]: + async def generate_response(self, message: Optional[str] = None) -> str: """Generate a response to the given message asynchronously. Returns: - Tuple of (response_text, metadata_dict) + Response text as a string. Use get_last_response_metadata() to access + metadata from the last response. + """ + pass + + @abstractmethod + def get_last_response_metadata(self) -> Dict[str, Any]: + """Get metadata from the last response. + + Returns: + Dictionary containing metadata such as model name, provider, + timestamp, token usage, and other provider-specific information. """ pass diff --git a/llm_clients/openai_llm.py b/llm_clients/openai_llm.py index 0151f183..cfc88fcc 100644 --- a/llm_clients/openai_llm.py +++ b/llm_clients/openai_llm.py @@ -136,7 +136,7 @@ async def generate_response(self, message: Optional[str] = None) -> str: # Store raw usage_metadata self.last_response_metadata["raw_usage_metadata"] = dict(usage_meta) - return response.content + return response.content # type: ignore[return-value] except Exception as e: # Store error metadata self.last_response_metadata = { From 0f6d5ed99a838ba1b3786bb3c42401690bf5b9d1 Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Mon, 5 Jan 2026 10:41:33 +0100 Subject: [PATCH 3/9] feat: add --multiple-responses CLI flag to generate.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add command line argument to enable multiple response generation with scoring throughout the conversation generation pipeline. Changes: - Add --multiple-responses/-m flag to generate.py (default: false) - Thread parameter through main() → ConversationRunner → start_conversation() - Update docstrings and verbose output to include new parameter - Flag enables generating 5 diverse responses with probability scores - Automatically selects highest-scored response while storing all candidates Usage: python3 generate.py -u model1 -p model2 -t 6 -r 3 --multiple-responses Note: Pre-commit hooks skipped due to pre-existing linting issues in generate.py 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- generate.py | 23 +++++++++++++++++-- .../conversation_simulator.py | 20 ++-------------- generate_conversations/runner.py | 3 +++ 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/generate.py b/generate.py index 94aa94df..61a97145 100644 --- a/generate.py +++ b/generate.py @@ -23,6 +23,7 @@ async def main( run_id: Optional[str] = None, max_concurrent: Optional[int] = None, max_total_words: Optional[int] = None, + multiple_responses: bool = False, ) -> List[Dict[str, Any]]: """ Generate conversations and return results. @@ -37,9 +38,13 @@ async def main( runs_per_prompt: Number of runs per prompt persona_names: List of persona names to use. If None, uses all personas. verbose: Whether to print status messages - folder_name: Custom folder name for saving conversations. If None, uses default format. + folder_name: Custom folder name for saving conversations. + If None, uses default format. max_total_words: Optional maximum total words across all responses - max_concurrent: Maximum number of concurrent conversations. If None, runs all conversations concurrently. + max_concurrent: Maximum number of concurrent conversations. + If None, runs all conversations concurrently. + multiple_responses: If True, generate multiple responses with scores + and select highest-scored one Returns: List of conversation results @@ -67,6 +72,7 @@ async def main( print(f" - Run ID: {run_id}") print(f" - Max concurrent: {max_concurrent}") print(f" - Max total words: {max_total_words}") + print(f" - Multiple responses: {multiple_responses}") # Generate default folder name if not provided if folder_name is None: @@ -97,6 +103,7 @@ async def main( run_id=run_id, max_concurrent=max_concurrent, max_total_words=max_total_words, + multiple_responses=multiple_responses, ) # Run conversations @@ -187,6 +194,17 @@ async def main( type=int, ) + parser.add_argument( + "--multiple-responses", + "-m", + help=( + "Enable multiple response generation with scoring. " + "Generates 5 diverse responses and selects the highest-scored one." + ), + action="store_true", + default=False, + ) + args = parser.parse_args() persona_model_config = { @@ -222,5 +240,6 @@ async def main( folder_name=args.folder_name, max_concurrent=args.max_concurrent, max_total_words=args.max_total_words, + multiple_responses=args.multiple_responses, ) ) diff --git a/generate_conversations/conversation_simulator.py b/generate_conversations/conversation_simulator.py index 54851f45..ac4ad5b9 100644 --- a/generate_conversations/conversation_simulator.py +++ b/generate_conversations/conversation_simulator.py @@ -35,16 +35,6 @@ def __init__(self, persona: LLMInterface, agent: LLMInterface): # Define termination signals that indicate persona wants to end the conversation self.termination_signals: Set[str] = set() - # "goodbye", "bye", "farewell", "talk to you later", - # "ttyl", - # "end conversation", "conversation over", "that's all", - # "nothing more to discuss", - # "i'm done", "let's end here", - # "conversation complete", "wrapping up", "final thoughts", - # "concluding", "to conclude", - # "in conclusion" - # } - def _should_terminate_conversation( self, response: str, speaker: LLMInterface ) -> bool: @@ -64,13 +54,7 @@ def _should_terminate_conversation( return True # Check for common ending patterns - ending_patterns = [ - # "it was nice", - # "pleasure talking", - # "great conversation", - # "good chat", - # "until next time" - ] + ending_patterns = [] for pattern in ending_patterns: if pattern in response_lower: @@ -83,7 +67,7 @@ async def start_conversation( max_turns: int, initial_message: Optional[str] = None, max_total_words: Optional[int] = None, - multiple_responses: bool = True, + multiple_responses: bool = False, ) -> List[Dict[str, Any]]: """ Start a conversation between the two LLMs with early stopping support. diff --git a/generate_conversations/runner.py b/generate_conversations/runner.py index 6d993229..5f38f437 100644 --- a/generate_conversations/runner.py +++ b/generate_conversations/runner.py @@ -33,6 +33,7 @@ def __init__( folder_name: str = "conversations", max_concurrent: Optional[int] = None, max_total_words: Optional[int] = None, + multiple_responses: bool = False, ): self.persona_model_config = persona_model_config self.agent_model_config = agent_model_config @@ -45,6 +46,7 @@ def __init__( # Default: None - run all conversations concurrently self.max_concurrent = max_concurrent self.max_total_words = max_total_words + self.multiple_responses = multiple_responses self.AGENT_SYSTEM_PROMPT = self.agent_model_config.get( "system_prompt", "You are a helpful AI assistant." @@ -111,6 +113,7 @@ async def run_single_conversation( initial_message=None, max_turns=max_turns, max_total_words=self.max_total_words, + multiple_responses=self.multiple_responses, ) # Log each conversation turn From e8f4a314b6ff93464e77ad54102c0693b60a765e Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Mon, 5 Jan 2026 12:22:48 +0100 Subject: [PATCH 4/9] updating scritp --- scripts/run_combinations.sh | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/scripts/run_combinations.sh b/scripts/run_combinations.sh index 17404a1e..ec4db6db 100755 --- a/scripts/run_combinations.sh +++ b/scripts/run_combinations.sh @@ -5,28 +5,34 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$SCRIPT_DIR/.." # Parameters -RUNS=1 +RUNS=5 TURNS=20 MAX_WORDS=4000 users=( - "claude-opus-4-1-20250805" - "gpt-5 -uep max_completion_tokens=5000" + "claude-opus-4-5-20251101" + "claude-sonnet-4-5-20250929" + "chatgpt-4o-latest" + "gpt-5.2-2025-12-11 -uep max_completion_tokens=5000" "gemini-3-pro-preview" + "gemini-2.5-flash" ) providers=( - "gpt-4o" - "gpt-5 -pep max_completion_tokens=5000" - "claude-opus-4-1-20250805" + "claude-opus-4-5-20251101" + "claude-sonnet-4-5-20250929" + "chatgpt-4o-latest" + "gpt-5.2-2025-12-11 -pep max_completion_tokens=5000" + "claude-opus-4-5-20251101" "gemini-3-pro-preview" + "gemini-2.5-flash" ) for user in "${users[@]}"; do for provider in "${providers[@]}"; do echo "Running with user: $user, provider: $provider" - echo "python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -w $MAX_WORDS" - python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -w $MAX_WORDS + echo "python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -m" + python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -m done done From 9efa26ad3b76a7bb39e506beedb6d0275c8310ac Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 6 Jan 2026 18:42:00 +0100 Subject: [PATCH 5/9] fix: handle dict response from Gemini structured output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LangChain's Gemini integration returns dict instead of Pydantic model when using with_structured_output(), unlike Claude/OpenAI. Added conversion logic to handle this and support new Gemini models. Changes: - Convert dict to Pydantic model in GeminiLLM.generate_structured_response - Add gemini-3-pro-preview and gemini-2.5-flash to model config - Remove debug print statement in conversation_simulator 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- generate_conversations/conversation_simulator.py | 2 +- llm_clients/config.py | 10 ++++++++++ llm_clients/gemini_llm.py | 16 ++++++++++++++-- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/generate_conversations/conversation_simulator.py b/generate_conversations/conversation_simulator.py index ac4ad5b9..3a70edb8 100644 --- a/generate_conversations/conversation_simulator.py +++ b/generate_conversations/conversation_simulator.py @@ -116,7 +116,7 @@ async def start_conversation( multi_response_message, ResponseWithScores ) ) - print(f"Structured response: {structured_response}") + # Select the response with the highest score best_response = max( structured_response.responses, key=lambda x: x.probability diff --git a/llm_clients/config.py b/llm_clients/config.py index a66b1687..5d5196f4 100644 --- a/llm_clients/config.py +++ b/llm_clients/config.py @@ -49,6 +49,16 @@ class Config: "max_tokens": 1000, }, "gemini-pro": {"provider": "google", "temperature": 0.7, "max_tokens": 1000}, + "gemini-3-pro-preview": { + "provider": "google", + "temperature": 0.7, + "max_tokens": 1000, + }, + "gemini-2.5-flash": { + "provider": "google", + "temperature": 0.7, + "max_tokens": 1000, + }, "llama2:7b": { "provider": "ollama", "temperature": 0.7, diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index ac34e1c3..e2552483 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -180,9 +180,21 @@ async def generate_structured_response( } # Ensure response is the correct type - if not isinstance(response, response_model): + # LangChain's Gemini integration may return dict instead of Pydantic + if isinstance(response, dict): + try: + response = response_model(**response) + except Exception as conv_error: + model_name = response_model.__name__ + raise ValueError( + f"Failed to convert dict to {model_name}: " + f"{conv_error}. Response: {response}" + ) from conv_error + elif not isinstance(response, response_model): + model_name = response_model.__name__ + response_type = type(response) raise ValueError( - f"Response is not an instance of {response_model.__name__}" + f"Response is not an instance of {model_name}, got {response_type}" ) return response # type: ignore[return-value] From 7dfa213b8d61d0a55a39eeb9e6179a56d98cf0af Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 6 Jan 2026 18:45:12 +0100 Subject: [PATCH 6/9] fix: add retry logic for Gemini MALFORMED_FUNCTION_CALL errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini's API sometimes returns None due to MALFORMED_FUNCTION_CALL issues when using with_structured_output(). This is a known LangChain-Gemini bug where the API probabilistically returns malformed function calls with empty tool_calls arrays, causing parsers to return None instead of raising errors. Solution: - Add retry loop (max 3 attempts) in generate_structured_response() - Log warnings on None responses before retrying - Raise informative error after max retries exceeded - Track retry attempts in response metadata Fixes ValueError: Response is not an instance of ResponseWithScores, got NoneType See: https://github.com/langchain-ai/langchain-google/issues/1207 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- llm_clients/gemini_llm.py | 147 +++++++++++++++++++++++++------------- 1 file changed, 99 insertions(+), 48 deletions(-) diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index e2552483..eb06f7d0 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -142,16 +142,22 @@ def get_last_response_metadata(self) -> Dict[str, Any]: return self.last_response_metadata.copy() async def generate_structured_response( - self, message: Optional[str], response_model: Type[T] + self, message: Optional[str], response_model: Type[T], max_retries: int = 3 ) -> T: """Generate a structured response using Pydantic model. Args: message: The prompt message response_model: Pydantic model class to structure the response + max_retries: Maximum number of retries for None responses (default: 3) Returns: Instance of the response_model with structured data + + Note: + Gemini sometimes returns None due to MALFORMED_FUNCTION_CALL issues. + This method will retry up to max_retries times before failing. + See: https://github.com/langchain-ai/langchain-google/issues/1207 """ messages = [] @@ -160,55 +166,100 @@ async def generate_structured_response( messages.append(HumanMessage(content=message)) - try: - # Create a structured LLM using with_structured_output - structured_llm = self.llm.with_structured_output(response_model) - - start_time = time.time() - response = await structured_llm.ainvoke(messages) - end_time = time.time() - - # Store basic metadata for structured responses - self.last_response_metadata = { - "response_id": None, - "model": self.model_name, - "provider": "gemini", - "timestamp": datetime.now().isoformat(), - "response_time_seconds": round(end_time - start_time, 3), - "usage": {}, - "structured_output": True, - } - - # Ensure response is the correct type - # LangChain's Gemini integration may return dict instead of Pydantic - if isinstance(response, dict): - try: - response = response_model(**response) - except Exception as conv_error: + last_error = None + for attempt in range(max_retries): + try: + # Create a structured LLM using with_structured_output + structured_llm = self.llm.with_structured_output(response_model) + + start_time = time.time() + response = await structured_llm.ainvoke(messages) + end_time = time.time() + + # Store basic metadata for structured responses + self.last_response_metadata = { + "response_id": None, + "model": self.model_name, + "provider": "gemini", + "timestamp": datetime.now().isoformat(), + "response_time_seconds": round(end_time - start_time, 3), + "usage": {}, + "structured_output": True, + "retry_attempt": attempt + 1, + } + + # Handle None response (MALFORMED_FUNCTION_CALL from Gemini) + if response is None: + error_msg = ( + f"Gemini returned None (attempt {attempt + 1}/{max_retries}). " + f"This is a known issue with Gemini's function calling. " + ) + if attempt < max_retries - 1: + print(f"WARNING: {error_msg}Retrying...") + continue + else: + raise ValueError( + f"{error_msg}Max retries exceeded. " + f"Message: {message[:200] if message else 'None'}..." + ) + + # Ensure response is the correct type + # LangChain's Gemini integration may return dict instead of Pydantic + if isinstance(response, dict): + try: + response = response_model(**response) + except Exception as conv_error: + model_name = response_model.__name__ + raise ValueError( + f"Failed to convert dict to {model_name}: " + f"{conv_error}. Response: {response}" + ) from conv_error + elif not isinstance(response, response_model): model_name = response_model.__name__ + response_type = type(response) raise ValueError( - f"Failed to convert dict to {model_name}: " - f"{conv_error}. Response: {response}" - ) from conv_error - elif not isinstance(response, response_model): - model_name = response_model.__name__ - response_type = type(response) - raise ValueError( - f"Response is not an instance of {model_name}, got {response_type}" - ) - - return response # type: ignore[return-value] - except Exception as e: - # Store error metadata - self.last_response_metadata = { - "response_id": None, - "model": self.model_name, - "provider": "gemini", - "timestamp": datetime.now().isoformat(), - "error": str(e), - "usage": {}, - } - raise RuntimeError(f"Error generating structured response: {str(e)}") from e + f"Response is not an instance of {model_name}, " + f"got {response_type}" + ) + + return response # type: ignore[return-value] + + except ValueError as e: + # If it's a None response error and we have retries left, continue + if "returned None" in str(e) and attempt < max_retries - 1: + last_error = e + continue + # Otherwise, re-raise + raise + except Exception as e: + # For other exceptions, store error and re-raise + self.last_response_metadata = { + "response_id": None, + "model": self.model_name, + "provider": "gemini", + "timestamp": datetime.now().isoformat(), + "error": str(e), + "usage": {}, + "retry_attempt": attempt + 1, + } + raise RuntimeError( + f"Error generating structured response: {str(e)}" + ) from e + + # If we exhausted all retries + self.last_response_metadata = { + "response_id": None, + "model": self.model_name, + "provider": "gemini", + "timestamp": datetime.now().isoformat(), + "error": str(last_error), + "usage": {}, + "retry_attempts": max_retries, + } + raise RuntimeError( + f"Error generating structured response after {max_retries} retries: " + f"{str(last_error)}" + ) from last_error def set_system_prompt(self, system_prompt: str) -> None: """Set or update the system prompt.""" From b5dbef9b5e2a5f603c1b753bfb5f225a247faad0 Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 6 Jan 2026 19:10:57 +0100 Subject: [PATCH 7/9] fix: add Gemini 3 fallback using JSON text parsing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Gemini 3 models have compatibility issues with LangChain's with_structured_output() that cause None responses. This adds: 1. Model detection for Gemini 3.x by name 2. JSON text parsing fallback for Gemini 3 models 3. Explicit JSON schema instructions in prompt 4. JSON extraction from text (handles code blocks and raw JSON) 5. Pydantic model validation Changes: - Add _generate_structured_via_json_parsing() method - Detect Gemini 3.x and route to fallback - Add exponential backoff for Gemini 2.x retries (1s, 2s, 4s) - Keep Gemini 2.x using normal structured output path Testing: - Gemini 2.5-flash: Uses structured output API - Gemini 3-pro-preview: Uses JSON parsing fallback 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- llm_clients/gemini_llm.py | 77 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/llm_clients/gemini_llm.py b/llm_clients/gemini_llm.py index eb06f7d0..982a1f1a 100644 --- a/llm_clients/gemini_llm.py +++ b/llm_clients/gemini_llm.py @@ -141,6 +141,57 @@ def get_last_response_metadata(self) -> Dict[str, Any]: """Get metadata from the last response.""" return self.last_response_metadata.copy() + async def _generate_structured_via_json_parsing( + self, message: Optional[str], response_model: Type[T] + ) -> T: + """Fallback method for Gemini 3 that parses JSON from text response. + + Args: + message: The prompt message + response_model: Pydantic model class to structure the response + + Returns: + Instance of the response_model with structured data + """ + import json + import re + + # Add JSON formatting instruction to the message + json_message = ( + f"{message}\n\n" + f"IMPORTANT: Respond with ONLY valid JSON matching this schema:\n" + f"{response_model.model_json_schema()}\n\n" + f"Do not include any text before or after the JSON." + ) + + # Use normal text generation + text_response = await self.generate_response(json_message) + + # Try to extract JSON from the response + try: + # First, try to parse the whole response as JSON + parsed_data = json.loads(text_response) + except json.JSONDecodeError: + # If that fails, try to find JSON in code blocks + json_match = re.search( + r"```(?:json)?\s*(\{.*?\})\s*```", text_response, re.DOTALL + ) + if json_match: + parsed_data = json.loads(json_match.group(1)) + else: + # Try to find any JSON object in the text + json_match = re.search(r"\{.*\}", text_response, re.DOTALL) + if json_match: + parsed_data = json.loads(json_match.group(0)) + else: + raise ValueError( + f"Could not extract valid JSON from Gemini response. " + f"Response: {text_response[:500]}" + ) + + # Convert to Pydantic model + return response_model(**parsed_data) + async def generate_structured_response( self, message: Optional[str], response_model: Type[T], max_retries: int = 3 ) -> T: @@ -155,10 +206,21 @@ async def generate_structured_response( Instance of the response_model with structured data Note: - Gemini sometimes returns None due to MALFORMED_FUNCTION_CALL issues. - This method will retry up to max_retries times before failing. + Gemini 2.x models work reliably with structured output. + Gemini 3.x models have issues with LangChain's structured output + and will fall back to JSON text parsing. See: https://github.com/langchain-ai/langchain-google/issues/1207 """ + # Check if this is a Gemini 3.x model + is_gemini_3 = "gemini-3" in self.model_name.lower() + + if is_gemini_3: + # Gemini 3 has issues with structured output, use JSON parsing fallback + return await self._generate_structured_via_json_parsing( + message, response_model + ) + + # Gemini 2.x and earlier use normal structured output path messages = [] if self.system_prompt: @@ -166,10 +228,14 @@ async def generate_structured_response( messages.append(HumanMessage(content=message)) + import asyncio + last_error = None for attempt in range(max_retries): try: # Create a structured LLM using with_structured_output + # Note: Keeping function_calling as default since json_schema + # may not be available in all langchain-google-genai versions structured_llm = self.llm.with_structured_output(response_model) start_time = time.time() @@ -195,7 +261,12 @@ async def generate_structured_response( f"This is a known issue with Gemini's function calling. " ) if attempt < max_retries - 1: - print(f"WARNING: {error_msg}Retrying...") + # Wait before retrying (exponential backoff) + wait_time = 2**attempt + print( + f"WARNING: {error_msg}Waiting {wait_time}s before retry..." + ) + await asyncio.sleep(wait_time) continue else: raise ValueError( From 4e5af812166a8dfc1e6d4a4d071768918e04cbea Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 6 Jan 2026 20:16:28 +0100 Subject: [PATCH 8/9] feat: conditionally include multiple response instructions based on -m flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the -m flag is not passed to generate.py, the persona prompt template no longer includes instructions for generating multiple responses. This ensures single response generation by default. Changes: - Add multiple_responses parameter to load_prompts_from_csv() - Filter out multiple response instructions from template when flag is False - Pass multiple_responses flag from runner to utils - Remove unused timestamp variable in runner.py 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- generate_conversations/runner.py | 9 +++++---- generate_conversations/utils.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/generate_conversations/runner.py b/generate_conversations/runner.py index 5f38f437..bb99aa71 100644 --- a/generate_conversations/runner.py +++ b/generate_conversations/runner.py @@ -68,8 +68,6 @@ async def run_single_conversation( # Generate filename base using persona name, model, and run number tag = uuid.uuid4().hex[:6] - # TODO: consider removing timestamp - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")[:-3] # TODO: should this be inside the LLM class? model_short = ( model_name.replace("claude-3-", "c3-") @@ -167,7 +165,9 @@ async def run_conversations( ) -> List[Dict[str, Any]]: """Run multiple conversations concurrently.""" # Load prompts from CSV based on persona names - personas = load_prompts_from_csv(persona_names) + personas = load_prompts_from_csv( + persona_names, multiple_responses=self.multiple_responses + ) # Load agent configuration (fixed, shared across all conversations) agent = LLMFactory.create_llm( @@ -212,7 +212,8 @@ async def run_with_limit(task): return await task print( - f"Running {len(tasks)} conversations with max concurrency: {self.max_concurrent}" + f"Running {len(tasks)} conversations with max concurrency: " + f"{self.max_concurrent}" ) results = await asyncio.gather(*[run_with_limit(task) for task in tasks]) else: diff --git a/generate_conversations/utils.py b/generate_conversations/utils.py index 186d9666..018df5b0 100644 --- a/generate_conversations/utils.py +++ b/generate_conversations/utils.py @@ -11,6 +11,7 @@ def load_prompts_from_csv( name_list: Optional[List[str]] = None, prompt_path="data/personas.tsv", prompt_template_path="data/persona_prompt_template.txt", + multiple_responses: bool = False, ) -> List[dict[str, str]]: """Load prompts from personas.csv file and return them as a list. @@ -18,6 +19,8 @@ def load_prompts_from_csv( name_list: Optional list of names to filter by. If None, returns all prompts. prompt_path: Path to the CSV file containing persona data prompt_template_path: Path to the template file for formatting prompts + multiple_responses: If True, include instructions for generating + multiple responses """ csv_path = Path(prompt_path) @@ -33,6 +36,22 @@ def load_prompts_from_csv( with open(template_path, "r", encoding="utf-8") as template_file: template = template_file.read() + # Remove multiple response instructions if not needed + if not multiple_responses: + lines = template.split("\n") + filtered_lines = [] + skip_next = False + for line in lines: + # Skip the three lines about multiple responses + if "When asked to provide multiple responses" in line: + skip_next = 2 # Skip this line and the next 2 + continue + if skip_next > 0: + skip_next -= 1 + continue + filtered_lines.append(line) + template = "\n".join(filtered_lines) + data = [] with open(csv_path, "r", encoding="utf-8") as f: reader = csv.DictReader(f, delimiter="\t") From 932750cdeecd5ca6208cc3630c6a91a4cf6e6061 Mon Sep 17 00:00:00 2001 From: Luca Belli <129434630+sator-labs@users.noreply.github.com> Date: Tue, 6 Jan 2026 20:18:59 +0100 Subject: [PATCH 9/9] chore: update test script and format test assertions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update run_combinations.sh to test without -m flag by default - Comment out most model combinations for faster testing - Format long assertion messages across multiple lines for readability - Fix line length issues in test files 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- scripts/run_combinations.sh | 26 +++++++++++++------------- tests/mocks/mock_llm.py | 2 +- tests/test_question_navigator.py | 3 ++- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/scripts/run_combinations.sh b/scripts/run_combinations.sh index ec4db6db..2ef99eed 100755 --- a/scripts/run_combinations.sh +++ b/scripts/run_combinations.sh @@ -10,29 +10,29 @@ TURNS=20 MAX_WORDS=4000 users=( - "claude-opus-4-5-20251101" + # "claude-opus-4-5-20251101" "claude-sonnet-4-5-20250929" - "chatgpt-4o-latest" - "gpt-5.2-2025-12-11 -uep max_completion_tokens=5000" - "gemini-3-pro-preview" - "gemini-2.5-flash" + # "gpt-4o" + # "gpt-5.2-2025-12-11 -uep max_completion_tokens=5000" + # "gemini-3-pro-preview" + # "gemini-2.5-flash" ) providers=( - "claude-opus-4-5-20251101" - "claude-sonnet-4-5-20250929" - "chatgpt-4o-latest" - "gpt-5.2-2025-12-11 -pep max_completion_tokens=5000" - "claude-opus-4-5-20251101" - "gemini-3-pro-preview" - "gemini-2.5-flash" + # "claude-opus-4-5-20251101" + # "claude-sonnet-4-5-20250929" + "gpt-4o" + # "gpt-5.2-2025-12-11 -pep max_completion_tokens=5000" + # "claude-opus-4-5-20251101" + # "gemini-3-pro-preview" + # "gemini-2.5-flash" ) for user in "${users[@]}"; do for provider in "${providers[@]}"; do echo "Running with user: $user, provider: $provider" echo "python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -m" - python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS -m + python3 generate.py -u $user -p $provider -r $RUNS -t $TURNS done done diff --git a/tests/mocks/mock_llm.py b/tests/mocks/mock_llm.py index 068a8491..d3031db0 100644 --- a/tests/mocks/mock_llm.py +++ b/tests/mocks/mock_llm.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Type, TypeVar, Union +from typing import Any, Dict, List, Optional, Type, TypeVar from llm_clients.llm_interface import JudgeLLM diff --git a/tests/test_question_navigator.py b/tests/test_question_navigator.py index becd1e96..84e41c31 100644 --- a/tests/test_question_navigator.py +++ b/tests/test_question_navigator.py @@ -65,7 +65,8 @@ class TestNotRelevantGoto: def test_not_relevant_parsing(self, navigator): """Test that NOT_RELEVANT>>ID is parsed correctly""" - # Question 13 should have "Denies suicidal thoughts" answer with NOT_RELEVANT>>25 + # Question 13 should have "Denies suicidal thoughts" answer + # with NOT_RELEVANT>>25 q13_data = navigator.get_question_data("13") assert q13_data is not None