Handle planner follow-ups for empty tool results

jck411 · jck411 · commit 8f0486b69d13 · 2025-10-29T04:35:08.000-04:00
diff --git a/data/model_settings.json b/data/model_settings.json
@@ -1,5 +1,5 @@
 {
   "model": "openai/gpt-4o-mini-2024-07-18",
-  "system_prompt": "You are a helpful but funny assistant. You have access to tools. User is Jack and live in Olrlando, Fl",
-  "updated_at": "2025-10-28T21:00:33.372068+00:00"
+  "system_prompt": "You are a focused assistant that collaborates with planner-provided MCP tools.\n\nDecision rules:\n1. Confirm the current date and timezone by calling the `calendar_current_context` tool before referencing \"today\" or upcoming events.\n2. Select tools that match the planner's context tags and ask the user before touching tools outside those tags.\n3. When a tool returns no data, a guardrail message, or an error, confirm with the user whether to widen the time span or include the additional planner contexts before retrying.\n4. Never guess or fabricate tool output—collect whatever details you still need from the user.\n5. Ask for consent before launching broad or cross-context searches.",
+  "updated_at": "2025-01-01T00:00:00+00:00"
 }
diff --git a/src/backend/chat/streaming.py b/src/backend/chat/streaming.py
@@ -805,11 +805,15 @@ async def stream_conversation(
 
                 arguments_raw = function.get("arguments")
                 status = "finished"
+                result_text = ""
+                result_obj: Any | None = None
+                missing_arguments = False
                 if not arguments_raw or arguments_raw.strip() == "":
                     result_text = (
                         f"Tool {tool_name} requires arguments but none were provided."
                     )
                     status = "error"
+                    missing_arguments = True
                     logger.warning("Missing tool arguments for %s", tool_name)
                 else:
                     try:
@@ -839,13 +843,17 @@ async def stream_conversation(
                             if session_id and _tool_requires_session_id(tool_name):
                                 working_arguments.setdefault("session_id", session_id)
                             try:
-                                result = await self._tool_client.call_tool(
+                                result_obj = await self._tool_client.call_tool(
                                     tool_name, working_arguments
                                 )
                                 result_text = self._tool_client.format_tool_result(
-                                    result
+                                    result_obj
+                                )
+                                status = (
+                                    "error"
+                                    if getattr(result_obj, "isError", False)
+                                    else "finished"
                                 )
-                                status = "error" if result.isError else "finished"
                             except Exception as exc:  # pragma: no cover - MCP errors
                                 logger.exception(
                                     "Tool '%s' raised an exception", tool_name
@@ -892,8 +900,41 @@ async def stream_conversation(
                     ),
                 }
 
-                if status != "error" and _looks_like_no_result(result_text):
-                    expand_contexts = True
+                notice_reason = _classify_tool_followup(
+                    status,
+                    result_text,
+                    tool_error_flag=bool(
+                        getattr(result_obj, "isError", False)
+                    ),
+                    missing_arguments=missing_arguments,
+                )
+                if notice_reason is not None:
+                    next_contexts: list[str] = []
+                    will_use_all_tools = False
+                    if tool_context_plan is not None:
+                        if notice_reason in {"no_results", "empty_result", "tool_error"}:
+                            next_contexts = (
+                                tool_context_plan.additional_contexts_for_attempt(hop_count)
+                            )
+                            will_use_all_tools = (
+                                tool_context_plan.use_all_tools_for_attempt(hop_count + 1)
+                            )
+                    if notice_reason in {"no_results", "empty_result", "tool_error"}:
+                        expand_contexts = True
+                    notice_payload = {
+                        "type": "tool_followup_required",
+                        "tool": tool_name or "unknown",
+                        "reason": notice_reason,
+                        "message": result_text,
+                        "attempt": hop_count,
+                        "next_contexts": next_contexts,
+                        "will_use_all_tools": will_use_all_tools,
+                        "confirmation_required": True,
+                    }
+                    yield {
+                        "event": "notice",
+                        "data": json.dumps(notice_payload),
+                    }
 
             hop_count += 1
 
@@ -942,10 +983,44 @@ def _looks_like_no_result(result_text: str) -> bool:
         "wasn't found",
         "nothing found",
         "no matching",
+        "no events found",
     )
     return any(phrase in lowered for phrase in phrases)
 
 
+def _classify_tool_followup(
+    status: str,
+    result_text: str | None,
+    *,
+    tool_error_flag: bool,
+    missing_arguments: bool,
+) -> str | None:
+    """Return why a tool result needs follow-up, or None when it does not.
+
+    Possible reasons: "missing_arguments", "no_results", "tool_error" and
+    "empty_result". ``tool_error_flag`` is kept for call-site compatibility;
+    any error that is not a "no result" message maps to "tool_error".
+    """
+    text = result_text if isinstance(result_text, str) else ""
+
+    if missing_arguments:
+        return "missing_arguments"
+
+    if status == "error":
+        # A "nothing found" message takes precedence over the error status.
+        if _looks_like_no_result(text):
+            return "no_results"
+        return "tool_error"
+
+    if not text.strip():
+        return "empty_result"
+
+    if _looks_like_no_result(text):
+        return "no_results"
+
+    return None
+
+
 def _is_tool_support_error(error: OpenRouterError) -> bool:
     detail = error.detail
     message = ""
diff --git a/src/backend/chat/tool_context_planner.py b/src/backend/chat/tool_context_planner.py
@@ -33,6 +33,33 @@ def contexts_for_attempt(self, attempt: int) -> list[str]:
                 merged.append(normalized)
         return merged
 
+    def additional_contexts_for_attempt(self, attempt: int) -> list[str]:
+        """List the contexts that the attempt after ``attempt`` would add.
+
+        Names are stripped and lower-cased. Blank names, names already used
+        by stages ``0..attempt`` and repeats within the next stage are left
+        out. Returns ``[]`` when no stage follows ``attempt``.
+        """
+
+        # Negative attempts are clamped so stage 0 always counts as offered.
+        next_index = max(attempt, 0) + 1
+        if not self.stages or next_index >= len(self.stages):
+            return []
+
+        already_offered = {
+            name.strip().lower()
+            for earlier in self.stages[:next_index]
+            for name in earlier
+        }
+        already_offered.add("")  # blank names are never reported
+
+        fresh: list[str] = []
+        for raw in self.stages[next_index]:
+            name = raw.strip().lower()
+            if name not in already_offered and name not in fresh:
+                fresh.append(name)
+        return fresh
+
     def use_all_tools_for_attempt(self, attempt: int) -> bool:
         """Return True when the attempt should fall back to every tool."""
 
diff --git a/tests/test_streaming.py b/tests/test_streaming.py
@@ -311,6 +311,61 @@ async def stream_chat_raw(self, payload: dict[str, Any]):
         yield {"data": "[DONE]"}
 
 
+class MultiToolOpenRouterClient(DummyOpenRouterClient):
+    """Scripted OpenRouter stand-in for tool-call streaming tests.
+
+    Each request replays the next entry of ``tool_calls`` as one tool-call
+    chunk; once the script is used up, requests stream ``final_message``.
+    """
+
+    def __init__(self, tool_calls: list[dict[str, Any]], final_message: str) -> None:
+        super().__init__()
+        self.tool_calls = tool_calls
+        self.final_message = final_message
+        self.call_index = 0
+
+    async def stream_chat_raw(self, payload: dict[str, Any]):
+        """Record ``payload``, then yield one chunk followed by ``[DONE]``.
+
+        Chunk and tool-call ids embed the 1-based request number.
+        """
+        self.payloads.append(payload)
+        position = self.call_index
+        self.call_index += 1
+        if position >= len(self.tool_calls):
+            # Script exhausted: answer with the final assistant message.
+            choice = {
+                "delta": {"content": self.final_message},
+                "finish_reason": "stop",
+            }
+            chunk = {"id": f"gen-final-{self.call_index}", "choices": [choice]}
+        else:
+            scripted = self.tool_calls[position]
+            raw_args = scripted.get("arguments", {})
+            function = {
+                "name": scripted.get("name", "calendar_lookup"),
+                # Dict arguments are JSON-encoded; anything else goes via str().
+                "arguments": (
+                    json.dumps(raw_args)
+                    if isinstance(raw_args, dict)
+                    else str(raw_args)
+                ),
+            }
+            tool_call = {
+                "id": f"call_{self.call_index}",
+                "type": "function",
+                "function": function,
+            }
+            choice = {
+                "delta": {"tool_calls": [tool_call]},
+                "finish_reason": "tool_calls",
+            }
+            chunk = {"id": f"gen-tool-{self.call_index}", "choices": [choice]}
+
+        yield {"data": json.dumps(chunk)}
+        yield {"data": "[DONE]"}
+
+
 class ExpandingToolClient:
     def __init__(self) -> None:
         self.context_history: list[list[str]] = []
@@ -585,6 +640,18 @@ async def test_streaming_expands_contexts_after_no_result() -> None:
     assert len(client.payloads) == 2
     assert events[-1]["data"] == "[DONE]"
 
+    notice_events = [
+        json.loads(event["data"])
+        for event in events
+        if event.get("event") == "notice"
+    ]
+    assert notice_events, "Expected notice event after empty tool result"
+    notice = notice_events[0]
+    assert notice["reason"] == "no_results"
+    assert notice["tool"] == "calendar_lookup"
+    assert notice["next_contexts"] == ["tasks"]
+    assert notice["confirmation_required"] is True
+
 
 @pytest.mark.anyio("asyncio")
 async def test_structured_tool_choice_does_not_retry_without_tools() -> None:
@@ -625,3 +692,118 @@ async def test_structured_tool_choice_does_not_retry_without_tools() -> None:
             pass
 
     assert client.calls == 1
+
+
+@pytest.mark.anyio("asyncio")
+async def test_streaming_emits_notice_for_missing_arguments() -> None:
+    client = MultiToolOpenRouterClient(
+        [{"name": "calendar_lookup", "arguments": ""}],  # empty arguments string
+        final_message="Please share more details.",
+    )
+    tool_client = ExpandingToolClient()
+    handler = StreamingHandler(
+        client,  # type: ignore[arg-type]
+        DummyRepository(),  # type: ignore[arg-type]
+        tool_client,  # type: ignore[arg-type]
+        default_model="openrouter/auto",
+    )
+
+    request = ChatCompletionRequest(
+        messages=[ChatMessage(role="user", content="Check my calendar today")],
+    )
+    conversation = [{"role": "user", "content": "Check my calendar today"}]
+    plan = ToolContextPlan(stages=[["calendar"], ["tasks"]], broad_search=True)
+    initial_tools = tool_client.get_openai_tools_for_contexts(
+        plan.contexts_for_attempt(0)
+    )
+
+    events: list[dict[str, Any]] = []  # every event yielded by the stream
+    async for event in handler.stream_conversation(
+        "session-missing-args",
+        request,
+        conversation,
+        initial_tools,
+        None,
+        plan,
+    ):
+        events.append(event)
+
+    notice_events = [
+        json.loads(event["data"])
+        for event in events
+        if event.get("event") == "notice"
+    ]
+
+    assert notice_events, "Expected notice event when tool arguments are missing"
+    notice = notice_events[0]
+    assert notice["reason"] == "missing_arguments"
+    assert notice["tool"] == "calendar_lookup"
+    assert notice["next_contexts"] == []  # no expansion offered for this reason
+    assert notice["confirmation_required"] is True
+    assert tool_client.calls == 0  # presumably counts tool invocations: none ran
+    assert len(client.payloads) == 2  # tool-call request + final-answer request
+    assert events[-1]["data"] == "[DONE]"
+
+
+@pytest.mark.anyio("asyncio")
+async def test_streaming_handles_multi_stage_notices() -> None:
+    client = MultiToolOpenRouterClient(
+        [
+            {"name": "calendar_lookup", "arguments": {"query": "habit review"}},
+            {"name": "tasks_lookup", "arguments": {"query": "habit review"}},
+        ],
+        final_message="Let's confirm the plan.",
+    )
+    tool_client = ExpandingToolClient()
+    tool_client.results = [
+        "No events found in that window.",  # contains the "no events found" phrase
+        "No matching tasks were located.",  # contains the "no matching" phrase
+    ]
+    handler = StreamingHandler(
+        client,  # type: ignore[arg-type]
+        DummyRepository(),  # type: ignore[arg-type]
+        tool_client,  # type: ignore[arg-type]
+        default_model="openrouter/auto",
+    )
+
+    request = ChatCompletionRequest(
+        messages=[ChatMessage(role="user", content="Help me build better habits")],
+    )
+    conversation = [{"role": "user", "content": "Help me build better habits"}]
+    plan = ToolContextPlan(
+        stages=[["calendar"], ["tasks"], ["notes"]],  # one new context per stage
+        broad_search=True,
+    )
+    initial_tools = tool_client.get_openai_tools_for_contexts(
+        plan.contexts_for_attempt(0)
+    )
+
+    events: list[dict[str, Any]] = []  # every event yielded by the stream
+    async for event in handler.stream_conversation(
+        "session-multi-stage",
+        request,
+        conversation,
+        initial_tools,
+        None,
+        plan,
+    ):
+        events.append(event)
+
+    notice_events = [
+        json.loads(event["data"])
+        for event in events
+        if event.get("event") == "notice"
+    ]
+
+    assert len(notice_events) == 2  # one notice per scripted tool call
+    first_notice, second_notice = notice_events
+    assert first_notice["reason"] == "no_results"
+    assert first_notice["next_contexts"] == ["tasks"]
+    assert second_notice["reason"] == "no_results"
+    assert second_notice["next_contexts"] == ["notes"]
+    assert tool_client.context_history[0] == ["calendar"]
+    assert ["calendar", "tasks"] in tool_client.context_history
+    assert ["calendar", "tasks", "notes"] in tool_client.context_history
+    assert tool_client.calls == 2
+    assert len(client.payloads) == 3  # two tool-call requests + the final answer
+    assert events[-1]["data"] == "[DONE]"
diff --git a/tests/test_tool_context_planner.py b/tests/test_tool_context_planner.py
@@ -29,6 +29,17 @@ def test_planner_expands_habit_contexts() -> None:
     assert plan.contexts_for_attempt(2) == ["calendar", "tasks", "notes"]
 
 
+def test_plan_reports_additional_contexts() -> None:
+    planner = ToolContextPlanner()
+    request = _make_request("Help me build better habits this month")
+
+    plan = planner.plan(request, [])
+
+    assert plan.additional_contexts_for_attempt(0) == ["tasks"]  # added by attempt 1
+    assert plan.additional_contexts_for_attempt(1) == ["notes"]  # added by attempt 2
+    assert plan.additional_contexts_for_attempt(2) == []  # expects no later stage
+
+
 def test_planner_routes_documents_to_gdrive() -> None:
     planner = ToolContextPlanner()
     request = _make_request("search docs for last year's budget")

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`{`
`2`	`2`	`"model": "openai/gpt-4o-mini-2024-07-18",`
`3`		`- "system_prompt": "You are a helpful but funny assistant. You have access to tools. User is Jack and live in Olrlando, Fl",`
`4`		`- "updated_at": "2025-10-28T21:00:33.372068+00:00"`
	`3`	+ "system_prompt": "You are a focused assistant that collaborates with planner-provided MCP tools.\n\nDecision rules:\n1. Confirm the current date and timezone by calling the `calendar_current_context` tool before referencing \"today\" or upcoming events.\n2. Select tools that match the planner's context tags and ask the user before touching tools outside those tags.\n3. When a tool returns no data, a guardrail message, or an error, confirm with the user whether to widen the time span or include the additional planner contexts before retrying.\n4. Never guess or fabricate tool output—collect whatever details you still need from the user.\n5. Ask for consent before launching broad or cross-context searches.",
	`4`	`+ "updated_at": "2025-01-01T00:00:00+00:00"`
`5`	`5`	`}`