jbrinkw
diff --git a/‎core/agent/project.py‎
Lines changed: 14 additions & 2 deletions b/‎core/agent/project.py‎
Lines changed: 14 additions & 2 deletions
diff --git a/‎core/agent/simple.py‎
Lines changed: 110 additions & 4 deletions b/‎core/agent/simple.py‎
Lines changed: 110 additions & 4 deletions
diff --git a/‎core/agent/simple_passthough.py‎
Lines changed: 117 additions & 0 deletions b/‎core/agent/simple_passthough.py‎
Lines changed: 117 additions & 0 deletions
diff --git a/‎core/helpers/llm_selector.py‎
Lines changed: 12 additions & 10 deletions b/‎core/helpers/llm_selector.py‎
Lines changed: 12 additions & 10 deletions
@@ -648,8 +648,20 @@ async def run_agent_stream(user_prompt: str, chat_history: Optional[str] = None,
                 if ev == "on_chat_model_stream":
                     data = event.get("data") if isinstance(event, dict) else getattr(event, "data", {})
                     chunk = (data or {}).get("chunk") if isinstance(data, dict) else getattr(data, "chunk", None)
-                    text = getattr(chunk, "content", None)
-                    if isinstance(text, str) and text:
+                    content = getattr(chunk, "content", None)
+                    
+                    # Handle both string content (OpenAI) and list content (Anthropic)
+                    text = None
+                    if isinstance(content, str):
+                        text = content
+                    elif isinstance(content, list) and content:
+                        # Anthropic format: [{'text': '...', 'type': 'text', 'index': 0}]
+                        for item in content:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                text = item.get("text", "")
+                                break
+                    
+                    if text:
                         yielded_any = True
                         yield text
             except Exception:
 
@@ -50,6 +50,7 @@ class AgentResult(BaseModel):
 # ---- Runtime cache ----
 PRELOADED_TOOLS: List[Any] = []
 RUN_TRACES: List[ToolTrace] = []
+DOMAIN_PROMPTS_TEXT: str = ""
 
 
 def _get_env(key: str, default: Optional[str] = None) -> Optional[str]:
@@ -136,19 +137,32 @@ def _runner(**kwargs):
 
 
 def initialize_runtime(tool_root: Optional[str] = None) -> None:
-    global PRELOADED_TOOLS
+    global PRELOADED_TOOLS, DOMAIN_PROMPTS_TEXT
     try:
         exts = discover_extensions(tool_root)
     except Exception:
         exts = []
     tools: List[Any] = []
+    domain_prompts: List[str] = []
     for ext in exts:
         for fn in (ext.get("tools") or []):
             try:
                 tools.append(_wrap_callable_as_tool(fn, ext.get("name", "unknown")))
             except Exception:
                 continue
+        # Collect full domain system prompts
+        try:
+            name = ext.get("name", "")
+            sp = ext.get("system_prompt", "")
+            if isinstance(name, str) and isinstance(sp, str) and sp.strip():
+                domain_prompts.append(f"[Domain: {name}]\n{sp.strip()}")
+        except Exception:
+            pass
     PRELOADED_TOOLS = tools
+    try:
+        DOMAIN_PROMPTS_TEXT = "\n\n".join([p for p in domain_prompts if p])
+    except Exception:
+        DOMAIN_PROMPTS_TEXT = ""
 
 
 def _active_models() -> Dict[str, str]:
@@ -192,7 +206,12 @@ async def run_agent(user_prompt: str, chat_history: Optional[str] = None, memory
             if isinstance(llm, str) and llm.strip() in {"low", "med", "high"}
             else get_chat_model(role="domain", model=_get_env("REACT_MODEL", "gpt-4.1"), callbacks=[LLMRunTracer("react")], temperature=0.0)
         )
-        agent = create_react_agent(model, tools=tools)
+        # Include combined domain system prompts as the agent's system prompt when available
+        if isinstance(DOMAIN_PROMPTS_TEXT, str) and DOMAIN_PROMPTS_TEXT.strip():
+            final_prompt = "Domain system prompts:\n" + DOMAIN_PROMPTS_TEXT.strip()
+            agent = create_react_agent(model, tools=tools, prompt=final_prompt)
+        else:
+            agent = create_react_agent(model, tools=tools)
     except Exception as e:
         msg = f"Error building ReAct agent: {str(e)}"
         return AgentResult(final=msg, results=[], timings=[], content=msg, response_time_secs=0.0, traces=[])
@@ -215,11 +234,11 @@ async def run_agent(user_prompt: str, chat_history: Optional[str] = None, memory
     import asyncio
     t0 = time.perf_counter()
     try:
-        result = await agent.ainvoke({"messages": messages}, config={"recursion_limit": 8, "callbacks": [LLMRunTracer("react")]})
+        result = await agent.ainvoke({"messages": messages}, config={"recursion_limit": 16, "callbacks": [LLMRunTracer("react")]})
     except RuntimeError:
         # fallback loop handling if needed
         loop = asyncio.get_event_loop()
-        result = await agent.ainvoke({"messages": messages}, config={"recursion_limit": 8, "callbacks": [LLMRunTracer("react")]})
+        result = await agent.ainvoke({"messages": messages}, config={"recursion_limit": 16, "callbacks": [LLMRunTracer("react")]})
     elapsed = time.perf_counter() - t0
 
     # Extract final content
@@ -249,6 +268,93 @@ async def run_agent(user_prompt: str, chat_history: Optional[str] = None, memory
     )
 
 
+async def run_agent_stream(user_prompt: str, chat_history: Optional[str] = None, memory: Optional[str] = None, tool_root: Optional[str] = None, llm: Optional[str] = None):
+    """Yield incremental text chunks while the agent generates a response.
+
+    Fallback: if streaming is unavailable, yields the final response once.
+    """
+    # Discover/warm tools if none are cached, or whenever an explicit tool_root string is passed
+    if not PRELOADED_TOOLS or isinstance(tool_root, str):
+        initialize_runtime(tool_root=tool_root)
+    tools = PRELOADED_TOOLS or []
+    if not tools:
+        yield "No tools discovered. Ensure files matching *_tool.py exist under extensions/."
+        return
+
+    # Build a simple ReAct agent with all tools
+    try:
+        from langgraph.prebuilt import create_react_agent
+        # Use tier if provided; else fall back to env model
+        model = (
+            get_chat_model(role="domain", tier=llm, callbacks=[LLMRunTracer("react")], temperature=0.0)
+            if isinstance(llm, str) and llm.strip() in {"low", "med", "high"}
+            else get_chat_model(role="domain", model=_get_env("REACT_MODEL", "gpt-4.1"), callbacks=[LLMRunTracer("react")], temperature=0.0)
+        )
+        # Include combined domain system prompts as the agent's system prompt when available
+        if isinstance(DOMAIN_PROMPTS_TEXT, str) and DOMAIN_PROMPTS_TEXT.strip():
+            final_prompt = "Domain system prompts:\n" + DOMAIN_PROMPTS_TEXT.strip()
+            agent = create_react_agent(model, tools=tools, prompt=final_prompt)
+        else:
+            agent = create_react_agent(model, tools=tools)
+    except Exception:
+        # If building agent fails, just yield non-streaming result from run_agent
+        res = await run_agent(user_prompt, chat_history=chat_history, memory=memory, tool_root=tool_root, llm=llm)
+        yield res.final
+        return
+
+    # Prepare messages
+    from langchain_core.messages import SystemMessage, HumanMessage
+    messages: List[Any] = []
+    if chat_history or memory:
+        messages.append(SystemMessage(content=(
+            "Conversation context to consider when responding.\n"
+            f"Chat history:\n{chat_history or ''}\n\n"
+            f"Memory:\n{memory or ''}"
+        )))
+    messages.append(HumanMessage(content=user_prompt))
+
+    # Clear traces for this run
+    del RUN_TRACES[:]
+
+    yielded_any = False
+    try:
+        # Prefer event-streaming for token deltas
+        async for event in agent.astream_events({"messages": messages}, config={"recursion_limit": 16, "callbacks": [LLMRunTracer("react")]}, version="v1"):
+            try:
+                ev = event.get("event") if isinstance(event, dict) else getattr(event, "event", None)
+                if ev == "on_chat_model_stream":
+                    data = event.get("data") if isinstance(event, dict) else getattr(event, "data", {})
+                    chunk = (data or {}).get("chunk") if isinstance(data, dict) else getattr(data, "chunk", None)
+                    content = getattr(chunk, "content", None)
+                    
+                    # Handle both string content (OpenAI) and list content (Anthropic)
+                    text = None
+                    if isinstance(content, str):
+                        text = content
+                    elif isinstance(content, list) and content:
+                        # Anthropic format: [{'text': '...', 'type': 'text', 'index': 0}]
+                        for item in content:
+                            if isinstance(item, dict) and item.get("type") == "text":
+                                text = item.get("text", "")
+                                break
+                    
+                    if text:
+                        yielded_any = True
+                        yield text
+            except Exception:
+                # Ignore malformed events; continue streaming
+                continue
+    except Exception:
+        # If streaming path fails, fall back to single-shot
+        pass
+
+    if not yielded_any:
+        # Fallback to non-streaming execution
+        res = await run_agent(user_prompt, chat_history=chat_history, memory=memory, tool_root=tool_root, llm=llm)
+        yield res.final
+        return
+
+
 def main(argv: Optional[List[str]] = None) -> int:
     import argparse
     parser = argparse.ArgumentParser(description="Simple ReAct agent over all tools")
 
@@ -458,6 +458,123 @@ async def run_agent(user_prompt: str, chat_history: Optional[str] = None, memory
     )
 
 
+async def run_agent_stream(user_prompt: str, chat_history: Optional[str] = None, memory: Optional[str] = None, tool_root: Optional[str] = None, llm: Optional[str] = None):
+    """Yield incremental text chunks while the agent generates a response.
+
+    Rather than token deltas, this planner-executor loop yields each step's successful passthrough tool results once that step's calls have finished, plus the planner's final text.
+    Fallback: if streaming is unavailable, yields the final response once.
+    """
+    # Prepare tools if none are cached, or whenever an explicit tool_root string is passed
+    if not TOOL_RUNNERS or isinstance(tool_root, str):
+        initialize_runtime(tool_root=tool_root)
+    if not TOOL_RUNNERS:
+        yield "No tools discovered. Ensure files matching *_tool.py exist under extensions/."
+        return
+
+    # Planner model: use tier if provided; otherwise env-based model
+    tracer = LLMRunTracer("planner")
+    if isinstance(llm, str) and llm.strip() in {"low", "med", "high"}:
+        model = get_chat_model(role="domain", tier=llm.strip(), callbacks=[tracer], temperature=0.0)
+    else:
+        planner_model_name = _get_env("MONO_PT_PLANNER_MODEL", _get_env("REACT_MODEL", "gpt-4.1")) or "gpt-4.1"
+        model = get_chat_model(role="domain", model=planner_model_name, callbacks=[tracer], temperature=0.0)
+
+    # Clear traces per run
+    del RUN_TRACES[:]
+
+    # Iterative plan-execute-review loop with streaming
+    accumulated_segments: List[str] = []
+    recursion_limit = int(_get_env("MONO_PT_RECURSION_LIMIT", "8") or 8)
+    followup_items: List[ToolResult] = []
+    step = 0
+    yielded_any = False
+
+    while step < recursion_limit:
+        step += 1
+        # Build planning messages (initial or follow-up)
+        messages = _build_planner_messages(
+            user_prompt=user_prompt,
+            chat_history=chat_history,
+            memory=memory,
+            light_schema=LIGHT_SCHEMA,
+            review_items=(followup_items or None),
+        )
+
+        # Invoke planner
+        _dbg_print(f"[simple-pt-stream] step {step}: planning...")
+        plan_resp = await model.ainvoke(messages)
+        
+        # Parse plan JSON
+        raw_text = (plan_resp.content or "") if hasattr(plan_resp, "content") else str(plan_resp)
+        _dbg_print(f"[simple-pt-stream] step {step}: planner raw -> {_truncate(raw_text, int(_get_env('MONO_PT_LOG_MAXLEN', '600') or '600'))}")
+        parsed = _extract_json_object(raw_text)
+        planner_step = PlannerStep()
+        if isinstance(parsed, dict):
+            try:
+                planner_step = PlannerStep.model_validate(parsed)
+            except Exception:
+                # Try to coerce structure
+                calls_raw = parsed.get("calls") if isinstance(parsed.get("calls"), list) else []
+                calls: List[PlannedToolCall] = []
+                for c in calls_raw:
+                    try:
+                        calls.append(PlannedToolCall.model_validate(c))
+                    except Exception:
+                        continue
+                planner_step = PlannerStep(calls=calls, final_text=parsed.get("final_text"))
+
+        # If planner provided final text only and no calls, finish
+        if not planner_step.calls:
+            if isinstance(planner_step.final_text, str) and planner_step.final_text.strip():
+                final_chunk = planner_step.final_text.strip()
+                accumulated_segments.append(final_chunk)
+                yield final_chunk
+                yielded_any = True
+                _dbg_print(f"[simple-pt-stream] step {step}: final_text provided; finishing.")
+            break
+
+        # Execute calls concurrently
+        _dbg_print(f"[simple-pt-stream] step {step}: executing {len(planner_step.calls)} call(s) concurrently...")
+        for idx, pc in enumerate(planner_step.calls, start=1):
+            try:
+                args_str = json.dumps(pc.args or {}, ensure_ascii=False)
+            except Exception:
+                args_str = str(pc.args or {})
+            _dbg_print(f"[simple-pt-stream] step {step} CALL {idx}/{len(planner_step.calls)}: tool={pc.tool} passthrough={(pc.options.passthrough if pc.options else True)} args={_truncate(args_str, int(_get_env('MONO_PT_LOG_MAXLEN', '600') or '600'))}")
+
+        _, paired = await _execute_planned_calls(planner_step.calls)
+
+        # Route outputs and stream passthrough results
+        followup_items = []
+        for idx, (pc, res) in enumerate(paired, start=1):
+            passthrough = True if pc.options is None else bool(getattr(pc.options, "passthrough", True))
+            if passthrough and res.success:
+                # Stream directly by yielding
+                if isinstance(res.public_text, str) and res.public_text.strip():
+                    chunk_text = res.public_text.strip()
+                    accumulated_segments.append(chunk_text)
+                    yield chunk_text
+                    yielded_any = True
+                _dbg_print(f"[simple-pt-stream] step {step} RESULT {idx}/{len(paired)}: tool={res.tool} success={res.success} passthrough={passthrough} ROUTE=STREAMED text={_truncate(res.public_text, int(_get_env('MONO_PT_LOG_MAXLEN', '600') or '600'))}")
+            else:
+                followup_items.append(res)
+                _dbg_print(f"[simple-pt-stream] step {step} RESULT {idx}/{len(paired)}: tool={res.tool} success={res.success} passthrough={passthrough} ROUTE=REVIEW text={_truncate(res.public_text, int(_get_env('MONO_PT_LOG_MAXLEN', '600') or '600'))}")
+
+        # If nothing needs review, stop looping; otherwise re-plan with the follow-up items
+        if not followup_items:
+            _dbg_print(f"[simple-pt-stream] step {step}: no follow-up needed; finishing.")
+            break
+
+    if not yielded_any:
+        # Fallback to non-streaming execution
+        _dbg_print("[simple-pt-stream] no content streamed; falling back to non-streaming.")
+        res = await run_agent(user_prompt, chat_history=chat_history, memory=memory, tool_root=tool_root, llm=llm)
+        yield res.final
+        return
+
+    _dbg_print(f"[simple-pt-stream] done. steps={step} segments={len(accumulated_segments)}")
+
+
 def main(argv: Optional[List[str]] = None) -> int:
     import argparse
     parser = argparse.ArgumentParser(description="Simple passthough agent over all tools")
 
@@ -6,25 +6,27 @@
     "gpt-4.1-nano",
     "gpt-4.1-mini",
     "gpt-4.1",
+    "gpt-5"
 }
 
 SUPPORTED_GEMINI = {
     "gemini-2.5-flash",
     "gemini-2.5-flash-lite",
-    "gemini-2.5-pro-thinking",
+    "gemini-2.5-pro",
 }
 
-SUPPORTED_ANTHROPIC = {
-    "claude-3.5-sonnet",
-    "claude-4.0-sonnet",
+SUPPORTED_ANTHROPIC = {           # Full dated model IDs; TIER_TO_MODEL uses the short aliases
+    "claude-sonnet-4-20250514",           # Claude Sonnet 4 full ID (alias: claude-sonnet-4-0)
+    "claude-sonnet-4-5-20250929",       # Claude Sonnet 4.5 full ID (alias: claude-sonnet-4-5)
+
 }
 
 
 # Simple fixed tier mapping (no overrides for now)
 TIER_TO_MODEL = {
     "low": "gpt-4.1",
-    "med": "claude-4.0-sonnet",
-    "high": "gemini-2.5-pro-thinking",
+    "med": "claude-sonnet-4-0",         # Claude Sonnet 4.0
+    "high": "claude-sonnet-4-5",        # Claude Sonnet 4.5
 }
 
 
@@ -62,13 +64,13 @@ def get_chat_model(*, role: str, model: Optional[str] = None, tier: Optional[str
     # Heuristic: choose provider based on model identifier prefix/allowlist
     if selected in SUPPORTED_OPENAI or selected.lower().startswith("gpt"):
         from langchain_openai import ChatOpenAI
-        return ChatOpenAI(model=selected, temperature=temperature, callbacks=cbs)
+        return ChatOpenAI(model=selected, temperature=temperature, callbacks=cbs, streaming=True)
 
     if selected in SUPPORTED_ANTHROPIC or selected.lower().startswith("claude"):
         from langchain_anthropic import ChatAnthropic
 
         api_key = _env("ANTHROPIC_API_KEY")
-        kwargs = {"model": selected, "temperature": temperature, "callbacks": cbs}
+        kwargs = {"model": selected, "temperature": temperature, "callbacks": cbs, "streaming": True}
         if api_key:
             kwargs["anthropic_api_key"] = api_key
         return ChatAnthropic(**kwargs)
@@ -85,7 +87,7 @@ def get_chat_model(*, role: str, model: Optional[str] = None, tier: Optional[str
     # Allow explicit API key passthrough via env, but ChatGoogleGenerativeAI also reads GOOGLE_API_KEY
     api_key = _env("GOOGLE_API_KEY") or _env("GEMINI_API_KEY")
     if api_key:
-        return ChatGoogleGenerativeAI(model=selected, temperature=temperature, callbacks=cbs, api_key=api_key)
-    return ChatGoogleGenerativeAI(model=selected, temperature=temperature, callbacks=cbs)
+        return ChatGoogleGenerativeAI(model=selected, temperature=temperature, callbacks=cbs, api_key=api_key, streaming=True)
+    return ChatGoogleGenerativeAI(model=selected, temperature=temperature, callbacks=cbs, streaming=True)