diff --git a/README.md b/README.md index af841e7..bf39713 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,28 @@ Real-world agents you can clone and run. Most recent first. ### 2026 -#### [Deep Research Agent — LangGraph + BMasterAI Telemetry](examples/deep-research-agent/) `NEW` +#### [Claude Web + Computer Agent — Native Tool-Use Loop](examples/claude-web-computer-agent/) `NEW` +*March 2026* + +A bare-metal Claude tool-use agent combining **web search** (Tavily) and **computer use** (screenshot/click/type/key/scroll) — no LangGraph, no framework, just the Anthropic SDK — fully instrumented with BMasterAI logging and telemetry. The foundational pattern that every Claude agent is built on. + +**Stack:** Claude (Anthropic), Tavily, xdotool + scrot, BMasterAI + +**What it demonstrates:** +- The raw Anthropic `tool_use` / `tool_result` message cycle — the core loop behind every Claude agent +- Multimodal tool results: screenshots sent back to Claude as image blocks so it can see the screen +- BMasterAI telemetry on every LLM call, tool dispatch, decision point, and error path +- Structured JSONL telemetry at `logs/agent.jsonl` — pipe to any analytics tool + +```bash +pip install -r requirements.txt +cp .env.example .env # add ANTHROPIC_API_KEY + TAVILY_API_KEY +python main.py "Search for today's top AI news, open a browser to the first result, take a screenshot, and summarize what you see." +``` + +--- + +#### [Deep Research Agent — LangGraph + BMasterAI Telemetry](examples/deep-research-agent/) *March 2026* A multi-step web research agent built with **LangGraph** and fully instrumented with **BMasterAI** logging and telemetry. Inspired by [langchain-ai/deepagents](https://github.com/langchain-ai/deepagents). Give it any research question and it plans, searches, analyzes, reflects on quality, and synthesizes a structured report — automatically looping back for more research if gaps are found. 
diff --git a/examples/claude-web-computer-agent/.env.example b/examples/claude-web-computer-agent/.env.example new file mode 100644 index 0000000..fa24212 --- /dev/null +++ b/examples/claude-web-computer-agent/.env.example @@ -0,0 +1,5 @@ +# Required +ANTHROPIC_API_KEY=your_anthropic_api_key_here + +# Optional — enables web_search tool (generous free tier at tavily.com) +TAVILY_API_KEY=your_tavily_api_key_here diff --git a/examples/claude-web-computer-agent/.gitignore b/examples/claude-web-computer-agent/.gitignore new file mode 100644 index 0000000..70fd5be --- /dev/null +++ b/examples/claude-web-computer-agent/.gitignore @@ -0,0 +1,6 @@ +.env +logs/ +__pycache__/ +*.pyc +*.pyo +.DS_Store diff --git a/examples/claude-web-computer-agent/README.md b/examples/claude-web-computer-agent/README.md new file mode 100644 index 0000000..ad22d34 --- /dev/null +++ b/examples/claude-web-computer-agent/README.md @@ -0,0 +1,217 @@ +# claude-web-computer-agent + +A bare-metal Claude tool-use agent combining **web search** and **computer use** — no LangGraph, no framework, just the Anthropic SDK — fully instrumented with **BMasterAI** logging and telemetry. + +This is the foundational pattern that every Claude agent is built on. Study this before moving to the LangGraph examples. + +--- + +## What It Demonstrates + +- The raw Anthropic `tool_use` / `tool_result` message cycle (the core loop behind every Claude agent) +- How to register two complementary tool types — network I/O (web search) and system I/O (computer use) +- How to send screenshot images back to Claude as multimodal `tool_result` content +- BMasterAI telemetry on every step of a bare SDK agent — not just framework agents + +--- + +## Architecture + +``` +user prompt + ↓ +claude_call (tools: web_search, computer_use, calculator) + ↓ +stop_reason == "tool_use"? 
+ ├── yes → dispatch tool(s) → append tool_result(s) → loop back + └── no → final text response → END +``` + +**Tools available:** + +| Tool | Description | Implementation | +|---|---|---| +| `web_search` | Tavily search — current information from the web | `tavily-python` | +| `computer_use` | Screenshot, click, type, key, scroll | `xdotool` + `scrot` | + +--- + +## BMasterAI Instrumentation + +| Event | BMasterAI call | +|---|---| +| Agent starts | `monitor.track_agent_start(AGENT_ID)` + `log_event(AGENT_START)` | +| Each Claude API call | `monitor.track_llm_call(...)` + `log_event(LLM_CALL)` ×2 (before + after) | +| Tool dispatched | `log_event(TOOL_USE)` | +| Tool result returned | `log_event(TASK_COMPLETE)` or `log_event(TASK_ERROR)` | +| Loop decision | `log_event(DECISION_POINT, "continue" or "end_turn")` | +| Any exception | `monitor.track_error(...)` + `log_event(TASK_ERROR)` | +| Agent finishes | `monitor.track_agent_stop(AGENT_ID)` + `log_event(AGENT_STOP)` | +| Task timings | `monitor.track_task_duration(...)` per LLM call and per tool call | + +Telemetry output: + +``` +logs/agent.log — human-readable event log +logs/agent.jsonl — structured JSON (pipe to any analytics tool) +logs/reasoning/agent_reasoning.jsonl — decision points and reasoning chains +``` + +--- + +## Setup + +```bash +pip install -r requirements.txt + +# Linux only: install computer use dependencies +sudo apt-get install scrot xdotool +``` + +Copy `.env.example` to `.env` and fill in your keys: + +```bash +cp .env.example .env +``` + +Required: +- `ANTHROPIC_API_KEY` — [console.anthropic.com](https://console.anthropic.com) + +Optional (enables `web_search`): +- `TAVILY_API_KEY` — [tavily.com](https://tavily.com) (generous free tier) + +--- + +## Usage + +```bash +# Pass query as argument +python main.py "Search for the latest Anthropic model pricing and calculate + the cost of 1 million tokens at Sonnet rates." 
+ +# Or run interactively +python main.py +``` + +### Example queries + +```bash +# Web search +python main.py "What are the key differences between Claude Opus 4.6 and Sonnet 4.6?" + +# Computer use — screenshot + describe +python main.py "Take a screenshot of my current screen and describe what applications are open." + +# Combined workflow — the core use case for this example +python main.py "Search for today's top AI news, open a browser to the first result, + take a screenshot, and summarize what you see." +``` + +--- + +## Example Output + +``` +════════════════════════════════════════════════════════════ +🤖 claude-web-computer-agent +──────────────────────────────────────────────────────────── +📝 Query: Search for Claude Opus 4.6 pricing and calculate cost for 1M tokens +════════════════════════════════════════════════════════════ + +🔄 Turn 1/20 + 🧠 claude-opus-4-6 | 892+87 tokens | 1243ms | stop=tool_use + 🔧 Tool: web_search({"query": "Claude Opus 4.6 pricing per token 2026"}) + ✅ web_search → {"query": "...", "results": [...], "result_count": 5} (412ms) + +🔄 Turn 2/20 + 🧠 claude-opus-4-6 | 2341+63 tokens | 987ms | stop=tool_use + 🔧 Tool: computer_use({"action": "screenshot"}) + ✅ computer_use → {"action": "screenshot", "success": true} (312ms) + +🔄 Turn 3/20 + 🧠 claude-opus-4-6 | 2589+312 tokens | 1821ms | stop=end_turn + +✅ Done in 3 turn(s) + +════════════════════════════════════════════════════════════ +📊 BMASTERAI TELEMETRY +──────────────────────────────────────────────────────────── + Agent status : STOPPED + Total errors : 0 + + Task timings: + llm_call_turn_1 avg=1243ms calls=1 + llm_call_turn_2 avg=987ms calls=1 + llm_call_turn_3 avg=1821ms calls=1 + tool_web_search avg=412ms calls=1 + tool_computer_use avg=312ms calls=1 + + Telemetry logs: + logs/agent.log — human-readable + logs/agent.jsonl — structured JSON + logs/reasoning/ — decision points & reasoning +════════════════════════════════════════════════════════════ + 
+════════════════════════════════════════════════════════════ +🗒️ FINAL RESPONSE +──────────────────────────────────────────────────────────── +Based on current pricing, 1 million input tokens with Claude Opus 4.6 would cost $15.00... +════════════════════════════════════════════════════════════ +``` + +--- + +## Files + +| File | Purpose | +|---|---| +| `tools.py` | Tool JSON schemas + dispatch functions for all three tools | +| `agent.py` | `WebComputerAgent` class — the tool-use loop with full BMasterAI instrumentation | +| `main.py` | CLI entry point with env checks and interactive fallback | +| `requirements.txt` | Python dependencies | +| `.env.example` | Environment variable template | + +--- + +## Analyse the Telemetry + +```bash +# Show all LLM calls with token counts +cat logs/agent.jsonl | python3 -c " +import sys, json +for line in sys.stdin: + e = json.loads(line) + if e.get('event_type') == 'llm_call': + meta = e.get('metadata', {}) + if 'input_tokens' in meta: + print(f\"{e['timestamp'][:19]} tokens={meta.get('input_tokens',0)}+{meta.get('output_tokens',0)} latency={meta.get('latency_ms',0):.0f}ms\") +" + +# Show all tool calls +cat logs/agent.jsonl | python3 -c " +import sys, json +for line in sys.stdin: + e = json.loads(line) + if e.get('event_type') == 'tool_use': + meta = e.get('metadata', {}) + print(f\"{e['timestamp'][:19]} tool={meta.get('tool_name')} input={str(meta.get('input',''))[:80]}\") +" + +# Show errors only +cat logs/agent.jsonl | python3 -c " +import sys, json +for line in sys.stdin: + e = json.loads(line) + if e.get('event_type') == 'task_error': + print(json.dumps(e, indent=2)) +" +``` + +--- + +## Stack + +- [Anthropic Python SDK](https://github.com/anthropic-ai/anthropic-sdk-python) +- [BMasterAI](https://github.com/travis-burmaster/bmasterai) +- [Tavily Python](https://github.com/tavily-ai/tavily-python) +- [xdotool](https://github.com/jordansissel/xdotool) + [scrot](https://github.com/dreamer/scrot) (Linux computer use) diff 
"""
agent.py — Core tool-use agent loop

Runs a Claude tool-use loop with two tools:
    - web_search   (Tavily)
    - computer_use (screenshot / click / type / key / scroll)

Every step is instrumented with BMasterAI logging and monitoring.

Architecture:
    user prompt
        ↓
    claude_call (tools: web_search, computer_use)
        ↓
    stop_reason == "tool_use"?
        ├── yes → dispatch tool(s) → append tool_result(s) → loop back
        └── no  → return final text response
"""

import os
import time
import json
from typing import Optional

import anthropic

from bmasterai.logging import configure_logging, get_logger, LogLevel, EventType
from bmasterai.monitoring import get_monitor

from tools import ALL_TOOL_SCHEMAS, dispatch_tool

# ─────────────────────────────────────────────────────────────────────────────
# Constants
# ─────────────────────────────────────────────────────────────────────────────

AGENT_ID = "claude-web-computer-agent"
DEFAULT_MODEL = "claude-opus-4-6"
MAX_TURNS = 20     # hard cap on tool-use iterations
MAX_TOKENS = 4096  # max tokens per Claude response


# ─────────────────────────────────────────────────────────────────────────────
# Setup BMasterAI
# ─────────────────────────────────────────────────────────────────────────────

def setup_logging():
    """Configure BMasterAI logging (console + file + JSONL + reasoning logs).

    Returns:
        (logger, monitor) tuple from BMasterAI.
    """
    configure_logging(
        log_file="agent.log",
        json_log_file="agent.jsonl",
        reasoning_log_file="agent_reasoning.jsonl",
        log_level=LogLevel.INFO,
        enable_console=True,
        enable_file=True,
        enable_json=True,
        enable_reasoning_logs=True,
    )
    return get_logger(), get_monitor()


# ─────────────────────────────────────────────────────────────────────────────
# Agent
# ─────────────────────────────────────────────────────────────────────────────

class WebComputerAgent:
    """
    A Claude tool-use agent combining web search and computer use,
    fully instrumented with BMasterAI telemetry.
    """

    def __init__(self, model: str = DEFAULT_MODEL, verbose: bool = True):
        """
        Args:
            model:   Claude model id to call.
            verbose: When True, prints a human-readable trace of every turn.

        Raises:
            KeyError: if ANTHROPIC_API_KEY is not set in the environment.
        """
        self.model = model
        self.verbose = verbose
        self.client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"])
        self.bm, self.monitor = setup_logging()
        self.monitor.start_monitoring()

    # ── Public entry point ────────────────────────────────────────────────────

    def run(self, user_message: str) -> str:
        """
        Run the agent on a user message.

        Returns the final text response from Claude. Returns "" when the loop
        terminated without producing any text (e.g. MAX_TURNS exhausted before
        the first response). Re-raises unexpected exceptions after logging them;
        telemetry shutdown always happens in the finally block.
        """
        self.monitor.track_agent_start(AGENT_ID)
        self.bm.log_event(
            agent_id=AGENT_ID,
            event_type=EventType.AGENT_START,
            message="Agent started",  # plain string: no placeholders needed
            metadata={"model": self.model, "query": user_message[:200]},
        )

        if self.verbose:
            print(f"\n{'═'*60}")
            print(f"🤖 {AGENT_ID}")
            print(f"{'─'*60}")
            print(f"📝 Query: {user_message}")
            print(f"{'═'*60}\n")

        messages = [{"role": "user", "content": user_message}]
        turn = 0
        final_response = ""
        response = None  # guards the while/else branch when MAX_TURNS <= 0

        try:
            while turn < MAX_TURNS:
                turn += 1
                if self.verbose:
                    print(f"🔄 Turn {turn}/{MAX_TURNS}")

                # ── Claude API call ───────────────────────────────────────────
                response, latency_ms, input_tokens, output_tokens = self._call_claude(messages, turn)

                messages.append({"role": "assistant", "content": response.content})

                # ── End turn: no more tools ───────────────────────────────────
                if response.stop_reason == "end_turn":
                    self.bm.log_event(
                        agent_id=AGENT_ID,
                        event_type=EventType.DECISION_POINT,
                        message="end_turn — agent finished",
                        # NOTE: these are the final turn's token counts, not a
                        # running total across turns.
                        metadata={"turn": turn, "total_input_tokens": input_tokens,
                                  "total_output_tokens": output_tokens},
                    )
                    final_response = self._extract_text(response)
                    if self.verbose:
                        print(f"\n✅ Done in {turn} turn(s)\n")
                    break

                # ── Tool use: dispatch and loop back ──────────────────────────
                if response.stop_reason == "tool_use":
                    tool_results = self._dispatch_tools(response, turn)
                    messages.append({"role": "user", "content": tool_results})

                    self.bm.log_event(
                        agent_id=AGENT_ID,
                        event_type=EventType.DECISION_POINT,
                        message=f"continue — {len(tool_results)} tool result(s) appended",
                        metadata={"turn": turn, "tool_count": len(tool_results)},
                    )
                    continue

                # ── Unexpected stop reason (e.g. max_tokens) ──────────────────
                self.bm.log_event(
                    agent_id=AGENT_ID,
                    event_type=EventType.TASK_ERROR,
                    message=f"Unexpected stop_reason: {response.stop_reason}",
                    level=LogLevel.WARNING,
                    metadata={"stop_reason": response.stop_reason, "turn": turn},
                )
                final_response = self._extract_text(response)
                break

            else:
                # Hit MAX_TURNS without an end_turn
                self.bm.log_event(
                    agent_id=AGENT_ID,
                    event_type=EventType.TASK_ERROR,
                    message=f"Reached MAX_TURNS ({MAX_TURNS}) without end_turn",
                    level=LogLevel.WARNING,
                    metadata={"max_turns": MAX_TURNS},
                )
                final_response = self._extract_text(response) if response is not None else ""

        except Exception as e:
            import traceback
            self.bm.log_event(
                agent_id=AGENT_ID,
                event_type=EventType.TASK_ERROR,
                message=f"Agent error: {type(e).__name__}: {e}",
                level=LogLevel.ERROR,
                metadata={
                    "error_type": type(e).__name__,
                    "error": str(e),
                    "turn": turn,
                    "message_count": len(messages),
                    "traceback": traceback.format_exc(limit=5),
                },
            )
            self.monitor.track_error(AGENT_ID, type(e).__name__)
            raise

        finally:
            # Telemetry teardown runs on success, error, and re-raise paths alike.
            self.monitor.track_agent_stop(AGENT_ID)
            self.bm.log_event(
                agent_id=AGENT_ID,
                event_type=EventType.AGENT_STOP,
                message="Agent stopped",
                metadata={"turns_used": turn},
            )
            if self.verbose:
                self._print_dashboard()

        return final_response

    # ── Internal: Claude API call ─────────────────────────────────────────────

    def _call_claude(self, messages: list, turn: int):
        """Call Claude and record telemetry.

        Returns:
            (response, latency_ms, input_tokens, output_tokens).
        """
        self.bm.log_event(
            agent_id=AGENT_ID,
            event_type=EventType.LLM_CALL,
            message=f"Calling {self.model} (turn {turn})",
            metadata={"model": self.model, "turn": turn, "message_count": len(messages)},
        )

        t0 = time.time()
        response = self.client.messages.create(
            model=self.model,
            max_tokens=MAX_TOKENS,
            tools=ALL_TOOL_SCHEMAS,
            messages=messages,
        )
        latency_ms = (time.time() - t0) * 1000

        input_tokens = response.usage.input_tokens
        output_tokens = response.usage.output_tokens
        total_tokens = input_tokens + output_tokens

        self.monitor.track_llm_call(
            agent_id=AGENT_ID,
            model=self.model,
            tokens_used=total_tokens,
            duration_ms=latency_ms,
        )
        self.monitor.track_task_duration(AGENT_ID, f"llm_call_turn_{turn}", latency_ms)

        self.bm.log_event(
            agent_id=AGENT_ID,
            event_type=EventType.LLM_CALL,
            message=f"Claude responded — stop_reason={response.stop_reason}",
            metadata={
                "turn": turn,
                "stop_reason": response.stop_reason,
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "latency_ms": round(latency_ms, 1),
            },
        )

        if self.verbose:
            print(f"   🧠 {self.model} | {input_tokens}+{output_tokens} tokens | "
                  f"{latency_ms:.0f}ms | stop={response.stop_reason}")

        return response, latency_ms, input_tokens, output_tokens

    # ── Internal: Tool dispatch ───────────────────────────────────────────────

    def _dispatch_tools(self, response, turn: int) -> list:
        """Dispatch all tool_use blocks, log each one, return tool_result list."""
        tool_results = []

        for block in response.content:
            if block.type != "tool_use":
                continue

            tool_name = block.name
            tool_input = block.input
            tool_use_id = block.id

            self.bm.log_event(
                agent_id=AGENT_ID,
                event_type=EventType.TOOL_USE,
                message=f"Dispatching tool: {tool_name}",
                metadata={
                    "tool_name": tool_name,
                    "tool_use_id": tool_use_id,
                    # Truncate values so huge inputs can't bloat the JSONL log.
                    "input": {k: str(v)[:200] for k, v in tool_input.items()},
                    "turn": turn,
                },
            )

            if self.verbose:
                print(f"   🔧 Tool: {tool_name}({json.dumps(tool_input)[:120]})")

            # Execute and time the tool call
            t0 = time.time()
            result = dispatch_tool(tool_name, tool_input)
            duration_ms = (time.time() - t0) * 1000

            self.monitor.track_task_duration(AGENT_ID, f"tool_{tool_name}", duration_ms)

            # A tool failure is signalled by an "error" key in the result dict.
            is_error = "error" in result
            event_type = EventType.TASK_ERROR if is_error else EventType.TASK_COMPLETE
            log_level = LogLevel.WARNING if is_error else LogLevel.INFO

            self.bm.log_event(
                agent_id=AGENT_ID,
                event_type=event_type,
                message=f"Tool {tool_name} {'failed' if is_error else 'succeeded'}",
                level=log_level,
                metadata={
                    "tool_name": tool_name,
                    "tool_use_id": tool_use_id,
                    "duration_ms": round(duration_ms, 1),
                    "result_preview": str(result)[:300],
                    "success": not is_error,
                },
            )

            if self.verbose:
                status = "❌" if is_error else "✅"
                result_preview = result.get("error", str(result))[:100]
                print(f"   {status} {tool_name} → {result_preview} ({duration_ms:.0f}ms)")

            # Format result for Claude.
            # Screenshots get special treatment — sent back as image blocks.
            result_content = self._format_tool_result(tool_name, result)

            tool_results.append({
                "type": "tool_result",
                "tool_use_id": tool_use_id,
                "content": result_content,
                "is_error": is_error,
            })

        return tool_results

    # ── Internal: helpers ─────────────────────────────────────────────────────

    def _format_tool_result(self, tool_name: str, result: dict) -> "str | list":
        """
        Format a tool result for inclusion in the messages array.

        Screenshots return a multimodal image-block list so Claude can see
        the screen; all other results return a JSON string with any large
        base64 payloads stripped out.
        """
        if tool_name == "computer_use" and result.get("action") == "screenshot" and result.get("success"):
            # Return image block so Claude can actually see the screen
            return [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": result["image_base64"],
                    },
                }
            ]
        # Strip large base64 blobs from non-screenshot results before serialising
        clean = {k: v for k, v in result.items() if k != "image_base64"}
        return json.dumps(clean, indent=2)

    def _extract_text(self, response) -> str:
        """Extract and join all text blocks from a Claude response."""
        parts = []
        for block in response.content:
            if hasattr(block, "text"):
                parts.append(block.text)
        return "\n".join(parts).strip()

    def _print_dashboard(self):
        """Print a telemetry summary from the BMasterAI monitor."""
        dash = self.monitor.get_agent_dashboard(AGENT_ID)
        health = self.monitor.get_system_health()

        print(f"\n{'═'*60}")
        print("📊 BMASTERAI TELEMETRY")
        print(f"{'─'*60}")
        print(f"   Agent status : {dash.get('status', 'unknown').upper()}")
        # .get() at both levels: a partially-populated dashboard must not crash
        print(f"   Total errors : {dash.get('metrics', {}).get('total_errors', 0)}")

        perf = dash.get("performance", {})
        if perf:
            print(f"\n   Task timings:")
            for task, stats in sorted(perf.items()):
                print(f"     {task:<35} avg={stats['avg_duration_ms']:.0f}ms "
                      f"calls={stats['total_calls']}")

        sys_m = dash.get("system", {})
        cpu = sys_m.get("cpu_usage", {})
        mem = sys_m.get("memory_usage", {})
        if cpu or mem:
            print(f"\n   System:")
            if cpu:
                print(f"     CPU    : {cpu.get('latest', 0):.1f}%")
            if mem:
                print(f"     Memory : {mem.get('latest', 0):.1f}%")

        print(f"\n   Telemetry logs:")
        print(f"     logs/agent.log     — human-readable")
        print(f"     logs/agent.jsonl   — structured JSON")
        print(f"     logs/reasoning/    — decision points & reasoning")
        print(f"{'═'*60}\n")
"""
main.py — CLI entry point for claude-web-computer-agent

Usage:
    python main.py "Search for the latest Claude API pricing and calculate the cost
                    of 1000 calls at 10k tokens each."

    python main.py          # interactive prompt
"""

import sys
import os


def check_env():
    """Warn about missing environment variables before starting.

    Exits with status 1 if the required ANTHROPIC_API_KEY is absent;
    a missing TAVILY_API_KEY only produces a warning (web_search is optional).
    """
    missing = []
    if not os.getenv("ANTHROPIC_API_KEY"):
        missing.append("ANTHROPIC_API_KEY")
    if not os.getenv("TAVILY_API_KEY"):
        missing.append("TAVILY_API_KEY (optional — web_search tool will be disabled)")
    if missing:
        for m in missing:
            print(f" ⚠️ {m} not set")
        # Exact list membership (the original used a fragile substring test
        # on str(missing), which could false-positive on similar names).
        if "ANTHROPIC_API_KEY" in missing:
            print("\nSet ANTHROPIC_API_KEY to continue. Exiting.")
            sys.exit(1)
        print()


def main():
    """Parse the query (CLI args or interactive prompt) and run the agent."""
    print("\n" + "═" * 60)
    print("  claude-web-computer-agent")
    # tools.py defines exactly two tools — no calculator exists
    print("  web search + computer use + BMasterAI")
    print("═" * 60)

    # Load .env if present (optional dependency)
    try:
        from dotenv import load_dotenv
        load_dotenv()
    except ImportError:
        pass

    check_env()

    # Lazy import after env check so a missing key fails fast with a clear message
    from agent import WebComputerAgent

    # Get query from CLI args or interactive prompt
    if len(sys.argv) > 1:
        query = " ".join(sys.argv[1:])
    else:
        print("\nEnter your query (or Ctrl+C to quit):")
        print("Examples:")
        print('  "Search for the latest LangChain release notes"')
        print('  "Take a screenshot and describe what you see"')
        print('  "What is 2^32 and how many GB is that?"')
        print()
        try:
            query = input("Query: ").strip()
        except (KeyboardInterrupt, EOFError):
            print("\nBye!")
            sys.exit(0)

    if not query:
        print("No query provided. Exiting.")
        sys.exit(1)

    # Run the agent and print its final answer
    agent = WebComputerAgent(model="claude-opus-4-6", verbose=True)
    response = agent.run(query)

    print("\n" + "═" * 60)
    print("🗒️  FINAL RESPONSE")
    print("─" * 60)
    print(response)
    print("═" * 60 + "\n")


if __name__ == "__main__":
    main()
# ── Imports (repeated here so this section is self-contained; re-import is a no-op) ──
import os
import base64
import subprocess
from typing import Any, Dict, List, Optional

# Optional import — web_search degrades gracefully when tavily-python is absent
try:
    from tavily import TavilyClient
    _tavily_available = True
except ImportError:
    _tavily_available = False


# ─────────────────────────────────────────────────────────────────────────────
# Tool JSON schemas (passed to Claude in the `tools` array)
# ─────────────────────────────────────────────────────────────────────────────

WEB_SEARCH_SCHEMA = {
    "name": "web_search",
    "description": (
        "Search the web for current information using Tavily. "
        "Use this when you need up-to-date facts, news, documentation, or any "
        "information that may not be in your training data."
    ),
    "input_schema": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "The search query to look up.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return (1–10). Defaults to 5.",
                "default": 5,
            },
        },
        "required": ["query"],
    },
}

COMPUTER_USE_SCHEMA = {
    "name": "computer_use",
    "description": (
        "Take a screenshot of the current screen, click on UI elements, "
        "type text, or press keys. Use this to interact with desktop "
        "applications, browsers, or observe the current screen state."
    ),
    "input_schema": {
        "type": "object",
        "properties": {
            "action": {
                "type": "string",
                "enum": ["screenshot", "click", "type", "key", "scroll"],
                "description": (
                    "Action to perform:\n"
                    "  screenshot — capture current screen\n"
                    "  click      — click at (x, y)\n"
                    "  type       — type a string of text\n"
                    "  key        — press a keyboard shortcut (e.g. 'ctrl+c')\n"
                    "  scroll     — scroll at (x, y) by delta"
                ),
            },
            "x": {
                "type": "integer",
                "description": "X coordinate (required for click/scroll).",
            },
            "y": {
                "type": "integer",
                "description": "Y coordinate (required for click/scroll).",
            },
            "text": {
                "type": "string",
                "description": "Text to type (required for type action).",
            },
            "key": {
                "type": "string",
                "description": "Key or key combo to press (required for key action). E.g. 'Return', 'ctrl+c'.",
            },
            "delta_y": {
                "type": "integer",
                "description": "Scroll delta in pixels, positive = down (required for scroll).",
                "default": 300,
            },
        },
        "required": ["action"],
    },
}

# All schemas to pass to Claude
ALL_TOOL_SCHEMAS: List[Dict] = [
    WEB_SEARCH_SCHEMA,
    COMPUTER_USE_SCHEMA,
]


# ─────────────────────────────────────────────────────────────────────────────
# Tool implementations
# ─────────────────────────────────────────────────────────────────────────────

def _run_web_search(query: str, max_results: int = 5) -> Dict[str, Any]:
    """Execute a Tavily web search.

    Returns a dict with "query"/"results"/"result_count" on success, or a
    dict with a single "error" key on any failure (missing dependency or key).
    """
    if not _tavily_available:
        return {"error": "tavily-python not installed. Run: pip install tavily-python"}

    api_key = os.getenv("TAVILY_API_KEY")
    if not api_key:
        return {"error": "TAVILY_API_KEY not set in environment."}

    # Clamp to the 1–10 range advertised in the schema description — the model
    # occasionally sends out-of-range values.
    max_results = max(1, min(10, max_results))

    client = TavilyClient(api_key=api_key)
    response = client.search(query=query, max_results=max_results)

    results = []
    for r in response.get("results", []):
        results.append({
            "title": r.get("title", ""),
            "url": r.get("url", ""),
            "snippet": r.get("content", "")[:500],  # truncate for token efficiency
            "score": r.get("score", 0),
        })

    return {
        "query": query,
        "results": results,
        "result_count": len(results),
    }


_SUBPROCESS_TIMEOUT = 10  # seconds — prevents hanging processes


def _run_computer_use(action: str, x: Optional[int] = None, y: Optional[int] = None,
                      text: Optional[str] = None, key: Optional[str] = None,
                      delta_y: int = 300) -> Dict[str, Any]:
    """
    Execute a computer use action using xdotool + scrot (Linux).

    Returns a result dict; screenshots carry base64-encoded PNG data under
    "image_base64". All failure modes (timeout, missing binary, non-zero
    exit) are returned as error dicts rather than raised.
    """
    try:
        if action == "screenshot":
            result = subprocess.run(
                ["scrot", "-", "--quality", "80"],
                capture_output=True, check=True,
                timeout=_SUBPROCESS_TIMEOUT,  # prevent hang if display is unresponsive
            )
            img_b64 = base64.b64encode(result.stdout).decode("utf-8")
            return {
                "action": "screenshot",
                "success": True,
                "image_base64": img_b64,
                "format": "png",
                "note": "Screenshot captured. Describe what you see.",
            }

        elif action == "click":
            if x is None or y is None:
                return {"error": "click requires x and y coordinates."}
            subprocess.run(
                ["xdotool", "mousemove", str(x), str(y), "click", "1"],
                check=True, timeout=_SUBPROCESS_TIMEOUT,
            )
            return {"action": "click", "success": True, "x": x, "y": y}

        elif action == "type":
            if not text:
                return {"error": "type requires text parameter."}
            # "--" signals end of options, preventing text from being parsed as flags
            subprocess.run(
                ["xdotool", "type", "--clearmodifiers", "--", text],
                check=True, timeout=_SUBPROCESS_TIMEOUT,
            )
            return {"action": "type", "success": True, "typed": text}

        elif action == "key":
            if not key:
                return {"error": "key requires key parameter."}
            # "--" prevents key names from being treated as options
            subprocess.run(
                ["xdotool", "key", "--", key],
                check=True, timeout=_SUBPROCESS_TIMEOUT,
            )
            return {"action": "key", "success": True, "key": key}

        elif action == "scroll":
            if x is None or y is None:
                return {"error": "scroll requires x and y coordinates."}
            direction = "5" if delta_y > 0 else "4"  # X11 button 5 = scroll down, 4 = up
            steps = max(1, abs(delta_y) // 100)
            for _ in range(steps):
                subprocess.run(
                    ["xdotool", "mousemove", str(x), str(y), "click", direction],
                    check=True, timeout=_SUBPROCESS_TIMEOUT,
                )
            return {"action": "scroll", "success": True, "x": x, "y": y, "delta_y": delta_y}

        else:
            return {"error": f"Unknown action: {action}"}

    except subprocess.TimeoutExpired:
        return {"error": f"Timed out after {_SUBPROCESS_TIMEOUT}s", "action": action, "success": False}
    except FileNotFoundError as e:
        # The missing binary's name appears in the exception message
        tool = "scrot" if "scrot" in str(e) else "xdotool"
        return {
            "error": f"{tool} not found. Install with: sudo apt-get install {tool}",
            "action": action,
            "success": False,
        }
    except subprocess.CalledProcessError as e:
        return {"error": str(e), "action": action, "success": False}


# ─────────────────────────────────────────────────────────────────────────────
# Dispatch router
# ─────────────────────────────────────────────────────────────────────────────

def dispatch_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]:
    """Route a tool_use block to the correct implementation.

    Always returns a dict; a malformed tool_input (missing required keys)
    yields an error dict instead of raising KeyError, so the agent loop can
    feed the failure back to the model as a tool_result.
    """
    if tool_name == "web_search":
        if "query" not in tool_input:
            return {"error": "web_search requires a 'query' parameter."}
        return _run_web_search(
            query=tool_input["query"],
            max_results=tool_input.get("max_results", 5),
        )
    elif tool_name == "computer_use":
        if "action" not in tool_input:
            return {"error": "computer_use requires an 'action' parameter."}
        return _run_computer_use(
            action=tool_input["action"],
            x=tool_input.get("x"),
            y=tool_input.get("y"),
            text=tool_input.get("text"),
            key=tool_input.get("key"),
            delta_y=tool_input.get("delta_y", 300),
        )
    else:
        return {"error": f"Unknown tool: {tool_name}"}