diff --git a/README.md b/README.md index bf39713..3b87585 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,28 @@ Real-world agents you can clone and run. Most recent first. ### 2026 -#### [Claude Web + Computer Agent — Native Tool-Use Loop](examples/claude-web-computer-agent/) `NEW` +#### [Gemini Web + Computer Agent — Native Function-Calling Loop](examples/gemini-web-computer-agent/) `NEW` +*March 2026* + +A bare-metal Gemini function-calling agent combining **web search** (Tavily) and **computer use** (screenshot/click/type/key/scroll) — no LangGraph, no framework, just the Google GenAI SDK — fully instrumented with BMasterAI logging and telemetry. Cross-platform: works on Linux (xdotool + scrot) and macOS (cliclick + screencapture). + +**Stack:** Gemini (Google GenAI SDK), Tavily, xdotool/cliclick, BMasterAI + +**What it demonstrates:** +- The raw Gemini `function_call` / `function_response` message cycle — the core loop behind every Gemini agent +- Multimodal tool results: screenshots sent back to Gemini as image parts so it can see the screen +- BMasterAI telemetry on every LLM call, tool dispatch, decision point, and error path +- Structured JSONL telemetry at `logs/agent.jsonl` — pipe to any analytics tool + +```bash +pip install -r requirements.txt +cp .env.example .env # add GEMINI_API_KEY + TAVILY_API_KEY +python main.py "Search for today's top AI news, open a browser to the first result, take a screenshot, and summarize what you see." +``` + +--- + +#### [Claude Web + Computer Agent — Native Tool-Use Loop](examples/claude-web-computer-agent/) *March 2026* A bare-metal Claude tool-use agent combining **web search** (Tavily) and **computer use** (screenshot/click/type/key/scroll) — no LangGraph, no framework, just the Anthropic SDK — fully instrumented with BMasterAI logging and telemetry. The foundational pattern that every Claude agent is built on. 
diff --git a/examples/gemini-web-computer-agent/.env.example b/examples/gemini-web-computer-agent/.env.example new file mode 100644 index 0000000..bfaa4d6 --- /dev/null +++ b/examples/gemini-web-computer-agent/.env.example @@ -0,0 +1,5 @@ +# Required +GEMINI_API_KEY=your_gemini_api_key_here + +# Optional — enables web_search tool (generous free tier at tavily.com) +TAVILY_API_KEY=your_tavily_api_key_here diff --git a/examples/gemini-web-computer-agent/.gitignore b/examples/gemini-web-computer-agent/.gitignore new file mode 100644 index 0000000..48d08ae --- /dev/null +++ b/examples/gemini-web-computer-agent/.gitignore @@ -0,0 +1,32 @@ +# Environment variables +.env + +# Logs and telemetry +logs/ +*.log +*.jsonl + +# Python cache and compiled files +__pycache__/ +*.py[cod] +*$py.class + +# Virtual environments +.venv/ +venv/ +env/ +ENV/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# macOS +.DS_Store + +# IDEs and Editors +.vscode/ +.idea/ +*.swp +*.swo diff --git a/examples/gemini-web-computer-agent/README.md b/examples/gemini-web-computer-agent/README.md new file mode 100644 index 0000000..3b0b5c2 --- /dev/null +++ b/examples/gemini-web-computer-agent/README.md @@ -0,0 +1,218 @@ +# gemini-web-computer-agent + +A bare-metal Gemini tool-use agent combining **web search** and **computer use** — no LangGraph, no framework, just the Google GenAI SDK — fully instrumented with **BMasterAI** logging and telemetry. + +This is the foundational pattern that every Gemini agent is built on. Study this before moving to the LangGraph examples. 
+ +--- + +## What It Demonstrates + +- The raw Gemini `function_call` / tool response message cycle (the core loop behind every Gemini agent) +- How to register two complementary tool types — network I/O (web search) and system I/O (computer use) +- How to send screenshot images back to Gemini as multimodal function response content +- BMasterAI telemetry on every step of a bare SDK agent — not just framework agents + +--- + +## Architecture + +``` +user prompt + ↓ +gemini_call (tools: web_search, computer_use) + ↓ +stop_reason == "tool_use"? + ├── yes → dispatch tool(s) → append tool_result(s) → loop back + └── no → final text response → END +``` + +**Tools available:** + +| Tool | Description | Implementation | +|---|---|---| +| `web_search` | Tavily search — current information from the web | `tavily-python` | +| `computer_use` | Screenshot, click, type, key, scroll | `xdotool` + `scrot` | + +--- + +## BMasterAI Instrumentation + +| Event | BMasterAI call | +|---|---| +| Agent starts | `monitor.track_agent_start(AGENT_ID)` + `log_event(AGENT_START)` | +| Each Gemini API call | `monitor.track_llm_call(...)` + `log_event(LLM_CALL)` ×2 (before + after) | +| Tool dispatched | `log_event(TOOL_USE)` | +| Tool result returned | `log_event(TASK_COMPLETE)` or `log_event(TASK_ERROR)` | +| Loop decision | `log_event(DECISION_POINT, "continue" or "end_turn")` | +| Any exception | `monitor.track_error(...)` + `log_event(TASK_ERROR)` | +| Agent finishes | `monitor.track_agent_stop(AGENT_ID)` + `log_event(AGENT_STOP)` | +| Task timings | `monitor.track_task_duration(...)` per LLM call and per tool call | + +Telemetry output: + +``` +logs/agent.log — human-readable event log +logs/agent.jsonl — structured JSON (pipe to any analytics tool) +logs/reasoning/agent_reasoning.jsonl — decision points and reasoning chains +``` + +--- + +## Setup + +```bash +source .venv/bin/activate +pip install -r requirements.txt + +# Linux only: install computer use dependencies +sudo apt-get 
install scrot xdotool +``` + +Copy `.env.example` to `.env` and fill in your keys: + +```bash +cp .env.example .env +``` + +Required: +- `GEMINI_API_KEY` — [aistudio.google.com](https://aistudio.google.com) + +Optional (enables `web_search`): +- `TAVILY_API_KEY` — [tavily.com](https://tavily.com) (generous free tier) + +--- + +## Usage + +```bash +# Pass query as argument +python main.py "Search for the latest Google Gemini model pricing and calculate + the cost of 1 million tokens at Flash rates." + +# Or run interactively +python main.py +``` + +### Example queries + +```bash +# Web search +python main.py "What are the key differences between Gemini 1.5 Pro and Gemini 1.5 Flash?" + +# Computer use — screenshot + describe +python main.py "Take a screenshot of my current screen and describe what applications are open." + +# Combined workflow — the core use case for this example +python main.py "Search for today's top AI news, open a browser to the first result, + take a screenshot, and summarize what you see." 
+``` + +--- + +## Example Output + +``` +════════════════════════════════════════════════════════════ +🤖 gemini-web-computer-agent +──────────────────────────────────────────────────────────── +📝 Query: Search for Gemini 1.5 Pro pricing and calculate cost for 1M tokens +════════════════════════════════════════════════════════════ + +🔄 Turn 1/20 + 🧠 gemini-3-flash-preview | 892+87 tokens | 1243ms | stop=tool_use + 🔧 Tool: web_search({"query": "Gemini 1.5 Pro pricing per token 2026"}) + ✅ web_search → {"query": "...", "results": [...], "result_count": 5} (412ms) + +🔄 Turn 2/20 + 🧠 gemini-3-flash-preview | 2341+63 tokens | 987ms | stop=tool_use + 🔧 Tool: computer_use({"action": "screenshot"}) + ✅ computer_use → {"action": "screenshot", "success": true} (312ms) + +🔄 Turn 3/20 + 🧠 gemini-3-flash-preview | 2589+312 tokens | 1821ms | stop=end_turn + +✅ Done in 3 turn(s) + +════════════════════════════════════════════════════════════ +📊 BMASTERAI TELEMETRY +──────────────────────────────────────────────────────────── + Agent status : STOPPED + Total errors : 0 + + Task timings: + llm_call_turn_1 avg=1243ms calls=1 + llm_call_turn_2 avg=987ms calls=1 + llm_call_turn_3 avg=1821ms calls=1 + tool_web_search avg=412ms calls=1 + tool_computer_use avg=312ms calls=1 + + Telemetry logs: + logs/agent.log — human-readable + logs/agent.jsonl — structured JSON + logs/reasoning/ — decision points & reasoning +════════════════════════════════════════════════════════════ + +════════════════════════════════════════════════════════════ +🗒️ FINAL RESPONSE +──────────────────────────────────────────────────────────── +Based on current pricing, 1 million input tokens with Gemini 1.5 Pro would cost $1.25... 
+════════════════════════════════════════════════════════════ +``` + +--- + +## Files + +| File | Purpose | +|---|---| +| `tools.py` | Tool JSON schemas + dispatch functions for both tools | +| `agent.py` | `WebComputerAgent` class — the tool-use loop with full BMasterAI instrumentation | +| `main.py` | CLI entry point with env checks and interactive fallback | +| `requirements.txt` | Python dependencies | +| `.env.example` | Environment variable template | + +--- + +## Analyse the Telemetry + +```bash +# Show all LLM calls with token counts +cat logs/agent.jsonl | python3 -c " +import sys, json +for line in sys.stdin: + e = json.loads(line) + if e.get('event_type') == 'llm_call': + meta = e.get('metadata', {}) + if 'input_tokens' in meta: + print(f\"{e['timestamp'][:19]} tokens={meta.get('input_tokens',0)}+{meta.get('output_tokens',0)} latency={meta.get('latency_ms',0):.0f}ms\") +" + +# Show all tool calls +cat logs/agent.jsonl | python3 -c " +import sys, json +for line in sys.stdin: + e = json.loads(line) + if e.get('event_type') == 'tool_use': + meta = e.get('metadata', {}) + print(f\"{e['timestamp'][:19]} tool={meta.get('tool_name')} input={str(meta.get('input',''))[:80]}\") +" + +# Show errors only +cat logs/agent.jsonl | python3 -c " +import sys, json +for line in sys.stdin: + e = json.loads(line) + if e.get('event_type') == 'task_error': + print(json.dumps(e, indent=2)) +" +``` + +--- + +## Stack + +- [Google GenAI Python SDK](https://github.com/googleapis/python-genai) +- [BMasterAI](https://github.com/travis-burmaster/bmasterai) +- [Tavily Python](https://github.com/tavily-ai/tavily-python) +- [xdotool](https://github.com/jordansissel/xdotool) + [scrot](https://github.com/dreamer/scrot) (Linux computer use) diff --git a/examples/gemini-web-computer-agent/agent.py b/examples/gemini-web-computer-agent/agent.py new file mode 100644 index 0000000..ec4d6c5 --- /dev/null +++ b/examples/gemini-web-computer-agent/agent.py @@ -0,0 +1,391 @@ +""" +agent.py — Core
tool-use agent loop + +Runs a Gemini function-calling loop with two tools: + - web_search (Tavily) + - computer_use (screenshot / click / type / key / scroll) + +Every step is instrumented with BMasterAI logging and monitoring. + +Architecture: + user prompt + ↓ + gemini_call (tools: web_search, computer_use) + ↓ + function_calls present? + ├── yes → dispatch tool(s) → append function_response(s) → loop back + └── no → return final text response +""" + +import os +import time +import json +import base64 +from typing import Optional + +from google import genai +from google.genai import types + +from bmasterai.logging import configure_logging, get_logger, LogLevel, EventType +from bmasterai.monitoring import get_monitor + +from tools import ALL_TOOL_SCHEMAS, dispatch_tool + +# ───────────────────────────────────────────────────────────────────────────── +# Constants +# ───────────────────────────────────────────────────────────────────────────── + +AGENT_ID = "gemini-web-computer-agent" +DEFAULT_MODEL = "gemini-3-flash-preview" +MAX_TURNS = 20 # hard cap on tool-use iterations +MAX_TOKENS = 4096 + + +# ───────────────────────────────────────────────────────────────────────────── +# Setup BMasterAI +# ───────────────────────────────────────────────────────────────────────────── + +def setup_logging(): + configure_logging( + log_file="agent.log", + json_log_file="agent.jsonl", + reasoning_log_file="agent_reasoning.jsonl", + log_level=LogLevel.INFO, + enable_console=True, + enable_file=True, + enable_json=True, + enable_reasoning_logs=True, + ) + return get_logger(), get_monitor() + + +# ───────────────────────────────────────────────────────────────────────────── +# Agent +# ───────────────────────────────────────────────────────────────────────────── + +class WebComputerAgent: + """ + A Gemini function-calling agent combining web search and computer use, + fully instrumented with BMasterAI telemetry. 
+ """ + + def __init__(self, model: str = DEFAULT_MODEL, verbose: bool = True): + self.model = model + self.verbose = verbose + api_key = os.environ.get("GEMINI_API_KEY") + self.client = genai.Client(api_key=api_key) + self.bm, self.monitor = setup_logging() + self.monitor.start_monitoring() + + # ── Public entry point ──────────────────────────────────────────────────── + + def run(self, user_message: str) -> str: + """ + Run the agent on a user message. + Returns the final text response from Gemini. + """ + self.monitor.track_agent_start(AGENT_ID) + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.AGENT_START, + message=f"Agent started", + metadata={"model": self.model, "query": user_message[:200]}, + ) + + if self.verbose: + print(f"\n{'═'*60}") + print(f"🤖 {AGENT_ID}") + print(f"{'─'*60}") + print(f"📝 Query: {user_message}") + print(f"{'═'*60}\n") + + messages = [ + types.Content(role="user", parts=[ + types.Part.from_text(text=user_message) + ]) + ] + turn = 0 + final_response = "" + + try: + while turn < MAX_TURNS: + turn += 1 + if self.verbose: + print(f"🔄 Turn {turn}/{MAX_TURNS}") + + # ── Gemini API call ─────────────────────────────────────────── + response, latency_ms, input_tokens, output_tokens = self._call_gemini(messages, turn) + + if response.candidates and response.candidates[0].content: + messages.append(response.candidates[0].content) + + function_calls = response.function_calls + + # ── End turn: no more tools ─────────────────────────────────── + if not function_calls: + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.DECISION_POINT, + message="end_turn — agent finished", + metadata={"turn": turn, "total_input_tokens": input_tokens, + "total_output_tokens": output_tokens}, + ) + final_response = self._extract_text(response) + if self.verbose: + print(f"\n✅ Done in {turn} turn(s)\n") + break + + # ── Tool use ────────────────────────────────────────────────── + tool_results = self._dispatch_tools(function_calls, 
turn) + messages.append(types.Content(role="user", parts=tool_results)) + + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.DECISION_POINT, + message=f"continue — {len(tool_results)} tool result(s) appended", + metadata={"turn": turn, "tool_count": len(tool_results)}, + ) + continue + + else: + # Hit MAX_TURNS + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.TASK_ERROR, + message=f"Reached MAX_TURNS ({MAX_TURNS}) without end_turn", + level=LogLevel.WARNING, + metadata={"max_turns": MAX_TURNS}, + ) + final_response = self._extract_text(response) + + except Exception as e: + import traceback + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.TASK_ERROR, + message=f"Agent error: {type(e).__name__}: {e}", + level=LogLevel.ERROR, + metadata={ + "error_type": type(e).__name__, + "error": str(e), + "turn": turn, + "message_count": len(messages), + "traceback": traceback.format_exc(limit=5), + }, + ) + self.monitor.track_error(AGENT_ID, type(e).__name__) + raise + + finally: + self.monitor.track_agent_stop(AGENT_ID) + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.AGENT_STOP, + message="Agent stopped", + metadata={"turns_used": turn}, + ) + if self.verbose: + self._print_dashboard() + + return final_response + + # ── Internal: Gemini API call ───────────────────────────────────────────── + + def _call_gemini(self, messages: list, turn: int): + """Call Gemini and record telemetry. 
Returns (response, latency_ms, in_tokens, out_tokens).""" + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.LLM_CALL, + message=f"Calling {self.model} (turn {turn})", + metadata={"model": self.model, "turn": turn, "message_count": len(messages)}, + ) + + t0 = time.time() + + declarations = [] + for schema in ALL_TOOL_SCHEMAS: + declarations.append(types.FunctionDeclaration( + name=schema["name"], + description=schema["description"], + parameters=schema["input_schema"] + )) + gemini_tools = [types.Tool(function_declarations=declarations)] + + response = self.client.models.generate_content( + model=self.model, + contents=messages, + config=types.GenerateContentConfig( + tools=gemini_tools, + temperature=0.0, + ) + ) + latency_ms = (time.time() - t0) * 1000 + + input_tokens = response.usage_metadata.prompt_token_count if response.usage_metadata else 0 + output_tokens = response.usage_metadata.candidates_token_count if response.usage_metadata else 0 + total_tokens = response.usage_metadata.total_token_count if response.usage_metadata else 0 + + self.monitor.track_llm_call( + agent_id=AGENT_ID, + model=self.model, + tokens_used=total_tokens, + duration_ms=latency_ms, + ) + self.monitor.track_task_duration(AGENT_ID, f"llm_call_turn_{turn}", latency_ms) + + function_calls = response.function_calls + stop_reason = "tool_use" if function_calls else "end_turn" + + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.LLM_CALL, + message=f"Gemini responded — stop_reason={stop_reason}", + metadata={ + "turn": turn, + "stop_reason": stop_reason, + "input_tokens": input_tokens, + "output_tokens": output_tokens, + "latency_ms": round(latency_ms, 1), + }, + ) + + if self.verbose: + print(f" 🧠 {self.model} | {input_tokens}+{output_tokens} tokens | " + f"{latency_ms:.0f}ms | stop={stop_reason}") + + return response, latency_ms, input_tokens, output_tokens + + # ── Internal: Tool dispatch ─────────────────────────────────────────────── + + def 
_dispatch_tools(self, function_calls: list, turn: int) -> list: + """Dispatch all function calls, log each one, return list of Parts.""" + tool_results = [] + + for call in function_calls: + tool_name = call.name + tool_input = call.args + if not isinstance(tool_input, dict): + try: + tool_input = dict(tool_input) + except (ValueError, TypeError): + tool_input = {} + + # Gemini function calls may or may not have an id field in the SDK depending on the model. + tool_use_id = getattr(call, "id", "") + + self.bm.log_event( + agent_id=AGENT_ID, + event_type=EventType.TOOL_USE, + message=f"Dispatching tool: {tool_name}", + metadata={ + "tool_name": tool_name, + "tool_use_id": tool_use_id, + "input": {k: str(v)[:200] for k, v in tool_input.items()}, + "turn": turn, + }, + ) + + if self.verbose: + print(f" 🔧 Tool: {tool_name}({json.dumps(tool_input)[:120]})") + + # Execute + t0 = time.time() + result = dispatch_tool(tool_name, tool_input) + duration_ms = (time.time() - t0) * 1000 + + self.monitor.track_task_duration(AGENT_ID, f"tool_{tool_name}", duration_ms) + + # Determine success + is_error = "error" in result + event_type = EventType.TASK_ERROR if is_error else EventType.TASK_COMPLETE + log_level = LogLevel.WARNING if is_error else LogLevel.INFO + + self.bm.log_event( + agent_id=AGENT_ID, + event_type=event_type, + message=f"Tool {tool_name} {'failed' if is_error else 'succeeded'}", + level=log_level, + metadata={ + "tool_name": tool_name, + "tool_use_id": tool_use_id, + "duration_ms": round(duration_ms, 1), + "result_preview": str(result)[:300], + "success": not is_error, + }, + ) + + if self.verbose: + status = "❌" if is_error else "✅" + result_preview = result.get("error", str(result))[:100] + print(f" {status} {tool_name} → {result_preview} ({duration_ms:.0f}ms)") + + # Format result for Gemini + if tool_name == "computer_use" and result.get("action") == "screenshot" and result.get("success"): + try: + image_bytes = base64.b64decode(result["image_base64"]) + # 
Provide the function response part + tool_results.append(types.Part.from_function_response( + name=tool_name, + response={"success": True, "note": "Screenshot captured"} + )) + # And append the image part separately + tool_results.append(types.Part.from_bytes( + data=image_bytes, + mime_type="image/png" + )) + except Exception as e: + tool_results.append(types.Part.from_function_response( + name=tool_name, + response={"error": f"Failed to decode base64 screenshot: {str(e)}"} + )) + else: + clean = {k: v for k, v in result.items() if k != "image_base64"} + tool_results.append(types.Part.from_function_response( + name=tool_name, + response=clean + )) + + return tool_results + + # ── Internal: helpers ───────────────────────────────────────────────────── + + def _extract_text(self, response) -> str: + """Extract the final text from a Gemini response.""" + try: + return response.text if response.text else "" + except (ValueError, AttributeError): + return "" + + def _print_dashboard(self): + """Print a telemetry summary.""" + dash = self.monitor.get_agent_dashboard(AGENT_ID) + health = self.monitor.get_system_health() + + print(f"\n{'═'*60}") + print("📊 BMASTERAI TELEMETRY") + print(f"{'─'*60}") + print(f" Agent status : {dash.get('status', 'unknown').upper()}") + print(f" Total errors : {dash['metrics'].get('total_errors', 0)}") + + perf = dash.get("performance", {}) + if perf: + print(f"\n Task timings:") + for task, stats in sorted(perf.items()): + print(f" {task:<35} avg={stats['avg_duration_ms']:.0f}ms " + f"calls={stats['total_calls']}") + + sys_m = dash.get("system", {}) + cpu = sys_m.get("cpu_usage", {}) + mem = sys_m.get("memory_usage", {}) + if cpu or mem: + print(f"\n System:") + if cpu: + print(f" CPU : {cpu.get('latest', 0):.1f}%") + if mem: + print(f" Memory : {mem.get('latest', 0):.1f}%") + + print(f"\n Telemetry logs:") + print(f" logs/agent.log — human-readable") + print(f" logs/agent.jsonl — structured JSON") + print(f" logs/reasoning/ — decision 
points & reasoning") + print(f"{'═'*60}\n") diff --git a/examples/gemini-web-computer-agent/main.py b/examples/gemini-web-computer-agent/main.py new file mode 100644 index 0000000..e982708 --- /dev/null +++ b/examples/gemini-web-computer-agent/main.py @@ -0,0 +1,79 @@ +""" +main.py — CLI entry point for gemini-web-computer-agent + +Usage: + python main.py "Search for the latest Gemini API pricing" + python main.py # interactive prompt +""" + +import sys +import os + + +def check_env(): + """Warn about missing environment variables before starting.""" + missing = [] + if not os.getenv("GEMINI_API_KEY"): + missing.append("GEMINI_API_KEY") + if not os.getenv("TAVILY_API_KEY"): + missing.append("TAVILY_API_KEY (optional — web_search tool will be disabled)") + if missing: + for m in missing: + print(f" ⚠️ {m} not set") + if "GEMINI_API_KEY" in str(missing): + print("\nSet GEMINI_API_KEY to continue. Exiting.") + sys.exit(1) + print() + + +def main(): + print("\n" + "═" * 60) + print(" gemini-web-computer-agent") + print(" web search + computer use + BMasterAI") + print("═" * 60) + + # Load .env if present + try: + from dotenv import load_dotenv + load_dotenv() + except ImportError: + pass + + check_env() + + # Lazy import after env check + from agent import WebComputerAgent + + # Get query from CLI args or interactive prompt + if len(sys.argv) > 1: + query = " ".join(sys.argv[1:]) + else: + print("\nEnter your query (or Ctrl+C to quit):") + print("Examples:") + print(' "Search for the latest Gemini model release notes"') + print(' "Take a screenshot and describe what you see"') + print(' "Search for top AI news, open the first result, and summarize it"') + print() + try: + query = input("Query: ").strip() + except (KeyboardInterrupt, EOFError): + print("\nBye!") + sys.exit(0) + + if not query: + print("No query provided. 
Exiting.") + sys.exit(1) + + # Run + agent = WebComputerAgent(verbose=True) + response = agent.run(query) + + print("\n" + "═" * 60) + print("🗒️ FINAL RESPONSE") + print("─" * 60) + print(response) + print("═" * 60 + "\n") + + +if __name__ == "__main__": + main() diff --git a/examples/gemini-web-computer-agent/requirements.txt b/examples/gemini-web-computer-agent/requirements.txt new file mode 100644 index 0000000..b58226f --- /dev/null +++ b/examples/gemini-web-computer-agent/requirements.txt @@ -0,0 +1,5 @@ +google-genai>=0.5.0 +bmasterai>=0.2.3 +tavily-python>=0.3.0 +python-dotenv>=1.0.0 +psutil>=5.9.0 diff --git a/examples/gemini-web-computer-agent/tools.py b/examples/gemini-web-computer-agent/tools.py new file mode 100644 index 0000000..1a1d914 --- /dev/null +++ b/examples/gemini-web-computer-agent/tools.py @@ -0,0 +1,278 @@ +""" +tools.py — Tool schemas and dispatch for gemini-web-computer-agent + +Defines two tool types: + 1. web_search — Tavily search (client-side wrapper) + 2. computer_use — screenshot, click, type, key, scroll via xdotool + scrot + +All tools log TOOL_USE events via BMasterAI before and after execution. +""" + +import os +import base64 +import subprocess +from typing import Any, Dict, List, Optional + +# ── Optional imports (graceful degradation) ────────────────────────────────── +try: + from tavily import TavilyClient + _tavily_available = True +except ImportError: + _tavily_available = False + + +# ───────────────────────────────────────────────────────────────────────────── +# Tool JSON schemas (passed to Gemini in the `tools` array) +# ───────────────────────────────────────────────────────────────────────────── + +WEB_SEARCH_SCHEMA = { + "name": "web_search", + "description": ( + "Search the web for current information using Tavily. " + "Use this when you need up-to-date facts, news, documentation, or any " + "information that may not be in your training data." 
+ ), + "input_schema": { + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "The search query to look up.", + }, + "max_results": { + "type": "integer", + "description": "Maximum number of results to return (1–10). Defaults to 5.", + "default": 5, + }, + }, + "required": ["query"], + }, +} + +COMPUTER_USE_SCHEMA = { + "name": "computer_use", + "description": ( + "Take a screenshot of the current screen, click on UI elements, " + "type text, or press keys. Use this to interact with desktop " + "applications, browsers, or observe the current screen state." + ), + "input_schema": { + "type": "object", + "properties": { + "action": { + "type": "string", + "enum": ["screenshot", "click", "type", "key", "scroll"], + "description": ( + "Action to perform:\n" + " screenshot — capture current screen\n" + " click — click at (x, y)\n" + " type — type a string of text\n" + " key — press a keyboard shortcut (e.g. 'ctrl+c')\n" + " scroll — scroll at (x, y) by delta" + ), + }, + "x": { + "type": "integer", + "description": "X coordinate (required for click/scroll).", + }, + "y": { + "type": "integer", + "description": "Y coordinate (required for click/scroll).", + }, + "text": { + "type": "string", + "description": "Text to type (required for type action).", + }, + "key": { + "type": "string", + "description": "Key or key combo to press (required for key action). E.g. 
'Return', 'ctrl+c'.", + }, + "delta_y": { + "type": "integer", + "description": "Scroll delta in pixels, positive = down (required for scroll).", + "default": 300, + }, + }, + "required": ["action"], + }, +} + +# All schemas to pass to Gemini +ALL_TOOL_SCHEMAS: List[Dict] = [ + WEB_SEARCH_SCHEMA, + COMPUTER_USE_SCHEMA, +] + + +# ───────────────────────────────────────────────────────────────────────────── +# Tool implementations +# ───────────────────────────────────────────────────────────────────────────── + +def _run_web_search(query: str, max_results: int = 5) -> Dict[str, Any]: + """Execute a Tavily web search.""" + if not _tavily_available: + return {"error": "tavily-python not installed. Run: pip install tavily-python"} + + api_key = os.getenv("TAVILY_API_KEY") + if not api_key: + return {"error": "TAVILY_API_KEY not set in environment."} + + client = TavilyClient(api_key=api_key) + response = client.search(query=query, max_results=max_results) + + results = [] + for r in response.get("results", []): + results.append({ + "title": r.get("title", ""), + "url": r.get("url", ""), + "snippet": r.get("content", "")[:500], # truncate for token efficiency + "score": r.get("score", 0), + }) + + return { + "query": query, + "results": results, + "result_count": len(results), + } + + +_SUBPROCESS_TIMEOUT = 10 # seconds — prevents hanging processes + +def _run_computer_use(action: str, x: Optional[int] = None, y: Optional[int] = None, + text: Optional[str] = None, key: Optional[str] = None, + delta_y: int = 300) -> Dict[str, Any]: + """ + Execute a computer use action. Uses xdotool + scrot on Linux, + and screencapture + cliclick on macOS. + Returns result dict; screenshots are base64-encoded PNG data. 
+ """ + import sys + try: + if action == "screenshot": + if sys.platform == "darwin": + import tempfile + with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tf: + tmp_path = tf.name + subprocess.run( + ["screencapture", "-x", tmp_path], + check=True, timeout=_SUBPROCESS_TIMEOUT, + capture_output=True, text=True + ) + with open(tmp_path, "rb") as f: + img_data = f.read() + os.remove(tmp_path) + else: + result = subprocess.run( + ["scrot", "-", "--quality", "80"], + capture_output=True, check=True, + timeout=_SUBPROCESS_TIMEOUT, + ) + img_data = result.stdout + + img_b64 = base64.b64encode(img_data).decode("utf-8") + return { + "action": "screenshot", + "success": True, + "image_base64": img_b64, + "format": "png", + "note": "Screenshot captured. Describe what you see.", + } + + elif action == "click": + if x is None or y is None: + return {"error": "click requires x and y coordinates."} + if sys.platform == "darwin": + subprocess.run(["cliclick", f"c:{x},{y}"], check=True, timeout=_SUBPROCESS_TIMEOUT, capture_output=True, text=True) + else: + subprocess.run( + ["xdotool", "mousemove", str(x), str(y), "click", "1"], + check=True, timeout=_SUBPROCESS_TIMEOUT, capture_output=True, text=True + ) + return {"action": "click", "success": True, "x": x, "y": y} + + elif action == "type": + if not text: + return {"error": "type requires text parameter."} + if sys.platform == "darwin": + subprocess.run(["cliclick", "-r", f"t:{text}"], check=True, timeout=_SUBPROCESS_TIMEOUT, capture_output=True, text=True) + else: + subprocess.run( + ["xdotool", "type", "--clearmodifiers", "--", text], + check=True, timeout=_SUBPROCESS_TIMEOUT, capture_output=True, text=True + ) + return {"action": "type", "success": True, "typed": text} + + elif action == "key": + if not key: + return {"error": "key requires key parameter."} + if sys.platform == "darwin": + subprocess.run(["cliclick", f"kp:{key}"], check=True, timeout=_SUBPROCESS_TIMEOUT, capture_output=True, text=True) + else: 
+ subprocess.run( + ["xdotool", "key", "--", key], + check=True, timeout=_SUBPROCESS_TIMEOUT, capture_output=True, text=True + ) + return {"action": "key", "success": True, "key": key} + + elif action == "scroll": + if x is None or y is None: + return {"error": "scroll requires x and y coordinates."} + if sys.platform == "darwin": + return {"error": "scroll is not fully supported on macOS via cliclick without custom scripts"} + else: + direction = "5" if delta_y > 0 else "4" # button 5 = scroll down, 4 = up + steps = max(1, abs(delta_y) // 100) + for _ in range(steps): + subprocess.run( + ["xdotool", "mousemove", str(x), str(y), "click", direction], + check=True, timeout=_SUBPROCESS_TIMEOUT, capture_output=True, text=True + ) + return {"action": "scroll", "success": True, "x": x, "y": y, "delta_y": delta_y} + + else: + return {"error": f"Unknown action: {action}"} + + except subprocess.TimeoutExpired: + return {"error": f"Timed out after {_SUBPROCESS_TIMEOUT}s", "action": action, "success": False} + except FileNotFoundError as e: + if sys.platform == "darwin": + tool = "cliclick" if "cliclick" in str(e) else "screencapture" + install_cmd = f"brew install {tool}" + else: + tool = "scrot" if "scrot" in str(e) else "xdotool" + install_cmd = f"sudo apt-get install {tool}" + + return { + "error": f"{tool} not found. 
Install with: {install_cmd}", + "action": action, + "success": False, + } + except subprocess.CalledProcessError as e: + err_msg = e.stderr.strip() if e.stderr else str(e) + if sys.platform == "darwin" and action == "screenshot" and "could not create image from display" in err_msg: + err_msg += " (Error implies headless environment or missing Screen Recording permissions in macOS System Settings > Privacy & Security.)" + return {"error": err_msg, "action": action, "success": False} + + +# ───────────────────────────────────────────────────────────────────────────── +# Dispatch router +# ───────────────────────────────────────────────────────────────────────────── + +def dispatch_tool(tool_name: str, tool_input: Dict[str, Any]) -> Dict[str, Any]: + """Route a tool_use block to the correct implementation.""" + if tool_name == "web_search": + return _run_web_search( + query=tool_input["query"], + max_results=tool_input.get("max_results", 5), + ) + elif tool_name == "computer_use": + return _run_computer_use( + action=tool_input["action"], + x=tool_input.get("x"), + y=tool_input.get("y"), + text=tool_input.get("text"), + key=tool_input.get("key"), + delta_y=tool_input.get("delta_y", 300), + ) + else: + return {"error": f"Unknown tool: {tool_name}"}