Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# OpenAI API key (required)
# LLM provider: openai, anthropic, or ollama (default: openai)
DROIDPILOT_PROVIDER=openai

# Model to use (optional — each provider has a sensible default)
# DROIDPILOT_MODEL=gpt-4o

# --- Provider API keys (set the one matching your provider) ---

# OpenAI
OPENAI_API_KEY=sk-...

# Model to use (optional, default: gpt-4o)
DROIDPILOT_MODEL=gpt-4o
# Anthropic
# ANTHROPIC_API_KEY=sk-ant-...

# Ollama (no API key needed, just the base URL if non-default)
# OLLAMA_BASE_URL=http://localhost:11434/v1
36 changes: 36 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
name: CI

on:
push:
branches: [main]
pull_request:
branches: [main]

jobs:
lint-and-test:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]

steps:
- uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
pip install -e ".[all]"
pip install black mypy pytest

- name: Format check (black)
run: black --check .

- name: Type check (mypy)
run: mypy droidpilot/ tests/ --ignore-missing-imports

- name: Tests
run: pytest tests/ -v
30 changes: 23 additions & 7 deletions droidpilot/adb.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,18 @@ def tap(x: int, y: int) -> None:


def swipe(x1: int, y1: int, x2: int, y2: int, duration_ms: int = 300) -> None:
_run(["shell", "input", "swipe", str(x1), str(y1), str(x2), str(y2), str(duration_ms)])
_run(
[
"shell",
"input",
"swipe",
str(x1),
str(y1),
str(x2),
str(y2),
str(duration_ms),
]
)


def input_text(text: str) -> None:
Expand All @@ -95,12 +106,17 @@ def press_enter() -> None:


def open_app(package_name: str) -> None:
_run([
"shell", "monkey",
"-p", package_name,
"-c", "android.intent.category.LAUNCHER",
"1",
])
_run(
[
"shell",
"monkey",
"-p",
package_name,
"-c",
"android.intent.category.LAUNCHER",
"1",
]
)


def list_packages(name: str = "") -> list[str]:
Expand Down
58 changes: 21 additions & 37 deletions droidpilot/agent.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
import json
import time

from openai import OpenAI

from . import adb
from .ui_tree import parse
from .actions import TOOLS
from .prompts import SYSTEM_PROMPT
from .providers import create_provider

SWIPE_OFFSETS = {
"up": (0, 1, 0, -1),
Expand Down Expand Up @@ -108,19 +107,21 @@ def _execute_action(
return f"Unknown action: {name}"


def run(prompt: str, model: str = "gpt-4o", max_steps: int = 30) -> str:
client = OpenAI()
def run(
prompt: str,
provider: str = "openai",
model: str | None = None,
max_steps: int = 30,
) -> str:
llm = create_provider(provider, model, SYSTEM_PROMPT, TOOLS)

serial = adb.check_device()
screen_size = adb.get_screen_size()
print(f"Connected to device: {serial}")
print(f"Screen size: {screen_size[0]}x{screen_size[1]}")
print(f"Task: {prompt}\n")

messages = [
{"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": f"Task: {prompt}"},
]
llm.add_user_message(f"Task: {prompt}")

for step in range(1, max_steps + 1):
try:
Expand All @@ -130,44 +131,27 @@ def run(prompt: str, model: str = "gpt-4o", max_steps: int = 30) -> str:
time.sleep(2)
continue

messages.append({
"role": "user",
"content": f"Current screen UI tree:\n```\n{tree_text}\n```",
})

response = client.chat.completions.create(
model=model,
messages=messages,
tools=TOOLS,
tool_choice="required",
)
llm.add_user_message(f"Current screen UI tree:\n```\n{tree_text}\n```")

message = response.choices[0].message
messages.append(message.model_dump(exclude_none=True))
tool_call = llm.get_tool_call()

if not message.tool_calls:
print(f" [!] No action returned, retrying...")
if not tool_call:
print(" [!] No action returned, retrying...")
continue

tool_call = message.tool_calls[0]
action_name = tool_call.function.name
action_args = json.loads(tool_call.function.arguments)
print(f" Step {step}: {tool_call.name}({json.dumps(tool_call.arguments)})")

print(f" Step {step}: {action_name}({json.dumps(action_args)})")

if action_name == "done":
summary = action_args.get("summary", "Task completed.")
if tool_call.name == "done":
summary = tool_call.arguments.get("summary", "Task completed.")
print(f"\nDone: {summary}")
return summary

result = _execute_action(action_name, action_args, ref_map, screen_size)
print(f" → {result}")
result = _execute_action(
tool_call.name, tool_call.arguments, ref_map, screen_size
)
print(f" -> {result}")

messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"content": result,
})
llm.add_tool_result(result or "")

time.sleep(UI_SETTLE_DELAY)

Expand Down
25 changes: 22 additions & 3 deletions droidpilot/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,33 @@

from . import adb
from .agent import run
from .providers import DEFAULT_MODELS, PROVIDERS


def main():
def main() -> None:
load_dotenv()

provider_names = sorted(PROVIDERS.keys())
default_provider = os.getenv("DROIDPILOT_PROVIDER", "openai")
default_model = os.getenv("DROIDPILOT_MODEL")

parser = argparse.ArgumentParser(
prog="droidpilot",
description="AI agent that operates your Android phone via ADB",
)
parser.add_argument("prompt")
parser.add_argument(
"--provider",
choices=provider_names,
default=default_provider,
help=f"LLM provider (default: {default_provider})",
)
parser.add_argument(
"--model",
default=os.getenv("DROIDPILOT_MODEL", "gpt-4o"),
default=default_model,
help="Model name (defaults per provider: "
+ ", ".join(f"{k}={v}" for k, v in sorted(DEFAULT_MODELS.items()))
+ ")",
)
parser.add_argument("--max-steps", type=int, default=30)

Expand All @@ -31,7 +45,12 @@ def main():
sys.exit(1)

try:
run(args.prompt, model=args.model, max_steps=args.max_steps)
run(
args.prompt,
provider=args.provider,
model=args.model,
max_steps=args.max_steps,
)
except RuntimeError as e:
print(f"Error: {e}", file=sys.stderr)
sys.exit(1)
Expand Down
56 changes: 56 additions & 0 deletions droidpilot/providers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""LLM provider registry."""

from .base import LLMProvider, ToolCall
from .openai_provider import OpenAIProvider
from .anthropic_provider import AnthropicProvider
from .ollama_provider import OllamaProvider

PROVIDERS: dict[str, type[LLMProvider]] = {
"openai": OpenAIProvider,
"anthropic": AnthropicProvider,
"ollama": OllamaProvider,
}

# Default model per provider so users don't have to specify both
DEFAULT_MODELS: dict[str, str] = {
"openai": "gpt-4o",
"anthropic": "claude-sonnet-4-20250514",
"ollama": "llama3",
}


def create_provider(
    provider_name: str, model: str | None, system_prompt: str, tools: list[dict]
) -> LLMProvider:
    """Build the LLM provider registered under ``provider_name``.

    Args:
        provider_name: Key into ``PROVIDERS`` ("openai", "anthropic", "ollama").
        model: Model identifier; a falsy value (``None`` or ``""``) selects
            this provider's entry in ``DEFAULT_MODELS``.
        system_prompt: System prompt handed to the provider.
        tools: Tool definitions in OpenAI function-calling format.

    Returns:
        The constructed provider instance.

    Raises:
        ValueError: If ``provider_name`` is not a key of ``PROVIDERS``.
    """
    provider_cls = PROVIDERS.get(provider_name)
    if provider_cls is None:
        known = ", ".join(sorted(PROVIDERS))
        raise ValueError(f"Unknown provider '{provider_name}'. Supported: {known}")

    chosen_model = model if model else DEFAULT_MODELS[provider_name]
    return provider_cls(chosen_model, system_prompt, tools)


__all__ = [
"LLMProvider",
"ToolCall",
"OpenAIProvider",
"AnthropicProvider",
"OllamaProvider",
"PROVIDERS",
"DEFAULT_MODELS",
"create_provider",
]
84 changes: 84 additions & 0 deletions droidpilot/providers/anthropic_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
"""Anthropic LLM provider."""

import anthropic

from .base import LLMProvider, ToolCall


def _convert_tools(openai_tools: list[dict]) -> list[dict]:
    """Translate OpenAI function-calling tool specs into Anthropic tool specs.

    Each input entry must carry its definition under the ``"function"`` key.
    A missing description becomes ``""`` and a missing parameter schema
    becomes an empty object schema.
    """
    return [
        {
            "name": spec["name"],
            "description": spec.get("description", ""),
            "input_schema": spec.get(
                "parameters", {"type": "object", "properties": {}}
            ),
        }
        for spec in (entry["function"] for entry in openai_tools)
    ]


class AnthropicProvider(LLMProvider):
    """Provider for Anthropic models (Claude).

    Keeps the conversation history in Anthropic's message format and hands
    back at most one tool call per model turn.
    """

    def __init__(self, model: str, system_prompt: str, tools: list[dict]) -> None:
        """Store the configuration and create the API client.

        Args:
            model: Anthropic model identifier.
            system_prompt: System prompt sent with every request.
            tools: Tool definitions in OpenAI function-calling format; they
                are converted to Anthropic's tool format here.
        """
        self._model = model
        self._system_prompt = system_prompt
        self._tools = _convert_tools(tools)
        self._client = anthropic.Anthropic()
        self._messages: list[dict] = []
        # Id of the tool_use block that is still waiting for its tool_result.
        self._last_tool_use_id: str | None = None

    def add_user_message(self, content: str) -> None:
        """Append a plain-text user turn to the conversation history."""
        self._messages.append({"role": "user", "content": content})

    def get_tool_call(self) -> ToolCall | None:
        """Request the next model turn and return its first tool call.

        The assistant turn is appended to the history. Only the first
        ``tool_use`` block is kept: ``add_tool_result`` supplies exactly one
        result, and the API rejects a follow-up request in which a recorded
        ``tool_use`` has no matching ``tool_result``.

        Returns:
            The first tool call of the response, or ``None`` if the response
            contains no ``tool_use`` block.
        """
        response = self._client.messages.create(  # type: ignore[call-overload]
            model=self._model,
            max_tokens=1024,
            system=self._system_prompt,
            messages=self._messages,
            tools=self._tools,
            tool_choice={"type": "any"},
        )

        # Forget any id left over from an earlier turn so a result can never
        # be attached to the wrong tool_use.
        self._last_tool_use_id = None

        # Build assistant content blocks for history
        assistant_content: list[dict] = []
        tool_call: ToolCall | None = None

        for block in response.content:
            if block.type == "text":
                assistant_content.append({"type": "text", "text": block.text})
            elif block.type == "tool_use" and tool_call is None:
                assistant_content.append(
                    {
                        "type": "tool_use",
                        "id": block.id,
                        "name": block.name,
                        "input": block.input,
                    }
                )
                self._last_tool_use_id = block.id
                tool_call = ToolCall(name=block.name, arguments=block.input)
            # Further tool_use blocks are dropped on purpose: they would
            # never receive a tool_result.

        # An assistant message with empty content is rejected by the API on
        # the next request, so only record turns that have content.
        if assistant_content:
            self._messages.append({"role": "assistant", "content": assistant_content})
        return tool_call

    def add_tool_result(self, result: str) -> None:
        """Record the outcome of the tool call returned by ``get_tool_call``.

        If no tool call is pending, the result is recorded as a plain user
        message (when non-empty) instead of a ``tool_result`` carrying an
        invalid id.
        """
        tool_use_id = self._last_tool_use_id
        if tool_use_id is None:
            if result:
                self._messages.append({"role": "user", "content": result})
            return

        # Each tool_use is answered exactly once.
        self._last_tool_use_id = None
        self._messages.append(
            {
                "role": "user",
                "content": [
                    {
                        "type": "tool_result",
                        "tool_use_id": tool_use_id,
                        "content": result,
                    }
                ],
            }
        )
Loading
Loading