Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 21 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
from phone_agent.config.apps_ios import list_supported_apps as list_ios_apps
from phone_agent.device_factory import DeviceType, get_device_factory, set_device_type
from phone_agent.model import ModelConfig
from phone_agent.xctest import XCTestConnection
from phone_agent.xctest import XCTestConnection, set_scale_factor
from phone_agent.xctest import list_devices as list_ios_devices


Expand Down Expand Up @@ -753,6 +753,26 @@ def main():
)

if device_type == DeviceType.IOS:
# Auto-detect iOS WDA scale factor (pixels -> points) instead of hard-coding 3.
# You can override it with env PHONE_AGENT_IOS_SCALE (float).
detected_scale: float | None = None
try:
env_scale = os.getenv("PHONE_AGENT_IOS_SCALE")
if env_scale:
detected_scale = float(env_scale)
else:
detected_scale = XCTestConnection(wda_url=args.wda_url).detect_screen_scale(
session_id=None,
device_id=args.device_id,
default=3.0,
)

if detected_scale and detected_scale > 0:
set_scale_factor(detected_scale)
except Exception:
# Fall back to default scale in xctest.device
pass

# Create iOS agent
agent_config = IOSAgentConfig(
max_steps=args.max_steps,
Expand Down
8 changes: 8 additions & 0 deletions phone_agent/xctest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,12 @@
from phone_agent.xctest.device import (
back,
double_tap,
get_scale_factor,
get_current_app,
home,
launch_app,
long_press,
set_scale_factor,
swipe,
tap,
)
Expand All @@ -38,10 +40,16 @@
"double_tap",
"long_press",
"launch_app",
"set_scale_factor",
"get_scale_factor",
# Connection management
"XCTestConnection",
"DeviceInfo",
"ConnectionType",
"quick_connect",
"list_devices",
]

# Re-export convenience methods (available on XCTestConnection).
# Kept for discoverability in higher-level modules.

72 changes: 72 additions & 0 deletions phone_agent/xctest/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,78 @@ def start_wda_session(self) -> tuple[bool, str]:
except Exception as e:
return False, f"Error starting WDA session: {e}"

def get_wda_screen(self) -> dict | None:
"""Get WDA screen information (/wda/screen).

Returns:
The JSON-decoded response dict on success, otherwise None.
"""
try:
import requests

response = requests.get(f"{self.wda_url}/wda/screen", timeout=5, verify=False)
if response.status_code == 200:
return response.json()
return None
except Exception:
return None

def detect_screen_scale(
self,
session_id: str | None = None,
device_id: str | None = None,
default: float = 3.0,
) -> float:
"""Detect iOS screen scale factor used by WDA coordinate system.

Priority:
1) Use `/wda/screen` -> value.scale if available.
2) Fallback: infer scale by comparing screenshot pixel size with screenSize points.

This keeps the detection details inside xctest, so CLI/agent code can stay clean.
"""
screen = self.get_wda_screen()
try:
if screen and isinstance(screen, dict):
value = screen.get("value", {}) or {}
scale = value.get("scale")
if isinstance(scale, (int, float)) and scale > 0:
return float(scale)

screen_size = value.get("screenSize", {}) or {}
width_pt = screen_size.get("width")
height_pt = screen_size.get("height")

if isinstance(width_pt, (int, float)) and isinstance(height_pt, (int, float)):
# Import locally to avoid circular import at module import time.
from phone_agent.xctest.screenshot import get_screenshot

shot = get_screenshot(
wda_url=self.wda_url,
session_id=session_id,
device_id=device_id,
)

# Best-effort inference: choose the more stable ratio between width/height.
ratio_w = shot.width / float(width_pt) if width_pt else 0
ratio_h = shot.height / float(height_pt) if height_pt else 0
ratio = ratio_w if ratio_w > 0 else ratio_h
if ratio_h > 0 and ratio_w > 0:
# If both available, use the rounded average to reduce rotation noise.
ratio = (ratio_w + ratio_h) / 2

# WDA scale is typically 1/2/3. Round to nearest int if close.
rounded = round(ratio) if ratio > 0 else 0
if rounded in (1, 2, 3) and abs(ratio - rounded) < 0.25:
return float(rounded)
if ratio > 0:
return float(ratio)
except Exception:
# Never block agent startup due to scale detection.
pass

return float(default)

def get_wda_status(self) -> dict | None:
"""
Get WebDriverAgent status information.
Expand Down
53 changes: 41 additions & 12 deletions phone_agent/xctest/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,27 @@

from phone_agent.config.apps_ios import APP_PACKAGES_IOS as APP_PACKAGES

SCALE_FACTOR = 3 # 3 for most modern iPhone
# WebDriverAgent (WDA) expects coordinates in "points", while the higher-level
# code mostly works in screenshot pixel coordinates.
# Pixel coordinates are divided by this factor to obtain points.
# Module-private state: read it with get_scale_factor() and change it with
# set_scale_factor() rather than touching the name directly.
_SCALE_FACTOR: float = 3.0 # default for many modern iPhones


def set_scale_factor(scale: float) -> None:
    """Set the global pixel->point scale factor used for coordinate conversion.

    The value is coerced with ``float()``. Anything that cannot be converted,
    or that is not a positive finite number (zero, negatives, NaN, infinity),
    is ignored and the previous factor is kept.

    Args:
        scale: New scale factor; pixel coordinates are divided by it.
    """
    global _SCALE_FACTOR
    try:
        candidate = float(scale)
    except (TypeError, ValueError, OverflowError):
        # Keep previous value on invalid input
        return

    # The chained comparison is False for NaN and excludes +inf; an infinite
    # factor would collapse every converted coordinate to 0.
    if 0 < candidate < float("inf"):
        _SCALE_FACTOR = candidate


def get_scale_factor() -> float:
    """Return the module-wide pixel->point scale factor, coerced to float."""
    current = _SCALE_FACTOR
    return float(current)

def _get_wda_session_url(wda_url: str, session_id: str | None, endpoint: str) -> str:
"""
Expand Down Expand Up @@ -95,14 +115,15 @@ def tap(
url = _get_wda_session_url(wda_url, session_id, "actions")

# W3C WebDriver Actions API for tap/click
scale = get_scale_factor()
actions = {
"actions": [
{
"type": "pointer",
"id": "finger1",
"parameters": {"pointerType": "touch"},
"actions": [
{"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR},
{"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
{"type": "pointerDown", "button": 0},
{"type": "pause", "duration": 0.1},
{"type": "pointerUp", "button": 0},
Expand Down Expand Up @@ -143,6 +164,8 @@ def double_tap(

url = _get_wda_session_url(wda_url, session_id, "actions")

scale = get_scale_factor()

# W3C WebDriver Actions API for double tap
actions = {
"actions": [
Expand All @@ -151,11 +174,12 @@ def double_tap(
"id": "finger1",
"parameters": {"pointerType": "touch"},
"actions": [
{"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR},
{"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
{"type": "pointerDown", "button": 0},
{"type": "pause", "duration": 100},
{"type": "pointerUp", "button": 0},
{"type": "pause", "duration": 100},
{"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
{"type": "pointerDown", "button": 0},
{"type": "pause", "duration": 100},
{"type": "pointerUp", "button": 0},
Expand Down Expand Up @@ -202,14 +226,15 @@ def long_press(
# Convert duration to milliseconds
duration_ms = int(duration * 1000)

scale = get_scale_factor()
actions = {
"actions": [
{
"type": "pointer",
"id": "finger1",
"parameters": {"pointerType": "touch"},
"actions": [
{"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR},
{"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
{"type": "pointerDown", "button": 0},
{"type": "pause", "duration": duration_ms},
{"type": "pointerUp", "button": 0},
Expand Down Expand Up @@ -262,12 +287,14 @@ def swipe(

url = _get_wda_session_url(wda_url, session_id, "wda/dragfromtoforduration")

scale = get_scale_factor()

# WDA dragfromtoforduration API payload
payload = {
"fromX": start_x / SCALE_FACTOR,
"fromY": start_y / SCALE_FACTOR,
"toX": end_x / SCALE_FACTOR,
"toY": end_y / SCALE_FACTOR,
"fromX": start_x / scale,
"fromY": start_y / scale,
"toX": end_x / scale,
"toY": end_y / scale,
"duration": duration,
}

Expand Down Expand Up @@ -303,12 +330,14 @@ def back(

url = _get_wda_session_url(wda_url, session_id, "wda/dragfromtoforduration")

scale = get_scale_factor()

# Swipe from left edge to simulate back gesture
payload = {
"fromX": 0,
"fromY": 640,
"toX": 400,
"toY": 640,
"fromX": 0 / scale,
"fromY": 640 / scale,
"toX": 400 / scale,
"toY": 640 / scale,
"duration": 0.3,
}

Expand Down