zai-org · zgfh · Dec 23, 2025
diff --git a/main.py b/main.py
@@ -30,7 +30,7 @@
 from phone_agent.config.apps_ios import list_supported_apps as list_ios_apps
 from phone_agent.device_factory import DeviceType, get_device_factory, set_device_type
 from phone_agent.model import ModelConfig
-from phone_agent.xctest import XCTestConnection
+from phone_agent.xctest import XCTestConnection, set_scale_factor
 from phone_agent.xctest import list_devices as list_ios_devices
 
 
@@ -753,6 +753,26 @@ def main():
     )
 
     if device_type == DeviceType.IOS:
+        # Auto-detect iOS WDA scale factor (pixels -> points) instead of hard-coding 3.
+        # You can override it with env PHONE_AGENT_IOS_SCALE (float).
+        detected_scale: float | None = None
+        try:
+            env_scale = os.getenv("PHONE_AGENT_IOS_SCALE")
+            if env_scale:
+                detected_scale = float(env_scale)
+            else:
+                detected_scale = XCTestConnection(wda_url=args.wda_url).detect_screen_scale(
+                    session_id=None,
+                    device_id=args.device_id,
+                    default=3.0,
+                )
+
+            if detected_scale and detected_scale > 0:
+                set_scale_factor(detected_scale)
+        except Exception:
+            # Fall back to default scale in xctest.device
+            pass
+
         # Create iOS agent
         agent_config = IOSAgentConfig(
             max_steps=args.max_steps,

diff --git a/phone_agent/xctest/__init__.py b/phone_agent/xctest/__init__.py
@@ -10,10 +10,12 @@
 from phone_agent.xctest.device import (
     back,
     double_tap,
+    get_scale_factor,
     get_current_app,
     home,
     launch_app,
     long_press,
+    set_scale_factor,
     swipe,
     tap,
 )
@@ -38,10 +40,16 @@
     "double_tap",
     "long_press",
     "launch_app",
+    "set_scale_factor",
+    "get_scale_factor",
     # Connection management
     "XCTestConnection",
     "DeviceInfo",
     "ConnectionType",
     "quick_connect",
     "list_devices",
 ]
+
+# set_scale_factor / get_scale_factor are re-exported from
+# phone_agent.xctest.device above so higher-level modules can import them from here.
+
diff --git a/phone_agent/xctest/connection.py b/phone_agent/xctest/connection.py
@@ -252,6 +252,78 @@ def start_wda_session(self) -> tuple[bool, str]:
         except Exception as e:
             return False, f"Error starting WDA session: {e}"
 
+    def get_wda_screen(self) -> dict | None:
+        """Fetch WDA screen information from ``{wda_url}/wda/screen``.
+
+        Best-effort: any failure (requests missing, network error, non-200
+        status, or a body that is not a JSON object) yields None, never raises.
+        """
+        try:
+            import requests
+
+            response = requests.get(f"{self.wda_url}/wda/screen", timeout=5, verify=False)
+            payload = response.json() if response.status_code == 200 else None
+            # JSON may decode to a list/scalar; honor the declared ``dict | None``.
+            return payload if isinstance(payload, dict) else None
+        except Exception:
+            return None
+
+    def detect_screen_scale(
+        self,
+        session_id: str | None = None,
+        device_id: str | None = None,
+        default: float = 3.0,
+    ) -> float:
+        """Detect the pixel->point scale factor of the device behind this WDA.
+
+        Priority:
+        1) ``value.scale`` from ``/wda/screen`` when it is a positive number.
+        2) Otherwise infer it from screenshot pixels vs. ``value.screenSize`` points,
+           pairing long side with long side so orientation cannot skew the ratio.
+
+        Args:
+            session_id: WDA session id forwarded to the screenshot helper.
+            device_id: Device id forwarded to the screenshot helper.
+            default: Value returned when nothing can be detected.
+
+        Returns:
+            The detected scale, or ``float(default)``; detection errors never raise.
+        """
+        def _is_number(obj: object) -> bool:  # bool subclasses int; JSON true is no scale
+            return isinstance(obj, (int, float)) and not isinstance(obj, bool)
+
+        screen = self.get_wda_screen()
+        try:
+            if isinstance(screen, dict):
+                value = screen.get("value") or {}
+                scale = value.get("scale")
+                if _is_number(scale) and scale > 0:
+                    return float(scale)
+
+                screen_size = value.get("screenSize") or {}
+                width_pt = screen_size.get("width")
+                height_pt = screen_size.get("height")
+                if _is_number(width_pt) and _is_number(height_pt):
+                    # Import locally to avoid circular import at module import time.
+                    from phone_agent.xctest.screenshot import get_screenshot
+                    shot = get_screenshot(
+                        wda_url=self.wda_url, session_id=session_id, device_id=device_id
+                    )
+                    pixels = sorted((shot.width, shot.height))
+                    points = sorted((float(width_pt), float(height_pt)))
+                    ratios = [px / pt for px, pt in zip(pixels, points) if px > 0 and pt > 0]
+                    if ratios:
+                        ratio = sum(ratios) / len(ratios)
+                        # WDA scale is typically 1/2/3; snap when the estimate is close.
+                        nearest = round(ratio)
+                        if nearest in (1, 2, 3) and abs(ratio - nearest) < 0.25:
+                            return float(nearest)
+                        return ratio
+        except Exception:
+            pass  # Never block agent startup due to scale detection.
+
+        return float(default)
+
     def get_wda_status(self) -> dict | None:
         """
         Get WebDriverAgent status information.

diff --git a/phone_agent/xctest/device.py b/phone_agent/xctest/device.py
@@ -6,7 +6,27 @@
 
 from phone_agent.config.apps_ios import APP_PACKAGES_IOS as APP_PACKAGES
 
-SCALE_FACTOR = 3 # 3 for most modern iPhone 
+# WDA expects coordinates in "points" while our higher-level code
+# mostly works in screenshot pixel coordinates.
+# This factor converts pixels -> points (points = pixels / factor).
+_SCALE_FACTOR: float = 3.0  # default for many modern iPhones
+
+
+def set_scale_factor(scale: float) -> None:
+    """Set the global pixel->point scale factor; non-finite or <= 0 input is ignored."""
+    global _SCALE_FACTOR
+    try:
+        scale_f = float(scale)
+    except (TypeError, ValueError, OverflowError):
+        return  # Keep previous value on invalid input
+    # Reject inf too: dividing by it would collapse every coordinate to 0.
+    if 0 < scale_f < float("inf"):
+        _SCALE_FACTOR = scale_f
+
+
+def get_scale_factor() -> float:
+    """Return the current global pixel->point scale factor (pixels / factor = points)."""
+    return float(_SCALE_FACTOR)
 
 def _get_wda_session_url(wda_url: str, session_id: str | None, endpoint: str) -> str:
     """
@@ -95,14 +115,15 @@ def tap(
         url = _get_wda_session_url(wda_url, session_id, "actions")
 
         # W3C WebDriver Actions API for tap/click
+        scale = get_scale_factor()
         actions = {
             "actions": [
                 {
                     "type": "pointer",
                     "id": "finger1",
                     "parameters": {"pointerType": "touch"},
                     "actions": [
-                        {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR},
+                        {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
                         {"type": "pointerDown", "button": 0},
                         {"type": "pause", "duration": 0.1},
                         {"type": "pointerUp", "button": 0},
@@ -143,6 +164,8 @@ def double_tap(
 
         url = _get_wda_session_url(wda_url, session_id, "actions")
 
+        scale = get_scale_factor()
+
         # W3C WebDriver Actions API for double tap
         actions = {
             "actions": [
@@ -151,11 +174,12 @@ def double_tap(
                     "id": "finger1",
                     "parameters": {"pointerType": "touch"},
                     "actions": [
-                        {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR},
+                        {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
                         {"type": "pointerDown", "button": 0},
                         {"type": "pause", "duration": 100},
                         {"type": "pointerUp", "button": 0},
                         {"type": "pause", "duration": 100},
+                        {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
                         {"type": "pointerDown", "button": 0},
                         {"type": "pause", "duration": 100},
                         {"type": "pointerUp", "button": 0},
@@ -202,14 +226,15 @@ def long_press(
         # Convert duration to milliseconds
         duration_ms = int(duration * 1000)
 
+        scale = get_scale_factor()
         actions = {
             "actions": [
                 {
                     "type": "pointer",
                     "id": "finger1",
                     "parameters": {"pointerType": "touch"},
                     "actions": [
-                        {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR},
+                        {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale},
                         {"type": "pointerDown", "button": 0},
                         {"type": "pause", "duration": duration_ms},
                         {"type": "pointerUp", "button": 0},
@@ -262,12 +287,14 @@ def swipe(
 
         url = _get_wda_session_url(wda_url, session_id, "wda/dragfromtoforduration")
 
+        scale = get_scale_factor()
+
         # WDA dragfromtoforduration API payload
         payload = {
-            "fromX": start_x / SCALE_FACTOR,
-            "fromY": start_y / SCALE_FACTOR,
-            "toX": end_x / SCALE_FACTOR,
-            "toY": end_y / SCALE_FACTOR,
+            "fromX": start_x / scale,
+            "fromY": start_y / scale,
+            "toX": end_x / scale,
+            "toY": end_y / scale,
             "duration": duration,
         }
 
@@ -303,12 +330,14 @@ def back(
 
         url = _get_wda_session_url(wda_url, session_id, "wda/dragfromtoforduration")
 
+        scale = get_scale_factor()
+
         # Swipe from left edge to simulate back gesture
         payload = {
-            "fromX": 0,
-            "fromY": 640,
-            "toX": 400,
-            "toY": 640,
+            "fromX": 0 / scale,
+            "fromY": 640 / scale,
+            "toX": 400 / scale,
+            "toY": 640 / scale,
             "duration": 0.3,
         }