From 46e9ce191cb224da9d648a1cd183b7ced3e2e6f2 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Tue, 18 Nov 2025 16:58:08 -0600 Subject: [PATCH 01/14] Update first version. --- examples/SETUP_CHANGES.md | 233 ++++ examples/setup.py | 1118 +++++++++++++++++ examples/test.py | 156 +++ examples/test_setup.py | 180 +++ examples/test_setup_para.py | 259 ++++ examples/test_setup_single.py | 211 ++++ .../osworld_singularity_runtime.py | 8 +- 7 files changed, 2162 insertions(+), 3 deletions(-) create mode 100644 examples/SETUP_CHANGES.md create mode 100644 examples/setup.py create mode 100755 examples/test.py create mode 100755 examples/test_setup.py create mode 100755 examples/test_setup_para.py create mode 100755 examples/test_setup_single.py diff --git a/examples/SETUP_CHANGES.md b/examples/SETUP_CHANGES.md new file mode 100644 index 000000000..68e5085a4 --- /dev/null +++ b/examples/SETUP_CHANGES.md @@ -0,0 +1,233 @@ +# Setup.py Modification Summary + +## Changes Made + +### 1. Removed Proxy Functionality +- **Removed imports:** + - `from desktop_env.providers.aws.proxy_pool import get_global_proxy_pool, init_proxy_pool, ProxyInfo` + +- **Removed code:** + - `PROXY_CONFIG_FILE` constant + - `init_proxy_pool(PROXY_CONFIG_FILE)` initialization + - `_proxy_setup()` method (entire method removed) + - `self.use_proxy` attribute from `__init__` + - `use_proxy` parameter from `setup()` method + - Proxy server logic in `_launch_setup()` method + +### 2. 
Replaced PythonController with Runtime and OSWorldInteractiveAction + +- **Modified `__init__` method:** + - Added `runtime=None` parameter + - Added `self.runtime = runtime` to store runtime object for environment interaction + - Removed `self.use_proxy` attribute + +- **Modified `setup()` method:** + - Replaced HTTP call to `/terminal` endpoint with `OSWorldInteractiveAction`: + ```python + action = OSWorldInteractiveAction( + method='get_terminal_output', + params={} + ) + observation = self.runtime.run_action(action) + ``` + - **Returns False** if runtime is not available (runtime is now **required**) + +- **Modified `_execute_setup()` method:** + - Replaced HTTP call to `/setup/execute` endpoint with `OSWorldInteractiveAction`: + ```python + action = OSWorldInteractiveAction( + method='run_bash_script', + params={'script': script, 'timeout': 120} + ) + observation = self.runtime.run_action(action) + ``` + - **Raises Exception** if runtime is not available (runtime is now **required**) + +- **Modified `_update_browse_history_setup()` method:** + - Replaced `controller = PythonController(self.vm_ip, self.server_port)` with `OSWorldInteractiveAction` and runtime + - Changed `controller.get_vm_platform()` to: + ```python + action = OSWorldInteractiveAction( + method='run_python_script', + params={'script': 'import platform; print(platform.system())'} + ) + observation = self.runtime.run_action(action) + ``` + - Changed all `controller.execute_python_command()` calls to use `OSWorldInteractiveAction` with `run_python_script` method + - Added proper error handling checking `observation.exit_code` and `observation.content` + - Added runtime validation check at the beginning of the method + +- **Added new import:** + - `from openhands.events.action.os import OSWorldInteractiveAction` + +- **HTTP endpoints kept as-is (not standard OSWorld methods):** + - `/setup/upload` - File upload endpoint + - `/setup/change_wallpaper` - Change desktop wallpaper + - 
`/setup/open_file` - Open file with default application + - `/setup/launch` - Launch applications + - `/setup/execute_with_verification` - Execute command with verification + - `/setup/activate_window` - Activate window by name + - `/setup/close_window` - Close window by name + + These endpoints are setup-specific and have no corresponding methods in OSWorldInteractiveAction, so they remain as HTTP requests. However, all methods now check for runtime availability at the beginning and raise an Exception if runtime is None. + +- **All setup methods now validate runtime:** + - `_download_setup()` - Checks runtime at start + - `_upload_file_setup()` - Checks runtime at start + - `_change_wallpaper_setup()` - Checks runtime at start + - `_open_setup()` - Checks runtime at start + - `_launch_setup()` - Checks runtime at start + - `_execute_setup()` - Checks runtime at start (already had this) + - `_execute_with_verification_setup()` - Checks runtime at start + - `_activate_window_setup()` - Checks runtime at start + - `_close_window_setup()` - Checks runtime at start + - `_chrome_open_tabs_setup()` - Checks runtime at start + - `_chrome_close_tabs_setup()` - Checks runtime at start + - `_googledrive_setup()` - Checks runtime at start + - `_login_setup()` - Checks runtime at start + - `_update_browse_history_setup()` - Checks runtime at start (already had this) + +### 3. 
Copied Necessary Functions from desktop_env + +- **Added `compare_urls()` function:** + - Copied from `desktop_env.evaluators.metrics.utils` + - Includes helper functions: `parse_with_default_scheme()` and `normalize_url()` + - Added necessary imports: `re`, `urllib.parse` components, and `tldextract` + - Properly documented as copied from desktop_env + +- **Removed imports:** + - `from desktop_env.controllers.python import PythonController` + - `from desktop_env.evaluators.metrics.utils import compare_urls` (function now defined locally) + +## New Dependencies +- Added `re` import for URL parsing +- Added `from urllib.parse import urlparse, urlunparse, ParseResult` for URL manipulation +- Added `tldextract` import for domain extraction +- Added `from openhands.events.action.os import OSWorldInteractiveAction` for runtime actions + +## Usage Changes + +### Before: +```python +# Initialize without runtime +setup_controller = SetupController( + vm_ip="192.168.1.1", + server_port=5000, + cache_dir="cache" +) + +# Setup with proxy support +setup_controller.setup(config, use_proxy=True) +``` + +### After: +```python +# Initialize with runtime +setup_controller = SetupController( + vm_ip="192.168.1.1", + server_port=5000, + cache_dir="cache", + runtime=runtime_object # Pass your runtime object here +) + +# Setup without proxy parameter +setup_controller.setup(config) +``` + +## Backward Compatibility Notes + +1. 
**Breaking changes:** + - `setup()` method no longer accepts `use_proxy` parameter + - `SetupController.__init__()` now **REQUIRES** `runtime` parameter - will fail without it + - `_proxy_setup` setup type is no longer supported in config + - **ALL setup methods now require runtime:** Every `_*_setup()` method validates that runtime is not None at the beginning and raises an Exception if it is + - **NO HTTP FALLBACK:** All methods that previously had HTTP fallbacks now require runtime + - **Setup methods that still use HTTP endpoints** (like `/setup/upload`, `/setup/launch`, etc.) now also require runtime to be set, even though they don't directly use runtime.run_action() + +2. **Runtime requirement:** + - The runtime object must implement `run_action(action: OSWorldInteractiveAction)` method + - The method should return an observation object with: + - `content`: The output/result as a string + - `exit_code`: The exit code (0 for success, non-zero for errors) + - Example usage: + ```python + from openhands.events.action.os import OSWorldInteractiveAction + + action = OSWorldInteractiveAction( + method='run_python_script', + params={'script': 'import platform; print(platform.system())'} + ) + observation = runtime.run_action(action) + if observation.exit_code == 0: + print(f"Result: {observation.content}") + else: + print(f"Error: {observation.content}") + ``` + +## Complete Example + +Here's a complete example showing how to use the updated SetupController: + +```python +from openhands.events.action.os import OSWorldInteractiveAction +from setup import SetupController + +# Assume you have a runtime object that implements run_action() +# For example: runtime = OSWorldSingularityRuntime(...) 
+ +# Initialize SetupController with runtime +setup_controller = SetupController( + vm_ip="192.168.1.1", + server_port=5000, + cache_dir="cache", + runtime=runtime # Pass your runtime object +) + +# Setup configuration (no proxy support) +config = [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://example.com/file.zip", + "path": "/home/user/Downloads/file.zip" + } + ] + } + }, + { + "type": "launch", + "parameters": { + "command": ["google-chrome"] + } + } +] + +# Run setup (no use_proxy parameter) +success = setup_controller.setup(config) + +if success: + print("Setup completed successfully!") +else: + print("Setup failed!") +``` + +## Testing Recommendations + +1. Test all setup types without proxy functionality +2. Verify `_update_browse_history_setup` works with runtime object +3. Ensure `compare_urls` function works correctly for Chrome setup operations +4. Test runtime integration with various environment types +5. Verify that `OSWorldInteractiveAction` is correctly created and handled by runtime +6. **Test that SetupController fails gracefully when runtime is None** +7. 
**Verify error messages are clear when runtime is missing** + +## Files Modified + +- `/home/jayliu/ProRL-Agent-Server/examples/setup.py` + +## Files Not Modified + +- Original file at `/home/jayliu/OSWorld/desktop_env/controllers/setup.py` remains unchanged + diff --git a/examples/setup.py b/examples/setup.py new file mode 100644 index 000000000..cf71ffcb7 --- /dev/null +++ b/examples/setup.py @@ -0,0 +1,1118 @@ +import asyncio +import inspect +import json +import logging +import os +import os.path +import platform +import re +import shutil +import sqlite3 +import tempfile +import time +import traceback +import uuid +from datetime import datetime, timedelta +from typing import Any, Union, Optional +from typing import Dict, List +from urllib.parse import urlparse, urlunparse, ParseResult + +import requests +from playwright.async_api import async_playwright, TimeoutError +from pydrive.auth import GoogleAuth +from pydrive.drive import GoogleDrive, GoogleDriveFile, GoogleDriveFileList +from requests_toolbelt.multipart.encoder import MultipartEncoder +import tldextract + +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.core.logger import openhands_logger as logger +from openhands.runtime.impl.singularity.osworld_singularity_runtime import OSWorldSingularityRuntime, OSWORLD_CHROMIUM_PORT_RANGE +from openhands.runtime.utils import find_available_tcp_port + +import dotenv +# Load environment variables from .env file +dotenv.load_dotenv() + +FILE_PATH = os.path.dirname(os.path.abspath(__file__)) + +MAX_RETRIES = 20 + + +# Utility function copied from desktop_env.evaluators.metrics.utils +def compare_urls(url1, url2, full=True): + """ + Compare two URLs for equality, handling various normalization cases. + Copied from desktop_env.evaluators.metrics.utils + """ + if url1 is None or url2 is None: + return url1 == url2 + + logger.info(f"compare_urls. 
url1: {url1}; url2: {url2}") + + def parse_with_default_scheme(url): + """ + Ensure the URL has a scheme. If not, prepend 'http://' + so it parses as host + path instead of just a path. + """ + # Regex to check if URL has scheme like 'http://', 'https://', etc. + if not re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url): + url = f"http://{url}" + return urlparse(url) + + def normalize_url(url): + # Parse the URL; if no scheme is present, assume 'http' + parsed_url = parse_with_default_scheme(url) + scheme = parsed_url.scheme.lower() + + # Extract the domain parts using tldextract + extracted = tldextract.extract(parsed_url.netloc.lower()) + + # Drop 'www' if it's the only subdomain + subdomain = extracted.subdomain + if subdomain == 'www': + subdomain = '' + + # Normalize the domain + if subdomain: + normalized_netloc = f"{subdomain}.{extracted.domain}" + else: + normalized_netloc = extracted.domain + + # Handle trailing slash in the path + normalized_path = parsed_url.path if parsed_url.path != '/' else '' + + # Reassemble the URL with the normalized components + normalized_parsed_url = ParseResult( + scheme=scheme.lower(), + netloc=normalized_netloc, + path=normalized_path, + params=parsed_url.params if full else '', + query=parsed_url.query if full else '', + fragment=parsed_url.fragment if full else '', + ) + return urlunparse(normalized_parsed_url) + + logger.info(f"After normalization. 
url1: {normalize_url(url1)}; url2: {normalize_url(url2)}") + # Normalize both URLs + norm_url1 = normalize_url(url1) + norm_url2 = normalize_url(url2) + + # Compare the normalized URLs + return norm_url1 == norm_url2 + +class SetupController: + def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080, runtime=None): + self.vm_ip: str = vm_ip + self.server_port: int = server_port + self.chromium_port: int = chromium_port + self.vlc_port: int = vlc_port + self.http_server: str = f"http://{vm_ip}:{server_port}" + self.http_server_setup_root: str = f"http://{vm_ip}:{server_port}/setup" + self.cache_dir: str = cache_dir + self.client_password: str = client_password + self.screen_width: int = screen_width + self.screen_height: int = screen_height + self.runtime = runtime # Runtime object for interacting with the environment + + def reset_cache_dir(self, cache_dir: str): + self.cache_dir = cache_dir + + async def setup(self, config: List[Dict[str, Any]])-> bool: + """ + Args: + config (List[Dict[str, Any]]): list of dict like {str: Any}. each + config dict has the structure like + { + "type": str, corresponding to the `_{:}_setup` methods of + this class + "parameters": dict like {str, Any} providing the keyword + parameters + } + """ + # Runtime is required for setup + if not self.runtime: + logger.error("Runtime is required for SetupController. 
Please provide a runtime object.") + return False + + # Make sure connection can be established + logger.info(f"Checking runtime connection...") + retry = 0 + while retry < MAX_RETRIES: + try: + action = OSWorldInteractiveAction( + method='get_terminal_output', + params={} + ) + observation = self.runtime.run_action(action) + if observation.exit_code == 0: + logger.info("Runtime connection established successfully") + break + except: + pass + time.sleep(5) + retry += 1 + logger.info(f"retry: {retry}/{MAX_RETRIES}") + + if retry == MAX_RETRIES: + logger.error("Failed to establish runtime connection after maximum retries") + return False + + + for i, cfg in enumerate(config): + config_type: str = cfg["type"] + parameters: Dict[str, Any] = cfg["parameters"] + + # Assumes all the setup the functions should follow this name + # protocol + setup_function: str = "_{:}_setup".format(config_type) + assert hasattr(self, setup_function), f'Setup controller cannot find init function {setup_function}' + + try: + logger.info(f"Executing setup step {i+1}/{len(config)}: {setup_function}") + logger.debug(f"Setup parameters: {parameters}") + + # Get the setup function + func = getattr(self, setup_function) + + # Check if it's a coroutine function and await if necessary + if inspect.iscoroutinefunction(func): + await func(**parameters) + else: + func(**parameters) + + logger.info(f"SETUP COMPLETED: {setup_function}({str(parameters)})") + except Exception as e: + logger.error(f"SETUP FAILED at step {i+1}/{len(config)}: {setup_function}({str(parameters)})") + logger.error(f"Error details: {e}") + logger.error(f"Traceback: {traceback.format_exc()}") + raise Exception(f"Setup step {i+1} failed: {setup_function} - {e}") from e + + return True + + def _download_setup(self, files: List[Dict[str, str]]): + """ + Args: + files (List[Dict[str, str]]): files to download. 
list of dict like
+            {
+                "url": str, the url to download
+                "path": str, the path on the VM to store the downloaded file
+            }
+        """
+        if not self.runtime:
+            raise Exception("Runtime is required for SetupController. Please provide a runtime object.")
+
+        for f in files:
+            url: str = f["url"]
+            path: str = f["path"]
+            cache_path: str = os.path.join(self.cache_dir, "{:}_{:}".format(
+                uuid.uuid5(uuid.NAMESPACE_URL, url),
+                os.path.basename(path)))
+            if not url or not path:
+                raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).")
+
+            if not os.path.exists(cache_path):
+                logger.info(f"Cache file not found, downloading from {url} to {cache_path}")
+                max_retries = 3
+                downloaded = False
+                e = None
+                for i in range(max_retries):
+                    try:
+                        logger.info(f"Download attempt {i+1}/{max_retries} for {url}")
+                        response = requests.get(url, stream=True, timeout=300)  # Add 5 minute timeout
+                        response.raise_for_status()
+
+                        # Get file size if available
+                        total_size = int(response.headers.get('content-length', 0))
+                        if total_size > 0:
+                            logger.info(f"File size: {total_size / (1024*1024):.2f} MB")
+
+                        downloaded_size = 0
+                        with open(cache_path, 'wb') as f:
+                            for chunk in response.iter_content(chunk_size=8192):
+                                if chunk:
+                                    f.write(chunk)
+                                    downloaded_size += len(chunk)
+                                    if total_size > 0 and downloaded_size % (1024*1024) == 0:  # Log every MB
+                                        progress = (downloaded_size / total_size) * 100
+                                        logger.info(f"Download progress: {progress:.1f}%")
+
+                        logger.info(f"File downloaded successfully to {cache_path} ({downloaded_size / (1024*1024):.2f} MB)")
+                        downloaded = True
+                        break
+
+                    except requests.RequestException as e:
+                        logger.error(
+                            f"Failed to download {url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)")
+                        # Clean up partial download
+                        if os.path.exists(cache_path):
+                            os.remove(cache_path)
+                if not downloaded:
+                    raise requests.RequestException(f"Failed to download {url}. 
No retries left.") + + + self._upload_file_setup([{ + "local_path": cache_path, + "path": path + }]) + + def _upload_file_setup(self, files: List[Dict[str, str]]): + """ + Args: + files (List[Dict[str, str]]): files to download. lisf of dict like + { + "local_path": str, the local path to the file to upload + "path": str, the path on the VM to store the downloaded file + } + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + for f in files: + local_path: str = f["local_path"] + path: str = f["path"] + + if not os.path.exists(local_path): + raise Exception(f"Setup Upload - Invalid local path ({local_path}).") + + file_size = None + try: + file_size = os.path.getsize(local_path) + except Exception: + pass + + max_retries = 3 + last_error: Optional[Exception] = None + + for attempt in range(max_retries): + try: + logger.info( + f"Uploading {os.path.basename(local_path)}{f' ({file_size} bytes)' if file_size is not None else ''} " + f"to VM at {path} (attempt {attempt + 1}/{max_retries})" + ) + logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload") + + # Open the file inside each attempt to ensure fresh stream position + with open(local_path, "rb") as fp: + form = MultipartEncoder({ + "file_path": path, + "file_data": (os.path.basename(path), fp) + }) + headers = {"Content-Type": form.content_type} + logger.debug(form.content_type) + + # Explicit connect/read timeout to avoid hanging forever + response = requests.post( + self.http_server + "/setup" + "/upload", + headers=headers, + data=form, + timeout=(10, 600) + ) + + if response.status_code == 200: + logger.info(f"File uploaded successfully: {path}") + logger.debug("Upload response: %s", response.text) + last_error = None + break + else: + msg = f"Failed to upload file {path}. 
Status code: {response.status_code}, Response: {response.text}" + logger.error(msg) + last_error = requests.RequestException(msg) + + except requests.exceptions.RequestException as e: + last_error = e + logger.error(f"Upload attempt {attempt + 1} failed for {path}: {e}") + + # Exponential backoff between retries + if attempt < max_retries - 1: + time.sleep(2 ** attempt) + + if last_error is not None: + raise last_error + + def _change_wallpaper_setup(self, path: str): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + if not path: + raise Exception(f"Setup Wallpaper - Invalid path ({path}).") + + payload = json.dumps({"path": path}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to change wallpaper + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/change_wallpaper", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error("Failed to change wallpaper. Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + def _tidy_desktop_setup(self, **config): + raise NotImplementedError() + + def _open_setup(self, path: str): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + if not path: + raise Exception(f"Setup Open - Invalid path ({path}).") + + payload = json.dumps({"path": path}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to open file + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + # The server-side call is now blocking and can take time. + # We set a timeout that is slightly longer than the server's timeout (1800s). 
+ response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810) + response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes + logger.info("Command executed successfully: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") + raise Exception(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") from e + + def _ensure_launch_command_finish(self, command): + additional_wait_time = 3 + if isinstance(command, list): + if len(command) < 1: + return True + original_command = command.copy() + command = ' '.join(command) + else: + original_command = [command] + + def get_last_observation(observation_content): + lines = observation_content.split('\n')[::-1] + for line in lines: + if len(line) > 0: + return line + return '' + + if 'gimp' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if '– GIMP' in get_last_observation(observation.content): + time.sleep(additional_wait_time) + return True + elif 'Convert to RGB Working Space?' in get_last_observation(observation.content): + # Weird error message, so we assume the command is successful + # TODO: FIX THIS IN QEMU image. 
+ time.sleep(additional_wait_time) + return True + elif len(original_command) < 2 and 'GNU Image Manipulation Program' in get_last_observation(observation.content): + # Here no file name is provided, so we simply expect window to launch + return True + elif 'yicun.raw' in command and 'GIMP Message' in get_last_observation(observation.content): + # special case for dbbf4b99-2253-4b10-9274-45f246af2466.json + time.sleep(additional_wait_time) + return True + return False + + elif 'google-chrome' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if '- Google Chrome' in get_last_observation(observation.content): + return True + return False + + elif 'libreoffice --writer' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if '- LibreOffice Writer' in get_last_observation(observation.content): + files = original_command[-1].split('/') + if len(files) > 0 and files[-1].endswith('.docx'): + if files[-1] in get_last_observation(observation.content): + return True + else: + return False + else: + # No file name is provided, so we assume the command is successful + return True + return False + + elif 'libreoffice --calc' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if '- LibreOffice Calc' in get_last_observation(observation.content): + files = original_command[-1].split('/') + if len(files) > 0 and files[-1].endswith('.xlsx'): + if files[-1] in get_last_observation(observation.content): + return True + else: + return False + else: + # No file name is provided, so we assume the command is successful + return True + return False + + elif 'libreoffice --impress' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl 
-l'}) + observation = self.runtime.run_action(action) + if '- LibreOffice Impress' in get_last_observation(observation.content): + files = original_command[-1].split('/') + if len(files) > 0 and files[-1].endswith('.pptx'): + if files[-1] in get_last_observation(observation.content): + return True + else: + return False + else: + # No file name is provided, so we assume the command is successful + return True + return False + + elif 'nautilus' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + dir_path = [p for p in original_command[-1].split('/') if p][-1] + if dir_path in get_last_observation(observation.content): + return True + return False + + elif 'vlc' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if '- VLC media player' in get_last_observation(observation.content): + return True + return False + + elif 'thunderbird' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if 'Thunderbird' in get_last_observation(observation.content): + return True + return False + + elif 'code' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if 'Visual Studio Code' in get_last_observation(observation.content): + return True + return False + + return True + + def _launch_setup(self, command: Union[str, List[str]], shell: bool = False): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + if not command: + raise Exception("Empty command to launch.") + + if not shell and isinstance(command, str) and len(command.split()) > 1: + logger.warning("Command should be a list of strings. 
Now it is a string. Will split it by space.") + command = command.split() + + payload = json.dumps({"command": command, "shell": shell}) + headers = {"Content-Type": "application/json"} + + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + logger.info("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/launch") + response = requests.post(self.http_server + "/setup" + "/launch", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error("Failed to launch application. Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + ensure_launch_retries = 0 + while not self._ensure_launch_command_finish(command): + logger.info("Waiting for command to finish...") + time.sleep(5) + ensure_launch_retries += 1 + if ensure_launch_retries >= 10: + raise Exception("Failed to ensure launch command finish after multiple retries") + + def _execute_setup( + self, + command: List[str], + stdout: str = "", + stderr: str = "", + shell: bool = False, + until: Optional[Dict[str, Any]] = None + ): + if not command: + raise Exception("Empty command to launch.") + + until: Dict[str, Any] = until or {} + terminates: bool = False + nb_failings = 0 + + def replace_screen_env_in_command(command): + password = self.client_password + width = self.screen_width + height = self.screen_height + width_half = str(width // 2) + height_half = str(height // 2) + new_command_list = [] + new_command = "" + if isinstance(command, str): + new_command = command.replace("{CLIENT_PASSWORD}", password) + new_command = new_command.replace("{SCREEN_WIDTH_HALF}", width_half) + new_command = new_command.replace("{SCREEN_HEIGHT_HALF}", height_half) + new_command = new_command.replace("{SCREEN_WIDTH}", str(width)) + new_command = new_command.replace("{SCREEN_HEIGHT}", str(height)) + 
return new_command + else: + for item in command: + item = item.replace("{CLIENT_PASSWORD}", password) + item = item.replace("{SCREEN_WIDTH_HALF}", width_half) + item = item.replace("{SCREEN_HEIGHT_HALF}", height_half) + item = item.replace("{SCREEN_WIDTH}", str(width)) + item = item.replace("{SCREEN_HEIGHT}", str(height)) + new_command_list.append(item) + return new_command_list + command = replace_screen_env_in_command(command) + payload = json.dumps({"command": command, "shell": shell}) + headers = {"Content-Type": "application/json"} + # Runtime is required + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + # Execute using runtime + while not terminates: + try: + response = requests.post(self.http_server + "/setup" + "/execute", headers=headers, data=payload) + if response.status_code == 200: + results: Dict[str, str] = response.json() + if stdout: + with open(os.path.join(self.cache_dir, stdout), "w") as f: + f.write(results["output"]) + if stderr: + with open(os.path.join(self.cache_dir, stderr), "w") as f: + f.write(results["error"]) + logger.info("Command executed successfully: %s -> %s" + , " ".join(command) if isinstance(command, list) else command + , response.text + ) + else: + logger.error("Failed to launch application. 
Status code: %s", response.text) + results = None + nb_failings += 1 + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + traceback.print_exc() + + results = None + nb_failings += 1 + + if len(until) == 0: + terminates = True + elif results is not None: + terminates = "returncode" in until and results["returncode"] == until["returncode"] \ + or "stdout" in until and until["stdout"] in results["output"] \ + or "stderr" in until and until["stderr"] in results["error"] + terminates = terminates or nb_failings >= 5 + if not terminates: + time.sleep(0.3) + + def _execute_with_verification_setup( + self, + command: List[str], + verification: Dict[str, Any] = None, + max_wait_time: int = 10, + check_interval: float = 1.0, + shell: bool = False + ): + """Execute command with verification of results + + Args: + command: Command to execute + verification: Dict with verification criteria: + - window_exists: Check if window with this name exists + - command_success: Execute this command and check if it succeeds + max_wait_time: Maximum time to wait for verification + check_interval: Time between verification checks + shell: Whether to use shell + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") + + if not command: + raise Exception("Empty command to launch.") + + verification = verification or {} + + payload = json.dumps({ + "command": command, + "shell": shell, + "verification": verification, + "max_wait_time": max_wait_time, + "check_interval": check_interval + }) + headers = {"Content-Type": "application/json"} + + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/execute_with_verification", + headers=headers, data=payload, timeout=max_wait_time + 10) + if response.status_code == 200: + result = response.json() + logger.info("Command executed and verified successfully: %s -> %s" + , " ".join(command) if isinstance(command, list) else command + , response.text + ) + return result + else: + logger.error("Failed to execute with verification. Status code: %s", response.text) + raise Exception(f"Command verification failed: {response.text}") + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + traceback.print_exc() + raise Exception(f"Request failed: {e}") + + def _command_setup(self, command: List[str], **kwargs): + self._execute_setup(command, **kwargs) + + def _sleep_setup(self, seconds: float): + time.sleep(seconds) + + def _act_setup(self, action_seq: List[Union[Dict[str, Any], str]]): + # TODO + raise NotImplementedError() + + def _replay_setup(self, trajectory: str): + """ + Args: + trajectory (str): path to the replay trajectory file + """ + + # TODO + raise NotImplementedError() + + def _activate_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False): + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") + + if not window_name: + raise Exception(f"Setup Open - Invalid path ({window_name}).") + + payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to open file + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error(f"Failed to activate window {window_name}. Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + def _close_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + if not window_name: + raise Exception(f"Setup Open - Invalid path ({window_name}).") + + payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to open file + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/close_window", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error(f"Failed to close window {window_name}. 
Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + # Chrome setup + async def _chrome_open_tabs_setup(self, urls_to_open: List[str]): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + host = self.vm_ip + port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file + + remote_debugging_url = f"http://{host}:{port}" + logger.info("Connect to Chrome @: %s", remote_debugging_url) + logger.debug("PLAYWRIGHT ENV: %s", repr(os.environ)) + for attempt in range(15): + if attempt > 0: + time.sleep(5) + + browser = None + async with async_playwright() as p: + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + # break + except Exception as e: + if attempt < 14: + logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}") + # time.sleep(10) + continue + else: + logger.error(f"Failed to connect after multiple attempts: {e}") + raise e + + if not browser: + return + + logger.info("Opening %s...", urls_to_open) + for i, url in enumerate(urls_to_open): + # Use the first context (which should be the only one if using default profile) + if i == 0: + context = browser.contexts[0] + + page = await context.new_page() # Create a new page (tab) within the existing context + try: + await page.goto(url, timeout=60000) + except: + logger.warning("Opening %s exceeds time limit", url) # only for human test + logger.info(f"Opened tab {i + 1}: {url}") + + if i == 0: + # clear the default tab + default_page = context.pages[0] + await default_page.close() + + # Do not close the context or browser; they will remain open after script ends + return browser, context + + async def _chrome_close_tabs_setup(self, urls_to_close: List[str]): + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") + + time.sleep(5) # Wait for Chrome to finish launching + + host = self.vm_ip + port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file + + remote_debugging_url = f"http://{host}:{port}" + async with async_playwright() as p: + browser = None + for attempt in range(15): + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + break + except Exception as e: + if attempt < 14: + logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}") + time.sleep(5) + else: + logger.error(f"Failed to connect after multiple attempts: {e}") + raise e + + if not browser: + return + + for i, url in enumerate(urls_to_close): + # Use the first context (which should be the only one if using default profile) + if i == 0: + context = browser.contexts[0] + + for page in context.pages: + + # if two urls are the same, close the tab + if compare_urls(page.url, url): + context.pages.pop(context.pages.index(page)) + await page.close() + logger.info(f"Closed tab {i + 1}: {url}") + break + + # Do not close the context or browser; they will remain open after script ends + return browser, context + + # google drive setup + def _googledrive_setup(self, **config): + """ Clean google drive space (eliminate the impact of previous experiments to reset the environment) + @args: + config(Dict[str, Any]): contain keys + settings_file(str): path to google drive settings file, which will be loaded by pydrive.auth.GoogleAuth() + operation(List[str]): each operation is chosen from ['delete', 'upload'] + args(List[Dict[str, Any]]): parameters for each operation + different args dict for different operations: + for delete: + query(str): query pattern string to search files or folder in google drive to delete, please refer to + https://developers.google.com/drive/api/guides/search-files?hl=en about how to write query string. + trash(bool): whether to delete files permanently or move to trash. 
By default, trash=false, completely delete it. + for mkdirs: + path(List[str]): the path in the google drive to create folder + for upload: + path(str): remote url to download file + dest(List[str]): the path in the google drive to store the downloaded file + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + settings_file = config.get('settings_file', 'evaluation_examples/settings/googledrive/settings.yml') + gauth = GoogleAuth(settings_file=settings_file) + drive = GoogleDrive(gauth) + + def mkdir_in_googledrive(paths: List[str]): + paths = [paths] if type(paths) != list else paths + parent_id = 'root' + for p in paths: + q = f'"{parent_id}" in parents and title = "{p}" and mimeType = "application/vnd.google-apps.folder" and trashed = false' + folder = drive.ListFile({'q': q}).GetList() + if len(folder) == 0: # not exists, create it + parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]} + file = drive.CreateFile({'title': p, 'mimeType': 'application/vnd.google-apps.folder', **parents}) + file.Upload() + parent_id = file['id'] + else: + parent_id = folder[0]['id'] + return parent_id + + for oid, operation in enumerate(config['operation']): + if operation == 'delete': # delete a specific file + # query pattern string, by default, remove all files/folders not in the trash to the trash + params = config['args'][oid] + q = params.get('query', '') + trash = params.get('trash', False) + q_file = f"( {q} ) and mimeType != 'application/vnd.google-apps.folder'" if q.strip() else "mimeType != 'application/vnd.google-apps.folder'" + filelist: GoogleDriveFileList = drive.ListFile({'q': q_file}).GetList() + q_folder = f"( {q} ) and mimeType = 'application/vnd.google-apps.folder'" if q.strip() else "mimeType = 'application/vnd.google-apps.folder'" + folderlist: GoogleDriveFileList = drive.ListFile({'q': q_folder}).GetList() + for file in filelist: # first delete file, then folder + 
file: GoogleDriveFile
+                    if trash:
+                        file.Trash()
+                    else:
+                        file.Delete()
+                for folder in folderlist:
+                    folder: GoogleDriveFile
+                    # note that, if a folder is trashed/deleted, all files and folders in it will be trashed/deleted
+                    if trash:
+                        folder.Trash()
+                    else:
+                        folder.Delete()
+            elif operation == 'mkdirs':
+                params = config['args'][oid]
+                mkdir_in_googledrive(params['path'])
+            elif operation == 'upload':
+                params = config['args'][oid]
+                url = params['url']
+                with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmpf:
+                    response = requests.get(url, stream=True)
+                    response.raise_for_status()
+                    for chunk in response.iter_content(chunk_size=8192):
+                        if chunk:
+                            tmpf.write(chunk)
+                    tmpf.close()
+                paths = [params['path']] if type(params['path']) != list else params['path']
+                parent_id = mkdir_in_googledrive(paths[:-1])
+                parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]}
+                file = drive.CreateFile({'title': paths[-1], **parents})
+                file.SetContentFile(tmpf.name)
+                file.Upload()
+                return  # NOTE(review): returns after the first upload operation; later operations in config are skipped — confirm intended
+            else:
+                raise ValueError('[ERROR]: not implemented clean type!')
+
+    async def _login_setup(self, **config):
+        """ Login to a website with account and password information.
+        @args:
+            config(Dict[str, Any]): contain keys
+                settings_file(str): path to the settings file
+                platform(str): platform to login, implemented platforms include:
+                    googledrive: https://drive.google.com/drive/my-drive
+
+        """
+        if not self.runtime:
+            raise Exception("Runtime is required for SetupController. Please provide a runtime object.")
+
+        host = self.vm_ip
+        port = self.chromium_port
+
+        remote_debugging_url = f"http://{host}:{port}"
+        async with async_playwright() as p:
+            browser = None
+            for attempt in range(15):
+                try:
+                    browser = await p.chromium.connect_over_cdp(remote_debugging_url)
+                    break
+                except Exception as e:
+                    if attempt < 14:
+                        logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. 
Error: {e}") + time.sleep(5) + else: + logger.error(f"Failed to connect after multiple attempts: {e}") + raise e + if not browser: + return + + context = browser.contexts[0] + platform = config['platform'] + + if platform == 'googledrive': + url = 'https://drive.google.com/drive/my-drive' + page = await context.new_page() # Create a new page (tab) within the existing context + try: + await page.goto(url, timeout=60000) + except: + logger.warning("Opening %s exceeds time limit", url) # only for human test + logger.info(f"Opened new page: {url}") + settings = json.load(open(config['settings_file'])) + email, password = settings['email'], settings['password'] + + try: + await page.wait_for_selector('input[type="email"]', state="visible", timeout=3000) + await page.fill('input[type="email"]', email) + await page.click('#identifierNext > div > button') + await page.wait_for_selector('input[type="password"]', state="visible", timeout=5000) + await page.fill('input[type="password"]', password) + await page.click('#passwordNext > div > button') + await page.wait_for_load_state('load', timeout=5000) + except TimeoutError: + logger.info('[ERROR]: timeout when waiting for google drive login page to load!') + return + + else: + raise NotImplementedError + + return browser, context + + def _execute_python_command(self, command: str): + """ + Taken from OSWorld/desktop_env/controllers/python.py + Executes a python command on the server. + It can be used to execute the pyautogui commands, or... any other python command. who knows? 
+ """ + pkgs_prefix = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}" + command_list = ["python", "-c", pkgs_prefix.format(command=command)] + payload = json.dumps({"command": command_list, "shell": False}) + + for _ in range(3): + try: + response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'}, + data=payload, timeout=90) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + return response.json() + else: + logger.error("Failed to execute command. Status code: %d", response.status_code) + logger.info("Retrying to execute command.") + except requests.exceptions.ReadTimeout: + break + except Exception as e: + logger.error("An error occurred while trying to execute the command: %s", e) + logger.info("Retrying to execute command.") + time.sleep(5) + + logger.error("Failed to execute command.") + return None + def _update_browse_history_setup(self, **config): + cache_path = os.path.join(self.cache_dir, "history_new.sqlite") + db_url = "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938/history_empty.sqlite?download=true" + if not os.path.exists(cache_path): + max_retries = 3 + downloaded = False + e = None + for i in range(max_retries): + try: + response = requests.get(db_url, stream=True) + response.raise_for_status() + + with open(cache_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + logger.info("File downloaded successfully") + downloaded = True + break + + except requests.RequestException as e: + logger.error( + f"Failed to download {db_url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)") + if not downloaded: + raise requests.RequestException(f"Failed to download {db_url}. No retries left. 
Error: {e}") + else: + logger.info("File already exists in cache directory") + # copy a new history file in the tmp folder + with tempfile.TemporaryDirectory() as tmp_dir: + db_path = os.path.join(tmp_dir, "history_empty.sqlite") + shutil.copy(cache_path, db_path) + + history = config['history'] + + for history_item in history: + url = history_item['url'] + title = history_item['title'] + visit_time = datetime.now() - timedelta(seconds=history_item['visit_time_from_now_in_seconds']) + + # Chrome use ms from 1601-01-01 as timestamp + epoch_start = datetime(1601, 1, 1) + chrome_timestamp = int((visit_time - epoch_start).total_seconds() * 1000000) + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + cursor.execute(''' + INSERT INTO urls (url, title, visit_count, typed_count, last_visit_time, hidden) + VALUES (?, ?, ?, ?, ?, ?) + ''', (url, title, 1, 0, chrome_timestamp, 0)) + + url_id = cursor.lastrowid + + cursor.execute(''' + INSERT INTO visits (url, visit_time, from_visit, transition, segment_id, visit_duration) + VALUES (?, ?, ?, ?, ?, ?) 
+ ''', (url_id, chrome_timestamp, 0, 805306368, 0, 0)) + + conn.commit() + conn.close() + + logger.info('Fake browsing history added successfully.') + + # get the path of the history file according to the platform + os_type = self.runtime.os_type + + if os_type == 'windows': + chrome_history_path = self._execute_python_command( + """import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[ + 'output'].strip() + elif os_type == 'darwin': + chrome_history_path = self._execute_python_command( + """import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[ + 'output'].strip() + elif os_type == 'linux': + if "arm" in platform.machine(): + chrome_history_path = self._execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[ + 'output'].strip() + else: + chrome_history_path = self._execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[ + 'output'].strip() + else: + raise Exception('Unsupported operating system') + + form = MultipartEncoder({ + "file_path": chrome_history_path, + "file_data": (os.path.basename(chrome_history_path), open(db_path, "rb")) + }) + headers = {"Content-Type": form.content_type} + logger.debug(form.content_type) + + # send request to server to upload file + try: + logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload") + response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error("Failed to upload file. 
Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True) diff --git a/examples/test.py b/examples/test.py new file mode 100755 index 000000000..821e55f92 --- /dev/null +++ b/examples/test.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating OSWorld runtime usage with OSWorldInteractiveAction. + +This script shows how to: +1. Configure and create an OSWorld runtime +2. Connect to a VM +3. Use OSWorldInteractiveAction to interact with the VM +4. Perform GUI actions (click, type, press keys) +5. Take screenshots +6. Get VM information (platform, screen size) +7. Clean up resources + +Prerequisites: +- Singularity/Apptainer installed +- QEMU and OVMF installed +- VM image at ./OS_images/Ubuntu.qcow2 with OSWorld server +""" + +import asyncio +import sys +from pathlib import Path + +from openhands.core.config import OpenHandsConfig +from openhands.core.logger import openhands_logger as logger +from openhands.events import EventStream +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.runtime.impl.singularity.osworld_singularity_runtime import ( + OSWorldSingularityRuntime, +) +from openhands.storage import get_file_store +from examples.setup import SetupController + + +async def main(): + """Main example function.""" + + print("=" * 80) + print("OSWorld Runtime Example") + print("=" * 80) + print() + + # 1. Create configuration + print("1. 
Creating configuration...") + config = OpenHandsConfig() + config.runtime = 'osworld' + config.sandbox.base_container_image = 'ubuntu:24.04' + config.sandbox.run_as_fakeroot = True + + # Check if VM image exists + vm_image_path = './OS_images/Ubuntu.qcow2' + if not Path(vm_image_path).exists(): + print(f"ERROR: VM image not found at {vm_image_path}") + print("Please place your Ubuntu VM image with OSWorld server at this location.") + return 1 + + print(f"✓ Configuration created") + print(f" Runtime: {config.runtime}") + print(f" Base image: {config.sandbox.base_container_image}") + print(f" VM image: {vm_image_path}") + print() + + # 2. Create event stream + print("2. Creating event stream...") + file_store = get_file_store('local', '/tmp/osworld_example') + event_stream = EventStream(sid='osworld-example', file_store=file_store) + print("✓ Event stream created") + print() + + # 3. Create OSWorld runtime + print("3. Creating OSWorld runtime...") + runtime = OSWorldSingularityRuntime( + config=config, + event_stream=event_stream, + sid='osworld-example', + os_type='linux', + vm_image_path=vm_image_path, + attach_to_existing=False, + ) + print("✓ Runtime created") + print(f" OS Type: {runtime.os_type}") + print() + + try: + # 4. Connect to runtime (starts VM) + print("4. Connecting to runtime (this may take 1-2 minutes)...") + print(" - Building Singularity image (if needed)") + print(" - Starting QEMU VM") + print(" - Waiting for VM to boot") + print(" - Waiting for OSWorld server to start") + await runtime.connect() + print("✓ Runtime connected and VM is ready!") + print(f" VM Server URL: {runtime.osworld_vm_url}") + print(f" VNC URL: {runtime.vnc_url}") + print(f" Chromium URL: {runtime._chromium_port}") + print() + + # 5. Check if VM is alive + print("5. 
Checking VM health...") + runtime.check_if_alive() + print("✓ VM is alive and responding") + print() + + except Exception as e: + print(f"✗ Error: {e}") + logger.exception("Failed to run example") + return 1 + + import pdb; pdb.set_trace() + setup_controller = SetupController( + vm_ip="127.0.0.1", + server_port=runtime._vm_server_port, + chromium_port=runtime._chromium_port, + cache_dir="/tmp/osworld_example", + client_password="password", + runtime=runtime # Pass your runtime object here + ) + + files = [ + { + "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/gimp/2a729ded-3296-423d-aec4-7dd55ed5fbb3/dog_with_background.png", + "path": "/home/user/Desktop/dog_with_background.png" + }, + { + "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/libreoffice_writer/0810415c-bde4-4443-9047-d5f70165a697/Novels_Intro_Packet.docx", + "path": "/home/user/Desktop/Novels_Intro_Packet.docx" + } + ] + + setup_controller._download_setup(files) + setup_controller._change_wallpaper_setup('/home/user/Desktop/dog_with_background.png') + #setup_controller._launch_setup(command=['gimp', '/home/user/Desktop/dog_with_background.png']) + #setup_controller._open_setup('/home/user/Desktop/Novels_Intro_Packet.docx') + command = [ + "python", + "-c", + "import pyautogui; import time; time.sleep(0.5); pyautogui.hotkey('ctrl', 'alt', 't'); time.sleep(0.5); pyautogui.write('stty size'); time.sleep(0.5); pyautogui.press('enter')" + ] + setup_controller._execute_setup(command=command) + + setup_controller._launch_setup(command=['google-chrome', '--remote-debugging-port=1337']) + setup_controller._launch_setup(command=['socat', 'tcp-listen:9222,fork', 'tcp:localhost:1337']) + await setup_controller._chrome_open_tabs_setup(urls_to_open=["https://drugs.com"]) + + import pdb; pdb.set_trace() + print("Cleaning up...") + runtime.close() + print("✓ Runtime closed") + print() + + +if __name__ == '__main__': + # Run async main + 
exit_code = asyncio.run(main()) + sys.exit(exit_code) + diff --git a/examples/test_setup.py b/examples/test_setup.py new file mode 100755 index 000000000..cb5059aab --- /dev/null +++ b/examples/test_setup.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating OSWorld runtime usage with OSWorldInteractiveAction. + +This script shows how to: +1. Configure and create an OSWorld runtime +2. Connect to a VM +3. Use OSWorldInteractiveAction to interact with the VM +4. Perform GUI actions (click, type, press keys) +5. Take screenshots +6. Get VM information (platform, screen size) +7. Clean up resources + +Prerequisites: +- Singularity/Apptainer installed +- QEMU and OVMF installed +- VM image at ./OS_images/Ubuntu.qcow2 with OSWorld server +""" + +import asyncio +import json +import sys +from pathlib import Path + +from openhands.core.config import OpenHandsConfig +from openhands.core.logger import openhands_logger as logger +from openhands.events import EventStream +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.runtime.impl.singularity.osworld_singularity_runtime import ( + OSWorldSingularityRuntime, +) +from openhands.storage import get_file_store +from examples.setup import SetupController + +google_drive_problems = [ + '4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc.json', + '897e3b53-5d4d-444b-85cb-2cdc8a97d903.json', + '22a4636f-8179-4357-8e87-d1743ece1f81.json', + 'b52b40a5-ad70-4c53-b5b0-5650a8387052.json', + 'a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb.json', + '46407397-a7d5-4c6b-92c6-dbe038b1457b.json', + '0c825995-5b70-4526-b663-113f4c999dd2.json', + '78aed49a-a710-4321-a793-b611a7c5b56b.json', + '897e3b53-5d4d-444b-85cb-2cdc8a97d903.json', + '46407397-a7d5-4c6b-92c6-dbe038b1457b.json' +] + +async def main(config_path, results_dir): + """Main example function.""" + + print("=" * 80) + print("OSWorld Runtime Example") + print("=" * 80) + print() + + # 1. Create configuration + print("1. 
Creating configuration...") + config = OpenHandsConfig() + config.runtime = 'osworld' + config.sandbox.base_container_image = 'ubuntu:24.04' + config.sandbox.run_as_fakeroot = True + + # Check if VM image exists + vm_image_path = './OS_images/Ubuntu.qcow2' + if not Path(vm_image_path).exists(): + print(f"ERROR: VM image not found at {vm_image_path}") + print("Please place your Ubuntu VM image with OSWorld server at this location.") + return 1 + + print(f"✓ Configuration created") + print(f" Runtime: {config.runtime}") + print(f" Base image: {config.sandbox.base_container_image}") + print(f" VM image: {vm_image_path}") + print() + + # 2. Create event stream + print("2. Creating event stream...") + file_store = get_file_store('local', '/tmp/osworld_example') + event_stream = EventStream(sid='osworld-example', file_store=file_store) + print("✓ Event stream created") + print() + + # 3. Create OSWorld runtime + print("3. Creating OSWorld runtime...") + runtime = OSWorldSingularityRuntime( + config=config, + event_stream=event_stream, + sid='osworld-example', + os_type='linux', + vm_image_path=vm_image_path, + attach_to_existing=False, + ) + print("✓ Runtime created") + print(f" OS Type: {runtime.os_type}") + print() + + try: + # 4. Connect to runtime (starts VM) + print("4. Connecting to runtime (this may take 1-2 minutes)...") + print(" - Building Singularity image (if needed)") + print(" - Starting QEMU VM") + print(" - Waiting for VM to boot") + print(" - Waiting for OSWorld server to start") + await runtime.connect() + print("✓ Runtime connected and VM is ready!") + print(f" VM Server URL: {runtime.osworld_vm_url}") + print(f" VNC URL: {runtime.vnc_url}") + print(f" Chromium URL: {runtime._chromium_port}") + print() + + # 5. Check if VM is alive + print("5. 
Checking VM health...") + runtime.check_if_alive() + print("✓ VM is alive and responding") + print() + + except Exception as e: + print(f"✗ Error: {e}") + logger.exception("Failed to run example") + return 1 + + setup_controller = SetupController( + vm_ip="127.0.0.1", + server_port=runtime._vm_server_port, + chromium_port=runtime._chromium_port, + cache_dir="/tmp/osworld_example", + client_password="password", + runtime=runtime # Pass your runtime object here + ) + + with open(config_path, 'r') as f: + setup_config = json.load(f) + assert 'config' in setup_config, "Setup config not found in setup JSON" + await setup_controller.setup(setup_config['config']) + + os.makedirs(results_dir, exist_ok=True) + with open(os.path.join(results_dir, 'config.json'), 'w') as f: + json.dump(setup_config, f) + + def get_final_state(): + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = runtime.run_action(action) + with open(os.path.join(results_dir, 'final_state.txt'), 'w') as f: + f.write(observation.content) + + get_final_state() + + screenshot = runtime.get_vm_screenshot() + with open(os.path.join(results_dir, 'screenshot.png'), 'wb') as f: + f.write(screenshot) + + print("Cleaning up...") + runtime.close() + print("✓ Runtime closed") + print() + +if __name__ == '__main__': + # Run async main + import os + examples_path = '/home/jayliu/OSWorld/evaluation_examples/examples' + output_path = '/home/jayliu/ProRL-Agent-Server/results' + categories = os.listdir(examples_path) + categories = ['gimp'] + for category in categories: + print(f"Running {category}...") + examples = os.listdir(os.path.join(examples_path, category)) + for example in examples: + if example in google_drive_problems: + print(f"Skipping {example} because it is a Google Drive problem") + continue + print(f"Running {example}...") + example_path = os.path.join(examples_path, category, example) + results_dir = os.path.join(output_path, category, example) + 
os.makedirs(results_dir, exist_ok=True) + if os.path.exists(os.path.join(results_dir, 'screenshot.png')): + print(f"Skipping {example} because it already exists") + continue + exit_code = asyncio.run(main(example_path, results_dir)) + sys.exit(0) + diff --git a/examples/test_setup_para.py b/examples/test_setup_para.py new file mode 100755 index 000000000..4285123fe --- /dev/null +++ b/examples/test_setup_para.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating OSWorld runtime usage with OSWorldInteractiveAction. + +This script shows how to: +1. Configure and create an OSWorld runtime +2. Connect to a VM +3. Use OSWorldInteractiveAction to interact with the VM +4. Perform GUI actions (click, type, press keys) +5. Take screenshots +6. Get VM information (platform, screen size) +7. Clean up resources + +Prerequisites: +- Singularity/Apptainer installed +- QEMU and OVMF installed +- VM image at ./OS_images/Ubuntu.qcow2 with OSWorld server +""" + +import asyncio +import json +import sys +import os +import multiprocessing +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from openhands.core.config import OpenHandsConfig +from openhands.core.logger import openhands_logger as logger +from openhands.events import EventStream +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.runtime.impl.singularity.osworld_singularity_runtime import ( + OSWorldSingularityRuntime, +) +from openhands.storage import get_file_store +from examples.setup import SetupController + + +async def main(config_path, results_dir): + """Main example function.""" + + print("=" * 80) + print("OSWorld Runtime Example") + print("=" * 80) + print() + + # 1. Create configuration + print("1. 
Creating configuration...") + config = OpenHandsConfig() + config.runtime = 'osworld' + config.sandbox.base_container_image = 'ubuntu:24.04' + config.sandbox.run_as_fakeroot = True + + # Check if VM image exists + vm_image_path = './OS_images/Ubuntu.qcow2' + if not Path(vm_image_path).exists(): + print(f"ERROR: VM image not found at {vm_image_path}") + print("Please place your Ubuntu VM image with OSWorld server at this location.") + return 1 + + print(f"✓ Configuration created") + print(f" Runtime: {config.runtime}") + print(f" Base image: {config.sandbox.base_container_image}") + print(f" VM image: {vm_image_path}") + print() + + # 2. Create event stream + print("2. Creating event stream...") + file_store = get_file_store('local', '/tmp/osworld_example') + event_stream = EventStream(sid='osworld-example', file_store=file_store) + print("✓ Event stream created") + print() + + # 3. Create OSWorld runtime + print("3. Creating OSWorld runtime...") + runtime = OSWorldSingularityRuntime( + config=config, + event_stream=event_stream, + sid='osworld-example', + os_type='linux', + vm_image_path=vm_image_path, + attach_to_existing=False, + ) + print("✓ Runtime created") + print(f" OS Type: {runtime.os_type}") + print() + + try: + # 4. Connect to runtime (starts VM) + print("4. Connecting to runtime (this may take 1-2 minutes)...") + print(" - Building Singularity image (if needed)") + print(" - Starting QEMU VM") + print(" - Waiting for VM to boot") + print(" - Waiting for OSWorld server to start") + await runtime.connect() + print("✓ Runtime connected and VM is ready!") + print(f" VM Server URL: {runtime.osworld_vm_url}") + print(f" VNC URL: {runtime.vnc_url}") + print(f" Chromium URL: {runtime._chromium_port}") + print() + + # 5. Check if VM is alive + print("5. 
Checking VM health...") + runtime.check_if_alive() + print("✓ VM is alive and responding") + print() + + except Exception as e: + print(f"✗ Error: {e}") + logger.exception("Failed to run example") + return 1 + + setup_controller = SetupController( + vm_ip="127.0.0.1", + server_port=runtime._vm_server_port, + chromium_port=runtime._chromium_port, + cache_dir="/tmp/osworld_example", + client_password="password", + runtime=runtime # Pass your runtime object here + ) + + with open(config_path, 'r') as f: + setup_config = json.load(f) + assert 'config' in setup_config, "Setup config not found in setup JSON" + await setup_controller.setup(setup_config['config']) + + await asyncio.sleep(10) + + os.makedirs(results_dir, exist_ok=True) + with open(os.path.join(results_dir, 'config.json'), 'w') as f: + json.dump(setup_config, f) + + screenshot = runtime.get_vm_screenshot() + with open(os.path.join(results_dir, 'screenshot.png'), 'wb') as f: + f.write(screenshot) + + print("Cleaning up...") + runtime.close() + print("✓ Runtime closed") + print() + + +def run_main_wrapper(config_path, results_dir, example_name): + """Wrapper function to run async main in a separate process.""" + print(f"\n[Process {os.getpid()}] Starting processing for {example_name}") + + # Check if already processed + if os.path.exists(os.path.join(results_dir, 'screenshot.png')): + print(f"[Process {os.getpid()}] Skipping {example_name} because it already exists") + return 0 + + try: + exit_code = asyncio.run(main(config_path, results_dir)) + print(f"[Process {os.getpid()}] Completed {example_name} with exit code {exit_code}") + return exit_code if exit_code is not None else 0 + except Exception as e: + print(f"[Process {os.getpid()}] Error processing {example_name}: {e}") + return 1 + + +if __name__ == '__main__': + # Configuration + examples_path = '/home/jayliu/OSWorld/evaluation_examples/examples' + output_path = '/home/jayliu/ProRL-Agent-Server/two_results' + + # ============= CONCURRENCY CONTROL 
============= + # Set the maximum number of concurrent processes + # Change this value to control concurrency: 1, 2, 4, etc. + MAX_WORKERS = 4 + # =============================================== + + # Define 2 examples to run in parallel + # You can modify these to test different examples + tasks = [ + { + 'category': 'chrome', + 'example': '44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json' + }, + { + 'category': 'chrome', + 'example': 'fc6d8143-9452-4171-9459-7f515143419a.json' # Change this to a different example + }, + { + 'category': 'chrome', + 'example': 'e1e75309-3ddb-4d09-92ec-de869c928143.json' # Change this to a different example + }, + { + 'category': 'chrome', + 'example': '47543840-672a-467d-80df-8f7c3b9788c9.json' # Change this to a different example + }, + ] + """ + { + 'category': 'chrome', + 'example': '030eeff7-b492-4218-b312-701ec99ee0cc.json' + }, + { + 'category': 'chrome', + 'example': '93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9.json' # Change this to a different example + }, + { + 'category': 'chrome', + 'example': '1704f00f-79e6-43a7-961b-cedd3724d5fd.json' # Change this to a different example + }, + { + 'category': 'chrome', + 'example': 'a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json' # Change this to a different example + }, + + ] + """ + # Prepare arguments for each process + process_args = [] + for task in tasks: + category = task['category'] + example = task['example'] + example_path = os.path.join(examples_path, category, example) + results_dir = os.path.join(output_path, category, example) + os.makedirs(results_dir, exist_ok=True) + process_args.append((example_path, results_dir, example)) + + # Run processes with controlled concurrency using ProcessPoolExecutor + print("=" * 80) + print(f"Starting {len(tasks)} tasks with max concurrency: {MAX_WORKERS}") + print("=" * 80) + print() + + # Use ProcessPoolExecutor for structured concurrency control + with ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor: + # Submit all tasks to the executor + 
future_to_task = {} + for i, args in enumerate(process_args): + future = executor.submit(run_main_wrapper, *args) + future_to_task[future] = (i + 1, args[2]) # (task_number, example_name) + print(f"Submitted task {i+1}/{len(tasks)}: {args[2]}") + + print(f"\nRunning with {MAX_WORKERS} worker(s)...") + print() + + # Process completed tasks as they finish + completed = 0 + for future in as_completed(future_to_task): + task_num, example_name = future_to_task[future] + completed += 1 + + try: + exit_code = future.result() + status = "✓ SUCCESS" if exit_code == 0 else f"✗ FAILED (exit code: {exit_code})" + print(f"[{completed}/{len(tasks)}] Task {task_num} ({example_name}): {status}") + except Exception as e: + print(f"[{completed}/{len(tasks)}] Task {task_num} ({example_name}): ✗ EXCEPTION - {e}") + + print() + print("=" * 80) + print("All tasks completed!") + print("=" * 80) + + sys.exit(0) + diff --git a/examples/test_setup_single.py b/examples/test_setup_single.py new file mode 100755 index 000000000..af5ca93cb --- /dev/null +++ b/examples/test_setup_single.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +""" +Example script demonstrating OSWorld runtime usage with OSWorldInteractiveAction. + +This script shows how to: +1. Configure and create an OSWorld runtime +2. Connect to a VM +3. Use OSWorldInteractiveAction to interact with the VM +4. Perform GUI actions (click, type, press keys) +5. Take screenshots +6. Get VM information (platform, screen size) +7. 
Clean up resources + +Prerequisites: +- Singularity/Apptainer installed +- QEMU and OVMF installed +- VM image at ./OS_images/Ubuntu.qcow2 with OSWorld server +""" + +import asyncio +import json +import sys +import os +import multiprocessing +from concurrent.futures import ProcessPoolExecutor, as_completed +from pathlib import Path + +from openhands.core.config import OpenHandsConfig +from openhands.core.logger import openhands_logger as logger +from openhands.events import EventStream +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.runtime.impl.singularity.osworld_singularity_runtime import ( + OSWorldSingularityRuntime, +) +from openhands.storage import get_file_store +from examples.setup import SetupController + + +async def main(config_path, results_dir): + """Main example function.""" + + print("=" * 80) + print("OSWorld Runtime Example") + print("=" * 80) + print() + + # 1. Create configuration + print("1. Creating configuration...") + config = OpenHandsConfig() + config.runtime = 'osworld' + config.sandbox.base_container_image = 'ubuntu:24.04' + config.sandbox.run_as_fakeroot = True + + # Check if VM image exists + vm_image_path = './OS_images/Ubuntu.qcow2' + if not Path(vm_image_path).exists(): + print(f"ERROR: VM image not found at {vm_image_path}") + print("Please place your Ubuntu VM image with OSWorld server at this location.") + return 1 + + print(f"✓ Configuration created") + print(f" Runtime: {config.runtime}") + print(f" Base image: {config.sandbox.base_container_image}") + print(f" VM image: {vm_image_path}") + print() + + # 2. Create event stream + print("2. Creating event stream...") + file_store = get_file_store('local', '/tmp/osworld_example') + event_stream = EventStream(sid='osworld-example', file_store=file_store) + print("✓ Event stream created") + print() + + # 3. Create OSWorld runtime + print("3. 
Creating OSWorld runtime...") + runtime = OSWorldSingularityRuntime( + config=config, + event_stream=event_stream, + sid='osworld-example', + os_type='linux', + vm_image_path=vm_image_path, + attach_to_existing=False, + ) + print("✓ Runtime created") + print(f" OS Type: {runtime.os_type}") + print() + + try: + # 4. Connect to runtime (starts VM) + print("4. Connecting to runtime (this may take 1-2 minutes)...") + print(" - Building Singularity image (if needed)") + print(" - Starting QEMU VM") + print(" - Waiting for VM to boot") + print(" - Waiting for OSWorld server to start") + await runtime.connect() + print("✓ Runtime connected and VM is ready!") + print(f" VM Server URL: {runtime.osworld_vm_url}") + print(f" VNC URL: {runtime.vnc_url}") + print(f" Chromium URL: {runtime._chromium_port}") + print() + + # 5. Check if VM is alive + print("5. Checking VM health...") + runtime.check_if_alive() + print("✓ VM is alive and responding") + print() + + except Exception as e: + print(f"✗ Error: {e}") + logger.exception("Failed to run example") + return 1 + + setup_controller = SetupController( + vm_ip="127.0.0.1", + server_port=runtime._vm_server_port, + chromium_port=runtime._chromium_port, + cache_dir="/tmp/osworld_example", + client_password="password", + runtime=runtime # Pass your runtime object here + ) + import pdb; pdb.set_trace() + with open(config_path, 'r') as f: + setup_config = json.load(f) + assert 'config' in setup_config, "Setup config not found in setup JSON" + await setup_controller.setup(setup_config['config']) + + #await asyncio.sleep(5) + + os.makedirs(results_dir, exist_ok=True) + with open(os.path.join(results_dir, 'config.json'), 'w') as f: + json.dump(setup_config, f) + + def get_final_state(): + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = runtime.run_action(action) + with open(os.path.join(results_dir, 'final_state.txt'), 'w') as f: + f.write(observation.content) + + 
get_final_state() + + screenshot = runtime.get_vm_screenshot() + with open(os.path.join(results_dir, 'screenshot.png'), 'wb') as f: + f.write(screenshot) + + print("Cleaning up...") + runtime.close() + print("✓ Runtime closed") + print() + + +def run_main_wrapper(config_path, results_dir, example_name): + """Wrapper function to run async main in a separate process.""" + print(f"\n[Process {os.getpid()}] Starting processing for {example_name}") + + # Check if already processed + if os.path.exists(os.path.join(results_dir, 'screenshot.png')): + print(f"[Process {os.getpid()}] Skipping {example_name} because it already exists") + return 0 + + try: + exit_code = asyncio.run(main(config_path, results_dir)) + print(f"[Process {os.getpid()}] Completed {example_name} with exit code {exit_code}") + return exit_code if exit_code is not None else 0 + except Exception as e: + print(f"[Process {os.getpid()}] Error processing {example_name}: {e}") + return 1 + + +if __name__ == '__main__': + # Configuration + examples_path = '/home/jayliu/OSWorld/evaluation_examples/examples' + output_path = '/home/jayliu/ProRL-Agent-Server/two_results' + + # ============= CONCURRENCY CONTROL ============= + # Set the maximum number of concurrent processes + # Change this value to control concurrency: 1, 2, 4, etc. 
+ MAX_WORKERS = 1 + # =============================================== + + # Define 2 examples to run in parallel + # You can modify these to test different examples + tasks = [ + { + 'category': 'gimp', + 'example': '045bf3ff-9077-4b86-b483-a1040a949cff.json' + }, + ] + # Prepare arguments for each process + process_args = [] + for task in tasks: + category = task['category'] + example = task['example'] + example_path = os.path.join(examples_path, category, example) + results_dir = os.path.join(output_path, category, example) + os.makedirs(results_dir, exist_ok=True) + process_args.append((example_path, results_dir, example)) + + # Run processes with controlled concurrency using ProcessPoolExecutor + print("=" * 80) + print(f"Starting {len(tasks)} tasks with max concurrency: {MAX_WORKERS}") + print("=" * 80) + print() + + asyncio.run(main(process_args[0][0], process_args[0][1])) + print() + print("=" * 80) + print("All tasks completed!") + print("=" * 80) + + sys.exit(0) + diff --git a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py index 1e7245dc1..879e75b65 100644 --- a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py +++ b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py @@ -238,8 +238,8 @@ def _get_qemu_command(self) -> list[str]: ) cmd.extend([ - '-m', '2G', - '-smp', '2', + '-m', '8G', + '-smp', '8', '-drive', f'file={vm_image_container_path},if=ide', '-netdev', portfwd, '-device', 'virtio-net-pci,netdev=net0', @@ -432,7 +432,8 @@ def _wait_for_vm_ready(self, timeout: int = 300): try: # Try to connect to OSWorld server response = httpx.get( - f'http://localhost:{self._vm_server_port}/screenshot', + #f'http://localhost:{self._vm_server_port}/screenshot', + f'{self.osworld_vm_url}/terminal', timeout=5.0 ) if response.status_code == 200: @@ -543,6 +544,7 @@ def close(self, rm_all_containers: bool | None = None): os.kill(self.qemu_pid, 
signal.SIGKILL) except ProcessLookupError: pass + self.qemu_pid = None except Exception as e: self.log('warning', f'Failed to stop QEMU VM: {e}') From 1a40f4eac110d382768fa571809953ca6213fe89 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Wed, 19 Nov 2025 09:23:27 -0600 Subject: [PATCH 02/14] Working setup. --- examples/setup.py | 136 ++++++++++++++++++++++++++++++++-- examples/test_setup.py | 2 +- examples/test_setup_single.py | 15 +++- 3 files changed, 143 insertions(+), 10 deletions(-) diff --git a/examples/setup.py b/examples/setup.py index cf71ffcb7..a12703139 100644 --- a/examples/setup.py +++ b/examples/setup.py @@ -113,6 +113,7 @@ def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 922 self.screen_width: int = screen_width self.screen_height: int = screen_height self.runtime = runtime # Runtime object for interacting with the environment + self.additional_wait_time = 3 def reset_cache_dir(self, cache_dir: str): self.cache_dir = cache_dir @@ -375,12 +376,12 @@ def _open_setup(self, path: str): response = requests.post(self.http_server + "/setup" + "/open_file", headers=headers, data=payload, timeout=1810) response.raise_for_status() # This will raise an exception for 4xx and 5xx status codes logger.info("Command executed successfully: %s", response.text) + time.sleep(self.additional_wait_time) except requests.exceptions.RequestException as e: logger.error(f"Failed to open file '{path}'. An error occurred while trying to send the request or the server responded with an error: {e}") raise Exception(f"Failed to open file '{path}'. 
An error occurred while trying to send the request or the server responded with an error: {e}") from e def _ensure_launch_command_finish(self, command): - additional_wait_time = 3 if isinstance(command, list): if len(command) < 1: return True @@ -400,19 +401,16 @@ def get_last_observation(observation_content): action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) observation = self.runtime.run_action(action) if '– GIMP' in get_last_observation(observation.content): - time.sleep(additional_wait_time) return True elif 'Convert to RGB Working Space?' in get_last_observation(observation.content): # Weird error message, so we assume the command is successful # TODO: FIX THIS IN QEMU image. - time.sleep(additional_wait_time) return True elif len(original_command) < 2 and 'GNU Image Manipulation Program' in get_last_observation(observation.content): # Here no file name is provided, so we simply expect window to launch return True elif 'yicun.raw' in command and 'GIMP Message' in get_last_observation(observation.content): # special case for dbbf4b99-2253-4b10-9274-45f246af2466.json - time.sleep(additional_wait_time) return True return False @@ -479,7 +477,10 @@ def get_last_observation(observation_content): elif 'vlc' in command: action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) observation = self.runtime.run_action(action) - if '- VLC media player' in get_last_observation(observation.content): + if 'mp3' in command or 'mp4' in command: + if '- VLC media player' in get_last_observation(observation.content): + return True + elif 'VLC media player' in get_last_observation(observation.content): return True return False @@ -527,10 +528,18 @@ def _launch_setup(self, command: Union[str, List[str]], shell: bool = False): ensure_launch_retries = 0 while not self._ensure_launch_command_finish(command): logger.info("Waiting for command to finish...") - time.sleep(5) + 
time.sleep(self.additional_wait_time) ensure_launch_retries += 1 if ensure_launch_retries >= 10: raise Exception("Failed to ensure launch command finish after multiple retries") + + # Sleep additional time to ensure window is launched + if isinstance(command, list): + command = ' '.join(command) + for key_commands in ['google-chrome', 'code', 'libreoffice', 'nautilus', 'vlc', 'thunderbird']: + if key_commands in command: + time.sleep(self.additional_wait_time) + break def _execute_setup( self, @@ -709,6 +718,7 @@ def _activate_window_setup(self, window_name: str, strict: bool = False, by_clas logger.info("Command executed successfully: %s", response.text) else: logger.error(f"Failed to activate window {window_name}. Status code: %s", response.text) + time.sleep(self.additional_wait_time) except requests.exceptions.RequestException as e: logger.error("An error occurred while trying to send the request: %s", e) @@ -1116,3 +1126,117 @@ def _update_browse_history_setup(self, **config): logger.error("An error occurred while trying to send the request: %s", e) self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True) + + +class Evaluator: + def __init__(self, task_config: Dict[str, Any]): + """Set evaluator information from task config""" + # evaluator dict + # func -> metric function string, or list of metric function strings + # conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or" + # result -> result getter config, or list of result getter configs + # expected (optional) -> expected getter config, or list of expected getter configs + # options (optional) -> metric options, or list of metric options + # if func is a str list, then result, expected (if exists), options (if exists) should also be lists of the same length + # even if one of the metrics does not need expected or options field, it should be included in the list with None + self.evaluator = task_config["evaluator"] + 
self.metric = [getattr(metrics, func) for func in self.evaluator["func"]] \ + if isinstance(self.evaluator["func"], list) \ + else getattr(metrics, self.evaluator["func"]) + self.metric_conj: str = self.evaluator.get("conj", "and") # take conjunction of multiple metrics + if "result" in self.evaluator and len(self.evaluator["result"]) > 0: + self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in + self.evaluator["result"]] \ + if isinstance(self.evaluator["result"], list) \ + else getattr(getters, "get_{:}".format(self.evaluator["result"]["type"])) + else: + self.result_getter = [None] * len(self.metric) \ + if isinstance(self.metric, list) \ + else None + + if "expected" in self.evaluator and len(self.evaluator["expected"]) > 0: + self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in + self.evaluator["expected"]] \ + if isinstance(self.evaluator["expected"], list) \ + else getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"])) + else: + self.expected_getter = [None] * len(self.metric) \ + if isinstance(self.metric, list) \ + else None + self.metric_options: Union[List[Dict[str, Any]], Dict[str, Any]] = [opt if opt else {} for opt in + self.evaluator["options"]] \ + if isinstance(self.evaluator.get("options", {}), list) \ + else self.evaluator["options"] \ + if "options" in self.evaluator \ + else [{}] * len(self.metric) \ + if isinstance(self.metric, list) \ + else {} + + assert (not isinstance(self.evaluator["func"], list) + or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len( + self.metric_options))) + + def evaluate(self, setup_controller, action_history = []): + """ + Evaluate whether the task is successfully completed. 
+ """ + + postconfig = self.evaluator.get("postconfig", []) + setup_controller.setup(postconfig) + + if self.evaluator['func'] == "infeasible": + if len(action_history) > 0: + last_action = action_history[-1] + if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'): + return 1 + return 0 + else: + if len(action_history) > 0: + last_action = action_history[-1] + if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'): + return 0 + + if type(self.metric) == list: + # Multiple metrics to evaluate whether the task is successfully completed + results = [] + assert len(self.metric) == len(self.result_getter), "The number of metrics and result getters must be the same" + if "expected" in self.evaluator: + assert len(self.metric) == len(self.expected_getter), "The number of metrics and expected getters must be the same" + for idx, metric in enumerate(self.metric): + try: + config = self.evaluator["result"][idx] + result_state = self.result_getter[idx](self, config) + except FileNotFoundError: + logger.error("File not found!") + if self.metric_conj == 'and': + return 0 + + if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]: + expected_state = self.expected_getter[idx](self, self.evaluator["expected"][idx]) + metric: int = metric(result_state, expected_state, **self.metric_options[idx]) + else: + metric: int = metric(result_state, **self.metric_options[idx]) + + if self.metric_conj == 'and' and float(metric) == 0.0: + return 0 + elif self.metric_conj == 'or' and float(metric) == 1.0: + return 1 + else: + results.append(metric) + + return sum(results) / len(results) if self.metric_conj == 'and' else max(results) + else: + # Single metric to evaluate whether the task is successfully completed + try: + result_state = self.result_getter(self, self.evaluator["result"]) + except FileNotFoundError: + logger.error("File not found!") + return 0 + + if 
"expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]: + expected_state = self.expected_getter(self, self.evaluator["expected"]) + metric: float = self.metric(result_state, expected_state, **self.metric_options) + else: + metric: float = self.metric(result_state, **self.metric_options) + + return metric \ No newline at end of file diff --git a/examples/test_setup.py b/examples/test_setup.py index cb5059aab..3166adfd6 100755 --- a/examples/test_setup.py +++ b/examples/test_setup.py @@ -160,7 +160,7 @@ def get_final_state(): examples_path = '/home/jayliu/OSWorld/evaluation_examples/examples' output_path = '/home/jayliu/ProRL-Agent-Server/results' categories = os.listdir(examples_path) - categories = ['gimp'] + #categories = ['os'] for category in categories: print(f"Running {category}...") examples = os.listdir(os.path.join(examples_path, category)) diff --git a/examples/test_setup_single.py b/examples/test_setup_single.py index af5ca93cb..c4a155487 100755 --- a/examples/test_setup_single.py +++ b/examples/test_setup_single.py @@ -33,7 +33,7 @@ OSWorldSingularityRuntime, ) from openhands.storage import get_file_store -from examples.setup import SetupController +from examples.setup import SetupController, Evaluator async def main(config_path, results_dir): @@ -118,11 +118,13 @@ async def main(config_path, results_dir): client_password="password", runtime=runtime # Pass your runtime object here ) + import pdb; pdb.set_trace() with open(config_path, 'r') as f: setup_config = json.load(f) assert 'config' in setup_config, "Setup config not found in setup JSON" await setup_controller.setup(setup_config['config']) + #evaluator = Evaluator(setup_config) #await asyncio.sleep(5) @@ -138,10 +140,17 @@ def get_final_state(): get_final_state() + await asyncio.sleep(3) + screenshot = runtime.get_vm_screenshot() with open(os.path.join(results_dir, 'screenshot.png'), 'wb') as f: f.write(screenshot) + print("Start Working. 
Continue to evaluate...") + import pdb; pdb.set_trace() + + #evaluator.evaluate(setup_controller) + print("Cleaning up...") runtime.close() print("✓ Runtime closed") @@ -181,8 +190,8 @@ def run_main_wrapper(config_path, results_dir, example_name): # You can modify these to test different examples tasks = [ { - 'category': 'gimp', - 'example': '045bf3ff-9077-4b86-b483-a1040a949cff.json' + 'category': 'os', + 'example': '5ced85fc-fa1a-4217-95fd-0fb530545ce2.json' }, ] # Prepare arguments for each process From c092dc76b57a588126d0c0910f33170a944af894 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Thu, 20 Nov 2025 13:41:01 -0600 Subject: [PATCH 03/14] added metrics/getters to os_world dedicated package. --- openhands/nvidia/os_world/__init__.py | 0 .../nvidia/os_world/controllers/__init__.py | 0 .../nvidia/os_world/controllers/python.py | 574 ++++ .../nvidia/os_world/controllers/setup.py | 1054 +++++++ openhands/nvidia/os_world/evaluate.py | 153 ++ openhands/nvidia/os_world/getters/__init__.py | 42 + openhands/nvidia/os_world/getters/calc.py | 15 + openhands/nvidia/os_world/getters/chrome.py | 2429 +++++++++++++++++ openhands/nvidia/os_world/getters/file.py | 152 ++ openhands/nvidia/os_world/getters/general.py | 40 + openhands/nvidia/os_world/getters/gimp.py | 37 + openhands/nvidia/os_world/getters/impress.py | 126 + openhands/nvidia/os_world/getters/info.py | 49 + openhands/nvidia/os_world/getters/misc.py | 405 +++ openhands/nvidia/os_world/getters/replay.py | 20 + openhands/nvidia/os_world/getters/vlc.py | 85 + openhands/nvidia/os_world/getters/vscode.py | 34 + openhands/nvidia/os_world/metrics/__init__.py | 160 ++ openhands/nvidia/os_world/metrics/basic_os.py | 68 + openhands/nvidia/os_world/metrics/chrome.py | 502 ++++ openhands/nvidia/os_world/metrics/docs.py | 1192 ++++++++ openhands/nvidia/os_world/metrics/general.py | 663 +++++ openhands/nvidia/os_world/metrics/gimp.py | 816 ++++++ .../nvidia/os_world/metrics/libreoffice.py | 28 + 
openhands/nvidia/os_world/metrics/others.py | 103 + openhands/nvidia/os_world/metrics/pdf.py | 31 + openhands/nvidia/os_world/metrics/slides.py | 929 +++++++ openhands/nvidia/os_world/metrics/table.py | 797 ++++++ .../nvidia/os_world/metrics/thunderbird.py | 175 ++ openhands/nvidia/os_world/metrics/utils.py | 839 ++++++ openhands/nvidia/os_world/metrics/vlc.py | 523 ++++ openhands/nvidia/os_world/metrics/vscode.py | 407 +++ 32 files changed, 12448 insertions(+) create mode 100644 openhands/nvidia/os_world/__init__.py create mode 100644 openhands/nvidia/os_world/controllers/__init__.py create mode 100644 openhands/nvidia/os_world/controllers/python.py create mode 100644 openhands/nvidia/os_world/controllers/setup.py create mode 100644 openhands/nvidia/os_world/evaluate.py create mode 100644 openhands/nvidia/os_world/getters/__init__.py create mode 100644 openhands/nvidia/os_world/getters/calc.py create mode 100644 openhands/nvidia/os_world/getters/chrome.py create mode 100644 openhands/nvidia/os_world/getters/file.py create mode 100644 openhands/nvidia/os_world/getters/general.py create mode 100644 openhands/nvidia/os_world/getters/gimp.py create mode 100644 openhands/nvidia/os_world/getters/impress.py create mode 100644 openhands/nvidia/os_world/getters/info.py create mode 100644 openhands/nvidia/os_world/getters/misc.py create mode 100644 openhands/nvidia/os_world/getters/replay.py create mode 100644 openhands/nvidia/os_world/getters/vlc.py create mode 100644 openhands/nvidia/os_world/getters/vscode.py create mode 100644 openhands/nvidia/os_world/metrics/__init__.py create mode 100644 openhands/nvidia/os_world/metrics/basic_os.py create mode 100644 openhands/nvidia/os_world/metrics/chrome.py create mode 100644 openhands/nvidia/os_world/metrics/docs.py create mode 100644 openhands/nvidia/os_world/metrics/general.py create mode 100644 openhands/nvidia/os_world/metrics/gimp.py create mode 100644 openhands/nvidia/os_world/metrics/libreoffice.py create mode 100644 
openhands/nvidia/os_world/metrics/others.py create mode 100644 openhands/nvidia/os_world/metrics/pdf.py create mode 100644 openhands/nvidia/os_world/metrics/slides.py create mode 100644 openhands/nvidia/os_world/metrics/table.py create mode 100644 openhands/nvidia/os_world/metrics/thunderbird.py create mode 100644 openhands/nvidia/os_world/metrics/utils.py create mode 100644 openhands/nvidia/os_world/metrics/vlc.py create mode 100644 openhands/nvidia/os_world/metrics/vscode.py diff --git a/openhands/nvidia/os_world/__init__.py b/openhands/nvidia/os_world/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openhands/nvidia/os_world/controllers/__init__.py b/openhands/nvidia/os_world/controllers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openhands/nvidia/os_world/controllers/python.py b/openhands/nvidia/os_world/controllers/python.py new file mode 100644 index 000000000..7844d7307 --- /dev/null +++ b/openhands/nvidia/os_world/controllers/python.py @@ -0,0 +1,574 @@ +import json +import random +from typing import Any, Dict, Optional +import time +import traceback +import requests +from openhands.core.logger import openhands_logger as logger + +KEYBOARD_KEYS = ['\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 
'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright'] + +class PythonController: + def __init__(self, vm_ip: str, + server_port: int, + pkgs_prefix: str = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}"): + self.vm_ip = vm_ip + self.http_server = f"http://{vm_ip}:{server_port}" + self.pkgs_prefix = pkgs_prefix # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages + self.retry_times = 3 + self.retry_interval = 5 + + @staticmethod + def _is_valid_image_response(content_type: str, data: Optional[bytes]) -> bool: + """Quick validation for PNG/JPEG payload using magic bytes; Content-Type is advisory. + Returns True only when bytes look like a real PNG or JPEG. 
+ """ + if not isinstance(data, (bytes, bytearray)) or not data: + return False + # PNG magic + if len(data) >= 8 and data[:8] == b"\x89PNG\r\n\x1a\n": + return True + # JPEG magic + if len(data) >= 3 and data[:3] == b"\xff\xd8\xff": + return True + # If server explicitly marks as image, accept as a weak fallback (some environments strip magic) + if content_type and ("image/png" in content_type or "image/jpeg" in content_type or "image/jpg" in content_type): + return True + return False + + def get_screenshot(self) -> Optional[bytes]: + """ + Gets a screenshot from the server. With the cursor. None -> no screenshot or unexpected error. + """ + + for attempt_idx in range(self.retry_times): + try: + response = requests.get(self.http_server + "/screenshot", timeout=10) + if response.status_code == 200: + content_type = response.headers.get("Content-Type", "") + content = response.content + if self._is_valid_image_response(content_type, content): + logger.info("Got screenshot successfully") + return content + else: + logger.error("Invalid screenshot payload (attempt %d/%d).", attempt_idx + 1, self.retry_times) + logger.info("Retrying to get screenshot.") + else: + logger.error("Failed to get screenshot. Status code: %d", response.status_code) + logger.info("Retrying to get screenshot.") + except Exception as e: + logger.error("An error occurred while trying to get the screenshot: %s", e) + logger.info("Retrying to get screenshot.") + time.sleep(self.retry_interval) + + logger.error("Failed to get screenshot.") + return None + + def get_accessibility_tree(self) -> Optional[str]: + """ + Gets the accessibility tree from the server. None -> no accessibility tree or unexpected error. 
+ """ + + for _ in range(self.retry_times): + try: + response: requests.Response = requests.get(self.http_server + "/accessibility") + if response.status_code == 200: + logger.info("Got accessibility tree successfully") + return response.json()["AT"] + else: + logger.error("Failed to get accessibility tree. Status code: %d", response.status_code) + logger.info("Retrying to get accessibility tree.") + except Exception as e: + logger.error("An error occurred while trying to get the accessibility tree: %s", e) + logger.info("Retrying to get accessibility tree.") + time.sleep(self.retry_interval) + + logger.error("Failed to get accessibility tree.") + return None + + def get_terminal_output(self) -> Optional[str]: + """ + Gets the terminal output from the server. None -> no terminal output or unexpected error. + """ + + for _ in range(self.retry_times): + try: + response = requests.get(self.http_server + "/terminal") + if response.status_code == 200: + logger.info("Got terminal output successfully") + return response.json()["output"] + else: + logger.error("Failed to get terminal output. Status code: %d", response.status_code) + logger.info("Retrying to get terminal output.") + except Exception as e: + logger.error("An error occurred while trying to get the terminal output: %s", e) + logger.info("Retrying to get terminal output.") + time.sleep(self.retry_interval) + + logger.error("Failed to get terminal output.") + return None + + def get_file(self, file_path: str) -> Optional[bytes]: + """ + Gets a file from the server. + """ + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/file", data={"file_path": file_path}) + if response.status_code == 200: + logger.info("File downloaded successfully") + return response.content + else: + logger.error("Failed to get file. 
Status code: %d", response.status_code) + logger.info("Retrying to get file.") + except Exception as e: + logger.error("An error occurred while trying to get the file: %s", e) + logger.info("Retrying to get file.") + time.sleep(self.retry_interval) + + logger.error("Failed to get file.") + return None + + def execute_python_command(self, command: str) -> None: + """ + Executes a python command on the server. + It can be used to execute the pyautogui commands, or... any other python command. who knows? + """ + # command_list = ["python", "-c", self.pkgs_prefix.format(command=command)] + command_list = ["python", "-c", self.pkgs_prefix.format(command=command)] + payload = json.dumps({"command": command_list, "shell": False}) + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'}, + data=payload, timeout=90) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + return response.json() + else: + logger.error("Failed to execute command. Status code: %d", response.status_code) + logger.info("Retrying to execute command.") + except requests.exceptions.ReadTimeout: + break + except Exception as e: + logger.error("An error occurred while trying to execute the command: %s", e) + logger.info("Retrying to execute command.") + time.sleep(self.retry_interval) + + logger.error("Failed to execute command.") + return None + + def run_python_script(self, script: str) -> Optional[Dict[str, Any]]: + """ + Executes a python script on the server. 
+ """ + payload = json.dumps({"code": script}) + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/run_python", headers={'Content-Type': 'application/json'}, + data=payload, timeout=90) + if response.status_code == 200: + return response.json() + else: + return {"status": "error", "message": "Failed to execute command.", "output": None, "error": response.json()["error"]} + except requests.exceptions.ReadTimeout: + break + except Exception: + logger.error("An error occurred while trying to execute the command: %s", traceback.format_exc()) + logger.info("Retrying to execute command.") + time.sleep(self.retry_interval) + + logger.error("Failed to execute command.") + return {"status": "error", "message": "Failed to execute command.", "output": "", "error": "Retry limit reached."} + + def run_bash_script(self, script: str, timeout: int = 30, working_dir: Optional[str] = None) -> Optional[Dict[str, Any]]: + """ + Executes a bash script on the server. + + :param script: The bash script content (can be multi-line) + :param timeout: Execution timeout in seconds (default: 30) + :param working_dir: Working directory for script execution (optional) + :return: Dictionary with status, output, error, and returncode, or None if failed + """ + payload = json.dumps({ + "script": script, + "timeout": timeout, + "working_dir": working_dir + }) + + for _ in range(self.retry_times): + try: + response = requests.post( + self.http_server + "/run_bash_script", + headers={'Content-Type': 'application/json'}, + data=payload, + timeout=timeout + 100 # Add buffer to HTTP timeout + ) + if response.status_code == 200: + result = response.json() + logger.info("Bash script executed successfully with return code: %d", result.get("returncode", -1)) + return result + else: + logger.error("Failed to execute bash script. 
Status code: %d, response: %s", + response.status_code, response.text) + logger.info("Retrying to execute bash script.") + except requests.exceptions.ReadTimeout: + logger.error("Bash script execution timed out") + return { + "status": "error", + "output": "", + "error": f"Script execution timed out after {timeout} seconds", + "returncode": -1 + } + except Exception as e: + logger.error("An error occurred while trying to execute the bash script: %s", e) + logger.info("Retrying to execute bash script.") + time.sleep(self.retry_interval) + + logger.error("Failed to execute bash script after %d retries.", self.retry_times) + return { + "status": "error", + "output": "", + "error": f"Failed to execute bash script after {self.retry_times} retries", + "returncode": -1 + } + + def execute_action(self, action): + """ + Executes an action on the server computer. + """ + # Handle string actions + if action in ['WAIT', 'FAIL', 'DONE']: + return + + # Handle dictionary actions + if type(action) == dict and action.get('action_type') in ['WAIT', 'FAIL', 'DONE']: + return + + action_type = action["action_type"] + parameters = action["parameters"] if "parameters" in action else {param: action[param] for param in action if param != 'action_type'} + move_mode = random.choice( + ["pyautogui.easeInQuad", "pyautogui.easeOutQuad", "pyautogui.easeInOutQuad", "pyautogui.easeInBounce", + "pyautogui.easeInElastic"]) + duration = random.uniform(0.5, 1) + + if action_type == "MOVE_TO": + if parameters == {} or None: + self.execute_python_command("pyautogui.moveTo()") + elif "x" in parameters and "y" in parameters: + x = parameters["x"] + y = parameters["y"] + self.execute_python_command(f"pyautogui.moveTo({x}, {y}, {duration}, {move_mode})") + else: + raise Exception(f"Unknown parameters: {parameters}") + + elif action_type == "CLICK": + if parameters == {} or None: + self.execute_python_command("pyautogui.click()") + elif "button" in parameters and "x" in parameters and "y" in parameters: + 
button = parameters["button"] + x = parameters["x"] + y = parameters["y"] + if "num_clicks" in parameters: + num_clicks = parameters["num_clicks"] + self.execute_python_command( + f"pyautogui.click(button='{button}', x={x}, y={y}, clicks={num_clicks})") + else: + self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y})") + elif "button" in parameters and "x" not in parameters and "y" not in parameters: + button = parameters["button"] + if "num_clicks" in parameters: + num_clicks = parameters["num_clicks"] + self.execute_python_command(f"pyautogui.click(button='{button}', clicks={num_clicks})") + else: + self.execute_python_command(f"pyautogui.click(button='{button}')") + elif "button" not in parameters and "x" in parameters and "y" in parameters: + x = parameters["x"] + y = parameters["y"] + if "num_clicks" in parameters: + num_clicks = parameters["num_clicks"] + self.execute_python_command(f"pyautogui.click(x={x}, y={y}, clicks={num_clicks})") + else: + self.execute_python_command(f"pyautogui.click(x={x}, y={y})") + else: + raise Exception(f"Unknown parameters: {parameters}") + + elif action_type == "MOUSE_DOWN": + if parameters == {} or None: + self.execute_python_command("pyautogui.mouseDown()") + elif "button" in parameters: + button = parameters["button"] + self.execute_python_command(f"pyautogui.mouseDown(button='{button}')") + else: + raise Exception(f"Unknown parameters: {parameters}") + + elif action_type == "MOUSE_UP": + if parameters == {} or None: + self.execute_python_command("pyautogui.mouseUp()") + elif "button" in parameters: + button = parameters["button"] + self.execute_python_command(f"pyautogui.mouseUp(button='{button}')") + else: + raise Exception(f"Unknown parameters: {parameters}") + + elif action_type == "RIGHT_CLICK": + if parameters == {} or None: + self.execute_python_command("pyautogui.rightClick()") + elif "x" in parameters and "y" in parameters: + x = parameters["x"] + y = parameters["y"] + 
self.execute_python_command(f"pyautogui.rightClick(x={x}, y={y})") + else: + raise Exception(f"Unknown parameters: {parameters}") + + elif action_type == "DOUBLE_CLICK": + if parameters == {} or None: + self.execute_python_command("pyautogui.doubleClick()") + elif "x" in parameters and "y" in parameters: + x = parameters["x"] + y = parameters["y"] + self.execute_python_command(f"pyautogui.doubleClick(x={x}, y={y})") + else: + raise Exception(f"Unknown parameters: {parameters}") + + elif action_type == "DRAG_TO": + if "x" in parameters and "y" in parameters: + x = parameters["x"] + y = parameters["y"] + self.execute_python_command( + f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)") + + elif action_type == "SCROLL": + # todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out + if "dx" in parameters and "dy" in parameters: + dx = parameters["dx"] + dy = parameters["dy"] + self.execute_python_command(f"pyautogui.hscroll({dx})") + self.execute_python_command(f"pyautogui.vscroll({dy})") + elif "dx" in parameters and "dy" not in parameters: + dx = parameters["dx"] + self.execute_python_command(f"pyautogui.hscroll({dx})") + elif "dx" not in parameters and "dy" in parameters: + dy = parameters["dy"] + self.execute_python_command(f"pyautogui.vscroll({dy})") + else: + raise Exception(f"Unknown parameters: {parameters}") + + elif action_type == "TYPING": + if "text" not in parameters: + raise Exception(f"Unknown parameters: {parameters}") + # deal with special ' and \ characters + # text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'") + # self.execute_python_command(f"pyautogui.typewrite('{text}')") + text = parameters["text"] + self.execute_python_command("pyautogui.typewrite({:})".format(repr(text))) + + elif action_type == "PRESS": + if "key" not in parameters: + raise Exception(f"Unknown parameters: {parameters}") + key = parameters["key"] + if 
key.lower() not in KEYBOARD_KEYS: + raise Exception(f"Key must be one of {KEYBOARD_KEYS}") + self.execute_python_command(f"pyautogui.press('{key}')") + + elif action_type == "KEY_DOWN": + if "key" not in parameters: + raise Exception(f"Unknown parameters: {parameters}") + key = parameters["key"] + if key.lower() not in KEYBOARD_KEYS: + raise Exception(f"Key must be one of {KEYBOARD_KEYS}") + self.execute_python_command(f"pyautogui.keyDown('{key}')") + + elif action_type == "KEY_UP": + if "key" not in parameters: + raise Exception(f"Unknown parameters: {parameters}") + key = parameters["key"] + if key.lower() not in KEYBOARD_KEYS: + raise Exception(f"Key must be one of {KEYBOARD_KEYS}") + self.execute_python_command(f"pyautogui.keyUp('{key}')") + + elif action_type == "HOTKEY": + if "keys" not in parameters: + raise Exception(f"Unknown parameters: {parameters}") + keys = parameters["keys"] + if not isinstance(keys, list): + raise Exception("Keys must be a list of keys") + for key in keys: + if key.lower() not in KEYBOARD_KEYS: + raise Exception(f"Key must be one of {KEYBOARD_KEYS}") + + keys_para_rep = "', '".join(keys) + self.execute_python_command(f"pyautogui.hotkey('{keys_para_rep}')") + + elif action_type in ['WAIT', 'FAIL', 'DONE']: + pass + + else: + raise Exception(f"Unknown action type: {action_type}") + + # Record video + def start_recording(self): + """ + Starts recording the screen. + """ + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/start_recording") + if response.status_code == 200: + logger.info("Recording started successfully") + return + else: + logger.error("Failed to start recording. 
Status code: %d", response.status_code) + logger.info("Retrying to start recording.") + except Exception as e: + logger.error("An error occurred while trying to start recording: %s", e) + logger.info("Retrying to start recording.") + time.sleep(self.retry_interval) + + logger.error("Failed to start recording.") + + def end_recording(self, dest: str): + """ + Ends recording the screen. + """ + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/end_recording") + if response.status_code == 200: + logger.info("Recording stopped successfully") + with open(dest, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + return + else: + logger.error("Failed to stop recording. Status code: %d", response.status_code) + logger.info("Retrying to stop recording.") + except Exception as e: + logger.error("An error occurred while trying to stop recording: %s", e) + logger.info("Retrying to stop recording.") + time.sleep(self.retry_interval) + + logger.error("Failed to stop recording.") + + # Additional info + def get_vm_platform(self): + """ + Gets the size of the vm screen. + """ + return self.execute_python_command("import platform; print(platform.system())")['output'].strip() + + def get_vm_screen_size(self): + """ + Gets the size of the vm screen. + """ + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/screen_size") + if response.status_code == 200: + logger.info("Got screen size successfully") + return response.json() + else: + logger.error("Failed to get screen size. 
Status code: %d", response.status_code) + logger.info("Retrying to get screen size.") + except Exception as e: + logger.error("An error occurred while trying to get the screen size: %s", e) + logger.info("Retrying to get screen size.") + time.sleep(self.retry_interval) + + logger.error("Failed to get screen size.") + return None + + def get_vm_window_size(self, app_class_name: str): + """ + Gets the size of the vm app window. + """ + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/window_size", data={"app_class_name": app_class_name}) + if response.status_code == 200: + logger.info("Got window size successfully") + return response.json() + else: + logger.error("Failed to get window size. Status code: %d", response.status_code) + logger.info("Retrying to get window size.") + except Exception as e: + logger.error("An error occurred while trying to get the window size: %s", e) + logger.info("Retrying to get window size.") + time.sleep(self.retry_interval) + + logger.error("Failed to get window size.") + return None + + def get_vm_wallpaper(self): + """ + Gets the wallpaper of the vm. + """ + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/wallpaper") + if response.status_code == 200: + logger.info("Got wallpaper successfully") + return response.content + else: + logger.error("Failed to get wallpaper. Status code: %d", response.status_code) + logger.info("Retrying to get wallpaper.") + except Exception as e: + logger.error("An error occurred while trying to get the wallpaper: %s", e) + logger.info("Retrying to get wallpaper.") + time.sleep(self.retry_interval) + + logger.error("Failed to get wallpaper.") + return None + + def get_vm_desktop_path(self) -> Optional[str]: + """ + Gets the desktop path of the vm. 
+ """ + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/desktop_path") + if response.status_code == 200: + logger.info("Got desktop path successfully") + return response.json()["desktop_path"] + else: + logger.error("Failed to get desktop path. Status code: %d", response.status_code) + logger.info("Retrying to get desktop path.") + except Exception as e: + logger.error("An error occurred while trying to get the desktop path: %s", e) + logger.info("Retrying to get desktop path.") + time.sleep(self.retry_interval) + + logger.error("Failed to get desktop path.") + return None + + def get_vm_directory_tree(self, path) -> Optional[Dict[str, Any]]: + """ + Gets the directory tree of the vm. + """ + payload = json.dumps({"path": path}) + + for _ in range(self.retry_times): + try: + response = requests.post(self.http_server + "/list_directory", headers={'Content-Type': 'application/json'}, data=payload) + if response.status_code == 200: + logger.info("Got directory tree successfully") + return response.json()["directory_tree"] + else: + logger.error("Failed to get directory tree. 
Status code: %d", response.status_code) + logger.info("Retrying to get directory tree.") + except Exception as e: + logger.error("An error occurred while trying to get directory tree: %s", e) + logger.info("Retrying to get directory tree.") + time.sleep(self.retry_interval) + + logger.error("Failed to get directory tree.") + return None diff --git a/openhands/nvidia/os_world/controllers/setup.py b/openhands/nvidia/os_world/controllers/setup.py new file mode 100644 index 000000000..41813d7d7 --- /dev/null +++ b/openhands/nvidia/os_world/controllers/setup.py @@ -0,0 +1,1054 @@ +import inspect +import json +import os +import os.path +import platform +import shutil +import sqlite3 +import tempfile +import time +import traceback +import uuid +from datetime import datetime, timedelta +from typing import Any, Union, Optional, Dict, List + +import requests +from playwright.async_api import async_playwright, TimeoutError +from pydrive.auth import GoogleAuth +from pydrive.drive import GoogleDrive, GoogleDriveFile, GoogleDriveFileList +from requests_toolbelt.multipart.encoder import MultipartEncoder + +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.core.logger import openhands_logger as logger +from openhands.nvidia.os_world.metrics.utils import compare_urls + +FILE_PATH = os.path.dirname(os.path.abspath(__file__)) + +MAX_RETRIES = 20 + +class SetupController: + def __init__(self, vm_ip: str, server_port: int = 5000, chromium_port: int = 9222, vlc_port: int = 8080, cache_dir: str = "cache", client_password: str = "", screen_width: int = 1920, screen_height: int = 1080, runtime=None): + self.vm_ip: str = vm_ip + self.server_port: int = server_port + self.chromium_port: int = chromium_port + self.vlc_port: int = vlc_port + self.http_server: str = f"http://{vm_ip}:{server_port}" + self.http_server_setup_root: str = f"http://{vm_ip}:{server_port}/setup" + self.cache_dir: str = cache_dir + self.client_password: str = client_password + 
self.screen_width: int = screen_width + self.screen_height: int = screen_height + self.runtime = runtime # Runtime object for interacting with the environment + self.additional_wait_time = 3 + + def reset_cache_dir(self, cache_dir: str): + self.cache_dir = cache_dir + + async def setup(self, config: List[Dict[str, Any]])-> bool: + """ + Args: + config (List[Dict[str, Any]]): list of dict like {str: Any}. each + config dict has the structure like + { + "type": str, corresponding to the `_{:}_setup` methods of + this class + "parameters": dict like {str, Any} providing the keyword + parameters + } + """ + # Runtime is required for setup + if not self.runtime: + logger.error("Runtime is required for SetupController. Please provide a runtime object.") + return False + + # Make sure connection can be established + logger.info(f"Checking runtime connection...") + retry = 0 + while retry < MAX_RETRIES: + try: + action = OSWorldInteractiveAction( + method='get_terminal_output', + params={} + ) + observation = self.runtime.run_action(action) + if observation.exit_code == 0: + logger.info("Runtime connection established successfully") + break + except: + pass + time.sleep(5) + retry += 1 + logger.info(f"retry: {retry}/{MAX_RETRIES}") + + if retry == MAX_RETRIES: + logger.error("Failed to establish runtime connection after maximum retries") + return False + + + for i, cfg in enumerate(config): + config_type: str = cfg["type"] + parameters: Dict[str, Any] = cfg["parameters"] + + # Assumes all the setup the functions should follow this name + # protocol + setup_function: str = "_{:}_setup".format(config_type) + assert hasattr(self, setup_function), f'Setup controller cannot find init function {setup_function}' + + try: + logger.info(f"Executing setup step {i+1}/{len(config)}: {setup_function}") + logger.debug(f"Setup parameters: {parameters}") + + # Get the setup function + func = getattr(self, setup_function) + + # Check if it's a coroutine function and await if necessary + if 
inspect.iscoroutinefunction(func): + await func(**parameters) + else: + func(**parameters) + + logger.info(f"SETUP COMPLETED: {setup_function}({str(parameters)})") + except Exception as e: + logger.error(f"SETUP FAILED at step {i+1}/{len(config)}: {setup_function}({str(parameters)})") + logger.error(f"Error details: {e}") + logger.error(f"Traceback: {traceback.format_exc()}") + raise Exception(f"Setup step {i+1} failed: {setup_function} - {e}") from e + + return True + + def _download_setup(self, files: List[Dict[str, str]]): + """ + Args: + files (List[Dict[str, str]]): files to download. lisf of dict like + { + "url": str, the url to download + "path": str, the path on the VM to store the downloaded file + } + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + for f in files: + url: str = f["url"] + path: str = f["path"] + cache_path: str = os.path.join(self.cache_dir, "{:}_{:}".format( + uuid.uuid5(uuid.NAMESPACE_URL, url), + os.path.basename(path))) + if not url or not path: + raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).") + + if not os.path.exists(cache_path): + logger.info(f"Cache file not found, downloading from {url} to {cache_path}") + max_retries = 3 + downloaded = False + e = None + for i in range(max_retries): + try: + logger.info(f"Download attempt {i+1}/{max_retries} for {url}") + response = requests.get(url, stream=True, timeout=300) # Add 5 minute timeout + response.raise_for_status() + + # Get file size if available + total_size = int(response.headers.get('content-length', 0)) + if total_size > 0: + logger.info(f"File size: {total_size / (1024*1024):.2f} MB") + + downloaded_size = 0 + with open(cache_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + downloaded_size += len(chunk) + if total_size > 0 and downloaded_size % (1024*1024) == 0: # Log every MB + progress = (downloaded_size / 
total_size) * 100 + logger.info(f"Download progress: {progress:.1f}%") + + logger.info(f"File downloaded successfully to {cache_path} ({downloaded_size / (1024*1024):.2f} MB)") + downloaded = True + break + + except requests.RequestException as e: + logger.error( + f"Failed to download {url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)") + # Clean up partial download + if os.path.exists(cache_path): + os.remove(cache_path) + if not downloaded: + raise requests.RequestException(f"Failed to download {url}. No retries left.") + + + self._upload_file_setup([{ + "local_path": cache_path, + "path": path + }]) + + def _upload_file_setup(self, files: List[Dict[str, str]]): + """ + Args: + files (List[Dict[str, str]]): files to download. lisf of dict like + { + "local_path": str, the local path to the file to upload + "path": str, the path on the VM to store the downloaded file + } + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + for f in files: + local_path: str = f["local_path"] + path: str = f["path"] + + if not os.path.exists(local_path): + raise Exception(f"Setup Upload - Invalid local path ({local_path}).") + + file_size = None + try: + file_size = os.path.getsize(local_path) + except Exception: + pass + + max_retries = 3 + last_error: Optional[Exception] = None + + for attempt in range(max_retries): + try: + logger.info( + f"Uploading {os.path.basename(local_path)}{f' ({file_size} bytes)' if file_size is not None else ''} " + f"to VM at {path} (attempt {attempt + 1}/{max_retries})" + ) + logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload") + + # Open the file inside each attempt to ensure fresh stream position + with open(local_path, "rb") as fp: + form = MultipartEncoder({ + "file_path": path, + "file_data": (os.path.basename(path), fp) + }) + headers = {"Content-Type": form.content_type} + logger.debug(form.content_type) + + # Explicit 
connect/read timeout to avoid hanging forever + response = requests.post( + self.http_server + "/setup" + "/upload", + headers=headers, + data=form, + timeout=(10, 600) + ) + + if response.status_code == 200: + logger.info(f"File uploaded successfully: {path}") + logger.debug("Upload response: %s", response.text) + last_error = None + break + else: + msg = f"Failed to upload file {path}. Status code: {response.status_code}, Response: {response.text}" + logger.error(msg) + last_error = requests.RequestException(msg) + + except requests.exceptions.RequestException as e: + last_error = e + logger.error(f"Upload attempt {attempt + 1} failed for {path}: {e}") + + # Exponential backoff between retries + if attempt < max_retries - 1: + time.sleep(2 ** attempt) + + if last_error is not None: + raise last_error + + def _change_wallpaper_setup(self, path: str): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + if not path: + raise Exception(f"Setup Wallpaper - Invalid path ({path}).") + + payload = json.dumps({"path": path}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to change wallpaper + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/change_wallpaper", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error("Failed to change wallpaper. Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + def _tidy_desktop_setup(self, **config): + raise NotImplementedError() + + def _open_setup(self, path: str): + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
    def _ensure_launch_command_finish(self, command):
        """Heuristically verify that a launched GUI application's window exists.

        Runs ``wmctrl -l`` inside the VM (via the runtime) and matches known
        window-title substrings for the application named in ``command``.

        :param command: the launch command, as a string or list of strings
        :return: True when the expected window is present (or the command is
            not one of the recognized applications), False otherwise.
        """
        if isinstance(command, list):
            if len(command) < 1:
                return True
            original_command = command.copy()
            command = ' '.join(command)
        else:
            original_command = [command]

        def get_last_observation(observation_content):
            # Last non-empty line of the wmctrl output (most recent window).
            lines = observation_content.split('\n')[::-1]
            for line in lines:
                if len(line) > 0:
                    return line
            return ''

        if 'gimp' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            if '– GIMP' in get_last_observation(observation.content):
                return True
            elif 'Convert to RGB Working Space?' in get_last_observation(observation.content):
                # Weird error message, so we assume the command is successful
                # TODO: FIX THIS IN QEMU image.
                return True
            elif len(original_command) < 2 and 'GNU Image Manipulation Program' in get_last_observation(observation.content):
                # Here no file name is provided, so we simply expect window to launch
                return True
            elif 'yicun.raw' in command and 'GIMP Message' in get_last_observation(observation.content):
                # special case for dbbf4b99-2253-4b10-9274-45f246af2466.json
                return True
            return False

        elif 'google-chrome' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            if '- Google Chrome' in get_last_observation(observation.content):
                return True
            return False

        elif 'libreoffice --writer' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            if '- LibreOffice Writer' in get_last_observation(observation.content):
                # When a .docx file was passed, its name must appear in the title.
                files = original_command[-1].split('/')
                if len(files) > 0 and files[-1].endswith('.docx'):
                    if files[-1] in get_last_observation(observation.content):
                        return True
                    else:
                        return False
                else:
                    # No file name is provided, so we assume the command is successful
                    return True
            return False

        elif 'libreoffice --calc' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            if '- LibreOffice Calc' in get_last_observation(observation.content):
                # When an .xlsx file was passed, its name must appear in the title.
                files = original_command[-1].split('/')
                if len(files) > 0 and files[-1].endswith('.xlsx'):
                    if files[-1] in get_last_observation(observation.content):
                        return True
                    else:
                        return False
                else:
                    # No file name is provided, so we assume the command is successful
                    return True
            return False

        elif 'libreoffice --impress' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            if '- LibreOffice Impress' in get_last_observation(observation.content):
                # When a .pptx file was passed, its name must appear in the title.
                files = original_command[-1].split('/')
                if len(files) > 0 and files[-1].endswith('.pptx'):
                    if files[-1] in get_last_observation(observation.content):
                        return True
                    else:
                        return False
                else:
                    # No file name is provided, so we assume the command is successful
                    return True
            return False

        elif 'nautilus' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            # Last non-empty component of the opened directory path.
            dir_path = [p for p in original_command[-1].split('/') if p][-1]
            if dir_path in get_last_observation(observation.content):
                return True
            return False

        elif 'vlc' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            # Media files get a "<name> - VLC media player" title; the bare
            # player window is just "VLC media player".
            if 'mp3' in command or 'mp4' in command:
                if '- VLC media player' in get_last_observation(observation.content):
                    return True
            elif 'VLC media player' in get_last_observation(observation.content):
                return True
            return False

        elif 'thunderbird' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            if 'Thunderbird' in get_last_observation(observation.content):
                return True
            return False

        elif 'code' in command:
            action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'})
            observation = self.runtime.run_action(action)
            if 'Visual Studio Code' in get_last_observation(observation.content):
                return True
            return False

        # Unrecognized command: nothing to verify, assume success.
        return True
LibreOffice Impress' in get_last_observation(observation.content): + files = original_command[-1].split('/') + if len(files) > 0 and files[-1].endswith('.pptx'): + if files[-1] in get_last_observation(observation.content): + return True + else: + return False + else: + # No file name is provided, so we assume the command is successful + return True + return False + + elif 'nautilus' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + dir_path = [p for p in original_command[-1].split('/') if p][-1] + if dir_path in get_last_observation(observation.content): + return True + return False + + elif 'vlc' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if 'mp3' in command or 'mp4' in command: + if '- VLC media player' in get_last_observation(observation.content): + return True + elif 'VLC media player' in get_last_observation(observation.content): + return True + return False + + elif 'thunderbird' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if 'Thunderbird' in get_last_observation(observation.content): + return True + return False + + elif 'code' in command: + action = OSWorldInteractiveAction(method='run_bash_script', params={'script': 'wmctrl -l'}) + observation = self.runtime.run_action(action) + if 'Visual Studio Code' in get_last_observation(observation.content): + return True + return False + + return True + + def _launch_setup(self, command: Union[str, List[str]], shell: bool = False): + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") + + if not command: + raise Exception("Empty command to launch.") + + if not shell and isinstance(command, str) and len(command.split()) > 1: + logger.warning("Command should be a list of strings. Now it is a string. Will split it by space.") + command = command.split() + + payload = json.dumps({"command": command, "shell": shell}) + headers = {"Content-Type": "application/json"} + + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + logger.info("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/launch") + response = requests.post(self.http_server + "/setup" + "/launch", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error("Failed to launch application. Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + ensure_launch_retries = 0 + while not self._ensure_launch_command_finish(command): + logger.info("Waiting for command to finish...") + time.sleep(self.additional_wait_time) + ensure_launch_retries += 1 + if ensure_launch_retries >= 10: + raise Exception("Failed to ensure launch command finish after multiple retries") + + # Sleep additional time to ensure window is launched + if isinstance(command, list): + command = ' '.join(command) + for key_commands in ['google-chrome', 'code', 'libreoffice', 'nautilus', 'vlc', 'thunderbird']: + if key_commands in command: + time.sleep(self.additional_wait_time) + break + + def _execute_setup( + self, + command: List[str], + stdout: str = "", + stderr: str = "", + shell: bool = False, + until: Optional[Dict[str, Any]] = None + ): + if not command: + raise Exception("Empty command to launch.") + + until: Dict[str, Any] = until or {} + terminates: bool = False + nb_failings = 0 + + def replace_screen_env_in_command(command): + password = 
self.client_password + width = self.screen_width + height = self.screen_height + width_half = str(width // 2) + height_half = str(height // 2) + new_command_list = [] + new_command = "" + if isinstance(command, str): + new_command = command.replace("{CLIENT_PASSWORD}", password) + new_command = new_command.replace("{SCREEN_WIDTH_HALF}", width_half) + new_command = new_command.replace("{SCREEN_HEIGHT_HALF}", height_half) + new_command = new_command.replace("{SCREEN_WIDTH}", str(width)) + new_command = new_command.replace("{SCREEN_HEIGHT}", str(height)) + return new_command + else: + for item in command: + item = item.replace("{CLIENT_PASSWORD}", password) + item = item.replace("{SCREEN_WIDTH_HALF}", width_half) + item = item.replace("{SCREEN_HEIGHT_HALF}", height_half) + item = item.replace("{SCREEN_WIDTH}", str(width)) + item = item.replace("{SCREEN_HEIGHT}", str(height)) + new_command_list.append(item) + return new_command_list + command = replace_screen_env_in_command(command) + payload = json.dumps({"command": command, "shell": shell}) + headers = {"Content-Type": "application/json"} + # Runtime is required + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + # Execute using runtime + while not terminates: + try: + response = requests.post(self.http_server + "/setup" + "/execute", headers=headers, data=payload) + if response.status_code == 200: + results: Dict[str, str] = response.json() + if stdout: + with open(os.path.join(self.cache_dir, stdout), "w") as f: + f.write(results["output"]) + if stderr: + with open(os.path.join(self.cache_dir, stderr), "w") as f: + f.write(results["error"]) + logger.info("Command executed successfully: %s -> %s" + , " ".join(command) if isinstance(command, list) else command + , response.text + ) + else: + logger.error("Failed to launch application. 
Status code: %s", response.text) + results = None + nb_failings += 1 + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + traceback.print_exc() + + results = None + nb_failings += 1 + + if len(until) == 0: + terminates = True + elif results is not None: + terminates = "returncode" in until and results["returncode"] == until["returncode"] \ + or "stdout" in until and until["stdout"] in results["output"] \ + or "stderr" in until and until["stderr"] in results["error"] + terminates = terminates or nb_failings >= 5 + if not terminates: + time.sleep(0.3) + + def _execute_with_verification_setup( + self, + command: List[str], + verification: Dict[str, Any] = None, + max_wait_time: int = 10, + check_interval: float = 1.0, + shell: bool = False + ): + """Execute command with verification of results + + Args: + command: Command to execute + verification: Dict with verification criteria: + - window_exists: Check if window with this name exists + - command_success: Execute this command and check if it succeeds + max_wait_time: Maximum time to wait for verification + check_interval: Time between verification checks + shell: Whether to use shell + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") + + if not command: + raise Exception("Empty command to launch.") + + verification = verification or {} + + payload = json.dumps({ + "command": command, + "shell": shell, + "verification": verification, + "max_wait_time": max_wait_time, + "check_interval": check_interval + }) + headers = {"Content-Type": "application/json"} + + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/execute_with_verification", + headers=headers, data=payload, timeout=max_wait_time + 10) + if response.status_code == 200: + result = response.json() + logger.info("Command executed and verified successfully: %s -> %s" + , " ".join(command) if isinstance(command, list) else command + , response.text + ) + return result + else: + logger.error("Failed to execute with verification. Status code: %s", response.text) + raise Exception(f"Command verification failed: {response.text}") + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + traceback.print_exc() + raise Exception(f"Request failed: {e}") + + def _command_setup(self, command: List[str], **kwargs): + self._execute_setup(command, **kwargs) + + def _sleep_setup(self, seconds: float): + time.sleep(seconds) + + def _act_setup(self, action_seq: List[Union[Dict[str, Any], str]]): + # TODO + raise NotImplementedError() + + def _replay_setup(self, trajectory: str): + """ + Args: + trajectory (str): path to the replay trajectory file + """ + + # TODO + raise NotImplementedError() + + def _activate_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False): + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") + + if not window_name: + raise Exception(f"Setup Open - Invalid path ({window_name}).") + + payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to open file + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error(f"Failed to activate window {window_name}. Status code: %s", response.text) + time.sleep(self.additional_wait_time) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + def _close_window_setup(self, window_name: str, strict: bool = False, by_class: bool = False): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + if not window_name: + raise Exception(f"Setup Open - Invalid path ({window_name}).") + + payload = json.dumps({"window_name": window_name, "strict": strict, "by_class": by_class}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to open file + # Note: This uses a custom /setup endpoint, not a standard OSWorld method + try: + response = requests.post(self.http_server + "/setup" + "/close_window", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error(f"Failed to close window {window_name}. 
Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + # Chrome setup + async def _chrome_open_tabs_setup(self, urls_to_open: List[str]): + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + host = self.vm_ip + port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file + + remote_debugging_url = f"http://{host}:{port}" + logger.info("Connect to Chrome @: %s", remote_debugging_url) + logger.debug("PLAYWRIGHT ENV: %s", repr(os.environ)) + for attempt in range(15): + if attempt > 0: + time.sleep(5) + + browser = None + async with async_playwright() as p: + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + # break + except Exception as e: + if attempt < 14: + logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}") + # time.sleep(10) + continue + else: + logger.error(f"Failed to connect after multiple attempts: {e}") + raise e + + if not browser: + return + + logger.info("Opening %s...", urls_to_open) + for i, url in enumerate(urls_to_open): + # Use the first context (which should be the only one if using default profile) + if i == 0: + context = browser.contexts[0] + + page = await context.new_page() # Create a new page (tab) within the existing context + try: + await page.goto(url, timeout=60000) + except: + logger.warning("Opening %s exceeds time limit", url) # only for human test + logger.info(f"Opened tab {i + 1}: {url}") + + if i == 0: + # clear the default tab + default_page = context.pages[0] + await default_page.close() + + # Do not close the context or browser; they will remain open after script ends + return browser, context + + async def _chrome_close_tabs_setup(self, urls_to_close: List[str]): + if not self.runtime: + raise Exception("Runtime is required for SetupController. 
Please provide a runtime object.") + + time.sleep(5) # Wait for Chrome to finish launching + + host = self.vm_ip + port = self.chromium_port # fixme: this port is hard-coded, need to be changed from config file + + remote_debugging_url = f"http://{host}:{port}" + async with async_playwright() as p: + browser = None + for attempt in range(15): + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + break + except Exception as e: + if attempt < 14: + logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. Error: {e}") + time.sleep(5) + else: + logger.error(f"Failed to connect after multiple attempts: {e}") + raise e + + if not browser: + return + + for i, url in enumerate(urls_to_close): + # Use the first context (which should be the only one if using default profile) + if i == 0: + context = browser.contexts[0] + + for page in context.pages: + + # if two urls are the same, close the tab + if compare_urls(page.url, url): + context.pages.pop(context.pages.index(page)) + await page.close() + logger.info(f"Closed tab {i + 1}: {url}") + break + + # Do not close the context or browser; they will remain open after script ends + return browser, context + + # google drive setup + def _googledrive_setup(self, **config): + """ Clean google drive space (eliminate the impact of previous experiments to reset the environment) + @args: + config(Dict[str, Any]): contain keys + settings_file(str): path to google drive settings file, which will be loaded by pydrive.auth.GoogleAuth() + operation(List[str]): each operation is chosen from ['delete', 'upload'] + args(List[Dict[str, Any]]): parameters for each operation + different args dict for different operations: + for delete: + query(str): query pattern string to search files or folder in google drive to delete, please refer to + https://developers.google.com/drive/api/guides/search-files?hl=en about how to write query string. + trash(bool): whether to delete files permanently or move to trash. 
By default, trash=false, completely delete it. + for mkdirs: + path(List[str]): the path in the google drive to create folder + for upload: + path(str): remote url to download file + dest(List[str]): the path in the google drive to store the downloaded file + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + settings_file = config.get('settings_file', 'evaluation_examples/settings/googledrive/settings.yml') + gauth = GoogleAuth(settings_file=settings_file) + drive = GoogleDrive(gauth) + + def mkdir_in_googledrive(paths: List[str]): + paths = [paths] if type(paths) != list else paths + parent_id = 'root' + for p in paths: + q = f'"{parent_id}" in parents and title = "{p}" and mimeType = "application/vnd.google-apps.folder" and trashed = false' + folder = drive.ListFile({'q': q}).GetList() + if len(folder) == 0: # not exists, create it + parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]} + file = drive.CreateFile({'title': p, 'mimeType': 'application/vnd.google-apps.folder', **parents}) + file.Upload() + parent_id = file['id'] + else: + parent_id = folder[0]['id'] + return parent_id + + for oid, operation in enumerate(config['operation']): + if operation == 'delete': # delete a specific file + # query pattern string, by default, remove all files/folders not in the trash to the trash + params = config['args'][oid] + q = params.get('query', '') + trash = params.get('trash', False) + q_file = f"( {q} ) and mimeType != 'application/vnd.google-apps.folder'" if q.strip() else "mimeType != 'application/vnd.google-apps.folder'" + filelist: GoogleDriveFileList = drive.ListFile({'q': q_file}).GetList() + q_folder = f"( {q} ) and mimeType = 'application/vnd.google-apps.folder'" if q.strip() else "mimeType = 'application/vnd.google-apps.folder'" + folderlist: GoogleDriveFileList = drive.ListFile({'q': q_folder}).GetList() + for file in filelist: # first delete file, then folder + 
file: GoogleDriveFile + if trash: + file.Trash() + else: + file.Delete() + for folder in folderlist: + folder: GoogleDriveFile + # note that, if a folder is trashed/deleted, all files and folders in it will be trashed/deleted + if trash: + folder.Trash() + else: + folder.Delete() + elif operation == 'mkdirs': + params = config['args'][oid] + mkdir_in_googledrive(params['path']) + elif operation == 'upload': + params = config['args'][oid] + url = params['url'] + with tempfile.NamedTemporaryFile(mode='wb', delete=False) as tmpf: + response = requests.get(url, stream=True) + response.raise_for_status() + for chunk in response.iter_content(chunk_size=8192): + if chunk: + tmpf.write(chunk) + tmpf.close() + paths = [params['path']] if params['path'] != list else params['path'] + parent_id = mkdir_in_googledrive(paths[:-1]) + parents = {} if parent_id == 'root' else {'parents': [{'id': parent_id}]} + file = drive.CreateFile({'title': paths[-1], **parents}) + file.SetContentFile(tmpf.name) + file.Upload() + return + else: + raise ValueError('[ERROR]: not implemented clean type!') + + async def _login_setup(self, **config): + """ Login to a website with account and password information. + @args: + config(Dict[str, Any]): contain keys + settings_file(str): path to the settings file + platform(str): platform to login, implemented platforms include: + googledrive: https://drive.google.com/drive/my-drive + + """ + if not self.runtime: + raise Exception("Runtime is required for SetupController. Please provide a runtime object.") + + host = self.vm_ip + port = self.chromium_port + + remote_debugging_url = f"http://{host}:{port}" + async with async_playwright() as p: + browser = None + for attempt in range(15): + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + break + except Exception as e: + if attempt < 14: + logger.error(f"Attempt {attempt + 1}: Failed to connect, retrying. 
Error: {e}") + time.sleep(5) + else: + logger.error(f"Failed to connect after multiple attempts: {e}") + raise e + if not browser: + return + + context = browser.contexts[0] + platform = config['platform'] + + if platform == 'googledrive': + url = 'https://drive.google.com/drive/my-drive' + page = await context.new_page() # Create a new page (tab) within the existing context + try: + await page.goto(url, timeout=60000) + except: + logger.warning("Opening %s exceeds time limit", url) # only for human test + logger.info(f"Opened new page: {url}") + settings = json.load(open(config['settings_file'])) + email, password = settings['email'], settings['password'] + + try: + await page.wait_for_selector('input[type="email"]', state="visible", timeout=3000) + await page.fill('input[type="email"]', email) + await page.click('#identifierNext > div > button') + await page.wait_for_selector('input[type="password"]', state="visible", timeout=5000) + await page.fill('input[type="password"]', password) + await page.click('#passwordNext > div > button') + await page.wait_for_load_state('load', timeout=5000) + except TimeoutError: + logger.info('[ERROR]: timeout when waiting for google drive login page to load!') + return + + else: + raise NotImplementedError + + return browser, context + + def execute_python_command(self, command: str): + """ + Taken from OSWorld/desktop_env/controllers/python.py + Executes a python command on the server. + It can be used to execute the pyautogui commands, or... any other python command. who knows? 
+ """ + pkgs_prefix = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}" + command_list = ["python", "-c", pkgs_prefix.format(command=command)] + payload = json.dumps({"command": command_list, "shell": False}) + + for _ in range(3): + try: + response = requests.post(self.http_server + "/execute", headers={'Content-Type': 'application/json'}, + data=payload, timeout=90) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + return response.json() + else: + logger.error("Failed to execute command. Status code: %d", response.status_code) + logger.info("Retrying to execute command.") + except requests.exceptions.ReadTimeout: + break + except Exception as e: + logger.error("An error occurred while trying to execute the command: %s", e) + logger.info("Retrying to execute command.") + time.sleep(5) + + logger.error("Failed to execute command.") + return None + def _update_browse_history_setup(self, **config): + cache_path = os.path.join(self.cache_dir, "history_new.sqlite") + db_url = "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938/history_empty.sqlite?download=true" + if not os.path.exists(cache_path): + max_retries = 3 + downloaded = False + e = None + for i in range(max_retries): + try: + response = requests.get(db_url, stream=True) + response.raise_for_status() + + with open(cache_path, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + logger.info("File downloaded successfully") + downloaded = True + break + + except requests.RequestException as e: + logger.error( + f"Failed to download {db_url} caused by {e}. Retrying... ({max_retries - i - 1} attempts left)") + if not downloaded: + raise requests.RequestException(f"Failed to download {db_url}. No retries left. 
Error: {e}") + else: + logger.info("File already exists in cache directory") + # copy a new history file in the tmp folder + with tempfile.TemporaryDirectory() as tmp_dir: + db_path = os.path.join(tmp_dir, "history_empty.sqlite") + shutil.copy(cache_path, db_path) + + history = config['history'] + + for history_item in history: + url = history_item['url'] + title = history_item['title'] + visit_time = datetime.now() - timedelta(seconds=history_item['visit_time_from_now_in_seconds']) + + # Chrome use ms from 1601-01-01 as timestamp + epoch_start = datetime(1601, 1, 1) + chrome_timestamp = int((visit_time - epoch_start).total_seconds() * 1000000) + + conn = sqlite3.connect(db_path) + cursor = conn.cursor() + + cursor.execute(''' + INSERT INTO urls (url, title, visit_count, typed_count, last_visit_time, hidden) + VALUES (?, ?, ?, ?, ?, ?) + ''', (url, title, 1, 0, chrome_timestamp, 0)) + + url_id = cursor.lastrowid + + cursor.execute(''' + INSERT INTO visits (url, visit_time, from_visit, transition, segment_id, visit_duration) + VALUES (?, ?, ?, ?, ?, ?) 
+ ''', (url_id, chrome_timestamp, 0, 805306368, 0, 0)) + + conn.commit() + conn.close() + + logger.info('Fake browsing history added successfully.') + + # get the path of the history file according to the platform + os_type = self.runtime.os_type + + if os_type == 'windows': + chrome_history_path = self.execute_python_command( + """import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[ + 'output'].strip() + elif os_type == 'darwin': + chrome_history_path = self.execute_python_command( + """import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[ + 'output'].strip() + elif os_type == 'linux': + if "arm" in platform.machine(): + chrome_history_path = self.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[ + 'output'].strip() + else: + chrome_history_path = self.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[ + 'output'].strip() + else: + raise Exception('Unsupported operating system') + + form = MultipartEncoder({ + "file_path": chrome_history_path, + "file_data": (os.path.basename(chrome_history_path), open(db_path, "rb")) + }) + headers = {"Content-Type": form.content_type} + logger.debug(form.content_type) + + # send request to server to upload file + try: + logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload") + response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error("Failed to upload file. 
Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + + self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True) diff --git a/openhands/nvidia/os_world/evaluate.py b/openhands/nvidia/os_world/evaluate.py new file mode 100644 index 000000000..85ae63df4 --- /dev/null +++ b/openhands/nvidia/os_world/evaluate.py @@ -0,0 +1,153 @@ +from typing import Any, Union, Dict, List +import inspect + +from openhands.nvidia.os_world.controllers.python import PythonController +from openhands.nvidia.os_world import metrics, getters +from openhands.core.logger import openhands_logger as logger + +class Evaluator: + def __init__(self, task_config: Dict[str, Any], controller): + self.setup_controller = controller + self.vm_ip = controller.vm_ip + self.server_port = controller.server_port + self.chromium_port = controller.chromium_port + self.vlc_port = controller.vlc_port + self.http_server = controller.http_server + self.http_server_setup_root = controller.http_server_setup_root + self.cache_dir = controller.cache_dir + self.client_password = controller.client_password + self.screen_width = controller.screen_width + self.screen_height = controller.screen_height + + # Assume Linux platform for OS World VMs + # TODO: get from runtime/controller, mismatch initial letter is lowercase + self.vm_platform = 'Linux' + + self.controller = PythonController(self.vm_ip, self.server_port) + self._set_evaluator_info(task_config) + + def _set_evaluator_info(self, task_config: Dict[str, Any]): + """Set evaluator information from task config""" + # evaluator dict + # func -> metric function string, or list of metric function strings + # conj -> conjunction of multiple metrics if func is a list with length > 1, "and"/"or" + # result -> result getter config, or list of result getter configs + # expected (optional) -> expected getter config, or 
list of expected getter configs + # options (optional) -> metric options, or list of metric options + # if func is a str list, then result, expected (if exists), options (if exists) should also be lists of the same length + # even if one of the metrics does not need expected or options field, it should be included in the list with None + self.evaluator = task_config["evaluator"] + self.metric = [getattr(metrics, func) for func in self.evaluator["func"]] \ + if isinstance(self.evaluator["func"], list) \ + else getattr(metrics, self.evaluator["func"]) + self.metric_conj: str = self.evaluator.get("conj", "and") # take conjunction of multiple metrics + if "result" in self.evaluator and len(self.evaluator["result"]) > 0: + self.result_getter = [getattr(getters, "get_{:}".format(res["type"])) for res in + self.evaluator["result"]] \ + if isinstance(self.evaluator["result"], list) \ + else getattr(getters, "get_{:}".format(self.evaluator["result"]["type"])) + else: + self.result_getter = [None] * len(self.metric) \ + if isinstance(self.metric, list) \ + else None + + if "expected" in self.evaluator and len(self.evaluator["expected"]) > 0: + self.expected_getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in + self.evaluator["expected"]] \ + if isinstance(self.evaluator["expected"], list) \ + else getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"])) + else: + self.expected_getter = [None] * len(self.metric) \ + if isinstance(self.metric, list) \ + else None + self.metric_options: Union[List[Dict[str, Any]], Dict[str, Any]] = [opt if opt else {} for opt in + self.evaluator["options"]] \ + if isinstance(self.evaluator.get("options", {}), list) \ + else self.evaluator["options"] \ + if "options" in self.evaluator \ + else [{}] * len(self.metric) \ + if isinstance(self.metric, list) \ + else {} + + assert (not isinstance(self.evaluator["func"], list) + or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) 
== len( + self.metric_options))) + + async def evaluate(self, action_history = []): + """ + Evaluate whether the task is successfully completed. + """ + + postconfig = self.evaluator.get("postconfig", []) + await self.setup_controller.setup(postconfig) + + # TODO: special handling for infeasible tasks, might be different to OpenHands + if self.evaluator['func'] == "infeasible": + if len(action_history) > 0: + last_action = action_history[-1] + if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'): + return 1 + return 0 + else: + if len(action_history) > 0: + last_action = action_history[-1] + if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'): + return 0 + + if type(self.metric) == list: + # Multiple metrics to evaluate whether the task is successfully completed + results = [] + assert len(self.metric) == len(self.result_getter), "The number of metrics and result getters must be the same" + if "expected" in self.evaluator: + assert len(self.metric) == len(self.expected_getter), "The number of metrics and expected getters must be the same" + for idx, metric in enumerate(self.metric): + try: + config = self.evaluator["result"][idx] + + # Check if it's a coroutine function and await if necessary + if inspect.iscoroutinefunction(self.result_getter[idx]): + result_state = await self.result_getter[idx](self, config) + else: + result_state = self.result_getter[idx](self, config) + + + except FileNotFoundError: + logger.error("File not found!") + if self.metric_conj == 'and': + return 0 + + if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]: + expected_state = self.expected_getter[idx](self, self.evaluator["expected"][idx]) + metric: int = metric(result_state, expected_state, **self.metric_options[idx]) + else: + metric: int = metric(result_state, **self.metric_options[idx]) + + if self.metric_conj == 'and' and float(metric) == 0.0: + return 
0 + elif self.metric_conj == 'or' and float(metric) == 1.0: + return 1 + else: + results.append(metric) + + return sum(results) / len(results) if self.metric_conj == 'and' else max(results) + else: + # Single metric to evaluate whether the task is successfully completed + try: + + # Check if it's a coroutine function and await if necessary + if inspect.iscoroutinefunction(self.result_getter): + result_state = await self.result_getter(self, self.evaluator["result"]) + else: + result_state = self.result_getter(self, self.evaluator["result"]) + + except FileNotFoundError: + logger.error("File not found!") + return 0 + + if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]: + expected_state = self.expected_getter(self, self.evaluator["expected"]) + metric: float = self.metric(result_state, expected_state, **self.metric_options) + else: + metric: float = self.metric(result_state, **self.metric_options) + + return metric \ No newline at end of file diff --git a/openhands/nvidia/os_world/getters/__init__.py b/openhands/nvidia/os_world/getters/__init__.py new file mode 100644 index 000000000..e966d9f0c --- /dev/null +++ b/openhands/nvidia/os_world/getters/__init__.py @@ -0,0 +1,42 @@ +from .chrome import ( + get_default_search_engine, + get_cookie_data, + get_bookmarks, + get_open_tabs_info, + get_pdf_from_url, + get_shortcuts_on_desktop, + get_history, + get_page_info, + get_enabled_experiments, + get_chrome_language, + get_chrome_font_size, + get_profile_name, + get_number_of_search_results, + get_googledrive_file, + get_active_tab_info, + get_enable_do_not_track, + get_enable_enhanced_safety_browsing, + get_enable_safe_browsing, + get_new_startup_page, + get_find_unpacked_extension_path, + get_data_delete_automacally, + get_active_tab_html_parse, + get_active_tab_url_parse, + get_gotoRecreationPage_and_get_html_content, + get_url_dashPart, + get_active_url_from_accessTree, + get_find_installed_extension_name, + get_info_from_website, + 
get_macys_product_url_parse, + get_url_path_parse # Alias for backward compatibility +) +from .file import get_cloud_file, get_vm_file, get_cache_file, get_content_from_vm_file +from .general import get_vm_command_line, get_vm_terminal_output, get_vm_command_error +from .gimp import get_gimp_config_file +from .impress import get_audio_in_slide, get_background_image_in_slide +from .info import get_vm_screen_size, get_vm_window_size, get_vm_wallpaper, get_list_directory +from .misc import get_rule, get_accessibility_tree, get_rule_relativeTime, get_time_diff_range +from .replay import get_replay +from .vlc import get_vlc_playing_info, get_vlc_config, get_default_video_player +from .vscode import get_vscode_config +from .calc import get_conference_city_in_order diff --git a/openhands/nvidia/os_world/getters/calc.py b/openhands/nvidia/os_world/getters/calc.py new file mode 100644 index 000000000..81e11752a --- /dev/null +++ b/openhands/nvidia/os_world/getters/calc.py @@ -0,0 +1,15 @@ +import csv + + +# I want to write a function, reads a csv file, and get all the contents in the third column in the order of rows +def get_conference_city_in_order(env, config): + # read the csv file + csv_path = config['csv_path'] + print(f"Reading csv file from {csv_path}") + with open(csv_path, 'r') as f: + reader = csv.reader(f) + # skip the header row + next(reader) + # get the third column in the order of rows + conference_city_list = [row[2] for row in reader] + return conference_city_list diff --git a/openhands/nvidia/os_world/getters/chrome.py b/openhands/nvidia/os_world/getters/chrome.py new file mode 100644 index 000000000..cd2acd906 --- /dev/null +++ b/openhands/nvidia/os_world/getters/chrome.py @@ -0,0 +1,2429 @@ +import json +import os +import platform +import sqlite3 +import time +import asyncio +from urllib.parse import unquote +from typing import Dict, Any, List +from urllib.parse import urlparse, parse_qs + +import lxml.etree +import requests +from lxml.cssselect import 
import json
import os
import platform
import sqlite3
import time
import asyncio
from urllib.parse import unquote
from typing import Dict, Any, List
from urllib.parse import urlparse, parse_qs

import lxml.etree
import requests
from lxml.cssselect import CSSSelector
from lxml.etree import _Element
from playwright.async_api import async_playwright
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive, GoogleDriveFileList, GoogleDriveFile

from openhands.core.logger import openhands_logger as logger

# AT-SPI namespace map used by the CSS selectors that query the
# accessibility tree (see get_active_url_from_accessTree below).
_accessibility_ns_map = {
    "st": "uri:deskat:state.at-spi.gnome.org",
    "attr": "uri:deskat:attributes.at-spi.gnome.org",
    "cp": "uri:deskat:component.at-spi.gnome.org",
    "doc": "uri:deskat:document.at-spi.gnome.org",
    "docattr": "uri:deskat:attributes.document.at-spi.gnome.org",
    "txt": "uri:deskat:text.at-spi.gnome.org",
    "val": "uri:deskat:value.at-spi.gnome.org",
    "act": "uri:deskat:action.at-spi.gnome.org"
}

"""
WARNING:
1. Functions from this script assume that no account is registered on Chrome, otherwise the default file path needs to be changed.
2. The functions are not tested on Windows and Mac, but they should work.
"""


async def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
    """ Get information from a website. Especially useful when the information may be updated through time.
    Args:
        env (Any): The environment object.
        config (Dict[Any, Any]): The configuration dictionary.
            - url (str): The URL of the website to visit
            - infos (List[Dict[str, str]]): The list of information to be extracted from the website. Each dictionary contains:
                - action (str): chosen from 'inner_text', 'attribute', 'click_and_inner_text', 'click_and_attribute', etc., concretely,
                    - inner_text: extract the inner text of the element specified by the selector
                    - attribute: extract the attribute of the element specified by the selector
                    - click_and_inner_text: click elements following the selector and then extract the inner text of the last element
                    - click_and_attribute: click elements following the selector and then extract the attribute of the last element
                - selector (Union[str, List[str]]): The CSS selector(s) of the element(s) to be extracted.
                - attribute (str): optional for 'attribute' and 'click_and_attribute', the attribute to be extracted.
            - backups (Any): The backup information to be returned if the extraction fails.
    """
    # Log what we are about to do (comments translated from Chinese).
    logger.info(f"[INFO_FROM_WEBSITE] Starting to get information from website: {config.get('url', 'N/A')}")
    logger.info(f"[INFO_FROM_WEBSITE] Total info operations to perform: {len(config.get('infos', []))}")
    logger.debug(f"[INFO_FROM_WEBSITE] Full config: {config}")

    try:
        host = env.vm_ip
        port = env.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
        server_port = env.server_port
        remote_debugging_url = f"http://{host}:{port}"
        backend_url = f"http://{host}:{server_port}"
        use_proxy = env.current_use_proxy

        logger.info(f"[INFO_FROM_WEBSITE] Connecting to Chrome at {remote_debugging_url}")

        async with async_playwright() as p:
            # Connect to the remote Chrome instance that is already running in the VM.
            try:
                browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to existing Chrome instance")
            except Exception as e:
                logger.warning(f"[INFO_FROM_WEBSITE] Failed to connect to existing Chrome instance: {e}")
                logger.info(f"[INFO_FROM_WEBSITE] Starting new Chrome instance...")

                # If the connection fails (e.g., the agent closed the browser
                # instance), start a new browser instance via the backend.
                app = 'chromium' if 'arm' in platform.machine() else 'google-chrome'
                command = [
                    app,
                    "--remote-debugging-port=1337"
                ]
                if use_proxy:
                    command.append(f"--proxy-server=127.0.0.1:18888")
                    logger.info(f"[INFO_FROM_WEBSITE] Using proxy server: 127.0.0.1:18888")

                logger.info(f"[INFO_FROM_WEBSITE] Starting browser with command: {' '.join(command)}")
                payload = json.dumps({"command": command, "shell": False})
                headers = {"Content-Type": "application/json"}
                requests.post(backend_url + "/setup" + "/launch", headers=headers, data=payload)
                await asyncio.sleep(5)
                browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to new Chrome instance")

            page = await browser.new_page()
            logger.info(f"[INFO_FROM_WEBSITE] Created new page, navigating to: {config['url']}")

            await page.goto(config["url"])
            await page.wait_for_load_state('load')

            # Log page state once the initial navigation settles.
            logger.info(f"[INFO_FROM_WEBSITE] Page loaded successfully")
            logger.info(f"[INFO_FROM_WEBSITE] Page title: '{await page.title()}'")
            logger.info(f"[INFO_FROM_WEBSITE] Current URL: '{page.url}'")

            infos = []
            for idx, info_dict in enumerate(config.get('infos', [])):
                logger.info(f"[INFO_FROM_WEBSITE] Processing info operation {idx + 1}/{len(config.get('infos', []))}")
                logger.debug(f"[INFO_FROM_WEBSITE] Info config: {info_dict}")

                # Click actions may have navigated away; go back to the start URL.
                if page.url != config["url"]:
                    logger.info(f"[INFO_FROM_WEBSITE] Page URL changed, navigating back to: {config['url']}")
                    await page.goto(config["url"])
                    await page.wait_for_load_state('load')
                    logger.info(f"[INFO_FROM_WEBSITE] Back to original page")

                action = info_dict.get('action', 'inner_text')
                selector = info_dict.get('selector')
                logger.info(f"[INFO_FROM_WEBSITE] Action: {action}, Selector: {selector}")

                if action == "inner_text":
                    logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}")
                    ele = await page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
                    extracted_text = await ele.inner_text()
                    logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text: '{extracted_text}'")
                    infos.append(extracted_text)

                elif action == "attribute":
                    attribute = info_dict.get('attribute')
                    logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}")
                    logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute: {attribute}")
                    ele = await page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
                    extracted_attr = await ele.get_attribute(info_dict['attribute'])
                    logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}': '{extracted_attr}'")
                    infos.append(extracted_attr)

                elif action == 'click_and_inner_text':
                    logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_inner_text with {len(info_dict['selector'])} selectors")
                    # BUG FIX: the inner loop variable used to be `idx`,
                    # shadowing the outer operation counter and corrupting the
                    # "Completed info operation" log below. Renamed to sel_idx.
                    for sel_idx, sel in enumerate(info_dict['selector']):
                        logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {sel_idx + 1}/{len(info_dict['selector'])}: {sel}")
                        if sel_idx != len(info_dict['selector']) - 1:
                            logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}")
                            link = await page.wait_for_selector(sel, state='attached', timeout=10000)
                            await link.click()
                            await page.wait_for_load_state('load')
                            logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded")
                            logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}")
                        else:
                            logger.debug(f"[INFO_FROM_WEBSITE] Extracting inner_text from final element: {sel}")
                            ele = await page.wait_for_selector(sel, state='attached', timeout=10000)
                            extracted_text = await ele.inner_text()
                            logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text after clicks: '{extracted_text}'")
                            infos.append(extracted_text)

                elif action == 'click_and_attribute':
                    attribute = info_dict.get('attribute')
                    logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_attribute with {len(info_dict['selector'])} selectors")
                    logger.debug(f"[INFO_FROM_WEBSITE] Target attribute: {attribute}")
                    # Same shadowing fix as click_and_inner_text above.
                    for sel_idx, sel in enumerate(info_dict['selector']):
                        logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {sel_idx + 1}/{len(info_dict['selector'])}: {sel}")
                        if sel_idx != len(info_dict['selector']) - 1:
                            logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}")
                            link = await page.wait_for_selector(sel, state='attached', timeout=10000)
                            await link.click()
                            await page.wait_for_load_state('load')
                            logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded")
                            logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}")
                        else:
                            logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute from final element: {sel}")
                            # CONSISTENCY FIX: this wait was the only one
                            # without an explicit timeout; aligned to 10000ms.
                            ele = await page.wait_for_selector(sel, state='attached', timeout=10000)
                            extracted_attr = await ele.get_attribute(info_dict['attribute'])
                            logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}' after clicks: '{extracted_attr}'")
                            infos.append(extracted_attr)
                else:
                    logger.error(f"[INFO_FROM_WEBSITE] Unsupported action: {action}")
                    raise NotImplementedError(f'The action {action} is not supported yet.')

                logger.info(f"[INFO_FROM_WEBSITE] Completed info operation {idx + 1}")

            # Final summary of everything extracted.
            logger.info(f"[INFO_FROM_WEBSITE] All operations completed successfully")
            logger.info(f"[INFO_FROM_WEBSITE] Total extracted information count: {len(infos)}")
            logger.info(f"[INFO_FROM_WEBSITE] Final extracted information: {infos}")

            return infos
    except Exception as e:
        logger.error(f'[INFO_FROM_WEBSITE] ERROR: Failed to obtain information from website: {config.get("url", "N/A")}')
        logger.error(f'[INFO_FROM_WEBSITE] Exception details: {str(e)}')
        logger.error(f'[INFO_FROM_WEBSITE] Exception type: {type(e).__name__}')
        logger.info(f'[INFO_FROM_WEBSITE] Using backup results instead')
        backup_data = config.get('backups', None)
        logger.info(f'[INFO_FROM_WEBSITE] Backup data: {backup_data}')
        return backup_data
def get_default_search_engine(env, config: Dict[str, str]):
    """Return the short name of Chrome's default search engine in the VM.

    Reads the Chrome/Chromium Preferences JSON via the VM controller and
    looks up default_search_provider_data.template_url_data.short_name.
    Falls back to "Google" on any error.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the Preferences file inside the VM.
    if os_type == 'Windows':
        preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
                                            'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip()
    elif os_type == 'Darwin':
        preference_file_path = env.controller.execute_python_command(
            "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[
            'output'].strip()
    elif os_type == 'Linux':
        # ARM Linux runs snap Chromium; x86 Linux runs google-chrome.
        if "arm" in platform.machine():
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[
                'output'].strip()
        else:
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    try:
        content = env.controller.get_file(preference_file_path)
        data = json.loads(content)

        # The path within the JSON data to the default search engine might vary
        search_engine = data.get('default_search_provider_data', {}).get('template_url_data', {}).get('short_name',
                                                                                                      'Google')
        return search_engine
    except Exception as e:
        logger.error(f"Error: {e}")
        return "Google"


def get_cookie_data(env, config: Dict[str, str]):
    """
    Get the cookies from the Chrome browser.
    Assume the cookies are stored in the default location, not encrypted and not large in size.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the Cookies SQLite database inside the VM.
    if os_type == 'Windows':
        chrome_cookie_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
                                            'Google\\Chrome\\User Data\\Default\\Cookies'))""")['output'].strip()
    elif os_type == 'Darwin':
        chrome_cookie_file_path = env.controller.execute_python_command(
            "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Cookies'))")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            chrome_cookie_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Cookies'))")[
                'output'].strip()
        else:
            chrome_cookie_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Cookies'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    try:
        # Copy the cookie DB out of the VM into the local cache, then query it.
        content = env.controller.get_file(chrome_cookie_file_path)
        _path = os.path.join(env.cache_dir, config["dest"])

        with open(_path, "wb") as f:
            f.write(content)

        conn = sqlite3.connect(_path)
        cursor = conn.cursor()

        # Fetch every cookie row (the old comment about "OpenAI cookies" was a
        # copy-paste leftover; all cookies are returned).
        cursor.execute("SELECT * FROM cookies")
        cookies = cursor.fetchall()
        return cookies
    except Exception as e:
        logger.error(f"Error: {e}")
        return None
def get_history(env, config: Dict[str, str]):
    """Return Chrome browsing history rows (url, title, last_visit_time).

    Copies the History SQLite database out of the VM into the local cache
    (at config["dest"]) and queries the `urls` table. Returns None on error.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the History SQLite database inside the VM.
    if os_type == 'Windows':
        chrome_history_path = env.controller.execute_python_command(
            """import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[
            'output'].strip()
    elif os_type == 'Darwin':
        chrome_history_path = env.controller.execute_python_command(
            """import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            chrome_history_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/History'))")[
                'output'].strip()
        else:
            chrome_history_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    try:
        content = env.controller.get_file(chrome_history_path)
        _path = os.path.join(env.cache_dir, config["dest"])

        with open(_path, "wb") as f:
            f.write(content)

        conn = sqlite3.connect(_path)
        cursor = conn.cursor()

        # Fetch all history rows (the old "OpenAI cookies" comment was a
        # copy-paste leftover from get_cookie_data).
        cursor.execute("SELECT url, title, last_visit_time FROM urls")
        history_items = cursor.fetchall()
        return history_items
    except Exception as e:
        logger.error(f"Error: {e}")
        return None


def get_enabled_experiments(env, config: Dict[str, str]):
    """Return the list of enabled chrome://flags experiments.

    Reads browser.enabled_labs_experiments from Chrome's Local State JSON.
    Returns [] on any error.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the Local State file inside the VM.
    if os_type == 'Windows':
        preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
                                            'Google\\Chrome\\User Data\\Local State'))""")[
            'output'].strip()
    elif os_type == 'Darwin':
        preference_file_path = env.controller.execute_python_command(
            "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Local State'))")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Local State'))")[
                'output'].strip()
        else:
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Local State'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    try:
        content = env.controller.get_file(preference_file_path)
        data = json.loads(content)

        # Look up the enabled experiments list (the old comment about the
        # "default search engine" was a copy-paste leftover).
        enabled_labs_experiments = data.get('browser', {}).get('enabled_labs_experiments', [])
        return enabled_labs_experiments
    except Exception as e:
        logger.error(f"Error: {e}")
        return []


def get_profile_name(env, config: Dict[str, str]):
    """
    Get the profile name from the Chrome browser's Preferences file.
    Assumes the default (single, unregistered) profile location.
    Returns None on any error.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the Preferences file inside the VM.
    if os_type == 'Windows':
        preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
                                            'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip()
    elif os_type == 'Darwin':
        preference_file_path = env.controller.execute_python_command(
            "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[
                'output'].strip()
        else:
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    try:
        content = env.controller.get_file(preference_file_path)
        data = json.loads(content)

        # Look up profile.name (the old "default search engine" comment was a
        # copy-paste leftover).
        profile_name = data.get('profile', {}).get('name', None)
        return profile_name
    except Exception as e:
        logger.error(f"Error: {e}")
        return None
def get_chrome_language(env, config: Dict[str, str]):
    """Return Chrome's UI locale (intl.app_locale from Local State).

    Falls back to "en-US" on any error.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the Local State file inside the VM.
    if os_type == 'Windows':
        preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
                                            'Google\\Chrome\\User Data\\Local State'))""")[
            'output'].strip()
    elif os_type == 'Darwin':
        preference_file_path = env.controller.execute_python_command(
            "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Local State'))")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Local State'))")[
                'output'].strip()
        else:
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Local State'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    try:
        content = env.controller.get_file(preference_file_path)
        data = json.loads(content)

        # Look up the app locale. NOTE(review): the local variable is named
        # `enabled_labs_experiments` by copy-paste from get_enabled_experiments;
        # it actually holds the locale string.
        enabled_labs_experiments = data.get('intl', {}).get('app_locale', "en-US")
        return enabled_labs_experiments
    except Exception as e:
        logger.error(f"Error: {e}")
        return "en-US"


def get_chrome_font_size(env, config: Dict[str, str]):
    """Return Chrome's font-size preferences (webkit.webprefs from Preferences).

    On success returns the webprefs dict (or the documented defaults if the key
    is absent). NOTE(review): the except-path fallback omits
    "minimum_font_size" while the in-try default includes it — confirm which is
    intended.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the Preferences file inside the VM.
    if os_type == 'Windows':
        preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
                                            'Google\\Chrome\\User Data\\Default\\Preferences'))""")[
            'output'].strip()
    elif os_type == 'Darwin':
        preference_file_path = env.controller.execute_python_command(
            "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[
                'output'].strip()
        else:
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    try:
        content = env.controller.get_file(preference_file_path)
        data = json.loads(content)

        # Look up the font preferences (the old "default search engine" comment
        # was a copy-paste leftover; the variable name `search_engine` likewise
        # actually holds the webprefs dict).
        search_engine = data.get('webkit', {}).get('webprefs', {
            "default_fixed_font_size": 13,
            "default_font_size": 16,
            "minimum_font_size": 13
        })
        return search_engine
    except Exception as e:
        logger.error(f"Error: {e}")
        return {
            "default_fixed_font_size": 13,
            "default_font_size": 16
        }


def get_bookmarks(env, config: Dict[str, str]):
    """Return the 'roots' section of Chrome's Bookmarks JSON file.

    Returns [] when the file is empty or could not be read.
    """
    os_type = env.vm_platform
    # Resolve the per-OS path of the Bookmarks file inside the VM.
    if os_type == 'Windows':
        preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
                                            'Google\\Chrome\\User Data\\Default\\Bookmarks'))""")['output'].strip()
    elif os_type == 'Darwin':
        preference_file_path = env.controller.execute_python_command(
            "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Bookmarks'))")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Bookmarks'))")[
                'output'].strip()
        else:
            preference_file_path = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Bookmarks'))")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    content = env.controller.get_file(preference_file_path)
    # Empty/missing file yields an empty list rather than a JSON error.
    if not content:
        return []
    data = json.loads(content)
    bookmarks = data.get('roots', {})
    return bookmarks
# todo: move this to the main.py
def get_extensions_installed_from_shop(env, config: Dict[str, str]):
    """Return the parsed manifest.json of every installed Chrome extension.

    Resolves the per-OS extensions directory via the VM controller, then walks
    <extensions_dir>/<extension_id>/<version>/manifest.json and collects each
    manifest that parses as JSON.

    Args:
        env: Environment object exposing vm_platform and controller.
        config: Unused; kept for the getter interface.

    Returns:
        list[dict]: One parsed manifest per extension version found.

    Raises:
        Exception: If env.vm_platform is not Windows/Darwin/Linux.
    """
    os_type = env.vm_platform
    if os_type == 'Windows':
        # BUG FIX: the command previously was a bare expression without
        # `import os`/`print()`, so the controller captured no output
        # (every sibling getter in this file uses `import os; print(...)`).
        chrome_extension_dir = env.controller.execute_python_command(
            """import os; print(os.path.expanduser('~') + '\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Extensions\\')""")[
            'output'].strip()
    elif os_type == 'Darwin':  # macOS
        chrome_extension_dir = env.controller.execute_python_command(
            """import os; print(os.path.expanduser('~') + '/Library/Application Support/Google/Chrome/Default/Extensions/')""")[
            'output'].strip()
    elif os_type == 'Linux':
        if "arm" in platform.machine():
            # BUG FIX: this branch previously assigned `preference_file_path`,
            # leaving `chrome_extension_dir` undefined (NameError) on ARM Linux.
            chrome_extension_dir = env.controller.execute_python_command(
                "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Extensions/'))")[
                'output'].strip()
        else:
            chrome_extension_dir = env.controller.execute_python_command(
                """import os; print(os.path.expanduser('~') + '/.config/google-chrome/Default/Extensions/')""")[
                'output'].strip()
    else:
        raise Exception('Unsupported operating system')

    manifests = []
    for extension_id in os.listdir(chrome_extension_dir):
        extension_path = os.path.join(chrome_extension_dir, extension_id)
        if os.path.isdir(extension_path):
            # Iterate through version-named subdirectories
            for version_dir in os.listdir(extension_path):
                version_path = os.path.join(extension_path, version_dir)
                manifest_path = os.path.join(version_path, 'manifest.json')
                if os.path.isfile(manifest_path):
                    with open(manifest_path, 'r') as file:
                        try:
                            manifest = json.load(file)
                            manifests.append(manifest)
                        except json.JSONDecodeError:
                            logger.error(f"Error reading {manifest_path}")
    return manifests
# The following ones require Playwright to be installed on the target machine, and the chrome needs to be pre-config on
# port info to allow remote debugging, see README.md for details

async def get_page_info(env, config: Dict[str, str]):
    """Fetch title/url/content of a URL via the VM's remote-debugged Chrome.

    Connects over CDP (starting a fresh browser through the backend /setup/launch
    endpoint if the connection fails), navigates to config["url"], and returns
    {'title', 'url', 'content'}. Retries the whole procedure up to 2 times and
    returns an error placeholder dict instead of raising.
    """
    # Playwright raises its own TimeoutError class; the builtin TimeoutError
    # does not catch it, so import it locally for the except clauses below.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    host = env.vm_ip
    port = env.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
    server_port = env.server_port
    url = config["url"]

    remote_debugging_url = f"http://{host}:{port}"

    # Configuration for retry and timeout
    max_retries = 2
    timeout_ms = 60000  # Increase timeout to 60 seconds

    for attempt in range(max_retries):
        try:
            logger.info(f"[PAGE_INFO] Attempt {attempt + 1}/{max_retries} for URL: {url}")

            async with async_playwright() as p:
                # connect to remote Chrome instance
                try:
                    browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                    logger.info(f"[PAGE_INFO] Successfully connected to existing Chrome instance")
                except Exception as e:
                    logger.warning(f"[PAGE_INFO] Failed to connect to existing Chrome instance: {e}")
                    # If the connection fails, start a new browser instance
                    # (chromium on ARM, google-chrome elsewhere).
                    if "arm" in platform.machine():
                        payload = json.dumps({"command": [
                            "chromium",
                            "--remote-debugging-port=1337"
                        ], "shell": False})
                    else:
                        payload = json.dumps({"command": [
                            "google-chrome",
                            "--remote-debugging-port=1337"
                        ], "shell": False})

                    headers = {"Content-Type": "application/json"}
                    requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload)
                    await asyncio.sleep(5)
                    browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                    logger.info(f"[PAGE_INFO] Successfully connected to new Chrome instance")

                page = await browser.new_page()

                # Set longer timeout for navigation
                page.set_default_timeout(timeout_ms)

                logger.info(f"[PAGE_INFO] Navigating to URL: {url}")
                await page.goto(url, wait_until='networkidle', timeout=timeout_ms)

                try:
                    # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
                    await page.wait_for_load_state('networkidle', timeout=timeout_ms)
                    title = await page.title()
                    # BUG FIX: the final URL used to be assigned to `url`,
                    # shadowing the requested URL that the retry/error paths
                    # report. Use a separate local instead.
                    final_url = page.url
                    page_info = {'title': title, 'url': final_url, 'content': await page.content()}
                    logger.info(f"[PAGE_INFO] Successfully loaded page. Title: '{title}'")
                except (PlaywrightTimeoutError, TimeoutError):
                    # If page loading times out, catch the exception and store the current information in the list
                    logger.warning(f"[PAGE_INFO] Page load timeout for URL: {url}")
                    page_info = {'title': 'Load timeout', 'url': page.url, 'content': await page.content()}
                except Exception as e:
                    # Catch other potential exceptions that might occur while reading the page title
                    logger.error(f'[PAGE_INFO] Error: {e}')
                    page_info = {'title': 'Error encountered', 'url': page.url, 'content': await page.content()}

                await browser.close()
                return page_info

        except Exception as e:
            logger.error(f"[PAGE_INFO] Attempt {attempt + 1} failed: {str(e)}")
            logger.error(f"[PAGE_INFO] Exception type: {type(e).__name__}")

            if attempt < max_retries - 1:
                logger.info(f"[PAGE_INFO] Retrying in 3 seconds...")
                await asyncio.sleep(3)
            else:
                logger.error(f"[PAGE_INFO] All {max_retries} attempts failed. Returning error info.")
                return {'title': 'Connection failed', 'url': url, 'content': ''}

    # This should never be reached, but just in case
    return {'title': 'Unknown error', 'url': url, 'content': ''}
async def get_open_tabs_info(env, config: Dict[str, str]):
    """Return [{'title', 'url'}, ...] for every open tab of the VM's Chrome.

    Connects over CDP (starting a fresh browser through the backend
    /setup/launch endpoint if the connection fails) and inspects every page of
    every browser context. Retries up to 2 times; returns [] on total failure.
    """
    # Playwright raises its own TimeoutError class; the builtin TimeoutError
    # does not catch it, so import it locally for the except clause below.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    host = env.vm_ip
    port = env.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
    server_port = env.server_port

    remote_debugging_url = f"http://{host}:{port}"

    # Configuration for retry and timeout
    max_retries = 2
    timeout_ms = 30000  # 30 seconds for tab info

    for attempt in range(max_retries):
        try:
            logger.info(f"[OPEN_TABS_INFO] Attempt {attempt + 1}/{max_retries}")

            async with async_playwright() as p:
                # connect to remote Chrome instance
                try:
                    browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                    logger.info(f"[OPEN_TABS_INFO] Successfully connected to existing Chrome instance")
                except Exception as e:
                    logger.warning(f"[OPEN_TABS_INFO] Failed to connect to existing Chrome instance: {e}")
                    # If the connection fails, start a new browser instance
                    # (chromium on ARM, google-chrome elsewhere).
                    if "arm" in platform.machine():
                        payload = json.dumps({"command": [
                            "chromium",
                            "--remote-debugging-port=1337"
                        ], "shell": False})
                    else:
                        payload = json.dumps({"command": [
                            "google-chrome",
                            "--remote-debugging-port=1337"
                        ], "shell": False})

                    headers = {"Content-Type": "application/json"}
                    requests.post(f"http://{host}:{server_port}/setup/launch", headers=headers, data=payload)
                    await asyncio.sleep(5)
                    try:
                        browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                        logger.info(f"[OPEN_TABS_INFO] Successfully connected to new Chrome instance")
                    except Exception as e:
                        logger.error(f"[OPEN_TABS_INFO] Failed to connect to new Chrome instance: {e}")
                        return []

                tabs_info = []
                for context in browser.contexts:
                    for page in context.pages:
                        try:
                            # Set timeout for each page
                            page.set_default_timeout(timeout_ms)

                            # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
                            await page.wait_for_load_state('networkidle', timeout=timeout_ms)
                            title = await page.title()
                            url = page.url
                            tabs_info.append({'title': title, 'url': url})
                            logger.info(f"[OPEN_TABS_INFO] Tab info: '{title}' -> {url}")
                        except (PlaywrightTimeoutError, TimeoutError):
                            # If page loading times out, catch the exception and store the current information in the list
                            logger.warning(f"[OPEN_TABS_INFO] Tab load timeout for URL: {page.url}")
                            tabs_info.append({'title': 'Load timeout', 'url': page.url})
                        except Exception as e:
                            # Catch other potential exceptions that might occur while reading the page title
                            logger.error(f'[OPEN_TABS_INFO] Error reading tab info: {e}')
                            tabs_info.append({'title': 'Error encountered', 'url': page.url})

                await browser.close()
                logger.info(f"[OPEN_TABS_INFO] Successfully retrieved info for {len(tabs_info)} tabs")
                return tabs_info

        except Exception as e:
            logger.error(f"[OPEN_TABS_INFO] Attempt {attempt + 1} failed: {str(e)}")
            logger.error(f"[OPEN_TABS_INFO] Exception type: {type(e).__name__}")

            if attempt < max_retries - 1:
                logger.info(f"[OPEN_TABS_INFO] Retrying in 3 seconds...")
                await asyncio.sleep(3)
            else:
                logger.error(f"[OPEN_TABS_INFO] All {max_retries} attempts failed. Returning empty list.")
                return []

    # This should never be reached, but just in case
    return []
def get_active_url_from_accessTree(env, config):
    """
    Playwright cannot get the url of active tab directly,
    so we need to use accessibility tree to get the active tab info.
    This function is used to get the active tab url from the accessibility tree.
    config:
        Dict[str, str]{
            'goto_prefix':
                the prefix you want to add to the beginning of the url to be opened, default is "https://",
                (the url we get from accTree does not have prefix)
            ...(other keys, not used in this function)
        }
    Return
        url: str, or None on any failure.
    """
    # Ensure the controller and its method are accessible and return a valid result
    if hasattr(env, 'controller') and callable(getattr(env.controller, 'get_accessibility_tree', None)):
        accessibility_tree = env.controller.get_accessibility_tree()
        if accessibility_tree is None:
            print("Failed to get the accessibility tree.")
            return None
    else:
        print("Controller or method 'get_accessibility_tree' not found.")
        return None

    logger.debug("AT@eval: %s", accessibility_tree)

    at = None
    try:
        at = lxml.etree.fromstring(accessibility_tree)
    except ValueError as e:
        logger.error(f"Error parsing accessibility tree: {e}")
        return None

    # Determine the correct selector based on system architecture
    # (ARM images run Chromium; others run Google Chrome).
    selector = None
    arch = platform.machine()
    print(f"Your architecture is: {arch}")

    if "arm" in arch:
        selector_string = "application[name=Chromium] entry[name=Address\\ and\\ search\\ bar]"
    else:
        selector_string = "application[name=Google\\ Chrome] entry[name=Address\\ and\\ search\\ bar]"

    try:
        selector = CSSSelector(selector_string, namespaces=_accessibility_ns_map)
    except Exception as e:
        logger.error(f"Failed to parse the selector for active tab URL: {e}")
        return None

    elements = selector(at) if selector else []
    if not elements:
        print("No elements found.")
        return None
    elif not elements[-1].text:
        print("No text found in the latest element.")
        return None

    # Use a default prefix if 'goto_prefix' is not specified in the config
    goto_prefix = config.get("goto_prefix", "https://")

    # BUG FIX: the emptiness check above inspects elements[-1], but the URL
    # used to be built from elements[0]. Use the same (latest) element for both.
    active_tab_url = f"{goto_prefix}{elements[-1].text}"
    print(f"Active tab url now: {active_tab_url}")
    return active_tab_url
async def get_active_tab_info(env, config: Dict[str, str]):
    """
    This function is used to get all info about active tab.
    Warning! This function will reload the target-url page
    If the tartget url has cache or cookie, this function may reload to another page.
    If you have tested the url will not pop up to another page (check in incongnito mode yourself first),
    you can use this function.
    config: Dict[str, str]{
        # Keys used in get_active_url_from_accessTree: "xpath", "selectors"
    }
    Returns {'title', 'url', 'content'} or None on failure.
    """
    # Playwright raises its own TimeoutError class; the builtin TimeoutError
    # does not catch it, so import it locally for the except clause below.
    from playwright.async_api import TimeoutError as PlaywrightTimeoutError

    active_tab_url = get_active_url_from_accessTree(env, config)
    if active_tab_url is None:
        logger.error("Failed to get the url of active tab")
        return None

    logger.info(f"[ACTIVE_TAB_INFO] Active tab URL: {active_tab_url}")

    host = env.vm_ip
    port = env.chromium_port  # fixme: this port is hard-coded, need to be changed from config file

    remote_debugging_url = f"http://{host}:{port}"

    # Configuration for retry and timeout
    max_retries = 2
    timeout_ms = 60000  # 60 seconds for active tab

    for attempt in range(max_retries):
        try:
            logger.info(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1}/{max_retries}")

            async with async_playwright() as p:
                # connect to remote Chrome instance, since it is supposed to be the active one, we won't start a new one if failed
                try:
                    browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                    logger.info(f"[ACTIVE_TAB_INFO] Successfully connected to Chrome instance")
                except Exception as e:
                    logger.error(f"[ACTIVE_TAB_INFO] Failed to connect to Chrome instance: {e}")
                    return None

                active_tab_info = {}
                # go to the target URL page
                page = await browser.new_page()

                # Set longer timeout for navigation
                page.set_default_timeout(timeout_ms)

                try:
                    logger.info(f"[ACTIVE_TAB_INFO] Navigating to URL: {active_tab_url}")
                    await page.goto(active_tab_url, wait_until='load', timeout=timeout_ms)
                    await page.wait_for_load_state('load', timeout=timeout_ms)  # Wait for the 'load' event to complete

                    active_tab_info = {
                        'title': await page.title(),
                        'url': page.url,
                        'content': await page.content()  # get the HTML content of the page
                    }

                    logger.info(f"[ACTIVE_TAB_INFO] Successfully loaded page. Title: '{active_tab_info['title']}'")
                    logger.info(f"[ACTIVE_TAB_INFO] Current URL: '{active_tab_info['url']}'")

                except (PlaywrightTimeoutError, TimeoutError):
                    logger.warning(f"[ACTIVE_TAB_INFO] Page load timeout for URL: {active_tab_url}")
                    active_tab_info = {
                        'title': 'Load timeout',
                        'url': page.url,
                        'content': await page.content()
                    }
                except Exception as e:
                    logger.error(f"[ACTIVE_TAB_INFO] Failed to go to the target URL page: {e}")
                    return None

                await browser.close()
                return active_tab_info

        except Exception as e:
            logger.error(f"[ACTIVE_TAB_INFO] Attempt {attempt + 1} failed: {str(e)}")
            logger.error(f"[ACTIVE_TAB_INFO] Exception type: {type(e).__name__}")

            if attempt < max_retries - 1:
                logger.info(f"[ACTIVE_TAB_INFO] Retrying in 3 seconds...")
                await asyncio.sleep(3)
            else:
                logger.error(f"[ACTIVE_TAB_INFO] All {max_retries} attempts failed.")
                return None

    # This should never be reached, but just in case
    return None
+ """ + _url = config["path"] + _path = os.path.join(env.cache_dir, config["dest"]) + + # Add logging for debugging + logger.info(f"[PDF_FROM_URL] Starting PDF download from URL: {_url}") + logger.info(f"[PDF_FROM_URL] Target path: {_path}") + + host = env.vm_ip + port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file + server_port = env.server_port + + remote_debugging_url = f"http://{host}:{port}" + + # Configuration for retry and timeout + max_retries = 3 + timeout_ms = 60000 # Increase timeout to 60 seconds + + for attempt in range(max_retries): + try: + logger.info(f"[PDF_FROM_URL] Attempt {attempt + 1}/{max_retries}") + + async with async_playwright() as p: + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PDF_FROM_URL] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[PDF_FROM_URL] Failed to connect to existing Chrome instance: {e}") + logger.info(f"[PDF_FROM_URL] Starting new Chrome instance...") + + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + await asyncio.sleep(5) + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[PDF_FROM_URL] Successfully connected to new Chrome instance") + + page = await browser.new_page() + + # Set longer timeout for navigation + page.set_default_timeout(timeout_ms) + + logger.info(f"[PDF_FROM_URL] Navigating to URL: {_url}") + await page.goto(_url, 
wait_until='networkidle', timeout=timeout_ms) + + # Wait for page to be fully loaded + logger.info(f"[PDF_FROM_URL] Waiting for page to be fully loaded...") + await page.wait_for_load_state('networkidle', timeout=timeout_ms) + + # Additional wait to ensure all content is rendered + await asyncio.sleep(3) + + logger.info(f"[PDF_FROM_URL] Page loaded successfully. Title: '{await page.title()}'") + logger.info(f"[PDF_FROM_URL] Current URL: '{page.url}'") + + # Generate PDF + logger.info(f"[PDF_FROM_URL] Generating PDF...") + await page.pdf(path=_path) + + logger.info(f"[PDF_FROM_URL] PDF generated successfully at: {_path}") + await browser.close() + + # Verify PDF file was created + if os.path.exists(_path): + file_size = os.path.getsize(_path) + logger.info(f"[PDF_FROM_URL] PDF file created successfully. Size: {file_size} bytes") + return _path + else: + logger.error(f"[PDF_FROM_URL] PDF file was not created at expected path: {_path}") + raise FileNotFoundError(f"PDF file was not created at {_path}") + + except Exception as e: + logger.error(f"[PDF_FROM_URL] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[PDF_FROM_URL] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[PDF_FROM_URL] Retrying in 5 seconds...") + await asyncio.sleep(5) + else: + logger.error(f"[PDF_FROM_URL] All {max_retries} attempts failed. 
Giving up.") + + # Create a placeholder file or return a default path + try: + # Create an empty PDF file as fallback + with open(_path, 'w') as f: + f.write("%PDF-1.4\n%EOF\n") + logger.warning(f"[PDF_FROM_URL] Created empty PDF file as fallback: {_path}") + return _path + except Exception as fallback_error: + logger.error(f"[PDF_FROM_URL] Failed to create fallback file: {fallback_error}") + # Return the path anyway, even if file creation failed + return _path + + # This should never be reached, but just in case + return _path + + +# fixme: needs to be changed (maybe through post-processing) since it's not working +async def get_chrome_saved_address(env, config: Dict[str, str]): + host = env.vm_ip + port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file + server_port = env.server_port + + remote_debugging_url = f"http://{host}:{port}" + + # Configuration for retry and timeout + max_retries = 2 + timeout_ms = 30000 # 30 seconds for settings page + + for attempt in range(max_retries): + try: + logger.info(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1}/{max_retries}") + + async with async_playwright() as p: + # connect to remote Chrome instance + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[CHROME_SAVED_ADDRESS] Failed to connect to existing Chrome instance: {e}") + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + server_port + "/setup" + 
"/launch", headers=headers, data=payload)
                    await asyncio.sleep(5)
                    browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                    logger.info(f"[CHROME_SAVED_ADDRESS] Successfully connected to new Chrome instance")

                page = await browser.new_page()

                # Set longer timeout for navigation
                page.set_default_timeout(timeout_ms)

                # Navigate to Chrome's settings page for autofill
                logger.info(f"[CHROME_SAVED_ADDRESS] Navigating to Chrome settings page")
                await page.goto("chrome://settings/addresses", wait_until='networkidle', timeout=timeout_ms)

                # Wait for page to be fully loaded
                await page.wait_for_load_state('networkidle', timeout=timeout_ms)

                # Get the HTML content of the page
                content = await page.content()

                logger.info(f"[CHROME_SAVED_ADDRESS] Successfully retrieved settings page content")
                await browser.close()

                # Raw HTML of chrome://settings/addresses; the caller is expected
                # to parse saved addresses out of it (see fixme above the def).
                return content

        except Exception as e:
            logger.error(f"[CHROME_SAVED_ADDRESS] Attempt {attempt + 1} failed: {str(e)}")
            logger.error(f"[CHROME_SAVED_ADDRESS] Exception type: {type(e).__name__}")

            if attempt < max_retries - 1:
                logger.info(f"[CHROME_SAVED_ADDRESS] Retrying in 3 seconds...")
                await asyncio.sleep(3)
            else:
                logger.error(f"[CHROME_SAVED_ADDRESS] All {max_retries} attempts failed. Returning empty content.")
                return ""

    # This should never be reached, but just in case
    return ""


def get_shortcuts_on_desktop(env, config: Dict[str, str]):
    """
    Return the shortcut files found on the VM desktop.

    Returns a dict mapping shortcut file name -> decoded file content.
    Returns [] for an unsupported OS (note: a list, not a dict — callers
    should treat any falsy result as "no shortcuts").
    """
    # Find out the operating system
    os_name = env.vm_platform

    # Depending on the OS, define the shortcut file extension
    if os_name == 'Windows':
        # Windows shortcuts are typically .url or .lnk files
        shortcut_extension = '.lnk'
    elif os_name == 'Darwin':
        # macOS's shortcuts are .webloc files
        shortcut_extension = '.webloc'
    elif os_name == 'Linux':
        # Linux (Ubuntu, etc.) shortcuts are typically .desktop files
        shortcut_extension = '.desktop'
    else:
        logger.error(f"Unsupported operating system: {os_name}")
        return []

    # Get the path to the desktop folder
    desktop_path = env.controller.get_vm_desktop_path()
    desktop_directory_tree = env.controller.get_vm_directory_tree(desktop_path)

    # Keep only direct children of the desktop whose name has the shortcut extension.
    shortcuts_paths = [file['name'] for file in desktop_directory_tree['children'] if
                       file['name'].endswith(shortcut_extension)]

    short_cuts = {}

    for shortcut_path in shortcuts_paths:
        # Resolve the absolute path on the VM, fetch the file bytes, and decode.
        short_cuts[shortcut_path] = env.controller.get_file(env.controller.execute_python_command(
            f"import os; print(os.path.join(os.path.expanduser('~'), 'Desktop', '{shortcut_path}'))")[
            'output'].strip()).decode('utf-8')

    return short_cuts


async def get_number_of_search_results(env, config: Dict[str, str]):
    """Count elements matching a hard-coded result selector on a hard-coded search URL."""
    # todo: move into the config file
    url, result_selector = "https://google.com/search?q=query", '.search-result'
    host = env.vm_ip
    port = env.chromium_port  # fixme: this port is hard-coded, need to be changed from config file
    server_port = env.server_port

    remote_debugging_url = f"http://{host}:{port}"

    # Configuration for retry and timeout
    max_retries = 2
    timeout_ms = 45000  # 45 seconds for search results

    for attempt in range(max_retries):
        try:
            logger.info(f"[SEARCH_RESULTS] Attempt {attempt + 1}/{max_retries} for URL: {url}")

            async with async_playwright() as p:
                try:
                    browser = await p.chromium.connect_over_cdp(remote_debugging_url)
                    logger.info(f"[SEARCH_RESULTS] Successfully connected to existing Chrome instance")
                except Exception as e:
                    logger.warning(f"[SEARCH_RESULTS] Failed to connect to existing Chrome instance: {e}")
                    # If the connection fails, start a new browser instance
                    platform.machine()
                    if "arm" in platform.machine():
                        # start a new browser instance if the connection fails
                        payload = json.dumps({"command": [
                            "chromium",
                            "--remote-debugging-port=1337"
                        ], "shell": False})
                    else:
payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) + await asyncio.sleep(5) + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[SEARCH_RESULTS] Successfully connected to new Chrome instance") + + page = await browser.new_page() + + # Set longer timeout for navigation + page.set_default_timeout(timeout_ms) + + logger.info(f"[SEARCH_RESULTS] Navigating to URL: {url}") + await page.goto(url, wait_until='networkidle', timeout=timeout_ms) + + # Wait for page to be fully loaded + await page.wait_for_load_state('networkidle', timeout=timeout_ms) + + search_results = await page.query_selector_all(result_selector) + actual_count = len(search_results) + + logger.info(f"[SEARCH_RESULTS] Found {actual_count} search results") + await browser.close() + + return actual_count + + except Exception as e: + logger.error(f"[SEARCH_RESULTS] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[SEARCH_RESULTS] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[SEARCH_RESULTS] Retrying in 3 seconds...") + await asyncio.sleep(3) + else: + logger.error(f"[SEARCH_RESULTS] All {max_retries} attempts failed. Returning 0.") + return 0 + + # This should never be reached, but just in case + return 0 + + +def get_googledrive_file(env, config: Dict[str, Any]) -> Any: + """ Get the desired file from Google Drive based on config, return the downloaded local filepath. + @args: keys in config dict + settings_file(str): target filepath to the settings file for Google Drive authentication, default is 'evaluation_examples/settings/googledrive/settings.yml' + query/path[_list](Union[str, List[str]]): the query or path [list] to the file(s) on Google Drive. 
To retrieve the file, we provide multiple key options to specify the filepath on drive in config dict: + 1) query: a list of queries to search the file, each query is a string that follows the format of Google Drive search query. The documentation is available here: (support more complex search but too complicated to use) + https://developers.google.com/drive/api/guides/search-files?hl=en + 2) path: a str list poingting to file path on googledrive, e.g., 'folder/subfolder/filename.txt' -> + config contain one key-value pair "path": ['folder', 'subfolder', 'filename.txt'] + 3) query_list: query extends to list to download multiple files + 4) path_list: path extends to list to download multiple files, e.g., + "path_list": [['folder', 'subfolder', 'filename1.txt'], ['folder', 'subfolder', 'filename2.txt']] + @return: + dest(Union[List[str], str]): target file name or list. If *_list is used in input config, dest should also be a list of the same length. Return the downloaded local filepath. 
+ """ + settings_file = config.get('settings_file', 'evaluation_examples/settings/googledrive/settings.yml') + auth = GoogleAuth(settings_file=settings_file) + drive = GoogleDrive(auth) + + def get_single_file(_query, _path): + parent_id = 'root' + try: + for q in _query: + search = f'( {q} ) and "{parent_id}" in parents' + filelist: GoogleDriveFileList = drive.ListFile({'q': search}).GetList() + if len(filelist) == 0: # target file not found + return None + file: GoogleDriveFile = filelist[0] # HACK: if multiple candidates, just use the first one + parent_id = file['id'] + + file.GetContentFile(_path, mimetype=file['mimeType']) + except Exception as e: + logger.info('[ERROR]: Failed to download the file from Google Drive', e) + return None + return _path + + if 'query' in config: + result = get_single_file(config['query'], os.path.join(env.cache_dir, config['dest'])) + return result + elif 'path' in config: + query = [f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if idx < len( + config['path']) - 1 + else f"title = '{fp}' and trashed = false" for idx, fp in enumerate(config['path'])] + result = get_single_file(query, os.path.join(env.cache_dir, config['dest'])) + return result + elif 'query_list' in config: + _path_list = [] + assert len(config['query_list']) == len(config['dest']) + for idx, query in enumerate(config['query_list']): + dest = config['dest'][idx] + result = get_single_file(query, os.path.join(env.cache_dir, dest)) + _path_list.append(result) + return _path_list + else: # path_list in config + _path_list = [] + assert len(config['path_list']) == len(config['dest']) + for idx, path in enumerate(config['path_list']): + query = [ + f"title = '{fp}' and mimeType = 'application/vnd.google-apps.folder' and trashed = false" if jdx < len( + path) - 1 + else f"title = '{fp}' and trashed = false" for jdx, fp in enumerate(path)] + dest = config['dest'][idx] + result = get_single_file(query, 
os.path.join(env.cache_dir, dest)) + _path_list.append(result) + return _path_list + + +def get_enable_do_not_track(env, config: Dict[str, str]): + os_type = env.vm_platform + if os_type == 'Windows': + preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), + 'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip() + elif os_type == 'Darwin': + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[ + 'output'].strip() + elif os_type == 'Linux': + if "arm" in platform.machine(): + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[ + 'output'].strip() + else: + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[ + 'output'].strip() + + else: + raise Exception('Unsupported operating system') + + try: + content = env.controller.get_file(preference_file_path) + data = json.loads(content) + + if_enable_do_not_track = data.get('enable_do_not_track', {}) # bool + return "true" if if_enable_do_not_track else "false" + except Exception as e: + logger.error(f"Error: {e}") + return "false" + + +def get_enable_enhanced_safety_browsing(env, config: Dict[str, str]): + os_type = env.vm_platform + if os_type == 'Windows': + preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), + 'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip() + elif os_type == 'Darwin': + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[ + 'output'].strip() + 
elif os_type == 'Linux': + if "arm" in platform.machine(): + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[ + 'output'].strip() + else: + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[ + 'output'].strip() + + else: + raise Exception('Unsupported operating system') + + try: + content = env.controller.get_file(preference_file_path) + data = json.loads(content) + + if_enable_do_not_track = data.get('safebrowsing', {}).get('enhanced', {}) # bool + return "true" if if_enable_do_not_track else "false" + except Exception as e: + logger.error(f"Error: {e}") + return "Google" + + +def get_enable_safe_browsing(env, config: Dict[str, str]): + os_type = env.vm_platform + if os_type == 'Windows': + preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), + 'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip() + elif os_type == 'Darwin': + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[ + 'output'].strip() + elif os_type == 'Linux': + if "arm" in platform.machine(): + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[ + 'output'].strip() + else: + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[ + 'output'].strip() + + else: + raise Exception('Unsupported operating system') + + try: + content = env.controller.get_file(preference_file_path) + data = json.loads(content) + + safebrowsing = 
data.get('safebrowsing', {}) + is_enhanced = bool(safebrowsing.get('enhanced', False)) + is_enabled = bool(safebrowsing.get('enabled', False)) + return "true" if (is_enhanced or is_enabled) else "false" + except Exception as e: + logger.error(f"Error: {e}") + return "false" + +def get_new_startup_page(env, config: Dict[str, str]): + os_type = env.vm_platform + if os_type == 'Windows': + preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), + 'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip() + elif os_type == 'Darwin': + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[ + 'output'].strip() + elif os_type == 'Linux': + if "arm" in platform.machine(): + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[ + 'output'].strip() + else: + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[ + 'output'].strip() + + else: + raise Exception('Unsupported operating system') + + try: + content = env.controller.get_file(preference_file_path) + data = json.loads(content) + + # if data has no key called 'session', it means the chrome is on a fresh-start mode, which is a true state; + # otherwise, try to find the code number in 'restored_on_startup' in 'session' + if "session" not in data.keys(): + return "true" + else: + if_enable_do_not_track = data.get('session', {}).get('restore_on_startup', {}) # int, need to be 5 + return "true" if if_enable_do_not_track == 5 else "false" + except Exception as e: + logger.error(f"Error: {e}") + return "Google" + + +def get_find_unpacked_extension_path(env, config: Dict[str, str]): + os_type = 
env.vm_platform + if os_type == 'Windows': + preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), + 'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip() + elif os_type == 'Darwin': + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[ + 'output'].strip() + elif os_type == 'Linux': + if "arm" in platform.machine(): + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[ + 'output'].strip() + else: + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[ + 'output'].strip() + + else: + raise Exception('Unsupported operating system') + + try: + content = env.controller.get_file(preference_file_path) + data = json.loads(content) + # Preferences store all the path of installed extensions, return them all and let metrics try to find one matches the targeted extension path + all_extensions_path = [] + all_extensions = data.get('extensions', {}).get('settings', {}) + for id in all_extensions.keys(): + path = all_extensions[id]["path"] + all_extensions_path.append(path) + return all_extensions_path + except Exception as e: + logger.error(f"Error: {e}") + return "Google" + + +def get_find_installed_extension_name(env, config: Dict[str, str]): + os_type = env.vm_platform + if os_type == 'Windows': + preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), + 'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip() + elif os_type == 'Darwin': + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 
'Library/Application Support/Google/Chrome/Default/Preferences'))")[ + 'output'].strip() + elif os_type == 'Linux': + if "arm" in platform.machine(): + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[ + 'output'].strip() + else: + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[ + 'output'].strip() + + else: + raise Exception('Unsupported operating system') + + try: + content = env.controller.get_file(preference_file_path) + data = json.loads(content) + # Preferences store all the path of installed extensions, return them all and let metrics try to find one matches the targeted extension path + all_extensions_name = [] + all_extensions = data.get('extensions', {}).get('settings', {}) + for id in all_extensions.keys(): + name = all_extensions[id]["manifest"]["name"] + all_extensions_name.append(name) + return all_extensions_name + except Exception as e: + logger.error(f"Error: {e}") + return "Google" + + +def get_data_delete_automacally(env, config: Dict[str, str]): + """ + This function is used to open th "auto-delete" mode of chromium + """ + os_type = env.vm_platform + if os_type == 'Windows': + preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), + 'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip() + elif os_type == 'Darwin': + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[ + 'output'].strip() + elif os_type == 'Linux': + if "arm" in platform.machine(): + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), 
'snap/chromium/common/chromium/Default/Preferences'))")[ + 'output'].strip() + else: + preference_file_path = env.controller.execute_python_command( + "import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[ + 'output'].strip() + else: + raise Exception('Unsupported operating system') + + try: + content = env.controller.get_file(preference_file_path) + data = json.loads(content) + data_delete_state = data["profile"].get("default_content_setting_values", None) + return "true" if data_delete_state is not None else "false" + except Exception as e: + logger.error(f"Error: {e}") + return "Google" + + +async def get_active_tab_html_parse(env, config: Dict[str, Any]): + """ + This function is used to get the specific element's text content from the active tab's html. + config: + Dict[str, str]{ + # Keys used in get_active_url_from_accessTree: "xpath", "selectors" + 'category': + choose from ["class", "label", "xpath", "input"], used to indicate how to find the element + 'labelObject': + only exists when category is "label", + a dict like { "labelSelector": "the key you want to store the text content of this label's ee=lement"} + 'class_singleObject': + only exists when category is "class", a dict with keys as the class name, + like { "class name" : "the key you want to store the text content of this element" } + 'class_multiObject': + only exists when category is "class", used for elements with same class name. + Two layer of dict, like + ( { + "class name": { + "rank in this class" : "the key you want to store the text content of this element" + ... 
+ } + } ) + 'xpathObject': + only exists when category is "xpath", a dict with keys as the xpath, + like { "full xpath" : "the key you want to store the text content of this element" } + 'inputObject': + only exists when category is "input", + a dict with keys as the input element's xpath, like { "full xpath" : "the key you want to store the text content of this element" } + } + """ + active_tab_url = get_active_url_from_accessTree(env, config) + logger.info(f"[DEBUG] get_active_url_from_accessTree returned: {active_tab_url} (type: {type(active_tab_url)})") + if not isinstance(active_tab_url, str): + logger.error(f"[DEBUG] active_tab_url is not a string, got {type(active_tab_url)}: {active_tab_url}") + return None + host = env.vm_ip + port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file + server_port = env.server_port + + remote_debugging_url = f"http://{host}:{port}" + + # DEBUG: Add logging for configuration + logger.info(f"[DEBUG] get_active_tab_html_parse called with config: {config}") + + async with async_playwright() as p: + # connect to remote Chrome instance + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + except Exception as e: + # If the connection fails, start a new browser instance + platform.machine() + if "arm" in platform.machine(): + # start a new browser instance if the connection fails + payload = json.dumps({"command": [ + "chromium", + "--remote-debugging-port=1337" + ], "shell": False}) + else: + payload = json.dumps({"command": [ + "google-chrome", + "--remote-debugging-port=1337" + ], "shell": False}) + + headers = {"Content-Type": "application/json"} + requests.post("http://" + host + ":" + str(server_port) + "/setup" + "/launch", headers=headers, data=payload) + await asyncio.sleep(5) + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + target_page = None + for context in browser.contexts: + for page in context.pages: + try: + # Wait for page to be stable 
before checking URL + await page.wait_for_load_state("networkidle", timeout=60000) + + # Check if page is still valid before accessing properties + if page.is_closed(): + logger.debug(f"[DEBUG] Page is closed, skipping") + continue + + # the accTree and playwright can get encoding(percent-encoding) characters, we need to convert them to normal characters + # Normalize URLs by removing trailing slashes and decoding percent-encoding + def normalize_url(url): + return unquote(url).rstrip('/') + + # Safely get page URL with error handling + try: + page_url = page.url + except Exception as url_error: + logger.debug(f"[DEBUG] Failed to get page URL: {url_error}") + continue + + if normalize_url(page_url) == normalize_url(active_tab_url): + target_page = page + print("\33[32mtartget page url: ", target_page.url, "\33[0m") + + # Safely get page title with error handling + try: + page_title = await target_page.title() + print("\33[32mtartget page title: ", page_title, "\33[0m") + except Exception as title_error: + logger.debug(f"[DEBUG] Failed to get page title: {title_error}") + print("\33[32mtartget page title: [Unable to get title - page may be navigating]", "\33[0m") + break + + except Exception as page_error: + logger.debug(f"[DEBUG] Error processing page: {page_error}") + continue + if target_page is None: + logger.error("[DEBUG] Could not find target tab matching URL. 
Available tabs:") + for context in browser.contexts: + for page in context.pages: + try: + if not page.is_closed(): + page_url = page.url + logger.error(f"[DEBUG] - Tab URL: {page_url}") + except Exception as e: + logger.error(f"[DEBUG] - Tab URL: [Unable to get URL: {e}]") + logger.error(f"[DEBUG] Expected URL: {active_tab_url}") + return {} + + return_json = {} + + async def safely_get_text_content(selector): + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot get text content for selector: {selector}") + return [] + elements = await target_page.query_selector_all(selector) + results = [] + for element in elements: + try: + text = await element.text_content() + if text is not None: + results.append(text.strip()) + except Exception: + continue + return results + except Exception as e: + logger.warning(f"[DEBUG] Error getting text content for selector '{selector}': {e}") + return [] + + async def safely_get_direct_text_nodes_playwright(selector): + """ + Extract all direct text node contents under the specified selector element (excluding text inside child div, span, etc.). + Returns a list of lists, each sublist contains the direct text nodes of one element. + Suitable for structures like:
SEA
NYC
+ """ + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot get direct text nodes for selector: {selector}") + return [] + elements = await target_page.query_selector_all(selector) + results = [] + for element in elements: + texts = await element.evaluate(''' + (node) => Array.from(node.childNodes) + .filter(n => n.nodeType === Node.TEXT_NODE) + .map(n => n.textContent.trim()) + .filter(Boolean) + ''') + results.append(texts) + # Safety check: return empty list if no elements found + return results[0] if results else [] + except Exception as e: + logger.warning(f"[DEBUG] Error getting direct text nodes for selector '{selector}': {e}") + return [] + + async def safely_get_direct_li_playwright(selector): + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot get li elements for selector: {selector}") + return [] + elements = await target_page.query_selector_all(selector + " li.catAllProducts") + results = [] + for element in elements: + try: + span = await element.query_selector('span') + if span: + text = await span.inner_text() + if text is not None: + results.append(text.strip()) + except Exception: + continue + return results + except Exception as e: + logger.warning(f"[DEBUG] Error getting li elements for selector '{selector}': {e}") + return [] + + async def safely_get_only_child_text_content(selector): + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot get child text content for selector: {selector}") + return [] + elements = await target_page.query_selector_all(selector) + results = [] + for element in elements: + try: + h3 = await element.query_selector('h3') + if h3: + text = await h3.text_content() + if text is not None: + results.append(text.strip()) + except Exception: + continue + return results + except Exception as e: + logger.warning(f"[DEBUG] Error getting child text content for selector '{selector}': {e}") + return [] + + if 
config["category"] == "class": + class_multiObject = config.get("class_multiObject", {}) + for class_name, object_dict in class_multiObject.items(): + elements_texts = await safely_get_text_content("." + class_name) + for order_key, key in object_dict.items(): + index = int(order_key) + if len(elements_texts) > index: + return_json[key] = elements_texts[index] + else: + logger.warning(f"[DEBUG] Element at index {index} not found for class '{class_name}'. Found {len(elements_texts)} elements.") + return_json[key] = "" # Return empty string instead of None + + class_multiObject_child = config.get("class_multiObject_child", {}) + for class_name, object_dict in class_multiObject_child.items(): + elements_texts = await safely_get_direct_text_nodes_playwright("." + class_name) + for order_key, key in object_dict.items(): + index = int(order_key) + if len(elements_texts) > index: + return_json[key] = elements_texts[index] + else: + logger.warning(f"[DEBUG] Child element at index {index} not found for class '{class_name}'. Found {len(elements_texts)} elements.") + return_json[key] = "" # Return empty string instead of None + + class_multiObject_only_child = config.get("class_multiObject_only_child", {}) + for class_name, object_dict in class_multiObject_only_child.items(): + elements_texts = await safely_get_only_child_text_content("." + class_name) + for order_key, key in object_dict.items(): + index = int(order_key) + if len(elements_texts) > index: + return_json[key] = elements_texts[index] + else: + logger.warning(f"[DEBUG] Only child element at index {index} not found for class '{class_name}'. Found {len(elements_texts)} elements.") + return_json[key] = "" # Return empty string instead of None + + class_multiObject_search_exist = config.get("class_multiObject_search_exist", {}) + for class_name, object_list in class_multiObject_search_exist.items(): + elements_texts = await safely_get_text_content("." 
+ class_name) + logger.info(f"[DEBUG] Found elements with class '{class_name}': {elements_texts}") + logger.info(f"[DEBUG] Expected elements: {[obj for obj in object_list if obj != 'is_other_exist']}") + + for each_object in object_list: + if each_object == "is_other_exist": + continue + if each_object in elements_texts: + return_json[each_object] = True + else: + return_json[each_object] = False + if "is_other_exist" in object_list: + extra_elements = [] + for each_element in elements_texts: + if each_element not in object_list: + extra_elements.append(each_element) + return_json["is_other_exist"] = True + if extra_elements: + logger.warning(f"[DEBUG] Found unexpected elements not in expected list: {extra_elements}") + else: + logger.info(f"[DEBUG] No unexpected elements found") + if "is_other_exist" not in return_json.keys(): + return_json["is_other_exist"] = False + + + class_singleObject = config.get("class_singleObject", {}) + for class_name, key in class_singleObject.items(): + element_text = await safely_get_text_content("." 
+ class_name) + logger.info(f"[DEBUG] Class '{class_name}' found {len(element_text)} elements") + if element_text: + return_json[key] = element_text[0] + logger.info(f"[DEBUG] Class extraction for key '{key}': '{element_text[0]}'") + else: + logger.warning(f"[DEBUG] No elements found for class: {class_name}") + return_json[key] = "" # Return empty string instead of None + + elif config['category'] == "label": + # Assuming get_by_label is a custom function or part of the framework being used + labelObject = config.get("labelObject", {}) + for labelSelector, key in labelObject.items(): + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot process label for key '{key}'") + return_json[key] = "" + continue + + text = await target_page.locator(f"text={labelSelector}").first.text_content() + if text: + text = text.strip() + return_json[key] = text + else: + return_json[key] = "" + except Exception as e: + logger.error(f"[DEBUG] Error processing label for key '{key}': {e}") + return_json[key] = "" + + elif config["category"] == "xpath": + xpathObject = config.get("xpathObject", {}) + logger.info(f"[DEBUG] Processing xpath category with xpathObject: {xpathObject}") + + for xpath, key in xpathObject.items(): + logger.info(f"[DEBUG] Processing xpath: {xpath} -> key: {key}") + + # Check if page is still valid + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot process xpath for key '{key}'") + return_json[key] = "" + continue + + elements = target_page.locator(f"xpath={xpath}") + element_count = await elements.count() + logger.info(f"[DEBUG] Found {element_count} elements for xpath: {xpath}") + + if element_count > 0: + try: + text_content = await elements.first.text_content() + if text_content is not None: + text_content = text_content.strip() + logger.info(f"[DEBUG] Raw text content for key '{key}': '{text_content}' (type: {type(text_content)})") + + # 处理空文本内容的情况 + if text_content is None or 
text_content == "": + logger.warning(f"[DEBUG] Element found but text content is empty for key '{key}' xpath: {xpath}") + # 尝试获取更多信息 + try: + element_html = await elements.first.inner_html() + element_text = await elements.first.inner_text() + logger.info(f"[DEBUG] Element innerHTML: '{element_html[:100]}...' innerText: '{element_text}'") + except Exception as inner_e: + logger.debug(f"[DEBUG] Failed to get inner content: {inner_e}") + + return_json[key] = text_content if text_content else "" + logger.info(f"[DEBUG] Final value for key '{key}': '{return_json[key]}'") + except Exception as e: + logger.error(f"[DEBUG] Error extracting text from element for key '{key}': {e}") + return_json[key] = "" + else: + logger.warning(f"[DEBUG] No elements found for xpath: {xpath}") + # 尝试一些备用的xpath查找方法 + try: + # 尝试不使用xpath前缀 + fallback_elements = target_page.locator(xpath) + fallback_count = await fallback_elements.count() + logger.info(f"[DEBUG] Fallback search (without xpath prefix) found {fallback_count} elements") + if fallback_count > 0: + text_content = await fallback_elements.first.text_content() + if text_content: + text_content = text_content.strip() + return_json[key] = text_content if text_content else "" + logger.info(f"[DEBUG] Fallback extraction successful for key '{key}': '{return_json[key]}'") + else: + return_json[key] = "" + except Exception as e: + logger.info(f"[DEBUG] Fallback xpath search also failed: {e}") + return_json[key] = "" + + except Exception as e: + logger.error(f"[DEBUG] Error processing xpath for key '{key}': {e}") + return_json[key] = "" + + elif config["category"] == "input": + inputObjects = config.get("inputObject", {}) + logger.info(f"[DEBUG] Processing input category with inputObjects: {inputObjects}") + for xpath, key in inputObjects.items(): + logger.info(f"[DEBUG] Processing input xpath: {xpath} -> key: {key}") + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot process input for key '{key}'") 
+ return_json[key] = "" + continue + + inputs = target_page.locator(f"xpath={xpath}") + input_count = await inputs.count() + logger.info(f"[DEBUG] Found {input_count} input elements for xpath: {xpath}") + if input_count > 0: + try: + input_value = await inputs.first.input_value() + if input_value: + input_value = input_value.strip() + return_json[key] = input_value if input_value else "" + logger.info(f"[DEBUG] Input value for key '{key}': '{return_json[key]}'") + except Exception as e: + logger.error(f"[DEBUG] Error getting input value for key '{key}': {e}") + return_json[key] = "" + else: + logger.warning(f"[DEBUG] No input elements found for xpath: {xpath}") + return_json[key] = "" + except Exception as e: + logger.error(f"[DEBUG] Error processing input for key '{key}': {e}") + return_json[key] = "" + + elif config["category"] == "class&url": + class_multiObject = config.get("class_multiObject", {}) + for class_name, object_list in class_multiObject.items(): + elements_texts = await safely_get_text_content("." + class_name) + for each_key in object_list: + if any(each_key.lower() == text.lower() for text in elements_texts): + return_json[each_key.lower()] = True + + for each_key in elements_texts: + # each_key.lower() not in object_list.lower(): + if all(each_key.lower() not in item.lower() for item in object_list): + return_json["is_other_exist"] = True + break + if "is_other_exist" not in return_json.keys(): + return_json["is_other_exist"] = False + + class_multiObject_li = config.get("class_multiObject_li", {}) + for class_name, object_list in class_multiObject_li.items(): + elements_texts = await safely_get_direct_li_playwright("." 
+ class_name) + for each_key in object_list: + if any(each_key.lower() == text.lower() for text in elements_texts): + return_json[each_key.lower()] = True + + for each_key in elements_texts: + # each_key.lower() not in object_list.lower(): + if all(each_key.lower() not in item.lower() for item in object_list): + return_json["is_other_exist"] = True + break + if "is_other_exist" not in return_json.keys(): + return_json["is_other_exist"] = False + + url_include_expected = config.get("url_include_expected", []) + for key in url_include_expected: + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot check URL for key '{key}'") + if key.lower() not in return_json.keys(): + return_json[key.lower()] = False + continue + + page_url = target_page.url.lower() + if key.lower() in page_url: + if key.lower() not in return_json.keys(): + return_json[key.lower()] = True + else: + if key.lower() not in return_json.keys(): + return_json[key.lower()] = False + except Exception as e: + logger.error(f"[DEBUG] Error checking URL for key '{key}': {e}") + if key.lower() not in return_json.keys(): + return_json[key.lower()] = False + + url_include_expected_multichoice = config.get("url_include_expected_multichoice", {}) + for key, value in url_include_expected_multichoice.items(): + try: + if target_page.is_closed(): + logger.warning(f"[DEBUG] Target page is closed, cannot check URL for multichoice key '{key}'") + if value.lower() not in return_json.keys(): + return_json[value.lower()] = False + continue + + page_url = target_page.url.lower() + if key.lower() in page_url: + if value.lower() not in return_json.keys(): + return_json[value.lower()] = True + else: + if value.lower() not in return_json.keys(): + return_json[value.lower()] = False + except Exception as e: + logger.error(f"[DEBUG] Error checking URL for multichoice key '{key}': {e}") + if value.lower() not in return_json.keys(): + return_json[value.lower()] = False + + await 
browser.close() + + # DEBUG: Add logging for final result and check for None values + logger.info(f"[DEBUG] get_active_tab_html_parse final result: {return_json}") + + # 检查是否有None值 + none_keys = [key for key, value in return_json.items() if value is None] + if none_keys: + logger.warning(f"[DEBUG] Found None values for keys: {none_keys}") + + # 检查是否期望的键都存在 + if config["category"] == "xpath": + expected_keys = set(config.get("xpathObject", {}).values()) + actual_keys = set(return_json.keys()) + missing_keys = expected_keys - actual_keys + if missing_keys: + logger.warning(f"[DEBUG] Missing expected keys: {missing_keys}") + + return return_json + + +async def get_gotoRecreationPage_and_get_html_content(env, config: Dict[str, Any]): + """ + especially used for www.recreation.gov examples + """ + # Add logging for debugging + logger.info(f"[RECREATION_PAGE] Starting recreation.gov page processing") + logger.debug(f"[RECREATION_PAGE] Config: {config}") + + host = env.vm_ip + port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file + server_port = env.server_port + use_proxy = env.current_use_proxy + + remote_debugging_url = f"http://{host}:{port}" + backend_url = f"http://{host}:{server_port}" + + # Configuration for retry and timeout + max_retries = 3 + timeout_ms = 60000 # Increase timeout to 60 seconds + + # Test basic connectivity first + logger.info(f"[RECREATION_PAGE] Testing basic network connectivity...") + try: + import urllib.request + test_response = urllib.request.urlopen('http://www.google.com', timeout=10) + logger.info(f"[RECREATION_PAGE] Basic connectivity test passed (Google accessible)") + except Exception as e: + logger.warning(f"[RECREATION_PAGE] Basic connectivity test failed: {e}") + logger.info(f"[RECREATION_PAGE] Proceeding anyway...") + + for attempt in range(max_retries): + try: + logger.info(f"[RECREATION_PAGE] Attempt {attempt + 1}/{max_retries}") + + async with async_playwright() as p: + # Connect to 
remote Chrome instance + try: + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[RECREATION_PAGE] Successfully connected to existing Chrome instance") + except Exception as e: + logger.warning(f"[RECREATION_PAGE] Failed to connect to existing Chrome instance: {e}") + logger.info(f"[RECREATION_PAGE] Starting new Chrome instance with enhanced options...") + + # If the connection fails, start a new browser instance with better options + app = 'chromium' if 'arm' in platform.machine() else 'google-chrome' + command = [ + app, + "--remote-debugging-port=1337", + "--no-sandbox", + "--disable-web-security", + "--disable-features=VizDisplayCompositor", + "--disable-dev-shm-usage", + "--user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + ] + + if use_proxy: + command.append(f"--proxy-server=127.0.0.1:18888") + logger.info(f"[RECREATION_PAGE] Using proxy server: 127.0.0.1:18888") + + logger.info(f"[RECREATION_PAGE] Starting browser with command: {' '.join(command)}") + payload = json.dumps({"command": command, "shell": False}) + headers = {"Content-Type": "application/json"} + requests.post(backend_url + "/setup/launch", headers=headers, data=payload) + await asyncio.sleep(8) # Give more time for browser to start + browser = await p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[RECREATION_PAGE] Successfully connected to new Chrome instance") + + page = await browser.new_page() + + # Set longer timeout for all operations + page.set_default_timeout(timeout_ms) + + # Set additional headers to appear more like a real browser + await page.set_extra_http_headers({ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + }) + + # Try multiple URLs to test 
connectivity + test_urls = [ + "https://www.recreation.gov/", + "http://www.recreation.gov/", + "https://recreation.gov/", + ] + + successful_url = None + for test_url in test_urls: + try: + # Step 1: Navigate to recreation.gov with better error handling + logger.info(f"[RECREATION_PAGE] Trying to navigate to: {test_url}") + + # Try different wait strategies + wait_strategies = ['domcontentloaded', 'load', 'networkidle'] + + for wait_until in wait_strategies: + try: + logger.debug(f"[RECREATION_PAGE] Trying wait strategy: {wait_until}") + await page.goto(test_url, wait_until=wait_until, timeout=timeout_ms) + logger.info(f"[RECREATION_PAGE] Successfully loaded {test_url} with strategy {wait_until}") + logger.info(f"[RECREATION_PAGE] Page title: '{await page.title()}'") + logger.info(f"[RECREATION_PAGE] Current URL: '{page.url}'") + successful_url = test_url + break + except Exception as strategy_error: + logger.debug(f"[RECREATION_PAGE] Wait strategy {wait_until} failed: {strategy_error}") + continue + + if successful_url: + break + + except Exception as url_error: + logger.warning(f"[RECREATION_PAGE] Failed to navigate to {test_url}: {url_error}") + continue + + if not successful_url: + raise Exception("Failed to navigate to any recreation.gov URL variant") + + # Additional wait to ensure page is fully loaded + try: + await page.wait_for_load_state('networkidle', timeout=30000) + logger.info(f"[RECREATION_PAGE] Page fully loaded and idle") + except Exception as e: + logger.warning(f"[RECREATION_PAGE] NetworkIdle wait failed, continuing: {e}") + + try: + # Step 2: Fill search input + logger.info(f"[RECREATION_PAGE] Filling search input with 'Diamond'...") + await page.wait_for_selector("input#hero-search-input", state='visible', timeout=timeout_ms) + await page.fill("input#hero-search-input", "Diamond") + logger.info(f"[RECREATION_PAGE] Successfully filled search input") + + except Exception as e: + logger.error(f"[RECREATION_PAGE] Failed to fill search input: {e}") 
+ raise e + + try: + # Step 3: Click search button + logger.info(f"[RECREATION_PAGE] Clicking search button...") + await page.wait_for_selector("button.nav-search-button", state='visible', timeout=timeout_ms) + await page.click("button.nav-search-button") + logger.info(f"[RECREATION_PAGE] Successfully clicked search button") + print("after first click") + + # Wait for search results to load + await asyncio.sleep(10) + + except Exception as e: + logger.error(f"[RECREATION_PAGE] Failed to click search button: {e}") + raise e + + try: + # Step 4: Click on search result + logger.info(f"[RECREATION_PAGE] Waiting for and clicking search result...") + await page.wait_for_selector(".search-result-highlight--success", state='visible', timeout=timeout_ms) + + async with page.expect_popup() as popup_info: + await page.click(".search-result-highlight--success") + + await asyncio.sleep(30) # Wait for popup to fully load + print("after second click") + logger.info(f"[RECREATION_PAGE] Successfully clicked search result") + + except Exception as e: + logger.error(f"[RECREATION_PAGE] Failed to click search result: {e}") + raise e + + try: + # Step 5: Handle new page + newpage = await popup_info.value + newpage.set_default_timeout(timeout_ms) + await newpage.wait_for_load_state('networkidle', timeout=timeout_ms) + + page_title = await newpage.title() + logger.info(f"[RECREATION_PAGE] New page loaded successfully") + logger.info(f"[RECREATION_PAGE] New page title: '{page_title}'") + logger.info(f"[RECREATION_PAGE] New page URL: '{newpage.url}'") + print(f"go to newpage: {page_title}") + await asyncio.sleep(2) + + except Exception as e: + logger.error(f"[RECREATION_PAGE] Failed to handle new page: {e}") + raise e + + try: + # Step 6: Click next-available button with better error handling + logger.info(f"[RECREATION_PAGE] Looking for next-available button...") + + # Try multiple selectors for the button + selectors_to_try = [ + "button.next-available", + 
"button[class*='next-available']", + "button[class*='next']", + ".next-available", + "[data-testid*='next']" + ] + + button_clicked = False + for selector in selectors_to_try: + try: + logger.debug(f"[RECREATION_PAGE] Trying selector: {selector}") + await newpage.wait_for_selector(selector, state='visible', timeout=30000) + await newpage.click(selector, timeout=30000) + logger.info(f"[RECREATION_PAGE] Successfully clicked next-available button with selector: {selector}") + print("after third click") + button_clicked = True + break + except Exception as selector_error: + logger.debug(f"[RECREATION_PAGE] Selector '{selector}' failed: {selector_error}") + continue + + if not button_clicked: + logger.warning(f"[RECREATION_PAGE] Could not find or click next-available button with any selector") + logger.info(f"[RECREATION_PAGE] Continuing without clicking next-available button") + print("Continuing without clicking next-available button") + + except Exception as e: + logger.warning(f"[RECREATION_PAGE] Error with next-available button: {e}") + logger.info(f"[RECREATION_PAGE] Continuing execution despite button click failure") + print("Continuing without clicking next-available button") + + # Step 7: Extract content based on config + return_json = {} + return_json["expected"] = {} + + try: + logger.info(f"[RECREATION_PAGE] Extracting content based on config...") + + if config["selector"] == "class": + className = config["class"] + logger.debug(f"[RECREATION_PAGE] Looking for elements with class: {className}") + + if "order" in config.keys(): + try: + elements = await newpage.query_selector_all("." 
+ className) + order_index = int(config["order"]) + logger.info(f"[RECREATION_PAGE] Found {len(elements)} elements with class '{className}', looking for index {order_index}") + + if len(elements) > order_index: + text_content = await elements[order_index].text_content() + if text_content: + text_content = text_content.strip() + return_json["expected"][className] = text_content + logger.info(f"[RECREATION_PAGE] Successfully extracted text from element at index {order_index}: '{text_content}'") + else: + logger.warning(f"[RECREATION_PAGE] Element with class '{className}' at index {order_index} not found. Found {len(elements)} elements.") + return_json["expected"][className] = "__EVALUATION_FAILED__" + except Exception as e: + logger.error(f"[RECREATION_PAGE] Error accessing element with class '{className}' at specific order: {e}") + return_json["expected"][className] = "__EVALUATION_FAILED__" + else: + try: + element = await newpage.query_selector("." + className) + if element: + text_content = await element.text_content() + if text_content: + text_content = text_content.strip() + return_json["expected"][className] = text_content + logger.info(f"[RECREATION_PAGE] Successfully extracted text from first element: '{text_content}'") + else: + logger.warning(f"[RECREATION_PAGE] Element with class '{className}' not found.") + return_json["expected"][className] = "__EVALUATION_FAILED__" + except Exception as e: + logger.error(f"[RECREATION_PAGE] Error accessing element with class '{className}': {e}") + return_json["expected"][className] = "__EVALUATION_FAILED__" + + logger.info(f"[RECREATION_PAGE] Content extraction completed successfully") + logger.info(f"[RECREATION_PAGE] Final result: {return_json}") + + except Exception as e: + logger.error(f"[RECREATION_PAGE] Error during content extraction: {e}") + # Return a structure indicating extraction failed + return_json["expected"] = {"extraction_error": "__EVALUATION_FAILED__"} + + await browser.close() + return return_json + 
+ except Exception as e: + logger.error(f"[RECREATION_PAGE] Attempt {attempt + 1} failed: {str(e)}") + logger.error(f"[RECREATION_PAGE] Exception type: {type(e).__name__}") + + if attempt < max_retries - 1: + logger.info(f"[RECREATION_PAGE] Retrying in 5 seconds...") + await asyncio.sleep(5) + else: + logger.error(f"[RECREATION_PAGE] All {max_retries} attempts failed. Returning error result.") + # Return a structure that indicates complete failure + return { + "expected": { + "error": "__EVALUATION_FAILED__", + "error_message": f"Failed after {max_retries} attempts: {str(e)}" + } + } + + # This should never be reached, but just in case + logger.error(f"[RECREATION_PAGE] Unexpected code path reached") + return { + "expected": { + "error": "__EVALUATION_FAILED__", + "error_message": "Unexpected error in function execution" + } + } + + +def get_active_tab_url_parse(env, config: Dict[str, Any]): + """ + This function is used to parse the url according to config["parse_keys"]. + config: + 'parse_keys': must exist, + a list of keys to extract from the query parameters of the url. + 'replace': optional, + a dict, used to replace the original key with the new key. 
+ ( { "original key": "new key" } ) + """ + active_tab_url = get_active_url_from_accessTree(env, config) + if active_tab_url is None: + return None + + # connect to remote Chrome instance + # parse in a hard-coded way to find the specific info about task + parsed_url = urlparse(active_tab_url) + # Extract the query parameters + query_params = parse_qs(parsed_url.query) + # Define the keys of interest + keys_of_interest = [key for key in config["parse_keys"]] + # Extract the parameters of interest + extracted_params = {key: query_params.get(key, [''])[0] for key in keys_of_interest} + if "replace" in config: + for key in config["replace"].keys(): + # change original key to new key, keep value unchange + value = extracted_params.pop(key) + extracted_params[config["replace"][key]] = value + if config.get("split_list", False): + extracted_params = {key: extracted_params[key].split(',') for key in extracted_params.keys()} + return extracted_params + + +def get_url_dashPart(env, config: Dict[str, str]): + """ + This function is used to extract one of the dash-separated part of the URL. + config + 'partIndex': must exist, + the index of the dash-separated part to extract, starting from 0. + 'needDeleteId': optional, + a boolean, used to indicate whether to delete the "id" part ( an example: "/part-you-want?id=xxx" ) + 'returnType': must exist, + a string, used to indicate the return type, "string" or "json". + """ + active_tab_url = get_active_url_from_accessTree(env, config) + if active_tab_url is None: + return None + + # extract the last dash-separated part of the URL, and delete all the characters after "id" + # Ensure partIndex is an integer + try: + part_index = int(config["partIndex"]) + except (ValueError, TypeError): + logger.error(f"[URL_DASH_PART] Invalid partIndex: {config.get('partIndex', 'None')}. 
Must be an integer.") + return None + + url_parts = active_tab_url.split("/") + if part_index >= len(url_parts): + logger.error(f"[URL_DASH_PART] partIndex {part_index} is out of range for URL with {len(url_parts)} parts") + return None + + dash_part = url_parts[part_index] + if config.get("needDeleteId", False): + dash_part = dash_part.split("?")[0] + + logger.info(f"[URL_DASH_PART] Extracted dash part: '{dash_part}' from URL: {active_tab_url}") + + if config["returnType"] == "string": + return dash_part + elif config["returnType"] == "json": + return {config["key"]: dash_part} + else: + logger.error(f"[URL_DASH_PART] Invalid returnType: {config.get('returnType', 'None')}. Must be 'string' or 'json'.") + return None + + +def get_macys_product_url_parse(env, config: Dict[str, str]): + """ + Parse Macy's product url path, extract: + - mens_clothing: true if 'mens-clothing' in path, else None + - shirts: true if any key 'Top_style' or 'Product_department' value is 'shirts', else None + - Men_regular_size_t, Price_discount_range (as list), Sleeve_length: as before, None if not found + All fields are None if not found for robustness. + """ + from urllib.parse import urlparse, unquote + result = {} + # 1. 
Parse URL + active_tab_url = get_active_url_from_accessTree(env, config) + if active_tab_url is None: + return None + parsed = urlparse(active_tab_url) + path = unquote(parsed.path) + result = {} + # mens_clothing + result['mens_clothing'] = True if 'mens-clothing' in path else None + # key-value + path_parts = path.strip('/').split('/') + key_value_json = {} + shirts_flag = False + short_sleeve_flag = False # Initialize short_sleeve_flag to avoid UnboundLocalError + if "shirts" in path: + shirts_flag = True + if "short-sleeve" in path: + short_sleeve_flag = True + for i in range(len(path_parts)-1): + if ',' in path_parts[i] and ',' in path_parts[i+1]: + keys = [k.strip() for k in path_parts[i].split(',')] + values = [v.strip() for v in path_parts[i+1].split(',')] + for k, v in zip(keys, values): + if k == "Price_discount_range": + key_value_json[k] = [item.strip() for item in v.split('|')] if v else None + else: + key_value_json[k] = v if v else None + if k == 'Product_department' and (v == 'shirts' or v == 'Shirts' or v == 'Shirt'): + shirts_flag = True + if k == 'Sleeve_length' and (v == 'short-sleeve' or v == 'Short Sleeve'): + short_sleeve_flag = True + break + for field in ['Men_regular_size_t', 'Price_discount_range']: + if field not in key_value_json: + key_value_json[field] = None + result['shirts'] = shirts_flag if shirts_flag else None + result['short_sleeve'] = short_sleeve_flag if short_sleeve_flag else None + # parse_keys + for key in config["parse_keys"]: + if key in key_value_json: + if key == "Price_discount_range": + # Check if key_value_json[key] is not None before using 'in' operator + if key_value_json[key] is not None and '50_PERCENT_ off & more' in key_value_json[key] and not '30_PERCENT_ off & more' in key_value_json[key] and not '20_PERCENT_ off & more' in key_value_json[key]: + result[key] = '50_PERCENT_ off & more' + else: + result[key] = 'not_50_PERCENT_ off & more' + else: + result[key] = key_value_json[key] + return result + + +# Alias 
# ...kept for backward compatibility - the old function name was too generic
def get_url_path_parse(env, config: Dict[str, str]):
    """
    Alias for get_macys_product_url_parse to maintain backward compatibility.
    This function name is kept for existing configurations that still use the
    "url_path_parse" type.
    """
    return get_macys_product_url_parse(env, config)


# ---------------------------------------------------------------------------
# openhands/nvidia/os_world/getters/file.py (new file)
# ---------------------------------------------------------------------------
import os
from datetime import datetime
from typing import Any, Dict, List, Optional, Set, Union

import requests
import pandas as pd

from openhands.core.logger import openhands_logger as logger


def get_content_from_vm_file(env, config: Dict[str, Any]) -> Any:
    """
    Fetch a file from the VM and return (part of) its parsed content.

    Config:
        path (str): absolute path on the VM to fetch
        file_type (str): currently only 'xlsx' is supported
        file_content (str): currently only 'last_row' is supported

    Raises:
        NotImplementedError: for an unsupported file_type or file_content.
            (Previously an unsupported file_content silently returned None.)
    """
    path = config["path"]
    file_path = get_vm_file(env, {"path": path, "dest": os.path.basename(path)})
    file_type, file_content = config['file_type'], config['file_content']
    if file_type == 'xlsx':
        if file_content == 'last_row':
            df = pd.read_excel(file_path)
            # Every cell of the last row, stringified.
            return df.iloc[-1].astype(str).tolist()
        raise NotImplementedError(f"File content {file_content} not supported")
    else:
        raise NotImplementedError(f"File type {file_type} not supported")


def get_cloud_file(env, config: Dict[str, Any]) -> Union[str, List[str]]:
    """
    Download one or more files over HTTP into env.cache_dir.

    Config:
        path (str|List[str]): the url to download from
        dest (str|List[str]): file name of the downloaded file
        multi (bool): optional. if path and dest are lists providing
            information of multiple files. defaults to False
        gives (List[int]): optional. defaults to [0]. which files are directly
            returned to the metric. if len==1, str is returned; else, list is
            returned.
    """
    if not config.get("multi", False):
        paths: List[str] = [config["path"]]
        dests: List[str] = [config["dest"]]
    else:
        paths = config["path"]
        dests = config["dest"]
    cache_paths: List[str] = []

    gives: Set[int] = set(config.get("gives", [0]))

    for i, (p, d) in enumerate(zip(paths, dests)):
        _path = os.path.join(env.cache_dir, d)
        if i in gives:
            cache_paths.append(_path)

        # Already cached from an earlier run: skip the download.
        if os.path.exists(_path):
            continue

        response = requests.get(p, stream=True)
        response.raise_for_status()

        with open(_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

    return cache_paths[0] if len(cache_paths) == 1 else cache_paths


def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Optional[str]]]:
    """
    Copy one or more files from the VM into env.cache_dir.

    Config:
        path (str): absolute path on the VM to fetch
        dest (str): file name of the downloaded file
        multi (bool): optional. if path and dest are lists providing
            information of multiple files. defaults to False
        gives (List[int]): optional. defaults to [0]. which files are directly
            returned to the metric. if len==1, str is returned; else, list is
            returned.
        only supported for a single file for now:
            time_suffix (bool): optional. defaults to False. if True, append
                the current time in the required format.
            time_format (str): optional. defaults to "%Y%m%d_%H%M%S".

    Entries that fail to download are reported as None.
    """
    time_format = "%Y%m%d_%H%M%S"
    if not config.get("multi", False):
        paths: List[str] = [config["path"]]
        dests: List[str] = [config["dest"]]
        if config.get("time_suffix", False):
            time_format = config.get("time_format", time_format)
            # Insert the timestamp between the file stem and its extension.
            stamp = datetime.now().strftime(time_format)
            dests = [f"{os.path.splitext(d)[0]}_{stamp}{os.path.splitext(d)[1]}" for d in dests]
    else:
        paths = config["path"]
        dests = config["dest"]

    cache_paths: List[Optional[str]] = []

    gives: Set[int] = set(config.get("gives", [0]))

    for i, (p, d) in enumerate(zip(paths, dests)):
        _path = os.path.join(env.cache_dir, d)

        try:
            # Try to get the file from the VM.
            file = env.controller.get_file(p)
            if file is None:
                logger.warning(f"Failed to get file from VM: {p}")
                if i in gives:
                    cache_paths.append(None)
                continue

            if i in gives:
                cache_paths.append(_path)

            # Write the file with robust error handling.
            try:
                # Ensure the cache directory exists.
                os.makedirs(env.cache_dir, exist_ok=True)

                with open(_path, "wb") as f:
                    f.write(file)
                logger.info(f"Successfully saved file: {_path} ({len(file)} bytes)")

            except IOError as e:
                logger.error(f"IO error writing file {_path}: {e}")
                if i in gives:
                    cache_paths[-1] = None  # Replace the path we just added with None
            except Exception as e:
                logger.error(f"Unexpected error writing file {_path}: {e}")
                if i in gives:
                    cache_paths[-1] = None

        except Exception as e:
            logger.error(f"Error processing file {p}: {e}")
            if i in gives:
                cache_paths.append(None)

    return cache_paths[0] if len(cache_paths) == 1 else cache_paths


def get_cache_file(env, config: Dict[str, str]) -> str:
    """
    Config:
        path (str): relative path in the cache dir

    Raises:
        FileNotFoundError: if the cached file does not exist. (Previously an
        `assert`, which is stripped under `python -O`.)
    """
    _path = os.path.join(env.cache_dir, config["path"])
    if not os.path.exists(_path):
        raise FileNotFoundError(f"Cache file not found: {_path}")
    return _path


# ---------------------------------------------------------------------------
# openhands/nvidia/os_world/getters/general.py (new file)
# ---------------------------------------------------------------------------
from typing import Dict
import requests

from openhands.core.logger import openhands_logger as logger


def _post_vm_execute(env, config: Dict[str, str], field: str):
    """POST the configured command to the VM's /execute endpoint and return
    the named field of the JSON response, or None on a non-200 status.

    Shared by get_vm_command_line / get_vm_command_error (previously two
    copy-pasted bodies that also called response.json() *before* checking the
    status, which could raise on a non-JSON error response)."""
    response = requests.post(
        f"http://{env.vm_ip}:{env.server_port}/execute",
        json={"command": config["command"], "shell": config.get("shell", False)},
    )
    if response.status_code == 200:
        return response.json()[field]
    logger.error("Failed to get vm command %s. Status code: %d", field, response.status_code)
    return None


def get_vm_command_line(env, config: Dict[str, str]):
    """Run config["command"] on the VM and return its stdout ("output")."""
    return _post_vm_execute(env, config, "output")


def get_vm_command_error(env, config: Dict[str, str]):
    """Run config["command"] on the VM and return its stderr ("error")."""
    return _post_vm_execute(env, config, "error")


def get_vm_terminal_output(env, config: Dict[str, str]):
    """Return the terminal output captured by the controller."""
    return env.controller.get_terminal_output()


# ---------------------------------------------------------------------------
# openhands/nvidia/os_world/getters/gimp.py (new file)
# ---------------------------------------------------------------------------
import os
from typing import Dict

from openhands.core.logger import openhands_logger as logger


def get_gimp_config_file(env, config: Dict[str, str]):
    """
    Copy a GIMP config file from the VM into the cache dir.

    Config:
        file_name (str): file name under ~/.config/GIMP/2.10/ on the VM
        dest (str): file name to use inside env.cache_dir

    Returns the local path, or None if the file could not be read.
    """
    os_type = env.vm_platform

    if os_type == "Linux":
        # Resolve the per-user path on the VM itself.
        config_path = env.controller.execute_python_command(
            "import os; print(os.path.expanduser("
            f"'~/.config/GIMP/2.10/{config['file_name']}'))"
        )['output'].strip()
    # TODO: Add support for macOS and Windows
    else:
        raise Exception("Unsupported operating system", os_type)

    _path = os.path.join(env.cache_dir, config["dest"])
    content = env.controller.get_file(config_path)

    if not content:
        logger.error("Failed to get GIMP config file.")
        return None

    with open(_path, "wb") as f:
        f.write(content)

    return _path


# ---------------------------------------------------------------------------
# openhands/nvidia/os_world/getters/impress.py (new file)
# ---------------------------------------------------------------------------
import os
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from typing import Dict

from openhands.nvidia.os_world.getters.file import get_vm_file


def get_background_image_in_slide(env, config: Dict[str, str]):
    """
    Return a local path to the background image of the given slide, or None
    when the slide has no image background.
    """
    ppt_file_path, slide_index, dest = config["ppt_file_path"], int(config["slide_index"]), config["dest"]
    image_id, image_file_path = None, None

    ppt_file_localhost_path = get_vm_file(env, {"path": ppt_file_path, "dest": os.path.split(ppt_file_path)[-1]})

    with zipfile.ZipFile(ppt_file_localhost_path, 'r') as myzip:
        slide1_xml_file = 'ppt/slides/slide{}.xml'.format(slide_index + 1)
        # firstly, check whether a background image is used in the slide
        if slide1_xml_file not in myzip.namelist():
            return None
        with myzip.open(slide1_xml_file) as f:
            tree = ET.parse(f)
            root = tree.getroot()
            bg_tag = "{http://schemas.openxmlformats.org/presentationml/2006/main}bgPr"
            image_tag = "{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
            attr_tag = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
            for child in root.iter(bg_tag):
                try:
                    for element in child.iter(image_tag):
                        image_id = element.attrib[attr_tag]
                        break
                except KeyError:
                    # Narrowed from a bare `except: pass`: only a missing
                    # r:embed attribute is expected here.
                    pass
                if image_id is not None:
                    break
            else:
                return None

        # next, locate the background image via the slide relationships
        slide1_rels_file = 'ppt/slides/_rels/slide{}.xml.rels'.format(slide_index + 1)
        if slide1_rels_file in myzip.namelist():
            with myzip.open(slide1_rels_file) as f:
                tree = ET.parse(f)
                root = tree.getroot()
                namespaces = {'r': 'http://schemas.openxmlformats.org/package/2006/relationships'}
                # Find the relationship whose Id matches the embedded image.
                for rel in root.findall('r:Relationship', namespaces):
                    if 'image' in rel.attrib['Type'] and rel.attrib['Id'] == image_id:
                        target = rel.attrib['Target']
                        if target.startswith('..'):
                            # Resolve the relative path inside the zip file.
                            image_file_path = os.path.normpath(os.path.join('ppt/slides', target))
                            # ZIP members always use forward slashes.
                            image_file_path = image_file_path.replace('\\', '/')
                            tmpdirname = os.path.dirname(ppt_file_localhost_path)
                            myzip.extract(image_file_path, tmpdirname)
                            return os.path.join(tmpdirname, image_file_path)
                        else:  # absolute path
                            assert target.startswith("file://"), target
                            image_file_path = target[7:]
                            break

    if image_file_path is None:
        return None
    else:
        # The image is linked externally: fetch it from the VM instead.
        return get_vm_file(env, {"path": image_file_path, "dest": dest})


def get_audio_in_slide(env, config: Dict[str, str]):
    """
    Return a local path to the audio of the given slide, or None when the
    slide has no audio relationship.
    """
    ppt_file_path, slide_index, dest = config["ppt_file_path"], int(config["slide_index"]), config["dest"]

    # Open the .pptx as a zip file.
    # fixme: we assume there is only one audio file in the slides.
    audio_file_path = None

    ppt_file_localhost_path = get_vm_file(env, {"path": ppt_file_path, "dest": os.path.split(ppt_file_path)[-1]})

    with zipfile.ZipFile(ppt_file_localhost_path, 'r') as myzip:
        # Relationships XML file for the requested slide.
        slide1_rels_file = 'ppt/slides/_rels/slide{}.xml.rels'.format(slide_index + 1)
        if slide1_rels_file in myzip.namelist():
            with myzip.open(slide1_rels_file) as f:
                tree = ET.parse(f)
                root = tree.getroot()
                namespaces = {'r': 'http://schemas.openxmlformats.org/package/2006/relationships'}
                for rel in root.findall('r:Relationship', namespaces):
                    if 'audio' in rel.attrib['Type']:
                        # The audio can be embedded inside the file or linked
                        # to an external file.
                        target = rel.attrib['Target']

                        if target.startswith('..'):
                            # Resolve the relative path inside the zip file.
                            audio_file_path = os.path.normpath(os.path.join('ppt/slides', target))
                            # ZIP members always use forward slashes.
                            audio_file_path = audio_file_path.replace('\\', '/')

                            tmpdirname = os.path.dirname(ppt_file_localhost_path)
                            myzip.extract(audio_file_path, tmpdirname)
                            return os.path.join(tmpdirname, audio_file_path)
                        else:
                            # The audio file is external to the .pptx.
                            assert target.startswith("file://"), target
                            audio_file_path = target[7:]
                            break

    if audio_file_path is None:
        return None
    else:
        # Get the audio file from the VM and return the local host path.
        return get_vm_file(env, {"path": audio_file_path, "dest": dest})


# ---------------------------------------------------------------------------
# openhands/nvidia/os_world/getters/info.py (new file)
# ---------------------------------------------------------------------------
import os
from typing import Union

from openhands.core.logger import openhands_logger as logger


def get_vm_screen_size(env, config: dict) -> dict:
    """Return the VM screen geometry as reported by the controller."""
    return env.controller.get_vm_screen_size()


def get_vm_window_size(env, config: dict) -> dict:
    """Return the window geometry of the app class named in the config."""
    return env.controller.get_vm_window_size(app_class_name=config["app_class_name"])


def get_vm_wallpaper(env, config: dict) -> Union[str, bytes]:
    """
    Save the VM wallpaper into the cache dir and return its local path.

    On any failure an empty file is written instead, so downstream consumers
    always receive a readable path.
    """
    _path = os.path.join(env.cache_dir, config["dest"])

    content = env.controller.get_vm_wallpaper()

    # Consolidated validation (was three near-identical write-empty branches).
    if content is None:
        logger.error("Failed to get VM wallpaper: controller returned None")
        content = b""
    elif not isinstance(content, bytes):
        logger.error(f"Invalid wallpaper content type: {type(content)}, expected bytes")
        content = b""
    elif len(content) == 0:
        logger.warning("VM wallpaper content is empty")

    with open(_path, "wb") as f:
        f.write(content)

    return _path


def get_list_directory(env, config: dict) -> dict:
    """Return the VM directory tree rooted at config["path"]."""
    return env.controller.get_vm_directory_tree(config["path"])
# ---------------------------------------------------------------------------
# openhands/nvidia/os_world/getters/misc.py (new file)
# ---------------------------------------------------------------------------
from typing import TypeVar, Dict, Optional
from datetime import datetime, timedelta
import pytz
import requests

from openhands.core.logger import openhands_logger as logger

R = TypeVar("Rule")

# Weekday / month spellings consumed by apply_rules_to_timeFormat placeholders.
day_of_week_mapping = {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'}

month_mapping = {
    1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
    7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec'
}

Month_Mapping_Full = {
    1: "January", 2: "February", 3: "March", 4: "April", 5: "May", 6: "June",
    7: "July", 8: "August", 9: "September", 10: "October", 11: "November", 12: "December"
}

month_mapping_full = {
    1: 'january', 2: 'february', 3: 'march', 4: 'april', 5: 'may', 6: 'june',
    7: 'july', 8: 'august', 9: 'september', 10: 'october', 11: 'november', 12: 'december'
}

# Relative-time keys: an int means "that many days from now"; "special"
# means the offset is computed case-by-case in get_rule_relativeTime.
relativeTime_to_IntDay = {
    "tomorrow": 1,
    "5th next month": "special",
    "10th next month": "special",
    "11th next month": "special",
    "this month": "special",
    "this Saturday": "special",
    "this Sunday": "special",
    "next Monday": "special",
    "next Friday": "special",
    "next Saturday": "special",
    "next Sunday": "special",
    "next week Friday": "special",
    "next week Saturday": "special",
    "next week Sunday": "special",
    "first monday four months later": "special",
    "first monday eight months later": "special",
    "next Monday split": "special",
    "next Friday split": "special"
}


def get_rule(env, config: Dict[str, R]) -> R:
    """
    Return the configured rules unchanged.
    """
    return config["rules"]


def get_rule_relativeTime(env, config: Dict[str, R]) -> R:
    """
    According to the rule defined in function "apply_rules_to_timeFormat", convert the relative time to absolute time.
    config:
        'relativeTime': {
            "from": must exist; indicates the relativeTime.
            "to": optional; indicates the relativeTime.
        }
        If relativeTime only has key "from", then the key of time in "expected" dict must be "time".
        If relativeTime has key "to", then the keys of time in "expected" dict must be "from" and "to".

        Optional 'timezone': timezone string like 'Europe/Zurich', 'America/New_York', etc.
        If not specified, will try to get timezone from IP geolocation.
    """
    logger.info(f"[DEBUG] get_rule_relativeTime called with config: {config}")

    relativeRules = config["rules"]
    relativeTime = relativeRules["relativeTime"]  # "+" means future, "-" means past

    logger.info(f"[DEBUG] relativeTime: {relativeTime}")

    # Get timezone configuration
    timezone_str = get_timezone_from_config(config)
    try:
        timezone = pytz.timezone(timezone_str)
        logger.info(f"Successfully loaded timezone: {timezone_str}")
    except pytz.exceptions.UnknownTimeZoneError:
        logger.error(f"Unknown timezone: {timezone_str}, falling back to UTC")
        timezone = pytz.UTC

    # Get current time in the specified timezone
    now = datetime.now(timezone)
    logger.info(f"Current time in {timezone_str}: {now.strftime('%Y-%m-%d %H:%M:%S %Z')}")

    # calculate the relative time
    if "to" not in relativeTime.keys():
        start_relative_time = relativeTime["from"]
        logger.info(f"Processing single time: '{start_relative_time}'")

        if relativeTime_to_IntDay[start_relative_time] != "special":
            # relativeTime can be represented by an actual int day count
            offset_days = relativeTime_to_IntDay[start_relative_time]
            absoluteDay = now + timedelta(days=offset_days)
            logger.info(f"Simple calculation: {start_relative_time} = {offset_days} days → {absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
        else:
            # special case, you can add more special cases here
            if start_relative_time == "5th next month":
                next_year = now.year + 1 if now.month == 12 else now.year
                next_month = now.month + 1 if now.month < 12 else 1
                absoluteDay = timezone.localize(datetime(next_year, next_month, 5))
                logger.info(f"5th next month: {absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif start_relative_time == "10th next month":
                next_year = now.year + 1 if now.month == 12 else now.year
                next_month = now.month + 1 if now.month < 12 else 1
                absoluteDay = timezone.localize(datetime(next_year, next_month, 10))
                logger.info(f"10th next month: {absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif start_relative_time == "this month":
                absoluteDay = now
                logger.info(f"This month: {absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif start_relative_time == "next Monday":
                days_until_monday = (6 - now.weekday()) + 1
                absoluteDay = now + timedelta(days=days_until_monday)
                logger.info(f"Next Monday: current weekday={now.weekday()}, days to add={days_until_monday} → {absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif start_relative_time == "first monday four months later":
                next_year = now.year + 1 if now.month >= 9 else now.year
                # NOTE(review): (month + 4) % 12 yields 0 for August, which is
                # an invalid datetime month — confirm intended wrap-around.
                next_month = (now.month + 4) % 12
                temp_date = timezone.localize(datetime(next_year, next_month, 1))
                days_to_monday = ((6 - temp_date.weekday()) + 1) % 7
                absoluteDay = temp_date + timedelta(days=days_to_monday)
                logger.info(f"First Monday 4 months later: {next_year}-{next_month:02d} → {absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif start_relative_time == "first monday eight months later":
                next_year = now.year + 1 if now.month >= 5 else now.year
                # NOTE(review): same potential month-0 wrap issue as above.
                next_month = (now.month + 8) % 12
                temp_date = timezone.localize(datetime(next_year, next_month, 1))
                days_to_monday = ((6 - temp_date.weekday()) + 1) % 7
                absoluteDay = temp_date + timedelta(days=days_to_monday)
                logger.info(f"First Monday 8 months later: {next_year}-{next_month:02d} → {absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            else:
                # Previously fell through and raised NameError on absoluteDay.
                raise ValueError(f"Unhandled special relative time: {start_relative_time}")
        regular_time = apply_rules_to_timeFormat(relativeRules["expected"]["time"], absoluteDay)
        logger.info(f"Final formatted time: {regular_time}")
        config["rules"]["expected"]["time"] = regular_time

    else:
        from_time = relativeTime["from"]
        to_time = relativeTime["to"]
        logger.info(f"Processing time range: from '{from_time}' to '{to_time}'")

        # deal with from_time first
        if relativeTime_to_IntDay[from_time] != "special":
            from_offset = relativeTime_to_IntDay[from_time]
            from_absoluteDay = now + timedelta(days=from_offset)
            logger.info(f"From time calculation: {from_time} = {from_offset} days → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
        else:
            if from_time == "this Saturday":
                days_until_saturday = (5 - now.weekday())
                from_absoluteDay = now + timedelta(days=days_until_saturday)
                logger.info(f"This Saturday: current weekday={now.weekday()}, days to add={days_until_saturday} → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif from_time == "10th next month":
                next_year = now.year + 1 if now.month == 12 else now.year
                next_month = now.month + 1 if now.month < 12 else 1
                from_absoluteDay = timezone.localize(datetime(next_year, next_month, 10))
                logger.info(f"10th next month (from): {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif from_time == "next Monday" or from_time == "next Monday split":
                days_until_monday = (6 - now.weekday()) + 1
                from_absoluteDay = now + timedelta(days=days_until_monday)
                logger.info(f"Next Monday (from): current weekday={now.weekday()}, days to add={days_until_monday} → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif from_time == "next Friday":
                # Next weekend Friday calculation
                if now.weekday() < 4:     # Monday to Thursday - use this weekend
                    days_until_friday = 4 - now.weekday()
                elif now.weekday() == 4:  # Today is Friday - use next weekend
                    days_until_friday = 7
                else:                     # Saturday to Sunday - use next weekend
                    days_until_friday = (7 - now.weekday()) + 4
                from_absoluteDay = now + timedelta(days=days_until_friday)
                logger.info(f"Next Friday (from): current weekday={now.weekday()}, days to add={days_until_friday} → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif from_time == "next Saturday":
                # Next weekend Saturday calculation
                if now.weekday() < 5:     # Monday to Friday - use this weekend
                    days_until_saturday = 5 - now.weekday()
                elif now.weekday() == 5:  # Today is Saturday - use next weekend
                    days_until_saturday = 7
                else:                     # Sunday - use next weekend
                    days_until_saturday = 6
                from_absoluteDay = now + timedelta(days=days_until_saturday)
                logger.info(f"Next Saturday (from): current weekday={now.weekday()}, days to add={days_until_saturday} → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif from_time == "next week Friday":
                # Go to next Monday, then +4 days
                days_until_friday = (7 - now.weekday()) + 4
                from_absoluteDay = now + timedelta(days=days_until_friday)
                logger.info(f"Next week Friday (from): current weekday={now.weekday()}, days to add={days_until_friday} → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif from_time == "next week Saturday":
                # Go to next Monday, then +5 days
                days_until_saturday = (7 - now.weekday()) + 5
                from_absoluteDay = now + timedelta(days=days_until_saturday)
                logger.info(f"Next week Saturday (from): current weekday={now.weekday()}, days to add={days_until_saturday} → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif from_time == "next week Sunday":
                # Go to next Monday, then +6 days
                days_until_sunday = (7 - now.weekday()) + 6
                from_absoluteDay = now + timedelta(days=days_until_sunday)
                logger.info(f"Next week Sunday (from): current weekday={now.weekday()}, days to add={days_until_sunday} → {from_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            else:
                pass  # more rules here
        if from_time == "next Monday split":
            # Split variants populate separate day/month/year fields.
            puday = apply_rules_to_timeFormat(relativeRules["expected"]["puDay"], from_absoluteDay)
            config["rules"]["expected"]["puDay"] = puday
            pumonth = apply_rules_to_timeFormat(relativeRules["expected"]["puMonth"], from_absoluteDay)
            config["rules"]["expected"]["puMonth"] = pumonth
            puyear = apply_rules_to_timeFormat(relativeRules["expected"]["puYear"], from_absoluteDay)
            config["rules"]["expected"]["puYear"] = puyear
            logger.info(f"Monday split formatting: puDay={puday}, puMonth={pumonth}, puYear={puyear}")
        else:
            regular_from_time = apply_rules_to_timeFormat(relativeRules["expected"]["from"], from_absoluteDay)
            config["rules"]["expected"]["from"] = regular_from_time
            logger.info(f"From time formatted: {regular_from_time}")

        # deal with to_time
        if relativeTime_to_IntDay[to_time] != "special":
            to_offset = relativeTime_to_IntDay[to_time]
            to_absoluteDay = now + timedelta(days=to_offset)
            logger.info(f"To time calculation: {to_time} = {to_offset} days → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
        else:
            if to_time == "this Sunday":
                days_until_sunday = (6 - now.weekday())
                to_absoluteDay = now + timedelta(days=days_until_sunday)
                logger.info(f"This Sunday: current weekday={now.weekday()}, days to add={days_until_sunday} → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif to_time == "11th next month":
                next_year = now.year + 1 if now.month == 12 else now.year
                next_month = now.month + 1 if now.month < 12 else 1
                to_absoluteDay = timezone.localize(datetime(next_year, next_month, 11))
                logger.info(f"11th next month (to): {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif to_time == "next Friday" or to_time == "next Friday split":
                if from_time in ["next Monday", "next Monday split"]:
                    # Friday of the same week as the already-computed Monday.
                    to_absoluteDay = from_absoluteDay + timedelta(days=4)
                    logger.info(f"Next Friday (same week as Monday): from Monday {from_absoluteDay.strftime('%Y-%m-%d')} + 4 days → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
                else:
                    # Standalone "next Friday" calculation
                    if now.weekday() < 4:  # Monday to Thursday
                        days_to_friday = 4 - now.weekday()
                    else:                  # Friday to Sunday
                        days_to_friday = (6 - now.weekday()) + 5
                    to_absoluteDay = now + timedelta(days=days_to_friday)
                    logger.info(f"Next Friday (standalone): current weekday={now.weekday()}, days to add={days_to_friday} → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif to_time == "next Sunday":
                if from_time in ["next Friday", "next Saturday"]:
                    # Sunday of the same weekend as from_time.
                    days_to_add_for_sunday = 6 - from_absoluteDay.weekday()
                    to_absoluteDay = from_absoluteDay + timedelta(days=days_to_add_for_sunday)
                    logger.info(f"Next Sunday (to, same weekend as {from_time}): from {from_absoluteDay.strftime('%Y-%m-%d %A')} + {days_to_add_for_sunday} days → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
                else:
                    # Standalone next Sunday calculation
                    if now.weekday() < 6:  # Monday to Saturday - use this weekend
                        days_until_sunday = 6 - now.weekday()
                    else:                  # Sunday - use next weekend
                        days_until_sunday = 7
                    to_absoluteDay = now + timedelta(days=days_until_sunday)
                    logger.info(f"Next Sunday (to, standalone): current weekday={now.weekday()}, days to add={days_until_sunday} → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            elif to_time == "next week Sunday":
                if from_time in ["next week Friday", "next week Saturday"]:
                    # Sunday of the same week as from_time.
                    days_to_add_for_sunday = 6 - from_absoluteDay.weekday()
                    to_absoluteDay = from_absoluteDay + timedelta(days=days_to_add_for_sunday)
                    logger.info(f"Next week Sunday (to, same week as {from_time}): from {from_absoluteDay.strftime('%Y-%m-%d %A')} + {days_to_add_for_sunday} days → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
                else:
                    # Standalone: go to next Monday, then +6 days.
                    days_until_sunday = (7 - now.weekday()) + 6
                    to_absoluteDay = now + timedelta(days=days_until_sunday)
                    logger.info(f"Next week Sunday (to, standalone): current weekday={now.weekday()}, days to add={days_until_sunday} → {to_absoluteDay.strftime('%Y-%m-%d %H:%M:%S %Z')}")
            else:
                pass  # more rules here
        if to_time == "next Friday split":
            to_day = apply_rules_to_timeFormat(relativeRules["expected"]["doDay"], to_absoluteDay)
            config["rules"]["expected"]["doDay"] = to_day
            to_month = apply_rules_to_timeFormat(relativeRules["expected"]["doMonth"], to_absoluteDay)
            config["rules"]["expected"]["doMonth"] = to_month
            to_year = apply_rules_to_timeFormat(relativeRules["expected"]["doYear"], to_absoluteDay)
            config["rules"]["expected"]["doYear"] = to_year
            logger.info(f"Friday split formatting: doDay={to_day}, doMonth={to_month}, doYear={to_year}")
        else:
            regular_to_time = apply_rules_to_timeFormat(relativeRules["expected"]["to"], to_absoluteDay)
            config["rules"]["expected"]["to"] = regular_to_time
            logger.info(f"To time formatted: {regular_to_time}")

    logger.info(f"[DEBUG] Final config rules: {config['rules']}")
    # (dropped a stray debug `print(config["rules"])` — the log line above covers it)
    return config["rules"]


def apply_rules_to_timeFormat(timeFormat: str, absoluteDay: datetime):
    """Substitute the {DoW}/{Month}/{DayD}/... placeholders in *timeFormat*
    with the corresponding fields of *absoluteDay* (first occurrence each)."""
    timeFormat = timeFormat.replace("{DoW}", day_of_week_mapping[absoluteDay.weekday()], 1)
    timeFormat = timeFormat.replace("{Month}", month_mapping[absoluteDay.month], 1)
    timeFormat = timeFormat.replace("{DayD}", str(absoluteDay.day), 1)
    timeFormat = timeFormat.replace("{Year}", str(absoluteDay.year), 1)
    timeFormat = timeFormat.replace("{Month0D}", f"{absoluteDay.month:02d}", 1)
    timeFormat = timeFormat.replace("{month}", month_mapping_full[absoluteDay.month], 1)
    timeFormat = timeFormat.replace("{MonthFull}", Month_Mapping_Full[absoluteDay.month], 1)
    timeFormat = timeFormat.replace("{Day0D}", f"{absoluteDay.day:02d}", 1)
    timeFormat = timeFormat.replace("{MonthD}", str(absoluteDay.month), 1)
    # you can add other replace rules here
    return timeFormat


def get_accessibility_tree(env, *args) -> str:
    """Return the VM accessibility tree (also logged at debug level)."""
    accessibility_tree: str = env.controller.get_accessibility_tree()
    logger.debug("AT@eval: %s", accessibility_tree)
    return accessibility_tree


def get_time_diff_range(env, config) -> str:
    """Return config["diff_range_in_minutes"], or None (with an error log)
    if the key is absent. (Narrowed from a bare `except`.)"""
    if "diff_range_in_minutes" in config:
        return config["diff_range_in_minutes"]
    logger.error("diff_range_in_minutes not found in config.")
    return None


def get_timezone_from_ip() -> str:
    """
    Get timezone from IP address using an IP geolocation API.
    Returns a timezone string like 'Europe/Zurich', or 'UTC' as fallback.
    """
    try:
        # Try ipapi.co first
        response = requests.get('https://ipapi.co/json/', timeout=5)
        if response.status_code == 200:
            data = response.json()
            timezone = data.get('timezone')
            if timezone:
                logger.info(f"Timezone from IP: {timezone}")
                return timezone
    except Exception as e:
        logger.warning(f"Failed to get timezone from IP: {e}")

    # Fallback to UTC
    logger.info("Using UTC as fallback timezone")
    return 'UTC'


def get_timezone_from_config(config: Dict, default_timezone: Optional[str] = None) -> str:
    """
    Get timezone from config, with fallback options.
    Priority: config timezone > default_timezone > IP-based timezone > UTC
    """
    # Check if timezone is specified in config
    if "timezone" in config.get("rules", {}):
        timezone = config["rules"]["timezone"]
        logger.info(f"Using timezone from config: {timezone}")
        return timezone

    # Use provided default
    if default_timezone:
        logger.info(f"Using provided default timezone: {default_timezone}")
        return default_timezone

    # Get from IP
    return get_timezone_from_ip()


# ---------------------------------------------------------------------------
# openhands/nvidia/os_world/getters/replay.py (new file)
# ---------------------------------------------------------------------------
from typing import List, Dict, Any


def get_replay(env, trajectory: List[Dict[str, Any]]) -> None:
    """Replay a recorded trajectory by executing pyautogui commands on the VM."""
    # fixme: needs to be combined with the accessibility tree to activate the
    # selection of the target window

    def parse(action):
        if action["type"] == "hotkey":
            keys = "', '".join(action["param"])
            return f"pyautogui.hotkey('{keys}')"

        if action["type"] == "typewrite":
            text = action["param"]
            return f"pyautogui.typewrite('{text}')"

        if action["type"] == "press":
            key = action["param"]
            return f"pyautogui.press('{key}')"

        # Previously an unknown type fell through and executed `None`.
        raise ValueError(f"Unsupported replay action type: {action['type']}")

    for action in trajectory:
        env.controller.execute_python_command(parse(action))
def get_vlc_config(env, config: Dict[str, str]):
    """Download VLC's ``vlcrc`` configuration file from the VM.

    The remote location depends on the guest OS; the file is fetched through
    the env controller and cached locally under
    ``env.cache_dir/config["dest"]``.

    Returns the local path of the cached copy.
    """
    # fixme: depends on how we config and install the vlc in virtual machine,
    # need to be aligned and double-checked
    remote_path_scripts = {
        "Linux": "import os; print(os.path.expanduser('~/.config/vlc/vlcrc'))",
        "Darwin": "import os; print(os.path.expanduser('~/Library/Preferences/org.videolan.vlc/vlcrc'))",
        "Windows": "import os; print(os.path.expanduser('~\\AppData\\Roaming\\vlc\\vlcrc'))",
    }

    os_type = env.vm_platform
    if os_type not in remote_path_scripts:
        raise Exception("Unsupported operating system", os_type)

    # Ask the guest to expand the per-OS config path, then pull the file.
    config_path = env.controller.execute_python_command(
        remote_path_scripts[os_type])['output'].strip()

    local_path = os.path.join(env.cache_dir, config["dest"])
    with open(local_path, "wb") as fh:
        fh.write(env.controller.get_file(config_path))

    return local_path
def get_vscode_config(env, config: Dict[str, Any]) -> str:
    """Run a VS Code command-palette command in the VM and fetch a file.

    Opens the command palette (Cmd/Ctrl+Shift+P depending on the platform),
    types ``config["vscode_extension_command"]``, confirms with Enter, waits
    briefly for VS Code to act, then downloads ``config["path"]`` to
    ``config["dest"]`` and returns the local path.
    """
    # fixme: depends on how we config and install the vscode in virtual
    # machine, need to be aligned and double-checked
    command = config["vscode_extension_command"]
    modifier = "command" if env.vm_platform == "MacOS" else "ctrl"

    trajectory = [
        {"type": "hotkey", "param": [modifier, "shift", "p"]},
        {"type": "typewrite", "param": command},
        {"type": "press", "param": "enter"},
    ]

    get_replay(env, trajectory)
    time.sleep(1.0)  # give VS Code a moment to execute the command

    return get_vm_file(env, {"path": config["path"], "dest": config["dest"]})
compare_docx_files, + compare_docx_tables, + compare_line_spacing, + compare_insert_equation, + compare_highlighted_text, + is_first_line_centered, + check_file_exists, + check_tabstops, + compare_contains_image, + compare_docx_files_and_ignore_new_lines, + compare_docx_images, + compare_image_text, + compare_references, + compare_unique_train_records +) +from .general import ( + check_csv, + check_accessibility_tree, + run_sqlite3, + check_json, + check_list, + exact_match, + match_in_list, + is_in_list, + fuzzy_match, + check_include_exclude, + check_direct_json_object, + compare_time_in_speedtest_results, + is_included_all_json_objects, + is_gold_text_included_in_pdf, + check_line_number, + file_contains, + compare_terminal_and_txt, + fuzzy_place_math, + compare_python_pure_text, + diff_text_file, + literal_match +) +from .gimp import ( + check_structure_sim_resized, + check_brightness_decrease_and_structure_sim, + check_contrast_increase_and_structure_sim, + check_saturation_increase_and_structure_sim, + check_image_size, + check_image_mirror, + check_palette_and_structure_sim, + check_textbox_on_leftside, + check_green_background, + check_file_exists_and_structure_sim, + check_triangle_position, + check_structure_sim, + check_config_status, + compare_image_list, + increase_saturation, + decrease_brightness, + check_file_exists, + compare_triangle_positions, + check_sharper, + check_image_file_size +) +from .libreoffice import check_libre_locale +from .others import compare_epub, check_mp3_meta +from .pdf import check_pdf_pages +from .slides import ( + check_presenter_console_disable, + check_image_stretch_and_center, + check_slide_numbers_color, + compare_pptx_files, + check_strikethrough, + check_slide_orientation_Portrait, + evaluate_presentation_fill_to_rgb_distance, + check_left_panel, + check_transition, + check_page_number_colors, + check_auto_saving_time +) +from .table import ( + compare_table, + compare_csv, + compare_conference_city_in_order +) +from 
def check_gnome_favorite_apps(apps_str: str, rule):
    """Check the GNOME favourite-apps list against the expected set.

    Args:
        apps_str: string representation of a Python list, e.g.
            "['thunderbird.desktop', 'vim.desktop', 'google-chrome.desktop']".
        rule: dict with an "expected" list of .desktop entries.

    Returns:
        1 if both lists contain exactly the same entries (order-insensitive),
        0 otherwise.
    """
    import ast

    # Parse the literal safely; the previous eval() would execute arbitrary
    # code if the string coming back from the VM were malicious.
    apps = ast.literal_eval(apps_str)

    expected_apps = rule["expected"]

    # Length check first: it catches duplicates that the set comparison
    # alone would hide.
    if len(apps) != len(expected_apps):
        return 0

    return 1 if set(apps) == set(expected_apps) else 0
def is_in_vm_clickboard(config, terminal_output):
    """Check whether the expected text appears in the VM clipboard dump.

    ``config["expected"]`` may be a single string or a list of strings; every
    entry must occur as a substring of *terminal_output* for a score of 1.
    """
    print("terminal_output: ")
    print(terminal_output)
    print("config: ")
    print(config)

    expected = config["expected"]
    # Normalise the single-string case to a one-element list so both shapes
    # share a single containment check.
    if not isinstance(expected, list):
        expected = [expected]
    return 1 if all(item in terminal_output for item in expected) else 0
def is_expected_active_tab_approximate(active_tab_info: Dict[str, str], rule: Dict[str, Any]) -> float:
    """Check the active Chrome tab against an expected URL, ignoring the query string.

    ``active_tab_info`` is either a dict with a ``url`` key or a bare URL
    string. Returns 1 when scheme/host/path match (query part dropped on both
    sides), 0 otherwise or when the rule type is unknown.
    """
    if not active_tab_info:
        return 0.

    match_type = rule['type']
    if match_type != "url":
        logger.error(f"Unknown type: {match_type}")
        return 0

    if isinstance(active_tab_info, dict):
        actual_url = active_tab_info.get('url', None)
    else:
        actual_url = active_tab_info

    from urllib.parse import urlparse, urlunparse

    def strip_query(url):
        # Rebuild the URL with an empty query component.
        return urlunparse(urlparse(url)._replace(query=""))

    return 1 if strip_query(rule['url']) == strip_query(actual_url) else 0
+ + # Extract URL from result parameter - result can be either a string URL or a dict with 'url' field + if isinstance(result, str): + result_url = result + logger.info("result url: {}".format(result_url)) + elif isinstance(result, dict) and 'url' in result: + result_url = result['url'] + logger.info("result url: {}".format(result_url)) + else: + logger.error(f"Invalid result format: {type(result)}, expected string URL or dict with 'url' field") + return 0. + + logger.info(f"Result URL to match: {result_url}") + + # expect_regex = re.compile(rules["expected"]) + patterns = rules["expected"] + logger.info("expected_regex: {}".format(patterns)) + for pattern in patterns: + match = re.search(pattern, result_url) + logger.info("match: {}".format(match)) + if not match: + return 0. + return 1. + + +def is_expected_installed_extensions(installed_extensions, expected) -> float: + if not installed_extensions: + return 0. + + logger.info("installed_extensions: ") + logger.info(installed_extensions) + expected_extensions = expected["expected"] + + # whether the expected extensions are installed + set_expected_extensions = set(expected_extensions) + set_installed_extensions = set(installed_extensions) + + if set_expected_extensions.issubset(set_installed_extensions): + return 1. + else: + return 0. + + +def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> float: + """ + Checks if the expected tabs are open in Chrome. + """ + if not open_tabs: + return 0. 
def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:
    """
    Checks if the expected bookmarks are in Chrome.

    Supported rule types:
      - "bookmark_bar_folders_names": folder names on the bookmark bar must
        equal ``rule['names']`` as a set.
      - "bookmark_bar_websites_urls": URLs on the bookmark bar must equal
        ``rule['urls']`` as a set.
      - "liked_authors_websites_urls": a "Liked Authors" folder must exist and
        contain one accepted URL combination from ``rule['urls']`` (each entry
        is a URL or a list of alternative URLs).

    Returns 1. on success, 0. on failure; raises TypeError for unknown types.
    """
    if not bookmarks:
        return 0.
    elif rule['type'] == "bookmark_bar_folders_names":
        bookmark_bar_folders_names = [bookmark['name'] for bookmark in bookmarks['bookmark_bar']['children'] if
                                      bookmark['type'] == 'folder']
        return 1. if set(bookmark_bar_folders_names) == set(rule['names']) else 0.
    elif rule['type'] == "bookmark_bar_websites_urls":
        bookmark_bar_websites_urls = [bookmark['url'] for bookmark in bookmarks['bookmark_bar']['children'] if
                                      bookmark['type'] == 'url']
        return 1. if set(bookmark_bar_websites_urls) == set(rule['urls']) else 0.
    elif rule['type'] == "liked_authors_websites_urls":
        # Check if "liked authors" folder exists
        liked_authors_folder = next((bookmark for bookmark in bookmarks['bookmark_bar']['children'] if
                                     bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None)
        if not liked_authors_folder:
            return 0.

        logger.info("'Liked Authors' folder exists")
        liked_authors_urls = [bookmark['url'] for bookmark in liked_authors_folder['children'] if
                              bookmark['type'] == 'url']
        logger.info("Here is the 'Liked Authors' folder's urls: {}".format(liked_authors_urls))

        # Normalise each entry into a list of alternatives WITHOUT mutating
        # rule['urls'] in place (the previous code rewrote the caller's rule
        # dict via urls[idx] = [url]).
        url_alternatives = [[url] if isinstance(url, str) else url for url in rule['urls']]

        # Accept any combination of alternatives that matches the folder.
        for combination in product(*url_alternatives):
            if set(combination) == set(liked_authors_urls):
                return 1.
        return 0.
    else:
        raise TypeError(f"{rule['type']} not support yet!")
def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
    """Compare the embedded images of two PDFs via perceptual hashing.

    Each corresponding image pair is hashed with ``imagehash.phash``; a pair
    counts as a match when the hash distance is <= 5. Returns the fraction of
    matching pairs, 1.0 when neither PDF contains images, and 0.0 on any
    error or when the image counts differ.
    """
    if not pdf1_path or not pdf2_path:
        return 0.
    if not all(map(os.path.exists, [pdf1_path, pdf2_path])):
        logger.warning(f"PDF file does not exist: {pdf1_path} or {pdf2_path}")
        return 0.

    def extract_images_from_pdf(pdf_path):
        # Pull every embedded raster image out of the PDF, page by page.
        pdf_document = fitz.open(pdf_path)
        images = []

        for page_number in range(pdf_document.page_count):
            page = pdf_document[page_number]
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]

                # convert to PIL Image
                try:
                    pil_image = Image.open(io.BytesIO(image_bytes))
                    images.append(pil_image)
                except Exception as e:
                    logger.error(f"Failed to process image in {pdf_path} on page {page_number}: {e}")

        return images

    # Work on copies in a scratch directory so extraction cannot disturb the
    # original files.
    temp_dir = Path(pdf1_path).parent / "temp_pdf_comparison"
    os.makedirs(temp_dir, exist_ok=True)

    temp_pdf1 = temp_dir / Path(pdf1_path).name
    temp_pdf2 = temp_dir / Path(pdf2_path).name

    shutil.copy(pdf1_path, temp_pdf1)
    shutil.copy(pdf2_path, temp_pdf2)

    try:
        images1 = extract_images_from_pdf(str(temp_pdf1))
        images2 = extract_images_from_pdf(str(temp_pdf2))
    except Exception as e:
        logger.error(f"Error extracting images from PDFs: {e}")
        return 0.
    finally:
        # Single cleanup point. The previous version also removed the
        # directory inside the except branch, so this finally-clause rmtree
        # raised FileNotFoundError and masked the intended `return 0.`.
        shutil.rmtree(temp_dir, ignore_errors=True)

    if len(images1) != len(images2):
        logger.info(f"Different number of images found. Gold: {len(images1)}, Pred: {len(images2)}")
        return 0.

    if not images1:
        logger.info("No images found in either PDF. Considering it a match.")
        return 1.0

    hash_threshold = 5
    total_score = 0
    for i, (img1, img2) in enumerate(zip(images1, images2)):
        hash1 = imagehash.phash(img1)
        hash2 = imagehash.phash(img2)
        hash_diff = hash1 - hash2

        logger.info(f"Image {i+1}: Gold hash: {hash1}, Pred hash: {hash2}, Hash difference: {hash_diff}")

        if hash_diff <= hash_threshold:
            total_score += 1

    return total_score / len(images1)
+ + def get_compare_function(): + file_type = kwargs.pop('file_type', 'text') + if file_type == 'text': + from .vscode import compare_text_file + return compare_text_file + elif file_type == 'pdf': + return compare_pdfs + elif file_type == 'docx': + from .docs import compare_docx_files + return compare_docx_files + elif file_type == 'ppt': + from .slides import compare_pptx_files + return compare_pptx_files + elif file_type == 'image': + from .vlc import compare_images + return compare_images + elif file_type == 'csv': + from .table import compare_csv + return compare_csv + elif file_type == 'table': + from .table import compare_table + return compare_table + elif file_type == 'audio': + from .vlc import compare_audios + return compare_audios + elif file_type == 'video': + from .vlc import compare_videos + return compare_videos + else: + raise ValueError('[ERROR]: not support file type: %s' % file_type) + + score = 0 + compare_function = get_compare_function() + for f1, f2 in zip(pred_files, gold_files): + fp1 = os.path.join(pred_folder, file_path, f1) + fp2 = os.path.join(gold_folder, file_path, f2) + score += compare_function(fp1, fp2, **kwargs) + return score / len(pred_files) + + +def compare_htmls(html_path1: str, html_path2: str, **options) -> float: + """ + Compare two HTML files. 
def is_cookie_deleted(cookie_data, rule):
    """Return 1. when none of the rule's domains still has a stored cookie.

    ``cookie_data`` is a sequence of cookie rows whose second column is the
    domain; any surviving cookie for a listed domain scores 0. Only the
    'domains' rule type is supported.
    """
    if rule['type'] != 'domains':
        raise TypeError(f"{rule['type']} not support yet!")

    stored_domains = [row[1] for row in cookie_data]
    for target in rule['domains']:
        if any(compare_urls(target, stored) for stored in stored_domains):
            return 0.
    return 1.
def check_history_deleted(history_data, rule):
    """Return 1. when no history entry still matches a forbidden keyword.

    ``history_data`` is a sequence of history rows whose first column is the
    URL/domain; a surviving row containing any of ``rule['keywords']`` as a
    substring scores 0. Only the 'keywords' rule type is supported.
    """
    if rule['type'] != 'keywords':
        raise TypeError(f"{rule['type']} not support yet!")

    visited = [row[0] for row in history_data]
    matched = any(kw in url for kw in rule['keywords'] for url in visited)
    return 0. if matched else 1.
diff --git a/openhands/nvidia/os_world/metrics/docs.py b/openhands/nvidia/os_world/metrics/docs.py new file mode 100644 index 000000000..a74f05e2a --- /dev/null +++ b/openhands/nvidia/os_world/metrics/docs.py @@ -0,0 +1,1192 @@ +import os +import re +import xml.etree.ElementTree as ET +import zipfile +import tempfile +import subprocess +import struct +import numpy as np +from io import BytesIO +from typing import List, Dict, Any + +import easyocr +from PIL import Image +from docx import Document +from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT +from docx.shared import RGBColor +from odf.opendocument import load +from odf.text import P +from odf.text import Span +from rapidfuzz import fuzz +from skimage.color import deltaE_ciede2000 +from skimage.color import rgb2lab + +from openhands.core.logger import openhands_logger as logger + + +def read_x11_image(filepath): + """ + Pure Python X11 (XWD) format reader that converts to PIL Image. + No external dependencies required. 
+ + Args: + filepath: Path to the X11/XWD format image file + + Returns: + PIL.Image: Converted image in RGB format + + Raises: + ValueError: If the format is not supported + IOError: If file cannot be read + """ + with open(filepath, 'rb') as f: + # Read X11 header + header_data = f.read(100) + + # Parse header (big endian format) + header_size = struct.unpack('>I', header_data[0:4])[0] + version = struct.unpack('>I', header_data[4:8])[0] + pixmap_format = struct.unpack('>I', header_data[8:12])[0] + pixmap_depth = struct.unpack('>I', header_data[12:16])[0] + pixmap_width = struct.unpack('>I', header_data[16:20])[0] + pixmap_height = struct.unpack('>I', header_data[20:24])[0] + + logger.debug(f"X11 image info: {pixmap_width}x{pixmap_height}, depth={pixmap_depth}") + + # Skip to the end of header + f.seek(header_size) + + # Read pixel data based on depth + if pixmap_depth == 32: + # 32-bit RGBA format + bytes_per_pixel = 4 + total_pixels = pixmap_width * pixmap_height + pixel_data = f.read(total_pixels * bytes_per_pixel) + + # Convert to numpy array + pixels = np.frombuffer(pixel_data, dtype=np.uint8) + + # Reshape to image dimensions + pixels = pixels.reshape((pixmap_height, pixmap_width, bytes_per_pixel)) + + # X11 format is typically BGRA, convert to RGB + # Swap B and R channels, ignore alpha + rgb_pixels = pixels[:, :, [2, 1, 0]] # BGR -> RGB + + # Create PIL image + image = Image.fromarray(rgb_pixels, 'RGB') + return image + + elif pixmap_depth == 24: + # 24-bit RGB format + bytes_per_pixel = 3 + total_pixels = pixmap_width * pixmap_height + pixel_data = f.read(total_pixels * bytes_per_pixel) + + # Convert to numpy array and reshape + pixels = np.frombuffer(pixel_data, dtype=np.uint8) + pixels = pixels.reshape((pixmap_height, pixmap_width, bytes_per_pixel)) + + # Create PIL image (assuming RGB order) + image = Image.fromarray(pixels, 'RGB') + return image + + else: + raise ValueError(f'Unsupported X11 pixel depth: {pixmap_depth}. 
def find_default_font(config_file_path, rules):
    """Check whether LibreOffice Writer's default font matches ``rules["font_name"]``.

    Parses the LibreOffice registry XML at *config_file_path* and reads the
    ``Standard`` property under ``/org.openoffice.Office.Writer/DefaultFont``.
    Returns 1 on a match, 0 on mismatch, missing path, or parse failure.
    """
    expected_font = rules["font_name"]
    if not config_file_path:
        return 0

    found_font = None
    # Namespace prefix used by LibreOffice registry files.
    ns = {'oor': 'http://openoffice.org/2001/registry'}
    try:
        root = ET.parse(config_file_path).getroot()
        items = root.findall('.//item[@oor:path="/org.openoffice.Office.Writer/DefaultFont"]', ns)
        for item in items:
            for prop in item.findall('.//prop[@oor:name="Standard"]', ns):
                for value in prop.findall('value', ns):
                    found_font = value.text
    except Exception as e:
        logger.error(f"Error: {e}")

    return 1 if found_font == expected_font else 0
ignore_order = options.get('ignore_order', False) + content_only = options.get('content_only', False) + fuzzy_match = options.get('fuzzy_match', False) + delete_empty_lines = options.get('delete_empty_lines', False) + + if not file1 or not file2: + return 0 + + def get_paragraph_texts_odt(document): + paragraphs = document.getElementsByType(P) + paragraph_texts = [] + for paragraph in paragraphs: + text_parts = [] + for node in paragraph.childNodes: + if node.nodeType == node.TEXT_NODE: + text_parts.append(node.data) + elif node.nodeType == node.ELEMENT_NODE and node.tagName == 'text:span': + # Assuming direct text content in , for simplicity + for child in node.childNodes: + if child.nodeType == child.TEXT_NODE: + text_parts.append(child.data) + paragraph_texts.append(''.join(text_parts)) + return paragraph_texts + + # Determine file types and load documents + if file1.endswith('.docx') and file2.endswith('.docx'): + try: + doc1 = Document(file1) + doc2 = Document(file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + doc1_paragraphs = [p.text for p in doc1.paragraphs] + doc2_paragraphs = [p.text for p in doc2.paragraphs] + if ignore_order: + doc1_paragraphs = sorted(doc1_paragraphs) + doc2_paragraphs = sorted(doc2_paragraphs) + if delete_empty_lines: + doc1_paragraphs = [p for p in doc1_paragraphs if p.strip()] + doc2_paragraphs = [p for p in doc2_paragraphs if p.strip()] + elif file1.endswith('.odt') and file2.endswith('.odt'): + try: + doc1 = load(file1) + doc2 = load(file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + doc1_paragraphs = get_paragraph_texts_odt(doc1) + doc2_paragraphs = get_paragraph_texts_odt(doc2) + if ignore_order: + doc1_paragraphs = sorted(doc1_paragraphs) + doc2_paragraphs = sorted(doc2_paragraphs) + if delete_empty_lines: + doc1_paragraphs = [p for p in doc1_paragraphs if p.strip()] + doc2_paragraphs = [p for p in doc2_paragraphs if p.strip()] + else: + # Unsupported file types or mismatch + 
print("Unsupported file types or mismatch between file types.") + return 0 + + if content_only: + # Compare the content of the documents + text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip() + text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip() + if ignore_case: + text1, text2 = text1.lower(), text2.lower() + similarity = fuzz.ratio(text1, text2) / 100.0 + return similarity + + # Process and compare documents + if ignore_blanks: + text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip() + text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip() + if ignore_case: + text1, text2 = text1.lower(), text2.lower() + + if fuzzy_match: + similarity = fuzz.ratio(text1, text2) / 100.0 + return similarity + else: + if text1 != text2: + return 0 + else: + if len(doc1_paragraphs) != len(doc2_paragraphs): + print(doc1_paragraphs) + print(doc2_paragraphs) + print(len(doc1_paragraphs)) + print(len(doc2_paragraphs)) + return 0 + + if fuzzy_match: + total_similarity = 0 + if not doc1_paragraphs: + return 1.0 + for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs): + if ignore_case: + p1, p2 = p1.lower(), p2.lower() + total_similarity += fuzz.ratio(p1, p2) / 100.0 + + if len(doc1_paragraphs) == 0: + return 1.0 if len(doc2_paragraphs) == 0 else 0.0 + + avg_similarity = total_similarity / len(doc1_paragraphs) + return avg_similarity + else: + # Compare each paragraph + for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs): + if ignore_case: + p1, p2 = p1.lower(), p2.lower() + if p1 != p2: + # show the difference + print("=== First Paragraph ===") + print(f"\033[92m{repr(p1)}\033[0m") # Green color for p1, repr() shows hidden chars + print("=== Second Paragraph ===") + print(f"\033[91m{repr(p2)}\033[0m") # Red color for p2, repr() shows hidden chars + print("=" * 50) # Clear boundary + return 0 + + return 1 + + +def compare_init_lines(file1, file2): + if not file1 or not file2: + return 0 + + try: + doc1 = Document(file1) + doc2 = Document(file2) + 
except Exception as e: + logger.error(f"Error: {e}") + return 0 + + doc1_paragraphs = [p.text for p in doc1.paragraphs] + doc2_paragraphs = [p.text for p in doc2.paragraphs] + + # Compare each paragraph + for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs): + if p1 != p2: + # print(p1) + # print(p2) + return 0 + + return 1 + + +def compare_docx_tables(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + + try: + doc1 = Document(docx_file1) + doc2 = Document(docx_file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + # get list of tables in docx + tables1 = doc1.tables + tables2 = doc2.tables + + if len(tables1) != len(tables2): + return 0 + + # Compare each table content + for table1, table2 in zip(tables1, tables2): + + if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns): + return 0 + + # Compare each cell + for i in range(len(table1.rows)): + for j in range(len(table1.columns)): + if table1.cell(i, j).text.strip() != table2.cell(i, j).text.strip(): + return 0 + + return 1 + + +def compare_docx_images(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + + try: + doc1 = Document(docx_file1) + doc2 = Document(docx_file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + def extract_images(doc): + images = [] + for rel in doc.part.rels.values(): + if "image" in rel.reltype: + img_data = rel.target_part.blob + images.append(BytesIO(img_data)) + return images + + images1 = extract_images(doc1) + images2 = extract_images(doc2) + if len(images1) != len(images2): + return 0 + for img1, img2 in zip(images1, images2): + if Image.open(img1).tobytes() != Image.open(img2).tobytes(): + return 0 + return 1 + + +def compare_image_text(image_path, rule): + if not image_path: + return 0 + + # Check if the image file exists + if not os.path.exists(image_path): + logger.error(f"Image file not found: {image_path}") + return 0 + + # Check image format and convert 
if necessary + temp_image_path = None + actual_image_path = image_path + + try: + # First, try to identify the file format + result = subprocess.run(['file', image_path], capture_output=True, text=True) + file_info = result.stdout.lower() + + # If it's an X11 screen dump, we need to convert it + if 'x-window screen dump' in file_info or 'xwd' in file_info: + logger.info(f"Detected X11 screen dump format in {image_path}, attempting conversion...") + + # Create a temporary file for the converted image + temp_fd, temp_image_path = tempfile.mkstemp(suffix='.png') + os.close(temp_fd) + + # Try to convert using PIL with xwd support or other methods + try: + # First try with PIL directly (sometimes it can handle xwd) + img = Image.open(image_path) + img.save(temp_image_path, 'PNG') + actual_image_path = temp_image_path + logger.info(f"Successfully converted X11 image using PIL") + except Exception as pil_error: + logger.warning(f"PIL conversion failed: {pil_error}") + + # Try our custom X11 reader (pure Python solution) + try: + logger.info("Attempting conversion using custom X11 reader...") + x11_image = read_x11_image(image_path) + x11_image.save(temp_image_path, 'PNG') + actual_image_path = temp_image_path + logger.info(f"✅ Successfully converted X11 image using custom reader") + except Exception as custom_error: + logger.warning(f"Custom X11 conversion failed: {custom_error}") + + # Try with netpbm tools if available (fallback) + try: + result = subprocess.run(['which', 'xwdtopnm'], capture_output=True) + if result.returncode == 0: + # Use netpbm tools chain: xwdtopnm -> pnmtopng + subprocess.run(['xwdtopnm', image_path], + stdout=subprocess.PIPE, check=True) + with open(temp_image_path, 'wb') as f: + result = subprocess.run(['xwdtopnm', image_path], + stdout=subprocess.PIPE, check=True) + result2 = subprocess.run(['pnmtopng'], + input=result.stdout, + stdout=f, check=True) + actual_image_path = temp_image_path + logger.info(f"Successfully converted X11 image using 
netpbm tools") + else: + raise Exception("netpbm tools not available") + except Exception as netpbm_error: + logger.warning(f"netpbm conversion failed: {netpbm_error}") + + # All conversions failed + logger.error( + f"❌ All X11 conversion methods failed.\n" + f"Attempted: PIL → Custom Python reader → netpbm tools\n" + f"💡 The image might be corrupted or in an unsupported X11 variant" + ) + + # If all conversions fail, try to use the original file anyway + # Sometimes easyocr might handle it better than PIL + if temp_image_path: + os.unlink(temp_image_path) + temp_image_path = None + actual_image_path = image_path + logger.info(f"Will attempt OCR on original file format (likely to fail)") + + # Now attempt OCR with error handling + try: + reader = easyocr.Reader(['en']) + result = reader.readtext(actual_image_path) + extracted_text = ' '.join([entry[1] for entry in result]) + + # Log OCR results + logger.info(f"OCR extracted texts: {[entry[1] for entry in result]}") + logger.info(f"Combined extracted text: {extracted_text}") + + if rule['type'] == 'text': + target_text = rule['text'] + match_found = target_text in extracted_text + + # Log matching results + logger.info(f"Target text: '{target_text}'") + logger.info(f"Match found: {match_found}") + if match_found: + logger.info("✅ Text matching successful!") + else: + logger.info("❌ Text matching failed!") + + return 1 if match_found else 0 + else: + raise ValueError("Unsupported rule type") + + except Exception as ocr_error: + logger.error(f"OCR processing failed for {actual_image_path}: {ocr_error}") + + # Check if this is specifically an X11 format issue + if 'x-window screen dump' in file_info or 'xwd' in file_info: + logger.error( + f"🚨 OCR failed on X11 screen dump after all conversion attempts.\n" + f"This might indicate:\n" + f" 1. The X11 file is corrupted or in an unsupported variant\n" + f" 2. Missing dependencies (numpy, PIL)\n" + f" 3. 
Insufficient memory for large images" + ) + + return 0 + + except Exception as e: + logger.error(f"Error processing image {image_path}: {e}") + return 0 + + finally: + # Clean up temporary file if created + if temp_image_path and os.path.exists(temp_image_path): + try: + os.unlink(temp_image_path) + except: + pass + + +def compare_line_spacing(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + + if not compare_docx_files(docx_file1, docx_file2): + return 0 + + try: + doc1 = Document(docx_file1) + doc2 = Document(docx_file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + if len(doc1.paragraphs) != len(doc2.paragraphs): + return 0 + + # Compare each paragraph line spacing + for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs): + + spacing1 = para1.paragraph_format.line_spacing + spacing2 = para2.paragraph_format.line_spacing + + if spacing1 != spacing2: + return 0 + + return 1 + + +def compare_insert_equation(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + + if not compare_docx_files(docx_file1, docx_file2): + return 0 + + try: + doc1 = Document(docx_file1) + doc2 = Document(docx_file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + # Compare each paragraph if it contains equation + for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs): + for run1, run2 in zip(para1.runs, para2.runs): + if run1.element.xpath('.//w:object') and run2.element.xpath('.//w:object'): + return 1 + return 0 + + +def compare_font_names(docx_file, rules: List[Dict[str, Any]]): + if not docx_file: + return 0 + + try: + doc = Document(docx_file) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + expected_font = rules["font_name"] + + for paragraph in doc.paragraphs: + for run in paragraph.runs: + font_name = run.font.name + if font_name != expected_font: + return 0 + return 1 + + +def compare_subscript_contains(docx_file1, docx_file2): + if not docx_file1 or not 
docx_file2: + return 0 + + try: + doc1 = Document(docx_file1) + doc2 = Document(docx_file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs): + for run1, run2 in zip(para1.runs, para2.runs): + # check if two paras both contain subscript + if run1.font.subscript and run2.font.subscript: + return 1 + return 0 + + +def has_page_numbers_in_footers(docx_file): + if not docx_file: + return 0 + + try: + doc = Document(docx_file) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + for section in doc.sections: + footer = section.footer + if footer is None: + return 0 + footer_text = footer.paragraphs[0].text if footer.paragraphs else '' + if not any(char.isdigit() for char in footer_text): + # if no digit in footer, then no page number + return 0 + return 1 + + +def is_first_line_centered(docx_file): + if not docx_file: + return 0 + + try: + doc = Document(docx_file) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + first_paragraph = doc.paragraphs[0] + + # check if the first line is center justified + return 1 if first_paragraph.paragraph_format.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER else 0 + + +def check_file_exists(directory, filename): + if not directory or not filename: + return 0 + file_path = os.path.join(directory, filename) + return 1 if os.path.isfile(file_path) else 0 + + +def check_tabstops(docx_file1, docx_file2, **kwargs) -> float: + if not docx_file1 or not docx_file2: + return .0 + + try: + doc1: Document = Document(docx_file1) + doc2: Document = Document(docx_file2) + except Exception as e: + logger.error(f"Error: {e}") + return .0 + + para1 = [p for p in doc1.paragraphs if p.text.strip()] + para2 = [p for p in doc2.paragraphs if p.text.strip()] + if len(para1) != len(para2): return .0 + + if kwargs.get('word_number_split_by_tabstop', None) is not None: + number = kwargs['word_number_split_by_tabstop'] + index = kwargs.get('index', 0) + 
for p1 in para1: + splits = p1.text.split('\t') + if len(splits) == 0: return .0 + words = list(filter(lambda x: x.strip(), re.split(r'\s', splits[index]))) + if len(words) != number: return .0 + + section = doc2.sections[0] + paragraph_width = section.page_width - section.left_margin - section.right_margin + ignore_tabs = lambda x: x.alignment == WD_TAB_ALIGNMENT.CLEAR or ( + x.alignment == WD_TAB_ALIGNMENT.LEFT and x.position == 0) + minus = .0 + for p1, p2 in zip(para1, para2): + # filter CLEAR tabstop and default left-0 tabstop + tabs1 = [tst for tst in p1.paragraph_format.tab_stops if not ignore_tabs(tst)] + tabs2 = [tst for tst in p2.paragraph_format.tab_stops if not ignore_tabs(tst)] + if len(tabs1) != len(tabs2): return .0 + difference = .0 + for t1, t2 in zip(tabs1, tabs2): + if t1.alignment != t2.alignment: return .0 + difference += abs(t1.position - t2.position) + minus += difference / paragraph_width + score = 1 - (minus / len(para1)) + return score + + +def compare_contains_image(docx_file1, docx_file2): + if not docx_file1 or not docx_file2: + return 0 + + try: + doc1 = Document(docx_file1) + doc2 = Document(docx_file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs): + for run1, run2 in zip(para1.runs, para2.runs): + if ('graphicData' in run1._element.xml and 'graphicData' not in run2._element.xml) or ( + 'graphicData' not in run1._element.xml and 'graphicData' in run2._element.xml): + return 0 + return 1 + + +def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs): + if not file_path1 or not file_path2: + return 0 + + if not compare_docx_files(file_path1, file_path2): + return 0 + + try: + document = Document(file_path1) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + threshold = kwargs.get('threshold', 3.5) + + def _calculate_color_difference(rgb1, rgb2): + srgb1 = [rgb1[0] / 255.0, rgb1[1] / 255.0, rgb1[2] / 255.0] + srgb2 = 
[rgb2[0] / 255.0, rgb2[1] / 255.0, rgb2[2] / 255.0] + lab1, lab2 = rgb2lab(srgb1), rgb2lab(srgb2) + delta_e = deltaE_ciede2000(lab1, lab2) + return delta_e + + for table in document.tables: + # Iterate through rows and cells in the table + for row in table.rows: + for cell in row.cells: + for paragraph in cell.paragraphs: + for run in paragraph.runs: + word = run.text + if word: + first_letter = word[0].lower() + + if first_letter in 'aeiou' and _calculate_color_difference(run.font.color.rgb, + RGBColor(255, 0, 0)) > threshold: + return 0 # Vowel-colored words should be red + elif first_letter not in 'aeiou' and _calculate_color_difference(run.font.color.rgb, + RGBColor(0, 0, + 255)) > threshold: + return 0 # Non-vowel-colored words should be blue + + return 1 # All words in tables are correctly colored + + +def check_highlighted_words(file_path1, file_path2): + if not file_path1 or not file_path2: + return 0 + + if not compare_docx_files(file_path1, file_path2): + return 0 + + doc = load(file_path1) + highlighted = False + + for span in doc.getElementsByType(Span): + style_name = span.getAttribute('stylename') + if style_name: + for automatic_style in doc.automaticstyles.childNodes: + if automatic_style.getAttribute('name') == style_name: + for property in automatic_style.childNodes: + if property.getAttribute('backgroundcolor') == '#ffff00': + highlighted = True + break + if highlighted: + break + + return 0 if highlighted else 1 + + +def evaluate_strike_through_last_paragraph(file_path1, file_path2): + if not file_path1 or not file_path2: + return 0 + + if not compare_docx_files(file_path1, file_path2): + return 0 + + try: + document = Document(file_path1) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + # Get the last paragraph + last_paragraph = document.paragraphs[-1] + + # Check if any run in the last paragraph has strike-through formatting + for run in last_paragraph.runs: + if not run.font.strike: + return 0 # At least one word does 
not have strike-through formatting + + return 1 # All words in the last paragraph have strike-through formatting + + +def evaluate_conversion(file_path): + if not file_path: + return 0 + + try: + document = Document(file_path) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + for table in document.tables: + for row in table.rows: + for cell in row.cells: + for paragraph in cell.paragraphs: + for run in paragraph.runs: + if run.text.isupper(): + return 0 # Uppercase text should be converted to lowercase + + for paragraph in document.paragraphs: + for run in paragraph.runs: + if run.text.isupper(): + return 0 # Uppercase text should be converted to lowercase + + return 1 # All uppercase text has been successfully converted + + +def evaluate_spacing(file_path): + if not file_path: + return 0 + + try: + document = Document(file_path) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + # Check line spacing for introduction, body, and conclusion + introduction_spacing = document.paragraphs[0].paragraph_format.line_spacing + body_spacing = document.paragraphs[1].paragraph_format.line_spacing + conclusion_spacing = document.paragraphs[2].paragraph_format.line_spacing + if (introduction_spacing == 1.0 and body_spacing == 2.0 and conclusion_spacing == 1.5): + return 1 + else: + return 0 + + +def check_italic_font_size_14(path1, path2): + if not path1 or not path2: + return 0 + + if not compare_docx_files(path1, path2): + return 0 + + try: + document = Document(path1) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + for paragraph in document.paragraphs: + for run in paragraph.runs: + if run.italic: + # Check if font size is 14 + if run.font.size is None or run.font.size.pt != 14: + return 0 + return 1 + + +def evaluate_alignment(docx_path): + if not docx_path: + return 0 + + # Load the document + try: + doc = Document(docx_path) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + # Iterate through 
each paragraph in the document + for para in doc.paragraphs: + # Split the paragraph into individual sentences + sentences = para.text.split('.') + + for sentence in sentences: + # Split the sentence into words + words = sentence.strip().split() + + # Check if the sentence has at least three words + if len(words) < 3: + continue # Skip sentences with less than three words + + # The first three words should be separated from the rest + first_part = ' '.join(words[:3]) + second_part = ' '.join(words[3:]) + + # Check if the sentence structure matches the pattern: first part + large space/tab + second part + if not (first_part in sentence and second_part in sentence and sentence.find(first_part) < sentence.find( + second_part)): + return 0 # The sentence does not meet the alignment criteria + + return 1 # All sentences meet the alignment criteria + + +def get_unique_train_ids(initial_file): # fixed standard + if not initial_file: + return set(), 0 + + try: + doc = Document(initial_file) + except Exception as e: + logger.error(f"Error: {e}") + return set(), 0 + + train_ids = set() + processed_lines = 0 + + for para in doc.paragraphs: + line_parts = para.text.split(',') + if len(line_parts) == 4: + train_id = line_parts[1].strip() + if train_id not in train_ids: + train_ids.add(train_id) + processed_lines += 1 + + return train_ids, processed_lines + + +def check_no_duplicates(initial_file, processed_file): + if not initial_file or not processed_file: + return 0 + + # Open the document + train_ids_ini, ini_lines = get_unique_train_ids(initial_file) + + try: + doc_processed = Document(processed_file) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + train_ids_pro = set() + processed_lines = 0 # Counter for valid lines processed + + # processed + for para in doc_processed.paragraphs: + # Each line has the format: time_HH:MM:SS, train_id, station_id, platform_no + line_parts = para.text.split(',') + # Ensure the line has the correct format + if 
len(line_parts) == 4: + train_id = line_parts[1].strip() + # If train_id is already in the set, it's a duplicate + if train_id in train_ids_pro: + return 0 # Duplicate found + train_ids_pro.add(train_id) + processed_lines += 1 # Increment valid lines counter + + if train_ids_pro != train_ids_ini or processed_lines != ini_lines: + return 0 + + # No duplicates found and at least one valid line was processed + return 1 + + +def compare_docx_lines(file1, file2): + if not file1 or not file2: + return 0 + + # Read the text of the document, line by line + try: + doc1 = Document(file1) + doc2 = Document(file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()] + doc2_lines = [p.text.strip() for p in doc2.paragraphs if p.text.strip()] + # print(doc1_lines) + # print(doc2_lines) + + # Convert the list of lines to sets and compare + if set(doc1_lines) == set(doc2_lines): + return 1 + else: + return 0 + + +def compare_docx_files_and_ignore_new_lines(file1, file2, **options): + ignore_blanks = options.get('ignore_blanks', True) + + if not file1 or not file2: + return 0 + + # Determine file types and load documents + if file1.endswith('.docx') and file2.endswith('.docx'): + try: + doc1 = Document(file1) + doc2 = Document(file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + # First, delete all the blank in paragraphs + doc1 = [p for p in doc1.paragraphs if p.text != ''] + doc2 = [p for p in doc2.paragraphs if p.text != ''] + doc1_paragraphs = [p.text for p in doc1] + doc2_paragraphs = [p.text for p in doc2] + else: + # Unsupported file types or mismatch + print("Unsupported file types or mismatch between file types.") + return 0 + + # Process and compare documents + if ignore_blanks: + text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip() + text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip() + if text1 != text2: + return 0 + else: + if 
len(doc1_paragraphs) != len(doc2_paragraphs): + return 0 + # Compare each paragraph + for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs): + if p1 != p2: + return 0 + return 1 + + +# Docx file saved in the ubuntu cannot use this function to compare highlight, don't know why, deprecated +def compare_highlighted_text(file1, file2): + if not file1 or not file2: + return 0 + + def extract_highlighted_text(file_path): + highlighted_texts = [] + + # Open the .docx file as a zip file and read the document.xml + with zipfile.ZipFile(file_path, 'r') as docx: + with docx.open('word/document.xml') as document_xml: + tree = ET.parse(document_xml) + root = tree.getroot() + + # Define the namespaces + namespaces = { + 'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main', + } + + # Find all runs with highlight property + for run in root.findall('.//w:r', namespaces): + highlight = run.find('.//w:highlight', namespaces) + if highlight is not None and highlight.get( + '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val') != 'none': + text = run.find('.//w:t', namespaces) + if text is not None: + highlighted_texts.append(text.text) + + return highlighted_texts + + # Read the highlighted text from both documents + doc1_highlighted = extract_highlighted_text(file1) + doc2_highlighted = extract_highlighted_text(file2) + + # Compare the sets of highlighted text to check if they are the same + if set(doc1_highlighted) == set(doc2_highlighted): + return 1 + else: + return 0 + + +def compare_references(file1, file2, **options): + if not file1 or not file2: + return 0 + + reference_indicator = options.get('reference_indicator', 'References') + reference_base_result = options.get('reference_base_result', 0.5) + + # Determine file types and load documents + if file1.endswith('.docx') and file2.endswith('.docx'): + try: + doc1 = Document(file1) + doc2 = Document(file2) + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + doc1_paragraphs = [p.text 
for p in doc1.paragraphs] + doc2_paragraphs = [p.text for p in doc2.paragraphs] + else: + # Unsupported file types or mismatch + print("Unsupported file types or mismatch between file types.") + return 0 + + # Find the references section in the paragraphs, find the idx of the last reference_indicator in the paragraph list + ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1 + ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1 + + if ref1_idx == -1 and ref2_idx == -1: + return 1 + + if ref1_idx == -1 or ref2_idx == -1: + return 0 + + # split the reference section into reference items, and remove the empty string items + ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()] + ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()] + + # Compare the references + + if len(ref1) != len(ref2): + return 0 + + total_similarity = 0 + for r1, r2 in zip(ref1, ref2): + # fuzzy match the references + similarity = fuzz.ratio(r1, r2) / 100.0 + total_similarity += similarity + + result = total_similarity / len(ref1) + + epsilon = 0.01 + + if result >= reference_base_result + epsilon: + return (result - reference_base_result) / (1 - reference_base_result) + else: + return 0 + + +def compare_unique_train_records(processed_file, expected_files, **kwargs): + """ + Compares the processed file with a list of expected files containing the + gold standard and the initial document. + expected_files[0] should be the gold standard file. + expected_files[1] should be the initial file. 
+ """ + # Debug logging to understand what we're actually receiving + logger.info(f"DEBUG: processed_file type: {type(processed_file)}, value: {processed_file}") + logger.info(f"DEBUG: expected_files type: {type(expected_files)}, value: {expected_files}") + logger.info(f"DEBUG: kwargs: {kwargs}") + + if not processed_file or not isinstance(expected_files, list) or len(expected_files) < 2: + logger.error("Invalid arguments: processed_file and a list of 2 expected_files are required.") + return 0 + + gold_file = expected_files[0] + initial_file = expected_files[1] + + if not gold_file or not initial_file: + logger.error("Gold file or initial file path is missing from expected_files list.") + return 0 + + # Helper function to get lines and IDs from a file + def get_lines_and_ids_from_file(file_path): + try: + doc = Document(file_path) + lines = [p.text.strip() for p in doc.paragraphs if p.text.strip()] + train_ids = [line.split(',')[1].strip() for line in lines if len(line.split(',')) == 4] + return lines, train_ids + except Exception as e: + logger.error(f"Error opening or parsing file {file_path}: {e}") + return None, None + + # Get data from all three files + processed_lines, processed_train_ids = get_lines_and_ids_from_file(processed_file) + if processed_lines is None: return 0 + + gold_lines, gold_train_ids = get_lines_and_ids_from_file(gold_file) + if gold_lines is None: return 0 + + initial_lines, _ = get_lines_and_ids_from_file(initial_file) + if initial_lines is None: return 0 + initial_lines_set = set(initial_lines) + + # 1. Subset Check: Ensure every processed line was in the initial file + if not set(processed_lines).issubset(initial_lines_set): + logger.error("Processed file contains lines not present in the initial file.") + logger.error(f"Extra lines: {set(processed_lines) - initial_lines_set}") + return 0 + + # 2. 
Uniqueness Check: Check for duplicates within the processed file + if len(processed_train_ids) != len(set(processed_train_ids)): + logger.error("Duplicate train_ids found in the processed file.") + return 0 + + # 3. Correctness Check: Compare the set of train_ids + if set(processed_train_ids) != set(gold_train_ids): + logger.error("Set of train_ids does not match between processed file and gold file.") + return 0 + + # 4. Line count check + if len(processed_lines) != len(gold_lines): + logger.error("Number of lines does not match between processed file and gold file.") + return 0 + + return 1 + +if __name__ == "__main__": + image_path = "/home/ubuntu/OSWorld/cache/02ce9a50-7af2-47ed-8596-af0c230501f8/ls.png" + print(compare_image_text(image_path, { + "type": "text", + "text": "ls" + })) \ No newline at end of file diff --git a/openhands/nvidia/os_world/metrics/general.py b/openhands/nvidia/os_world/metrics/general.py new file mode 100644 index 000000000..3830c7989 --- /dev/null +++ b/openhands/nvidia/os_world/metrics/general.py @@ -0,0 +1,663 @@ +import csv +import datetime +import difflib +import functools +import json +import operator +import os +import re +import sqlite3 +from numbers import Number +from typing import Callable, Any, Union +from typing import Dict, List, Pattern + +import lxml.etree +import pdfplumber +import yaml +from docx import Document +from lxml.cssselect import CSSSelector +from lxml.etree import _Element +from rapidfuzz import fuzz + +from openhands.nvidia.os_world.metrics.utils import _match_record, _match_value_to_rule +from openhands.core.logger import openhands_logger as logger + + +def check_include_exclude(result: str, rules: Dict[str, List[str]]) -> float: + if result is None: + return 0. + + print(result, rules) + include = rules.get("include", []) + exclude = rules.get("exclude", []) + if all(r in result for r in include) and all(r not in result for r in exclude): + return 1. + else: + return 0. 
+
+
+def exact_match(result, rules) -> float:
+    expect = rules["expected"]
+    print(result, expect)
+
+    if result == expect:
+        return 1.
+    else:
+        return 0.
+
+def match_in_list(result, rules) -> float:
+    expect = rules["expected"]
+    print(result, expect)
+
+    if result in expect:
+        return 1.
+    else:
+        return 0.
+
+def literal_match(result: Any, expected: Any, **options) -> float:
+    literal_type = options.get('type', 'str')
+    if literal_type == 'str':
+        ignore_case = options.get('ignore_case', False)
+        score = str(result) == str(expected) if not ignore_case else str(result).lower() == str(expected).lower()
+        return float(score)
+    elif literal_type == 'list':
+        if type(result) not in [list, tuple] or type(expected) not in [list, tuple] or len(result) != len(expected):
+            return .0
+        ignore_case = options.get('ignore_case', False)
+        result = [str(s) for s in result] if not ignore_case else [str(s).lower() for s in result]
+        expected = [str(s) for s in expected] if not ignore_case else [str(s).lower() for s in expected]
+        return float(result == expected)
+    else:
+        raise NotImplementedError(f"Type {literal_type} not supported")
+
+
+def is_in_list(result, rules) -> float:
+    expect = rules["expected"]
+    if expect in result:
+        return 1.
+    else:
+        return 0.
+
+
+def diff_text_file(result: str, expect: str) -> float:
+    if result is None:
+        return 0.
+
+    with open(result) as f:
+        result_lines: List[str] = f.read().splitlines()
+    with open(expect) as f:
+        expected_lines: List[str] = f.read().splitlines()
+    return difflib.SequenceMatcher(a=result_lines, b=expected_lines).ratio()
+
+
+def fuzzy_match(result, rules) -> float:
+    expect = rules["expected"]
+
+    return fuzz.ratio(result, expect) / 100.
+
+
+def fuzzy_place_math(result_file_path, rules) -> float:
+    if result_file_path is None:
+        return 0.
+    expect = rules["expected"]  # a list of possible answers
+    # read list.docx, and get all texts out, overlook blank lines, remove blanks before and after each line
+    doc = Document(result_file_path)
+    words_list = []
+    for para in doc.paragraphs:
+        words_list.extend(para.text.split())
+    fuzzy_score_list = []
+    for word in words_list:
+        max_score = 0
+        for ans in expect:
+            score = fuzz.ratio(word, ans) / 100
+            max_score = max(max_score, score)
+        fuzzy_score_list.append(max_score)
+    if len(fuzzy_score_list) != 3:
+        return 0.
+    return sum(fuzzy_score_list) / 3
+
+
+def check_csv(result: str, rules: Dict[str, List[Dict[str, str]]]) -> float:
+    """
+    Args:
+        result (str): path to csv file
+        rules (Dict[str, List[Dict[str, str]]]): dict like
+          {
+            "expect": [{key: value}]
+            "unexpect": [{key: value}]
+          }
+
+    Returns:
+        float
+    """
+
+    if result is None:
+        return 0.
+
+    expect_metrics = [False] * len(rules.get("expect", []))
+    unexpect_metric = True
+    with open(result) as f:
+        reader = csv.DictReader(f)
+
+        for rcd in reader:
+            for i, r in enumerate(rules.get("expect", [])):
+                expect_metrics[i] = expect_metrics[i] or _match_record(r, rcd)
+            unexpect_metric = unexpect_metric and not any(_match_record(r, rcd) for r in rules.get("unexpect", []))
+    return float(all(expect_metrics) and unexpect_metric)
+
+
+def check_list(result: str, rules: Dict[str, List[str]]) -> float:
+    """
+    Args:
+        result (str): path to list file
+        rules (Dict[str, List[str]]): dict like
+          {
+            "expect": list of str as regexes
+            "unexpect": list of str as regexes
+          }
+
+    Returns:
+        float
+    """
+
+    if result is None:
+        return 0.
+
+    expect_patterns: List[Pattern[str]] = [re.compile(ptt) for ptt in rules.get("expect", [])]
+    unexpect_patterns: List[Pattern[str]] = [re.compile(ptt) for ptt in rules.get("unexpect", [])]
+
+    expect_metrics = [False] * len(expect_patterns)
+    unexpect_metric = True
+    with open(result) as f:
+        for line in f:
+            for i, r in enumerate(expect_patterns):
+                expect_metrics[i] = expect_metrics[i] or (r.search(line) is not None)
+            unexpect_metric = unexpect_metric and all(r.search(line) is None for r in unexpect_patterns)
+    return float(all(expect_metrics) and unexpect_metric)
+
+
+_accessibility_ns_map = {
+    "ubuntu": {
+        "st": "https://accessibility.ubuntu.example.org/ns/state",
+        "attr": "https://accessibility.ubuntu.example.org/ns/attributes",
+        "cp": "https://accessibility.ubuntu.example.org/ns/component",
+        "doc": "https://accessibility.ubuntu.example.org/ns/document",
+        "docattr": "https://accessibility.ubuntu.example.org/ns/document/attributes",
+        "txt": "https://accessibility.ubuntu.example.org/ns/text",
+        "val": "https://accessibility.ubuntu.example.org/ns/value",
+        "act": "https://accessibility.ubuntu.example.org/ns/action",
+    },
+    "windows": {
+        "st": "https://accessibility.windows.example.org/ns/state",
+        "attr": "https://accessibility.windows.example.org/ns/attributes",
+        "cp": "https://accessibility.windows.example.org/ns/component",
+        "doc": "https://accessibility.windows.example.org/ns/document",
+        "docattr": "https://accessibility.windows.example.org/ns/document/attributes",
+        "txt": "https://accessibility.windows.example.org/ns/text",
+        "val": "https://accessibility.windows.example.org/ns/value",
+        "act": "https://accessibility.windows.example.org/ns/action",
+        "class": "https://accessibility.windows.example.org/ns/class"
+    },
+    "macos": {
+        "st": "https://accessibility.macos.example.org/ns/state",
+        "attr": "https://accessibility.macos.example.org/ns/attributes",
+        "cp": "https://accessibility.macos.example.org/ns/component",
+        "doc": "https://accessibility.macos.example.org/ns/document",
+        "txt": "https://accessibility.macos.example.org/ns/text",
+        "val": "https://accessibility.macos.example.org/ns/value",
+        "act": "https://accessibility.macos.example.org/ns/action",
+        "role": "https://accessibility.macos.example.org/ns/role",
+    }
+
+}
+
+def check_accessibility_tree(result: str, rules: List[Dict[str, Any]], osname: str = "ubuntu") -> float:
+    """
+    Args:
+        result (str): XML of GNOME Accessibility Tree
+        rules (List[Dict[str, Any]]): list of dict like
+          {
+            "selectors": list of str as CSS selectors, will be connected by ", "
+              to form a composite selector. Only one from `selectors` and
+              `xpath` is needed. If both are present, `xpath` takes the
+              priority.
+            "xpath": str as xpath. Only one from `selectors` and `xpath` is
+              needed. If both are present, `xpath` takes the priority.
+            "text": str as the expected text content of the selected element.
+            "exact": bool specifying whether exact match or fuzzy match should
+              be performed. defaults to True.
+          }
+        osname (str): "ubuntu" | "windows" | "macos". "ubuntu" by default.
+
+    Returns:
+        float
+    """
+
+    a11y_ns_map = _accessibility_ns_map[osname]
+
+    at: _Element = lxml.etree.fromstring(result)
+    total_match_score = 1.
+    for r in rules:
+        if "xpath" in r:
+            elements: List[_Element] = at.xpath(r["xpath"], namespaces=a11y_ns_map)
+        elif "selectors" in r:
+            selector = CSSSelector(", ".join(r["selectors"]), namespaces=a11y_ns_map)
+            elements: List[_Element] = selector(at)
+        else:
+            raise ValueError("At least one of xpath and selectors is required")
+
+        if len(elements) == 0:
+            logger.info("No elements: %s", r["xpath"] if "xpath" in r else r["selectors"])
+            return 0.
+
+        if "text" in r:
+            match_func: Callable[[str], Number] = functools.partial(operator.eq if r.get("exact", True) \
+                                                                        else (lambda a, b: fuzz.ratio(a, b) / 100.)
+ , r["text"] + ) + match_score: Number = 0 + for elm in elements: + match_score = max(match_score, match_func(elm.text or None)) + else: + match_score = 1. + total_match_score *= match_score + + return float(total_match_score) + + +# def check_existence(result: str, *args) -> float: +# return 1. - (result is None) + +def run_sqlite3(result: str, rules: Dict[str, Any]) -> float: + connection: sqlite3.Connection = sqlite3.connect(result) + cursor: sqlite3.Cursor = connection.execute(rules["sql"]) + return float(cursor.fetchone()[0] or 0) + + +def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str]]]], is_yaml: bool = False) -> float: + """ + Args: + result (str): path to json file + rules (Dict[str, List[Dict[str, Union[List[str], str]]]]): dict like + { + "expect": [ + { + "key": list of str + "method": str + "ref": something + } + ], + "unexpect": float: + """ + One of the most commonly used function to evalute. + Compare two json objects directly. + """ + logger.info(f"[DEBUG] check_direct_json_object called with result: {result}") + logger.info(f"[DEBUG] check_direct_json_object called with rules: {rules}") + + if isinstance(result, str): + # remove blanks before and after result + result = result.strip() + # replace all ' with " + result = result.replace("'", '"') + # load json object + result = json.loads(result) + + logger.info(f"[DEBUG] Processed result: {result}") + + if result is None: + logger.info("[DEBUG] Result is None, returning 0.0") + return 0. + + # Check if expected value contains evaluation failure indicator + try: + expected_json = rules.get("expected", {}) + if expected_json: + for key, value in expected_json.items(): + if value == "__EVALUATION_FAILED__": + logger.error(f"[DEBUG] Expected value for key '{key}' indicates evaluation failure, returning 0.0") + return 0. + except Exception as e: + logger.error(f"[DEBUG] Error checking for evaluation failure indicator: {e}") + return 0. 
+ try: + expect_in_result = rules.get("expect_in_result", False) + logger.info(f"[DEBUG] expect_in_result: {expect_in_result}") + + if not expect_in_result: + expected_json = rules["expected"] + logger.info(f"[DEBUG] Expected JSON: {expected_json}") + + for key in expected_json.keys(): + expected_value = expected_json.get(key) + actual_value = result.get(key) + logger.info(f"[DEBUG] Checking key '{key}': expected='{expected_value}', actual='{actual_value}'") + + if expected_json.get("ignore_list_order", False): + expected_value = sorted(expected_value) + result_value = sorted(result.get(key)) + logger.info(f"[DEBUG] Comparing lists (sorted): expected={expected_value}, actual={result_value}") + if expected_value != result_value: + logger.info(f"[DEBUG] List comparison failed for key '{key}', returning 0.0") + return 0. + else: + if expected_value != actual_value: + logger.info(f"[DEBUG] Value comparison failed for key '{key}': expected='{expected_value}', actual='{actual_value}', returning 0.0") + return 0. 
+ else: + logger.info(f"[DEBUG] Value comparison passed for key '{key}'") + + logger.info("[DEBUG] All comparisons passed, returning 1.0") + return 1.0 + else: + expected_json = rules["expected"] + logger.info(f"[DEBUG] Expected JSON (expect_in_result mode): {expected_json}") + + for key in expected_json.keys(): + if isinstance(expected_json.get(key), list): + flag = 0 + expected_value_list = expected_json.get(key) + logger.info(f"[DEBUG] Checking list key '{key}': expected_list={expected_value_list}, actual='{result.get(key)}'") + for each_expected_value in expected_value_list: + # Handle both list and string cases + if isinstance(result.get(key), list) and each_expected_value in result.get(key): + flag = 1 + logger.info(f"[DEBUG] Found expected value '{each_expected_value}' in result list for key '{key}'") + break + elif isinstance(result.get(key), str) and each_expected_value == result.get(key): + flag = 1 + logger.info(f"[DEBUG] Found expected value '{each_expected_value}' matches result string for key '{key}'") + break + if flag == 0: + logger.info(f"[DEBUG] No expected values found in result for key '{key}', returning 0.0") + return 0. + elif isinstance(expected_json.get(key), str): + expected_str = expected_json.get(key) + actual_str = result.get(key) + logger.info(f"[DEBUG] Checking string key '{key}': expected='{expected_str}', actual='{actual_str}'") + if expected_str not in actual_str: + logger.info(f"[DEBUG] Expected string '{expected_str}' not found in actual string '{actual_str}' for key '{key}', returning 0.0") + return 0. + else: + logger.debug("check_direct_json_object: expected value type not supported") + return 0. + logger.info("[DEBUG] All expect_in_result comparisons passed, returning 1.0") + return 1.0 + except Exception as e: + logger.debug(f"check_direct_json_object: result is not a valid json object, error: {e}") + return 0. 


def compare_time_in_speedtest_results(speedtest_result_path, time_diff):
    """Check that the timestamp on the second line of a speedtest CSV is
    within ``time_diff`` minutes of the current wall-clock time.

    The second CSV column is expected to end with an ``HH:MM`` time (e.g.
    ``2024-01-01 12:34``); only the last five characters are compared.

    Returns 1 on success, 0 on failure or an unreadable/short file.
    """
    if not speedtest_result_path:
        return 0

    try:
        second_line_date = None
        with open(speedtest_result_path, 'r') as f:
            for i, line in enumerate(f):
                if i == 1:
                    second_line_date = line.split(',')[1]
                    break
        if second_line_date is None:
            # Fewer than two lines: nothing to compare.  (The original code
            # relied on a NameError being swallowed by a bare `except:` here.)
            return 0

        now_date_time = datetime.datetime.now().strftime('%H:%M')
        date_time = second_line_date[-5:]
        # NOTE: both values are parsed on the same dummy date, so a result
        # recorded just before midnight and checked just after will not match.
        minutes_apart = abs(
            (datetime.datetime.strptime(date_time, '%H:%M')
             - datetime.datetime.strptime(now_date_time, '%H:%M')).total_seconds()
        ) / 60
        return 1 if minutes_apart < int(time_diff) else 0
    except Exception:
        # was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt propagate
        logger.debug("compare_time_in_speedtest_results: file not found or not readable")
        return 0


def is_included_all_json_objects(gold_file_path, result_file_path):
    """Return 1 if every key/value pair of the gold JSON file is present with
    an equal value in the result JSON file, else 0."""
    if not gold_file_path or not result_file_path:
        return 0

    print("gold_file_path: ")
    print(gold_file_path)
    print("result_file_path: ")
    print(result_file_path)
    try:
        with open(gold_file_path, 'r') as f:
            gold_json = json.load(f)
        with open(result_file_path, 'r') as fr:
            result_json = json.load(fr)
    except Exception:
        # robustness fix: a missing file or invalid JSON used to crash the evaluator
        logger.debug("is_included_all_json_objects: file not found or invalid JSON")
        return 0
    # every gold key-value pair must be included in the result
    for key in gold_json.keys():
        if key not in result_json.keys() or gold_json[key] != result_json[key]:
            return 0
    return 1


def is_gold_text_included_in_pdf(pdf_file_path, gold_text_path):
    """Return 1 if every value of the gold JSON appears verbatim in the text
    extracted from the PDF, else 0 (the missing keys are printed)."""
    if not gold_text_path or not pdf_file_path:
        return 0

    print("gold_text_path: ")
    print(gold_text_path)
    print("pdf_file_path: ")
    print(pdf_file_path)
    # gold file is a json file; every value must be contained in the pdf text
    with open(gold_text_path, 'r') as f:
        gold_json = json.load(f)
    with pdfplumber.open(pdf_file_path) as pdf:
        text = ''
        for page in pdf.pages:
            text += page.extract_text()
    false_list = [key for key in gold_json.keys() if gold_json[key] not in text]
    if false_list:
        print("false_list: ")
        print(false_list)
        return 0
    return 1


def file_contains(file_path, config):
    """Return 1.0 if every string in ``config["expected"]`` occurs in the text
    file at ``file_path``, else 0.0 (also 0.0 for a missing/unreadable file)."""
    # file_path ends with .txt
    if not file_path:
        return 0.
    try:
        with open(file_path, 'r') as f:
            file_text = f.read()
        for text in config["expected"]:
            if text not in file_text:
                logger.debug(f"file_contains: {text} not found in {file_path}")
                return 0.
    except Exception:
        # was a bare `except:`
        logger.debug("file_contains: file not found or not readable")
        return 0.
    return 1.


def check_line_number(file_path, line_number):
    """Return 1 if the number of lines containing an ``HH:MM:SS`` timestamp
    equals ``line_number["expected"]``, else 0 (0. for a missing file)."""
    if file_path is None or not os.path.isfile(file_path):
        return 0.
    # raw string: byte-for-byte the same pattern as the old escaped literal,
    # compiled once instead of per line
    time_regex = re.compile(r"([01]\d|2[0-3]):[0-5]\d:([0-5]\d|60)")
    try:
        with open(file_path, 'r') as f:
            line_count = sum(1 for line in f if time_regex.search(line))
        return 1 if line_count == int(line_number["expected"]) else 0
    except Exception:
        # was a bare `except:`
        logger.debug("check_line_number: file not found or not readable")
        return 0.


def compare_terminal_and_txt(txt_file_path, terminal_output):
    """Return 1 iff the captured terminal output equals the file contents exactly."""
    if not txt_file_path or not terminal_output:
        return 0

    # read txt file content
    with open(txt_file_path, 'r') as f:
        txt_file_content = f.read()
    # compare terminal output with txt file content
    return 1 if terminal_output == txt_file_content else 0


def compare_python_pure_text(py_file_path, gold_file_path):
    """Return 1.0 iff the two Python files are identical after minimal
    normalization (shebang/coding lines skipped, tabs expanded, blank lines
    dropped), else 0.0."""
    if not py_file_path or not gold_file_path:
        return 0.0

    def _normalize(text):
        """
        Minimal normalization - only handle basic formatting:
        - Skip obvious file metadata (encoding, shebang) at the beginning
        - Normalize whitespace and indentation
        - Remove empty lines

        This preserves any content that shouldn't be there (like markdown)
        so it can be detected as an error.
        """
        lines = text.splitlines()
        result_lines = []
        i = 0

        # Only skip obvious metadata at the very beginning
        while i < len(lines) and i < 3:  # Check only first 3 lines
            stripped = lines[i].strip()

            if (stripped.startswith('#!') or
                    stripped.startswith('# -*- coding:') or
                    stripped.startswith('# coding:') or
                    stripped.startswith('# coding=')):
                i += 1
                continue

            break

        # Process all remaining lines with minimal filtering
        while i < len(lines):
            line = lines[i]
            stripped = line.strip()

            if stripped:  # Keep all non-empty lines
                normalized = line.expandtabs(4).rstrip()
                result_lines.append(normalized)

            i += 1

        return '\n'.join(result_lines)

    try:
        with open(py_file_path, 'r', encoding='utf-8') as file1:
            user_content = file1.read()
        with open(gold_file_path, 'r', encoding='utf-8') as file2:
            gold_content = file2.read()

        # Apply different normalization strategies
        user_normalized = _normalize(user_content)
        gold_normalized = _normalize(gold_content)

        if user_normalized == gold_normalized:
            return 1.0
        else:
            return 0.0

    except (FileNotFoundError, IOError, UnicodeDecodeError) as e:
        logger.debug(f"compare_python_pure_text: Error reading files - {e}")
        return 0.0
    except Exception as e:
        logger.debug(f"compare_python_pure_text: Unexpected error - {e}")
        return 0.0
diff --git a/openhands/nvidia/os_world/metrics/gimp.py b/openhands/nvidia/os_world/metrics/gimp.py
new file mode 100644
index 000000000..993a7c606
--- /dev/null
+++ b/openhands/nvidia/os_world/metrics/gimp.py
@@ -0,0 +1,816 @@
import os
from typing import List, Union
from skimage.metrics import structural_similarity as ssim
from PIL import Image, ImageChops, ImageStat

from openhands.core.logger import openhands_logger as logger


def compare_image_list(pred_img_path_list: Union[str, List[str]],
                       gold_img_path_list: Union[str, List[str]]) -> float:
    """ Compare two image lists, only if all images are the same, return 1.0, otherwise return 0.0
    """
    if type(pred_img_path_list) != list:
        pred_img_path_list = [pred_img_path_list]
        gold_img_path_list = [gold_img_path_list]
    for pred_img_path, gold_img_path in zip(pred_img_path_list, gold_img_path_list):
        if not pred_img_path or not gold_img_path:
            return 0.0
        pred_img = Image.open(pred_img_path)
        gold_img = Image.open(gold_img_path)

        # Check if images have different sizes and resize if necessary
        if pred_img.size != gold_img.size:
            logger.debug(f"Images have different sizes: {pred_img.size} vs {gold_img.size}, resizing predicted image to match gold image")
            pred_img = pred_img.resize(gold_img.size, Image.Resampling.LANCZOS)

        # Ensure both images are in the same mode for comparison
        if pred_img.mode != gold_img.mode:
            pred_img = pred_img.convert(gold_img.mode)

        # any differing pixel makes the whole comparison fail
        diff = ImageChops.difference(pred_img, gold_img)
        if diff.getbbox():
            return 0.0
    return 1.0


def get_gimp_export_path():
    # Path to GIMP's configuration file. This example assumes GIMP version 2.10.
    # You need to adjust the path according to the GIMP version and user's file system.
    gimp_config_file = os.path.expanduser("~/.config/GIMP/2.10/gimprc")

    try:
        # Open and read the configuration file
        with open(gimp_config_file, 'r') as file:
            for line in file:
                # Search for the default export path setting
                if "default-export-path" in line:
                    # Extract the current path from the line (assuming it's enclosed in quotes)
                    current_path = line.split('"')[1]
                    # Compare the current path with the expected path
                    return current_path
    except FileNotFoundError:
        # Handle the case where the configuration file is not found
        logger.debug("GIMP configuration file not found")
        return False
    # NOTE(review): implicitly returns None when the setting line is absent but
    # False when the file is missing — callers must treat both as "not found".


def check_file_exists(directory, filename):
    # 1 iff `directory/filename` exists and is a regular file
    file_path = os.path.join(directory, filename)
    return 1 if os.path.isfile(file_path) else 0


def increase_saturation(image1_path: str, image2_path: str) -> float:
    # 1 iff image2 is (strictly) more saturated on average than image1
    def calculate_saturation(image):
        # convert the image to HSV mode
        hsv_image = image.convert("HSV")

        saturation_channel = hsv_image.split()[1]

        # calculate the mean saturation level
        stat = ImageStat.Stat(saturation_channel)
        mean_saturation = stat.mean[0]

        return mean_saturation

    image1 = Image.open(image1_path)
    image2 = Image.open(image2_path)

    # calculate the saturation level of each image
    saturation1 = calculate_saturation(image1)
    saturation2 = calculate_saturation(image2)

    return 1 if saturation1 < saturation2 else 0


def decrease_brightness(image1_path: str, image2_path: str) -> float:
    # 1 iff image2 is (strictly) darker on average than image1
    def calculate_brightness(image):
        # Convert the image to grayscale mode
        grayscale_image = image.convert("L")

        # Get the image data
        pixels = list(grayscale_image.getdata())

        brightness = sum(pixels) / len(pixels)
        return brightness

    image1 = Image.open(image1_path)
    image2 = Image.open(image2_path)

    brightness1 = calculate_brightness(image1)
    brightness2 = calculate_brightness(image2)

    return 1 if brightness1 > brightness2 else 0


import cv2
import numpy as np


def find_yellow_triangle(image):
    # Convert the image to RGBA
    rgba = cv2.cvtColor(image, cv2.COLOR_BGR2RGBA)

    # define range of yellow color in HSV
    # NOTE(review): [0,0,0]..[255,255,255] spans the entire value range, so the
    # mask matches every pixel — looks like a placeholder; confirm intent.
    lower_yellow = np.array([0, 0, 0], dtype=np.uint8)
    upper_yellow = np.array([255, 255, 255], dtype=np.uint8)

    # expand the dimensions of lower and upper yellow to match the image dimensions
    lower_yellow = np.reshape(lower_yellow, (1, 1, 3))
    upper_yellow = np.reshape(upper_yellow, (1, 1, 3))
    # build a mask for the yellow color
    mask = cv2.inRange(rgba, lower_yellow, upper_yellow)

    # search for contours in the mask
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # choose the largest contour
    max_contour = max(contours, key=cv2.contourArea)

    # calculate the center of the contour
    M = cv2.moments(max_contour)
    cx = int(M['m10'] / M['m00'])
    cy = int(M['m01'] / M['m00'])

    return cx, cy


def compare_triangle_positions(image1, image2):
    # 1 iff the triangle in image2 is closer to the image centre than in image1
    image1 = cv2.imread(image1, cv2.IMREAD_COLOR)
    image2 = cv2.imread(image2, cv2.IMREAD_COLOR)
    # find the center of the yellow triangle in each image
    cx1, cy1 = find_yellow_triangle(image1)
    cx2, cy2 = find_yellow_triangle(image2)

    # calculate the distance between the center of the triangle and the center of the image
    center_distance1 = np.sqrt(
        (cx1 - image1.shape[1] // 2) ** 2 + (cy1 - image1.shape[0] // 2) ** 2)
    center_distance2 = np.sqrt(
        (cx2 - image2.shape[1] // 2) ** 2 + (cy2 - image2.shape[0] // 2) ** 2)

    return 1 if center_distance1 > center_distance2 else 0


# Functions for the GIMP evaluator
def calculate_brightness(image):
    """Calculate the average brightness of an image"""
    grayscale = image.convert('L')
    stat = ImageStat.Stat(grayscale)
    return stat.mean[0]


def normalize_brightness(image, target_brightness):
    """Normalize the brightness of an image to a target brightness in [0, 1]"""
    # NOTE(review): divides by the current mean brightness — an all-black image
    # (mean 0) would raise ZeroDivisionError; confirm inputs cannot be black.
    current_brightness = calculate_brightness(image)
    factor = target_brightness / current_brightness

    # Apply a point transform to each pixel
    def point_transform(x):
        return min(255, max(0, int(x * factor)))

    return image.point(point_transform)


def measure_saturation(hsv_image):
    """Measure the average saturation of an image"""
    # Split into H, S, V channels
    _, s, _ = hsv_image.split()
    # Convert the saturation channel to a numpy array
    s_array = np.array(s)
    # Calculate the average saturation
    avg_saturation = np.mean(s_array)
    return avg_saturation


def calculate_contrast(image):
    """Calculate the contrast of an image as the standard deviation of the pixel
    values."""
    pixels = np.asarray(image, dtype=np.float32)
    return np.std(pixels)


def calculate_image_sharpness(image_path):
    # Laplacian variance: higher variance == sharper image
    # Load the image in grayscale
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # Apply the Laplacian operator
    laplacian = cv2.Laplacian(image, cv2.CV_64F)
    # Calculate the variance
    variance = np.var(laplacian)
    return variance


def structure_check_by_mse(img1, img2, threshold=0.03):
    """Check if two images are approximately the same by MSE"""

    # Ensure both images are PIL Image objects
    if not hasattr(img1, 'size') or not hasattr(img2, 'size'):
        # Convert numpy arrays to PIL Images if needed
        if hasattr(img1, 'shape'):
            img1 = Image.fromarray(img1)
        if hasattr(img2, 'shape'):
            img2 = Image.fromarray(img2)

    # Check if images have different sizes and resize if necessary
    if img1.size != img2.size:
        logger.debug(f"Images have different sizes: {img1.size} vs {img2.size}, resizing first image to match second")
        img1 = img1.resize(img2.size, Image.Resampling.LANCZOS)

    # Ensure both images are in RGB mode for consistent comparison
    if img1.mode != 'RGB':
        img1 = img1.convert('RGB')
    if img2.mode != 'RGB':
        img2 = img2.convert('RGB')

    # Now calculate MSE with properly sized images (pixel values scaled to [0, 1])
    mse = np.mean(
        (np.array(img1, dtype=np.float32) / 255
         - np.array(img2, dtype=np.float32) / 255) ** 2)
    structure_same = True if mse < threshold else False
    logger.debug(f"MSE: {mse}, threshold: {threshold}")
    return structure_same


def structure_check_by_ssim(img1, img2, threshold=0.9):
    """Check if two images are approximately the same by SSIM"""
    # SSIM needs at least a 7x7 window
    min_size = 7
    if img1.width < min_size or img1.height < min_size or \
            img2.width < min_size or img2.height < min_size:
        logger.warning(f"image too small for ssim: {img1.size} vs {img2.size}")
        return False

    if img1.mode != 'RGB':
        img1 = img1.convert('RGB')
    if img2.mode != 'RGB':
        img2 = img2.convert('RGB')

    # Now both images are in RGB mode, so they should have the same number of channels (3)
    # But we still need to check the size (though the caller should have checked)
    if img1.size != img2.size:
        # If the sizes are different, we cannot compare, return False
        logger.debug(f"Images have different sizes: {img1.size} vs {img2.size}")
        return False

    array1 = np.array(img1)
    array2 = np.array(img2)
    # They should have the same shape now, but double check
    if array1.shape != array2.shape:
        logger.debug(f"Images have different shapes after conversion: {array1.shape} vs {array2.shape}")
        return False

    # Determine the window size for SSIM
    min_dim = min(array1.shape[0], array1.shape[1])
    if min_dim < 7:
        # If the smallest dimension is less than 7, set win_size to the next smaller odd number
        win_size = min_dim if min_dim % 2 == 1 else min_dim - 1
        if win_size < 1:
            logger.debug("Image too small for SSIM computation (min dimension < 1)")
            return False
    else:
        win_size = 7  # default

    try:
        # For newer versions of skimage, we use channel_axis, for older versions, multichannel
        # We try to use the newer way first, then fall back to the old way
        try:
            # Newer versions (channel_axis is available)
            similarity = ssim(array1, array2, win_size=win_size, channel_axis=2)
        except TypeError:
            # Older versions use multichannel
            similarity = ssim(array1, array2, win_size=win_size, multichannel=True)
    except Exception as e:
        logger.error(f"SSIM computation failed: {e}")
        return False

    logger.debug("SSIM: %s", similarity)
    return similarity >= threshold


def check_brightness_decrease_and_structure_sim(src_path, tgt_path, threshold=0.03):
    """
    Check the brightness of src is lower than tgt and the structures are similar
    gimp:7a4deb26-d57d-4ea9-9a73-630f66a7b568
    """
    if src_path is None or tgt_path is None:
        return 0.

    img_src = Image.open(src_path)
    img_tgt = Image.open(tgt_path)

    # Brightness comparison: src (the edited result) must be darker than tgt (the original)
    brightness_src = calculate_brightness(img_src)
    brightness_tgt = calculate_brightness(img_tgt)
    brightness_reduced = brightness_tgt > brightness_src

    # print(f"Brightness src: {brightness_src}, tgt: {brightness_tgt}, reduced: {brightness_reduced}")

    # Normalize and compare images
    target_brightness = 128
    img_src_normalized = normalize_brightness(img_src, target_brightness)
    img_tgt_normalized = normalize_brightness(img_tgt, target_brightness)

    structure_same = structure_check_by_mse(img_src_normalized, img_tgt_normalized, threshold=threshold)
    if brightness_reduced and structure_same:
        return 1.
    else:
        return 0.


def check_saturation_increase_and_structure_sim(src_path, tgt_path):
    """
    Check the saturation of src is higher than tgt and the structures are similar
    gimp:554785e9-4523-4e7a-b8e1-8016f565f56a
    """
    if src_path is None or tgt_path is None:
        return 0.

    img_src = Image.open(src_path)
    hsv_img_src = img_src.convert('HSV')
    img_tgt = Image.open(tgt_path)
    hsv_img_tgt = img_tgt.convert('HSV')

    # Saturation comparison: src (the edited result) must exceed tgt (the original)
    src_saturation = measure_saturation(hsv_img_src)
    tgt_saturation = measure_saturation(hsv_img_tgt)

    saturation_increased = tgt_saturation < src_saturation

    # Structure comparison on the hue and value channels only (saturation changed on purpose)
    h1, s1, v1 = hsv_img_src.split()
    h2, s2, v2 = hsv_img_tgt.split()
    h_same = structure_check_by_ssim(h1, h2)
    v_same = structure_check_by_ssim(v1, v2)
    if h_same and v_same:
        structure_same = True
    else:
        structure_same = False

    if saturation_increased and structure_same:
        return 1.
    else:
        return 0.


def check_file_exists_and_structure_sim(src_path, tgt_path):
    """
    Check if the image has been exported to the desktop
    gimp:77b8ab4d-994f-43ac-8930-8ca087d7c4b4
    """
    if src_path is None or tgt_path is None:
        return 0.

    # Check if the file exists
    export_file_exists = os.path.isfile(src_path)
    if not export_file_exists:
        return 0.

    # Check whether the target image is the same as the source image
    img_src = Image.open(src_path)
    img_tgt = Image.open(tgt_path)
    structure_same = structure_check_by_ssim(img_src, img_tgt)

    if structure_same:
        return 1.
    else:
        return 0.


def check_triangle_position(tgt_path):
    """
    Check if the triangle is in the middle of the image.
    gimp:f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce
    """
    if tgt_path is None:
        return 0.

    # Load the image
    img = Image.open(tgt_path)
    img_array = np.array(img)

    # We assume the triangle is a different color from the background
    # Find the unique colors
    unique_colors, counts = np.unique(img_array.reshape(-1, img_array.shape[2]), axis=0,
                                      return_counts=True)
    unique_colors_sorted = unique_colors[np.argsort(counts)]

    # Assuming the background is the most common color and the triangle is a different color
    # NOTE(review): index 1 of the count-ascending sort is the SECOND RAREST
    # color, not the second most common — confirm on real task images.
    triangle_color = unique_colors_sorted[1]

    # Create a mask where the triangle pixels are True
    triangle_mask = np.all(img_array == triangle_color, axis=2)

    # Get the coordinates of the triangle pixels
    triangle_coords = np.argwhere(triangle_mask)

    # Calculate the centroid of the triangle
    centroid = triangle_coords.mean(axis=0)

    # Check if the centroid is approximately in the middle of the image
    image_center = np.array(img_array.shape[:2]) / 2

    # We will consider the triangle to be in the middle if the centroid is within 5% of the image's center
    tolerance = 0.05 * np.array(img_array.shape[:2])
    middle = np.all(np.abs(centroid - image_center) < tolerance)

    if bool(middle):
        return 1.
    else:
        return 0.


def check_structure_sim(src_path, tgt_path):
    """
    Check if the structure of the two images are similar
    gimp:2a729ded-3296-423d-aec4-7dd55ed5fbb3
    """
    if src_path is None or tgt_path is None:
        return 0.

    try:
        img_src = Image.open(src_path)
        img_tgt = Image.open(tgt_path)

        if img_src.size != img_tgt.size:
            logger.debug(f"size different: src_path: {src_path}, tgt_path: {tgt_path}")
            return 0.0

        structure_same = structure_check_by_ssim(img_src, img_tgt)
        return 1.0 if structure_same else 0.0

    except Exception as e:
        logger.error(f"check_structure_sim error: {str(e)}")
        return 0.0


def check_structure_sim_resized(src_path, tgt_path):
    """
    Check if the structure of the two images are similar after resizing.
    gimp:d16c99dc-2a1e-46f2-b350-d97c86c85c15
    """
    if src_path is None or tgt_path is None:
        return 0.

    img_src = Image.open(src_path)
    img_tgt = Image.open(tgt_path)

    # Check if source image has transparency and extract content area
    if img_src.mode in ('RGBA', 'LA') or 'transparency' in img_src.info:
        if img_src.mode != 'RGBA':
            img_src = img_src.convert('RGBA')

        # Get alpha channel and find bounding box of non-transparent pixels
        alpha = img_src.split()[-1]
        bbox = alpha.getbbox()

        if bbox is None:
            # Image is completely transparent
            logger.debug("Source image is completely transparent")
            return 0.

        # Crop to content area only
        img_src_content = img_src.crop(bbox)
        logger.debug(f"Source image cropped from {img_src.size} to {img_src_content.size}")

        # Convert to RGB for comparison
        img_src_content = img_src_content.convert('RGB')
        img_src_resized = img_src_content.resize(img_tgt.size)
    else:
        # No transparency, resize normally
        img_src_resized = img_src.resize(img_tgt.size)

    # Ensure target image is RGB for comparison
    if img_tgt.mode != 'RGB':
        img_tgt = img_tgt.convert('RGB')

    # Check if the structure is similar
    structure_same = structure_check_by_ssim(img_src_resized, img_tgt)
    if structure_same:
        return 1.
    else:
        return 0.


def check_contrast_increase_and_structure_sim(src_path, tgt_path):
    """
    Check if the src image has higher contrast than the tgt image and the structures are similar
    gimp:f723c744-e62c-4ae6-98d1-750d3cd7d79d
    """
    if src_path is None or tgt_path is None:
        return 0.

    # Load images
    source_image = Image.open(src_path)
    target_image = Image.open(tgt_path)

    # Calculate contrast: src (the edited result) must exceed tgt (the original)
    source_contrast = calculate_contrast(source_image)
    target_contrast = calculate_contrast(target_image)
    higher_contrast = target_contrast < source_contrast

    # Check structure
    structure_same = structure_check_by_ssim(source_image, target_image, threshold=0.65)

    if higher_contrast and structure_same:
        return 1.
    else:
        return 0.


def check_config_status(actual_config_path, rule):
    """
    Check if the GIMP status is as expected
    """
    if actual_config_path is None:
        return 0.

    with open(actual_config_path, 'r') as f:
        content = f.readlines()

    # gimprc lines look like "(key value)"; strip the parens and compare
    for line in content:
        if line.startswith('#') or line == '\n':
            continue
        items = line.strip().lstrip('(').rstrip(')\n').split()
        if isinstance(rule["key"], str):
            if items[0] == rule["key"] and items[-1] == rule["value"]:
                return 1.
        elif isinstance(rule["key"], list) and len(rule["key"]) == 2:
            if items[0] == rule["key"][0] \
                    and items[1] == rule["key"][1] \
                    and items[-1] == rule["value"]:
                return 1.
    return 0.


def check_image_size(src_path, rule):
    """
    Check if the size of the src image is correct
    multi-apps:42f4d1c7-4521-4161-b646-0a8934e36081
    """
    if src_path is None:
        return 0.

    # Load the image
    img = Image.open(src_path)

    # Check if we should ignore transparent parts
    ignore_transparent = rule.get("ignore_transparent", False)

    # NOTE(review): `and` binds tighter than `or`, so this branch is taken for
    # ANY image with a "transparency" info key even when ignore_transparent is
    # False — confirm whether the condition should be parenthesized.
    if ignore_transparent and img.mode in ('RGBA', 'LA') or 'transparency' in img.info:
        # Calculate bounding box of non-transparent pixels
        if img.mode != 'RGBA':
            img = img.convert('RGBA')

        # Get alpha channel
        alpha = img.split()[-1]

        # Find bounding box of non-transparent pixels
        bbox = alpha.getbbox()

        if bbox is None:
            # Image is completely transparent
            actual_width = 0
            actual_height = 0
        else:
            # Calculate actual content size
            actual_width = bbox[2] - bbox[0]
            actual_height = bbox[3] - bbox[1]

        logger.debug(f"Original size: {img.size}, Content size: {actual_width}x{actual_height}")
    else:
        # Use original image size
        actual_width = img.size[0]
        actual_height = img.size[1]
        logger.debug(f"Image size: {img.size}")

    # Check the size
    if rule.get("height", None) is not None:
        height_same = actual_height == rule["height"]
    else:
        height_same = True
    if rule.get("width", None) is not None:
        width_same = actual_width == rule["width"]
    else:
        width_same = True

    if height_same and width_same:
        logger.debug(f"height_same: {height_same}, width_same: {width_same}")
        return 1.
    else:
        logger.debug(f"height_same: {height_same}, width_same: {width_same}")
        return 0.

def safe_open_image_with_retry(file_path, max_retries=3, retry_delay=0.5):
    """
    Safely open an image file with retry mechanism for handling truncated files.

    Args:
        file_path: Path to the image file (may be None or missing).
        max_retries: Number of open attempts before giving up.
        retry_delay: Seconds to sleep between attempts.

    Returns:
        A fully loaded ``PIL.Image.Image``, or ``None`` when the file is
        missing, empty, or still unreadable after all retries.
    """
    import os
    import time
    # BUGFIX: the original did ``import logger`` — there is no such stdlib
    # module; use the standard ``logging`` module instead.
    import logging

    logger = logging.getLogger(__name__)

    if not file_path or not os.path.exists(file_path):
        logger.error(f"File does not exist: {file_path}")
        return None

    for attempt in range(max_retries):
        try:
            # An empty file usually means a writer has not flushed yet; retry.
            file_size = os.path.getsize(file_path)
            if file_size == 0:
                logger.warning(f"File is empty: {file_path}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                    continue
                return None

            logger.info(f"Opening image: {file_path} (size: {file_size} bytes, attempt: {attempt + 1})")

            # Try to open with PIL
            image = Image.open(file_path)

            # load() forces full decoding, so truncation surfaces here rather
            # than later when pixels are first accessed.
            image.load()

            logger.info(f"Successfully opened image: {image.format} {image.mode} {image.size}")
            return image

        except (OSError, IOError) as e:
            # Truncated/unidentifiable files may still be mid-write: retry
            # those; any other I/O error is treated as permanent.
            if "truncated" in str(e).lower() or "cannot identify" in str(e).lower():
                logger.warning(f"Attempt {attempt + 1}: Image file appears truncated or corrupted: {e}")
                if attempt < max_retries - 1:
                    logger.info(f"Retrying in {retry_delay} seconds...")
                    time.sleep(retry_delay)
                    continue
            else:
                logger.error(f"IO error opening image: {e}")
                break
        except Exception as e:
            logger.error(f"Unexpected error opening image: {e}")
            break

    logger.error(f"Failed to open image after {max_retries} attempts: {file_path}")
    return None


def check_palette_and_structure_sim(src_path, tgt_path):
    """
    Check if the src image is palette-based and the structure of the two images are similar.
    Enhanced with robust error handling for file format issues and truncated files.
    gimp:06ca5602-62ca-47f6-ad4f-da151cde54cc

    Returns 1.0 only when the source is in palette mode ('P') AND the SSIM
    structure check (external helper ``structure_check_by_ssim``) passes.
    """
    # BUGFIX: ``import logger`` -> standard ``logging`` module.
    import logging
    logger = logging.getLogger(__name__)

    logger.info(f"Evaluating palette and structure similarity: src={src_path}, tgt={tgt_path}")

    if src_path is None or tgt_path is None:
        logger.warning("Source or target path is None")
        return 0.

    # Safely open source image with retry mechanism
    source_image = safe_open_image_with_retry(src_path)
    if source_image is None:
        logger.error("Failed to open source image")
        return 0.

    try:
        # Check if the source image is palette-based
        palette_based = source_image.mode == 'P'
        logger.info(f"Source image mode: {source_image.mode}, palette-based: {palette_based}")

        # Safely open target image
        target_image = safe_open_image_with_retry(tgt_path)
        if target_image is None:
            logger.error("Failed to open target image")
            # NOTE: no explicit source_image.close() here — the outer
            # ``finally`` below already closes it (the original closed twice).
            return 0.

        try:
            # Convert source image to RGB for comparison
            source_rgb = source_image.convert('RGB')
            logger.info(f"Source converted to RGB: {source_rgb.mode} {source_rgb.size}")

            # Check structure
            structure_same = structure_check_by_ssim(source_rgb, target_image)
            logger.info(f"Structure similarity check: {structure_same}")

            # Evaluation logic
            if palette_based and structure_same:
                result = 1.0
            else:
                result = 0.0

            logger.info(f"Evaluation result: {result} (palette_based={palette_based}, structure_same={structure_same})")
            return result

        finally:
            target_image.close()

    except Exception as e:
        logger.error(f"Error during evaluation: {e}")
        return 0.
    finally:
        source_image.close()


def check_textbox_on_leftside(src_path):
    """
    Check if the textbox is on the left side of the image.
    gimp:e2dd0213-26db-4349-abe5-d5667bfd725c

    Scans each row for its first "dark" pixel (grayscale value < 128) and
    passes when the left-most such pixel lies within the left 5% of the image.
    """
    if src_path is None:
        return 0.

    source_image = Image.open(src_path)
    gray_image = source_image.convert("L")
    width, height = source_image.size

    # Find the bounds of the black text
    left_most_dark_pixel = width  # Start with the farthest possible left position
    for y in range(height):
        for x in range(width):
            # If the pixel is dark, consider it as part of the text
            if gray_image.getpixel((x, y)) < 128:  # Arbitrary threshold for "dark"
                left_most_dark_pixel = min(left_most_dark_pixel, x)
                break  # Stop after finding the first dark pixel in this row

    # Here we define "almost" on the left side as being within the left 5% of the image
    if left_most_dark_pixel < width * 0.05:
        return 1.
    else:
        return 0.


def check_image_mirror(src_path, tgt_path):
    """
    Check if the image is mirrored.
    gimp:72f83cdc-bf76-4531-9a1b-eb893a13f8aa

    Flips the source horizontally and compares to the target via SSIM
    (external helper ``structure_check_by_ssim``).
    """
    if src_path is None or tgt_path is None:
        return 0.

    # Load images
    source_image = Image.open(src_path)
    target_image = Image.open(tgt_path)

    # Check if the image is mirrored
    transposed_image = source_image.transpose(Image.FLIP_LEFT_RIGHT)
    # Use 0.99 because the image may not be exactly mirrored by gimp
    mirrored = structure_check_by_ssim(transposed_image, target_image, 0.99)
    if mirrored:
        return 1.
    else:
        return 0.


def check_green_background(src_path, tgt_path):
    """
    Check if the background of the source image is green.
    gimp:734d6579-c07d-47a8-9ae2-13339795476b
    """
    if src_path is None or tgt_path is None:
        return 0.

    # Load images
    source_image = Image.open(src_path)
    target_image = Image.open(tgt_path)

    source_pixels = np.array(source_image)
    target_pixels = np.array(target_image)

    # NOTE(review): numpy arrays from PIL are indexed [row, col] == [y, x],
    # but this loop indexes [x, y] while ranging x over width and y over
    # height — for non-square images this raises IndexError, and otherwise
    # inspects the transposed pixel. Confirm intent before relying on it.
    for x in range(target_image.width):
        for y in range(target_image.height):
            # Identify background pixel in target image (not black)
            if tuple(target_pixels[x, y][:3]) != (0, 0, 0):
                # Check if corresponding pixel in source image is green
                # Here, "green" means more green than red or blue
                r, g, b = source_pixels[x, y][:3]
                if not (g > r and g > b):
                    return 0.

    return 1.


def check_sharper(src_path, tgt_path):
    """
    Check if the source image is sharper than the target image.
    multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108
    """
    # Sharpness is computed by an external helper defined elsewhere in this
    # module (presumably Laplacian-variance based — confirm there).
    sharpness_src = calculate_image_sharpness(src_path)
    sharpness_tgt = calculate_image_sharpness(tgt_path)
    return 1.0 if sharpness_src > sharpness_tgt else 0.0


def check_image_file_size(src_path, rule):
    """
    Check if the size of the src image within 500KB
    """
    # The actual limit comes from rule["max_size"] (bytes), not a hard-coded
    # 500KB; the docstring reflects the task this was written for.
    if src_path is None:
        return 0.0

    # Check the size
    file_size = os.path.getsize(src_path)
    if file_size < rule["max_size"]:
        return 1.0
    else:
        return 0.0
diff --git a/openhands/nvidia/os_world/metrics/libreoffice.py b/openhands/nvidia/os_world/metrics/libreoffice.py
new file mode 100644
index 000000000..1870c345b
--- /dev/null
+++ b/openhands/nvidia/os_world/metrics/libreoffice.py
@@ -0,0 +1,28 @@
import fnmatch
from typing import Dict, List

import lxml.cssselect
import lxml.etree
from lxml.etree import _Element as Element

# LibreOffice registrymodifications.xcu uses the OpenOffice registry
# namespace; these CSS selectors locate the UI-locale settings in that tree.
_libconf_namespaces = [("oor", "http://openoffice.org/2001/registry")]
_libconf_ns_mapping = dict(_libconf_namespaces)
_setup_locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooSetupSystemLocale]>value',
                                                    namespaces=_libconf_ns_mapping)
_locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooLocale]>value',
                                              namespaces=_libconf_ns_mapping)


def check_libre_locale(config_file: str, rules: Dict[str, List[str]]) -> float:
    """Return 1.0 when the configured LibreOffice locale matches any fnmatch pattern in rules["locale_set"]."""
    config: Element = lxml.etree.parse(config_file).getroot()
    setup_locale_setting: List[Element] = _setup_locale_selector(config)
    locale_setting: List[Element] = _locale_selector(config)

    # Prefer ooSetupSystemLocale; fall back to ooLocale when absent.
    # NOTE(review): raises IndexError when neither setting exists in the
    # config file — confirm that is acceptable for malformed configs.
    setup_locale_setting: str = setup_locale_setting[0].text \
        if len(setup_locale_setting) > 0 \
        else locale_setting[0].text

    return float(any(fnmatch.fnmatchcase(setup_locale_setting, ptn) \
                     for ptn in rules["locale_set"]
                     )
                 )
diff --git a/openhands/nvidia/os_world/metrics/others.py b/openhands/nvidia/os_world/metrics/others.py
new file mode 100644
index 000000000..ab4d1b194
--- /dev/null
+++ b/openhands/nvidia/os_world/metrics/others.py
@@ -0,0 +1,103 @@
import os
import os.path
import zipfile
from typing import List, Dict
from typing import Union, TypeVar

import lxml.html
from lxml.html import HtmlElement
from mutagen.easyid3 import EasyID3

from .general import diff_text_file
from .utils import _match_value_to_rule

from openhands.core.logger import openhands_logger as logger


def process_epub(filename: str) -> List[str]:
    """Extract comparison-relevant members of an epub into <filename>.dir.

    Strips volatile content (navPoint lines in toc.ncx, dc:identifier in
    content.opf, whitespace in .html members) so two epubs can be diffed
    textually. Returns the sorted list of extracted file paths, or [] when
    the archive is not a valid zip.
    """
    file_list: List[str] = []

    base_dir: str = filename + ".dir"
    os.makedirs(base_dir, exist_ok=True)

    try:
        with zipfile.ZipFile(filename, "r") as z_f:
            # Get list of all files in the zip archive
            zip_file_list = z_f.namelist()

            # Process toc.ncx if it exists
            if "toc.ncx" in zip_file_list:
                with z_f.open("toc.ncx") as in_f \
                        , open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
                    contents: str = in_f.read().decode()
                    contents = contents.splitlines()
                    for l in contents:
                        # navPoint entries carry volatile ids; drop them
                        if "navPoint" not in l:
                            out_f.write(l + "\n")
                file_list.append(os.path.join(base_dir, "toc.ncx"))
            else:
                logger.debug("toc.ncx not found in epub file: %s", filename)

            # Process content.opf if it exists
            if "content.opf" in zip_file_list:
                with z_f.open("content.opf") as in_f \
                        , open(os.path.join(base_dir, "content.opf"), "w") as out_f:
                    contents: str = in_f.read().decode()
                    contents = contents.splitlines()
                    for l in contents:
                        # dc:identifier differs per export; drop it
                        if "dc:identifier" not in l:
                            out_f.write(l + "\n")
                file_list.append(os.path.join(base_dir, "content.opf"))
            else:
                logger.debug("content.opf not found in epub file: %s", filename)
            for f_n in z_f.namelist():
                if f_n.endswith(".html"):
                    with z_f.open(f_n) as in_f \
                            , open(os.path.join(base_dir, f_n), "w") as out_f:
                        # Strip all newlines, then pretty-print, to normalize
                        # formatting differences between exporters.
                        html: HtmlElement = lxml.html.fromstring(
                            ''.join(filter(lambda ch: ch != "\n" and ch != "\r"
                                           , in_f.read().decode()
                                           )
                                    ).encode()
                        )
                        out_f.write(lxml.html.tostring(html, pretty_print=True, encoding="unicode"))
                    file_list.append(os.path.join(base_dir, f_n))
        logger.debug("%s: %s", filename, file_list)
        return list(sorted(file_list))
    except zipfile.BadZipFile:
        return []


def compare_epub(result: str, expected: str) -> float:
    """Average the per-file text diff score over the normalized epub members."""
    if result is None:
        return 0.
    result_files: List[str] = process_epub(result)
    expected_files: List[str] = process_epub(expected)

    metric: float = 0.
    for f1, f2 in zip(result_files, expected_files):
        current_metric: float = diff_text_file(f1, f2)
        logger.debug("%s vs %s: %f", f1, f2, current_metric)
        metric += current_metric
    if len(result_files) > 0:
        metric /= len(result_files)
    return metric


V = TypeVar("Value")


def check_mp3_meta(result: str, meta: Dict[str, Dict[str, Union[str, V]]]) -> float:
    """Return 1.0 when every ID3 tag in *meta* matches its rule, else 0.0."""
    # checks using _match_value_to_rule
    # (annotation corrected: the function returns float(metric), not bool)
    if result is None:
        return 0.

    id3_dict = EasyID3(result)
    metric: bool = True
    for k, r in meta.items():
        value = id3_dict.get(k, "")
        if isinstance(value, list):
            # EasyID3 returns lists of strings; join for rule matching
            value: str = ",".join(value)
        logger.debug("%s.%s: %s", result, k, value)
        metric = metric and _match_value_to_rule(value, r)
    return float(metric)
diff --git a/openhands/nvidia/os_world/metrics/pdf.py b/openhands/nvidia/os_world/metrics/pdf.py
new file mode 100644
index 000000000..ef5b38491
--- /dev/null
+++ b/openhands/nvidia/os_world/metrics/pdf.py
@@ -0,0 +1,31 @@
import operator
from typing import Any
from typing import Dict

import fitz  # PyMuPDF
from pypdf import PdfReader


def check_pdf_pages(pdf_file: str, rules: Dict[str, Any]) -> float:
    """Return 1.0 when the PDF's page count satisfies rules["relation"] against rules["ref_value"]."""
    if pdf_file is None:
        return 0.0
    reader = PdfReader(pdf_file)
    nb_pages: int = len(reader.pages)
    # rules["relation"] names an operator-module comparator, e.g. "eq"/"ge"/"lt"
    return float(getattr(operator, rules["relation"])(nb_pages, rules["ref_value"]))


def extract_answers_from_pdf(pdf_file):
    """Collect the text after the last '=' on each non-empty line of the PDF."""
    doc = fitz.open(pdf_file)
    answers = []

    for page in doc:
        text = page.get_text()
        lines = text.split('\n')
        for line in lines:
            if line.strip():
                parts = line.split('=')
                if len(parts) > 1:
                    answer = parts[-1].strip()
                    answers.append(answer)

    return answers
diff --git a/openhands/nvidia/os_world/metrics/slides.py b/openhands/nvidia/os_world/metrics/slides.py
new file mode 100644
index 000000000..ecea2c10f
--- /dev/null
+++ b/openhands/nvidia/os_world/metrics/slides.py
@@ -0,0 +1,929 @@
import xml.etree.ElementTree as ET
import zipfile
from math import sqrt

from pptx import Presentation
from pptx.util import Inches
from pptx.enum.shapes import MSO_SHAPE_TYPE

from openhands.core.logger import openhands_logger as logger

def check_presenter_console_disable(config_file_path):
    """Return 1.0 when the Impress presenter console is disabled in the given registry XML, else 0.0."""
    try:
        tree = ET.parse(config_file_path)
        root = tree.getroot()

        namespaces = {
            'oor': 'http://openoffice.org/2001/registry'
        }

        for item in root.findall(
                ".//item[@oor:path='/org.openoffice.Office.Impress/Misc/Start']/prop[@oor:name='EnablePresenterScreen']",
                namespaces):
            # Check if the value of the configuration item indicates that the presenter console has been disabled
            presenter_screen_enabled = item.find('value').text
            if presenter_screen_enabled.lower() == 'false':
                return 1.
            else:
                return 0.
        return 0.
    except Exception as e:
        logger.error(f"Error: {e}")
        return 0.


def check_image_stretch_and_center(modified_ppt, original_ppt):
    """Return 1.0 when the first slide's image was stretched to (roughly) slide size and centered, within 0.5 inch."""
    # fixme: this func is overfit to this example libreoffice_impress
    # Load the presentations
    original_pres = Presentation(original_ppt)
    modified_pres = Presentation(modified_ppt)

    # Get the first slide of each presentation
    original_slide = original_pres.slides[0]
    modified_slide = modified_pres.slides[0]

    # Get the image on the first slide of each presentation
    # (shape_type 13 == PICTURE)
    original_slide_images = [shape for shape in original_slide.shapes if shape.shape_type == 13]
    modified_slide_images = [shape for shape in modified_slide.shapes if shape.shape_type == 13]

    if not original_slide_images:
        return 0.

    the_image = original_slide_images[0]

    the_modified_image = None

    # Get the images that modified in width and height
    # (matched by identical image bytes)
    for modified_image in modified_slide_images:
        if the_image.image.blob == modified_image.image.blob:
            the_modified_image = modified_image

    if the_modified_image is None:
        return 0.

    if (abs(the_modified_image.width - original_pres.slide_width) > Inches(0.5) or
            abs(the_modified_image.height - original_pres.slide_height) > Inches(0.5) or
            abs(the_modified_image.left - (original_pres.slide_width - the_modified_image.width) / 2) > Inches(0.5) or
            abs(the_modified_image.top - (original_pres.slide_height - the_modified_image.height) / 2) > Inches(0.5)):
        return 0.

    return 1.

def is_red_color(color):
    """Return truthy when *color* has an rgb equal to pure red (255, 0, 0).

    NOTE(review): python-pptx exposes font colors as ``RGBColor`` objects;
    whether ``.rgb == (255, 0, 0)`` can ever be True for those objects should
    be confirmed against the caller's color type.
    """
    # judge if the color is red
    return color and color.rgb == (255, 0, 0)


def get_master_placeholder_color(prs):
    """Return the font color of the first paragraph of the first empty
    placeholder found on any slide master of *prs*, or None when absent."""
    # get the color of the placeholder
    masters = prs.slide_masters
    for idx, master in enumerate(masters):
        for placeholder in master.placeholders:
            if placeholder.has_text_frame and placeholder.text == "":
                text_frame = placeholder.text_frame

                if text_frame.paragraphs:
                    first_paragraph = text_frame.paragraphs[0]
                    return first_paragraph.font.color
    return None


def check_slide_numbers_color(pptx_file_path):
    """Return 1 when the slide-number placeholder color on the master is red, else 0."""
    presentation = Presentation(pptx_file_path)

    for i, slide in enumerate(presentation.slides):
        for shape in slide.shapes:
            # check if the shape is a text box
            if hasattr(shape, "text"):
                if shape.text.isdigit():
                    # "SlidePlaceholder" is the name of the placeholder in the master slide
                    page_number_text = shape.text
                    font_color = get_master_placeholder_color(presentation)
                    return 1 if font_color is not None and is_red_color(font_color) else 0
    # BUGFIX: the original fell off the end and implicitly returned None when
    # no purely numeric text box was found; return an explicit 0 so callers
    # always receive a numeric score.
    return 0


# import numpy as np
# from PIL import Image
# from skimage.metrics import structural_similarity as ssim

# def compare_images(image1_path, image2_path):
#     # You would call this function with the paths to the two images you want to compare:
#     # score = compare_images('path_to_image1', 'path_to_image2')
#     # print("Similarity score:", score)

#     if not image1_path or not image2_path:
#         return 0

#     # Open the images and convert to grayscale
#     image1 = Image.open(image1_path).convert('L')
#     image2 = Image.open(image2_path).convert('L')

#     # Resize images to the smaller one's size for comparison
#     image1_size = image1.size
#     image2_size = image2.size
#     new_size = min(image1_size, image2_size)

#     image1 = image1.resize(new_size, Image.Resampling.LANCZOS)
#     image2 = image2.resize(new_size, Image.Resampling.LANCZOS)

#     # Convert images to numpy arrays
#     image1_array = np.array(image1)
#     image2_array = np.array(image2)

# 
# Calculate SSIM between two images
#         similarity_index = ssim(image1_array, image2_array)

#         return similarity_index

def get_all_text_shapes(slide):
    """Recursively collect every text-bearing shape on *slide*, including shapes nested inside GROUP shapes."""

    def extract_text_shapes(shape):
        results = []

        # A shape counts as text-bearing when it exposes both .text and .text_frame
        if hasattr(shape, "text") and hasattr(shape, "text_frame"):
            results.append(shape)

        # GROUP shapes expose a .shapes collection; recurse into their children
        if hasattr(shape, 'shapes'):
            for sub_shape in shape.shapes:
                results.extend(extract_text_shapes(sub_shape))

        return results

    all_text_shapes = []
    for shape in slide.shapes:
        all_text_shapes.extend(extract_text_shapes(shape))

    return all_text_shapes


def compare_pptx_files(file1_path, file2_path, **options):
    """Compare two .pptx files under the checks enabled in **options; return 1 on match, 0 otherwise."""
    # todo: not strictly match since not all information is compared because we cannot get the info through pptx
    prs1 = Presentation(file1_path)
    prs2 = Presentation(file2_path)

    # Enable debug logging if requested
    enable_debug = options.get("enable_debug", True)
    if enable_debug:
        logger.debug(f"=== COMPARING PPTX FILES ===")
        logger.debug(f"File 1: {file1_path}")
        logger.debug(f"File 2: {file2_path}")
        logger.debug(f"File 1 slides: {len(prs1.slides)}")
        logger.debug(f"File 2 slides: {len(prs2.slides)}")

    approximately_tolerance = options.get("approximately_tolerance", 0.005)
    def is_approximately_equal(val1, val2, tolerance=approximately_tolerance):
        """Compare two values with a relative tolerance (default 0.005, i.e. 0.5%)."""
        if val1 == val2:
            return True
        if val1 == 0 and val2 == 0:
            return True
        if val1 == 0 or val2 == 0:
            return False
        return abs(val1 - val2) / max(abs(val1), abs(val2)) <= tolerance

    # Per-aspect toggles; callers disable the checks a task should ignore.
    examine_number_of_slides = options.get("examine_number_of_slides", True)
    examine_shape = options.get("examine_shape", True)
    examine_text = options.get("examine_text", True)
    examine_indent = options.get("examine_indent", True)
    examine_font_name = options.get("examine_font_name", True)
    examine_font_size = options.get("examine_font_size", True)
examine_font_bold = options.get("examine_font_bold", True) + examine_font_italic = options.get("examine_font_italic", True) + examine_color_rgb = options.get("examine_color_rgb", True) + examine_font_underline = options.get("examine_font_underline", True) + examine_strike_through = options.get("examine_strike_through", True) + examine_alignment = options.get("examine_alignment", True) + examine_title_bottom_position = options.get("examine_title_bottom_position", False) + examine_table_bottom_position = options.get("examine_table_bottom_position", False) + examine_right_position = options.get("examine_right_position", False) + examine_top_position = options.get("examine_top_position", False) + examine_shape_for_shift_size = options.get("examine_shape_for_shift_size", False) + examine_image_size = options.get("examine_image_size", False) + examine_modify_height = options.get("examine_modify_height", False) + examine_bullets = options.get("examine_bullets", True) + examine_background_color = options.get("examine_background_color", True) + examine_note = options.get("examine_note", True) + + # compare the number of slides + if len(prs1.slides) != len(prs2.slides) and examine_number_of_slides: + if enable_debug: + logger.debug(f"MISMATCH: Number of slides differ - File1: {len(prs1.slides)}, File2: {len(prs2.slides)}") + return 0 + + slide_idx = 0 + # compare the content of each slide + for slide1, slide2 in zip(prs1.slides, prs2.slides): + slide_idx += 1 + if enable_debug: + logger.debug(f"--- Comparing Slide {slide_idx} ---") + logger.debug(f"Slide {slide_idx} - Shapes count: File1={len(slide1.shapes)}, File2={len(slide2.shapes)}") + + def get_slide_background_color(slide): + # background = slide.background + # if background.fill.background(): + # return background.fill.fore_color.rgb + # else: + # return None + fill = slide.background.fill + if fill.type == 1: + return fill.fore_color.rgb + elif fill.type == 5: + master_fill = 
slide.slide_layout.slide_master.background.fill + if master_fill.type == 1: + return master_fill.fore_color.rgb + else: + return None + else: + return None + + if get_slide_background_color(slide1) != get_slide_background_color(slide2) and examine_background_color: + return 0 + + def get_slide_notes(slide): + notes_slide = slide.notes_slide + if notes_slide: + return notes_slide.notes_text_frame.text + else: + return None + + if get_slide_notes(slide1).strip() != get_slide_notes(slide2).strip() and examine_note: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx} - Notes differ:") + logger.debug(f" Notes1: '{get_slide_notes(slide1).strip()}'") + logger.debug(f" Notes2: '{get_slide_notes(slide2).strip()}'") + return 0 + + # Get all text shapes including those inside GROUPs + text_shapes1 = get_all_text_shapes(slide1) + text_shapes2 = get_all_text_shapes(slide2) + + if enable_debug: + logger.debug(f"Slide {slide_idx} - Text shapes found: File1={len(text_shapes1)}, File2={len(text_shapes2)}") + + # check if the number of slides is the same + if len(slide1.shapes) != len(slide2.shapes): + if enable_debug: + logger.debug(f"MISMATCH: Slide {slide_idx} - Different number of shapes: File1={len(slide1.shapes)}, File2={len(slide2.shapes)}") + return 0 + + # check if the shapes are the same + shape_idx = 0 + for shape1, shape2 in zip(slide1.shapes, slide2.shapes): + shape_idx += 1 + if enable_debug: + logger.debug(f" Shape {shape_idx} - Type: {shape1.shape_type} vs {shape2.shape_type}") + if hasattr(shape1, "text") and hasattr(shape2, "text"): + logger.debug(f" Shape {shape_idx} - Text: '{shape1.text.strip()}' vs '{shape2.text.strip()}'") + logger.debug(f" Shape {shape_idx} - Position: ({shape1.left}, {shape1.top}) vs ({shape2.left}, {shape2.top})") + logger.debug(f" Shape {shape_idx} - Size: ({shape1.width}, {shape1.height}) vs ({shape2.width}, {shape2.height})") + if examine_title_bottom_position: + if hasattr(shape1, "text") and hasattr(shape2, "text") and 
shape1.text == shape2.text: + if shape1.text == "Product Comparison" and (shape1.top <= shape2.top or shape1.top < 3600000): + return 0 + elif (not is_approximately_equal(shape1.left, shape2.left) or + not is_approximately_equal(shape1.top, shape2.top) or + not is_approximately_equal(shape1.width, shape2.width) or + not is_approximately_equal(shape1.height, shape2.height)): + return 0 + + if examine_table_bottom_position: + if slide_idx == 3 and shape1.shape_type == 19 and shape2.shape_type == 19: + if shape1.top <= shape2.top or shape1.top < 3600000: + return 0 + elif (not is_approximately_equal(shape1.left, shape2.left) or + not is_approximately_equal(shape1.top, shape2.top) or + not is_approximately_equal(shape1.width, shape2.width) or + not is_approximately_equal(shape1.height, shape2.height)): + return 0 + + if examine_right_position: + if slide_idx == 2 and not hasattr(shape1, "text") and not hasattr(shape2, "text"): + if shape1.left <= shape2.left or shape1.left < 4320000: + return 0 + + if examine_top_position: + if slide_idx == 2 and shape1.shape_type == 13 and shape2.shape_type == 13: + if shape1.top >= shape2.top or shape1.top > 1980000: + return 0 + + + if examine_shape_for_shift_size: + if (not is_approximately_equal(shape1.left, shape2.left) or + not is_approximately_equal(shape1.top, shape2.top) or + not is_approximately_equal(shape1.width, shape2.width) or + not is_approximately_equal(shape1.height, shape2.height)): + if not (hasattr(shape1, "text") and hasattr(shape2, + "text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."): + return 0 + + # CRITICAL: examine_shape check happens BEFORE examine_modify_height! + # If examine_shape=True (default), any shape dimension mismatch will cause immediate return 0, + # preventing examine_modify_height from ever being executed. + # For height modification tasks, you MUST set examine_shape=False to allow examine_modify_height to work. 
+ if ( + not is_approximately_equal(shape1.left, shape2.left) or + not is_approximately_equal(shape1.top, shape2.top) or + not is_approximately_equal(shape1.width, shape2.width) or + not is_approximately_equal(shape1.height, shape2.height)) and examine_shape: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Shape dimensions differ:") + logger.debug(f" Left: {shape1.left} vs {shape2.left} (equal: {is_approximately_equal(shape1.left, shape2.left)})") + logger.debug(f" Top: {shape1.top} vs {shape2.top} (equal: {is_approximately_equal(shape1.top, shape2.top)})") + logger.debug(f" Width: {shape1.width} vs {shape2.width} (equal: {is_approximately_equal(shape1.width, shape2.width)})") + logger.debug(f" Height: {shape1.height} vs {shape2.height} (equal: {is_approximately_equal(shape1.height, shape2.height)})") + if hasattr(shape1, "text") and hasattr(shape2, "text"): + logger.debug(f" Shape text: '{shape1.text.strip()}' vs '{shape2.text.strip()}'") + return 0 + + if examine_image_size: + if shape1.shape_type == 13 and shape2.shape_type == 13: + if not is_approximately_equal(shape1.width, shape2.width) or not is_approximately_equal(shape1.height, shape2.height): + return 0 + elif (not is_approximately_equal(shape1.left, shape2.left) or + not is_approximately_equal(shape1.top, shape2.top) or + not is_approximately_equal(shape1.width, shape2.width) or + not is_approximately_equal(shape1.height, shape2.height)): + return 0 + + # examine_modify_height: Special logic for height modification tasks + # - For non-text shapes and FREEFORM shapes (type 5): Only check height differences + # - For other shapes: Check all dimensions (left, top, width, height) + # WARNING: This check only works if examine_shape=False, otherwise examine_shape will + # terminate the comparison before this code is reached! 
+ if examine_modify_height: + if not hasattr(shape1, "text") and not hasattr(shape2, + "text") or shape1.shape_type == 5 and shape2.shape_type == 5: + if not is_approximately_equal(shape1.height, shape2.height): + return 0 + elif (not is_approximately_equal(shape1.left, shape2.left) or + not is_approximately_equal(shape1.top, shape2.top) or + not is_approximately_equal(shape1.width, shape2.width) or + not is_approximately_equal(shape1.height, shape2.height)): + return 0 + + if shape1.shape_type == MSO_SHAPE_TYPE.TABLE: + table1 = shape1.table + table2 = shape2.table + if enable_debug: + logger.debug(f" Shape {shape_idx} - Comparing TABLE with {len(table1.rows)} rows and {len(table1.columns)} columns") + logger.debug(f" Shape {shape_idx} - Table2 has {len(table2.rows)} rows and {len(table2.columns)} columns") + + # Check if tables have the same dimensions + if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Table dimensions differ:") + logger.debug(f" Table1: {len(table1.rows)} rows x {len(table1.columns)} columns") + logger.debug(f" Table2: {len(table2.rows)} rows x {len(table2.columns)} columns") + return 0 + + for row_idx in range(len(table1.rows)): + for col_idx in range(len(table1.columns)): + cell1 = table1.cell(row_idx, col_idx) + cell2 = table2.cell(row_idx, col_idx) + + # Check if cells have the same number of paragraphs + if len(cell1.text_frame.paragraphs) != len(cell2.text_frame.paragraphs): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}] - Different number of paragraphs:") + logger.debug(f" Cell1 paragraphs: {len(cell1.text_frame.paragraphs)}") + logger.debug(f" Cell2 paragraphs: {len(cell2.text_frame.paragraphs)}") + return 0 + + for para_idx, (para1, para2) in enumerate(zip(cell1.text_frame.paragraphs, cell2.text_frame.paragraphs)): + # Check if 
paragraphs have the same number of runs + if len(para1.runs) != len(para2.runs): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx} - Different number of runs:") + logger.debug(f" Para1 runs: {len(para1.runs)}") + logger.debug(f" Para2 runs: {len(para2.runs)}") + return 0 + + for run_idx, (run1, run2) in enumerate(zip(para1.runs, para2.runs)): + # Check font color + if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"): + if run1.font.color.rgb != run2.font.color.rgb and examine_color_rgb: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx}, Run {run_idx} - Font color differs:") + logger.debug(f" Color1: {run1.font.color.rgb} vs Color2: {run2.font.color.rgb}") + logger.debug(f" Cell text: '{cell1.text.strip()}' vs '{cell2.text.strip()}'") + logger.debug(f" Run text: '{run1.text}' vs '{run2.text}'") + return 0 + + # Check font bold + if run1.font.bold != run2.font.bold: + if not ((run1.font.bold is None or run1.font.bold is False) and + (run2.font.bold is None or run2.font.bold is False)): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx}, Run {run_idx} - Font bold differs:") + logger.debug(f" Bold1: {run1.font.bold} vs Bold2: {run2.font.bold}") + logger.debug(f" Run text: '{run1.text}' vs '{run2.text}'") + return 0 + + # Check font italic + if run1.font.italic != run2.font.italic: + if not ((run1.font.italic is None or run1.font.italic is False) and + (run2.font.italic is None or run2.font.italic is False)): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx}, Run {run_idx} - Font italic differs:") + logger.debug(f" Italic1: {run1.font.italic} vs Italic2: {run2.font.italic}") + logger.debug(f" Run text: 
'{run1.text}' vs '{run2.text}'") + return 0 + + # Check font underline + if run1.font.underline != run2.font.underline: + if run1.font.underline is not None and run2.font.underline is not None: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx}, Run {run_idx} - Font underline differs:") + logger.debug(f" Underline1: {run1.font.underline} vs Underline2: {run2.font.underline}") + logger.debug(f" Run text: '{run1.text}' vs '{run2.text}'") + return 0 + if (run1.font.underline is None and run2.font.underline is True) or (run1.font.underline is True and run2.font.underline is None): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} (TABLE) - Cell [{row_idx},{col_idx}], Para {para_idx}, Run {run_idx} - Font underline differs (None vs True):") + logger.debug(f" Underline1: {run1.font.underline} vs Underline2: {run2.font.underline}") + logger.debug(f" Run text: '{run1.text}' vs '{run2.text}'") + return 0 + + if hasattr(shape1, "text") and hasattr(shape2, "text"): + if shape1.text.strip() != shape2.text.strip() and examine_text: + return 0 + + # check if the number of paragraphs are the same + if len(shape1.text_frame.paragraphs) != len(shape2.text_frame.paragraphs): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx} - Different number of paragraphs:") + logger.debug(f" Shape1 paragraphs: {len(shape1.text_frame.paragraphs)}") + logger.debug(f" Shape2 paragraphs: {len(shape2.text_frame.paragraphs)}") + return 0 + + # check if the paragraphs are the same + para_idx = 0 + for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs): + para_idx += 1 + # Handle alignment comparison - treat None and LEFT (1) as equivalent + if examine_alignment: + from pptx.enum.text import PP_ALIGN + align1 = para1.alignment + align2 = para2.alignment + + if enable_debug: + align1_name = "None" if align1 is None else 
getattr(align1, 'name', str(align1)) + align2_name = "None" if align2 is None else getattr(align2, 'name', str(align2)) + logger.debug(f" Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Alignment: '{align1_name}' vs '{align2_name}'") + logger.debug(f" Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Text: '{para1.text}' vs '{para2.text}'") + + # Convert None to LEFT for comparison since None means default left alignment + if align1 is None: + align1 = PP_ALIGN.LEFT # LEFT alignment + if align2 is None: + align2 = PP_ALIGN.LEFT # LEFT alignment + + if align1 != align2: + if enable_debug: + align1_final = getattr(align1, 'name', str(align1)) + align2_final = getattr(align2, 'name', str(align2)) + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Alignment differs: '{align1_final}' vs '{align2_final}'") + return 0 + + # check if the runs are the same + if para1.text != para2.text and examine_text: + return 0 + + if para1.level != para2.level and examine_indent: + return 0 + + # check if the number of runs are the same + if len(para1.runs) != len(para2.runs): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Different number of runs:") + logger.debug(f" Para1 runs: {len(para1.runs)}") + logger.debug(f" Para2 runs: {len(para2.runs)}") + return 0 + + for run1, run2 in zip(para1.runs, para2.runs): + + # check if the font properties are the same + if run1.font.name != run2.font.name and examine_font_name: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Font name differs:") + logger.debug(f" Name1: '{run1.font.name}' vs Name2: '{run2.font.name}'") + logger.debug(f" Text: '{run1.text}' vs '{run2.text}'") + return 0 + + if run1.font.size != run2.font.size and examine_font_size: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Font size differs:") + logger.debug(f" Size1: 
{run1.font.size} vs Size2: {run2.font.size}") + logger.debug(f" Text: '{run1.text}' vs '{run2.text}'") + return 0 + + if run1.font.bold != run2.font.bold and examine_font_bold: + # Special handling for None vs False - both mean "not bold" + if not ((run1.font.bold is None or run1.font.bold is False) and + (run2.font.bold is None or run2.font.bold is False)): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Font bold differs:") + logger.debug(f" Bold1: {run1.font.bold} vs Bold2: {run2.font.bold}") + logger.debug(f" Text: '{run1.text}' vs '{run2.text}'") + return 0 + + if run1.font.italic != run2.font.italic and examine_font_italic: + # Special handling for None vs False - both mean "not italic" + if not ((run1.font.italic is None or run1.font.italic is False) and + (run2.font.italic is None or run2.font.italic is False)): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Font italic differs:") + logger.debug(f" Italic1: {run1.font.italic} vs Italic2: {run2.font.italic}") + logger.debug(f" Text: '{run1.text}' vs '{run2.text}'") + return 0 + + if hasattr(run1.font.color, "rgb") and hasattr(run2.font.color, "rgb"): + if run1.font.color.rgb != run2.font.color.rgb and examine_color_rgb: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Font color differs:") + logger.debug(f" Color1: {run1.font.color.rgb} vs Color2: {run2.font.color.rgb}") + logger.debug(f" Text: '{run1.text}' vs '{run2.text}'") + return 0 + + if run1.font.underline != run2.font.underline and examine_font_underline: + if run1.font.underline is not None and run2.font.underline is not None: + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Font underline differs:") + logger.debug(f" Underline1: {run1.font.underline} vs Underline2: {run2.font.underline}") + logger.debug(f" Text: '{run1.text}' vs 
'{run2.text}'") + return 0 + if (run1.font.underline is None and run2.font.underline is True) or (run1.font.underline is True and run2.font.underline is None): + if enable_debug: + logger.debug(f" MISMATCH: Slide {slide_idx}, Shape {shape_idx}, Para {para_idx} - Font underline differs (None vs True):") + logger.debug(f" Underline1: {run1.font.underline} vs Underline2: {run2.font.underline}") + logger.debug(f" Text: '{run1.text}' vs '{run2.text}'") + return 0 + + if run1.font._element.attrib.get('strike', 'noStrike') != run2.font._element.attrib.get( + 'strike', 'noStrike') and examine_strike_through: + return 0 + + def _extract_bullets(xml_data): + root = ET.fromstring(xml_data) + + namespaces = { + 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'p': 'http://schemas.openxmlformats.org/presentationml/2006/main', + } + + bullets = [] + + for paragraph in root.findall('.//a:p', namespaces): + pPr = paragraph.find('a:pPr', namespaces) + if pPr is not None: + lvl = pPr.get('lvl') + buChar = pPr.find('a:buChar', namespaces) + char = buChar.get('char') if buChar is not None else "No Bullet" + buClr = pPr.find('a:buClr/a:srgbClr', namespaces) + color = buClr.get('val') if buClr is not None else "No Color" + else: + lvl = "No Level" + char = "No Bullet" + color = "No Color" + + text = "".join(t.text for t in paragraph.findall('.//a:t', namespaces)) + + # Only add non-empty paragraphs to bullets list + if text.strip(): + bullets.append((lvl, char, text, color)) + + return bullets + + def _compare_bullets_with_tolerance(bullets1, bullets2): + """Compare bullets with tolerance for minor differences""" + if len(bullets1) != len(bullets2): + return False + + for (lvl1, char1, text1, color1), (lvl2, char2, text2, color2) in zip(bullets1, bullets2): + # Compare text (most important) + if text1 != text2: + return False + + # Compare bullet character + if char1 != char2: + return False + + # Compare level with tolerance (None and '0' are equivalent) + 
normalized_lvl1 = '0' if lvl1 is None else lvl1 + normalized_lvl2 = '0' if lvl2 is None else lvl2 + if normalized_lvl1 != normalized_lvl2: + return False + + # Color comparison is more lenient - we don't fail on color differences + # since they might be due to theme or formatting differences + # if color1 != color2: + # return False + + return True + + if examine_bullets: + try: + bullets1 = _extract_bullets(run1.part.blob.decode('utf-8')) + bullets2 = _extract_bullets(run2.part.blob.decode('utf-8')) + + # Compare bullets with tolerance for minor differences + if not _compare_bullets_with_tolerance(bullets1, bullets2): + return 0 + except: + # If bullet extraction fails, skip bullet comparison + pass + + # fixme: Actually there are more properties to be compared, we can add them later via parsing the xml data + + # Additional check: compare all text shapes including those in GROUPs + if examine_alignment and len(text_shapes1) == len(text_shapes2): + for idx, (tshape1, tshape2) in enumerate(zip(text_shapes1, text_shapes2)): + if enable_debug: + logger.debug(f" Additional text shape check {idx+1}: '{tshape1.text.strip()[:30]}' vs '{tshape2.text.strip()[:30]}'") + + # Compare text content + if tshape1.text.strip() != tshape2.text.strip() and examine_text: + if enable_debug: + logger.debug(f" MISMATCH: Text differs - '{tshape1.text.strip()}' vs '{tshape2.text.strip()}'") + return 0 + + # Check if text shapes have the same number of paragraphs + if len(tshape1.text_frame.paragraphs) != len(tshape2.text_frame.paragraphs): + if enable_debug: + logger.debug(f" MISMATCH: Different number of paragraphs - {len(tshape1.text_frame.paragraphs)} vs {len(tshape2.text_frame.paragraphs)}") + return 0 + + # Compare alignment of each paragraph + for para_idx, (para1, para2) in enumerate(zip(tshape1.text_frame.paragraphs, tshape2.text_frame.paragraphs)): + from pptx.enum.text import PP_ALIGN + align1 = para1.alignment + align2 = para2.alignment + + if enable_debug: + align1_name = 
"None" if align1 is None else getattr(align1, 'name', str(align1)) + align2_name = "None" if align2 is None else getattr(align2, 'name', str(align2)) + logger.debug(f" Para {para_idx+1}: Alignment '{align1_name}' vs '{align2_name}'") + + # Convert None to LEFT for comparison + if align1 is None: + align1 = PP_ALIGN.LEFT + if align2 is None: + align2 = PP_ALIGN.LEFT + + if align1 != align2: + if enable_debug: + align1_final = getattr(align1, 'name', str(align1)) + align2_final = getattr(align2, 'name', str(align2)) + logger.debug(f" MISMATCH: Alignment differs - '{align1_final}' vs '{align2_final}'") + return 0 + elif len(text_shapes1) != len(text_shapes2): + if enable_debug: + logger.debug(f"MISMATCH: Different number of text shapes - {len(text_shapes1)} vs {len(text_shapes2)}") + return 0 + + if enable_debug: + logger.debug(f"=== COMPARISON SUCCESSFUL - Files match ===") + return 1 + + +def check_strikethrough(pptx_path, rules): + # Load the presentation + presentation = Presentation(pptx_path) + + slide_index_s = rules["slide_index_s"] + shape_index_s = rules["shape_index_s"] + paragraph_index_s = rules["paragraph_index_s"] + + try: + for slide_index in slide_index_s: + # Get the slide + slide = presentation.slides[slide_index] + + for shape_index in shape_index_s: + # Get the text box + paragraphs = slide.shapes[shape_index].text_frame.paragraphs + + for paragraph_index in paragraph_index_s: + paragraph = paragraphs[paragraph_index] + run = paragraph.runs[0] + if 'strike' not in run.font._element.attrib: + return 0 + + + except Exception as e: + logger.error(f"Error: {e}") + return 0 + + return 1 + + +def check_slide_orientation_Portrait(pptx_path): + presentation = Presentation(pptx_path) + + slide_height = presentation.slide_height + slide_width = presentation.slide_width + + if slide_width < slide_height: + return 1 + return 0 + + +def evaluate_presentation_fill_to_rgb_distance(pptx_file, rules): + rgb = rules["rgb"] + + try: + original_rgb = 
rules["original_rgb"] + except: + original_rgb = None + + def get_rgb_from_color(color): + try: + if hasattr(color, "rgb"): + return color.rgb + else: + return None + except: + return None + + def slide_fill_distance_to_rgb(_slide, _rgb, _original_rgb): + fill = _slide.background.fill + if fill.type == 1: + color_rgb = get_rgb_from_color(fill.fore_color) + if color_rgb is None: + return 1 + r1, g1, b1 = color_rgb + r2, g2, b2 = _rgb + + if _original_rgb is not None: + r3, g3, b3 = _original_rgb + if r1 == r3 and g1 == g3 and b1 == b3: + return 1 + + return sqrt((r1 - r2) ** 2 + (g1 - g2) ** 2 + (b1 - b2) ** 2) / sqrt(255 ** 2 + 255 ** 2 + 255 ** 2) + elif fill.type == 5: + master_fill = _slide.slide_layout.slide_master.background.fill + if master_fill.type == 1: + color_rgb = get_rgb_from_color(master_fill.fore_color) + if color_rgb is None: + return 1 + r1, g1, b1 = color_rgb + else: + return 1 + r2, g2, b2 = _rgb + + if _original_rgb is not None: + r3, g3, b3 = _original_rgb + if r1 == r3 and g1 == g3 and b1 == b3: + return 1 + + return sqrt((r1 - r2) ** 2 + (g1 - g2) ** 2 + (b1 - b2) ** 2) / sqrt(255 ** 2 + 255 ** 2 + 255 ** 2) + + return 1 + + prs = Presentation(pptx_file) + similarity = 1 - sum(slide_fill_distance_to_rgb(slide, rgb, original_rgb) for slide in prs.slides) / len(prs.slides) + return similarity + + +def check_left_panel(accessibility_tree): + namespaces = { + 'st': 'uri:deskat:state.at-spi.gnome.org', + 'cp': 'uri:deskat:component.at-spi.gnome.org' + } + + root = ET.fromstring(accessibility_tree) + + # 遍历所有 document-frame 节点 + for doc_frame in root.iter('document-frame'): + if doc_frame.attrib.get("name") == "Slides View": + # 说明 Slides View 存在,即左侧面板已打开 + return 1. + + # 没找到 Slides View,认为左侧面板未打开 + return 0. 


def check_transition(pptx_file, rules):
    """Return 1.0 when the target slide uses the expected transition, else 0.0.

    Args:
        pptx_file: path to the .pptx package (a zip archive).
        rules: dict with
            slide_idx (int): zero-based slide index to inspect.
            transition_type (str): local name of the expected transition
                element in the PresentationML namespace (e.g. "dissolve").
    """
    slide_idx = rules['slide_idx']
    transition_type = rules['transition_type']

    # XML namespaces used by PresentationML slide parts.
    ns = {
        'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
        'p': 'http://schemas.openxmlformats.org/presentationml/2006/main',
    }

    # Slide parts are 1-based inside the package: slide1.xml, slide2.xml, ...
    entry_name = 'ppt/slides/slide{}.xml'.format(slide_idx + 1)

    with zipfile.ZipFile(pptx_file, 'r') as archive:
        if entry_name not in archive.namelist():
            # The requested slide does not exist in the package.
            return 0.
        with archive.open(entry_name) as handle:
            slide_root = ET.parse(handle).getroot()

    transition_node = slide_root.find('.//p:transition', ns)
    if transition_node is None:
        # The slide defines no transition at all.
        return 0.

    # Look for the specific transition element (e.g. <p:dissolve/>).
    expected_node = transition_node.find('.//p:{}'.format(transition_type), ns)
    return 1. if expected_node is not None else 0.
+ + +def check_page_number_colors(pptx_file, rules): + color = rules["color"] + + def is_red(rgb_str, threshold=50): + r, g, b = int(rgb_str[1:3], 16), int(rgb_str[3:5], 16), int(rgb_str[5:7], 16) + return r > g + threshold and r > b + threshold + + def is_blue(rgb_str, threshold=50): + r, g, b = int(rgb_str[1:3], 16), int(rgb_str[3:5], 16), int(rgb_str[5:7], 16) + return b > g + threshold and b > r + threshold + + def is_green(rgb_str, threshold=50): + r, g, b = int(rgb_str[1:3], 16), int(rgb_str[3:5], 16), int(rgb_str[5:7], 16) + return g > r + threshold and g > b + threshold + + def is_black(rgb_str, threshold=50): + r, g, b = int(rgb_str[1:3], 16), int(rgb_str[3:5], 16), int(rgb_str[5:7], 16) + return r < threshold and g < threshold and b < threshold + + with zipfile.ZipFile(pptx_file, 'r') as zip_ref: + slide_master_name = 'ppt/slideMasters/slideMaster1.xml' + with zip_ref.open(slide_master_name) as slide_master_file: + tree = ET.parse(slide_master_file) + root = tree.getroot() + + namespaces = { + 'a': 'http://schemas.openxmlformats.org/drawingml/2006/main', + 'p': 'http://schemas.openxmlformats.org/presentationml/2006/main', + } + + color_elems = root.findall('.//a:solidFill//a:srgbClr', namespaces) + slides_color_val = color_elems[-2].get('val') + + if slides_color_val is None: + return 0 + elif color == "red" and not is_red(slides_color_val): + return 0 + elif color == "blue" and not is_blue(slides_color_val): + return 0 + elif color == "green" and not is_green(slides_color_val): + return 0 + elif color == "black" and not is_black(slides_color_val): + return 0 + + return 1 + + +def check_auto_saving_time(pptx_file, rules): + minutes = rules["minutes"] + + # open and parse xml file + try: + tree = ET.parse(pptx_file) + root = tree.getroot() + + # Traverse the XML tree to find the autosave time setting + autosave_time = None + for item in root.findall(".//item"): + # Check the path attribute + path = item.get('{http://openoffice.org/2001/registry}path') + if 
path == "/org.openoffice.Office.Common/Save/Document": + # Once the correct item is found, look for the prop element with the name "AutoSaveTimeIntervall" + for prop in item.findall(".//prop"): + name = prop.get('{http://openoffice.org/2001/registry}name') + if name == "AutoSaveTimeIntervall": + # Extract the value of the autosave time interval + autosave_time = prop.find(".//value").text + break + + if autosave_time is None: + return 0 + else: + autosave_time = int(autosave_time) + if autosave_time == minutes: + return 1 + else: + return 0 + + except ET.ParseError as e: + logger.error(f"Error parsing XML: {e}") + except FileNotFoundError: + logger.error(f"File not found: {pptx_file}") diff --git a/openhands/nvidia/os_world/metrics/table.py b/openhands/nvidia/os_world/metrics/table.py new file mode 100644 index 000000000..36f987ccf --- /dev/null +++ b/openhands/nvidia/os_world/metrics/table.py @@ -0,0 +1,797 @@ +import functools +import itertools +import os.path +import re +import unicodedata + +# import operator +from numbers import Number +from typing import Any, Union, cast, Callable, Iterable +from typing import Dict, List, Tuple, Set + +import openpyxl +import pandas as pd +from openpyxl import Workbook +from openpyxl.cell.cell import Cell +from openpyxl.utils import get_column_letter +from openpyxl.worksheet.cell_range import MultiCellRange +from openpyxl.worksheet.datavalidation import DataValidation +from openpyxl.worksheet.worksheet import Worksheet +from rapidfuzz import fuzz + +from openhands.nvidia.os_world.metrics.utils import ( + _match_value_to_rule, + _read_cell_style, + read_cell_value, +) +from openhands.nvidia.os_world.metrics.utils import ( + load_charts, + load_sparklines, + load_rows_or_cols, + load_xlsx_styles, + load_filters, + load_pivot_tables, +) + +from openhands.core.logger import openhands_logger as logger + +BOOK = Union[pd.ExcelFile, Workbook, str] + + +def _parse_sheet_idx( + sheet_idx: Union[int, str], + result: BOOK, + expected: 
BOOK, + result_sheet_names: List[str], + expected_sheet_names: List[str], +) -> Tuple[BOOK, str]: + # function _parse_sheet_idx {{{ # + if isinstance(sheet_idx, int): + try: + if not result_sheet_names or sheet_idx >= len(result_sheet_names): + logger.error( + f"Sheet index {sheet_idx} out of range. Available sheets: {result_sheet_names}" + ) + index = "" + else: + index: str = result_sheet_names[sheet_idx] + logger.debug(f"Sheet index {sheet_idx} resolved to sheet: {index}") + except Exception as e: + logger.error(f"Error resolving sheet index {sheet_idx}: {e}") + index = "" + book: BOOK = result + elif sheet_idx.startswith("RI"): + try: + index: str = result_sheet_names[int(sheet_idx[2:])] + except: + index = "" + book: BOOK = result + elif sheet_idx.startswith("RN"): + index: str = sheet_idx[2:] + book: BOOK = result + elif sheet_idx.startswith("EI"): + try: + index: str = expected_sheet_names[int(sheet_idx[2:])] + except: + index = "" + book: BOOK = expected + elif sheet_idx.startswith("EN"): + index: str = sheet_idx[2:] + book: BOOK = expected + else: + logger.error("Unrecognized sheet index") + raise ValueError("Unrecognized sheet index") + return book, index + # }}} function _parse_sheet_idx # + + +SHEET = Union[pd.DataFrame, Worksheet, List[str]] + + +def _load_sheet(book: BOOK, index: str) -> SHEET: + # function _load_sheet {{{ # + try: + if isinstance(book, str): + book: str = cast(str, book) + csv_name: str = "{:}-{:}.csv".format(os.path.splitext(book)[0], index) + + try: + all_lines: List[str] = _safe_read_file(csv_name) + csv_lines: List[str] = list( + itertools.dropwhile( + lambda l: len(l) == 0, + map(lambda l: l.strip(), reversed(all_lines)), + ) + ) + return csv_lines + except (FileNotFoundError, IOError) as e: + logger.error(f"Failed to read CSV file {csv_name}: {e}") + return None + if isinstance(book, pd.ExcelFile): + return pd.read_excel(book, index) + if isinstance(book, Workbook): + return book[index] + logger.error("Not supported workbook 
format") + raise NotImplementedError("Not supported workbook format") + except NotImplementedError as e: + raise e + except: + return None + # }}} function _load_sheet # + + +def _safe_read_file(file_path: str) -> List[str]: + """ + Safely read a file with multiple encoding attempts. + + Args: + file_path: Path to the file to read + + Returns: + List of lines from the file + + Raises: + FileNotFoundError: If file doesn't exist + IOError: If file cannot be read with any encoding + """ + # Common encodings to try in order of preference + encodings = [ + "utf-8", # Most common modern encoding + "utf-8-sig", # UTF-8 with BOM + "latin-1", # ISO-8859-1, works with any byte sequence + "windows-1252", # Common Windows encoding + "gbk", # Chinese encoding + "cp1251", # Cyrillic encoding + "iso-8859-1", # Alternative latin-1 + ] + + last_error = None + + for encoding in encodings: + try: + with open(file_path, "r", encoding=encoding) as f: + lines = f.read().splitlines() + logger.debug( + f"Successfully read file {file_path} with encoding {encoding}" + ) + return lines + except UnicodeDecodeError as e: + last_error = e + logger.debug(f"Failed to read {file_path} with encoding {encoding}: {e}") + continue + except (FileNotFoundError, IOError) as e: + # These are non-encoding related errors, re-raise immediately + raise e + + # If all encodings fail, try with error handling as last resort + try: + with open(file_path, "r", encoding="utf-8", errors="replace") as f: + lines = f.read().splitlines() + logger.warning(f"Read file {file_path} with UTF-8 and error replacement") + return lines + except Exception as e: + logger.error( + f"Failed to read file {file_path} with any encoding. Last error: {last_error}" + ) + raise IOError( + f"Cannot read file {file_path} with any supported encoding" + ) from last_error + + +def compare_csv(result: str, expected: Union[str, List[str]], **options) -> float: + """ + Compare CSV files. 
If expected is a list, returns 1.0 if result matches any of the expected files. + + Args: + result: Path to result CSV file + expected: Path to expected CSV file or list of paths to expected CSV files + options: Additional options (strict, ignore_case) + + Returns: + 1.0 if result matches expected (or any file in expected list), 0.0 otherwise + """ + if result is None: + return 0.0 + + try: + result_lines: List[str] = _safe_read_file(result) + except (FileNotFoundError, IOError) as e: + logger.error(f"Failed to read result file {result}: {e}") + return 0.0 + + # Convert expected to list if it's a single string (for backward compatibility) + if isinstance(expected, str): + expected_files = [expected] + else: + expected_files = expected + + # Try to match against each expected file + for expected_file in expected_files: + try: + expected_lines: List[str] = _safe_read_file(expected_file) + + # Process lines based on options + current_result_lines = result_lines + current_expected_lines = expected_lines + + if not options.get("strict", True): + current_result_lines = map(str.strip, current_result_lines) + current_expected_lines = map(str.strip, current_expected_lines) + if options.get("ignore_case", False): + current_result_lines = map(str.lower, current_result_lines) + current_expected_lines = map(str.lower, current_expected_lines) + + # Check if this expected file matches + if list(current_result_lines) == list(current_expected_lines): + return 1.0 + + except (FileNotFoundError, IOError): + # If this expected file doesn't exist, continue to next one + continue + + # No match found + return 0.0 + + +def compare_table(result: str, expected: str = None, **options) -> float: + # function compare_table {{{ # + """ + Args: + result (str): path to result xlsx + expected (str): path to golden xlsx + rules (List[Dict[str, Any]]): list of dict like + { + "type": str, + : anything + } + as sequential rules + + Returns: + float: the score + """ + + if result is None: + 
logger.error("Result file path is None") + return 0.0 + + # Check if result file exists + if not os.path.exists(result): + logger.error(f"Result file not found: {result}") + return 0.0 + + try: + logger.info(f"Loading result file: {result}") + xlworkbookr: Workbook = openpyxl.load_workbook(filename=result) + pdworkbookr = pd.ExcelFile(result) + logger.info( + f"Successfully loaded result file with sheets: {pdworkbookr.sheet_names}" + ) + except Exception as e: + logger.error(f"Failed to load result file {result}: {e}") + return 0.0 + worksheetr_names: List[str] = pdworkbookr.sheet_names + + if expected is not None: + xlworkbooke: Workbook = openpyxl.load_workbook(filename=expected) + pdworkbooke = pd.ExcelFile(expected) + worksheete_names: List[str] = pdworkbooke.sheet_names + else: + xlworkbooke: Workbook = None + pdworkbooke = None + worksheete_names: List[str] = None + + parse_idx: Callable[[Union[str, int], BOOK, BOOK], Tuple[BOOK, str]] = ( + functools.partial( + _parse_sheet_idx, + result_sheet_names=worksheetr_names, + expected_sheet_names=worksheete_names, + ) + ) + + passes = True + for r in options["rules"]: + if r["type"] == "sheet_name": + # Compare Sheet Names {{{ # + metric: bool = worksheetr_names == worksheete_names + logger.debug( + "Assertion: %s.sheet_names == %s.sheet_names - %s", + result, + expected, + metric, + ) + # }}} Compare Sheet Names # + + elif r["type"] == "sheet_data": + # Compare Sheet Data by Internal Value {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # precision: int as number of decimal digits, default to 4 + + error_limit: int = r.get("precision", 4) + sheet1: pd.DataFrame = _load_sheet( + *parse_idx(r["sheet_idx0"], pdworkbookr, pdworkbooke) + ) + if sheet1 is None: + return 0.0 + sheet2: pd.DataFrame = _load_sheet( + *parse_idx(r["sheet_idx1"], pdworkbookr, pdworkbooke) + ) + + sheet1 = sheet1.round(error_limit) + sheet2 = sheet2.round(error_limit) + metric: bool = 
sheet1.equals(sheet2) + logger.debug("Sheet1: \n%s", str(sheet1)) + logger.debug("Sheet2: \n%s", str(sheet2)) + try: + logger.debug("Sheet1 =v= Sheet2: \n%s", str(sheet1 == sheet2)) + except: + logger.debug("Sheet1 =/v= Sheet2") + logger.debug( + "Assertion: %s =v= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric + ) + # }}} Compare Sheet Data by Internal Value # + + elif r["type"] == "sheet_print": + # Compare Sheet Data by Printed Value {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # ignore_case: optional, defaults to False + + sheet1: List[str] = _load_sheet( + *parse_idx(r["sheet_idx0"], result, expected) + ) + if sheet1 is None: + return 0.0 + sheet2: List[str] = _load_sheet( + *parse_idx(r["sheet_idx1"], result, expected) + ) + if r.get("ignore_case", False): + sheet1 = [l.lower() for l in sheet1] + sheet2 = [l.lower() for l in sheet2] + metric: bool = sheet1 == sheet2 + logger.debug( + "Assertion: %s =p= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric + ) + # }}} Compare Sheet Data by Printed Value # + + elif r["type"] == "sheet_fuzzy": + # Fuzzy Match for Ranges {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # rules: list of dict, each dict is like + # { "range": ["A1:B6", "C2:E5"], + # "type": "includes" | "included_by" | "fuzzy_match" | "exact_match", # 0 includes 1, 0 includes_by 1 + # "threshold": 85, // for fuzzy match + # "ignore_case": true | false, + # "ignore_chars": " ()", # filtered out + # "trim_leadings": "+ ", # filtered by lstrip + # "trim_trailings": "", # filtered by rstrip + # "normalization": [["Rd", "Road"]], # filtered by replace + # } + + sheet1: Tuple[BOOK, str] = parse_idx(r["sheet_idx0"], result, expected) + sheet2: Tuple[BOOK, str] = parse_idx(r["sheet_idx1"], result, expected) + total_metric = True + for rl in r["rules"]: + for rng in MultiCellRange(rl["range"]): + for cdn in rng.cells: + coordinate: str = 
"{:}{:d}".format( + get_column_letter(cdn[1]), cdn[0] + ) + value1: str = str(read_cell_value(*sheet1, coordinate)) + value2: str = str(read_cell_value(*sheet2, coordinate)) + logger.debug("%s: %s vs %s", cdn, value1, value2) + + for rplc in rl.get("normalization", []): + value1 = value1.replace(rplc[0], rplc[1]) + value2 = value2.replace(rplc[0], rplc[1]) + if "trim_leadings" in rl: + value1 = value1.lstrip(rl["trim_leadings"]) + value2 = value2.lstrip(rl["trim_leadings"]) + if "trim_trailings" in rl: + value1 = value1.rstrip(rl["trim_trailings"]) + value2 = value2.rstrip(rl["trim_trailings"]) + if "ignore_chars" in rl: + ignore_chars: Set[str] = set(rl["ignore_chars"]) + value1 = "".join( + filter(lambda ch: ch not in ignore_chars, value1) + ) + value2 = "".join( + filter(lambda ch: ch not in ignore_chars, value2) + ) + if rl.get("ignore_case", False): + value1 = value1.lower() + value2 = value2.lower() + + if rl["type"] == "includes": + metric: bool = value2 in value1 + elif rl["type"] == "included_by": + metric: bool = value1 in value2 + elif rl["type"] == "fuzzy_match": + metric: bool = fuzz.ratio(value1, value2) >= rl.get( + "threshold", 85.0 + ) + elif rl["type"] == "exact_match": + metric: bool = value1 == value2 + total_metric = total_metric and metric + + metric: bool = total_metric + logger.debug( + "Assertion: %s =~= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric + ) + # }}} Fuzzy Match for Ranges # + + elif r["type"] == "sparkline": + # Compare Sparklines {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + + sparkline1: Dict[str, str] = load_sparklines( + *parse_idx(r["sheet_idx0"], result, expected) + ) + sparkline2: Dict[str, str] = load_sparklines( + *parse_idx(r["sheet_idx1"], result, expected) + ) + metric: bool = sparkline1 == sparkline2 + logger.debug( + "Assertion: %s.sp == %.sp - %s", + r["sheet_idx0"], + r["sheet_idx1"], + metric, + ) + # }}} Compare Sparklines # + + elif r["type"] == 
"chart": + # Compare Charts {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # chart_props: list of str, see utils.load_charts + + charts1: Dict[str, Any] = load_charts( + *parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r + ) + charts2: Dict[str, Any] = load_charts( + *parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r + ) + metric: bool = charts1 == charts2 + logger.debug( + "Assertion: %s[chart] == %s[chart] - %s", + r["sheet_idx0"], + r["sheet_idx1"], + metric, + ) + # }}} Compare Charts # + + elif r["type"] == "style": + # Compare Style (Also Conditional Formatiing) {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # props: list of str indicating concerned styles, see utils._read_cell_style + + sheet_idx1: Tuple[BOOK, str] = parse_idx( + r["sheet_idx0"], xlworkbookr, xlworkbooke + ) + book_name1: str = parse_idx(r["sheet_idx0"], result, expected)[0] + styles1: Dict[str, List[Any]] = load_xlsx_styles( + *sheet_idx1, book_name1, **r + ) + + sheet_idx2: Tuple[BOOK, str] = parse_idx( + r["sheet_idx1"], xlworkbookr, xlworkbooke + ) + book_name2: str = parse_idx(r["sheet_idx1"], result, expected)[0] + styles2: Dict[str, List[Any]] = load_xlsx_styles( + *sheet_idx2, book_name2, **r + ) + # number_formats1: List[str] = [c.number_format.lower() for col in sheet1.iter_cols() for c in col if c.value is not None and c.data_type=="n"] + # number_formats2: List[str] = [c.number_format.lower() for col in sheet2.iter_cols() for c in col if c.value is not None and c.data_type=="n"] + metric: bool = styles1 == styles2 + logger.debug( + "Assertion: %s.style == %s.style - %s", + r["sheet_idx0"], + r["sheet_idx1"], + metric, + ) + # }}} Compare Style (Also Conditional Formatiing) # + + elif r["type"] == "freeze": + # Compare Freezing {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + + sheet1: Worksheet = 
_load_sheet( + *parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke) + ) + if sheet1 is None: + return 0.0 + sheet2: Worksheet = _load_sheet( + *parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke) + ) + metric: bool = sheet1.freeze_panes == sheet2.freeze_panes + logger.debug( + "Assertion: %s.freeze(%s) == %s.freeze(%s) - %s", + r["sheet_idx0"], + sheet1.freeze_panes, + r["sheet_idx1"], + sheet2.freeze_panes, + metric, + ) + # }}} Compare Freezing # + + elif r["type"] == "zoom": + # Check Zooming {{{ # + # sheet_idx: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # method: str + # ref: value + + sheet: Worksheet = _load_sheet( + *parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke) + ) + if sheet is None: + return 0.0 + zoom_scale: Number = sheet.sheet_view.zoomScale or 100.0 + metric: bool = _match_value_to_rule(zoom_scale, r) + logger.debug( + "Assertion: %s.zoom(%.1f) %s %.1f - %s", + r["sheet_idx"], + zoom_scale, + r["method"], + r["ref"], + metric, + ) + # }}} Check Zooming # + + elif r["type"] == "data_validation": + # Check Data Validation {{{ # + # sheet_idx: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # dv_props: list of dict like {attribute: {"method": str, "ref": anything}} + # available attributes: + # * ranges + # * type + # * formula1 + # * formula2 + # * operator + # * allowBlank + # * showDropDown + # * showInputMessage + # * showErrorMessage + # * error + # * errorTitle + # * errorStyle + # * prompt + # * promptTitle + # * imeMode + + sheet: Worksheet = _load_sheet( + *parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke) + ) + if sheet is None: + return 0.0 + data_validators: List[DataValidation] = ( + sheet.data_validations.dataValidation + ) + + total_metric = len(data_validators) >= len(r["dv_props"]) + for dat_vldt in data_validators: + metric = False + for prpt in r["dv_props"]: + metric = metric or all( + _match_value_to_rule(getattr(dat_vldt, attrbt), mr) + for attrbt, mr in prpt.items() + ) + if metric: + break + total_metric = 
total_metric and metric + if not total_metric: + break + + logger.debug( + "Assertion: %s.data_validation - %s", r["sheet_idx"], total_metric + ) + metric: bool = total_metric + # }}} Check Data Validation # + + elif r["type"] == "row_props": + # Check Row Properties {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # props: list of str, see utils.load_rows_or_cols + + rows1: Dict[str, Any] = load_rows_or_cols( + *parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), obj="row", **r + ) + rows2: Dict[str, Any] = load_rows_or_cols( + *parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), obj="row", **r + ) + logger.debug("Rows1: %s", repr(rows1)) + logger.debug("Rows2: %s", repr(rows2)) + metric: bool = rows1 == rows2 + logger.debug( + "Assertion: %s[rows] == %s[rows] - %s", + r["sheet_idx0"], + r["sheet_idx1"], + metric, + ) + # }}} Check Row Properties # + + elif r["type"] == "col_props": + # Check Row Properties {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # props: list of str, see utils.load_rows_or_cols + + cols1: Dict[str, Any] = load_rows_or_cols( + *parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), obj="column", **r + ) + cols2: Dict[str, Any] = load_rows_or_cols( + *parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), obj="column", **r + ) + metric: bool = cols1 == cols2 + logger.debug( + "Assertion: %s[cols] == %s[cols] - %s", + r["sheet_idx0"], + r["sheet_idx1"], + metric, + ) + # }}} Check Row Properties # + + elif r["type"] == "filter": + # Compare Filters {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + + filters1: Dict[str, Any] = load_filters( + *parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r + ) + filters2: Dict[str, Any] = load_filters( + *parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r + ) + metric: bool = filters1 == filters2 + logger.debug( + "Assertion: %s[filter] 
== %s[filter] - %s", + r["sheet_idx0"], + r["sheet_idx1"], + metric, + ) + # }}} Compare Filters # + + elif r["type"] == "pivot_table": + # Compare Pivot Tables {{{ # + # sheet_idx0: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # sheet_idx1: as sheet_idx0 + # pivot_props: list of str, see utils.load_pivot_tables + + pivots1: Dict[str, Any] = load_pivot_tables( + *parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r + ) + pivots2: Dict[str, Any] = load_pivot_tables( + *parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r + ) + metric: bool = pivots1 == pivots2 + logger.debug( + "Assertion: %s[pivot]==%s[pivot] - %s", + r["sheet_idx0"], + r["sheet_idx1"], + metric, + ) + # }}} Compare Pivot Tables # + + elif r["type"] == "check_cell": + # Check Cell Properties {{{ # + # sheet_idx: 0 == "RI0" == "RNSheet1" | "EI0" == "ENSheet1" + # coordinate: str, "E3" + # props: dict like {attribute: {"method": str, "ref": anything}} + # supported attributes: value & those supported by utils._read_cell_style + + try: + sheet: Worksheet = _load_sheet( + *parse_idx(r["sheet_idx"], xlworkbookr, xlworkbooke) + ) + if sheet is None: + logger.error( + f"Failed to load sheet for sheet_idx: {r['sheet_idx']}" + ) + return 0.0 + # data_frame: pd.DataFrame = _load_sheet(*parse_idx(r["sheet_idx"], pdworkbookr, pdworkbooke)) + cell: Cell = sheet[r["coordinate"]] + metric: bool = True + for prpt, rule in r["props"].items(): + if prpt == "value": + try: + parsed_result = parse_idx(r["sheet_idx"], result, expected) + logger.debug(f"parse_idx result: {parsed_result}") + val = read_cell_value(*parsed_result, r["coordinate"]) + logger.debug(f"Cell {r['coordinate']} value: {val}") + except Exception as e: + logger.error( + f"Failed to read cell value at {r['coordinate']}: {e}" + ) + val = None + else: + try: + val = _read_cell_style(prpt, cell) + except Exception as e: + logger.error( + f"Failed to read cell style {prpt} at {r['coordinate']}: {e}" + ) + val = None + + metric = metric and 
_match_value_to_rule(val, rule) + except Exception as e: + logger.error(f"Error in check_cell processing: {e}") + return 0.0 + + logger.debug( + "Assertion: %s[%s] :%s - %s", + r["sheet_idx"], + r["coordinate"], + repr(r["props"]), + metric, + ) + # }}} Check Cell Properties # + + else: + raise NotImplementedError( + "Unimplemented sheet check: {:}".format(r["type"]) + ) + + passes = passes and metric + if not passes: + break + + return float(passes) + # }}} function compare_table # + + +def _normalize_city_string(value: Any) -> str: + """Lowercase, strip punctuation, and remove accents for tolerant matching.""" + if value is None: + return "" + if not isinstance(value, str): + value = str(value) + normalized = unicodedata.normalize("NFKD", value) + normalized = "".join(ch for ch in normalized if not unicodedata.combining(ch)) + normalized = re.sub(r"[^a-z0-9]+", " ", normalized.lower()) + return normalized.strip() + + +def compare_conference_city_in_order(actual_city_list_path, expected_city): + expected_city_list = expected_city["expected"] + wb = openpyxl.load_workbook(actual_city_list_path) + sheet = wb.active + actual_city_list = [] + for row in sheet["C2:C22"]: + for cell in row: + actual_city_list.append(cell.value) + + try: + for i, actual_city in enumerate(actual_city_list): + actual_normalized = _normalize_city_string(actual_city) + expected_entry = expected_city_list[i] + + if isinstance(expected_entry, str): + expected_candidates = [expected_entry] + elif isinstance(expected_entry, List): + expected_candidates = expected_entry + else: + raise TypeError("Expected city should be a string or a list of strings") + + matched = False + for candidate in expected_candidates: + normalized_candidate = _normalize_city_string(candidate) + if normalized_candidate and normalized_candidate in actual_normalized: + matched = True + break + + if not matched: + logger.debug( + f"Expected city {expected_entry}; Actual city {actual_city}" + ) + print(f"Expected city 
import json
import re
from typing import Any, Callable, Dict, Match, Pattern

# NOTE(review): this module additionally relies on the project-local imports
# `from .utils import _match_record`, `from .utils import _match_value_to_rule
# as _match_pref` and `from openhands.core.logger import openhands_logger as
# logger`; they must remain at the top of the file.

# Matches one Thunderbird preference line, e.g.
#   user_pref("mail.smtpserver.default", "smtp1");
# "key" captures the pref name; "val" captures the raw value, which is a
# JSON-compatible literal (string / number / true / false).
_pref_pattern: Pattern[str] = re.compile(
    r'^user_pref\("(?P<key>(?:[^"]|\\")+)\", (?P<val>.+)\);$'
)


def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any]]]) -> float:
    """Score a captured Thunderbird prefs file against expect/unexpect rules.

    Args:
        result (str): path to the captured prefs file, or None if capture failed
        rule (Dict[str, Dict[str, Dict[str, Any]]]): dict with optional
            "expect" and "unexpect" sub-dicts, each mapping a pref name to a
            {"method": str, "ref": anything} rule that is evaluated by
            utils._match_value_to_rule

    Returns:
        float: 1.0 iff every "expect" rule matched and no "unexpect" rule
        matched; 0.0 otherwise
    """

    if result is None:
        return 0.

    expect_rules = rule.get("expect", {})
    unexpect_rules = rule.get("unexpect", {})

    expect_metrics = {k: False for k in expect_rules}
    unexpect_metric = True
    with open(result) as f:
        for l in f:
            match_: Match[str] = _pref_pattern.match(l.strip())
            if match_ is None:
                continue

            key: str = match_.group("key")
            # Pref values are JSON literals, so json.loads converts
            # true/false/numbers/strings into the right Python values.
            value = json.loads(match_.group("val"))
            if key in expect_rules:
                logger.debug("K: %s, V: %s", key, repr(value))
                expect_metrics[key] = _match_pref(value, expect_rules[key])
            elif key in unexpect_rules:
                unexpect_metric = unexpect_metric and not _match_pref(value, unexpect_rules[key])

    return float(all(expect_metrics.values()) and unexpect_metric)


# Unescapes backslash-escaped quotes and backslashes in msgFilterRules values.
_value_processor: Callable[[str], str] = lambda val: val.replace("\\\"", "\"").replace("\\\\", "\\")
# Matches a single filter condition clause such as `AND (subject,contains,"x")`
# or the literal keyword ALL.
_condition_pattern: Pattern[str] = re.compile(
    r'\b(?:AND|OR) \((?:[\w ]+),(?:[\w ' + '\'' + r']+),(?:"(?:(?:[^"]|\")+)"|(?:[^)]+))\)|\bALL\b')
def check_thunderbird_folder(result: Union[str, List[str]], reference: Union[str, List[str]], **kwargs) -> float:
    """Compare dumped Thunderbird folder file(s) against reference dump(s).

    Each dump is a text file containing all messages of one folder; every
    message starts with `FROM - `.

    Args:
        result: path (or list of paths) to the dumped folder file(s);
            None means the dump failed
        reference: path (or list of paths) to the gold folder file(s)

    Keyword Args:
        ignore_status (bool): ignore the X-Mozilla-Status lines of each
            message when comparing. default: False
        ignore_keys (bool): ignore the X-Mozilla-Keys lines of each message
            when comparing. default: False
        remove_deleted (bool): drop messages whose status code is 0008 or
            0009 (deleted). default: True
        remove_duplicate (bool): de-duplicate messages. default: True

    Returns:
        float: 1.0 iff every result file equals its reference after
        normalization; 0.0 otherwise
    """

    def normalize_msg(msg, options):
        # Strip the compared-out headers according to the options.
        ignore_status = options.get('ignore_status', False)
        ignore_keys = options.get('ignore_keys', False)
        if ignore_status:
            msg = re.sub(r'X-Mozilla-Status\d?:[\s\d]+', '', msg)
        if ignore_keys:
            msg = re.sub(r'(X-Mozilla-Keys:[^\n]*?)\n(MIME-Version)', r'\2', msg)
        return msg.strip()

    def read_thunderbird_folder_file(path: str) -> str:
        # Read one dump and return a canonical, order-independent rendering.
        with open(path, 'r') as inf:
            data = inf.read().strip()
        messages = []
        for mail in data.split('FROM - '):
            # BUGFIX: skip only *empty* fragments. The original condition
            # (`if mail.strip(): continue`) skipped every non-empty message,
            # so all folders normalized to "" and always compared equal.
            if not mail.strip():
                continue
            if kwargs.get('remove_deleted', True) and re.search(r'X-Mozilla-Status: 000[89]', mail):
                continue
            messages.append('FROM - ' + normalize_msg(mail, kwargs))
        if kwargs.get('remove_duplicate', True):
            messages = set(messages)
        return '\n'.join(sorted(messages))

    # Allow a single path for both arguments.
    if not isinstance(reference, list):
        result, reference = [result], [reference]
    for pred, gold in zip(result, reference):
        if pred is None:
            return .0
        if read_thunderbird_folder_file(pred) != read_thunderbird_folder_file(gold):
            return .0
    return 1.0
import CacheSource as PivotCacheSource +from openpyxl.pivot.table import TableDefinition as PivotTableDefinition +from openpyxl.styles.differential import DifferentialStyle +from openpyxl.utils import coordinate_to_tuple, get_column_letter +from openpyxl.worksheet.cell_range import MultiCellRange, CellRange +from openpyxl.worksheet.dimensions import DimensionHolder +from openpyxl.worksheet.filters import AutoFilter, SortState +from openpyxl.worksheet.worksheet import Worksheet +import tldextract + +from openhands.core.logger import openhands_logger as logger + +V = TypeVar("Value") + +_xlsx_namespaces = [ + ("oo", "http://schemas.openxmlformats.org/spreadsheetml/2006/main"), + ("x14", "http://schemas.microsoft.com/office/spreadsheetml/2009/9/main"), + ("xm", "http://schemas.microsoft.com/office/excel/2006/main") +] +_xlsx_ns_mapping = dict(_xlsx_namespaces) +_xlsx_ns_imapping = dict(map(lambda itm: (itm[1], itm[0]), _xlsx_namespaces)) +_xlsx_ns_imapping["http://schemas.openxmlformats.org/spreadsheetml/2006/main"] = None +_sheet_name_selector = lxml.cssselect.CSSSelector("oo|sheets>oo|sheet", namespaces=_xlsx_ns_mapping) +_sparklines_selector = lxml.cssselect.CSSSelector("x14|sparkline", namespaces=_xlsx_ns_mapping) + + +def load_sparklines(xlsx_file: str, sheet_name: str) -> Dict[str, str]: + # function load_sparklines {{{ # + """ + Args: + xlsx_file (str): path to xlsx + sheet_name (str): sheet name + + Returns: + List[Dict[str, str]]: sparkline definitions in form of + { + "F3": "Sheet1!C3:E3" + } + """ + + # read xlsx + try: + with zipfile.ZipFile(xlsx_file, "r") as z_f: + with z_f.open("xl/workbook.xml") as f: + workbook_database: _Element = lxml.etree.fromstring(f.read()) + sheets: List[_Element] = _sheet_name_selector(workbook_database) + sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets} + with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f: + sheet: _Element = lxml.etree.fromstring(f.read()) + 
sparklines: List[_Element] = _sparklines_selector(sheet) + except zipfile.BadZipFile: + return {} + + sparklines_dict: Dict[str, str] = {} + for sp_l in sparklines: + sparkline_xml: str = lxml.etree.tostring(sp_l, encoding="unicode") + sparkline: Dict[str, Dict[str, str]] = xmltodict.parse(sparkline_xml + , process_namespaces=True + , namespaces=_xlsx_ns_imapping + ) + sparklines_dict[sparkline["x14:sparkline"]["xm:sqref"]] = sparkline["x14:sparkline"]["xm:f"] + return sparklines_dict + # }}} function load_sparklines # + + +# Available Chart Properties: +# title: str +# anchor: ["oneCell" | "twoCell" | "absolute", col0, row0, col1, row1] +# legend: "b" | "tr" | "l" | "r" | "t" +# width: number +# height: number +# type: "scatterChart" | "lineChart" | "barChart" +# direction: "bar" (hori) | "col" (vert) +# xtitle, ytitle, ztitle: str +def load_charts(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]: + # function load_charts {{{ # + """ + Args: + xlsx_file (Workbook): concerned excel book + sheet_name (str): sheet name + options (Dict[str, List[str]]): dict like {"chart_props": list of str} + giving the concerned chart properties + + Returns: + Dict[str, Any]: information of charts, dict like + { + : { + : anything + } + } + """ + + # workbook: Workbook = openpyxl.load_workbook(filename=xlsx_file) + try: + worksheet: Worksheet = xlsx_file[sheet_name] + except KeyError: + return {} + charts: List[ChartBase] = worksheet._charts + + chart_set: Dict[str, Any] = {} + chart_props: Set[str] = set(options["chart_props"]) if "chart_props" in options else set() + for ch in charts: + series: List[str] = [] + for ser in ch.series: + if hasattr(ser.val, "numRef") and hasattr(ser.val.numRef, "f"): + value_str: str = ser.val.numRef.f + elif hasattr(ser.val, "strRef") and hasattr(ser.val.strRef, "f"): + value_str: str = ser.val.strRef.f + else: + value_str: str = "" + if hasattr(ser.cat, "numRef") and hasattr(ser.cat.numRef, "f"): + categ_str: str = 
ser.cat.numRef.f + elif hasattr(ser.cat, "strRef") and hasattr(ser.cat.strRef, "f"): + categ_str: str = ser.cat.strRef.f + else: + categ_str: str = "" + series.append("{:},{:}".format(value_str, categ_str)) + series: str = ";".join(series) + + # TODO: maybe more aspects, like chart type + info: Dict[str, Any] = {} + + if "title" in chart_props: + try: + info["title"] = ch.title.tx.rich.p[0].r[0].t + except: + info["title"] = None + if "legend" in chart_props: + info["legend"] = ch.legend.position if ch.legend is not None else None + if "anchor" in chart_props: + info["anchor"] = [ch.anchor.editAs + , ch.anchor._from.col, ch.anchor.to.row + , ch.anchor.to.col, ch.anchor.to.row + ] + if "width" in chart_props: + info["width"] = ch.width + if "height" in chart_props: + info["height"] = ch.height + if "type" in chart_props: + info["type"] = ch.tagname + if "direction" in chart_props: + info["direction"] = ch.barDir + + if "xtitle" in chart_props: + try: + info["xtitle"] = ch.x_axis.title.tx.rich.p[0].r[0].t + except: + info["xtitle"] = None + if "ytitle" in chart_props: + try: + info["ytitle"] = ch.y_axis.title.tx.rich.p[0].r[0].t + except: + info["ytitle"] = None + if "ztitle" in chart_props: + try: + info["ztitle"] = ch.z_axis.title.tx.rich.p[0].r[0].t + except: + info["ztitle"] = None + chart_set[series] = info + logger.debug(".[%s].charts: %s", sheet_name, repr(chart_set)) + return chart_set + # }}} function load_charts # + + +# Available Pivot Properties: +# name: str +# show_total, show_empty_row, show_empty_col, show_headers: bool +# location: str +# selection: if the concrete item selection should be checked, a list of set of tuple like (bool, index) will be returned; list will be returned instead of set if "ordered" is specified +# filter: if the filter fields should be checked; fields indices will be return in `filter_fields` item +# col_fields: indices +# row_fields: indices +# data_fields: list of str representations. 
the str representation is like "index;name;subtotal_type;show_data_as"; name is optional and is only returned when `data_fields_name` is specified in `pivot_props` +def load_pivot_tables(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]: + # function load_pivot_tables {{{ # + """ + Args: + xlsx_file (Workbook): concerned excel book + sheet_name (str): sheet name + options (Dict[str, List[str]]): dict like {"pivot_props": list of str} + giving the concerned pivot properties + + Returns: + Dict[str, Any]: information of pivot tables, dict like + { + : { + : anything + } + } + """ + + try: + worksheet: Worksheet = xlsx_file[sheet_name] + except KeyError: + return {} + pivots: List[PivotTableDefinition] = worksheet._pivots + + pivot_set: Dict[str, Any] = {} + pivot_props: Set[str] = set(options.get("pivot_props", [])) + for pvt in pivots: + raw_selection: List[List[tuple[Optional[bool], int]]] = \ + [[(itm.h, itm.x) for itm in f.items if itm.x is not None] \ + for f in pvt.pivotFields + ] + raw__selection: List[List[tuple[Optional[bool], int]]] = list( + itertools.dropwhile(lambda r: len(r) == 0, raw_selection)) + left_bias = len(raw_selection) - len(raw__selection) + selection: List[List[tuple[Optional[bool], int]]] = list( + (itertools.dropwhile(lambda r: len(r) == 0, reversed(raw__selection))))[::-1] + right_bias = len(raw__selection) - len(selection) + cache_source: PivotCacheSource = pvt.cache.cacheSource + cell_range1: str + cell_range2: str + cell_range1, cell_range2 = cache_source.worksheetSource.ref.split(":") + cell_range1: Tuple[int, int] = coordinate_to_tuple(cell_range1) + cell_range1 = (cell_range1[0], cell_range1[1] + left_bias) + cell_range2: Tuple[int, int] = coordinate_to_tuple(cell_range2) + cell_range2 = (cell_range2[0], cell_range2[1] - right_bias) + source: str = "{:};{:}:{:};{:}".format(cache_source.type, cell_range1, cell_range2, + cache_source.worksheetSource.sheet) + + info: Dict[str, Any] = {} + if "name" in pivot_props: + 
info["name"] = pvt.name + + if "show_total" in pivot_props: + info["show_total"] = pvt.visualTotals + if "show_empty_row" in pivot_props: + info["show_empty_row"] = pvt.showEmptyRow + if "show_empty_col" in pivot_props: + info["show_empty_col"] = pvt.showEmptyCol + if "show_headers" in pivot_props: + info["show_headers"] = pvt.showHeaders + + if "location" in pivot_props: + info["location"] = pvt.location + if "filter" in pivot_props or "selection" in pivot_props: + info["selection"] = selection if "ordered" in pivot_props else list(set(r) for r in selection) + if "filter" in pivot_props: + info["filter_fields"] = set(f.fld for f in pvt.pageFields) + if "col_fields" in pivot_props: + info["col_fields"] = [f.x - left_bias for f in pvt.colFields] + if "row_fields" in pivot_props: + info["row_fields"] = [f.x - left_bias for f in pvt.rowFields] + if "data_fields" in pivot_props: + info["data_fields"] = [ + "{:d};{:};{:};{:}".format(f.fld - left_bias, f.name if "data_fields_name" in pivot_props else "" + , f.subtotal, f.showDataAs + ) \ + for f in pvt.dataFields + ] + + pivot_set[source] = info + logger.debug(".[%s].pivots: %s", sheet_name, repr(pivot_set)) + return pivot_set + # }}} function load_pivot_tables # + + +_shared_str_selector = lxml.cssselect.CSSSelector("oo|sst>oo|si", namespaces=_xlsx_ns_mapping) +_shared_str_value_selector = lxml.cssselect.CSSSelector("oo|t", namespaces=_xlsx_ns_mapping) + + +def read_cell_value(xlsx_file: str, sheet_name: str, coordinate: str) -> Any: + # read_cell_value {{{ # + logger.debug(f"Reading cell value from {xlsx_file}, sheet: {sheet_name}, coordinate: {coordinate}") + + # Check if file exists + if not os.path.exists(xlsx_file): + logger.error(f"Excel file not found: {xlsx_file}") + return None + + try: + with zipfile.ZipFile(xlsx_file, "r") as z_f: + try: + with z_f.open("xl/sharedStrings.xml") as f: + shared_str_xml: _Element = lxml.etree.fromstring(f.read()) + str_elements: List[_Element] = 
_shared_str_selector(shared_str_xml) + shared_strs: List[str] = [ "".join(t.text for t in _shared_str_value_selector(elm))\ + for elm in str_elements + ] + except: + #logger.exception("Read shared strings error: %s", xlsx_file) + logger.debug("Read shared strings error: %s", xlsx_file) + shared_strs: List[str] = [] + + with z_f.open("xl/workbook.xml") as f: + workbook_database: _Element = lxml.etree.fromstring(f.read()) + sheets: List[_Element] = _sheet_name_selector(workbook_database) + sheet_names: Dict[str, str] = {sh.get("name"): sh.get("sheetId") for sh in sheets} + + with z_f.open("xl/worksheets/sheet{:}.xml".format(sheet_names[sheet_name])) as f: + sheet: _Element = lxml.etree.fromstring(f.read()) + cells: List[_Element] = \ + lxml.cssselect.CSSSelector('oo|row>oo|c[r="{:}"]'.format(coordinate) + , namespaces=_xlsx_ns_mapping + )(sheet) + if len(cells) == 0: + logger.debug(f"Cell {coordinate} not found in sheet {sheet_name}") + return None + cell: _Element = cells[0] + except zipfile.BadZipFile as e: + logger.error(f"Bad zip file {xlsx_file}: {e}") + return None + except KeyError as e: + logger.error(f"Sheet {sheet_name} not found in {xlsx_file}: {e}") + return None + except Exception as e: + logger.error(f"Error reading {xlsx_file}: {e}") + return None + + cell: Dict[str, str] = xmltodict.parse(lxml.etree.tostring(cell, encoding="unicode") + , process_namespaces=True + , namespaces=_xlsx_ns_imapping + ) + logger.debug("%s.shared_strings: %s", xlsx_file, repr(shared_strs)) + logger.debug("%s.%s[%s]: %s", xlsx_file, sheet_name, coordinate, repr(cell)) + try: + if "@t" not in cell["c"] or cell["c"]["@t"] == "n": + return float(cell["c"]["v"]) + if cell["c"]["@t"] == "s": + return shared_strs[int(cell["c"]["v"])] + if cell["c"]["@t"] == "str": + return cell["c"]["v"] + if cell["c"]["@t"] == "inlineStr": + return cell["c"]["is"]["t"] + if cell["c"]["@t"] == "e": + return cell["c"]["v"] + except (KeyError, ValueError): + return None + # }}} read_cell_value # + + 
+# Supported Styles: +# number_format +# font_name - str +# font_family - float +# font_color - in aRGB, e.g., FF000000 is black +# font_bold - bool +# font_italic - bool +# font_underline - "single" | "double" | "singleAccounting" | "doubleAccounting" +# font_size - float +# fill_type - "patternFill" | "gradientFill" +# bgcolor - in aRGB, e.g., FFFF0000 is red; This property seems to be ambiguous with fgcolor in xlsx, strange +# fgcolor - in aRGB, e.g., FF00FFFF is yellow # Deprecated +# hyperlink - str +# merge - bool, if the cell is in a merged range and is not the first cell in the merged range +def _read_cell_style(style_name: str, cell: Union[Cell, MergedCell], diff_style: Optional[DifferentialStyle] = None) -> Any: + if style_name == "number_format": + return (cell.number_format if diff_style is None else diff_style.numFmt.formatCode) \ + if cell.value is not None and cell.data_type == "n" else None + elif style_name == "font_name": + return (diff_style or cell).font.name if cell.value is not None else None + elif style_name == "font_family": + return (diff_style or cell).font.family if cell.value is not None else None + elif style_name == "font_color": + return (diff_style or cell).font.color.rgb if cell.value is not None else None + elif style_name == "font_bold": + return (diff_style or cell).font.bold if cell.value is not None else None + elif style_name == "font_italic": + return (diff_style or cell).font.italic if cell.value is not None else None + elif style_name == "font_underline": + return (diff_style or cell).font.underline if cell.value is not None else None + elif style_name == "font_size": + return (diff_style or cell).font.size if cell.value is not None else None + elif style_name == "fill_type": + try: + return (diff_style or cell).fill.tagname + except: + return None + elif style_name == "bgcolor" or style_name == "fgcolor": + try: + #return (diff_style or cell).fill.bgColor.rgb + if diff_style is not None: + return 
def _process_xlsx_cf_operator(operator: str, value: Any, ref: List[Any]) -> bool:
    # function _process_xlsx_cf_operator {{{ #
    """Evaluate an xlsx "cellIs" conditional-formatting operator.

    Args:
        operator (str): operator name, e.g. "lessThan", "equal", "between"
        value (Any): the cell value under test
        ref (List[Any]): reference value(s): one element for the binary
            operators, two for "between"/"notBetween"

    Returns:
        bool: True iff `value` satisfies `operator` w.r.t. `ref`; False on a
        type mismatch or when `ref` lacks enough elements
    """
    # Restored comparison chain: the `<`-containing branches were eaten by
    # HTML-style sanitization in the previous revision.
    try:
        if operator == "lessThanOrEqual":
            result: bool = value <= ref[0]
        elif operator == "lessThan":
            result: bool = value < ref[0]
        elif operator == "equal":
            result: bool = value == ref[0]
        elif operator == "greaterThanOrEqual":
            result: bool = value >= ref[0]
        elif operator == "notEqual":
            result: bool = value != ref[0]
        elif operator == "greaterThan":
            result: bool = value > ref[0]
        elif operator == "between":
            small_one, large_one = min(ref), max(ref)
            result: bool = small_one <= value <= large_one
        elif operator == "notBetween":
            small_one, large_one = min(ref), max(ref)
            result: bool = value < small_one or value > large_one
        else:
            # Text operators (containsText, beginsWith, ...) are evaluated via
            # formulae elsewhere; anything reaching here is unsupported and
            # `result` stays unbound (original behaviour preserved).
            logger.exception("Not Implemented CondFormat Operator: {:}".format(operator))
        return result
    except TypeError:
        logger.exception("Unmatched type of %s and %s. Auto to False", repr(value), repr(ref))
        return False
    except IndexError:
        logger.exception("ref array doesn't have enough elements. Auto to False: %s", repr(ref))
        return False
    # }}} function _process_xlsx_cf_operator #


# Absolute A1-style reference, e.g. "$B$3" or "$B$3:$D$10". Verbose regex;
# named groups col1/row1 (start) and optional col2/row2 (range end) were
# restored after being stripped by sanitization.
_absolute_range_pattern: Pattern[str] = re.compile(
    r"""\$(?P<col1>[A-Z]{1,3})\$(?P<row1>\d+)   # coord1
        (?::
            \$(?P<col2>[A-Z]{1,3})\$(?P<row2>\d+)   # coord2
        )?
    """,
    re.X,
)
Skipping.", repr(fml)) + has_error = True + break + + arguments: List[Any] = [] + absolute_range_match: List[Tuple[str, str, str, str]] = _absolute_range_pattern.findall(fml) + for m in absolute_range_match: + logger.debug("Absolute ranges: %s", repr(m)) + if m[2] is None and m[3] is None: + arguments.append(read_cell_value(book_name, sheet_name, coordinate="{:}{:}".format(m[0], m[1]))) + else: + arguments.append([read_cell_value(book_name, sheet_name + , coordinate="{:}{:}".format(get_column_letter(c[1]) + , c[0] + ) + ) \ + for c in CellRange("{:}{:}:{:}{:}".format(m[0], m[1], m[2], m[3])).cells \ + ] + ) + logger.debug("Absolute range arguments: %s", repr(arguments)) + + formulae.append(formula_func) + argument_lists.append(arguments) + + if has_error: + continue + # }}} Process CF Formulae # + + # Process Condition Accroding to Type {{{ # + if r.type in { "expression" + , "containsText", "notContainsText" + , "endsWith", "beginsWith" + , "containsErrors", "notContainsErrors" + }: + condition: Callable[[Any], bool] = formulae[0] + arguments: List[Any] = argument_lists[0] + is_active: Callable[[Any], bool] = lambda v: condition(v, *arguments) + elif r.type == "cellIs": + operator: str = r.operator + try: + references: List[Any] = [fml() for fml in formulae] + except: + logger.exception("Error occurs while calculating reference values for cellIs condition formatting.") + continue + is_active: Callable[[Any], bool] =\ + lambda v: _process_xlsx_cf_operator(operator, v, references) + else: + #raise NotImplementedError("Not Implemented Condition Type: {:}".format(r.type)) + # e.g., type=top10 (rank=number, percent=bool, bottom=bool) + # type=aboveAverage (equalAverage=bool, aboveAverage=bool) + # type=duplicateValues / type=uniqueValues + logger.exception("Not Implemented Condition Type: {:}".format(r.type)) + # }}} Process Condition Accroding to Type # + + + # Test Each Cell {{{ # + nb_contiguous_nothings = 0 + for rge in fmt.cells: + for c in rge.cells: + cell: Cell 
= worksheet.cell(row=c[0], column=c[1]) + cell_value = read_cell_value(book_name, sheet_name + , coordinate="{:}{:d}".format(get_column_letter(c[1]) + , c[0] + ) + ) + if cell_value is None: + nb_contiguous_nothings += 1 + if nb_contiguous_nothings>50: + break + continue + else: + try: + satisfies_condition: bool = is_active(cell_value) + except: + logger.exception("Error in formula calculation with cell value %d", repr(cell_value)) + satisfies_condition = False + if satisfies_condition: + logger.debug("Active Cell %s(%s) for %s", repr(cell), repr(cell_value), r.formula[0]) + active_cells.append(cell) + # }}} Test Each Cell # + + for c in active_cells: + style_dict[c.coordinate] = [_read_cell_style(st, c, r.dxf) for st in concerned_styles] + + logger.debug(".[%s].styles: %s", sheet_name, repr(style_dict)) + return style_dict + # }}} function load_xlsx_styles # + + +# Available Row Properties: +# hidden +# collapsed +# height +# +# Available Column Properties: +# width +# auto_size +# hidden +# collapsed +# min +# max +def load_rows_or_cols(xlsx_file: Workbook, sheet_name: str, **options) \ + -> Dict[Union[int, str], Dict[str, Any]]: + # function load_rows_or_cols {{{ # + """ + Args: + xlsx_file (Workbook): concerned excel book + sheet_name (str): sheet name + options (Dict[str, List[str]]): dict like + {"obj": "row" | "column", "props": list of str} giving the concerned + row/column properties + + Returns: + Dict[Union[int, str], Dict[str, Any]]: row/column information + """ + + try: + worksheet: Worksheet = xlsx_file[sheet_name] + except KeyError: + return {} + objs: DimensionHolder = getattr(worksheet, "{:}_dimensions".format(options["obj"])) + + obj_set: Dict[int, Any] = {} + obj_props: Set[str] = set(options.get("props", [])) + for obj_no, obj_dms in objs.items(): + info_dict: Dict[str, Any] = {} + for prop in obj_props: + info_dict[prop] = getattr(obj_dms, prop) + obj_set[obj_no] = info_dict + return obj_set + # }}} function load_rows_or_cols # + + +def 
load_filters(xlsx_file: Workbook, sheet_name: str, **options) -> Dict[str, Any]: + # function load_filters {{{ # + try: + worksheet: Worksheet = xlsx_file[sheet_name] + except KeyError: + return {} + + filters: AutoFilter = worksheet.auto_filter + filter_dict: Dict[str, Any] = {} + filter_dict["ref"] = filters.ref + + # filterColumn + filter_column_set: List[Dict[str, Any]] = [] + for flt_clm in filters.filterColumn: + filter_column: Dict[str, Any] = {} + filter_column["col_id"] = flt_clm.colId + filter_column["hidden_button"] = flt_clm.hiddenButton + filter_column["show_button"] = flt_clm.showButton + if flt_clm.filters is not None: + filter_column["filters_blank"] = flt_clm.filters.blank + filter_column["filters"] = set(flt_clm.filters.filter) + if flt_clm.customFilters is not None: + filter_column["custom_filters_op"] = flt_clm.customFilters._and + filter_column["custom_filters"] = set((flt.operator + , flt.val + ) \ + for flt in flt_clm.customFilters.customFilter + ) + filter_column_set.append(filter_column) + filter_column_set = list(sorted(filter_column_set + , key=(lambda d: d["col_id"]) + ) + ) + filter_dict["filter_column"] = filter_column_set + + # sortState + sort_state: Optional[SortState] = filters.sortState + if sort_state is not None: + sort_state_dict: Dict[str, Any] = {} + sort_state_dict["sort"] = sort_state.columnSort + sort_state_dict["case"] = sort_state.caseSensitive + sort_state_dict["method"] = sort_state.sortMethod + sort_state_dict["ref"] = sort_state.ref + sort_state_dict["condition"] = list({"descending": cdt.descending + , "key": cdt.sortBy + , "ref": cdt.ref + , "custom_list": cdt.customList + , "dxf_id": cdt.dxfId + , "icon": cdt.iconSet + , "iconid": cdt.iconId + } \ + for cdt in sort_state.sortCondition + ) + filter_dict["sort_state"] = sort_state_dict + + return filter_dict + # }}} function load_filters # + + +def _match_record(pattern: Dict[str, Any], item: Dict[str, Any]) -> bool: + return all(k in item and item[k] == val for k, 
val in pattern.items()) + + +def _multicellrange_containsby(subset_candidate: MultiCellRange, superset_candidate: MultiCellRange) -> bool: + return all(r in superset_candidate for r in subset_candidate) + + +def _match_value_to_rule(value: V, rule: Dict[str, Union[str, V]]) -> bool: + """ + Args: + value (V): value to match + rule (Dict[str, Union[str, V]]): rule dict like + { + "method": str + "ref": V as ref value + } + + Returns: + bool + """ + + if rule["method"].startswith("re"): # re.FLAGs + flags: List[str] = rule["method"].split(".")[1:] + flags: Iterable[re.RegexFlag] = (getattr(re, fl) for fl in flags) + flag: re.RegexFlag = functools.reduce(operator.or_, flags, re.RegexFlag(0)) + logger.debug("REFLAG: %s", repr(flag)) + + match_: Optional[Match[str]] = re.search(rule["ref"], value, flag) + return match_ is not None + if rule["method"] in {"eq", "ne" + , "le", "lt" + , "ge", "gt" + }: + return getattr(operator, rule["method"])(value, rule["ref"]) + if rule["method"].startswith("approx"): # approx:THRESHOLD + threshold: float = float(rule["method"].split(":")[1]) + logger.debug("Approx: TH%f, REF%f, VAL%s", threshold, rule["ref"], repr(value)) + try: + value = float(value) + except (ValueError, TypeError): + return False + else: + return abs(value - rule["ref"]) <= threshold + if rule["method"] == "spreadsheet_range": + subset_limit = MultiCellRange(rule["ref"][0]) + superset_limit = MultiCellRange(rule["ref"][1]) + return _multicellrange_containsby(subset_limit, value) \ + and _multicellrange_containsby(value, superset_limit) + if rule["method"].startswith("range."): # e.g., range.te [0, 2] -> 0 < x <= 2 + left_et = rule["method"][6] + right_et = rule["method"][7] + return getattr(operator, "l" + left_et)(rule["ref"][0], value) \ + and getattr(operator, "l" + right_et)(value, rule["ref"][1]) + if rule["method"] in {"str_list_eq", "str_set_eq"}: + container_type_str: str = rule["method"][4:-3] + container_type = getattr(builtins, container_type_str) + + 
value: container_type = container_type(value.strip("\"'").split(",")) + ref: container_type = container_type(rule["ref"]) + return value == ref + raise NotImplementedError() + + +def are_lists_equal(list1, list2, comparison_func): + # First check if both lists have the same length + if len(list1) != len(list2): + return False + + # Now make sure each element in one list has an equal element in the other list + for item1 in list1: + # Use the supplied function to test for an equal item + if not any(comparison_func(item1, item2) for item2 in list2): + return False + + # If all items match, the lists are equal + return True + + +def compare_urls(url1, url2, full=True): + if url1 is None or url2 is None: + return url1 == url2 + + logger.info(f"compare_urls. url1: {url1}; url2: {url2}") + + def parse_with_default_scheme(url): + """ + Ensure the URL has a scheme. If not, prepend 'http://' + so it parses as host + path instead of just a path. + """ + # Regex to check if URL has scheme like 'http://', 'https://', etc. 
+ if not re.match(r'^[a-zA-Z][a-zA-Z0-9+\-.]*://', url): + url = f"http://{url}" + return urlparse(url) + + def normalize_url(url): + # Parse the URL; if no scheme is present, assume 'http' + parsed_url = parse_with_default_scheme(url) + scheme = parsed_url.scheme.lower() + + # Extract the domain parts using tldextract + extracted = tldextract.extract(parsed_url.netloc.lower()) + # e.g., extracted = TLDExtractResult(subdomain='www', domain='airbnb', suffix='com.sg') + + # Drop 'www' if it's the only subdomain + subdomain = extracted.subdomain + if subdomain == 'www': + subdomain = '' + + # Instead of using the suffix (e.g., 'com', 'com.sg'), ignore it completely + # so that both 'airbnb.com' and 'airbnb.com.sg' become just 'airbnb' or 'www.airbnb' + if subdomain: + normalized_netloc = f"{subdomain}.{extracted.domain}" + else: + normalized_netloc = extracted.domain + + # Handle trailing slash in the path + normalized_path = parsed_url.path if parsed_url.path != '/' else '' + + # Reassemble the URL with the normalized components + normalized_parsed_url = ParseResult( + scheme=scheme.lower(), + netloc=normalized_netloc, + path=normalized_path, + params=parsed_url.params if full else '', # Keep the params + query=parsed_url.query if full else '', # Keep the query string + fragment=parsed_url.fragment if full else '', # Keep the fragment + ) + return urlunparse(normalized_parsed_url) + + logger.info(f"After normalization. 
url1: {normalize_url(url1)}; url2: {normalize_url(url2)}") + # Normalize both URLs + norm_url1 = normalize_url(url1) + norm_url2 = normalize_url(url2) + + # Compare the normalized URLs + return norm_url1 == norm_url2 diff --git a/openhands/nvidia/os_world/metrics/vlc.py b/openhands/nvidia/os_world/metrics/vlc.py new file mode 100644 index 000000000..c5001c40f --- /dev/null +++ b/openhands/nvidia/os_world/metrics/vlc.py @@ -0,0 +1,523 @@ +import os +import subprocess +from typing import Dict +from xml.etree import ElementTree +from urllib.parse import urlparse + +import acoustid +import cv2 +import imagehash +import librosa +import numpy as np +from PIL import Image +from fastdtw import fastdtw +from scipy.spatial.distance import cosine +from skimage.metrics import structural_similarity as ssim + +from openhands.core.logger import openhands_logger as logger + + +def is_vlc_playing(actual_status_path: str, rule: Dict[str, str]) -> float: + """ + Checks if VLC is currently playing a file. + """ + with open(actual_status_path, 'rb') as file: + actual_status = file.read().decode('utf-8') + + tree = ElementTree.fromstring(actual_status) + status = tree.find('state').text + logger.info(f"VLC Status: {status}") + if status == 'playing': + if rule['type'] == 'file_name': + # Try multiple possible paths for file information in VLC XML + file_paths = [ + 'information/category[@name="meta"]/info[@name="filename"]', + 'information/category[@name="meta"]/info[@name="title"]', + 'information/category[@name="meta"]/info[@name="uri"]', + 'information/category[@name="meta"]/info[@name="location"]', + 'information/category[@name="meta"]/info[@name="name"]' + ] + + file_info = None + for path in file_paths: + element = tree.find(path) + if element is not None and element.text: + file_info = element.text + break + + if file_info: + expected_filename = rule['file_name'] + + # Method 1: Direct filename match (most precise) + actual_basename = os.path.basename(file_info) + if 
actual_basename == expected_filename: + return 1 + + # Method 2: Endswith match (for backward compatibility) + if file_info.endswith(expected_filename): + return 1 + + # Method 3: For paths, check if expected filename is in the path + if expected_filename in file_info: + # Additional check to avoid false positives + # Make sure it's actually the filename, not just part of a path + if file_info.endswith('/' + expected_filename) or file_info.endswith('\\' + expected_filename): + return 1 + + logger.warning(f"File name mismatch - Expected: {expected_filename}, Found: {file_info}") + return 0 + else: + logger.warning(f"Could not find file information in VLC status XML for rule: {rule}") + return 0 + elif rule['type'] == 'url': + # Try multiple possible paths for URL information in VLC XML + url_paths = [ + 'information/category[@name="meta"]/info[@name="url"]', + 'information/category[@name="meta"]/info[@name="URI"]', + 'information/category[@name="meta"]/info[@name="location"]', + 'information/category[@name="meta"]/info[@name="title"]', # Sometimes URL is in title for streams + 'information/category[@name="meta"]/info[@name="filename"]', # Sometimes URL is in filename for streams + 'information/category[@name="Stream 0"]/info[@name="Codec"]', # Try stream info + 'information/category[@name="Stream 0"]/info[@name="Type"]', + 'information/category[@name="Stream 0"]/info[@name="Language"]' + ] + + file_info = None + logger.debug(f"Looking for URL: {rule['url']}") + + for path in url_paths: + element = tree.find(path) + if element is not None and element.text: + file_info = element.text + logger.debug(f"Found URL info at '{path}': {file_info}") + break + + if file_info: + # For URL comparison, check if the rule URL is contained in the file_info + # This handles cases where VLC might show a longer or modified URL + expected_url = rule['url'] + + # Method 1: Direct URL match + if expected_url in file_info or file_info.endswith(expected_url): + return 1 + + # Method 2: For 
HLS streams, VLC often shows just the filename instead of full URL + # Check if the file_info matches the filename part of the expected URL + try: + expected_parsed = urlparse(expected_url) + expected_filename = os.path.basename(expected_parsed.path) + + # If VLC shows just the filename (common for HLS streams) + if file_info == expected_filename: + logger.info(f"URL filename match - Expected URL: {expected_url}, VLC shows filename: {file_info}") + return 1 + + # Method 3: Check if both are URLs from the same domain and similar path + if '://' in file_info: # file_info is also a URL + actual_parsed = urlparse(file_info) + # Same domain and similar path structure + if (expected_parsed.netloc == actual_parsed.netloc and + expected_parsed.path in actual_parsed.path): + return 1 + except Exception as e: + logger.debug(f"URL parsing error: {e}") + pass + + logger.warning(f"URL mismatch - Expected: {expected_url}, Found: {file_info}") + return 0 + else: + logger.warning(f"Could not find URL information in VLC status XML for rule: {rule}") + return 0 + else: + logger.error(f"Unknown type: {rule['type']}") + return 0 + else: + return 0 + + +# fixme: part of this function can be moved to getters +def is_vlc_recordings_folder(actual_config_path: str, rule: Dict[str, str]) -> float: + """ + Checks if VLC's recording folder is set to the expected value. 
# if the screen size is not available, it means that the window is not fullscreen
Image.Resampling.LANCZOS) + + # Convert images to numpy arrays + image1_array = np.array(image1) + image2_array = np.array(image2) + + # Calculate SSIM between two images + similarity_index = ssim(image1_array, image2_array) + + epsilon = 0.01 + if base_score is not None: + if similarity_index >= base_score + epsilon: + return (similarity_index - base_score) / (1 - base_score) + else: + return 0 + else: + return similarity_index + +def compare_audios(audio_path_1, audio_path_2): + """ + Compare two audio files and return a similarity score in the range [0, 1]. + audio_path_1, audio_path_2: paths to the audio files to compare + """ + # similarity = compare_audios_simple('path_to_audio1.mp3', 'path_to_audio2.mp3') + # print(f'Similarity Score: {similarity}') + + if not audio_path_1 or not audio_path_2: + return 0 + + y1, y2 = None, None + try: + y1, sr1 = librosa.load(audio_path_1) + except Exception: + logger.warning(f"Could not load audio from {os.path.basename(audio_path_1)}. It might be empty or corrupt.") + + try: + y2, sr2 = librosa.load(audio_path_2) + except Exception: + logger.warning(f"Could not load audio from {os.path.basename(audio_path_2)}. It might be empty or corrupt.") + + # Handle cases where one or both audio files are empty or corrupt. + is_y1_bad = (y1 is None) or (y1.shape[0] == 0) + is_y2_bad = (y2 is None) or (y2.shape[0] == 0) + + if is_y1_bad and is_y2_bad: + logger.info("Both audio files are empty or corrupt. Considering them perfectly similar.") + return 1.0 + + if is_y1_bad or is_y2_bad: + logger.warning(f"One audio file is empty/corrupt, the other is not. 
Similarity is 0.") + return 0.0 + + try: + logger.info(f"Audio 1 ({os.path.basename(audio_path_1)}): sr={sr1}, len={len(y1)}") + logger.info(f"Audio 2 ({os.path.basename(audio_path_2)}): sr={sr2}, len={len(y2)}") + + # Extract MFCC features + mfcc1 = librosa.feature.mfcc(y=y1, sr=sr1) + mfcc2 = librosa.feature.mfcc(y=y2, sr=sr2) + except Exception as e: + logger.error(f"Error during MFCC extraction: {e}") + return 0.0 + + # Normalize the MFCC features + mfcc1 = librosa.util.normalize(mfcc1, axis=1) + mfcc2 = librosa.util.normalize(mfcc2, axis=1) + logger.info(f"MFCCs normalized.") + + # Define a lambda function to compute cosine distance + dist_func = lambda x, y: cosine(x, y) + + # Use the DTW algorithm to find the best alignment path + distance, path = fastdtw(mfcc1.T, mfcc2.T, dist=dist_func) + logger.info(f"DTW distance: {distance:.4f}, Path length: {len(path)}") + + # Normalize the DTW distance by the length of the alignment path. + if len(path) == 0: + normalized_distance = np.inf + else: + normalized_distance = distance / len(path) + logger.info(f"Normalized DTW distance: {normalized_distance:.4f}") + + # Convert the normalized distance to a similarity score using an exponential decay function. + similarity = np.exp(-normalized_distance) + + return similarity + + +def compare_audios_by_dl_model(audio_path_1, audio_path_2): + pass + + +def compare_videos(video_path1, video_path2, max_frames_to_check=100, threshold=5): + # Open both video files + cap1 = cv2.VideoCapture(video_path1) + cap2 = cv2.VideoCapture(video_path2) + + frames_checked = 0 + mismatch_count = 0 + + while frames_checked < max_frames_to_check: + # Read frames from both videos + ret1, frame1 = cap1.read() + ret2, frame2 = cap2.read() + + # If a video ends, then check if both ended to confirm they are of the same length + if not ret1 or not ret2: + return 1. if ret1 == ret2 else 0. 
# If no qt-bgcone entry was found, the default "1" (enabled) set above is kept
line.split('=')[-1].strip() + # The configuration key was not found in the file + + if qt_max_volume == expected_qt_max_volume: + return 1 + else: + return 0 + except FileNotFoundError: + logger.error("VLC configuration file not found.") + return 0 + except Exception as e: + logger.error(f"An error occurred: {e}") + return 0 + + +def check_qt_minimal_view(actual_config_path, rule): + with open(actual_config_path, 'rb') as file: + config_file = file.read().decode('utf-8') + + expected_qt_minimal_view = rule['expected_qt_minimal_view'] + if isinstance(expected_qt_minimal_view, int): + expected_qt_minimal_view = str(expected_qt_minimal_view) + + try: + qt_minimal_view = "0" + for line in config_file.split("\n"): + if 'qt-minimal-view=' in line: + qt_minimal_view = line.split('=')[-1].strip() + + if qt_minimal_view == expected_qt_minimal_view: + return 1 + else: + return 0 + except FileNotFoundError: + logger.error("VLC configuration file not found.") + return 0 + except Exception as e: + logger.error(f"An error occurred: {e}") + return 0 + + +def check_qt_slider_colours(actual_config_path, rule): + with open(actual_config_path, 'rb') as file: + config_file = file.read().decode('utf-8') + + try: + qt_slider_colours = "153;210;153;20;210;20;255;199;15;245;39;29" + for line in config_file.split("\n"): + if 'qt-slider-colours' in line: + qt_slider_colours = line.split('=')[-1].strip() + # The configuration key was not found in the file + + if rule['type'] == 'match': + expected_qt_slider_colours = rule['expected_qt_slider_colours'] + if qt_slider_colours == expected_qt_slider_colours: + return 1 + else: + return 0 + elif rule['type'] == 'blackish': + def is_color_blackish(rgb_values, threshold=100): + # decide if the color is blackish + return all(value < threshold for value in rgb_values) + + def parse_qt_slider_colours(colours_string): + # parse the string of colours into a list of RGB tuples + values = [int(x) for x in colours_string.split(';')] + colors = 
list(zip(values[0::3], values[1::3], values[2::3])) + return colors + + colors = parse_qt_slider_colours(qt_slider_colours) + + # check if all colors are blackish + for color in colors: + if is_color_blackish(color): + pass + else: + return 0 + return 1 + + except FileNotFoundError: + logger.error("VLC configuration file not found.") + return 0 + except Exception as e: + logger.error(f"An error occurred: {e}") + return 0 + + +def check_global_key_play_pause(actual_config_path, rule): + """ + # Play/Pause (str) + #global-key-play-pause= + + # Play/Pause (str) + #key-play-pause=Space + """ + with open(actual_config_path, 'rb') as file: + config_file = file.read().decode('utf-8') + + expected_global_key_play_pause = rule['expected_global_key_play_pause'] + + if isinstance(expected_global_key_play_pause, int): + expected_global_key_play_pause = str(expected_global_key_play_pause) + + try: + global_key_play_pause = "0" + for line in config_file.split("\n"): + # Check if the line contains the recording path setting + if 'global-key-play-pause=' in line: + global_key_play_pause = "0" if line.split('=')[-1].strip() == "" else "1" + + if global_key_play_pause == expected_global_key_play_pause: + return 1 + else: + return 0 + except FileNotFoundError: + logger.error("VLC configuration file not found.") + return 0 + except Exception as e: + logger.error(f"An error occurred: {e}") + return 0 + + +def check_one_instance_when_started_from_file(actual_config_path, rule): + with open(actual_config_path, 'rb') as file: + config_file = file.read().decode('utf-8') + + expected_one_instance_when_started_from_file = rule['expected_one_instance_when_started_from_file'] + + if isinstance(expected_one_instance_when_started_from_file, int): + expected_one_instance_when_started_from_file = str(expected_one_instance_when_started_from_file) + + try: + one_instance_when_started_from_file = "1" + for line in config_file.split("\n"): + # Check if the line contains the recording path setting + if 
expected (dict): expected dict{}, containing key "expected"
+ + try: + with open(actual, 'r') as f: + data = json.load(f) + except Exception as e: + return 0.0 + + expect = expected['expected'] + + # Check if all expected key-value pairs are in the actual data + for key, value in expect.items(): + if key not in data or data[key] != value: + return 0.0 + + return 1.0 + + +def compare_text_file(actual: str, expected: str, **options) -> float: + """ + Args: + actual (str): path to result text file + expected (str): path to gold text file + + Return: + float: the score + """ + if not actual: + return 0. + + with open(actual) as f1: + actual_text = f1.read() + with open(expected) as f2: + expected_text = f2.read() + + ignore_blanks = options.get('ignore_blanks', False) + if ignore_blanks: + actual_text = re.sub(r'[\t\n]', ' ', actual_text).strip() + actual_text = re.sub(r'\s+', ' ', actual_text) + expected_text = re.sub(r'[\t\n]', ' ', expected_text).strip() + expected_text = re.sub(r'\s+', ' ', expected_text) + + ignore_case = options.get('ignore_case', False) + if ignore_case: + actual_text = actual_text.lower() + expected_text = expected_text.lower() + + if actual_text == expected_text: + return 1.0 + return 0.0 + +import zipfile +from difflib import SequenceMatcher +import PyPDF2 + +def compare_pdf_content(content1, content2, text_similarity_threshold): + def extract_text_from_pdf(content): + with open("temp.pdf", "wb") as temp_pdf: + temp_pdf.write(content) + with open("temp.pdf", "rb") as temp_pdf: + pdf_reader = PyPDF2.PdfReader(temp_pdf) + text = '' + for page_num in range(len(pdf_reader.pages)): + page = pdf_reader.pages[page_num] + text += page.extract_text() + return text + + text1 = extract_text_from_pdf(content1) + text2 = extract_text_from_pdf(content2) + + similarity_ratio = SequenceMatcher(None, text1, text2).ratio() + + return similarity_ratio >= text_similarity_threshold + +def compare_zip_files(actual: str, expected: str, **options) -> float: + """ + Args: + actual (str): path to result zip file + expected 
rules (Dict): rule dict whose "expected" key holds the gold answer string
+ + This function is now more robust and handles various error conditions: + - File existence validation + - Module loading errors + - Function execution errors + - Proper resource cleanup + - Working directory management + """ + import os + import uuid + import logging + from pathlib import Path + + logger = logging.getLogger(__name__) + test_function_name = options.get('test_function_name', 'test') + + # Validate inputs + if not test_file: + logger.error("test_file is None or empty") + return 0.0 + + # Convert to absolute path and check existence + test_file_path = Path(test_file).resolve() + if not test_file_path.exists(): + logger.error(f"Test file does not exist: {test_file_path}") + return 0.0 + + if not test_file_path.is_file(): + logger.error(f"Test file path is not a file: {test_file_path}") + return 0.0 + + # Create unique module name to avoid conflicts + module_name = f'dynamic_test_module_{uuid.uuid4().hex[:8]}' + + # Store original working directory and sys.path + original_cwd = os.getcwd() + original_sys_path = sys.path.copy() + + try: + # Change to the directory containing the test file + test_dir = test_file_path.parent + os.chdir(test_dir) + logger.debug(f"Changed working directory to: {test_dir}") + + # Add test directory to Python path if not already present + if str(test_dir) not in sys.path: + sys.path.insert(0, str(test_dir)) + logger.debug(f"Added {test_dir} to sys.path") + + # Try to load the module + try: + spec = importlib.util.spec_from_file_location(module_name, test_file_path) + if spec is None: + logger.error(f"Could not create module spec for {test_file_path}") + return 0.0 + + if spec.loader is None: + logger.error(f"Module spec has no loader for {test_file_path}") + return 0.0 + + module = importlib.util.module_from_spec(spec) + if module is None: + logger.error(f"Could not create module from spec for {test_file_path}") + return 0.0 + + # Add to sys.modules temporarily + sys.modules[module_name] = module + + # Execute the module + 
spec.loader.exec_module(module) + logger.debug(f"Successfully loaded test module: {module_name}") + + except SyntaxError as e: + logger.error(f"Syntax error in test file: {e}") + return 0.0 + except ImportError as e: + logger.error(f"Import error loading test file: {e}") + return 0.0 + except Exception as e: + logger.error(f"Error loading test module: {e}") + return 0.0 + + # Try to get the test function + try: + if not hasattr(module, test_function_name): + logger.error(f"Test function '{test_function_name}' not found in {test_file_path}") + return 0.0 + + test_function = getattr(module, test_function_name) + + if not callable(test_function): + logger.error(f"'{test_function_name}' is not callable in {test_file_path}") + return 0.0 + + logger.debug(f"Found test function: {test_function_name}") + + except Exception as e: + logger.error(f"Error getting test function: {e}") + return 0.0 + + # Execute the test function + try: + result = test_function() + logger.debug(f"Test function returned: {result} (type: {type(result)})") + + # Handle different return types + if isinstance(result, bool): + return 1.0 if result else 0.0 + elif isinstance(result, (int, float)): + # Normalize to 0.0-1.0 range + normalized = max(0.0, min(1.0, float(result))) + if normalized != result: + logger.warning(f"Test result {result} normalized to {normalized}") + return normalized + else: + # For any other type, treat as True if truthy + bool_result = bool(result) + logger.warning(f"Test returned non-boolean/numeric value {result}, treating as {bool_result}") + return 1.0 if bool_result else 0.0 + + except Exception as e: + logger.error(f"Error executing test function: {e}") + return 0.0 + + except Exception as e: + logger.error(f"Unexpected error in check_python_file_by_test_suite: {e}") + return 0.0 + + finally: + # Cleanup: remove the module from sys.modules + if module_name in sys.modules: + del sys.modules[module_name] + logger.debug(f"Cleaned up module: {module_name}") + + # Restore 
original working directory + try: + os.chdir(original_cwd) + logger.debug(f"Restored working directory to: {original_cwd}") + except Exception as e: + logger.warning(f"Could not restore working directory: {e}") + + # Restore original sys.path + sys.path[:] = original_sys_path + logger.debug("Restored sys.path") + + +def check_python_file_by_gold_file(actual_files, gold_file: str, **options) -> float: + pass + + +def check_html_background_image(src_path: str, rule: Dict = None) -> float: + """ + Check if the background image is correctly set. + multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108 + """ + if not src_path: + return 0.0 + + from bs4 import BeautifulSoup + with open(src_path, 'r') as f: + html_content = f.read() + soup = BeautifulSoup(html_content, 'html.parser') + styles = soup.find_all('style') + for style in styles: + if f'background-image: url(\'{rule["value"]}\')' in style.text: + return 1.0 + return 0.0 + + +def compare_result_files(src_path, tgt_path): + """ + Compare whether the content of two files are the same. + multi-app:7f35355e-02a6-45b5-b140-f0be698bcf85 + """ + if not src_path or not tgt_path: + return 0.0 + + with open(src_path, 'r') as f: + src_content = f.read().strip() + with open(tgt_path, 'r') as f: + tgt_content = f.read().strip() + try: + # Compare the content as numbers + tgt_content_num = float(tgt_content) + if tgt_content in src_content: + # If the content of tgt is in src, return 1.0 since output src might be + # a superset(language description+number) of tgt + return 1.0 + src_content_num = float(src_content) + if abs(src_content_num - tgt_content_num) < 1e-4: + return 1.0 + return 0.0 + except: + if src_content == tgt_content: + return 1.0 + return 0.0 From 9fe410650611ce93fe70f242e16b653d7e37a79e Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Thu, 20 Nov 2025 13:41:17 -0600 Subject: [PATCH 04/14] Updated examples. 
--- examples/setup.py | 39 ++++++++++++++++++++++++----------- examples/test_setup_single.py | 17 ++++++++------- 2 files changed, 37 insertions(+), 19 deletions(-) diff --git a/examples/setup.py b/examples/setup.py index a12703139..2853c2701 100644 --- a/examples/setup.py +++ b/examples/setup.py @@ -26,8 +26,7 @@ from openhands.events.action.os import OSWorldInteractiveAction from openhands.core.logger import openhands_logger as logger -from openhands.runtime.impl.singularity.osworld_singularity_runtime import OSWorldSingularityRuntime, OSWORLD_CHROMIUM_PORT_RANGE -from openhands.runtime.utils import find_available_tcp_port +from openhands.nvidia.os_world.controllers.python import PythonController import dotenv # Load environment variables from .env file @@ -37,6 +36,7 @@ MAX_RETRIES = 20 +from openhands.nvidia.os_world import metrics, getters # Utility function copied from desktop_env.evaluators.metrics.utils def compare_urls(url1, url2, full=True): @@ -992,7 +992,7 @@ async def _login_setup(self, **config): return browser, context - def _execute_python_command(self, command: str): + def execute_python_command(self, command: str): """ Taken from OSWorld/desktop_env/controllers/python.py Executes a python command on the server. 
@@ -1088,20 +1088,20 @@ def _update_browse_history_setup(self, **config): os_type = self.runtime.os_type if os_type == 'windows': - chrome_history_path = self._execute_python_command( + chrome_history_path = self.execute_python_command( """import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[ 'output'].strip() elif os_type == 'darwin': - chrome_history_path = self._execute_python_command( + chrome_history_path = self.execute_python_command( """import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[ 'output'].strip() elif os_type == 'linux': if "arm" in platform.machine(): - chrome_history_path = self._execute_python_command( + chrome_history_path = self.execute_python_command( "import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[ 'output'].strip() else: - chrome_history_path = self._execute_python_command( + chrome_history_path = self.execute_python_command( "import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[ 'output'].strip() else: @@ -1129,7 +1129,22 @@ def _update_browse_history_setup(self, **config): class Evaluator: - def __init__(self, task_config: Dict[str, Any]): + def __init__(self, task_config: Dict[str, Any], controller): + self.setup_controller = controller + self.vm_ip = controller.vm_ip + self.server_port = controller.server_port + self.chromium_port = controller.chromium_port + self.vlc_port = controller.vlc_port + self.http_server = controller.http_server + self.http_server_setup_root = controller.http_server_setup_root + self.cache_dir = controller.cache_dir + self.client_password = controller.client_password + self.screen_width = controller.screen_width + self.screen_height = controller.screen_height + self.vm_platform = 'Linux' + + self.controller = PythonController(self.vm_ip, 
self.server_port) + """Set evaluator information from task config""" # evaluator dict # func -> metric function string, or list of metric function strings @@ -1145,7 +1160,7 @@ def __init__(self, task_config: Dict[str, Any]): else getattr(metrics, self.evaluator["func"]) self.metric_conj: str = self.evaluator.get("conj", "and") # take conjunction of multiple metrics if "result" in self.evaluator and len(self.evaluator["result"]) > 0: - self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in + self.result_getter = [getattr(getters, "get_{:}".format(res["type"])) for res in self.evaluator["result"]] \ if isinstance(self.evaluator["result"], list) \ else getattr(getters, "get_{:}".format(self.evaluator["result"]["type"])) @@ -1155,7 +1170,7 @@ def __init__(self, task_config: Dict[str, Any]): else None if "expected" in self.evaluator and len(self.evaluator["expected"]) > 0: - self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in + self.expected_getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in self.evaluator["expected"]] \ if isinstance(self.evaluator["expected"], list) \ else getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"])) @@ -1176,13 +1191,13 @@ def __init__(self, task_config: Dict[str, Any]): or (len(self.metric) == len(self.result_getter) == len(self.expected_getter) == len( self.metric_options))) - def evaluate(self, setup_controller, action_history = []): + async def evaluate(self, action_history = []): """ Evaluate whether the task is successfully completed. 
""" postconfig = self.evaluator.get("postconfig", []) - setup_controller.setup(postconfig) + await self.setup_controller.setup(postconfig) if self.evaluator['func'] == "infeasible": if len(action_history) > 0: diff --git a/examples/test_setup_single.py b/examples/test_setup_single.py index c4a155487..071c080c0 100755 --- a/examples/test_setup_single.py +++ b/examples/test_setup_single.py @@ -33,8 +33,9 @@ OSWorldSingularityRuntime, ) from openhands.storage import get_file_store -from examples.setup import SetupController, Evaluator +from openhands.nvidia.os_world.controllers.setup import SetupController +from openhands.nvidia.os_world.evaluate import Evaluator async def main(config_path, results_dir): """Main example function.""" @@ -124,7 +125,7 @@ async def main(config_path, results_dir): setup_config = json.load(f) assert 'config' in setup_config, "Setup config not found in setup JSON" await setup_controller.setup(setup_config['config']) - #evaluator = Evaluator(setup_config) + evaluator = Evaluator(setup_config, setup_controller) #await asyncio.sleep(5) @@ -149,8 +150,10 @@ def get_final_state(): print("Start Working. 
Continue to evaluate...") import pdb; pdb.set_trace() - #evaluator.evaluate(setup_controller) + result = await evaluator.evaluate() + print(f"Evaluation result: {result}") + import pdb; pdb.set_trace() print("Cleaning up...") runtime.close() print("✓ Runtime closed") @@ -177,8 +180,8 @@ def run_main_wrapper(config_path, results_dir, example_name): if __name__ == '__main__': # Configuration - examples_path = '/home/jayliu/OSWorld/evaluation_examples/examples' - output_path = '/home/jayliu/ProRL-Agent-Server/two_results' + examples_path = '/root/OSWorld/evaluation_examples/examples' + output_path = '/root/ProRL-Agent-Server/two_results' # ============= CONCURRENCY CONTROL ============= # Set the maximum number of concurrent processes @@ -190,8 +193,8 @@ def run_main_wrapper(config_path, results_dir, example_name): # You can modify these to test different examples tasks = [ { - 'category': 'os', - 'example': '5ced85fc-fa1a-4217-95fd-0fb530545ce2.json' + 'category': 'vs_code', + 'example': '0512bb38-d531-4acf-9e7e-0add90816068.json' }, ] # Prepare arguments for each process From 0aecd9ee42cba38b080971d073f811c15da3a049 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Thu, 20 Nov 2025 16:58:32 -0600 Subject: [PATCH 05/14] Added osworld utils.py ' --- openhands/nvidia/os_world/osworld_utils.py | 288 +++++++++++++++++++++ scripts/tests/test_osworld_utils.py | 129 +++++++++ 2 files changed, 417 insertions(+) create mode 100644 openhands/nvidia/os_world/osworld_utils.py create mode 100644 scripts/tests/test_osworld_utils.py diff --git a/openhands/nvidia/os_world/osworld_utils.py b/openhands/nvidia/os_world/osworld_utils.py new file mode 100644 index 000000000..b8c8d6eea --- /dev/null +++ b/openhands/nvidia/os_world/osworld_utils.py @@ -0,0 +1,288 @@ +# type: ignore +import time +import os +import pandas as pd +import numpy as np +import asyncio +from evaluation.utils.shared import ( # type: ignore + EvalMetadata, + get_default_sandbox_config_for_eval, + 
update_llm_config_for_completions_logging, + EvalException, +) +from pathlib import Path + +from openhands.core.config.llm_config import LLMConfig +from openhands.runtime.base import Runtime +from openhands.core.config.condenser_config import NoOpCondenserConfig +from openhands.core.config import ( + AgentConfig, + OpenHandsConfig, + SandboxConfig, +) +from openhands.core.setup import create_agent, create_controller, generate_sid +from openhands.storage import get_file_store +from openhands.events import EventStream +from openhands.runtime.impl.singularity.osworld_singularity_runtime import ( + OSWorldSingularityRuntime, +) +from openhands.controller.state.state import State +from openhands.events.action import CmdRunAction, IPythonRunCellAction, MessageAction +from openhands.events.observation import CmdOutputObservation +from openhands.nvidia.logger import nvidia_logger as logger +from openhands.core.logger import openhands_logger as openhands_logger +from evaluation.utils.shared import codeact_user_response, is_fatal_evaluation_error + +from openhands.nvidia.utils import process_messages_from_agent_state, is_last_action_finish, get_messages_from_partial_result +import json +from openhands.nvidia.reward import Reward +from openhands.nvidia.registry import JobDetails, _DEFAULT_AGENT_CONFIG +from openhands.nvidia.utils import get_instance_id +from openhands.nvidia.controller import run_controller_with_controller + +from openhands.nvidia.os_world.controllers.setup import SetupController +from openhands.nvidia.os_world.evaluate import Evaluator + + +def get_config( + instance: dict, + metadata: EvalMetadata, + agent_config: dict=_DEFAULT_AGENT_CONFIG, +) -> OpenHandsConfig: + # Premade Singularity image for OSWorld + + sandbox_config = SandboxConfig( + base_container_image='ubuntu:24.04', + run_as_fakeroot=True, + ) + + config = OpenHandsConfig( + default_agent=metadata.agent_class, + run_as_openhands=False, + max_iterations=metadata.max_iterations, + 
runtime='osworld', + sandbox=sandbox_config, + # do not mount workspace + workspace_base=None, + workspace_mount_path=None, + ) + config.set_llm_config( + update_llm_config_for_completions_logging( + metadata.llm_config, metadata.eval_output_dir, get_instance_id(instance) + ) + ) + + # https://github.com/All-Hands-AI/OpenHands/blob/main/openhands/core/config/agent_config.py + # Turn everything off for OSWorld + agent_config = AgentConfig( + enable_jupyter=False, + enable_editor=False, + enable_cmd=False, + enable_browsing=False, + enable_llm_editor=False, + enable_mcp=False, + condenser=metadata.condenser_config, + enable_prompt_extensions=False, + enable_think=False, + enable_history_truncation=False, # turn off history truncation + ensure_thinking_end_properly=agent_config['ensure_thinking_end_properly'], # set to true only if using text based server for training. + action_timeout=30.0, # 30 seconds per action + strict_loop_detector=agent_config['strict_loop_detector'], # set to true only if training + ) + config.set_agent_config(agent_config) + return config + +def get_instruction(instance: pd.Series | dict, metadata: EvalMetadata) -> MessageAction: + + def obtain_problem_statement(instance: dict) -> str: + if isinstance(instance['prompt'], list): + problem_statement = instance['prompt'][0]['content'] + elif isinstance(instance['prompt'], str): + problem_statement = json.loads(instance['prompt'])[0]['content'] + else: + raise ValueError(f'Invalid prompt type: {type(instance["prompt"])}') + # Remove boxed instructions from problem statement + if " Let's think step by step and output the final answer within \\boxed{}." in problem_statement: + problem_statement = problem_statement.replace(" Let's think step by step and output the final answer within \\boxed{}.", "") + return problem_statement + + instruction = f""" +Your task is to solve challenging math problems using the `execute_ipython_cell` tool, which gives you access to a full IPython environment. 
You are allowed and expected to use code to explore, solve, and verify your answers. + +Environment: +- Libraries already imported: `math`, `cmath`, `numpy`, `sympy`, `scipy` +- You can also install additional libraries using `%pip install ` if necessary. + +Instructions: +1. Read and understand the problem statement. First use the `think` tool to log down your thoughts and plan for solving the problem. +2. Plan your solution using a combination of reasoning and code. Always try to solve the problem using code. Also plan about how to verify your solution. +3. In subsequent steps after planning, use tools to execute your plan. Use `execute_ipython_cell` to run calculations, manipulate symbols, or perform verification. +4. Use code to verify your answer. If your answer is not correct, iterate your plan with the `think` tool and continue solving the problem until you are confident that your answer is correct. +5. Finally, when you are confident in your answer: + - Only call the `finish` tool if you are confident in your answer and you have verified your answer with the `execute_ipython_cell` tool. + - Terminate the conversation by calling the `finish` tool. + - Put your final answer within \\boxed{{}} in the message with the `finish` tool. + + +Important Guidelines: +- Always first use the `think` tool to log down your thoughts and plan for solving the problem. +- Always try to use the `execute_ipython_cell` tool to solve the problem, especially for calculations, symbolic reasoning, or simulations. +- Always verify your answer with code and the `execute_ipython_cell` tool. +- Only use the `finish` tool if you are confident the answer is correct. If you think there is a mistake, iterate your plan with the `think` tool and continue solving the problem. +- Put your final answer within \\boxed{{}} in the message with the `finish` tool. 
+ +Now begin solving the following problem: +{obtain_problem_statement(instance)} +""" + return MessageAction(content=instruction) + +def create_runtime(config: OpenHandsConfig, sid: str | None = None) -> Runtime: + vm_image_path = os.getenv('OSWORLD_VM_IMAGE_PATH', './OS_images/Ubuntu.qcow2') + assert Path(vm_image_path).exists(), f"ERROR: VM image not found at {vm_image_path}" + logger.info(f"Using VM image path: {vm_image_path}") + + session_id = sid or generate_sid(config) + + file_store = get_file_store(config.file_store, config.file_store_path) + event_stream = EventStream(session_id, file_store) + runtime = OSWorldSingularityRuntime( + config=config, + event_stream=event_stream, + sid=session_id, + os_type='linux', + vm_image_path=vm_image_path, + attach_to_existing=False, + ) + + return runtime + +async def initialize_agents( + instance: dict, + llm_config: LLMConfig | None = None, + sid:str | None = None, + eval_output_dir:str = "/root", + git_commit:str = "9f93e8a1532d6e1da4ea702f3dbd31d0f6b2fb3a", + dataset:str = "deepscaler", + data_split:str = "train", + agent_config: dict = dict(_DEFAULT_AGENT_CONFIG), + ) -> tuple[Runtime, EvalMetadata, OpenHandsConfig]: + + if llm_config is None: + raise ValueError('LLM config is None, cannot initialize.') + + metadata = EvalMetadata( + agent_class="CodeActAgent", + llm_config=llm_config, + agent_config=None, + max_iterations=agent_config['max_iterations'], + eval_output_dir=eval_output_dir, + start_time=time.strftime('%Y-%m-%d %H:%M:%S'), + git_commit=git_commit, + dataset=dataset, + data_split=data_split, + details=None, + condenser_config=NoOpCondenserConfig(), + ) + + + config = get_config(instance, metadata, agent_config) + + runtime = create_runtime(config, sid=sid) + + await runtime.connect() + logger.debug(f"Runtime connected {runtime.sid}") + + + try: + await initialize_runtime(runtime, instance, metadata) + + except Exception as e: + logger.error(f"Error initializing runtime: {e}") + raise e + return 
runtime, metadata, config + +async def initialize_runtime(runtime: Runtime, instance: dict, metadata: EvalMetadata): + """Initialize the runtime for the agent. + + This function is called before the runtime is used to run the agent. + """ + openhands_logger.info(f'{"-" * 50} BEGIN Runtime Initialization Fn {"-" * 50}') + # create cache directory + cache_dir = f"/tmp/osworld_cache_{runtime.sid}" + os.makedirs(cache_dir, exist_ok=True) + logger.debug(f"Created cache directory: {cache_dir}") + + runtime.setup_controller = SetupController( + vm_ip="127.0.0.1", + server_port=runtime._vm_server_port, + chromium_port=runtime._chromium_port, + cache_dir="/tmp/osworld_example", + client_password="password", + runtime=runtime # Pass your runtime object here + ) + + await runtime.setup_controller.setup(instance['config']) + openhands_logger.info(f'{"-" * 50} END Runtime Initialization Fn {"-" * 50}') + +async def run_agent( + job_details: JobDetails, + sid: str | None = None, + ) -> dict[str, object]: + runtime = job_details.runtime + metadata = job_details.metadata + config = job_details.config + instance = job_details.instance + + message_action = get_instruction(instance, metadata) + try: + agent = create_agent(config) + job_details.agent = agent + controller, initial_state = create_controller( + agent=agent, + runtime=runtime, + config=config, + replay_events=None, + ) + job_details.controller = controller + state: State | None = await run_controller_with_controller( + config=config, + initial_user_action=message_action, + sid=sid, + runtime=runtime, + agent=agent, + fake_user_response_fn=codeact_user_response, + controller=controller, + initial_state=initial_state, + ) + # if fatal error, throw EvalError to log. 
+ if state is None: + raise EvalException('Final state is None') + if is_fatal_evaluation_error(state.last_error): + raise EvalException('Fatal error detected: ' + state.last_error) + + except Exception as e: + logger.error(f"Error running agent: {e}") + + # get messages from agent history + try: + run_results = process_messages_from_agent_state(agent, state, job_details) # type: ignore + except Exception as e: + logger.error(f"Error while running, failed to retrieve agent messages: {e}") + raise Exception(f"Failed to retrieve agent messages: {str(e)}") + + return { + 'success': not bool(state.last_error if state else True), + 'error': state.last_error if state and state.last_error else None, + 'finish': is_last_action_finish(state), + **run_results, + } + +async def evaluate_agent(run_results: dict, instance: dict, runtime: Runtime): + try: + evaluator = Evaluator(instance, runtime.setup_controller) + score = await evaluator.evaluate() + if score > 0.99: + return {'resolved': True, 'reward': score} + return {'resolved': False, 'reward': score} + except: + return {'resolved': False, 'reward': 0} diff --git a/scripts/tests/test_osworld_utils.py b/scripts/tests/test_osworld_utils.py new file mode 100644 index 000000000..964866a2b --- /dev/null +++ b/scripts/tests/test_osworld_utils.py @@ -0,0 +1,129 @@ +import asyncio + +import numpy as np +import pandas as pd + +from openhands.core.config.llm_config import LLMConfig +from openhands.core.logger import openhands_logger as logger +from openhands.nvidia.os_world.osworld_utils import ( + initialize_agents, + evaluate_agent, +) +from openhands.nvidia.registry import JobDetails +from openhands.nvidia.timer import PausableTimer + + +async def run(instance): + max_iterations = 35 + sampling_params = { + 'model': 'hosted_vllm/Qwen/Qwen3-8B', + 'api_key': 'mykey', + 'modify_params': False, + 'log_completions': True, + 'native_tool_calling': True, + 'temperature': 0.6, + } + llm_config = 
LLMConfig(base_url='http://127.0.0.1:8000/v1', **sampling_params) + + + job_details = JobDetails( + job_id='test_job', + instance=instance, + llm_config=llm_config, + ) + job_details.agent_config['max_iterations'] = max_iterations + job_details.timer = PausableTimer(timeout=500) + job_details.timer.start() + + # run agent + runtime, metadata, config = await initialize_agents( + instance, llm_config=llm_config, agent_config=job_details.agent_config + ) + job_details.runtime = runtime + job_details.metadata = metadata + job_details.config = config + + run_results = {} #await run_agent(job_details) + import pdb; pdb.set_trace() + + + eval_results = await evaluate_agent({}, instance, runtime) + return eval_results + + +if __name__ == '__main__': + instance = { + "id": "06fe7178-4491-4589-810f-2e2bc9502122", + "snapshot": "chrome", + "instruction": "Can you make my computer bring back the last tab I shut down?", + "source": "https://www.wikihow.com/Switch-Tabs-in-Chrome", + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "google-chrome", + "--remote-debugging-port=1337" + ] + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "socat", + "tcp-listen:9222,fork", + "tcp:localhost:1337" + ] + } + }, + { + "type": "chrome_open_tabs", + "parameters": { + "urls_to_open": [ + "https://www.lonelyplanet.com", + "https://www.airbnb.com", + "https://www.tripadvisor.com" + ] + } + }, + { + "type": "chrome_close_tabs", + "parameters": { + "urls_to_close": [ + "https://www.tripadvisor.com" + ] + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "chrome" + ], + "evaluator": { + "func": "is_expected_tabs", + "result": { + "type": "open_tabs_info" + }, + "expected": { + "type": "rule", + "rules": { + "type": "url", + "urls": [ + "https://www.lonelyplanet.com", + "https://www.airbnb.com", + "https://www.tripadvisor.com" + ] + } + } + }, + "proxy": True, + "fixed_ip": False, + "possibility_of_env_change": "low" + } + + # Initialize the 
agents + results = asyncio.run(run(instance)) + + logger.info(f'Run Results: {results}') + logger.info('Agents initialized successfully!') From e0187486cca7ea4a2fd6b546f8639b42018dc36b Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Mon, 24 Nov 2025 16:59:44 -0600 Subject: [PATCH 06/14] Added key files. --- openhands/agenthub/gui_agent/__init__.py | 2 + .../agenthub/gui_agent/function_calling.py | 381 +++++++++++ openhands/agenthub/gui_agent/osworld_agent.py | 343 ++++++++++ .../agenthub/gui_agent/prompts/osworld.py | 10 + .../prompts/system_prompt_osworld.j2 | 96 +++ .../agenthub/gui_agent/tools/__init__.py | 38 ++ .../agenthub/gui_agent/tools/osworld_tools.py | 241 +++++++ .../agenthub/gui_agent/tools/system_tools.py | 39 ++ openhands/events/observation/osworld.py | 33 + openhands/nvidia/os_world/osworld_handler.py | 86 +++ .../utils/accessibility_tree_simplifier.py | 624 ++++++++++++++++++ openhands/utils/ast_process.py | 448 +++++++++++++ 12 files changed, 2341 insertions(+) create mode 100644 openhands/agenthub/gui_agent/function_calling.py create mode 100644 openhands/agenthub/gui_agent/osworld_agent.py create mode 100644 openhands/agenthub/gui_agent/prompts/osworld.py create mode 100644 openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 create mode 100644 openhands/agenthub/gui_agent/tools/__init__.py create mode 100644 openhands/agenthub/gui_agent/tools/osworld_tools.py create mode 100644 openhands/agenthub/gui_agent/tools/system_tools.py create mode 100644 openhands/events/observation/osworld.py create mode 100644 openhands/nvidia/os_world/osworld_handler.py create mode 100644 openhands/utils/accessibility_tree_simplifier.py create mode 100644 openhands/utils/ast_process.py diff --git a/openhands/agenthub/gui_agent/__init__.py b/openhands/agenthub/gui_agent/__init__.py index 540e3e28a..1d9c6a2a7 100644 --- a/openhands/agenthub/gui_agent/__init__.py +++ b/openhands/agenthub/gui_agent/__init__.py @@ -2,5 +2,7 @@ GuiAgent, ) +from 
openhands.agenthub.gui_agent.osworld_agent import OSWorldAgent from openhands.controller.agent import Agent Agent.register('GuiAgent', GuiAgent) +Agent.register('OSWorldAgent', OSWorldAgent) diff --git a/openhands/agenthub/gui_agent/function_calling.py b/openhands/agenthub/gui_agent/function_calling.py new file mode 100644 index 000000000..12688e476 --- /dev/null +++ b/openhands/agenthub/gui_agent/function_calling.py @@ -0,0 +1,381 @@ +"""This file contains the function calling implementation for different actions. + +This is similar to the functionality of `CodeActResponseParser`. +""" + +import json + +from litellm import ( + ModelResponse, +) + +from openhands.agenthub.gui_agent.tools import ( + ClickTool, + RightClickTool, + MiddleClickTool, + DoubleClickTool, + TripleClickTool, + MoveToTool, + DragToTool, + ScrollTool, + HorizontalScrollTool, + WriteTool, + PressTool, + HotkeyTool, + FailTool, + FinishTool, + WaitTool, +) +from openhands.core.exceptions import ( + FunctionCallNotExistsError, + FunctionCallValidationError, +) +from openhands.core.logger import openhands_logger as logger +from openhands.events.action import ( + Action, + AgentFinishAction, + MessageAction, +) +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.events.tool import ToolCallMetadata + + +def combine_thought(action: Action, thought: str) -> Action: + if not hasattr(action, 'thought'): + return action + if thought and action.thought: + action.thought = f'{thought}\n{action.thought}' + elif thought: + action.thought = thought + return action + + +def response_to_actions( + response: ModelResponse, + mcp_tool_names: list[str] | None = None, + timeout: float | None = None, +) -> Action: + assert len(response.choices) == 1, 'Only one choice is supported for now' + choice = response.choices[0] + assistant_msg = choice.message + if hasattr(assistant_msg, 'tool_calls') and assistant_msg.tool_calls: + # Check if there's assistant_msg.content. 
If so, add it to the thought + thought = '' + if isinstance(assistant_msg.content, str): + thought = assistant_msg.content + elif isinstance(assistant_msg.content, list): + for msg in assistant_msg.content: + if msg['type'] == 'text': + thought += msg['text'] + + # Process each tool call to OpenHands action + tool_call = assistant_msg.tool_calls[0] + action: Action + logger.debug(f'Tool call in function_calling.py: {tool_call}') + try: + arguments = json.loads(tool_call.function.arguments) + except json.decoder.JSONDecodeError as e: + raise FunctionCallValidationError( + f'Failed to parse tool call arguments: {tool_call.function.arguments}' + ) from e + + # ================================================ + # ClickTool + # ================================================ + + if tool_call.function.name == ClickTool['function']['name']: + if 'x' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "x" in tool call {tool_call.function.name}' + ) + if 'y' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "y" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'CLICK', + 'parameters': arguments, + } + }, + ) + # ================================================ + # RightClickTool + # ================================================ + elif tool_call.function.name == RightClickTool['function']['name']: + if 'x' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "x" in tool call {tool_call.function.name}' + ) + if 'y' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "y" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'RIGHT_CLICK', + 'parameters': arguments, + } + }, + ) + # ================================================ + # 
MiddleClickTool + # ================================================ + elif tool_call.function.name == MiddleClickTool['function']['name']: + if 'x' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "x" in tool call {tool_call.function.name}' + ) + if 'y' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "y" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'MIDDLE_CLICK', + 'parameters': arguments, + } + }, + ) + # ================================================ + # DoubleClickTool + # ================================================ + elif tool_call.function.name == DoubleClickTool['function']['name']: + if 'x' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "x" in tool call {tool_call.function.name}' + ) + if 'y' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "y" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'DOUBLE_CLICK', + 'parameters': arguments, + } + }, + ) + # ================================================ + # TripleClickTool + # ================================================ + elif tool_call.function.name == TripleClickTool['function']['name']: + if 'x' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "x" in tool call {tool_call.function.name}' + ) + if 'y' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "y" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'TRIPLE_CLICK', + 'parameters': arguments, + } + }, + ) + # ================================================ + # MoveToTool + # 
================================================ + elif tool_call.function.name == MoveToTool['function']['name']: + if 'x' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "x" in tool call {tool_call.function.name}' + ) + if 'y' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "y" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'MOVE_TO', + 'parameters': arguments, + } + }, + ) + # ================================================ + # DragToTool + # ================================================ + elif tool_call.function.name == DragToTool['function']['name']: + if 'x' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "x" in tool call {tool_call.function.name}' + ) + if 'y' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "y" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'DRAG_TO', + 'parameters': arguments, + } + }, + ) + # ================================================ + # ScrollTool + # ================================================ + elif tool_call.function.name == ScrollTool['function']['name']: + if 'amount' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "amount" in tool call {tool_call.function.name}' + ) + # Map vertical scroll amount to dy; dx defaults to 0 + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'SCROLL', + 'parameters': arguments, + } + }, + ) + # ================================================ + # HorizontalScrollTool + # ================================================ + elif tool_call.function.name == HorizontalScrollTool['function']['name']: + if 'amount' not in arguments: + 
raise FunctionCallValidationError( + f'Missing required argument "amount" in tool call {tool_call.function.name}' + ) + # Map vertical scroll amount to dy; dx defaults to 0 + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'SCROLL', + 'parameters': arguments, + } + }, + ) + # ================================================ + # WriteTool + # ================================================ + elif tool_call.function.name == WriteTool['function']['name']: + if 'text' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "text" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'TYPING', + 'parameters': arguments, + } + }, + ) + # ================================================ + # PressTool + # ================================================ + elif tool_call.function.name == PressTool['function']['name']: + if 'key' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "key" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'PRESS', + 'parameters': arguments, + } + }, + ) + # ================================================ + # HotkeyTool + # ================================================ + elif tool_call.function.name == HotkeyTool['function']['name']: + if 'keys' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "keys" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'HOTKEY', + 'parameters': arguments, + } + }, + ) + # ================================================ + # FailTool + # ================================================ + elif tool_call.function.name == 
FailTool['function']['name']: + action = AgentFinishAction( + task_completed='false', + ) + # ================================================ + # FinishTool + # ================================================ + elif tool_call.function.name == FinishTool['function']['name']: + action = AgentFinishAction( + task_completed='true', + ) + # ================================================ + # WaitTool + # ================================================ + elif tool_call.function.name == WaitTool['function']['name']: + if 'seconds' not in arguments: + raise FunctionCallValidationError( + f'Missing required argument "seconds" in tool call {tool_call.function.name}' + ) + action = OSWorldInteractiveAction( + method='execute_agentic_action', + params={ + 'action': { + 'action_type': 'WAIT', + 'parameters': arguments, + } + }, + ) + else: + raise FunctionCallNotExistsError( + f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.' + ) + + + action = combine_thought(action, thought) + # Add metadata for tool calling + action.tool_call_metadata = ToolCallMetadata( + tool_call_id=tool_call.id, + function_name=tool_call.function.name, + model_response=response, + total_calls_in_response=len(assistant_msg.tool_calls), + ) + else: + action = MessageAction( + content=str(assistant_msg.content) if assistant_msg.content else '', + wait_for_response=True, + ) + action.tool_call_metadata = ToolCallMetadata( + tool_call_id='', + function_name='', + model_response=response, + total_calls_in_response=0, + ) + + # Add response id to actions + # This will ensure we can match both actions without tool calls (e.g. MessageAction) + # and actions with tool calls (e.g. CmdRunAction, IPythonRunCellAction, etc.) 
+ # with the token usage data + action.response_id = response.id + if timeout: + action.set_hard_timeout(timeout) + return action diff --git a/openhands/agenthub/gui_agent/osworld_agent.py b/openhands/agenthub/gui_agent/osworld_agent.py new file mode 100644 index 000000000..74cfedf2e --- /dev/null +++ b/openhands/agenthub/gui_agent/osworld_agent.py @@ -0,0 +1,343 @@ +import os +from jinja2 import Template + +from openhands.agenthub.gui_agent.tools import OSWORLD_TOOLS + +from openhands.controller.agent import Agent +from openhands.controller.state.state import State +from openhands.core.config import AgentConfig +from openhands.core.logger import openhands_logger as logger +from openhands.core.message import ImageContent, Message, TextContent +from openhands.events.action import ( + Action, + AgentFinishAction, + MessageAction, +) +import openhands.agenthub.gui_agent.function_calling as codeact_function_calling +from openhands.events.action.os import OSWorldInteractiveAction +from openhands.events.event import EventSource, Event +from openhands.events.observation import OSWorldOutputObservation +from openhands.events.observation import ErrorObservation +from openhands.llm.llm import LLM +from openhands.runtime.plugins import ( + PluginRequirement, +) +from openhands.core.exceptions import ( + AgentFormatError, + AgentEndThinkError, + AgentToolCallError, + AgentLengthError, +) + +from openhands.agenthub.gui_agent.prompts.osworld import OSWORLD_OBSERVATION_FEEDBACK_PROMPT, ERROR_OBSERVATION_FEEDBACK_PROMPT + +def get_instruction(action: MessageAction) -> str: + if 'Instruction:' in action.content: + return action.content.split('Instruction:')[1].strip() + return action.content + +def convert_action_to_message(action: OSWorldInteractiveAction) -> Message: + # These are action from the LLM + # Only have text format + tool_metadata = action.tool_call_metadata + llm_response = tool_metadata.model_response + assistant_msg = getattr(llm_response.choices[0], 'message') + 
input_ids = getattr(llm_response.choices[0], 'input_ids', None) + output_ids = getattr(llm_response.choices[0], 'output_ids', None) + logprobs = getattr(llm_response.choices[0], 'logprobs', None) + + return Message( + role=getattr(assistant_msg, 'role', 'assistant'), + content=[TextContent(text=assistant_msg.content)], + tool_calls=assistant_msg.tool_calls, + input_ids=input_ids, + output_ids=output_ids, + logprobs=logprobs, + ) + +def convert_message_action_to_message( + action: MessageAction, + include_a11y_tree: bool = True, + include_screenshot: bool = True, + ) -> Message: + text_content = action.content + if include_a11y_tree: + text_content += f"\n\nAccessibility Tree: {action.accessibility_tree}" + content = [TextContent(text=text_content)] + if include_screenshot: + content.append(ImageContent(image_urls=action.image_urls)) + return Message( + role='user', + content=content, + ) + +def convert_observation_to_message( + observation: OSWorldOutputObservation | ErrorObservation, + instruction: str, + include_a11y_tree: bool = True, + include_screenshot: bool = True, + ) -> Message: + if isinstance(observation, OSWorldOutputObservation): + prompt_text = OSWORLD_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction) + if include_a11y_tree: + prompt_text += f"\n\nAccessibility Tree: {observation.accessibility_tree}" + content = [TextContent(text=prompt_text)] + if include_screenshot: + content.append(ImageContent(image_urls=observation.image_urls)) + return Message( + role='tool', # or user? + content=content, + tool_call_id=observation.tool_call_id, + name=observation.name, + ) + else: + prompt_text = ERROR_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction, error_message=observation.content) + return Message( + role='tool', # or user? 
+ content=[TextContent(text=observation.content)], + tool_call_id=observation.tool_call_id, + name=observation.name, + ) + +def convert_message_action_to_message_full_state( + action: MessageAction, + include_a11y_tree: bool = True, + ) -> Message: + test_content = action.content + if include_a11y_tree: + test_content += f"\n\nAccessibility Tree: {action.accessibility_tree}" + content = [TextContent(text=action.content)] + content.append(ImageContent(image_urls=action.image_urls)) + content.append(TextContent(text=action.accessibility_tree)) + return Message( + role='user', + content=content, + ) + +def convert_observation_to_message_full_state( + observation: OSWorldOutputObservation | ErrorObservation, + instruction: str, + include_a11y_tree: bool = True, + ) -> Message: + if isinstance(observation, OSWorldOutputObservation): + prompt_text = OSWORLD_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction) + if include_a11y_tree: + prompt_text += f"\n\nAccessibility Tree: {observation.accessibility_tree}" + content = [TextContent(text=prompt_text)] + + # We always add screenshot and accessibility tree to the message + content.append(ImageContent(image_urls=observation.image_urls)) + content.append(TextContent(text=observation.accessibility_tree)) + return Message( + role='tool', # or user? + content=content, + tool_call_id=observation.tool_call_id, + name=observation.name, + ) + else: + prompt_text = ERROR_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction, error_message=observation.content) + return Message( + role='tool', # or user? + content=[TextContent(text=observation.content)], + tool_call_id=observation.tool_call_id, + name=observation.name, + ) + +class OSWorldAgent(Agent): + VERSION = '1.0' + """ + OSWorldAgent that operates on the OSWorld virtual machine. + """ + + sandbox_plugins: list[PluginRequirement] = [] + + def __init__( + self, + llm: LLM, + config: AgentConfig, + ) -> None: + """Initializes a new instance of the GuiAgent class. 
+ + Parameters: + - llm (LLM): The llm to be used by this agent + """ + super().__init__(llm, config) + + self.system_prompt = os.path.join(os.path.dirname(__file__), 'prompts', 'system_prompt_osworld.j2') + with open(self.system_prompt, 'r') as file: + self.system_prompt = file.read() + self.system_prompt = Template(self.system_prompt).render(CLIENT_PASSWORD='password') + + self.tools = OSWORLD_TOOLS + + # enable vision for all models + if isinstance(self.llm.model_info, dict): + self.llm.model_info['supports_vision'] = True + else: + self.llm.model_info = {'supports_vision': True} + + self.reset() + + def reset(self) -> None: + """Resets the GuiAgent.""" + super().reset() + self.cost_accumulator = 0 + self.error_accumulator = 0 + + def _get_initial_user_message(self, history: list[Event]) -> MessageAction: + """Get the initial user message from the conversation history. + + Args: + history: List of events from the conversation + + Returns: + MessageAction: The initial user message + + Raises: + ValueError: If no initial user message is found + """ + initial_user_message = None + + for event in history: + if isinstance(event, MessageAction) and event.source == 'user': + initial_user_message = event + break + + if initial_user_message is None: + # This should not happen in a valid conversation + logger.error( + f'CRITICAL: Could not find the initial user MessageAction in the full {len(history)} events history.' + ) + raise ValueError( + 'Could not find the initial user MessageAction in the conversation history.' + ) + return initial_user_message + + def _get_messages( + self, events: list[Event], initial_user_message: MessageAction, + ) -> list[Message]: + """Constructs the message history for the LLM conversation. + + This method builds a structured conversation history by processing events from the state + and formatting them into messages that the LLM can understand, similar to how the step + method constructs messages but for the full conversation history. 
+ + Args: + events: The list of events to convert to messages + initial_user_message: The initial user message action + + Returns: + list[Message]: A list of formatted messages ready for LLM consumption + """ + messages: list[Message] = [] + + # System message + messages.append(Message(role='system', content=[TextContent(text=self.system_prompt)])) + + # Get instruction from initial user message + # User message is a MessageAction with content and image_urls, will be processed in events + instruction = get_instruction(initial_user_message) + include_a11y_tree = self.config.enable_a11y_tree + include_screenshot = self.config.enable_vision + + # Build history prompts (alternating assistant/user messages) + for event in events: + if isinstance(event, OSWorldInteractiveAction): + messages.append(convert_action_to_message(event)) + elif isinstance(event, MessageAction): + messages.append(convert_message_action_to_message( + event, include_a11y_tree=include_a11y_tree, include_screenshot=include_screenshot)) + elif isinstance(event, OSWorldOutputObservation) or isinstance(event, ErrorObservation): + msg = convert_observation_to_message( + event, instruction, include_a11y_tree=include_a11y_tree, include_screenshot=include_screenshot) + messages.append(msg) + + return messages + + def step(self, state: State) -> Action: + """Performs one step using the GuiAgent. + + This includes gathering information on previous steps and prompting the model to make a browsing command to execute. + + Parameters: + - state (State): used to get updated info + + Returns: + - OSWorldInteractiveAction(browsergym_command) - BrowserGym commands to run + - MessageAction(content) - Message action to run (e.g. 
ask for clarification) + - AgentFinishAction() - end the interaction + """ + + format_error = state.get_last_agent_format_error() + if format_error and isinstance(format_error, str): + if format_error == 'length': + raise AgentLengthError("LLM did not format the response properly") + elif format_error == 'tool_call': + raise AgentToolCallError("LLM did not format the tool call properly") + elif format_error != '': + logger.error(f"Unknown format error: {format_error}, continue") + + # check if last thought is properly ended + # reuse stuck in loop error to exit agent loop + if self.config.ensure_thinking_end_properly: + latest_agent_thought = state.get_last_agent_thought() + if latest_agent_thought and '' in latest_agent_thought and '' not in latest_agent_thought: + raise AgentEndThinkError("LLM does not end properly reasoning properly") + + initial_user_message = self._get_initial_user_message(state.history) + messages = self._get_messages(state.history, initial_user_message) + params: dict = { + 'messages': self.llm.format_messages_for_llm(messages), + } + params['tools'] = self.tools + params['extra_body'] = {'metadata': state.to_llm_metadata(agent_name=self.name)} + response = self.llm.completion(**params) + import pdb; pdb.set_trace() + logger.debug(f'Response from LLM: {response}') + action = codeact_function_calling.response_to_actions(response, timeout=self.config.action_timeout) + logger.debug(f'Actions after response_to_actions: {action}') + return action + + def _get_messages_from_agent_state( + self, events: list[Event], initial_user_message: MessageAction, + ) -> list[dict]: + """This message although similar to _get_messages, is used to process the messages from the agent state. + Key difference is to preserve all the content items in the message, including image and accessibility tree. + Used in process_messages_from_agent_state. 
+ + Args: + events: The list of events to convert to messages + initial_user_message: The initial user message action + + Returns: + list[dict]: A list of formatted messages ready for LLM consumption + """ + messages: list[Message] = [] + + # System message + messages.append(Message(role='system', content=[TextContent(text=self.system_prompt)])) + + # Get instruction from initial user message + # User message is a MessageAction with content and image_urls, will be processed in events + instruction = get_instruction(initial_user_message) + include_a11y_tree = self.config.enable_a11y_tree + + # Build history prompts (alternating assistant/user messages) + for event in events: + if isinstance(event, OSWorldInteractiveAction): + messages.append(convert_action_to_message(event)) + elif isinstance(event, MessageAction): + messages.append(convert_message_action_to_message_full_state(event, include_a11y_tree=include_a11y_tree)) + elif isinstance(event, OSWorldOutputObservation) or isinstance(event, ErrorObservation): + msg = convert_observation_to_message_full_state( + event, instruction, include_a11y_tree=include_a11y_tree) + messages.append(msg) + + # set flags to know how to serialize the messages + for message in messages: + message.cache_enabled = False + message.vision_enabled = True + message.function_calling_enabled = True + + # let pydantic handle the serialization + return [message.model_dump() for message in messages] \ No newline at end of file diff --git a/openhands/agenthub/gui_agent/prompts/osworld.py b/openhands/agenthub/gui_agent/prompts/osworld.py new file mode 100644 index 000000000..16bdb9173 --- /dev/null +++ b/openhands/agenthub/gui_agent/prompts/osworld.py @@ -0,0 +1,10 @@ +OSWORLD_OBSERVATION_FEEDBACK_PROMPT = """Action executed. Please generate the next move according to the UI screenshot and instruction. And you can refer to the previous actions and observations for reflection. 
+ +Instruction: {instruction} +""" + +ERROR_OBSERVATION_FEEDBACK_PROMPT = """Action failed. Please refer to previous message for the UI screenshot. Please continue working on the task according to the instruction. +Error message: {error_message}. + +Instruction: {instruction} +""" \ No newline at end of file diff --git a/openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 b/openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 new file mode 100644 index 000000000..4a9feaa3d --- /dev/null +++ b/openhands/agenthub/gui_agent/prompts/system_prompt_osworld.j2 @@ -0,0 +1,96 @@ +You are an autonomous agent that follows my instructions to perform desktop computer tasks. You control the mouse and keyboard on an **Ubuntu** system with a screen resolution of 1920x1080. +At each step, you will receive a screenshot representing the current computer state, and you must decide the next action based on that screenshot. + +You have strong computer knowledge and a stable internet connection. All actions you output will be executed on the machine. + +--- + +## Core Rules (VERY IMPORTANT) + +### 1. Action Correctness + +- **Do NOT repeat previous actions**, especially if they did not achieve the expected result. + Instead, reassess the new screenshot and adjust your action or coordinates accordingly. +- **Do NOT predict multiple actions at once.** + Only choose actions based on what is currently visible. Do not click things that are not yet shown on the screen. +- You cannot complete tasks by outputting text alone. + You must use mouse and keyboard actions to interact with the computer. + +--- + +### 2. UI Update Handling + +- If the UI likely **has not updated yet**, you may issue **one `wait()`** action to allow the interface to refresh. +- **Do NOT call `wait()` consecutively.** + If the UI still does not change after one wait: + - The previous action may have missed or been incorrect, **or** + - The action was correct but did not produce a visible UI change. 
+ +In such cases, reassess and choose a different action. + +--- + +### 3. Task Completion & Failure + +- If the task becomes impossible to complete, use the **fail()** action. + Use it only when all reasonable methods have been exhausted. +- When the task is successfully completed, use **finish()**. + +--- + +### 4. Sudo Password + +My computer’s password is **`{CLIENT_PASSWORD}`**. +Use it whenever `sudo` rights are required. + +--- + +## Response Structure Requirements + +Each response **must** follow this structure: + +--- + +### Observation: +Provide a detailed description of the screenshot, including: +- Visible UI elements +- Buttons, menus, icons, windows, dialogs +- Pop-ups, notifications, warnings, errors +- Loading indicators or anything affecting progress + +Include **all relevant details**. + +--- + +### Thought: +Provide detailed reasoning before choosing an action. + +#### Step-by-Step Progress Assessment +- Summarize what has been accomplished so far. +- Identify unexpected outcomes or errors. +- If the previous action seems incorrect, explain how to recover. + +#### Next Action Analysis +- List possible next actions based on the current screen. +- Evaluate them considering the current state and previous actions. +- Select the **most logical** action. +- Anticipate the consequence of that action. + +--- + +### Action: +Output exactly **one** action using the allowed tools: +- Mouse actions +- Keyboard actions +- `wait()` +- `fail()` +- `finish()` + +Only **one** tool call per step. + +--- + +## Final Notes + +- Your decisions must be grounded **strictly in the screenshot**. +- Do not assume any UI that is not visible. 
\ No newline at end of file diff --git a/openhands/agenthub/gui_agent/tools/__init__.py b/openhands/agenthub/gui_agent/tools/__init__.py new file mode 100644 index 000000000..ef9a157ca --- /dev/null +++ b/openhands/agenthub/gui_agent/tools/__init__.py @@ -0,0 +1,38 @@ +from .osworld_tools import ( + ClickTool, + RightClickTool, + MiddleClickTool, + DoubleClickTool, + TripleClickTool, + MoveToTool, + DragToTool, + ScrollTool, + HorizontalScrollTool, + WriteTool, + PressTool, + HotkeyTool, +) +from .system_tools import ( + FailTool, + FinishTool, + WaitTool, +) + +OSWORLD_TOOLS = [ + ClickTool, + RightClickTool, + MiddleClickTool, + DoubleClickTool, + TripleClickTool, + MoveToTool, + DragToTool, + ScrollTool, + HorizontalScrollTool, + WriteTool, + PressTool, + HotkeyTool, + WaitTool, + FailTool, + FinishTool, +] + diff --git a/openhands/agenthub/gui_agent/tools/osworld_tools.py b/openhands/agenthub/gui_agent/tools/osworld_tools.py new file mode 100644 index 000000000..6da2e9d81 --- /dev/null +++ b/openhands/agenthub/gui_agent/tools/osworld_tools.py @@ -0,0 +1,241 @@ +from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk + +# Not supported: KeyDownTool, KeyUpTool, MouseDownTool, MouseUpTool + +# Click at (x, y) +ClickTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='click', + description='Click at the given screen coordinates (x, y).', + parameters={ + 'type': 'object', + 'properties': { + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized Y coordinate in range [0,1]'}, + 'clicks': {'type': 'integer', 'minimum': 1, 'default': 1, 'description': 'Number of clicks to perform (default 1)'}, + 'interval': {'type': 'number', 'default': 0.0, 'description': 'Seconds between clicks when clicks > 1 (default 
0.0)'}, + 'button': {'type': 'string', 'default': 'left', 'description': "Mouse button to use: 'left' | 'middle' | 'right' (default 'left')"}, + 'duration': {'type': 'number', 'default': 0.0, 'description': 'Seconds to take moving to the target before clicking (default 0.0)'}, + }, + 'required': ['x', 'y'], + 'additionalProperties': False, + }, + ), +) + +# Middle click at (x, y) +MiddleClickTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='middleClick', + description='Middle-click at the given screen coordinates (x, y).', + parameters={ + 'type': 'object', + 'properties': { + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized Y coordinate in range [0,1]'}, + 'interval': {'type': 'number', 'default': 0.0, 'description': 'Seconds between clicks (default 0.0)'}, + 'duration': {'type': 'number', 'default': 0.0, 'description': 'Seconds to take moving to the target before clicking (default 0.0)'}, + }, + 'required': ['x', 'y'], + 'additionalProperties': False, + }, + ), +) + +# Double click at (x, y) +DoubleClickTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='doubleClick', + description='Double-click at the given screen coordinates (x, y).', + parameters={ + 'type': 'object', + 'properties': { + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized Y coordinate in range [0,1]'}, + 'button': {'type': 'string', 'default': 'left', 'description': "Mouse button to use: 'left' | 'middle' | 'right' (default 'left')"}, + 'interval': {'type': 'number', 'default': 0.0, 'description': 'Seconds between 
the two clicks (default 0.0)'}, + 'duration': {'type': 'number', 'default': 0.0, 'description': 'Seconds to take moving to the target before clicking (default 0.0)'}, + }, + 'required': ['x', 'y'], + 'additionalProperties': False, + }, + ), +) + +# Triple click at (x, y) +TripleClickTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='tripleClick', + description='Triple-click at the given screen coordinates (x, y).', + parameters={ + 'type': 'object', + 'properties': { + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized Y coordinate in range [0,1]'}, + 'button': {'type': 'string', 'default': 'left', 'description': "Mouse button to use: 'left' | 'middle' | 'right' (default 'left')"}, + 'interval': {'type': 'number', 'default': 0.0, 'description': 'Seconds between clicks (default 0.0)'}, + 'duration': {'type': 'number', 'default': 0.0, 'description': 'Seconds to take moving to the target before clicking (default 0.0)'}, + }, + 'required': ['x', 'y'], + 'additionalProperties': False, + }, + ), +) + +# Right click at (x, y) +RightClickTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='rightClick', + description='Right-click at the given screen coordinates (x, y).', + parameters={ + 'type': 'object', + 'properties': { + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized Y coordinate in range [0,1]'}, + 'interval': {'type': 'number', 'default': 0.0, 'description': 'Seconds between clicks (default 0.0)'}, + 'duration': {'type': 'number', 'default': 0.0, 'description': 'Seconds to take moving to 
the target before clicking (default 0.0)'}, + }, + 'required': ['x', 'y'], + 'additionalProperties': False, + }, + ), +) + +# Move mouse to (x, y) +MoveToTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='moveTo', + description='Move the mouse pointer to the given screen coordinates (x, y).', + parameters={ + 'type': 'object', + 'properties': { + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized Y coordinate in range [0,1]'}, + 'duration': {'type': 'number', 'default': 0.0, 'description': 'Seconds to take moving to the target (default 0.0)'}, + }, + 'required': ['x', 'y'], + 'additionalProperties': False, + }, + ), +) + +# Drag mouse to (x, y) +DragToTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='dragTo', + description='Drag the mouse pointer to the given screen coordinates (x, y).', + parameters={ + 'type': 'object', + 'properties': { + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'Normalized Y coordinate in range [0,1]'}, + 'duration': {'type': 'number', 'default': 0.0, 'description': 'Seconds to take dragging to the target (default 0.0)'}, + 'button': {'type': 'string', 'default': 'left', 'description': "Mouse button to hold while dragging: 'left' | 'middle' | 'right' (default 'left')"}, + 'mouseDownUp': {'type': 'boolean', 'default': True, 'description': ' When true, the mouseUp/Down actions are not performed. 
Which allows dragging over multiple (small) actions (default True)'}, + }, + 'required': ['x', 'y'], + 'additionalProperties': False, + }, + ), +) + +# Scroll by amount (positive = up, negative = down) +ScrollTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='scroll', + description='Scroll vertically by the given amount. Positive scrolls up; negative scrolls down.', + parameters={ + 'type': 'object', + 'properties': { + 'amount': {'type': 'integer', 'default': 1, 'description': 'Scroll amount (positive up, negative down). Default 1.'}, + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'The x position on the screen where the click happens. None by default. If provided, it will scroll at the given x position. Optional normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'The y position on the screen where the click happens. None by default. If provided, it will scroll at the given y position. Optional normalized Y coordinate in range [0,1]'}, + }, + 'required': ['amount'], + 'additionalProperties': False, + }, + ), +) + +HorizontalScrollTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='hscroll', + description='Scroll horizontally by the given amount. Positive scrolls right; negative scrolls left.', + parameters={ + 'type': 'object', + 'properties': { + 'amount': {'type': 'integer', 'default': 1, 'description': 'Scroll amount (positive left, negative right). Default 1.'}, + 'x': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'The x position on the screen where the click happens. None by default. If provided, it will scroll at the given x position. 
Optional normalized X coordinate in range [0,1]'}, + 'y': {'type': 'number', 'format': 'float', 'minimum': 0.0, 'maximum': 1.0, 'description': 'The y position on the screen where the click happens. None by default. If provided, it will scroll at the given y position. Optional normalized Y coordinate in range [0,1]'}, + }, + 'required': ['amount'], + 'additionalProperties': False, + }, + ), +) + +# Type text +WriteTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='write', + description='Type the given text.', + parameters={ + 'type': 'object', + 'properties': { + 'text': {'type': 'string', 'description': 'The text to type'}, + 'interval': {'type': 'number', 'default': 0.0, 'description': 'Seconds between each character (default 0.0)'}, + }, + 'required': ['text'], + 'additionalProperties': False, + }, + ), +) + +# Press a key (optionally multiple times) +PressTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='press', + description='Press a single key one or more times.', + parameters={ + 'type': 'object', + 'properties': { + 'key': {'type': 'string', 'description': 'The key to press (e.g., enter, esc, tab)'}, + }, + 'required': ['key'], + 'additionalProperties': False, + }, + ), +) + +# Press a combination of keys as a hotkey +HotkeyTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='hotkey', + description='Press a combination of keys together as a hotkey.', + parameters={ + 'type': 'object', + 'properties': { + 'keys': { + 'type': 'array', + 'description': 'Ordered list of keys to press together', + 'items': {'type': 'string'}, + 'minItems': 2, + }, + }, + 'required': ['keys'], + 'additionalProperties': False, + }, + ), +) diff --git a/openhands/agenthub/gui_agent/tools/system_tools.py b/openhands/agenthub/gui_agent/tools/system_tools.py new file mode 100644 index 000000000..ecbfc067d --- /dev/null 
+++ b/openhands/agenthub/gui_agent/tools/system_tools.py @@ -0,0 +1,39 @@ +from litellm import ChatCompletionToolParam, ChatCompletionToolParamFunctionChunk + +FailTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='fail', + description="Use this tool if you can not complete the task. Do not easily use this tool, try your best to do the task.", + parameters={ + 'type': 'object', + 'properties': {}, + }, + ), +) + +FinishTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='finish', + description="Use this tool when you have successfully completed the user's requested task", + parameters={ + 'type': 'object', + 'properties': {}, + }, + ), +) + +WaitTool = ChatCompletionToolParam( + type='function', + function=ChatCompletionToolParamFunctionChunk( + name='wait', + description="Use this tool when you think the screen UI is not updated. Waiting for 1 second is usually enough for the UI to update.", + parameters={ + 'type': 'object', + 'properties': { + 'seconds': {'type': 'integer', 'default': 1, 'description': 'The number of seconds to wait. 
Default 1.'}, + }, + }, + ), +) \ No newline at end of file diff --git a/openhands/events/observation/osworld.py b/openhands/events/observation/osworld.py new file mode 100644 index 000000000..e1da07ac7 --- /dev/null +++ b/openhands/events/observation/osworld.py @@ -0,0 +1,33 @@ +from dataclasses import dataclass, field +from openhands.core.schema import ObservationType +from openhands.events.observation.observation import Observation + + +@dataclass +class OSWorldOutputObservation(Observation): + """This data class represents the output of a browser.""" + + observation: str = ObservationType.OSWORLD + command: str = field(default='') + content: str = field(default='') + screenshot: str = field(repr=False, default='') # don't show in repr, in base64 format + accessibility_tree: str = field(repr=False, default='') + tool_call_id: str | None = None + name: str = '' + + @property + def message(self) -> str: + return f'OSWorld action {self.command}' + + @property + def image_urls(self) -> list[str]: + return [f'data:image/png;base64,{self.screenshot}'] + + def __str__(self) -> str: + ret = ( + '**OSWorldOutputObservation**\n' + f'Action: {self.command}\n' + f'Observation: {self.accessibility_tree}' + ) + return ret + \ No newline at end of file diff --git a/openhands/nvidia/os_world/osworld_handler.py b/openhands/nvidia/os_world/osworld_handler.py new file mode 100644 index 000000000..5e4294c18 --- /dev/null +++ b/openhands/nvidia/os_world/osworld_handler.py @@ -0,0 +1,86 @@ +from typing import Any, Optional +from evaluation.utils.shared import EvalMetadata # type: ignore +from openhands.core.config import OpenHandsConfig +from openhands.core.config.llm_config import LLMConfig +from openhands.nvidia.registry import AgentHandler, JobDetails +from openhands.runtime.base import Runtime +from openhands.nvidia.os_world.osworld_utils import ( + initialize_exception, + run_exception, + eval_exception, +) +from openhands.nvidia.utils import final_result as utils_final_result + 
# Import the existing functions from utils
from openhands.nvidia.os_world.osworld_utils import (  # type: ignore
    initialize_agents,
    run_agent,
    evaluate_agent,
)

from openhands.nvidia.reward import Reward


class OSWorldHandler(AgentHandler):
    """Handler for OSWorld agent integration, reusing functions from osworld_utils."""

    @property
    def name(self) -> str:
        """The registry name identifier for this agent handler."""
        return "osworld"

    async def init(
        self,
        job_details: JobDetails,
        sid: str | None = None,
    ) -> tuple[Runtime, EvalMetadata, OpenHandsConfig]:
        """Initialize the OSWorld agent runtime and config using utils functions.

        Args:
            job_details: Job descriptor carrying the task instance and configs.
            sid: Optional session id forwarded to the runtime.
        """
        instance = job_details.instance
        llm_config = job_details.llm_config

        return await initialize_agents(
            instance=instance,
            llm_config=llm_config,
            sid=sid,
            agent_config=job_details.agent_config,
        )

    async def run(
        self,
        job_details: JobDetails,
        sid: str | None = None,
    ) -> dict[str, object]:
        """Run the agent for this job; delegates to osworld_utils.run_agent."""
        return await run_agent(
            job_details=job_details,
            sid=sid,
        )

    async def eval(
        self, job_details: JobDetails,
        sid: str | None = None,
        allow_skip: bool = True,
        reward: Optional[Reward] = None,
    ) -> dict[str, Any]:
        """Evaluate a finished run via osworld_utils.evaluate_agent.

        ``sid`` and ``allow_skip`` are accepted for interface compatibility with
        other handlers but are not used by the OSWorld evaluation path; ``reward``
        is required (only its presence is checked here).
        """
        if reward is None:
            # Was: '... evaluation of math problems.' — copy-paste from another
            # handler; this handler evaluates OSWorld tasks.
            raise ValueError('Reward is required for evaluation of OSWorld tasks.')

        return await evaluate_agent(
            run_results=job_details.run_results,
            instance=job_details.instance,
        )

    def init_exception(self, job_details: JobDetails, exception: Exception) -> dict[str, Any]:
        """Handle exceptions during initialization using utils functions."""
        return initialize_exception(job_details, exception)

    def run_exception(self, job_details: JobDetails, exception: Exception) -> dict[str, Any]:
        """Handle exceptions during run using utils functions."""
        return run_exception(job_details, exception)

    def eval_exception(self, job_details: JobDetails, exception: Exception) -> dict[str, Any]:
        """Handle exceptions during evaluation using utils functions."""
        return eval_exception(job_details, exception)

    def final_result(self, job_details: JobDetails) -> dict[str, Any]:
        """Process final results using utils functions."""
        return utils_final_result(job_details)
"""
Accessibility Tree Simplifier for AI Agents

This module provides utilities to simplify complex accessibility trees (AST)
into AI-agent-friendly formats. It extracts actionable elements, their coordinates,
and provides context about the desktop environment.

The simplified format helps AI agents:
1. Understand the context of desktop windows and applications
2. Identify actionable items and their types
3. Know how to operate on elements (coordinates, typing locations, etc.)
"""

import re
from typing import Dict, List, Any, Optional, Tuple, Set
from collections import defaultdict
import lxml.etree as etree


class AccessibilityTreeSimplifier:
    """Simplifies accessibility trees for AI agent consumption."""

    # Roles an agent can meaningfully interact with (clickable and/or typeable).
    ACTIONABLE_ROLES = {
        'push-button', 'toggle-button', 'radio-button', 'check-box',
        'button', 'menu', 'menu-item', 'text', 'entry', 'password-text',
        'combo-box', 'list-item', 'tree-item', 'slider', 'scroll-bar',
        'tab', 'link', 'terminal', 'document-text', 'document-web',
        'tool-bar', 'split-pane', 'page-tab', 'check-menu-item',
        'radio-menu-item', 'spin-button', 'tree-table', 'table-cell',
        'icon', 'canvas', 'drawing-area', 'paragraph', 'label'
    }

    # Roles that accept a mouse click.
    CLICKABLE_ROLES = {
        'push-button', 'toggle-button', 'radio-button', 'check-box',
        'button', 'menu', 'menu-item', 'link', 'tab', 'combo-box',
        'list-item', 'tree-item', 'icon', 'tool-bar', 'page-tab',
        'check-menu-item', 'radio-menu-item', 'spin-button', 'scroll-bar',
        'slider', 'tree-table', 'table-cell', 'canvas', 'label'
    }

    # Roles that accept keyboard input.
    TYPEABLE_ROLES = {
        'text', 'entry', 'password-text', 'terminal', 'document-text',
        'document-web', 'spin-button', 'combo-box', 'paragraph'
    }

    # XML namespace URIs used by each platform's accessibility dump.
    # NOTE: the 'windows' and 'macos' maps intentionally have no 'val' entry.
    NAMESPACES = {
        'ubuntu': {
            'st': 'https://accessibility.ubuntu.example.org/ns/state',
            'attr': 'https://accessibility.ubuntu.example.org/ns/attributes',
            'cp': 'https://accessibility.ubuntu.example.org/ns/component',
            'act': 'https://accessibility.ubuntu.example.org/ns/action',
            'val': 'https://accessibility.ubuntu.example.org/ns/value',
            'txt': 'https://accessibility.ubuntu.example.org/ns/text',
        },
        'windows': {
            'st': 'https://accessibility.windows.example.org/ns/state',
            'attr': 'https://accessibility.windows.example.org/ns/attributes',
            'cp': 'https://accessibility.windows.example.org/ns/component',
            'class': 'https://accessibility.windows.example.org/ns/class'
        },
        'macos': {
            'st': 'https://accessibility.macos.example.org/ns/state',
            'attr': 'https://accessibility.macos.example.org/ns/attributes',
            'cp': 'https://accessibility.macos.example.org/ns/component',
            'role': 'https://accessibility.macos.example.org/ns/role',
        }
    }

    def __init__(self, screen_width: int = 1920, screen_height: int = 1080, platform: str = 'ubuntu'):
        """
        Initialize the simplifier.

        Args:
            screen_width: Screen width in pixels
            screen_height: Screen height in pixels
            platform: Platform type ('ubuntu', 'windows', 'macos')
        """
        self.screen_width = screen_width
        self.screen_height = screen_height
        self.platform = platform
        # Unknown platforms fall back to the Ubuntu namespace set.
        self.ns = self.NAMESPACES.get(platform, self.NAMESPACES['ubuntu'])

    def simplify_tree(self, xml_string: str, max_items: int = 1000) -> Dict[str, Any]:
        """
        Simplify an accessibility tree XML into an AI-agent-friendly format.

        Args:
            xml_string: The XML accessibility tree as a string
            max_items: Maximum number of actionable items to extract

        Returns:
            Dictionary with screen resolution, screen organization, actionable
            items, and statistics. On parse failure, returns a dictionary with
            an 'error' key and empty results instead of raising.
        """
        try:
            root = etree.fromstring(xml_string.encode('utf-8'))
        except Exception as e:
            return {
                'error': f'Failed to parse XML: {str(e)}',
                'actionable_items': [],
                'screen_organization': {}
            }

        # Extract actionable items
        actionable_items = self._extract_actionable_items(root, max_items)

        # Extract screen organization
        screen_org = self._extract_screen_organization(root)

        # Calculate statistics
        stats = self._calculate_statistics(actionable_items)

        return {
            'screen_resolution': {
                'width': self.screen_width,
                'height': self.screen_height
            },
            'screen_organization': screen_org,
            'actionable_items': actionable_items,
            'statistics': stats
        }

    def _extract_actionable_items(self, root: etree.Element, max_items: int) -> List[Dict[str, Any]]:
        """Extract actionable items from the accessibility tree."""
        items = []
        seen_items = set()  # Avoid duplicates based on (role, name, approx coords)

        def traverse(element: etree.Element, app_name: str = '', window_name: str = ''):
            """Recursively traverse the tree to find actionable items."""
            if len(items) >= max_items:
                return

            tag = element.tag.split('}')[-1]  # Remove namespace

            # Update context carried down the recursion
            if tag == 'application':
                app_name = element.get('name', '')
            elif tag in ('window', 'frame', 'dialog'):
                window_name = element.get('name', '')

            # Check if element is actionable
            if tag in self.ACTIONABLE_ROLES:
                item = self._extract_item_info(element, tag, app_name, window_name)
                if item:
                    # Smart deduplication strategy:
                    # skip standalone labels that are children of list-items
                    # (their text is already captured in the list-item name).
                    if item['role'] == 'label':
                        parent = element.getparent()
                        if parent is not None:
                            parent_tag = parent.tag.split('}')[-1]
                            if parent_tag == 'list-item':
                                # Skip this label but keep walking its subtree.
                                for child in element:
                                    traverse(child, app_name, window_name)
                                return

                    # Skip unnamed icons nested inside list-items/buttons.
                    if item['role'] == 'icon' and not item['name']:
                        parent = element.getparent()
                        if parent is not None:
                            parent_tag = parent.tag.split('}')[-1]
                            if parent_tag in ['list-item', 'push-button', 'toggle-button']:
                                for child in element:
                                    traverse(child, app_name, window_name)
                                return

                    # Deduplicate on role/name plus coordinates snapped to a
                    # 10px grid, so near-identical overlapping nodes collapse.
                    approx_x = item['coords']['pixel_x'] // 10 * 10
                    approx_y = item['coords']['pixel_y'] // 10 * 10
                    item_id = (item['role'], item['name'], approx_x, approx_y)

                    if item_id not in seen_items:
                        items.append(item)
                        seen_items.add(item_id)

            # Traverse children
            for child in element:
                traverse(child, app_name, window_name)

        traverse(root)
        return items

    def _extract_item_info(self, element: etree.Element, role: str,
                           app_name: str, window_name: str) -> Optional[Dict[str, Any]]:
        """Extract information about a single actionable item.

        Returns None for invisible elements, elements without coordinates, or
        elements with no applicable action.
        """
        # Get element name - try multiple sources
        name = element.get('name', '') or element.text or ''

        # If name is empty, try to find nested label text
        if not name or name.strip() == '':
            labels = element.findall('.//label', namespaces=element.nsmap)
            for label in labels:
                label_text = label.get('name', '') or label.text or ''
                if label_text and label_text.strip():
                    name = label_text
                    break

        # If still no name, try nested text elements
        if not name or name.strip() == '':
            texts = element.findall('.//text', namespaces=element.nsmap)
            for text_elem in texts:
                text_content = text_elem.text or ''
                if text_content and text_content.strip():
                    name = text_content
                    break

        name = self._clean_text(name)

        # Skip invisible or non-showing elements
        if not self._is_visible(element):
            return None

        # Get coordinates
        coords = self._extract_coordinates(element)
        if not coords:
            return None

        # Determine action type
        actions = []
        if role in self.CLICKABLE_ROLES:
            actions.append('click')
        if role in self.TYPEABLE_ROLES:
            actions.append('type')

        if not actions:
            return None

        # Get additional attributes
        value = self._get_value(element)
        description = self._get_description(element)
        states = self._get_states(element)

        # Calculate screen position
        position = self._calculate_position(coords['pixel_x'], coords['pixel_y'])

        item = {
            'name': name[:100] if name else f"{role}",  # Limit name length
            'role': role,
            'actions': actions,
            'coords': coords,
            'position': position,
            'app': app_name,
            'window': window_name if window_name else None,
        }

        # Add optional fields
        if value:
            item['value'] = value
        if description:
            item['description'] = description
        if states:
            item['states'] = states

        return item

    def _extract_coordinates(self, element: etree.Element) -> Optional[Dict[str, float]]:
        """Extract coordinates from an element.

        Returns a dict with the pixel center, normalized [0,1] center, size,
        and pixel bbox, or None when coordinates are absent/unparseable.
        """
        coord_str = element.get(f'{{{self.ns["cp"]}}}screencoord')
        size_str = element.get(f'{{{self.ns["cp"]}}}size')

        if not coord_str or not size_str:
            return None

        try:
            # Parse coordinates like "(123, 456)"
            coord_match = re.match(r'\((\d+),\s*(\d+)\)', coord_str)
            size_match = re.match(r'\((\d+),\s*(\d+)\)', size_str)

            if not coord_match or not size_match:
                return None

            x = int(coord_match.group(1))
            y = int(coord_match.group(2))
            width = int(size_match.group(1))
            height = int(size_match.group(2))

            # Calculate center point
            center_x = x + width // 2
            center_y = y + height // 2

            # Calculate relative coordinates (0-1 range)
            rel_x = round(center_x / self.screen_width, 4)
            rel_y = round(center_y / self.screen_height, 4)

            return {
                'pixel_x': center_x,
                'pixel_y': center_y,
                'x': rel_x,
                'y': rel_y,
                'width': width,
                'height': height,
                'bbox': [x, y, x + width, y + height]  # [left, top, right, bottom]
            }
        except (ValueError, AttributeError):
            return None

    def _is_visible(self, element: etree.Element) -> bool:
        """Check if element is visible and showing (and has a positive size)."""
        showing = element.get(f'{{{self.ns["st"]}}}showing', 'false')
        visible = element.get(f'{{{self.ns["st"]}}}visible', 'false')

        # For some platforms, check size
        size_str = element.get(f'{{{self.ns["cp"]}}}size')
        if size_str:
            size_match = re.match(r'\((\d+),\s*(\d+)\)', size_str)
            if size_match:
                width = int(size_match.group(1))
                height = int(size_match.group(2))
                if width <= 0 or height <= 0:
                    return False

        return showing == 'true' and visible == 'true'

    def _get_value(self, element: etree.Element) -> Optional[str]:
        """Get the value of an element (for inputs, sliders, etc.).

        Platforms without a 'val' namespace (windows/macos) have no value
        attribute to read; return None explicitly instead of probing a bogus
        '{}value' attribute name.
        """
        ns_val = self.ns.get('val')
        if not ns_val:
            return None
        value = element.get(f'{{{ns_val}}}value')
        return self._clean_text(value) if value else None

    def _get_description(self, element: etree.Element) -> Optional[str]:
        """Get the description of an element."""
        desc = element.get(f'{{{self.ns["attr"]}}}description')
        return self._clean_text(desc) if desc else None

    def _get_states(self, element: etree.Element) -> List[str]:
        """Get the states of an element (enabled, focused, checked, etc.)."""
        states = []
        state_attrs = ['enabled', 'focused', 'selected', 'checked', 'pressed',
                       'expanded', 'collapsed', 'editable', 'active']

        for state in state_attrs:
            if element.get(f'{{{self.ns["st"]}}}{state}') == 'true':
                states.append(state)

        return states

    def _calculate_position(self, x: int, y: int) -> str:
        """Classify (x, y) into one of nine named screen regions."""
        h_zone = 'left' if x < self.screen_width / 3 else \
            'right' if x > 2 * self.screen_width / 3 else 'center'
        v_zone = 'top' if y < self.screen_height / 3 else \
            'bottom' if y > 2 * self.screen_height / 3 else 'middle'

        if h_zone == 'center' and v_zone == 'middle':
            return 'center'
        elif h_zone == 'center':
            return f'{v_zone}-center'
        # The remaining cases all read '<vertical>-<horizontal>'
        # (the previous separate v_zone == 'middle' branch was redundant).
        return f'{v_zone}-{h_zone}'

    def _extract_screen_organization(self, root: etree.Element) -> Dict[str, Any]:
        """Extract the application/window layout of the desktop."""
        applications = []
        active_windows = []

        for app in root.findall('.//application'):
            app_name = app.get('name', '')
            if not app_name:
                continue

            windows = []
            for window in app.findall('.//window') + app.findall('.//frame'):
                window_name = window.get('name', '')
                is_active = window.get(f'{{{self.ns["st"]}}}active') == 'true'

                if window_name or is_active:
                    window_info = {
                        'name': window_name,
                        'active': is_active
                    }
                    windows.append(window_info)

                if is_active:
                    active_windows.append({
                        'app': app_name,
                        'window': window_name
                    })

            applications.append({
                'name': app_name,
                'windows': windows
            })

        return {
            'applications': applications,
            'active_windows': active_windows
        }

    def _calculate_statistics(self, items: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Calculate summary counts about actionable items."""
        stats = {
            'total_items': len(items),
            'clickable': 0,
            'typable': 0,
            'by_role': defaultdict(int),
            'by_app': defaultdict(int),
            'by_position': defaultdict(int)
        }

        for item in items:
            if 'click' in item['actions']:
                stats['clickable'] += 1
            if 'type' in item['actions']:
                stats['typable'] += 1

            stats['by_role'][item['role']] += 1
            stats['by_app'][item['app']] += 1
            stats['by_position'][item['position']] += 1

        # Convert defaultdicts to regular dicts for clean serialization
        stats['by_role'] = dict(stats['by_role'])
        stats['by_app'] = dict(stats['by_app'])
        stats['by_position'] = dict(stats['by_position'])

        return stats

    def _clean_text(self, text: Optional[str]) -> str:
        """Clean text by removing control characters and collapsing whitespace."""
        if not text:
            return ''

        # Remove control characters except newlines and tabs
        text = re.sub(r'[\x00-\x08\x0b-\x0c\x0e-\x1f\x7f-\x9f]', '', text)

        # Normalize whitespace
        text = ' '.join(text.split())

        return text.strip()

    def generate_text_summary(self, simplified_tree: Dict[str, Any], max_items: int = 100) -> str:
        """
        Generate a human-readable text summary for AI agents.

        Args:
            simplified_tree: Output from simplify_tree()
            max_items: Maximum items to show in summary

        Returns:
            Formatted text summary
        """
        lines = []
        lines.append("=" * 80)
        lines.append("DESKTOP SCREEN ANALYSIS FOR AI AGENT")
        lines.append("=" * 80)
        lines.append("")

        # Screen info
        res = simplified_tree['screen_resolution']
        lines.append(f"Screen Resolution: {res['width']}x{res['height']}")
        lines.append("")

        # Screen organization
        lines.append("--- SCREEN ORGANIZATION ---")
        org = simplified_tree['screen_organization']
        lines.append(f"Active Applications: {len(org['applications'])}")
        lines.append("")

        # Active windows
        if org['active_windows']:
            lines.append("Active Windows:")
            for win in org['active_windows']:
                lines.append(f"  • {win['app']}: {win['window']}")
            lines.append("")

        # Applications with windows
        for app in org['applications'][:20]:  # Show top 20 apps
            if app['windows']:
                lines.append(f"  [{app['name']}]")
                for win in app['windows'][:3]:  # Show up to 3 windows per app
                    status = " (ACTIVE)" if win['active'] else ""
                    if win['name']:
                        lines.append(f"    → {win['name']}{status}")
            else:
                lines.append(f"  [{app['name']}]")
        lines.append("")

        # Statistics
        stats = simplified_tree['statistics']
        lines.append("--- ACTIONABLE ITEMS SUMMARY ---")
        lines.append(f"Total: {stats['total_items']}")
        lines.append(f"  • Clickable: {stats['clickable']}")
        lines.append(f"  • Typable: {stats['typable']}")
        lines.append("")

        # Top element types
        if stats['by_role']:
            lines.append("Top Element Types:")
            sorted_roles = sorted(stats['by_role'].items(), key=lambda x: x[1], reverse=True)
            for role, count in sorted_roles[:10]:
                lines.append(f"  • {role}: {count}")
            lines.append("")

        # Actionable items by app
        lines.append(f"--- ACTIONABLE ITEMS (Top {max_items}) ---")
        lines.append("")

        items = simplified_tree['actionable_items'][:max_items]

        # Group by app
        items_by_app = defaultdict(list)
        for item in items:
            items_by_app[item['app']].append(item)

        for app_name, app_items in sorted(items_by_app.items()):
            lines.append(f"[{app_name}] - {len(app_items)} items")
            for item in app_items[:20]:  # Show top 20 items per app
                # Icon: click-only 🖱️, type-only ⌨️, both 🖱️⌨️
                icon = "🖱️" if "click" in item['actions'] else ""
                icon = "⌨️" if "type" in item['actions'] else icon
                icon = "🖱️⌨️" if len(item['actions']) > 1 else icon

                name = item['name'][:50] if item['name'] else f"[{item['role']}]"
                coords = item['coords']
                pos = item['position']

                line = f"  {icon} {name} | {pos} | ({coords['x']:.2f}, {coords['y']:.2f}) | [{coords['pixel_x']}, {coords['pixel_y']}]"

                if item.get('value'):
                    line += f" | value: {item['value'][:30]}"
                if item.get('states'):
                    line += f" | {', '.join(item['states'])}"

                lines.append(line)
            lines.append("")

        lines.append("=" * 80)

        return '\n'.join(lines)

    def generate_agent_prompt(self, simplified_tree: Dict[str, Any]) -> str:
        """
        Generate a concise prompt for AI agents with essential information.

        Args:
            simplified_tree: Output from simplify_tree()

        Returns:
            Formatted prompt text
        """
        lines = []
        lines.append("# Current Desktop State")
        lines.append("")

        # Screen info
        res = simplified_tree['screen_resolution']
        lines.append(f"**Screen:** {res['width']}x{res['height']}")
        lines.append("")

        # Active context
        org = simplified_tree['screen_organization']
        if org['active_windows']:
            lines.append("**Active Window:**")
            for win in org['active_windows']:
                lines.append(f"- {win['app']}: {win['window']}")
            lines.append("")

        # Actionable items
        stats = simplified_tree['statistics']
        lines.append(f"**Available Actions:** {stats['total_items']} items ({stats['clickable']} clickable, {stats['typable']} typable)")
        lines.append("")

        lines.append("## Actionable Elements:")
        lines.append("")

        items = simplified_tree['actionable_items']

        for idx, item in enumerate(items[:50], 1):  # Top 50 items
            actions_str = '/'.join(item['actions'])
            name = item['name'][:40] if item['name'] else f"[{item['role']}]"
            coords = item['coords']

            line = f"{idx}. **{name}** ({item['role']}) - {actions_str} @ ({coords['pixel_x']}, {coords['pixel_y']})"

            if item.get('value'):
                line += f" [value: {item['value'][:20]}]"
            if item.get('states') and any(s in item['states'] for s in ['focused', 'selected', 'checked']):
                line += f" [{', '.join(item['states'])}]"

            lines.append(line)

        if len(items) > 50:
            lines.append(f"... and {len(items) - 50} more items")

        lines.append("")
        lines.append("**Instructions:** Use pixel coordinates (pixel_x, pixel_y) for click/type actions.")

        return '\n'.join(lines)


def simplify_accessibility_tree(xml_string: str,
                                screen_width: int = 1920,
                                screen_height: int = 1080,
                                platform: str = 'ubuntu',
                                output_format: str = 'json') -> Any:
    """
    Convenience function to simplify an accessibility tree.

    Args:
        xml_string: The XML accessibility tree as a string
        screen_width: Screen width in pixels
        screen_height: Screen height in pixels
        platform: Platform type ('ubuntu', 'windows', 'macos')
        output_format: Output format ('json', 'text', 'prompt')

    Returns:
        Simplified tree in the requested format
    """
    simplifier = AccessibilityTreeSimplifier(screen_width, screen_height, platform)
    simplified = simplifier.simplify_tree(xml_string)

    if output_format == 'text':
        return simplifier.generate_text_summary(simplified)
    elif output_format == 'prompt':
        return simplifier.generate_agent_prompt(simplified)
    else:
        return simplified
+ + +def get_foreground_window(root: ET.Element) -> Optional[Dict[str, any]]: + """ + Identifies the foreground window based on active/focused/modal state and visibility. + + Args: + root: Root element of the accessibility tree + + Returns: + Dictionary with foreground window info (name, role, box) or None + """ + candidates = [] + + # Helper to get bounding box + def get_box(node): + pos = _get_attr(node, 'screencoord', '0, 0') + size = _get_attr(node, 'size', '0, 0') + x, y = _parse_coords(pos) + w, h = _parse_coords(size) + return (x, y, w, h) + + # Traverse applications to find frames/windows/dialogs/alerts + for app in root.findall(".//application"): + app_name = app.attrib.get('name', '') + + for win in app.findall(".//*"): + tag = win.tag.split('}')[-1] + + # Include alert dialogs in addition to frames/windows/dialogs + if tag in ['frame', 'window', 'dialog', 'alert']: + is_active = _get_attr(win, 'active') == 'true' + is_focused = _get_attr(win, 'focused') == 'true' + is_modal = _get_attr(win, 'modal') == 'true' + is_showing = _get_attr(win, 'showing') == 'true' + is_visible = _get_attr(win, 'visible') == 'true' + + # Only consider visible and showing windows + if is_showing and is_visible: + box = get_box(win) + # Calculate area for z-order tie-breaking + area = box[2] * box[3] + + candidates.append({ + 'name': win.attrib.get('name', ''), + 'app': app_name, + 'role': tag, + 'active': is_active, + 'focused': is_focused, + 'modal': is_modal, + 'box': box, + 'area': area, + 'element': win + }) + + # Priority: Modal > Active > Focused > Largest area + # This reflects typical desktop behavior where: + # - Modal dialogs block everything + # - Active window is the user's current focus + # - Non-modal alerts/notifications don't occlude active windows + foreground = None + + # 1. 
Check for TRUE modals first (they block everything) + modals = [c for c in candidates if c['modal']] + if modals: + # Pick the one with most recent/topmost position (last in tree = topmost) + foreground = modals[-1] + # 2. Check for active window (user's current application) + elif any(c['active'] for c in candidates): + active = [c for c in candidates if c['active']] + # If multiple active, pick the largest + foreground = max(active, key=lambda c: c['area']) + # 3. Check for focused window + elif any(c['focused'] for c in candidates): + focused = [c for c in candidates if c['focused']] + # If multiple focused, pick largest (shouldn't happen but handle it) + foreground = max(focused, key=lambda c: c['area']) + # 4. Fallback: pick largest visible window (might be alert/notification) + elif candidates: + foreground = max(candidates, key=lambda c: c['area']) + + return foreground + + +def is_element_in_foreground(element: ET.Element, foreground_window: Dict, parent_map: Dict) -> bool: + """ + Check if an element is within the foreground window or is a global UI element. + Also allows active/focused elements to pass through even if not in foreground. 
+ + Args: + element: Element to check + foreground_window: Foreground window info from get_foreground_window() + parent_map: Dictionary mapping child elements to parents + + Returns: + True if element should be shown, False if occluded + """ + if not foreground_window: + return True # No foreground window, show everything + + # Check if this element itself is active or focused (always show these) + is_active = _get_attr(element, 'active') == 'true' + is_focused = _get_attr(element, 'focused') == 'true' + if is_active or is_focused: + return True + + # Global UI elements (always visible) + tag = element.tag.split('}')[-1] + + # Desktop frame and gnome-shell elements are always visible + parent_app = None + current = element + while current is not None: + if current.tag.split('}')[-1] == 'application': + parent_app = current.attrib.get('name', '') + break + current = parent_map.get(current) + + # Global UI apps (always shown) - desktop shell elements overlay everything + global_apps = ['gnome-shell', 'gjs'] + if parent_app in global_apps: + return True + + # Desktop frame is always visible + if tag == 'desktop-frame': + return True + + # Check if element is within foreground window + # Walk up the tree to find parent window/frame/alert + current = element + while current is not None: + current_tag = current.tag.split('}')[-1] + # Include 'alert' in the list of window types + if current_tag in ['frame', 'window', 'dialog', 'alert']: + # Check if this is the foreground window (compare by id) + if id(current) == id(foreground_window['element']): + return True + else: + # This element belongs to a different window - check if it's active/focused + if _get_attr(current, 'active') == 'true' or _get_attr(current, 'focused') == 'true': + return True + # Otherwise it's occluded + return False + + current = parent_map.get(current) + + # If no parent window found, it's a global element + return True + +def simplify_accessibility_tree(xml_string, filter_occlusion: bool = True): + 
""" + Parses a verbose accessibility XML and returns a simplified XML string + optimized for LLM processing with bounding boxes. + + Args: + xml_string: Raw accessibility tree XML + filter_occlusion: If True, filters out occluded elements behind other windows + + Returns: + Simplified XML string with only visible, actionable elements. + The "box" ([left, top, width, height]) and "center" ([x, y]) are + normalized to [0,1] using the screen size. + """ + _register_namespaces() + + try: + root = ET.fromstring(xml_string) + except ET.ParseError as e: + return f"Error parsing XML: {e}" + + # Build parent map for traversal + parent_map = {child: parent for parent in root.iter() for child in parent} + + # Get foreground window if filtering occlusion + foreground_window = None + foreground_element_id = None + if filter_occlusion: + foreground_window = get_foreground_window(root) + if foreground_window: + foreground_element_id = id(foreground_window['element']) + # Debug: Print foreground window info (can be suppressed in production) + import os + if os.getenv('DEBUG_AST', '0') == '1': + print(f"[AST] Foreground: {foreground_window['app']} - {foreground_window['name']} " + f"(role={foreground_window['role']}, active={foreground_window['active']}, " + f"focused={foreground_window['focused']}, modal={foreground_window['modal']})") + + # Determine screen size for normalization (prefer desktop-frame size, else derive from max bounds) + screen_w, screen_h = 1920, 1080 + + def _clamp01(v: float) -> float: + if v < 0.0: return 0.0 + if v > 1.0: return 1.0 + return v + + def simplify_node(node, parent_in_foreground=True, parent_app=None): + # 1. 
VISIBILITY CHECK - Filter out invisible or occluded elements + visible = _get_attr(node, 'visible') + showing = _get_attr(node, 'showing') + + # Element must be both visible AND showing (not occluded) + if visible == 'false' or showing == 'false': + return None + + # For safety, only process elements that are explicitly showing=true + # Allow elements without showing attribute for compatibility + if showing is not None and showing != 'true': + return None + + # 2. OCCLUSION CHECK - Filter out elements behind other windows + if filter_occlusion and foreground_window: + # Check if this element is in the foreground + in_foreground = is_element_in_foreground(node, foreground_window, parent_map) + if not in_foreground: + return None + parent_in_foreground = in_foreground + + # 2. EXTRACT CRITICAL DATA + tag = node.tag.split('}')[-1] + name = node.attrib.get('name', '') + text_content = node.text.strip() if node.text else "" + + # Track application name for context + current_app = parent_app + if tag == 'application': + current_app = node.attrib.get('name', '') + + # 3. COORDINATES (Bounding Box) + pos_str = _get_attr(node, 'screencoord') + size_str = _get_attr(node, 'size') + x, y = _parse_coords(pos_str) + w, h = _parse_coords(size_str) + + # 4. ACTION & STATE + actions = [] + if _get_attr(node, 'click_desc'): actions.append("click") + if _get_attr(node, 'selectable') == 'true': actions.append("select") + if _get_attr(node, 'editable') == 'true': actions.append("edit") + + is_enabled = _get_attr(node, 'enabled') == 'true' + is_selected = _get_attr(node, 'selected') == 'true' + is_checked = _get_attr(node, 'checked') == 'true' + is_active = _get_attr(node, 'active') == 'true' + is_focused_raw = _get_attr(node, 'focused') == 'true' + + # Only mark widget-level focus, not window-level focus + # Windows/frames can have focus but it's less meaningful for agents + is_focused = is_focused_raw and tag not in ['window', 'frame', 'dialog', 'alert'] + + # 5. 
SIMPLIFICATION LOGIC + tag_map = { + 'push-button': 'btn', + 'toggle-button': 'toggle', + 'text': 'input', + 'menu-item': 'menuitem', + 'list-item': 'item', + 'scroll-pane': 'scroll', + 'desktop-frame': 'desktop', + 'application': 'app' + } + simple_tag = tag_map.get(tag, tag) + + # Only keep items with valid coordinates + has_coords = (w > 0 and h > 0) + + is_interesting = ( + (name or text_content or actions or is_selected or is_checked or is_active or is_focused) + and has_coords # Must have valid coordinates + ) + + # 6. RECURSION - Process children first + children = [] + for child in node: + simplified_child = simplify_node(child, parent_in_foreground, current_app) + if simplified_child is not None: + children.append(simplified_child) + + # 7. CONSTRUCT NEW ELEMENT + if is_interesting: + # Create element with all attributes + new_elem = ET.Element(simple_tag) + if name: new_elem.set("name", name) + if text_content and text_content != name: new_elem.set("text", text_content) + + # Add application name for window-type elements + if tag in ['frame', 'window', 'dialog', 'alert'] and current_app: + new_elem.set("app", current_app) + + # Add normalized bounding box [left, top, width, height] in [0,1] + n_left = _clamp01(x / screen_w) + n_top = _clamp01(y / screen_h) + n_width = _clamp01(w / screen_w) + n_height = _clamp01(h / screen_h) + new_elem.set("box", f"[{n_left:.3f},{n_top:.3f},{n_width:.3f},{n_height:.3f}]") + # Add normalized center coordinates [x, y] in [0,1] + center_x = x + (w / 2.0) + center_y = y + (h / 2.0) + n_center_x = _clamp01(center_x / screen_w) + n_center_y = _clamp01(center_y / screen_h) + new_elem.set("center", f"[{n_center_x:.3f},{n_center_y:.3f}]") + + if actions: new_elem.set("act", ",".join(actions)) + if is_selected: new_elem.set("selected", "true") + if is_checked: new_elem.set("checked", "true") + if is_active: new_elem.set("active", "true") + if is_focused: new_elem.set("focused", "true") + if not is_enabled: new_elem.set("enabled", 
"false") + + # Add children + for child in children: + new_elem.append(child) + + return new_elem + + # Not interesting - only return if has meaningful children + if not children: + return None + + # If only one child, return it directly (skip wrapping div) + if len(children) == 1: + return children[0] + + # Multiple children - need a container + # But if all children are divs, flatten them + all_divs = all(child.tag == 'div' for child in children) + if all_divs: + # Flatten: return first child and merge others + first_child = children[0] + for other_child in children[1:]: + for subchild in other_child: + first_child.append(subchild) + return first_child + + # Create minimal container + new_elem = ET.Element("div") + for child in children: + new_elem.append(child) + + return new_elem + + simplified_root = simplify_node(root) + if simplified_root is None: + return "No visible content found" + + return ET.tostring(simplified_root, encoding='unicode') + +def get_actionable_centers(xml_string): + """ + Parses the XML and returns a list of actionable items with their + center coordinates (x, y) in pixel space. + + Returns: + List[Dict]: A list of dictionaries containing name, role, center coordinates, etc. 
+ """ + _register_namespaces() + try: + root = ET.fromstring(xml_string) + except ET.ParseError: + return [] + + actionable_items = [] + + def traverse(node): + # Visibility Check + visible = _get_attr(node, 'visible') + showing = _get_attr(node, 'showing') + if visible == 'false' or showing == 'false': + return + + tag = node.tag.split('}')[-1] + name = node.attrib.get('name', '') + text = node.text.strip() if node.text else "" + + # Action checks + actions = [] + if _get_attr(node, 'click_desc'): actions.append("click") + if _get_attr(node, 'selectable') == 'true': actions.append("select") + if _get_attr(node, 'editable') == 'true': actions.append("edit") + + # Coordinates + pos_str = _get_attr(node, 'screencoord') + size_str = _get_attr(node, 'size') + x, y = _parse_coords(pos_str) + w, h = _parse_coords(size_str) + + # Identify actionable items: + # 1. Explicit actions (click/edit/select) + # 2. Interactive roles (button, toggle, menu item, list item) + # interactive_roles = ['push-button', 'toggle-button', 'text', 'menu-item', 'list-item', 'canvas', 'icon'] + + if (actions) and w > 0 and h > 0: + center_x = int(x + (w / 2)) + center_y = int(y + (h / 2)) + + # Label priority: Name > Text > Tag + display_label = name if name else (text if text else tag) + + item_info = { + 'label': display_label, + 'role': tag, + 'center': (center_x, center_y), + 'actions': actions + } + actionable_items.append(item_info) + + for child in node: + traverse(child) + + traverse(root) + return actionable_items + + +# print(get_actionable_centers(open('file.xml', 'r').read())) +# kprint(simplify_accessibility_tree(open('file.xml', 'r').read())) \ No newline at end of file From a3c47e0b49de42e0c8940016cd4c8e6e5655e251 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Mon, 24 Nov 2025 17:00:36 -0600 Subject: [PATCH 07/14] Modified key files. 
--- openhands/core/config/agent_config.py | 4 + openhands/core/schema/observation.py | 4 + openhands/events/action/message.py | 3 + openhands/events/observation/__init__.py | 2 + openhands/events/observation/error.py | 1 + openhands/events/serialization/action.py | 2 + openhands/events/serialization/observation.py | 2 + openhands/nvidia/os_world/osworld_utils.py | 283 +++++++++++++++--- scripts/tests/test_osworld_utils.py | 12 +- 9 files changed, 264 insertions(+), 49 deletions(-) diff --git a/openhands/core/config/agent_config.py b/openhands/core/config/agent_config.py index ac03b93b3..d12428e55 100644 --- a/openhands/core/config/agent_config.py +++ b/openhands/core/config/agent_config.py @@ -38,6 +38,10 @@ class AgentConfig(BaseModel): """Whether history should be truncated to continue the session when hitting LLM context length limit.""" enable_som_visual_browsing: bool = Field(default=True) """Whether to enable SoM (Set of Marks) visual browsing.""" + enable_vision: bool = Field(default=False) + """Whether to enable screenshot vision for OS World.""" + enable_a11y_tree: bool = Field(default=False) + """Whether to enable accessibility tree for OS World.""" condenser: CondenserConfig = Field( default_factory=lambda: NoOpCondenserConfig(type='noop') ) diff --git a/openhands/core/schema/observation.py b/openhands/core/schema/observation.py index 5955c1988..265ad65c1 100644 --- a/openhands/core/schema/observation.py +++ b/openhands/core/schema/observation.py @@ -2,6 +2,10 @@ class ObservationType(str, Enum): + OSWORLD = 'osworld' + """The output of a OSWorld action + """ + READ = 'read' """The content of a file """ diff --git a/openhands/events/action/message.py b/openhands/events/action/message.py index 1e8eb7568..d622f7bd8 100644 --- a/openhands/events/action/message.py +++ b/openhands/events/action/message.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Any +from openpyxl.descriptors.base import NoneSet + import openhands from 
openhands.core.schema import ActionType from openhands.events.action.action import Action, ActionSecurityRisk @@ -13,6 +15,7 @@ class MessageAction(Action): wait_for_response: bool = False action: str = ActionType.MESSAGE security_risk: ActionSecurityRisk | None = None + accessibility_tree: str | None = None @property def message(self) -> str: diff --git a/openhands/events/observation/__init__.py b/openhands/events/observation/__init__.py index 2334cc009..8fb5dd91d 100644 --- a/openhands/events/observation/__init__.py +++ b/openhands/events/observation/__init__.py @@ -25,6 +25,7 @@ from openhands.events.observation.observation import Observation from openhands.events.observation.reject import UserRejectObservation from openhands.events.observation.success import SuccessObservation +from openhands.events.observation.osworld import OSWorldOutputObservation __all__ = [ 'Observation', @@ -46,4 +47,5 @@ 'RecallObservation', 'RecallType', 'MCPObservation', + 'OSWorldOutputObservation', ] diff --git a/openhands/events/observation/error.py b/openhands/events/observation/error.py index 4ed05b89a..a76849ea1 100644 --- a/openhands/events/observation/error.py +++ b/openhands/events/observation/error.py @@ -14,6 +14,7 @@ class ErrorObservation(Observation): observation: str = ObservationType.ERROR error_id: str = '' + name: str = '' @property def message(self) -> str: diff --git a/openhands/events/serialization/action.py b/openhands/events/serialization/action.py index a07d917bf..d79ddd9f8 100644 --- a/openhands/events/serialization/action.py +++ b/openhands/events/serialization/action.py @@ -25,6 +25,7 @@ ) from openhands.events.action.mcp import MCPAction from openhands.events.action.message import MessageAction, SystemMessageAction +from openhands.events.action.os import OSWorldInteractiveAction actions = ( NullAction, @@ -46,6 +47,7 @@ CondensationAction, MCPAction, OSInteractiveAction, + OSWorldInteractiveAction, ) ACTION_TYPE_TO_CLASS = {action_class.action: action_class 
for action_class in actions} # type: ignore[attr-defined] diff --git a/openhands/events/serialization/observation.py b/openhands/events/serialization/observation.py index 98efad0d7..00894b506 100644 --- a/openhands/events/serialization/observation.py +++ b/openhands/events/serialization/observation.py @@ -29,6 +29,7 @@ from openhands.events.observation.observation import Observation from openhands.events.observation.reject import UserRejectObservation from openhands.events.observation.success import SuccessObservation +from openhands.events.observation.osworld import OSWorldOutputObservation observations = ( NullObservation, @@ -47,6 +48,7 @@ AgentThinkObservation, RecallObservation, MCPObservation, + OSWorldOutputObservation, ) OBSERVATION_TYPE_TO_CLASS = { diff --git a/openhands/nvidia/os_world/osworld_utils.py b/openhands/nvidia/os_world/osworld_utils.py index b8c8d6eea..908826fba 100644 --- a/openhands/nvidia/os_world/osworld_utils.py +++ b/openhands/nvidia/os_world/osworld_utils.py @@ -12,6 +12,7 @@ ) from pathlib import Path +from openhands.agenthub.gui_agent.osworld_agent import OSWorldAgent from openhands.core.config.llm_config import LLMConfig from openhands.runtime.base import Runtime from openhands.core.config.condenser_config import NoOpCondenserConfig @@ -33,9 +34,8 @@ from openhands.core.logger import openhands_logger as openhands_logger from evaluation.utils.shared import codeact_user_response, is_fatal_evaluation_error -from openhands.nvidia.utils import process_messages_from_agent_state, is_last_action_finish, get_messages_from_partial_result +from openhands.nvidia.utils import is_last_action_finish import json -from openhands.nvidia.reward import Reward from openhands.nvidia.registry import JobDetails, _DEFAULT_AGENT_CONFIG from openhands.nvidia.utils import get_instance_id from openhands.nvidia.controller import run_controller_with_controller @@ -43,6 +43,8 @@ from openhands.nvidia.os_world.controllers.setup import SetupController from 
openhands.nvidia.os_world.evaluate import Evaluator +from openhands.utils.ast_process import simplify_accessibility_tree + def get_config( instance: dict, @@ -88,53 +90,36 @@ def get_config( ensure_thinking_end_properly=agent_config['ensure_thinking_end_properly'], # set to true only if using text based server for training. action_timeout=30.0, # 30 seconds per action strict_loop_detector=agent_config['strict_loop_detector'], # set to true only if training + enable_vision=False, + enable_a11y_tree=True, ) config.set_agent_config(agent_config) return config -def get_instruction(instance: pd.Series | dict, metadata: EvalMetadata) -> MessageAction: - - def obtain_problem_statement(instance: dict) -> str: - if isinstance(instance['prompt'], list): - problem_statement = instance['prompt'][0]['content'] - elif isinstance(instance['prompt'], str): - problem_statement = json.loads(instance['prompt'])[0]['content'] - else: - raise ValueError(f'Invalid prompt type: {type(instance["prompt"])}') - # Remove boxed instructions from problem statement - if " Let's think step by step and output the final answer within \\boxed{}." in problem_statement: - problem_statement = problem_statement.replace(" Let's think step by step and output the final answer within \\boxed{}.", "") - return problem_statement - - instruction = f""" -Your task is to solve challenging math problems using the `execute_ipython_cell` tool, which gives you access to a full IPython environment. You are allowed and expected to use code to explore, solve, and verify your answers. - -Environment: -- Libraries already imported: `math`, `cmath`, `numpy`, `sympy`, `scipy` -- You can also install additional libraries using `%pip install ` if necessary. - -Instructions: -1. Read and understand the problem statement. Fist use the `think` tool to log down your thoughts and plan for solving the problem. -2. Plan your solution using a combination of reasoning and code. Always try to solve the problem using code. 
Also plan about how to verify your solution. -3. In subsequent steps after planning, use tools to execute your plan. Use `execute_ipython_cell` to run calculations, manipulate symbols, or perform verification. -4. Use code to verify your answer. If your answer is not correct, iterate your plan with the `think` tool and continue solving the problem until you are confident in your answer is correct. -5. Finally, when you are confident in your answer: - - Only call the `finish` tool if you are confident in your answer and you have verified your answer with the `execute_ipython_cell` tool. - - Terminate the conversation by calling the `finish` tool. - - Put your final answer within \\boxed{{}} in the message with the `finish` tool. - +def get_instruction(instance: pd.Series | dict, metadata: EvalMetadata, runtime: Runtime) -> MessageAction: + """ + We keep all information here. screenshot and a11y tree will be processed in agent. + """ -Important Guidelines: -- Always first use the `think` tool to log down your thoughts and plan for solving the problem. -- Always try to use the `execute_ipython_cell` tool to solve the problem, especially for calculations, symbolic reasoning, or simulations. -- Always verify your answer with code and the `execute_ipython_cell` tool. -- Only use the `finish` tool is you are confident the answer is correct. If you think there is a mistake, iterate your plan with the `think` tool and continue solving the problem. -- Put your final answer within \\boxed{{}} in the message with the `finish` tool. + include_screenshot = True #runtime.config.agents['agent'].enable_vision + include_a11y_tree = True #runtime.config.agents['agent'].enable_a11y_tree + instruction = f"""Work on the following task accourding to the UI screenshot. 
-Now begin solving the following problem: -{obtain_problem_statement(instance)} +Instruction: {instance['instruction']} """ - return MessageAction(content=instruction) + + if include_a11y_tree: + accessibility_tree = runtime.get_vm_accessibility_tree() + if accessibility_tree: + accessibility_tree = simplify_accessibility_tree(accessibility_tree) + + image_url = None + if include_screenshot: + image = runtime.get_vm_screenshot() + if image: + image_url = [f'data:image/png;base64,{image}'] + + return MessageAction(content=instruction, image_urls=image_url, accessibility_tree=accessibility_tree) def create_runtime(config: OpenHandsConfig, sid: str | None = None) -> Runtime: vm_image_path = os.getenv('OSWORLD_VM_IMAGE_PATH', './OS_images/Ubuntu.qcow2') @@ -171,7 +156,7 @@ async def initialize_agents( raise ValueError('LLM config is None, cannot initialize.') metadata = EvalMetadata( - agent_class="CodeActAgent", + agent_class="OSWorldAgent", llm_config=llm_config, agent_config=None, max_iterations=agent_config['max_iterations'], @@ -233,7 +218,7 @@ async def run_agent( config = job_details.config instance = job_details.instance - message_action = get_instruction(instance, metadata) + message_action = get_instruction(instance, metadata, runtime) try: agent = create_agent(config) job_details.agent = agent @@ -286,3 +271,213 @@ async def evaluate_agent(run_results: dict, instance: dict, runtime: Runtime): return {'resolved': False, 'reward': score} except: return {'resolved': False, 'reward': 0} + +def process_messages_from_agent_state( + agent: OSWorldAgent, + state: State, + job_details: JobDetails | None = None, +) -> dict: + """ + This has been modified for OSWorldAgent to account for vision input. + We removed logic related to and tags. 
+ The content logic for assistant turns will always contain 3 items: + - a text item with the instruction (this might have accessibility tree already embedded) + - a image item with the screenshot + - a text item with the accessibility tree + + logic for token_level_generation has not been checked or tested. Not supported for OSWorldAgent at the moment. + """ + if job_details is not None: + assert job_details.llm_config is not None, ( + 'llm_config is required in job_details.' + ) + token_level_generation = job_details.llm_config.token_level_generation + assert token_level_generation is False, 'token_level_generation is not supported for OSWorldAgent at the moment.' + else: + logger.warning( + 'No job_details provided in process_messages_from_agent_state. Assuming token_level_generation is False.' + ) + token_level_generation = False + + initial_user_message = agent._get_initial_user_message(state.history) + messages = agent._get_messages_from_agent_state(state.history, initial_user_message) + + while len(messages) > 0 and messages[-1]['role'] != 'assistant': + messages = messages[:-1] + + tools = agent.tools + return { + 'messages': messages, + 'tools': tools, + 'end_properly': not state.get_last_agent_format_error(), + } + +############################################################################### +# Begin of exception handling +# Used to override default process_messages_from_agent_state for OSWorldAgent +############################################################################### +def get_messages_from_partial_result(job_details: JobDetails) -> dict: + if job_details.agent is None or job_details.controller is None: + return {'messages': [], 'tools': [], 'end_properly': True} + controller = job_details.controller + state = controller.get_state() + assert state is not None, ( + 'Error in get_messages_from_partial_result: state is None.' 
+ ) + return process_messages_from_agent_state(job_details.agent, state, job_details) + +def initialize_exception(job_details: JobDetails, e: Exception): + tb = traceback.format_exc() + instance_id = ( + job_details.instance.get('instance_id', None) + if job_details.instance is not None + else None + ) + trajectory_id = ( + job_details.instance.get('trajectory_id', None) + if job_details.instance is not None + else None + ) + return { + 'instance_id': instance_id, + 'trajectory_id': trajectory_id, + 'git_patch': None, + 'success': False, + 'error': f'Error in init: {str(e)}', + 'traceback': tb, + 'finish': False, + 'messages': [], + 'tools': [], + 'end_properly': False, + 'resolved': False, + 'critical_error': 'init', + } + + +def run_exception(job_details: JobDetails, e: Exception): + tb = traceback.format_exc() + instance_id = ( + job_details.instance.get('instance_id', None) + if job_details.instance is not None + else None + ) + trajectory_id = ( + job_details.instance.get('trajectory_id', None) + if job_details.instance is not None + else None + ) + git_patch = ( + job_details.run_results.get('git_patch', None) + if job_details.run_results is not None + else None + ) + success = ( + job_details.run_results.get('success', False) + if job_details.run_results is not None + else False + ) + finish = ( + job_details.run_results.get('finish', False) + if job_details.run_results is not None + else False + ) + messages = ( + job_details.run_results.get('messages', []) + if job_details.run_results is not None + else [] + ) + tools = ( + job_details.run_results.get('tools', []) + if job_details.run_results is not None + else [] + ) + end_properly = ( + job_details.run_results.get('end_properly', True) + if job_details.run_results is not None + else True + ) + if len(messages) == 0: + partial_result = get_messages_from_partial_result(job_details) + messages = partial_result['messages'] + tools = partial_result['tools'] + end_properly = partial_result['end_properly'] + 
return { + 'instance_id': instance_id, + 'trajectory_id': trajectory_id, + 'git_patch': git_patch, + 'success': success, + 'error': f'Error in run agent: {str(e)}', + 'traceback': tb, + 'finish': finish, + 'messages': messages, + 'tools': tools, + 'end_properly': end_properly, + 'resolved': False, + 'critical_error': 'run', + } + + +def eval_exception(job_details: JobDetails, e: Exception): + tb = traceback.format_exc() + instance_id = ( + job_details.instance.get('instance_id', None) + if job_details.instance is not None + else None + ) + trajectory_id = ( + job_details.instance.get('trajectory_id', None) + if job_details.instance is not None + else None + ) + git_patch = ( + job_details.run_results.get('git_patch', None) + if job_details.run_results is not None + else None + ) + success = ( + job_details.run_results.get('success', False) + if job_details.run_results is not None + else False + ) + finish = ( + job_details.run_results.get('finish', False) + if job_details.run_results is not None + else False + ) + messages = ( + job_details.run_results.get('messages', []) + if job_details.run_results is not None + else [] + ) + tools = ( + job_details.run_results.get('tools', []) + if job_details.run_results is not None + else [] + ) + end_properly = ( + job_details.run_results.get('end_properly', True) + if job_details.run_results is not None + else True + ) + if len(messages) == 0: + partial_result = get_messages_from_partial_result(job_details) + messages = partial_result['messages'] + tools = partial_result['tools'] + end_properly = partial_result['end_properly'] + return { + 'instance_id': instance_id, + 'trajectory_id': trajectory_id, + 'git_patch': git_patch, + 'success': success, + 'error': f'Error in eval: {str(e)}', + 'traceback': tb, + 'finish': finish, + 'messages': messages, + 'tools': tools, + 'end_properly': end_properly, + 'resolved': False, + 'critical_error': 'eval', + } 
+############################################################################### +# End of exception handling +############################################################################### \ No newline at end of file diff --git a/scripts/tests/test_osworld_utils.py b/scripts/tests/test_osworld_utils.py index 964866a2b..2d03c02c0 100644 --- a/scripts/tests/test_osworld_utils.py +++ b/scripts/tests/test_osworld_utils.py @@ -7,6 +7,7 @@ from openhands.core.logger import openhands_logger as logger from openhands.nvidia.os_world.osworld_utils import ( initialize_agents, + run_agent, evaluate_agent, ) from openhands.nvidia.registry import JobDetails @@ -16,14 +17,14 @@ async def run(instance): max_iterations = 35 sampling_params = { - 'model': 'hosted_vllm/Qwen/Qwen3-8B', - 'api_key': 'mykey', + 'model': 'deepseek/deepseek-chat', + 'api_key': 'sk-697e5dc7145849a3b2ad1595718b35f2', 'modify_params': False, 'log_completions': True, 'native_tool_calling': True, 'temperature': 0.6, } - llm_config = LLMConfig(base_url='http://127.0.0.1:8000/v1', **sampling_params) + llm_config = LLMConfig(base_url='https://api.deepseek.com/chat/completions', **sampling_params) job_details = JobDetails( @@ -43,10 +44,10 @@ async def run(instance): job_details.metadata = metadata job_details.config = config - run_results = {} #await run_agent(job_details) import pdb; pdb.set_trace() + run_results = await run_agent(job_details) - + import pdb; pdb.set_trace() eval_results = await evaluate_agent({}, instance, runtime) return eval_results @@ -56,6 +57,7 @@ async def run(instance): "id": "06fe7178-4491-4589-810f-2e2bc9502122", "snapshot": "chrome", "instruction": "Can you make my computer bring back the last tab I shut down?", + #"instruction": "Go on to ArXiv and search for 'ProRL' paper from Nvidia. 
Open the pdf.", "source": "https://www.wikihow.com/Switch-Tabs-in-Chrome", "config": [ { From 79a04cf43b58c33c51d9b1bd56f0b6342836f541 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Mon, 24 Nov 2025 17:01:07 -0600 Subject: [PATCH 08/14] Modified singularity. Some setting need to be reverted. Mainly due to personal setup issues. --- .../osworld_singularity_runtime.py | 204 ++++++++++++++---- 1 file changed, 162 insertions(+), 42 deletions(-) diff --git a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py index 879e75b65..dfa7291d3 100644 --- a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py +++ b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py @@ -13,6 +13,7 @@ import json import threading from pathlib import Path +from turtle import window_width from typing import TYPE_CHECKING, Callable import httpx @@ -35,6 +36,9 @@ from openhands.runtime.utils import find_available_tcp_port from openhands.runtime.utils.command import DEFAULT_MAIN_MODULE +from openhands.utils.ast_process import simplify_accessibility_tree +from openhands.events.tool import ToolCallMetadata + # Port ranges for OSWorld VM services OSWORLD_VM_SERVER_PORT_RANGE = (15000, 19999) # OSWorld Flask server inside VM (port 5000) OSWORLD_VNC_PORT_RANGE = (18000, 22999) # VNC server for VM display @@ -329,7 +333,7 @@ def init_container(self): 'singularity', 'exec', '--pid', '--writable-tmpfs', - '--no-mount', 'home,cwd,tmp', + '--no-mount', 'cwd,tmp', '--home', '/root', ] @@ -351,7 +355,7 @@ def init_container(self): qemu_cmd = self._get_qemu_command() cmd.extend(qemu_cmd) - self.log('debug', f'Starting QEMU VM with command: {" ".join(cmd)}') + self.log('info', f'Starting QEMU VM with command: {" ".join(cmd)}') try: # Create log directory for QEMU output @@ -599,6 +603,25 @@ def get_vm_screenshot(self) -> bytes | None: except Exception as e: self.log('error', f'Failed to get VM screenshot: 
{e}') return None + + def get_vm_accessibility_tree(self) -> str | None: + """Get accessibility tree from the VM. + + Returns: + Accessibility tree string or None if failed + """ + try: + response = httpx.get( + f'{self.osworld_vm_url}/accessibility', + timeout=10.0 + ) + if response.status_code == 200: + at = response.json().get('AT', '') + return at + return None + except Exception as e: + self.log('error', f'Failed to get VM screenshot: {e}') + return None def _execute_pyautogui_command(self, pyautogui_command: str) -> dict: """Execute a PyAutoGUI command string in the VM. @@ -667,38 +690,66 @@ def _action_to_pyautogui_command(self, action_type: str, parameters: dict) -> st "pyautogui.easeInQuad", "pyautogui.easeOutQuad", "pyautogui.easeInOutQuad", "pyautogui.easeInBounce", "pyautogui.easeInElastic" ]) - duration = random.uniform(0.5, 1) if action_type == "CLICK": x = parameters.get('x') y = parameters.get('y') button = parameters.get('button', 'left') - num_clicks = parameters.get('num_clicks', 1) + num_clicks = parameters.get('clicks', 1) + interval = parameters.get('interval', 0.0) + duration = parameters.get('duration', 0.0) if x is not None and y is not None: - return f"pyautogui.click(x={x}, y={y}, button='{button}', clicks={num_clicks})" + return f"pyautogui.click(x={x}, y={y}, button='{button}', clicks={num_clicks}, interval={interval}, duration={duration})" else: return "pyautogui.click()" elif action_type == "DOUBLE_CLICK": x = parameters.get('x') y = parameters.get('y') + button = parameters.get('button', 'left') + interval = parameters.get('interval', 0.0) + duration = parameters.get('duration', 0.0) if x is not None and y is not None: - return f"pyautogui.doubleClick(x={x}, y={y})" + return f"pyautogui.doubleClick(x={x}, y={y}, button='{button}', interval={interval}, duration={duration})" else: return "pyautogui.doubleClick()" + elif action_type == "TRIPLE_CLICK": + x = parameters.get('x') + y = parameters.get('y') + button = 
parameters.get('button', 'left') + interval = parameters.get('interval', 0.0) + duration = parameters.get('duration', 0.0) + if x is not None and y is not None: + return f"pyautogui.tripleClick(x={x}, y={y}, button='{button}', interval={interval}, duration={duration})" + else: + return "pyautogui.tripleClick()" + elif action_type == "RIGHT_CLICK": x = parameters.get('x') y = parameters.get('y') + interval = parameters.get('interval', 0.0) + duration = parameters.get('duration', 0.0) if x is not None and y is not None: - return f"pyautogui.rightClick(x={x}, y={y})" + return f"pyautogui.rightClick(x={x}, y={y}, interval={interval}, duration={duration})" else: return "pyautogui.rightClick()" + elif action_type == "MIDDLE_CLICK": + x = parameters.get('x') + y = parameters.get('y') + interval = parameters.get('interval', 0.0) + duration = parameters.get('duration', 0.0) + if x is not None and y is not None: + return f"pyautogui.middleClick(x={x}, y={y}, interval={interval}, duration={duration})" + else: + return "pyautogui.click(button='middle')" + elif action_type == "MOVE_TO": x = parameters.get('x') y = parameters.get('y') + duration = parameters.get('duration', 0.0) if x is not None and y is not None: return f"pyautogui.moveTo({x}, {y}, {duration}, {move_mode})" else: @@ -707,24 +758,30 @@ def _action_to_pyautogui_command(self, action_type: str, parameters: dict) -> st elif action_type == "DRAG_TO": x = parameters.get('x') y = parameters.get('y') + duration = parameters.get('duration', 0.0) + button = parameters.get('button', 'left') + mouseDownUp = parameters.get('mouseDownUp', True) if x is not None and y is not None: - return f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)" + return f"pyautogui.dragTo({x}, {y}, button='{button}', duration={duration}, mouseDownUp={mouseDownUp})" return None elif action_type == "SCROLL": - dx = parameters.get('dx', 0) - dy = parameters.get('dy', 0) - commands = [] - if dx != 0: - 
commands.append(f"pyautogui.hscroll({dx})") - if dy != 0: - commands.append(f"pyautogui.vscroll({dy})") - return "; ".join(commands) if commands else None + x = parameters.get('x', None) + y = parameters.get('y', None) + amount = parameters.get('amount', 1) + return f"pyautogui.scroll({amount}, x={x}, y={y})" + + elif action_type == "HSCROLL": + x = parameters.get('x', None) + y = parameters.get('y', None) + amount = parameters.get('amount', 1) + return f"pyautogui.hscroll({amount}, x={x}, y={y})" elif action_type == "TYPING": text = parameters.get('text', '') + interval = parameters.get('interval', 0.0) # Use repr() to properly escape the text (same as OSWorld) - return f"pyautogui.typewrite({repr(text)})" + return f"pyautogui.typewrite({repr(text)}, interval={interval})" elif action_type == "PRESS": key = parameters.get('key', '') @@ -757,6 +814,10 @@ def _action_to_pyautogui_command(self, action_type: str, parameters: dict) -> st elif action_type == "MOUSE_UP": button = parameters.get('button', 'left') return f"pyautogui.mouseUp(button='{button}')" + + elif action_type == "WAIT": + seconds = parameters.get('seconds', 1) + return f"time.sleep({seconds})" else: return None @@ -797,6 +858,8 @@ def osworld_interactive(self, action) -> 'Observation': # Dispatch to appropriate handler if method == 'execute_action': return self._handle_execute_action(params) + elif method == 'execute_agentic_action': + return self._handle_execute_agentic_action(params, action.tool_call_metadata) elif method == 'get_screenshot': return self._handle_get_screenshot() elif method == 'get_accessibility_tree': @@ -856,6 +919,61 @@ def _handle_execute_action(self, params: dict) -> 'Observation': command=str(action_data), exit_code=1, ) + + def _handle_execute_agentic_action(self, params: dict, tool_call_metadata: ToolCallMetadata | None) -> 'Observation': + """Handle execute_action - PyAutoGUI actions like CLICK, TYPING, etc.""" + from openhands.events.observation.osworld import 
OSWorldOutputObservation + from openhands.events.observation import ErrorObservation + import base64 + + # Always save screenshot and accessibility tree. Will leave message formatting to the agent. + include_screenshot = True #self.config.agents['agent'].enable_vision + include_a11y_tree = True #self.config.agents['agent'].enable_a11y_tree + + action_data = params.get('action', params) + + # Convert normalized coordinates to pixel coordinates + if not hasattr(self, 'screen_size'): + self._handle_get_vm_screen_size() + width, height = self.screen_size + if 'parameters' in action_data: + parameters = action_data['parameters'] + if 'x' in parameters: + parameters['x'] = int(parameters['x'] * width) + if 'y' in parameters: + parameters['y'] = int(parameters['y'] * height) + logger.info(f"Converted normalized coordinates to pixel coordinates: {action_data}. Screen size: {width}x{height}.") + + result = self.execute_vm_action(action_data) + + if result.get('status') == 'success': + if include_screenshot: + screenshot_bytes = self.get_vm_screenshot() + screenshot_bytes = base64.b64encode(screenshot_bytes).decode('utf-8') + else: + screenshot_bytes = None + + if include_a11y_tree: + accessibility_tree = self.get_vm_accessibility_tree() + accessibility_tree = simplify_accessibility_tree(accessibility_tree) + else: + accessibility_tree = None + + return OSWorldOutputObservation( + command=str(action_data), + screenshot=screenshot_bytes, + accessibility_tree=accessibility_tree, + tool_call_id=tool_call_metadata.tool_call_id, + name=tool_call_metadata.function_name, + ) + else: + error_msg = result.get('error', result.get('message', 'Unknown error')) + return ErrorObservation( + content=f"Error: {error_msg}", + command=str(action_data), + tool_call_id=tool_call_metadata.tool_call_id, + name=tool_call_metadata.function_name, + ) def _handle_get_screenshot(self) -> 'Observation': """Handle get_screenshot - returns screenshot as base64.""" @@ -878,21 +996,14 @@ def 
_handle_get_accessibility_tree(self) -> 'Observation': """Handle get_accessibility_tree.""" from openhands.events.observation import CmdOutputObservation, ErrorObservation - try: - response = httpx.get( - f'{self.osworld_vm_url}/accessibility', - timeout=10.0 + at = self.get_vm_accessibility_tree() + if at: + return CmdOutputObservation( + content=at, + command='get_accessibility_tree', + exit_code=0, ) - if response.status_code == 200: - at = response.json().get('AT', '') - return CmdOutputObservation( - content=at, - command='get_accessibility_tree', - exit_code=0, - ) - return ErrorObservation(f'Failed to get accessibility tree: {response.status_code}') - except Exception as e: - return ErrorObservation(f'Failed to get accessibility tree: {e}') + return ErrorObservation('Failed to get accessibility tree') def _handle_get_terminal_output(self) -> 'Observation': """Handle get_terminal_output.""" @@ -1174,18 +1285,23 @@ def _handle_get_vm_screen_size(self) -> 'Observation': from openhands.events.observation import CmdOutputObservation, ErrorObservation try: - response = httpx.post( - f'{self.osworld_vm_url}/screen_size', - timeout=10.0 - ) - if response.status_code == 200: - size = response.json() - content = f"Width: {size.get('width', 'unknown')}, Height: {size.get('height', 'unknown')}" - return CmdOutputObservation( - content=content, - command='get_vm_screen_size', - exit_code=0, + if hasattr(self, 'screen_size'): + width, height = self.screen_size + else: + response = httpx.post( + f'{self.osworld_vm_url}/screen_size', + timeout=10.0 ) + if response.status_code == 200: + size = response.json() + width, height = size.get('width', 1920), size.get('height', 1080) + self.screen_size = (width, height) + content = f"Width: {width}, Height: {height}" + return CmdOutputObservation( + content=content, + command='get_vm_screen_size', + exit_code=0, + ) return ErrorObservation(f'Failed to get screen size: {response.status_code}') except Exception as e: return 
ErrorObservation(f'Failed to get screen size: {e}') @@ -1291,3 +1407,7 @@ def _handle_get_vm_directory_tree(self, params: dict) -> 'Observation': except Exception as e: return ErrorObservation(f'Failed to get directory tree: {e}') + def get_microagents_from_selected_repo( + self, selected_repository: str | None + ): + return [] \ No newline at end of file From ca41d4e647fa6884e4d799071a6153a2f1776662 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Tue, 25 Nov 2025 10:58:15 -0600 Subject: [PATCH 09/14] Added agent. --- openhands/nvidia/__init__.py | 2 ++ openhands/nvidia/os_world/evaluate.py | 18 +++++++++++++----- openhands/nvidia/os_world/osworld_handler.py | 4 +--- openhands/nvidia/os_world/osworld_utils.py | 12 +++++++----- .../singularity/osworld_singularity_runtime.py | 10 +++++----- scripts/tests/test_osworld_utils.py | 12 +++++++++++- 6 files changed, 39 insertions(+), 19 deletions(-) diff --git a/openhands/nvidia/__init__.py b/openhands/nvidia/__init__.py index be79b767b..3c7a1dba7 100644 --- a/openhands/nvidia/__init__.py +++ b/openhands/nvidia/__init__.py @@ -4,6 +4,7 @@ from openhands.nvidia.registry import add_name_mapping, register_agent_handler from openhands.nvidia.stem_agent.stem_handler import STEMHandler from openhands.nvidia.swe_agent.swe_agent_handler import SweAgentHandler +from openhands.nvidia.os_world.osworld_handler import OSWorldHandler register_agent_handler(SweAgentHandler()) register_agent_handler(MathHandler()) @@ -27,6 +28,7 @@ add_name_mapping(prorl_dataset, 'prorl', reasoning=True) register_agent_handler(STEMHandler()) register_agent_handler(GuiAgentHandler()) +register_agent_handler(OSWorldHandler()) for code_dataset in ['codecontests', 'apps', 'codeforces', 'taco']: add_name_mapping(code_dataset, 'deepcoder') diff --git a/openhands/nvidia/os_world/evaluate.py b/openhands/nvidia/os_world/evaluate.py index 85ae63df4..6a88335b8 100644 --- a/openhands/nvidia/os_world/evaluate.py +++ b/openhands/nvidia/os_world/evaluate.py @@ -78,22 
+78,30 @@ async def evaluate(self, action_history = []): Evaluate whether the task is successfully completed. """ - postconfig = self.evaluator.get("postconfig", []) - await self.setup_controller.setup(postconfig) + def last_action_is_fail(last_action): + try: + function_type = last_action['tool_calls'][0]['function']['name'] + return function_type == 'fail' + except: + return False - # TODO: special handling for infeasible tasks, might be different to OpenHands + # Special handling for infeasible tasks + # TODO: Currently working on litellm json dumped format. Might need to be modified for other formats. if self.evaluator['func'] == "infeasible": if len(action_history) > 0: last_action = action_history[-1] - if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'): + if last_action_is_fail(last_action): return 1 return 0 else: if len(action_history) > 0: last_action = action_history[-1] - if last_action == "FAIL" or (type(last_action) == dict and last_action.get('action_type') == 'FAIL'): + if last_action_is_fail(last_action): return 0 + postconfig = self.evaluator.get("postconfig", []) + await self.setup_controller.setup(postconfig) + if type(self.metric) == list: # Multiple metrics to evaluate whether the task is successfully completed results = [] diff --git a/openhands/nvidia/os_world/osworld_handler.py b/openhands/nvidia/os_world/osworld_handler.py index 5e4294c18..91b47c839 100644 --- a/openhands/nvidia/os_world/osworld_handler.py +++ b/openhands/nvidia/os_world/osworld_handler.py @@ -61,12 +61,10 @@ async def eval( reward: Optional[Reward] = None, ) -> dict[str, Any]: - if reward is None: - raise ValueError('Reward is required for evaluation of math problems.') - return await evaluate_agent( run_results=job_details.run_results, instance=job_details.instance, + runtime=job_details.runtime, ) def init_exception(self, job_details: JobDetails, exception: Exception) -> dict[str, Any]: diff --git 
a/openhands/nvidia/os_world/osworld_utils.py b/openhands/nvidia/os_world/osworld_utils.py index 908826fba..0a73bc95a 100644 --- a/openhands/nvidia/os_world/osworld_utils.py +++ b/openhands/nvidia/os_world/osworld_utils.py @@ -43,7 +43,6 @@ from openhands.nvidia.os_world.controllers.setup import SetupController from openhands.nvidia.os_world.evaluate import Evaluator -from openhands.utils.ast_process import simplify_accessibility_tree def get_config( @@ -110,8 +109,6 @@ def get_instruction(instance: pd.Series | dict, metadata: EvalMetadata, runtime: if include_a11y_tree: accessibility_tree = runtime.get_vm_accessibility_tree() - if accessibility_tree: - accessibility_tree = simplify_accessibility_tree(accessibility_tree) image_url = None if include_screenshot: @@ -123,7 +120,7 @@ def get_instruction(instance: pd.Series | dict, metadata: EvalMetadata, runtime: def create_runtime(config: OpenHandsConfig, sid: str | None = None) -> Runtime: vm_image_path = os.getenv('OSWORLD_VM_IMAGE_PATH', './OS_images/Ubuntu.qcow2') - assert Path(vm_image_path).exists(), f"ERROR: VM image not found at {vm_image_path}" + assert Path(vm_image_path).exists(), f"ERROR: VM image not found at {vm_image_path}. Export OSWORLD_VM_IMAGE_PATH." 
logger.info(f"Using VM image path: {vm_image_path}") session_id = sid or generate_sid(config) @@ -246,12 +243,16 @@ async def run_agent( raise EvalException('Fatal error detected: ' + state.last_error) except Exception as e: + import traceback + print(traceback.format_exc()) logger.error(f"Error running agent: {e}") # get messages from agent history try: run_results = process_messages_from_agent_state(agent, state, job_details) # type: ignore except Exception as e: + import traceback + print(traceback.format_exc()) logger.error(f"Error while running, failed to retrieve agent messages: {e}") raise Exception(f"Failed to retrieve agent messages: {str(e)}") @@ -265,7 +266,7 @@ async def run_agent( async def evaluate_agent(run_results: dict, instance: dict, runtime: Runtime): try: evaluator = Evaluator(instance, runtime.setup_controller) - score = await evaluator.evaluate() + score = await evaluator.evaluate(run_results['messages']) if score > 0.99: return {'resolved': True, 'reward': score} return {'resolved': False, 'reward': score} @@ -307,6 +308,7 @@ def process_messages_from_agent_state( tools = agent.tools return { + 'problem_id': job_details.instance.get('id', None), 'messages': messages, 'tools': tools, 'end_properly': not state.get_last_agent_format_error(), diff --git a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py index dfa7291d3..c4f31447b 100644 --- a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py +++ b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py @@ -36,7 +36,6 @@ from openhands.runtime.utils import find_available_tcp_port from openhands.runtime.utils.command import DEFAULT_MAIN_MODULE -from openhands.utils.ast_process import simplify_accessibility_tree from openhands.events.tool import ToolCallMetadata # Port ranges for OSWorld VM services @@ -595,7 +594,7 @@ def get_vm_screenshot(self) -> bytes | None: try: response = httpx.get( 
f'{self.osworld_vm_url}/screenshot', - timeout=10.0 + timeout=30.0 ) if response.status_code == 200: return response.content @@ -613,7 +612,7 @@ def get_vm_accessibility_tree(self) -> str | None: try: response = httpx.get( f'{self.osworld_vm_url}/accessibility', - timeout=10.0 + timeout=30.0 ) if response.status_code == 200: at = response.json().get('AT', '') @@ -955,7 +954,7 @@ def _handle_execute_agentic_action(self, params: dict, tool_call_metadata: ToolC if include_a11y_tree: accessibility_tree = self.get_vm_accessibility_tree() - accessibility_tree = simplify_accessibility_tree(accessibility_tree) + #accessibility_tree = linearize_accessibility_tree(accessibility_tree) else: accessibility_tree = None @@ -968,10 +967,11 @@ def _handle_execute_agentic_action(self, params: dict, tool_call_metadata: ToolC ) else: error_msg = result.get('error', result.get('message', 'Unknown error')) + logger.error(f"Error in agentic action: action_data={action_data}, error={error_msg}") return ErrorObservation( content=f"Error: {error_msg}", command=str(action_data), - tool_call_id=tool_call_metadata.tool_call_id, + error_id=tool_call_metadata.tool_call_id, name=tool_call_metadata.function_name, ) diff --git a/scripts/tests/test_osworld_utils.py b/scripts/tests/test_osworld_utils.py index 2d03c02c0..44a68c6a3 100644 --- a/scripts/tests/test_osworld_utils.py +++ b/scripts/tests/test_osworld_utils.py @@ -48,7 +48,7 @@ async def run(instance): run_results = await run_agent(job_details) import pdb; pdb.set_trace() - eval_results = await evaluate_agent({}, instance, runtime) + eval_results = await evaluate_agent(run_results, instance, runtime) return eval_results @@ -124,6 +124,16 @@ async def run(instance): "possibility_of_env_change": "low" } + import json + data_file = '/root/OSWorld/osworld_test_nogdrive.json' + all_data = [] + with open(data_file, 'r') as f: + for line in f: + data = json.loads(line) + all_data.append(data) + instance = all_data[3] # 3 is a good example of 
pruning too much? + print(instance) + # Initialize the agents results = asyncio.run(run(instance)) From 5b2f8f6cceca9a224343be1b3dfb6e01916a11f2 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Tue, 25 Nov 2025 10:58:36 -0600 Subject: [PATCH 10/14] Added osworld ast tree. --- openhands/agenthub/gui_agent/osworld_agent.py | 20 +- .../accessibility_tree_wrap/__init__.py | 0 .../heuristic_retrieve.py | 310 ++++++++++++++++++ 3 files changed, 323 insertions(+), 7 deletions(-) create mode 100644 openhands/nvidia/os_world/accessibility_tree_wrap/__init__.py create mode 100644 openhands/nvidia/os_world/accessibility_tree_wrap/heuristic_retrieve.py diff --git a/openhands/agenthub/gui_agent/osworld_agent.py b/openhands/agenthub/gui_agent/osworld_agent.py index 74cfedf2e..cb580a7e0 100644 --- a/openhands/agenthub/gui_agent/osworld_agent.py +++ b/openhands/agenthub/gui_agent/osworld_agent.py @@ -28,6 +28,7 @@ AgentToolCallError, AgentLengthError, ) +from openhands.nvidia.os_world.accessibility_tree_wrap.heuristic_retrieve import linearize_accessibility_tree from openhands.agenthub.gui_agent.prompts.osworld import OSWORLD_OBSERVATION_FEEDBACK_PROMPT, ERROR_OBSERVATION_FEEDBACK_PROMPT @@ -62,7 +63,8 @@ def convert_message_action_to_message( ) -> Message: text_content = action.content if include_a11y_tree: - text_content += f"\n\nAccessibility Tree: {action.accessibility_tree}" + accessibility_tree = linearize_accessibility_tree(action.accessibility_tree) + text_content += f"\n\nAccessibility Tree:\n{accessibility_tree}" content = [TextContent(text=text_content)] if include_screenshot: content.append(ImageContent(image_urls=action.image_urls)) @@ -80,7 +82,8 @@ def convert_observation_to_message( if isinstance(observation, OSWorldOutputObservation): prompt_text = OSWORLD_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction) if include_a11y_tree: - prompt_text += f"\n\nAccessibility Tree: {observation.accessibility_tree}" + accessibility_tree = 
linearize_accessibility_tree(observation.accessibility_tree) + prompt_text += f"\n\nAccessibility Tree:\n{accessibility_tree}" content = [TextContent(text=prompt_text)] if include_screenshot: content.append(ImageContent(image_urls=observation.image_urls)) @@ -95,7 +98,7 @@ def convert_observation_to_message( return Message( role='tool', # or user? content=[TextContent(text=observation.content)], - tool_call_id=observation.tool_call_id, + tool_call_id=observation.error_id, name=observation.name, ) @@ -105,7 +108,8 @@ def convert_message_action_to_message_full_state( ) -> Message: test_content = action.content if include_a11y_tree: - test_content += f"\n\nAccessibility Tree: {action.accessibility_tree}" + accessibility_tree = linearize_accessibility_tree(action.accessibility_tree) + test_content += f"\n\nAccessibility Tree:\n{accessibility_tree}" content = [TextContent(text=action.content)] content.append(ImageContent(image_urls=action.image_urls)) content.append(TextContent(text=action.accessibility_tree)) @@ -122,7 +126,8 @@ def convert_observation_to_message_full_state( if isinstance(observation, OSWorldOutputObservation): prompt_text = OSWORLD_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction) if include_a11y_tree: - prompt_text += f"\n\nAccessibility Tree: {observation.accessibility_tree}" + accessibility_tree = linearize_accessibility_tree(observation.accessibility_tree) + prompt_text += f"\n\nAccessibility Tree:\n{accessibility_tree}" content = [TextContent(text=prompt_text)] # We always add screenshot and accessibility tree to the message @@ -139,7 +144,7 @@ def convert_observation_to_message_full_state( return Message( role='tool', # or user? 
content=[TextContent(text=observation.content)], - tool_call_id=observation.tool_call_id, + tool_call_id=observation.error_id, name=observation.name, ) @@ -303,6 +308,7 @@ def _get_messages_from_agent_state( ) -> list[dict]: """This message although similar to _get_messages, is used to process the messages from the agent state. Key difference is to preserve all the content items in the message, including image and accessibility tree. + Also will add AgentFinishAction to the messages. Used in process_messages_from_agent_state. Args: @@ -324,7 +330,7 @@ def _get_messages_from_agent_state( # Build history prompts (alternating assistant/user messages) for event in events: - if isinstance(event, OSWorldInteractiveAction): + if isinstance(event, OSWorldInteractiveAction) or isinstance(event, AgentFinishAction): messages.append(convert_action_to_message(event)) elif isinstance(event, MessageAction): messages.append(convert_message_action_to_message_full_state(event, include_a11y_tree=include_a11y_tree)) diff --git a/openhands/nvidia/os_world/accessibility_tree_wrap/__init__.py b/openhands/nvidia/os_world/accessibility_tree_wrap/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/openhands/nvidia/os_world/accessibility_tree_wrap/heuristic_retrieve.py b/openhands/nvidia/os_world/accessibility_tree_wrap/heuristic_retrieve.py new file mode 100644 index 000000000..fa7986e0d --- /dev/null +++ b/openhands/nvidia/os_world/accessibility_tree_wrap/heuristic_retrieve.py @@ -0,0 +1,310 @@ +import io +import xml.etree.ElementTree as ET +from typing import Tuple, List + +from PIL import Image, ImageDraw, ImageFont + + +NORMALIZE_WIDTH = 1920 +NORMALIZE_HEIGHT = 1080 + + +def _clamp01(value: float) -> float: + if value < 0.0: + return 0.0 + if value > 1.0: + return 1.0 + return value + + +def _normalize_coord_size(x: int, y: int, w: int, h: int) -> Tuple[float, float, float, float]: + nx = round(_clamp01(x / float(NORMALIZE_WIDTH)), 3) + ny = round(_clamp01(y / 
float(NORMALIZE_HEIGHT)), 3) + nw = round(_clamp01(w / float(NORMALIZE_WIDTH)), 3) + nh = round(_clamp01(h / float(NORMALIZE_HEIGHT)), 3) + return nx, ny, nw, nh + + +def find_leaf_nodes(xlm_file_str): + if not xlm_file_str: + return [] + + root = ET.fromstring(xlm_file_str) + + # Recursive function to traverse the XML tree and collect leaf nodes + def collect_leaf_nodes(node, leaf_nodes): + # If the node has no children, it is a leaf node, add it to the list + if not list(node): + leaf_nodes.append(node) + # If the node has children, recurse on each child + for child in node: + collect_leaf_nodes(child, leaf_nodes) + + # List to hold all leaf nodes + leaf_nodes = [] + collect_leaf_nodes(root, leaf_nodes) + return leaf_nodes + +attributes_ns_ubuntu = "https://accessibility.windows.example.org/ns/attributes" +attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes" +state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state" +state_ns_windows = "https://accessibility.windows.example.org/ns/state" +component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component" +component_ns_windows = "https://accessibility.windows.example.org/ns/component" +value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value" +value_ns_windows = "https://accessibility.windows.example.org/ns/value" +class_ns_windows = "https://accessibility.windows.example.org/ns/class" + + +def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool: + if platform == "ubuntu": + _state_ns = state_ns_ubuntu + _component_ns = component_ns_ubuntu + elif platform == "windows": + _state_ns = state_ns_windows + _component_ns = component_ns_windows + else: + raise ValueError("Invalid platform, must be 'ubuntu' or 'windows'") + + keeps: bool = node.tag.startswith("document") \ + or node.tag.endswith("item") \ + or node.tag.endswith("button") \ + or node.tag.endswith("heading") \ + or node.tag.endswith("label") \ + or node.tag.endswith("scrollbar") 
\ + or node.tag.endswith("searchbox") \ + or node.tag.endswith("textbox") \ + or node.tag.endswith("link") \ + or node.tag.endswith("tabelement") \ + or node.tag.endswith("textfield") \ + or node.tag.endswith("textarea") \ + or node.tag.endswith("menu") \ + or node.tag in {"alert", "canvas", "check-box" + , "combo-box", "entry", "icon" + , "image", "paragraph", "scroll-bar" + , "section", "slider", "static" + , "table-cell", "terminal", "text" + , "netuiribbontab", "start", "trayclockwclass" + , "traydummysearchcontrol", "uiimage", "uiproperty" + , "uiribboncommandbar" + } + keeps = keeps and ( + platform == "ubuntu" + and node.get("{{{:}}}showing".format(_state_ns), "false") == "true" + and node.get("{{{:}}}visible".format(_state_ns), "false") == "true" + or platform == "windows" + and node.get("{{{:}}}visible".format(_state_ns), "false") == "true" + ) \ + and ( + node.get("{{{:}}}enabled".format(_state_ns), "false") == "true" + or node.get("{{{:}}}editable".format(_state_ns), "false") == "true" + or node.get("{{{:}}}expandable".format(_state_ns), "false") == "true" + or node.get("{{{:}}}checkable".format(_state_ns), "false") == "true" + ) \ + and ( + node.get("name", "") != "" or node.text is not None and len(node.text) > 0 \ + or check_image and node.get("image", "false") == "true" + ) + + coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(_component_ns), "(-1, -1)")) + sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(_component_ns), "(-1, -1)")) + keeps = keeps and coordinates[0] >= 0 and coordinates[1] >= 0 and sizes[0] > 0 and sizes[1] > 0 + return keeps + + +def filter_nodes(root: ET, platform="ubuntu", check_image=False): + filtered_nodes = [] + + for node in root.iter(): + if judge_node(node, platform, check_image): + filtered_nodes.append(node) + # print(ET.tostring(node, encoding="unicode")) + + return filtered_nodes + + +def draw_bounding_boxes(nodes, image_file_content, down_sampling_ratio=1.0, platform="ubuntu"): + + 
if platform == "ubuntu": + _state_ns = state_ns_ubuntu + _component_ns = component_ns_ubuntu + _value_ns = value_ns_ubuntu + elif platform == "windows": + _state_ns = state_ns_windows + _component_ns = component_ns_windows + _value_ns = value_ns_windows + else: + raise ValueError("Invalid platform, must be 'ubuntu' or 'windows'") + + # Load the screenshot image + image_stream = io.BytesIO(image_file_content) + image = Image.open(image_stream) + if float(down_sampling_ratio) != 1.0: + image = image.resize((int(image.size[0] * down_sampling_ratio), int(image.size[1] * down_sampling_ratio))) + draw = ImageDraw.Draw(image) + marks = [] + drew_nodes = [] + text_informations: List[str] = ["index\ttag\tname\ttext"] + + try: + # Adjust the path to the font file you have or use a default one + font = ImageFont.truetype("arial.ttf", 15) + except IOError: + # Fallback to a basic font if the specified font can't be loaded + font = ImageFont.load_default() + + index = 1 + + # Loop over all the visible nodes and draw their bounding boxes + for _node in nodes: + coords_str = _node.attrib.get('{{{:}}}screencoord'.format(_component_ns)) + size_str = _node.attrib.get('{{{:}}}size'.format(_component_ns)) + + if coords_str and size_str: + try: + # Parse the coordinates and size from the strings + coords = tuple(map(int, coords_str.strip('()').split(', '))) + size = tuple(map(int, size_str.strip('()').split(', '))) + + import copy + original_coords = copy.deepcopy(coords) + original_size = copy.deepcopy(size) + + if float(down_sampling_ratio) != 1.0: + # Downsample the coordinates and size + coords = tuple(int(coord * down_sampling_ratio) for coord in coords) + size = tuple(int(s * down_sampling_ratio) for s in size) + + # Check for negative sizes + if size[0] <= 0 or size[1] <= 0: + raise ValueError(f"Size must be positive, got: {size}") + + # Calculate the bottom-right corner of the bounding box + bottom_right = (coords[0] + size[0], coords[1] + size[1]) + + # Check that bottom_right 
> coords (x1 >= x0, y1 >= y0) + if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]: + raise ValueError(f"Invalid coordinates or size, coords: {coords}, size: {size}") + + # Check if the area only contains one color + cropped_image = image.crop((*coords, *bottom_right)) + if len(set(list(cropped_image.getdata()))) == 1: + continue + + # Draw rectangle on image + draw.rectangle([coords, bottom_right], outline="red", width=1) + + # Draw index number at the bottom left of the bounding box with black background + text_position = (coords[0], bottom_right[1]) # Adjust Y to be above the bottom right + text_bbox: Tuple[int, int, int, int] = draw.textbbox(text_position, str(index), font=font, anchor="lb") + # offset: int = bottom_right[1]-text_bbox[3] + # text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset) + + # draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black') + draw.rectangle(text_bbox, fill='black') + draw.text(text_position, str(index), font=font, anchor="lb", fill="white") + + # each mark is an x, y, w, h tuple + nx, ny, nw, nh = _normalize_coord_size( + original_coords[0], original_coords[1], original_size[0], original_size[1] + ) + marks.append([nx, ny, nw, nh]) + drew_nodes.append(_node) + + if _node.text: + node_text = (_node.text if '"' not in _node.text \ + else '"{:}"'.format(_node.text.replace('"', '""')) + ) + elif _node.get("{{{:}}}class".format(class_ns_windows), "").endswith("EditWrapper") \ + and _node.get("{{{:}}}value".format(_value_ns)): + node_text = _node.get("{{{:}}}value".format(_value_ns), "") + node_text = (node_text if '"' not in node_text \ + else '"{:}"'.format(node_text.replace('"', '""')) + ) + else: + node_text = '""' + text_information: str = "{:d}\t{:}\t{:}\t{:}".format(index, _node.tag, _node.get("name", ""), node_text) + text_informations.append(text_information) + + index += 1 + + except ValueError: + pass + + output_image_stream = io.BytesIO() + 
image.save(output_image_stream, format='PNG') + image_content = output_image_stream.getvalue() + + return marks, drew_nodes, "\n".join(text_informations), image_content + + +def print_nodes_with_indent(nodes, indent=0): + for node in nodes: + print(' ' * indent, node.tag, node.attrib) + print_nodes_with_indent(node, indent + 2) + +def linearize_accessibility_tree(accessibility_tree, platform="ubuntu"): + + if platform == "ubuntu": + _attributes_ns = attributes_ns_ubuntu + _state_ns = state_ns_ubuntu + _component_ns = component_ns_ubuntu + _value_ns = value_ns_ubuntu + elif platform == "windows": + _attributes_ns = attributes_ns_windows + _state_ns = state_ns_windows + _component_ns = component_ns_windows + _value_ns = value_ns_windows + else: + raise ValueError("Invalid platform, must be 'ubuntu' or 'windows'") + + filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree), platform) + linearized_accessibility_tree = ["tag\tname\ttext\tclass\tdescription\tposition (center x&y)\tsize (w&h)"] + + # Linearize the accessibility tree nodes into a table format + for node in filtered_nodes: + if node.text: + text = ( + node.text if '"' not in node.text \ + else '"{:}"'.format(node.text.replace('"', '""')) + ) + + elif node.get("{{{:}}}class".format(class_ns_windows), "").endswith("EditWrapper") \ + and node.get("{{{:}}}value".format(_value_ns)): + node_text = node.get("{{{:}}}value".format(_value_ns), "") + text = (node_text if '"' not in node_text \ + else '"{:}"'.format(node_text.replace('"', '""')) + ) + else: + text = '""' + + # Normalize position (center) and size to [0, 1] with 3 decimal precision using 1920x1080 + pos_value = node.get('{{{:}}}screencoord'.format(_component_ns), "") + size_value = node.get('{{{:}}}size'.format(_component_ns), "") + normalized_pos_str = pos_value + normalized_size_str = size_value + try: + coords = eval(pos_value) if pos_value else None + size = eval(size_value) if size_value else None + if isinstance(coords, tuple) and 
isinstance(size, tuple) and len(coords) == 2 and len(size) == 2: + nx, ny, nw, nh = _normalize_coord_size(int(coords[0]), int(coords[1]), int(size[0]), int(size[1])) + # compute center in normalized coordinates + cx = round(_clamp01(nx + (nw / 2.0)), 3) + cy = round(_clamp01(ny + (nh / 2.0)), 3) + normalized_pos_str = "({:.3f}, {:.3f})".format(cx, cy) + normalized_size_str = "({:.3f}, {:.3f})".format(nw, nh) + except Exception: + pass + + linearized_accessibility_tree.append( + "{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}".format( + node.tag, node.get("name", ""), + text, + node.get("{{{:}}}class".format(_attributes_ns), "") if platform == "ubuntu" else node.get("{{{:}}}class".format(class_ns_windows), ""), + node.get("{{{:}}}description".format(_attributes_ns), ""), + normalized_pos_str, + normalized_size_str + ) + ) + + return "\n".join(linearized_accessibility_tree) \ No newline at end of file From e2cd2a60996e6d736a97fe3e208a534acc1ca276 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Tue, 25 Nov 2025 11:47:59 -0600 Subject: [PATCH 11/14] Rotated secret. 
--- scripts/tests/test_async_server_osworld.py | 84 ++++++++++++++++++++++ scripts/tests/test_osworld_utils.py | 3 +- 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 scripts/tests/test_async_server_osworld.py diff --git a/scripts/tests/test_async_server_osworld.py b/scripts/tests/test_async_server_osworld.py new file mode 100644 index 000000000..cab3062f0 --- /dev/null +++ b/scripts/tests/test_async_server_osworld.py @@ -0,0 +1,84 @@ +""" +Migrated test script originally from openhands.nvidia.async_server +""" + +import time +import copy +import os + +from openhands.nvidia.async_server import OpenHandsServer + + +def test_server( + total_jobs: int = 4, + max_parallel_jobs: int = 2, + allow_skip_eval: bool = False, + timeout: int = 50, +): + from concurrent.futures import ThreadPoolExecutor + + import json + data_file = '/root/OSWorld/osworld_test_nogdrive.json' + all_data = [] + with open(data_file, 'r') as f: + for line in f: + data = json.loads(line) + all_data.append(data) + + instance = all_data[3] # 3 is a good example of pruning too much? 
+ print(instance) + + requests = [] + for i in range(total_jobs): + cur = copy.deepcopy(instance) + cur['trajectory_id'] = i + requests.append(cur) + + llm_server_address = 'https://api.deepseek.com/chat/completions' + sampling_params = { + 'model': 'deepseek/deepseek-chat', + 'api_key': os.getenv('DEEPSEEK_API_KEY', ''), + 'modify_params': False, + 'log_completions': False, + 'native_tool_calling': True, + 'temperature': 0.6, + 'max_iterations': 35, + } + + print('Starting server') + server = OpenHandsServer( + llm_server_addresses=[llm_server_address, llm_server_address], + max_init_workers=max_parallel_jobs, + max_run_workers=max_parallel_jobs, + allow_skip_eval=allow_skip_eval, + ) + server.start() + print('Server started') + + print('Job submission started') + + # Process instances using ThreadPoolExecutor for parallel processing + with ThreadPoolExecutor(max_workers=len(requests)) as executor: + futures = [ + executor.submit( + server.process, inst, dict(sampling_params), timeout=timeout + ) + for inst in requests + ] + results = [future.result() for future in futures] + + print('Job submission finished') + # print(results) + server.stop() + return results + + +if __name__ == '__main__': + start = time.time() + # set timeout approriate to terminate + results = test_server( + total_jobs=1, max_parallel_jobs=1, allow_skip_eval=False, timeout=6000 + ) + # Don't print full messages + print(f'Time taken: {time.time() - start}') + print('All tests passed!') diff --git a/scripts/tests/test_osworld_utils.py b/scripts/tests/test_osworld_utils.py index 44a68c6a3..d00f66c74 100644 --- a/scripts/tests/test_osworld_utils.py +++ b/scripts/tests/test_osworld_utils.py @@ -1,4 +1,5 @@ import asyncio +import os import numpy as np import pandas as pd @@ -18,7 +19,7 @@ async def run(instance): max_iterations = 35 sampling_params = { 'model': 'deepseek/deepseek-chat', - 'api_key': 'sk-697e5dc7145849a3b2ad1595718b35f2', + 'api_key': os.getenv('DEEPSEEK_API_KEY', ''), 
'modify_params': False, 'log_completions': True, 'native_tool_calling': True, From ca94689317f76e0e6a5ca470be4a964179dfb8ff Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Tue, 25 Nov 2025 11:49:09 -0600 Subject: [PATCH 12/14] Resolved server to skip runtime.close during RUN for osworld. --- openhands/nvidia/async_server.py | 7 ++++++- openhands/nvidia/async_server_process.py | 5 ++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/openhands/nvidia/async_server.py b/openhands/nvidia/async_server.py index 8af4be22a..59a2c0088 100644 --- a/openhands/nvidia/async_server.py +++ b/openhands/nvidia/async_server.py @@ -430,7 +430,8 @@ async def _worker(self, wid, job_type: JobType): ) job_details.run_results = run_results # Close runtime (automatically in "others" phase - doesn't count toward timeout) - if job_details.runtime: + # OSWorld runtime is closed in evaluate function + if dataset_type != 'osworld' and job_details.runtime: self._cleanup_job_runtime(job_details.runtime, job_id) job_details.runtime = None # Push to evaluation queue (automatically "others" phase) @@ -454,6 +455,10 @@ async def _worker(self, wid, job_type: JobType): job_details.eval_results = eval_report if job_details.event is not None: job_details.event.set() + # Close runtime if needed + if job_details.runtime: + self._cleanup_job_runtime(job_details.runtime, job_id) + job_details.runtime = None except TimeoutError as e: logger.warning( diff --git a/openhands/nvidia/async_server_process.py b/openhands/nvidia/async_server_process.py index ded74da58..d0fbd5174 100644 --- a/openhands/nvidia/async_server_process.py +++ b/openhands/nvidia/async_server_process.py @@ -231,7 +231,7 @@ async def run_step(self, job_type: JobType): ) self.job_details.run_results = run_results # Close runtime (automatically in "others" phase - doesn't count toward timeout) - if self.job_details.runtime: + if data_source != 'osworld' and self.job_details.runtime: 
self._cleanup_job_runtime(self.job_details.runtime, self.job_id) self.job_details.runtime = None @@ -253,6 +253,9 @@ async def run_step(self, job_type: JobType): self.job_details.eval_results = eval_report if self.job_details.event is not None: self.job_details.event.set() + if self.job_details.runtime: + self._cleanup_job_runtime(self.job_details.runtime, self.job_id) + self.job_details.runtime = None except TimeoutError as e: self.job_details.timeout_error = True From 986c70d872822ff1bba735704eff48d3e05ffd48 Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Tue, 25 Nov 2025 14:47:16 -0600 Subject: [PATCH 13/14] Fixed agent and evaluator. --- openhands/nvidia/async_server_process.py | 2 +- openhands/nvidia/os_world/evaluate.py | 36 ++++++++----------- openhands/nvidia/os_world/osworld_utils.py | 3 +- .../osworld_singularity_runtime.py | 4 +-- 4 files changed, 20 insertions(+), 25 deletions(-) diff --git a/openhands/nvidia/async_server_process.py b/openhands/nvidia/async_server_process.py index d0fbd5174..ac1970ad5 100644 --- a/openhands/nvidia/async_server_process.py +++ b/openhands/nvidia/async_server_process.py @@ -231,7 +231,7 @@ async def run_step(self, job_type: JobType): ) self.job_details.run_results = run_results # Close runtime (automatically in "others" phase - doesn't count toward timeout) - if data_source != 'osworld' and self.job_details.runtime: + if dataset_type != 'osworld' and self.job_details.runtime: self._cleanup_job_runtime(self.job_details.runtime, self.job_id) self.job_details.runtime = None diff --git a/openhands/nvidia/os_world/evaluate.py b/openhands/nvidia/os_world/evaluate.py index 6a88335b8..cd3478021 100644 --- a/openhands/nvidia/os_world/evaluate.py +++ b/openhands/nvidia/os_world/evaluate.py @@ -5,6 +5,13 @@ from openhands.nvidia.os_world import metrics, getters from openhands.core.logger import openhands_logger as logger +async def function_wrapper(func, *args, **kwargs): + # wrap function to handle coroutine functions + if 
inspect.iscoroutinefunction(func): + return await func(*args, **kwargs) + else: + return func(*args, **kwargs) + class Evaluator: def __init__(self, task_config: Dict[str, Any], controller): self.setup_controller = controller @@ -111,24 +118,17 @@ def last_action_is_fail(last_action): for idx, metric in enumerate(self.metric): try: config = self.evaluator["result"][idx] - - # Check if it's a coroutine function and await if necessary - if inspect.iscoroutinefunction(self.result_getter[idx]): - result_state = await self.result_getter[idx](self, config) - else: - result_state = self.result_getter[idx](self, config) - - + result_state = await function_wrapper(self.result_getter[idx], self, config) except FileNotFoundError: logger.error("File not found!") if self.metric_conj == 'and': return 0 if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]: - expected_state = self.expected_getter[idx](self, self.evaluator["expected"][idx]) - metric: int = metric(result_state, expected_state, **self.metric_options[idx]) + expected_state = await function_wrapper(self.expected_getter[idx], self, self.evaluator["expected"][idx]) + metric: int = await function_wrapper(metric, result_state, expected_state, **self.metric_options[idx]) else: - metric: int = metric(result_state, **self.metric_options[idx]) + metric: int = await function_wrapper(metric, result_state, **self.metric_options[idx]) if self.metric_conj == 'and' and float(metric) == 0.0: return 0 @@ -141,21 +141,15 @@ def last_action_is_fail(last_action): else: # Single metric to evaluate whether the task is successfully completed try: - - # Check if it's a coroutine function and await if necessary - if inspect.iscoroutinefunction(self.result_getter): - result_state = await self.result_getter(self, self.evaluator["result"]) - else: - result_state = self.result_getter(self, self.evaluator["result"]) - + result_state = await function_wrapper(self.result_getter, self, self.evaluator["result"]) 
except FileNotFoundError: logger.error("File not found!") return 0 if "expected" in self.evaluator and self.expected_getter and self.evaluator["expected"]: - expected_state = self.expected_getter(self, self.evaluator["expected"]) - metric: float = self.metric(result_state, expected_state, **self.metric_options) + expected_state = await function_wrapper(self.expected_getter, self, self.evaluator["expected"]) + metric: float = await function_wrapper(self.metric, result_state, expected_state, **self.metric_options) else: - metric: float = self.metric(result_state, **self.metric_options) + metric: float = await function_wrapper(self.metric, result_state, **self.metric_options) return metric \ No newline at end of file diff --git a/openhands/nvidia/os_world/osworld_utils.py b/openhands/nvidia/os_world/osworld_utils.py index 0a73bc95a..61e66a454 100644 --- a/openhands/nvidia/os_world/osworld_utils.py +++ b/openhands/nvidia/os_world/osworld_utils.py @@ -267,7 +267,8 @@ async def evaluate_agent(run_results: dict, instance: dict, runtime: Runtime): try: evaluator = Evaluator(instance, runtime.setup_controller) score = await evaluator.evaluate(run_results['messages']) - if score > 0.99: + # Some evaluation metrics are fuzzy matching such as pdf comparison + if score > 0.95: return {'resolved': True, 'reward': score} return {'resolved': False, 'reward': score} except: diff --git a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py index c4f31447b..54b782164 100644 --- a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py +++ b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py @@ -241,8 +241,8 @@ def _get_qemu_command(self) -> list[str]: ) cmd.extend([ - '-m', '8G', - '-smp', '8', + '-m', '2G', + '-smp', '2', '-drive', f'file={vm_image_container_path},if=ide', '-netdev', portfwd, '-device', 'virtio-net-pci,netdev=net0', From 02daba33a810b5a0fffec7b3d0581880760f3ec2 
Mon Sep 17 00:00:00 2001 From: Mingjie Liu Date: Tue, 25 Nov 2025 17:43:26 -0600 Subject: [PATCH 14/14] Improved. Added pause_time. --- openhands/agenthub/gui_agent/osworld_agent.py | 53 ++++++++++++++----- .../agenthub/gui_agent/prompts/osworld.py | 6 +-- openhands/events/action/os.py | 1 + openhands/events/observation/osworld.py | 9 ++-- openhands/nvidia/os_world/osworld_utils.py | 9 +++- openhands/nvidia/registry.py | 2 + .../osworld_singularity_runtime.py | 10 ++-- scripts/tests/test_async_server_osworld.py | 4 +- scripts/tests/test_osworld_utils.py | 10 ++-- 9 files changed, 74 insertions(+), 30 deletions(-) diff --git a/openhands/agenthub/gui_agent/osworld_agent.py b/openhands/agenthub/gui_agent/osworld_agent.py index cb580a7e0..518d5f43a 100644 --- a/openhands/agenthub/gui_agent/osworld_agent.py +++ b/openhands/agenthub/gui_agent/osworld_agent.py @@ -47,6 +47,10 @@ def convert_action_to_message(action: OSWorldInteractiveAction) -> Message: output_ids = getattr(llm_response.choices[0], 'output_ids', None) logprobs = getattr(llm_response.choices[0], 'logprobs', None) + text_content = assistant_msg.content + if text_content is None: + text_content = '' + return Message( role=getattr(assistant_msg, 'role', 'assistant'), content=[TextContent(text=assistant_msg.content)], @@ -63,11 +67,19 @@ def convert_message_action_to_message( ) -> Message: text_content = action.content if include_a11y_tree: - accessibility_tree = linearize_accessibility_tree(action.accessibility_tree) - text_content += f"\n\nAccessibility Tree:\n{accessibility_tree}" + accessibility_tree = action.accessibility_tree + if accessibility_tree is None or len(accessibility_tree) < 1: + logger.error('Accessibility tree is None or empty, skipping') + else: + accessibility_tree = linearize_accessibility_tree(accessibility_tree) + text_content += f"\n\nAccessibility Tree:\n{accessibility_tree}" content = [TextContent(text=text_content)] if include_screenshot: - 
content.append(ImageContent(image_urls=action.image_urls))
+        image_urls = action.image_urls
+        if image_urls is None or len(image_urls) < 1:
+            logger.error('Image urls is None or empty, skipping')
+        else:
+            content.append(ImageContent(image_urls=image_urls))
     return Message(
         role='user',
         content=content,
@@ -82,11 +94,19 @@ def convert_observation_to_message(
     if isinstance(observation, OSWorldOutputObservation):
         prompt_text = OSWORLD_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction)
         if include_a11y_tree:
-            accessibility_tree = linearize_accessibility_tree(observation.accessibility_tree)
-            prompt_text += f"\n\nAccessibility Tree:\n{accessibility_tree}"
+            accessibility_tree = observation.accessibility_tree
+            if accessibility_tree is None or len(accessibility_tree) < 1:
+                logger.error('Accessibility tree is None or empty, skipping')
+            else:
+                accessibility_tree = linearize_accessibility_tree(accessibility_tree)
+                prompt_text += f"\n\nAccessibility Tree:\n{accessibility_tree}"
         content = [TextContent(text=prompt_text)]
         if include_screenshot:
-            content.append(ImageContent(image_urls=observation.image_urls))
+            image_url = observation.image_urls
+            if image_url is None or len(image_url) < 1:
+                logger.error('Image urls is None or empty, skipping')
+            else:
+                content.append(ImageContent(image_urls=image_url))
         return Message(
             role='tool', # or user?
content=content, @@ -106,11 +126,13 @@ def convert_message_action_to_message_full_state( action: MessageAction, include_a11y_tree: bool = True, ) -> Message: - test_content = action.content + text_content = action.content if include_a11y_tree: - accessibility_tree = linearize_accessibility_tree(action.accessibility_tree) - test_content += f"\n\nAccessibility Tree:\n{accessibility_tree}" - content = [TextContent(text=action.content)] + accessibility_tree = action.accessibility_tree + if accessibility_tree and len(accessibility_tree) > 0: + accessibility_tree = linearize_accessibility_tree(action.accessibility_tree) + text_content += f"\n\nAccessibility Tree:\n{accessibility_tree}" + content = [TextContent(text=text_content)] content.append(ImageContent(image_urls=action.image_urls)) content.append(TextContent(text=action.accessibility_tree)) return Message( @@ -126,8 +148,10 @@ def convert_observation_to_message_full_state( if isinstance(observation, OSWorldOutputObservation): prompt_text = OSWORLD_OBSERVATION_FEEDBACK_PROMPT.format(instruction=instruction) if include_a11y_tree: - accessibility_tree = linearize_accessibility_tree(observation.accessibility_tree) - prompt_text += f"\n\nAccessibility Tree:\n{accessibility_tree}" + accessibility_tree = observation.accessibility_tree + if accessibility_tree and len(accessibility_tree) > 0: + accessibility_tree = linearize_accessibility_tree(accessibility_tree) + prompt_text += f"\n\nAccessibility Tree:\n{accessibility_tree}" content = [TextContent(text=prompt_text)] # We always add screenshot and accessibility tree to the message @@ -168,6 +192,7 @@ def __init__( """ super().__init__(llm, config) + self.pause_time = 0.0 self.system_prompt = os.path.join(os.path.dirname(__file__), 'prompts', 'system_prompt_osworld.j2') with open(self.system_prompt, 'r') as file: self.system_prompt = file.read() @@ -296,10 +321,14 @@ def step(self, state: State) -> Action: } params['tools'] = self.tools params['extra_body'] = {'metadata': 
state.to_llm_metadata(agent_name=self.name)}
+        # NOTE(review): stray pdb breakpoint removed before commit
         response = self.llm.completion(**params)
         import pdb; pdb.set_trace()
         logger.debug(f'Response from LLM: {response}')
         action = codeact_function_calling.response_to_actions(response, timeout=self.config.action_timeout)
+        if self.pause_time > 0.5:
+            logger.info(f'Setting pause time to {self.pause_time} seconds for agentic action')
+            action.pause_time = self.pause_time
         logger.debug(f'Actions after response_to_actions: {action}')
         return action
diff --git a/openhands/agenthub/gui_agent/prompts/osworld.py b/openhands/agenthub/gui_agent/prompts/osworld.py
index 16bdb9173..5cc3018ce 100644
--- a/openhands/agenthub/gui_agent/prompts/osworld.py
+++ b/openhands/agenthub/gui_agent/prompts/osworld.py
@@ -1,10 +1,10 @@
-OSWORLD_OBSERVATION_FEEDBACK_PROMPT = """Action executed. Please generate the next move according to the UI screenshot and instruction. And you can refer to the previous actions and observations for reflection.
+OSWORLD_OBSERVATION_FEEDBACK_PROMPT = """Action executed. Please generate the next move according to the UI screenshot and instruction.
 
 Instruction: {instruction}
 """
 
-ERROR_OBSERVATION_FEEDBACK_PROMPT = """Action failed. Please refer to previous message for the UI screenshot. Please continue working on the task according to the instruction.
-Error message: {error_message}.
+ERROR_OBSERVATION_FEEDBACK_PROMPT = """Action failed. Please continue working on the task according to the instruction.
+Error message: {error_message} Instruction: {instruction} """ \ No newline at end of file diff --git a/openhands/events/action/os.py b/openhands/events/action/os.py index c92f906d1..a8613ff1e 100644 --- a/openhands/events/action/os.py +++ b/openhands/events/action/os.py @@ -40,6 +40,7 @@ class OSWorldInteractiveAction(Action): action: str = ActionType.OSWORLD_INTERACTIVE runnable: ClassVar[bool] = True security_risk: ActionSecurityRisk | None = None + pause_time: float = 0.0 def __post_init__(self): if self.params is None: diff --git a/openhands/events/observation/osworld.py b/openhands/events/observation/osworld.py index e1da07ac7..2142b01f5 100644 --- a/openhands/events/observation/osworld.py +++ b/openhands/events/observation/osworld.py @@ -10,8 +10,8 @@ class OSWorldOutputObservation(Observation): observation: str = ObservationType.OSWORLD command: str = field(default='') content: str = field(default='') - screenshot: str = field(repr=False, default='') # don't show in repr, in base64 format - accessibility_tree: str = field(repr=False, default='') + screenshot: str | None = None + accessibility_tree: str | None = None tool_call_id: str | None = None name: str = '' @@ -21,7 +21,10 @@ def message(self) -> str: @property def image_urls(self) -> list[str]: - return [f'data:image/png;base64,{self.screenshot}'] + if self.screenshot: + return [f'data:image/png;base64,{self.screenshot}'] + else: + return [] def __str__(self) -> str: ret = ( diff --git a/openhands/nvidia/os_world/osworld_utils.py b/openhands/nvidia/os_world/osworld_utils.py index 61e66a454..0fdb87e51 100644 --- a/openhands/nvidia/os_world/osworld_utils.py +++ b/openhands/nvidia/os_world/osworld_utils.py @@ -2,6 +2,7 @@ import time import os import pandas as pd +import base64 import numpy as np import asyncio from evaluation.utils.shared import ( # type: ignore @@ -89,8 +90,8 @@ def get_config( ensure_thinking_end_properly=agent_config['ensure_thinking_end_properly'], # set to true only if using text 
based server for training. action_timeout=30.0, # 30 seconds per action strict_loop_detector=agent_config['strict_loop_detector'], # set to true only if training - enable_vision=False, - enable_a11y_tree=True, + enable_vision=agent_config['enable_vision'], + enable_a11y_tree=agent_config['enable_a11y_tree'], ) config.set_agent_config(agent_config) return config @@ -114,6 +115,7 @@ def get_instruction(instance: pd.Series | dict, metadata: EvalMetadata, runtime: if include_screenshot: image = runtime.get_vm_screenshot() if image: + image = base64.b64encode(image).decode('utf-8') image_url = [f'data:image/png;base64,{image}'] return MessageAction(content=instruction, image_urls=image_url, accessibility_tree=accessibility_tree) @@ -218,6 +220,9 @@ async def run_agent( message_action = get_instruction(instance, metadata, runtime) try: agent = create_agent(config) + # Set pause time for agent to wait for the screenshot to be taken + # You can play around with this value to find appropriate value for setup. 
+ agent.pause_time = 4.0 job_details.agent = agent controller, initial_state = create_controller( agent=agent, diff --git a/openhands/nvidia/registry.py b/openhands/nvidia/registry.py index e975b6462..4205d72ee 100644 --- a/openhands/nvidia/registry.py +++ b/openhands/nvidia/registry.py @@ -17,6 +17,8 @@ 'max_iterations': 2, 'ensure_thinking_end_properly': False, 'strict_loop_detector': False, + 'enable_vision': False, + 'enable_a11y_tree': False, } diff --git a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py index 54b782164..49306942f 100644 --- a/openhands/runtime/impl/singularity/osworld_singularity_runtime.py +++ b/openhands/runtime/impl/singularity/osworld_singularity_runtime.py @@ -858,7 +858,7 @@ def osworld_interactive(self, action) -> 'Observation': if method == 'execute_action': return self._handle_execute_action(params) elif method == 'execute_agentic_action': - return self._handle_execute_agentic_action(params, action.tool_call_metadata) + return self._handle_execute_agentic_action(params, action.tool_call_metadata, action.pause_time) elif method == 'get_screenshot': return self._handle_get_screenshot() elif method == 'get_accessibility_tree': @@ -919,7 +919,7 @@ def _handle_execute_action(self, params: dict) -> 'Observation': exit_code=1, ) - def _handle_execute_agentic_action(self, params: dict, tool_call_metadata: ToolCallMetadata | None) -> 'Observation': + def _handle_execute_agentic_action(self, params: dict, tool_call_metadata: ToolCallMetadata | None, pause_time: float = 0.0) -> 'Observation': """Handle execute_action - PyAutoGUI actions like CLICK, TYPING, etc.""" from openhands.events.observation.osworld import OSWorldOutputObservation from openhands.events.observation import ErrorObservation @@ -946,15 +946,17 @@ def _handle_execute_agentic_action(self, params: dict, tool_call_metadata: ToolC result = self.execute_vm_action(action_data) if 
result.get('status') == 'success': + if pause_time > 0.5: + time.sleep(pause_time) if include_screenshot: screenshot_bytes = self.get_vm_screenshot() - screenshot_bytes = base64.b64encode(screenshot_bytes).decode('utf-8') + if screenshot_bytes: + screenshot_bytes = base64.b64encode(screenshot_bytes).decode('utf-8') else: screenshot_bytes = None if include_a11y_tree: accessibility_tree = self.get_vm_accessibility_tree() - #accessibility_tree = linearize_accessibility_tree(accessibility_tree) else: accessibility_tree = None diff --git a/scripts/tests/test_async_server_osworld.py b/scripts/tests/test_async_server_osworld.py index cab3062f0..3f1afc271 100644 --- a/scripts/tests/test_async_server_osworld.py +++ b/scripts/tests/test_async_server_osworld.py @@ -42,7 +42,7 @@ def test_server( 'log_completions': False, 'native_tool_calling': True, 'temperature': 0.6, - 'max_iterations': 35, + 'max_iterations': 3, } print('Starting server') @@ -77,7 +77,7 @@ def test_server( start = time.time() # set timeout approriate to terminate results = test_server( - total_jobs=1, max_parallel_jobs=1, allow_skip_eval=False, timeout=6000 + total_jobs=4, max_parallel_jobs=4, allow_skip_eval=False, timeout=6000 ) # Don't print full messages print(f'Time taken: {time.time() - start}') diff --git a/scripts/tests/test_osworld_utils.py b/scripts/tests/test_osworld_utils.py index d00f66c74..c1bc97498 100644 --- a/scripts/tests/test_osworld_utils.py +++ b/scripts/tests/test_osworld_utils.py @@ -18,14 +18,14 @@ async def run(instance): max_iterations = 35 sampling_params = { - 'model': 'deepseek/deepseek-chat', - 'api_key': os.getenv('DEEPSEEK_API_KEY', ''), + 'model': 'gpt-5-mini-2025-08-07', + 'api_key': os.getenv('OPENAI_API_KEY', ''), 'modify_params': False, 'log_completions': True, 'native_tool_calling': True, - 'temperature': 0.6, + 'temperature': 1, } - llm_config = LLMConfig(base_url='https://api.deepseek.com/chat/completions', **sampling_params) + llm_config = 
LLMConfig(base_url='https://api.openai.com/v1', **sampling_params) job_details = JobDetails( @@ -34,6 +34,8 @@ async def run(instance): llm_config=llm_config, ) job_details.agent_config['max_iterations'] = max_iterations + job_details.agent_config['enable_vision'] = True + job_details.agent_config['enable_a11y_tree'] = False job_details.timer = PausableTimer(timeout=500) job_details.timer.start()