Richar-Du · 720502225 · Dec 3, 2025 · Dec 4, 2025 · Dec 5, 2025 · Dec 6, 2025
diff --git a/agent/actor/__init__.py b/agent/actor/__init__.py
@@ -0,0 +1,5 @@
+"""Actor Agent components for action execution and validation."""
+
+from .action_executor import ActionExecutor
+
+__all__ = ["ActionExecutor"]
diff --git a/agent/actor/action_executor.py b/agent/actor/action_executor.py
@@ -0,0 +1,132 @@
+"""Action execution and validation for Actor Agent."""
+
+from enum import Enum
+from typing import Any, Dict, List
+
+from browser_env import Action
+
+
+class ActionExecutor:
+    """Validates actions produced for the Actor Agent and records each check.
+
+    Nothing in this class runs an action in a browser: ``validate_action``
+    inspects the action's structure and appends the outcome to
+    ``execution_history``.
+    """
+
+    # Action type names accepted by ``_validate_action_format``.
+    VALID_ACTION_TYPES = frozenset({
+        "CLICK", "TYPE", "SCROLL", "KEY_PRESS", "GOTO_URL",
+        "NEW_TAB", "PAGE_CLOSE", "GO_BACK", "GO_FORWARD",
+        "PAGE_FOCUS", "CLEAR", "UPLOAD", "STOP", "NONE", "HOVER",
+    })
+
+    # Extra key that an action of the given type must contain.
+    REQUIRED_FIELD_BY_TYPE = {
+        "TYPE": "element_id",
+        "CLICK": "element_id",
+        "SCROLL": "direction",
+    }
+
+    def __init__(self, action_set_tag: str) -> None:
+        """Create an executor with an empty validation history.
+
+        Args:
+            action_set_tag: Identifier of the action set in use. It is stored
+                on the instance; the validation code here does not read it.
+        """
+        self.action_set_tag = action_set_tag
+        self.execution_history: List[Dict[str, Any]] = []
+
+    def validate_action(self, action: Action) -> Dict[str, Any]:
+        """Validate an action and append the outcome to ``execution_history``.
+
+        Args:
+            action: The action to validate.
+
+        Returns:
+            A dictionary with ``valid`` (bool), ``action`` (the input) and
+            ``validation_details``. If the format check itself raises, the
+            dictionary also carries ``error`` with the exception message and
+            ``valid`` is False.
+        """
+        try:
+            details = self._validate_action_format(action)
+        except Exception as exc:
+            # Deliberately broad: an action the checker cannot even inspect
+            # (for example ``None``) is reported as invalid, not raised.
+            message = str(exc)
+            self.execution_history.append({
+                "action": action,
+                "validation_passed": False,
+                "error": message,
+            })
+            return {
+                "valid": False,
+                "error": message,
+                "action": action,
+                "validation_details": {"valid": False, "error": message},
+            }
+
+        self.execution_history.append({
+            "action": action,
+            "validation_passed": details["valid"],
+            "validation_details": details,
+        })
+        return {
+            "valid": details["valid"],
+            "action": action,
+            "validation_details": details,
+        }
+
+    def _validate_action_format(self, action: Action) -> Dict[str, Any]:
+        """Check that an action has a known type and the keys that type needs.
+
+        ``action_type`` may be a string name or an ``Enum`` member; a member
+        is compared by its ``name``.
+        NOTE(review): browser_env is understood to store ``ActionTypes`` enum
+        members under ``action_type`` -- confirm against browser_env.actions.
+
+        Args:
+            action: The action to validate.
+
+        Returns:
+            A dictionary with ``valid`` (bool) and the lists
+            ``missing_fields``, ``invalid_fields`` and ``warnings``.
+        """
+        result: Dict[str, Any] = {
+            "valid": True,
+            "missing_fields": [],
+            "invalid_fields": [],
+            "warnings": [],
+        }
+
+        if "action_type" not in action:
+            result["valid"] = False
+            result["missing_fields"].append("action_type")
+        else:
+            action_type = action["action_type"]
+            type_name = action_type.name if isinstance(action_type, Enum) else action_type
+
+            # The isinstance guard keeps unhashable values away from the set.
+            if isinstance(type_name, str) and type_name in self.VALID_ACTION_TYPES:
+                # Only known types have type-specific required keys.
+                needed = self.REQUIRED_FIELD_BY_TYPE.get(type_name)
+                if needed is not None and needed not in action:
+                    result["valid"] = False
+                    result["missing_fields"].append(f"{needed} for {type_name} action")
+            else:
+                result["valid"] = False
+                result["invalid_fields"].append(f"Invalid action_type: {action_type}")
+
+        # An empty element_id is suspicious but does not fail validation.
+        if "element_id" in action:
+            element_id = action["element_id"]
+            if isinstance(element_id, str) and not element_id.strip():
+                result["warnings"].append("Empty element_id detected")
+
+        return result
+
+    def reset_execution_history(self) -> None:
+        """Reset execution history for a new task."""
+        self.execution_history.clear()
diff --git a/agent/actor_agent.py b/agent/actor_agent.py
@@ -0,0 +1,176 @@
+"""Actor Agent for executing high-level intentions with specific browser actions."""
+
+import time
+from typing import Any, Dict, List, Optional
+
+from PIL import Image
+
+from browser_env import Trajectory
+from browser_env.utils import Observation
+from llms import lm_config
+
+from agent import PromptAgent  # Import existing PromptAgent
+from .actor.action_executor import ActionExecutor
+
+
+class ActorAgent(PromptAgent):
+    """Executes high-level intentions using specific browser actions.
+
+    Extends the existing PromptAgent to work with high-level intentions from
+    the Planner Agent while maintaining compatibility with the existing codebase.
+    """
+
+    def __init__(
+        self,
+        action_set_tag: str,
+        lm_config: lm_config.LMConfig,
+        prompt_constructor,
+        captioning_fn=None,
+    ) -> None:
+        """Initialize the parent PromptAgent, an ActionExecutor and the history.
+
+        Args:
+            action_set_tag: Forwarded to PromptAgent and to ActionExecutor.
+            lm_config: Language-model configuration, forwarded to PromptAgent.
+            prompt_constructor: Prompt constructor, forwarded to PromptAgent.
+            captioning_fn: Optional captioning callable, forwarded to PromptAgent.
+        """
+        super().__init__(
+            action_set_tag=action_set_tag,
+            lm_config=lm_config,
+            prompt_constructor=prompt_constructor,
+            captioning_fn=captioning_fn,
+        )
+
+        # Validates generated actions and keeps its own validation history.
+        self.action_executor = ActionExecutor(action_set_tag)
+
+        # One record per execute_intention call, successful or not.
+        self.intention_history: List[Dict[str, Any]] = []
+
+    def execute_intention(
+        self,
+        intention: str,
+        current_observation: Observation,
+        trajectory: Trajectory,
+        meta_data: Optional[Dict[str, Any]] = None,
+        images: Optional[List[Image.Image]] = None,
+    ) -> Dict[str, Any]:
+        """Turn a high-level intention into one validated browser action.
+
+        The action is only generated and validated here; running it in the
+        browser is left to the caller. Exactly one record is appended to
+        ``intention_history`` per call.
+
+        Args:
+            intention: High-level intention from Planner Agent
+            current_observation: Current page observation
+            trajectory: Current execution trajectory
+            meta_data: Additional metadata for execution
+            images: Optional input images
+
+        Returns:
+            On success, a dictionary with ``action``, ``validation_result``,
+            ``intention``, ``intention_fulfilled`` (always False here),
+            ``execution_history_length``, ``llm_response`` and ``response``.
+            On failure, a dictionary with ``error``, ``intention``,
+            ``intention_fulfilled``, ``exception_type`` and ``response``.
+        """
+        execution_record: Dict[str, Any] = {
+            "intention": intention,
+            "timestamp": time.time(),  # seconds since the epoch at call time
+            "observation_before": current_observation,
+        }
+
+        try:
+            # Phrase the intention so it works with the existing prompt system.
+            intention_message = f"Execute browser actions to fulfill this intention: {intention}"
+
+            try:
+                action = self.next_action(
+                    trajectory=trajectory,
+                    intent=intention_message,
+                    meta_data=meta_data or {},
+                    images=images,
+                    output_response=False,
+                )
+            except Exception as next_action_error:
+                print(f"🎬 Actor Error: {str(next_action_error)[:200]}")
+                print(f"🎬 Error Type: {type(next_action_error).__name__}")
+                raise
+
+            # Coerce to text so the preview slice below cannot fail on a
+            # missing or non-string prediction.
+            raw_prediction = action.get("raw_prediction")
+            if raw_prediction is None:
+                llm_response = "No LLM response available"
+            else:
+                llm_response = str(raw_prediction)
+
+            # Validate the generated action (execution is handled externally).
+            validation_result = self.action_executor.validate_action(action)
+
+        except Exception as e:
+            # Tag the message with the likely failing stage, based on its text.
+            error_details = str(e)
+            if "prompt_constructor" in error_details.lower():
+                error_details += " (Prompt constructor issue)"
+            elif "next_action" in error_details.lower():
+                error_details += " (next_action method failure)"
+            elif "traject" in error_details.lower():
+                error_details += " (Trajectory processing issue)"
+
+            execution_record.update({
+                "error": error_details,
+                "exception_type": type(e).__name__,
+            })
+            self.intention_history.append(execution_record)
+
+            return {
+                "error": error_details,
+                "intention": intention,
+                "intention_fulfilled": False,
+                "exception_type": type(e).__name__,
+                "response": f"Execution failed: {error_details}",
+            }
+
+        # Success path: kept outside the try so a record is never appended twice.
+        execution_record.update({
+            "generated_action": action,
+            "validation_result": validation_result,
+            "llm_response": llm_response,
+        })
+        self.intention_history.append(execution_record)
+
+        preview = llm_response[:200] + ("..." if len(llm_response) > 200 else "")
+        return {
+            "action": action,
+            "validation_result": validation_result,
+            "intention": intention,
+            # Whether the intention was fulfilled is only known after the
+            # caller runs the action, so this starts out False.
+            "intention_fulfilled": False,
+            "execution_history_length": len(self.intention_history),
+            "llm_response": llm_response,
+            "response": f"LLM Response: {preview}",
+        }
+
+    def reset_intention_history(self) -> None:
+        """Reset intention execution history for a new task."""
+        self.intention_history.clear()
+        self.action_executor.reset_execution_history()
+
+    def get_recent_intentions(self, count: int = 5) -> List[Dict[str, Any]]:
+        """Get the most recent intention executions.
+
+        Args:
+            count: Number of recent intentions to return; zero or a negative
+                number yields an empty list.
+
+        Returns:
+            List of recent intention execution records, oldest first.
+        """
+        # Guard needed because ``history[-0:]`` would return the whole list.
+        if count <= 0:
+            return []
+        return self.intention_history[-count:]
diff --git a/agent/agent.py b/agent/agent.py
@@ -116,7 +116,7 @@ def __init__(
         self.captioning_fn = captioning_fn
 
         # Check if the model is multimodal.
-        if ("gemini" in lm_config.model or "gpt-4" in lm_config.model and "vision" in lm_config.model) and type(prompt_constructor) == MultimodalCoTPromptConstructor:
+        if type(prompt_constructor) == MultimodalCoTPromptConstructor:
             self.multimodal_inputs = True
         else:
             self.multimodal_inputs = False
@@ -132,9 +132,14 @@ def next_action(
         # Create page screenshot image for multimodal models.
         if self.multimodal_inputs:
             page_screenshot_arr = trajectory[-1]["observation"]["image"]
-            page_screenshot_img = Image.fromarray(
-                page_screenshot_arr
-            )  # size = (viewport_width, viewport_width)
+            if page_screenshot_arr is not None:
+                page_screenshot_img = Image.fromarray(
+                    page_screenshot_arr
+                )  # size = (viewport_width, viewport_width)
+            else:
+                # Fallback: create empty image if image is None
+                print("WARNING: No page screenshot image found, creating empty image.")
+                page_screenshot_img = Image.new('RGB', (1280, 720), color='white')
 
         # Caption the input image, if provided.
         if images is not None and len(images) > 0:

diff --git a/agent/context/__init__.py b/agent/context/__init__.py
@@ -0,0 +1,98 @@
+"""Context manager for agent execution history and progress tracking."""
+
+from typing import Any, Dict, List, Optional
+
+from browser_env import Action
+from browser_env.utils import Observation
+
+
+class StateManager:
+    """Manages execution history for context awareness.
+
+    Observations, actions, reflections and intentions are kept in separate
+    lists. The ``get_all_*`` accessors and ``get_history`` hand out those
+    lists themselves, not copies, so mutations made by a caller are visible
+    here.
+    """
+
+    def __init__(self) -> None:
+        """Start with empty histories and an empty user goal."""
+        self.observations: List[Observation] = []
+        self.actions: List[Action] = []
+        self.reflections: List[Dict[str, Any]] = []
+        self.intentions: List[str] = []
+        self.user_goal: str = ""
+
+    def add_observation(self, observation: Observation) -> None:
+        """Add a new observation to the history."""
+        self.observations.append(observation)
+
+    def add_action(self, action: Action) -> None:
+        """Add a new action to the history."""
+        self.actions.append(action)
+
+    def add_reflection(self, reflection: Dict[str, Any]) -> None:
+        """Add a new reflection to the history."""
+        self.reflections.append(reflection)
+
+    def add_intention(self, intention: str) -> None:
+        """Add a new intention to the history."""
+        self.intentions.append(intention)
+
+    def get_all_observations(self) -> List[Observation]:
+        """Get all observations."""
+        return self.observations
+
+    def get_all_actions(self) -> List[Action]:
+        """Get all actions."""
+        return self.actions
+
+    def get_all_reflections(self) -> List[Dict[str, Any]]:
+        """Get all reflections."""
+        return self.reflections
+
+    def get_all_intentions(self) -> List[str]:
+        """Get all intentions."""
+        return self.intentions
+
+    def get_latest_observation(self) -> Optional[Observation]:
+        """Get the most recent observation, or None if none was recorded."""
+        return self.observations[-1] if self.observations else None
+
+    def get_latest_action(self) -> Optional[Action]:
+        """Get the most recent action, or None if none was recorded."""
+        return self.actions[-1] if self.actions else None
+
+    def get_history(self) -> Dict[str, Any]:
+        """Get complete execution history.
+
+        Returns:
+            A dictionary holding the four history lists plus their lengths;
+            ``total_steps`` is the number of recorded actions.
+        """
+        return {
+            "observations": self.observations,
+            "actions": self.actions,
+            "reflections": self.reflections,
+            "intentions": self.intentions,
+            "total_steps": len(self.actions),
+            "total_observations": len(self.observations),
+            "total_reflections": len(self.reflections),
+            "total_intentions": len(self.intentions),
+        }
+
+    def set_user_goal(self, user_goal: str) -> None:
+        """Set the user goal for this task."""
+        self.user_goal = user_goal
+
+    def get_user_goal(self) -> str:
+        """Get the user goal for this task."""
+        return self.user_goal
+
+    def clear(self) -> None:
+        """Clear all history and reset the user goal to an empty string."""
+        self.observations.clear()
+        self.actions.clear()
+        self.reflections.clear()
+        self.intentions.clear()
+        self.user_goal = ""