Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions agent/actor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Actor Agent components for action execution and validation.

Re-exports ``ActionExecutor`` from the ``action_executor`` submodule so it
can be imported directly from this package.
"""

from .action_executor import ActionExecutor

# Names exported by ``from <this package> import *``.
__all__ = ["ActionExecutor"]
117 changes: 117 additions & 0 deletions agent/actor/action_executor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
"""Action execution and validation for Actor Agent."""

from typing import Any, Dict, List

from browser_env import Action


class ActionExecutor:
    """Checks actions for the Actor Agent and logs every check it performs.

    Despite the name, nothing is executed here: each call to
    ``validate_action`` inspects the action and appends one record to
    ``execution_history``.
    """

    def __init__(self, action_set_tag: str) -> None:
        # Stored as given; not consulted by the validation logic in this class.
        self.action_set_tag = action_set_tag
        # One record per validate_action call (successful or failed).
        self.execution_history: List[Dict[str, Any]] = []

    def validate_action(self, action: Action) -> Dict[str, Any]:
        """Validate an action and record the outcome in the history.

        Args:
            action: The action to validate.

        Returns:
            A dictionary with ``valid``, ``action`` and
            ``validation_details``. If the format check itself raised, the
            dictionary additionally carries ``error`` with the exception text
            and ``valid`` is False.
        """
        try:
            details = self._validate_action_format(action)
            passed = details["valid"]
        except Exception as exc:
            # A failing check is reported as an invalid action, never raised.
            message = str(exc)
            self.execution_history.append(
                {
                    "action": action,
                    "validation_passed": False,
                    "error": message,
                }
            )
            return {
                "valid": False,
                "error": message,
                "action": action,
                "validation_details": {"valid": False, "error": message},
            }

        self.execution_history.append(
            {
                "action": action,
                "validation_passed": passed,
                "validation_details": details,
            }
        )
        return {
            "valid": passed,
            "action": action,
            "validation_details": details,
        }

    def _validate_action_format(self, action: Action) -> Dict[str, Any]:
        """Inspect the keys and the ``action_type`` value of an action.

        Args:
            action: The action to inspect; accessed with ``in`` and ``[]``.

        Returns:
            A dictionary with ``valid`` (bool) and the lists
            ``missing_fields``, ``invalid_fields`` and ``warnings``.
            Warnings never make an action invalid.
        """
        known_types = (
            "CLICK", "TYPE", "SCROLL", "KEY_PRESS", "GOTO_URL",
            "NEW_TAB", "PAGE_CLOSE", "GO_BACK", "GO_FORWARD",
            "PAGE_FOCUS", "CLEAR", "UPLOAD", "STOP", "NONE", "HOVER",
        )
        # (action type, key that must be present, text reported when absent)
        per_type_requirements = (
            ("TYPE", "element_id", "element_id for TYPE action"),
            ("CLICK", "element_id", "element_id for CLICK action"),
            ("SCROLL", "direction", "direction for SCROLL action"),
        )

        missing: List[str] = []
        invalid: List[str] = []
        warnings: List[str] = []

        # Keys every action must carry.
        missing.extend(name for name in ("action_type",) if name not in action)

        if "action_type" in action:
            action_type = action["action_type"]

            if action_type not in known_types:
                invalid.append(f"Invalid action_type: {action_type}")

            # Extra keys demanded by particular action types.
            for kind, needed_key, label in per_type_requirements:
                if action_type == kind and needed_key not in action:
                    missing.append(label)

        # Suspicious-but-allowed content is reported as a warning only.
        if "element_id" in action:
            candidate = action["element_id"]
            if isinstance(candidate, str) and not candidate.strip():
                warnings.append("Empty element_id detected")

        return {
            # The action is valid exactly when nothing is missing or invalid.
            "valid": not missing and not invalid,
            "missing_fields": missing,
            "invalid_fields": invalid,
            "warnings": warnings,
        }

    def reset_execution_history(self) -> None:
        """Empty the validation log, e.g. before starting a new task."""
        self.execution_history.clear()
156 changes: 156 additions & 0 deletions agent/actor_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
"""Actor Agent for executing high-level intentions with specific browser actions."""

from typing import Any, Dict, List, Optional

from PIL import Image

from browser_env import Trajectory
from browser_env.utils import Observation
from llms import lm_config

from agent import PromptAgent # Import existing PromptAgent
from .actor.action_executor import ActionExecutor


class ActorAgent(PromptAgent):
    """Turns high-level intentions into specific browser actions.

    Extends ``PromptAgent``: an intention string is wrapped into an intent
    message, passed to ``self.next_action``, and the resulting action is
    validated with an ``ActionExecutor``. Every attempt, successful or not,
    is recorded exactly once in ``intention_history``.
    """

    def __init__(
        self,
        action_set_tag: str,
        lm_config: lm_config.LMConfig,
        prompt_constructor,
        captioning_fn=None,
    ) -> None:
        """Initialize the parent agent, the action validator and the history.

        Args:
            action_set_tag: Forwarded to the parent and to ``ActionExecutor``.
            lm_config: Forwarded to the parent.
            prompt_constructor: Forwarded to the parent.
            captioning_fn: Forwarded to the parent.
        """
        super().__init__(
            action_set_tag=action_set_tag,
            lm_config=lm_config,
            prompt_constructor=prompt_constructor,
            captioning_fn=captioning_fn,
        )

        # Validates generated actions and keeps its own validation log.
        self.action_executor = ActionExecutor(action_set_tag)

        # One record per execute_intention call.
        self.intention_history: List[Dict[str, Any]] = []

    def execute_intention(
        self,
        intention: str,
        current_observation: Observation,
        trajectory: Trajectory,
        meta_data: Optional[Dict[str, Any]] = None,
        images: Optional[List[Image.Image]] = None,
    ) -> Dict[str, Any]:
        """Generate and validate an action for a high-level intention.

        The action is only generated and validated here; it is not run
        against the browser, so ``intention_fulfilled`` is always False in
        the returned dictionary.

        Args:
            intention: High-level intention text.
            current_observation: Stored in the history record as
                ``observation_before``.
            trajectory: Passed through to ``next_action``.
            meta_data: Passed through to ``next_action``; ``{}`` when None.
            images: Passed through to ``next_action``.

        Returns:
            On success, a dictionary with ``action``, ``validation_result``,
            ``intention``, ``intention_fulfilled``,
            ``execution_history_length``, ``llm_response`` and ``response``.
            On failure, a dictionary with ``error``, ``intention``,
            ``intention_fulfilled``, ``exception_type`` and ``response``.
            Exceptions are reported in the result rather than raised.
        """
        execution_record: Dict[str, Any] = {
            "intention": intention,
            "timestamp": None,  # Placeholder; no timestamp is set here.
            "observation_before": current_observation,
        }

        try:
            # Phrase the intention so it fits the existing prompt system.
            intention_message = f"Execute browser actions to fulfill this intention: {intention}"

            try:
                action = self.next_action(
                    trajectory=trajectory,
                    intent=intention_message,
                    meta_data=meta_data or {},
                    images=images,
                    output_response=False,
                )
            except Exception as next_action_error:
                print(f"🎬 Actor Error: {str(next_action_error)[:200]}")
                print(f"🎬 Error Type: {type(next_action_error).__name__}")
                # Bare raise keeps the original traceback intact.
                raise

            # The raw model output may be absent, None, or not a string;
            # normalise it so building the preview below cannot fail.
            raw_prediction = action.get("raw_prediction")
            if raw_prediction is None:
                llm_response = "No LLM response available"
            elif isinstance(raw_prediction, str):
                llm_response = raw_prediction
            else:
                llm_response = str(raw_prediction)

            # Validation only; execution is handled by the caller.
            validation_result = self.action_executor.validate_action(action)

            ellipsis = "..." if len(llm_response) > 200 else ""
            response_text = f"LLM Response: {llm_response[:200]}{ellipsis}"

        except Exception as e:
            # Tag the message with a hint about where the failure came from.
            error_details = str(e)
            lowered = error_details.lower()
            if "prompt_constructor" in lowered:
                error_details += " (Prompt constructor issue)"
            elif "next_action" in lowered:
                error_details += " (next_action method failure)"
            elif "traject" in lowered:
                error_details += " (Trajectory processing issue)"

            execution_record.update({
                "error": error_details,
                "exception_type": type(e).__name__,
            })
            self.intention_history.append(execution_record)

            return {
                "error": error_details,
                "intention": intention,
                "intention_fulfilled": False,
                "exception_type": type(e).__name__,
                "response": f"Execution failed: {error_details}",
            }

        # Everything that can fail has already run, so the record is
        # appended exactly once per call.
        execution_record.update({
            "generated_action": action,
            "validation_result": validation_result,
            "llm_response": llm_response,
        })
        self.intention_history.append(execution_record)

        return {
            "action": action,
            "validation_result": validation_result,
            "intention": intention,
            # Not determined here; to be updated after actual execution.
            "intention_fulfilled": False,
            "execution_history_length": len(self.intention_history),
            "llm_response": llm_response,
            "response": response_text,
        }

    def reset_intention_history(self) -> None:
        """Clear the intention history and the validator's history."""
        self.intention_history.clear()
        self.action_executor.reset_execution_history()

    def get_recent_intentions(self, count: int = 5) -> List[Dict[str, Any]]:
        """Return the most recent intention execution records.

        Args:
            count: Maximum number of records to return. Zero or a negative
                value yields an empty list.

        Returns:
            A new list with up to ``count`` records, oldest first.
        """
        # Guard needed because ``history[-0:]`` would return the whole list.
        if count <= 0:
            return []
        return self.intention_history[-count:]
13 changes: 9 additions & 4 deletions agent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ def __init__(
self.captioning_fn = captioning_fn

# Check if the model is multimodal.
if ("gemini" in lm_config.model or "gpt-4" in lm_config.model and "vision" in lm_config.model) and type(prompt_constructor) == MultimodalCoTPromptConstructor:
if type(prompt_constructor) == MultimodalCoTPromptConstructor:
self.multimodal_inputs = True
else:
self.multimodal_inputs = False
Expand All @@ -132,9 +132,14 @@ def next_action(
# Create page screenshot image for multimodal models.
if self.multimodal_inputs:
page_screenshot_arr = trajectory[-1]["observation"]["image"]
page_screenshot_img = Image.fromarray(
page_screenshot_arr
) # size = (viewport_width, viewport_width)
if page_screenshot_arr is not None:
page_screenshot_img = Image.fromarray(
page_screenshot_arr
) # size = (viewport_width, viewport_width)
else:
# Fallback: create empty image if image is None
print("WARNING: No page screenshot image found, creating empty image.")
page_screenshot_img = Image.new('RGB', (1280, 720), color='white')

# Caption the input image, if provided.
if images is not None and len(images) > 0:
Expand Down
86 changes: 86 additions & 0 deletions agent/context/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Context manager for agent execution history and progress tracking."""

from typing import Any, Dict, List, Optional

from browser_env import Action
from browser_env.utils import Observation


class StateManager:
    """Stores the execution history of a task for context awareness.

    Keeps four append-only lists (observations, actions, reflections,
    intentions) plus the user goal. The ``get_all_*`` methods and
    ``get_history`` return the internal lists themselves, not copies, so
    later additions and ``clear()`` are visible through those references.
    """

    def __init__(self) -> None:
        self.observations: List[Observation] = []
        self.actions: List[Action] = []
        self.reflections: List[Dict[str, Any]] = []
        self.intentions: List[str] = []
        self.user_goal: str = ""

    def add_observation(self, observation: Observation) -> None:
        """Append an observation to the history."""
        self.observations.append(observation)

    def add_action(self, action: Action) -> None:
        """Append an action to the history."""
        self.actions.append(action)

    def add_reflection(self, reflection: Dict[str, Any]) -> None:
        """Append a reflection to the history."""
        self.reflections.append(reflection)

    def add_intention(self, intention: str) -> None:
        """Append an intention to the history."""
        self.intentions.append(intention)

    def get_all_observations(self) -> List[Observation]:
        """Return the internal list of observations (not a copy)."""
        return self.observations

    def get_all_actions(self) -> List[Action]:
        """Return the internal list of actions (not a copy)."""
        return self.actions

    def get_all_reflections(self) -> List[Dict[str, Any]]:
        """Return the internal list of reflections (not a copy)."""
        return self.reflections

    def get_all_intentions(self) -> List[str]:
        """Return the internal list of intentions (not a copy)."""
        return self.intentions

    def get_latest_observation(self) -> Optional[Observation]:
        """Return the most recent observation, or None if there is none."""
        return self.observations[-1] if self.observations else None

    def get_latest_action(self) -> Optional[Action]:
        """Return the most recent action, or None if there is none."""
        return self.actions[-1] if self.actions else None

    def get_history(self) -> Dict[str, Any]:
        """Return all four history lists together with their lengths.

        Returns:
            A dictionary holding the internal lists under ``observations``,
            ``actions``, ``reflections`` and ``intentions``, and their
            lengths under ``total_observations``, ``total_steps`` (number of
            actions), ``total_reflections`` and ``total_intentions``.
        """
        return {
            "observations": self.observations,
            "actions": self.actions,
            "reflections": self.reflections,
            "intentions": self.intentions,
            "total_steps": len(self.actions),
            "total_observations": len(self.observations),
            "total_reflections": len(self.reflections),
            "total_intentions": len(self.intentions),
        }

    def set_user_goal(self, user_goal: str) -> None:
        """Set the user goal for this task."""
        self.user_goal = user_goal

    def get_user_goal(self) -> str:
        """Return the user goal for this task ("" when unset)."""
        return self.user_goal

    def clear(self) -> None:
        """Empty every history list in place and reset the user goal."""
        # In-place clearing: references handed out earlier become empty too.
        self.observations.clear()
        self.actions.clear()
        self.reflections.clear()
        self.intentions.clear()
        self.user_goal = ""
Loading