diff --git a/packages/magnitude-core/baml_src/planner.baml b/packages/magnitude-core/baml_src/planner.baml
index 69daf911..319b5d32 100644
--- a/packages/magnitude-core/baml_src/planner.baml
+++ b/packages/magnitude-core/baml_src/planner.baml
@@ -175,3 +175,30 @@ function QueryMemory(memory: AgentContext, query: string, includeClaudeSpoof: bo
 //     {{ DescribeBrowserExecutionContext(context) }}
 //     "#
 // }
+
+class GroundedActionList {
+    @@dynamic
+}
+
+function GroundActions(screenshot: image, actions: string) -> GroundedActionList {
+    client GeminiPro
+    prompt #"
+        {{ _.role("system") }}
+        You are a helpful assistant that grounds actions to a screenshot.
+
+        You will receive a list of actions that need to be executed. Some of these actions may be missing specific coordinates (e.g. they may have generic coordinates like 0,0) but describe what they intend to click/scroll.
+
+        Your job is to:
+        1. Analyze the screenshot to find the correct elements.
+        2. Update the actions with the precise x,y coordinates based on the screenshot.
+        3. Return the full list of grounded actions: exactly one per input action, in the same order.
+
+        {{ ctx.output_format }}
+
+        {{ _.role("user") }}
+        {{ screenshot }}
+
+        Actions to ground:
+        {{ actions }}
+    "#
+}
diff --git a/packages/magnitude-core/src/agent/index.ts b/packages/magnitude-core/src/agent/index.ts
index 49c2f697..9eae9326 100644
--- a/packages/magnitude-core/src/agent/index.ts
+++ b/packages/magnitude-core/src/agent/index.ts
@@ -18,6 +18,7 @@ import { isClaude } from '@/ai/util';
 import { retryOnError } from '@/common';
 import { renderContentParts } from '@/memory/rendering';
 import { MultiModelHarness } from '@/ai/multiModelHarness';
+import { BrowserConnector } from '@/connectors/browserConnector';
 
 
 export interface AgentOptions {
@@ -171,8 +172,10 @@ export class Agent {
         const actionDefinition = this.actions.find(def => def.name === action.variant);
 
         if (!actionDefinition) {
-            // It's possible the action name was from a connector that is no longer active,
-            // or the action space was not correctly aggregated.
+            // Report a missing variant separately: it points at a malformed action rather than an unregistered one.
+            if (action.variant === undefined) {
+                throw new AgentError(`Undefined action type 'undefined'. This usually means the grounding step failed to produce a valid action structure. Ensure the grounding model is returning actions with a 'variant' property matching the available actions.`);
+            }
             throw new AgentError(`Undefined action type '${action.variant}'. Ensure agent is configured with appropriate action definitions from connectors.`);
         }
         return actionDefinition;
@@ -382,6 +385,44 @@ export class Agent {
                     `Error planning actions: ${(error as Error).message}`, { variant: 'misalignment' }
                 )
             }
+
+            // Best-effort grounding: planned mouse actions may carry placeholder coordinates,
+            // so ask the grounding model to resolve them against the connector's last screenshot.
+            const browserConnector = this.getConnector(BrowserConnector);
+            if (browserConnector) {
+                // Variant-name prefixes of the actions that need coordinates.
+                const spatialPrefixes = ['mouse:'];
+                // Type-guarded so an action without a string variant cannot crash the filter.
+                const isSpatial = (name: unknown): boolean =>
+                    spatialPrefixes.some(prefix => typeof name === 'string' && name.startsWith(prefix));
+                const actionsToGround = actions.filter(a => isSpatial(a.variant));
+
+                if (actionsToGround.length > 0) {
+                    // Only spatial action definitions are offered as the grounding output schema.
+                    const groundingVocabulary = this.actions.filter(a => isSpatial(a.name));
+
+                    try {
+                        // Fetched inside the try, and only when needed, so a screenshot failure
+                        // falls back to ungrounded actions instead of propagating.
+                        const screenshot = await browserConnector.getLastScreenshot();
+                        const groundedSubset = await this.models.ground(screenshot, actionsToGround, groundingVocabulary);
+
+                        // The merge is positional, so only accept a response with one entry per
+                        // requested action, in the same order and with the same variant.
+                        const aligned = groundedSubset.length === actionsToGround.length
+                            && groundedSubset.every((g, i) => g?.variant === actionsToGround[i]?.variant);
+
+                        if (aligned) {
+                            let groundedIndex = 0;
+                            actions = actions.map(a => (isSpatial(a.variant) ? (groundedSubset[groundedIndex++] ?? a) : a));
+                        } else {
+                            logger.warn(`Grounding result does not line up with the ${actionsToGround.length} requested action(s). Proceeding with ungrounded actions.`);
+                        }
+                    } catch (groundingError) {
+                        logger.warn(`Grounding failed: ${groundingError instanceof Error ? groundingError.message : String(groundingError)}. Proceeding with ungrounded actions.`);
+                    }
+                }
+            }
 
             logger.info({ reasoning, actions }, `Partial recipe created`);
 
diff --git a/packages/magnitude-core/src/ai/modelHarness.ts b/packages/magnitude-core/src/ai/modelHarness.ts
index 2577ac51..589a790d 100644
--- a/packages/magnitude-core/src/ai/modelHarness.ts
+++ b/packages/magnitude-core/src/ai/modelHarness.ts
@@ -264,6 +264,35 @@ export class ModelHarness {
             return resp.data;
         }
     }
+
+    /**
+     * Ask the grounding model to fill in coordinates for `actions` using `screenshot`.
+     * @param screenshot image passed to the GroundActions BAML function
+     * @param actions actions to ground; sent to the model as pretty-printed JSON
+     * @param actionVocabulary definitions used to build the allowed output schema
+     * @returns the `actions` list taken from the model response
+     * @throws Error if the response carries no `actions` array
+     */
+    async ground(screenshot: Image, actions: Action[], actionVocabulary: ActionDefinition[]): Promise<Action[]> {
+        const tb = new TypeBuilder();
+        // GroundedActionList is @@dynamic in BAML: give it a single `actions` list whose
+        // element type is built from the supplied vocabulary.
+        tb.GroundedActionList.addProperty('actions', tb.list(convertActionDefinitionsToBaml(tb, actionVocabulary)));
+
+        const response = await this.baml.GroundActions(
+            await screenshot.toBaml(),
+            JSON.stringify(actions, null, 2),
+            { tb }
+        );
+        this._reportUsage();
+
+        // The property is added dynamically, so verify it at runtime before casting.
+        const grounded: unknown = response.actions;
+        if (!Array.isArray(grounded)) {
+            throw new Error('Grounding response did not contain an actions list');
+        }
+        return grounded as Action[];
+    }
 
     // async classifyCheckFailure(screenshot: Image, check: string, existingRecipe: Action[], tabState: TabState): Promise {
     //     const stringifiedExistingRecipe = [];
diff --git a/packages/magnitude-core/src/ai/multiModelHarness.ts b/packages/magnitude-core/src/ai/multiModelHarness.ts
index 17cf8220..8886b5c8 100644
--- a/packages/magnitude-core/src/ai/multiModelHarness.ts
+++ b/packages/magnitude-core/src/ai/multiModelHarness.ts
@@ -67,6 +67,11 @@ export class MultiModelHarness {
         return await this.roles['query'].query(context, query, schema);
     }
 
+    /** Delegates grounding to `this.roles['ground']`. */
+    async ground(screenshot: Image, actions: Action[], actionVocabulary: ActionDefinition[]): Promise<Action[]> {
+        return await this.roles['ground'].ground(screenshot, actions, actionVocabulary);
+    }
+
     get numUniqueModels() {
         return this.uniqueModels.length;
     }
diff --git a/packages/magnitude-core/src/ai/types.ts b/packages/magnitude-core/src/ai/types.ts
index dbe3a365..00ec4ace 100644
--- a/packages/magnitude-core/src/ai/types.ts
+++ b/packages/magnitude-core/src/ai/types.ts
@@ -3,8 +3,8 @@
 //     confidence: number
 // }
 
-export type BrowserAgentRole= 'act' | 'extract' | 'query';
-export const allBrowserAgentRoles: BrowserAgentRole[] = ['act', 'extract', 'query'] as const;
+export type BrowserAgentRole= 'act' | 'extract' | 'query' | 'ground';
+export const allBrowserAgentRoles: BrowserAgentRole[] = ['act', 'extract', 'query', 'ground'] as const;
 
 // Approximately mirrors https://docs.boundaryml.com/ref/llm-client-providers
 export type LLMClient = (AnthropicClient | ClaudeCodeClient | BedrockClient | GoogleAIClient | GoogleVertexClient | OpenAIClient | OpenAIGenericClient | AzureOpenAIClient) &