Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 23 additions & 17 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions packages/cli/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,10 @@
"start": "bun run src/index.ts"
},
"dependencies": {
"open-browser": "workspace:*",
"chalk": "^5.4.0",
"commander": "^12.1.0",
"chalk": "^5.4.0"
"ollama-ai-provider": "^1.2.0",
"open-browser": "workspace:*"
},
"license": "MIT"
}
27 changes: 25 additions & 2 deletions packages/cli/src/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ interface RunOptions {
provider: string;
headless: boolean;
stepLimit: number;
maxSteps: string;
verbose: boolean;
noCost: boolean;
}
Expand Down Expand Up @@ -53,10 +54,32 @@ async function createModel(provider: string, modelId: string): Promise<LanguageM
languageModel = google(modelId);
break;
}
case 'ollama': {
// Local Ollama instance - free, no API key needed.
// Default base URL: http://localhost:11434/api
// Override with OLLAMA_BASE_URL env var.
const { createOllama } = await import('ollama-ai-provider');
const ollama = createOllama({
baseURL: process.env.OLLAMA_BASE_URL ?? 'http://localhost:11434/api',
});
languageModel = ollama(modelId);
break;
}
case 'openrouter': {
// OpenRouter - access many models via one API key (OPENROUTER_API_KEY).
// Free tier available. See https://openrouter.ai/models?q=free
const { createOpenAI } = await import('@ai-sdk/openai');
const openrouter = createOpenAI({
baseURL: 'https://openrouter.ai/api/v1',
apiKey: process.env.OPENROUTER_API_KEY,
});
languageModel = openrouter(modelId);
break;
}
default:
throw new Error(
`Unsupported provider: ${provider}. ` +
'Supported: openai, anthropic, google',
'Supported: openai, anthropic, google, ollama, openrouter',
);
}

Expand All @@ -76,7 +99,7 @@ export function registerRunCommand(program: Command): void {
.option('-v, --verbose', 'Show detailed step information', false)
.option('--no-cost', 'Hide cost tracking information')
.action(async (task: string, options: RunOptions) => {
const stepLimit = Number.parseInt(String(options.stepLimit), 10);
const stepLimit = Number.parseInt(String(options.maxSteps ?? options.stepLimit ?? '25'), 10);

displayHeader(`Agent Task: ${task}`);
console.log(
Expand Down
Empty file modified packages/cli/src/index.ts
100644 → 100755
Empty file.
2 changes: 1 addition & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"@ai-sdk/google": "^1.1.0",
"zod": "^3.24.0",
"playwright": "^1.51.0",
"mitt": "^3.0.2",
"mitt": "^3.0.1",
"nanoid": "^5.1.0",
"turndown": "^7.2.1",
"dotenv": "^16.5.0"
Expand Down
16 changes: 12 additions & 4 deletions packages/core/src/agent/agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -685,6 +685,10 @@ export class Agent {
* Normalize the various output schema shapes into the standard AgentDecision.
*/
private normalizeOutput(output: Record<string, unknown>): AgentDecision {
// Ensure actions is always an array (LLMs may return {} instead of [])
const ensureArray = (val: unknown): Record<string, unknown>[] =>
Array.isArray(val) ? val : [];

// Flash schema: { goal, actions }
if ('goal' in output && !('currentState' in output)) {
return {
Expand All @@ -693,7 +697,7 @@ export class Agent {
memory: '',
nextGoal: String(output.goal ?? ''),
},
actions: (output.actions ?? []) as Record<string, unknown>[],
actions: ensureArray(output.actions),
};
}

Expand All @@ -705,12 +709,16 @@ export class Agent {
memory: '',
nextGoal: '',
},
actions: (output.actions ?? []) as Record<string, unknown>[],
actions: ensureArray(output.actions),
};
}

// Standard schema passthrough
return output as AgentDecision;
// Standard schema passthrough — still guard actions
const decision = output as AgentDecision;
if (!Array.isArray(decision.actions)) {
decision.actions = [];
}
return decision;
}

// ────────────────────────────────────────
Expand Down
2 changes: 2 additions & 0 deletions packages/core/src/agent/instructions/instructions.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,14 @@ You must use the `done` action when:
- You have fully completed the task
- You reach the final allowed step, even if the task is incomplete
- It is absolutely impossible to continue
- **You already have all the information needed to answer the task** — stop immediately, do NOT browse further to verify or explore

Rules for `done`:
- Set `success` to `true` only if the FULL task has been completed
- If any part is missing, incomplete, or uncertain, set `success` to `false`
- Put ALL relevant findings in the `text` field
- You are ONLY allowed to call `done` as a single action - never combine it with other actions
- **For read/extract/list tasks: as soon as extract_content or browser_state contains the answer, call done. Do not click links, open articles, or navigate away.**

**Before calling done with success=true, verify:**
1. Re-read the original task and list every concrete requirement
Expand Down
6 changes: 4 additions & 2 deletions packages/core/src/page/content-extractor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,13 @@ function getTurndown(): TurndownService {

function htmlTableToMarkdown(table: HTMLTableElement): string {
const rows: string[][] = [];
const tableRows = table.querySelectorAll('tr');
// Turndown's DOM shim may return a non-iterable object from querySelectorAll,
// so wrap with Array.from to ensure iterability.
const tableRows = Array.from(table.querySelectorAll('tr') ?? []);

for (const row of tableRows) {
const cells: string[] = [];
for (const cell of row.querySelectorAll('th, td')) {
for (const cell of Array.from(row.querySelectorAll('th, td') ?? [])) {
cells.push((cell.textContent ?? '').trim().replace(/\|/g, '\\|'));
}
if (cells.length > 0) {
Expand Down
39 changes: 34 additions & 5 deletions packages/core/src/page/snapshot-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -63,20 +63,25 @@ export class SnapshotBuilder {
};
}

const { nodes, layout, strings } = doc;
const { nodes, layout } = doc;
// In newer Chromium, `strings` is at the top level of the snapshot result,
// not nested inside each document. Fall back to doc.strings for older versions.
const strings: string[] = snapshot.strings ?? doc.strings ?? [];

// Build backend node ID → AX node map
const axNodeMap = new Map<number, AXNode>();
this.buildAXMap(axTree, axNodeMap);

// Build layout index map
// In newer Chromium, `paintOrder` was renamed to `paintOrders` (plural).
const paintOrders = layout.paintOrder ?? (layout as unknown as { paintOrders?: number[] }).paintOrders;
const layoutMap = new Map<number, { bounds: number[]; text?: string; paintOrder?: number }>();
for (let i = 0; i < layout.nodeIndex.length; i++) {
const nodeIdx = layout.nodeIndex[i];
layoutMap.set(nodeIdx, {
bounds: layout.bounds[i],
text: layout.text[i] !== -1 ? strings[layout.text[i]] : undefined,
paintOrder: layout.paintOrder?.[i],
paintOrder: paintOrders?.[i],
});
}

Expand All @@ -94,7 +99,28 @@ export class SnapshotBuilder {
for (let i = 0; i < nodes.inputValue.index.length; i++) {
const nodeIdx = nodes.inputValue.index[i];
const valueIdx = nodes.inputValue.value[i];
inputValueMap.set(nodeIdx, strings[valueIdx]);
// Skip -1 values (no string)
if (valueIdx >= 0) {
inputValueMap.set(nodeIdx, strings[valueIdx]);
}
}
}

// Build children map from parentIndex.
// In newer Chromium, `childNodeIndexes` no longer exists; instead, each node
// has a `parentIndex` entry pointing to its parent. We invert that to get children.
const childrenMap = new Map<number, number[]>();
if (nodes.parentIndex) {
for (let i = 0; i < nodes.parentIndex.length; i++) {
const parentIdx = nodes.parentIndex[i];
if (parentIdx >= 0) {
let children = childrenMap.get(parentIdx);
if (!children) {
children = [];
childrenMap.set(parentIdx, children);
}
children.push(i);
}
}
}

Expand All @@ -107,6 +133,7 @@ export class SnapshotBuilder {
axNodeMap,
clickableSet,
inputValueMap,
childrenMap,
viewportSize,
capturedAttributes,
);
Expand All @@ -122,6 +149,7 @@ export class SnapshotBuilder {
axNodeMap: Map<number, AXNode>,
clickableSet: Set<number>,
inputValueMap: Map<number, string>,
childrenMap: Map<number, number[]>,
viewportSize: { width: number; height: number },
capturedAttributes: string[],
): PageTreeNode {
Expand Down Expand Up @@ -199,8 +227,8 @@ export class SnapshotBuilder {
node.highlightIndex = elementIndex(this.indexCounter++);
}

// Build children
const childIndexes: number[] = nodes.childNodeIndexes?.[nodeIndex] ?? [];
// Build children using the pre-built childrenMap (derived from parentIndex)
const childIndexes = childrenMap.get(nodeIndex) ?? [];
for (const childIdx of childIndexes) {
const child = this.buildNodeTree(
childIdx,
Expand All @@ -210,6 +238,7 @@ export class SnapshotBuilder {
axNodeMap,
clickableSet,
inputValueMap,
childrenMap,
viewportSize,
capturedAttributes,
);
Expand Down
10 changes: 6 additions & 4 deletions packages/core/src/page/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -151,16 +151,17 @@ export interface CDPLayoutNode {
}

export interface CDPSnapshotResult {
// In newer Chromium (Playwright 1.58+), `strings` is at the top level
strings?: string[];
documents: Array<{
nodes: {
nodeType: number[];
nodeName: number[];
nodeValue: number[];
backendNodeId: number[];
childNodeIndexes?: number[][];
attributes: Array<number[]>;
parentIndex: number[];
contentDocumentIndex?: { index: number[] };
attributes: Array<number[]>;
contentDocumentIndex?: { index: number[]; value: number[] };
shadowRootType?: { index: number[]; value: number[] };
isClickable?: { index: number[] };
inputValue?: { index: number[]; value: number[] };
Expand All @@ -178,7 +179,8 @@ export interface CDPSnapshotResult {
layoutIndex: number[];
bounds: number[][];
};
strings: string[];
// In older Chromium, `strings` was nested inside each document
strings?: string[];
}>;
}

Expand Down
41 changes: 41 additions & 0 deletions packages/core/test-extract-debug.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import { chromium } from 'playwright';
import { extractMarkdown } from './src/page/content-extractor.js';

/**
 * Manual debug driver for the content-extraction pipeline.
 *
 * Launches headless Chromium, loads the Hacker News front page, and then:
 *   1. runs `extractMarkdown` on the page and prints the length and a preview;
 *   2. builds a `ContentExtractor` backed by an OpenAI `gpt-4o` model and asks
 *      it for the top five story titles.
 *
 * Each stage has its own try/catch so a failure in one is logged without
 * preventing the other from running. The browser is always closed, even when
 * context creation or navigation fails.
 */
async function main(): Promise<void> {
  const browser = await chromium.launch({ headless: true });

  try {
    const context = await browser.newContext({ viewport: { width: 1280, height: 720 } });
    const page = await context.newPage();

    await page.goto('https://news.ycombinator.com', { waitUntil: 'networkidle' });

    // Stage 1: raw HTML -> markdown conversion.
    try {
      console.log('Extracting markdown...');
      const markdown = await extractMarkdown(page);
      console.log('Markdown length:', markdown.length);
      console.log('First 500 chars:', markdown.slice(0, 500));
    } catch (error) {
      console.error('extractMarkdown error:', error);
    }

    // Stage 2: invoke the model via the ContentExtractor.
    // Modules are imported lazily so an import failure is reported by this
    // stage's catch instead of aborting the whole script at load time.
    try {
      const { createOpenAI } = await import('@ai-sdk/openai');
      const { VercelModelAdapter } = await import('./src/model/adapters/vercel.js');
      const { ContentExtractor } = await import('./src/commands/extraction/extractor.js');

      // NOTE(review): no credentials are passed here; presumably the provider
      // reads the API key from the environment — confirm.
      const openai = createOpenAI({});
      const model = new VercelModelAdapter({ model: openai('gpt-4o') });
      const extractor = new ContentExtractor(model);

      console.log('\nExtracting with LLM...');
      const result = await extractor.extract(page, 'List the top 5 story titles');
      console.log('Result:', result);
    } catch (error: unknown) {
      // Narrow before touching `.message` / `.stack`; thrown values are not
      // guaranteed to be Error instances.
      if (error instanceof Error) {
        console.error('ContentExtractor error:', error.message);
        console.error('Stack:', error.stack);
      } else {
        console.error('ContentExtractor error:', error);
      }
    }
  } finally {
    // Always release the Chromium process, including when navigation throws.
    await browser.close();
  }
}

main().catch(console.error);
Loading