diff --git a/ainyc-canonry-1.12.0.tgz b/ainyc-canonry-1.12.0.tgz new file mode 100644 index 0000000..2f2c04e Binary files /dev/null and b/ainyc-canonry-1.12.0.tgz differ diff --git a/apps/web/public/evidence-preview.html b/apps/web/public/evidence-preview.html new file mode 100644 index 0000000..6e262de --- /dev/null +++ b/apps/web/public/evidence-preview.html @@ -0,0 +1,348 @@ + + + + + +Evidence Redesign Preview — Canonry + + + + + +
+ + +
+

Canonry · Evidence Redesign Preview

+

Citypoint Dental NYC

+

citypointdental.com · US / English / Local-intent monitoring

+
+ +
+ + +
+ +
+
+

Visibility evidence

+

Key phrase citation tracking

+
+
+ 3 key phrases tracked + +
+
+ +
+ + +
+
+

emergency dentist brooklyn

+
+ Lost + + Lost citation + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 1/3 providers +
+
+ + + +
+
+ + +
+
+

best invisalign dentist downtown brooklyn

+
+ Emerging + + Newly cited + +
+
+
+
+
+
+
+
+
+
+
+
+ 2/3 providers +
+
+ + + +
+
+ + +
+
+

pediatric dentist brooklyn heights

+
+ Not-Cited + + Missed for 4 runs + +
+
+
+
+
+
+
+
+
+ 0/3 providers +
+
+ + + +
+
+ +
+
+ + +
+ +
+

Citypoint Dental NYC · gemini

+

emergency dentist brooklyn

+
+ + +
+

Citation lost

+

2 domains cited instead

+

gemini · lost since mar 5

+
+ + +
+

What the AI said

+
+

For urgent dental care in Brooklyn, Downtown Smiles and Harbor Dental are now cited first for emergency availability and same-day booking. Citypoint Dental does not appear in this answer despite previously holding a citation for this query.

+
+ +
+ + +
+

Who was cited — in order

+
+
+ #1 + downtownsmiles.com + Competitor +
+
+ #2 + harbordental.com + Competitor +
+
+ + citypointdental.com + Not cited +
+
+
+ + +
+

What to fix

+
+
+ + + + + FAQ schema missing on the emergency service page +
+
+ + + + + llms.txt not found during latest site audit +
+
+ + + + + Location pages link weakly into the emergency care hub +
+
+
+ +

AI answers now cite two competitors while your emergency page is no longer grounded.

+
+ +
+ + +
+

Drawer — Cited scenario (when you click the Gemini ✓ pill on invisalign card)

+
+
+ +
+

Citypoint Dental NYC · gemini

+

best invisalign dentist downtown brooklyn

+
+ + +
+

Citation confirmed

+

Cited #1 of 1 domain

+

gemini · cited for 8 runs

+
+ + +
+

What the AI said

+
+

Citypoint Dental in Downtown Brooklyn is highlighted for its Invisalign expertise and patient outcomes. Their case study pages demonstrate consistent before-and-after results for complex alignment cases.

+
+
+ + +
+

Who was cited — in order

+
+
+ #1 + citypointdental.com + You +
+
+
+ + +
+

Why you're cited

+
+
+ + + + Case study pages well-indexed with structured data +
+
+ + + + Internal links from service pages to case studies improved crawl depth +
+
+
+ +

Gemini consistently cites your Invisalign page with no competitor overlap.

+
+ +
+
+

+

Clicking "gemini ✓" on the
Invisalign card opens this drawer

+
+
+
+
+ +
+ + + diff --git a/packages/api-routes/src/index.ts b/packages/api-routes/src/index.ts index 211820d..f0a2cca 100644 --- a/packages/api-routes/src/index.ts +++ b/packages/api-routes/src/index.ts @@ -22,6 +22,8 @@ import type { ScheduleRoutesOptions } from './schedules.js' import { notificationRoutes } from './notifications.js' import { googleRoutes } from './google.js' import type { GoogleRoutesOptions } from './google.js' +import { sweepRoutes } from './sweeps.js' +import type { SweepRoutesOptions } from './sweeps.js' declare module 'fastify' { interface FastifyInstance { @@ -61,6 +63,8 @@ export interface ApiRoutesOptions { publicUrl?: string onGscSyncRequested?: GoogleRoutesOptions['onGscSyncRequested'] onInspectSitemapRequested?: GoogleRoutesOptions['onInspectSitemapRequested'] + /** Called when an indexing sweep is created */ + onSweepCreated?: SweepRoutesOptions['onSweepCreated'] } export async function apiRoutes(app: FastifyInstance, opts: ApiRoutesOptions) { @@ -115,6 +119,9 @@ export async function apiRoutes(app: FastifyInstance, opts: ApiRoutesOptions) { onGscSyncRequested: opts.onGscSyncRequested, onInspectSitemapRequested: opts.onInspectSitemapRequested, } satisfies GoogleRoutesOptions) + await api.register(sweepRoutes, { + onSweepCreated: opts.onSweepCreated, + } satisfies SweepRoutesOptions) }, { prefix: '/api/v1' }) } diff --git a/packages/api-routes/src/settings.ts b/packages/api-routes/src/settings.ts index d5860c1..e563db9 100644 --- a/packages/api-routes/src/settings.ts +++ b/packages/api-routes/src/settings.ts @@ -15,7 +15,7 @@ export interface GoogleSettingsSummary { export interface SettingsRoutesOptions { providerSummary?: ProviderSummaryEntry[] - onProviderUpdate?: (provider: string, apiKey: string, model?: string, baseUrl?: string, quota?: Partial) => ProviderSummaryEntry | null + onProviderUpdate?: (provider: string, apiKey: string, model?: string, baseUrl?: string, quota?: Partial, meta?: Record) => ProviderSummaryEntry | null google?: GoogleSettingsSummary onGoogleUpdate?: (clientId: string, clientSecret: string) => GoogleSettingsSummary | null } @@ -30,11 +30,26 @@ export async function settingsRoutes(app: FastifyInstance, opts: SettingsRoutesO Params: { name: string } Body: { apiKey?: string; baseUrl?: string; model?: string; quota?: Partial } }>('/settings/providers/:name', async (request, reply) => { + // web-search is a special non-LLM provider + if (request.params.name === 'web-search') { + const { apiKey, backend, cx } = request.body as { apiKey?: string; backend?: string; cx?: string } ?? {} + if (!apiKey || typeof apiKey !== 'string') { + return reply.status(400).send({ error: 'apiKey is required for web-search provider' }) + } + if (!opts.onProviderUpdate) { + return reply.status(501).send({ error: 'Provider configuration updates are not supported in this deployment' }) + } + // Store web-search config via the generic provider update callback using name 'web-search'. + // cx is passed explicitly via the meta bag so it is persisted for google-cse backend. + const result = opts.onProviderUpdate('web-search', apiKey, undefined, backend ?? 'serper', undefined, cx ? { cx } : undefined) + return result ?? reply.status(500).send({ error: 'Failed to update web-search provider configuration' }) + } + const providerName = parseProviderName(request.params.name) const { apiKey, baseUrl, model, quota } = request.body ?? {} if (!providerName) { - return reply.status(400).send({ error: `Invalid provider: ${request.params.name}. Must be one of: gemini, openai, claude, local` }) + return reply.status(400).send({ error: `Invalid provider: ${request.params.name}. Must be one of: gemini, openai, claude, local, web-search` }) } const name = providerName diff --git a/packages/api-routes/src/sweeps.ts b/packages/api-routes/src/sweeps.ts new file mode 100644 index 0000000..9008bce --- /dev/null +++ b/packages/api-routes/src/sweeps.ts @@ -0,0 +1,195 @@ +import crypto from 'node:crypto' +import { eq, asc, and, inArray } from 'drizzle-orm' +import type { FastifyInstance } from 'fastify' +import { indexingSweeps, indexingSweepResults, keywords, projects } from '@ainyc/canonry-db' +import { resolveProject, writeAuditLog } from './helpers.js' + +const ALLOWED_TRIGGERS = new Set(['manual', 'scheduled', 'api']) + +export interface SweepRoutesOptions { + /** Called when a new indexing sweep is created */ + onSweepCreated?: (sweepId: string, projectId: string, keyword?: string) => void +} + +export async function sweepRoutes(app: FastifyInstance, opts: SweepRoutesOptions) { + // POST /projects/:name/sweeps — trigger an indexing sweep + app.post<{ + Params: { name: string } + Body: { keyword?: string; trigger?: string } + }>('/projects/:name/sweeps', async (request, reply) => { + const project = resolveProjectSafe(app, request.params.name, reply) + if (!project) return + + const now = new Date().toISOString() + const trigger = ALLOWED_TRIGGERS.has(request.body?.trigger ?? '') + ? request.body!.trigger! + : 'manual' + const keyword = request.body?.keyword + + // Guard against concurrent sweeps for the same project. + // Wrap the check+insert in a transaction so two simultaneous requests cannot + // both observe no active sweep and then both insert — SQLite serialises writers. + const sweepId = crypto.randomUUID() + const txResult = app.db.transaction((tx) => { + const activeSweep = tx + .select() + .from(indexingSweeps) + .where( + and( + eq(indexingSweeps.projectId, project.id), + inArray(indexingSweeps.status, ['queued', 'running']), + ), + ) + .get() + if (activeSweep) { + return { conflict: true, activeSweep } as const + } + + tx.insert(indexingSweeps).values({ + id: sweepId, + projectId: project.id, + status: 'queued', + trigger, + createdAt: now, + }).run() + + return { conflict: false } as const + }) + + if (txResult.conflict) { + return reply.status(409).send({ error: `Sweep ${txResult.activeSweep.id} is already ${txResult.activeSweep.status}` }) + } + + writeAuditLog(app.db, { + projectId: project.id, + actor: 'api', + action: 'sweep.created', + entityType: 'indexing_sweep', + entityId: sweepId, + }) + + const sweep = app.db.select().from(indexingSweeps).where(eq(indexingSweeps.id, sweepId)).get()! + + if (opts.onSweepCreated) { + opts.onSweepCreated(sweepId, project.id, keyword) + } + + return reply.status(201).send(formatSweep(sweep)) + }) + + // GET /projects/:name/sweeps — list sweeps for project + app.get<{ Params: { name: string } }>('/projects/:name/sweeps', async (request, reply) => { + const project = resolveProjectSafe(app, request.params.name, reply) + if (!project) return + const rows = app.db.select().from(indexingSweeps) + .where(eq(indexingSweeps.projectId, project.id)) + .orderBy(asc(indexingSweeps.createdAt)) + .all() + return reply.send(rows.map(formatSweep)) + }) + + // GET /sweeps/:id — get sweep with results + app.get<{ Params: { id: string } }>('/sweeps/:id', async (request, reply) => { + const sweep = app.db.select().from(indexingSweeps).where(eq(indexingSweeps.id, request.params.id)).get() + if (!sweep) { + return reply.status(404).send({ error: { code: 'NOT_FOUND', message: `Sweep '${request.params.id}' not found` } }) + } + + const results = app.db + .select({ + id: indexingSweepResults.id, + sweepId: indexingSweepResults.sweepId, + keywordId: indexingSweepResults.keywordId, + keyword: keywords.keyword, + domain: indexingSweepResults.domain, + domainRole: indexingSweepResults.domainRole, + indexedPageCount: indexingSweepResults.indexedPageCount, + topPages: indexingSweepResults.topPages, + createdAt: indexingSweepResults.createdAt, + }) + .from(indexingSweepResults) + .leftJoin(keywords, eq(indexingSweepResults.keywordId, keywords.id)) + .where(eq(indexingSweepResults.sweepId, sweep.id)) + .all() + + return reply.send({ + ...formatSweep(sweep), + results: results.map(r => ({ + ...r, + topPages: tryParseJson(r.topPages, []), + })), + }) + }) + + // GET /sweeps — list all sweeps across all projects (paginated) + app.get<{ Querystring: { limit?: string; offset?: string } }>('/sweeps', async (request, reply) => { + const limit = Math.min(Math.max(parseInt(String(request.query.limit ?? '50'), 10) || 50, 1), 200) + const offset = Math.max(parseInt(String(request.query.offset ?? '0'), 10) || 0, 0) + const rows = app.db + .select({ + id: indexingSweeps.id, + projectId: indexingSweeps.projectId, + projectName: projects.name, + status: indexingSweeps.status, + trigger: indexingSweeps.trigger, + startedAt: indexingSweeps.startedAt, + finishedAt: indexingSweeps.finishedAt, + error: indexingSweeps.error, + createdAt: indexingSweeps.createdAt, + }) + .from(indexingSweeps) + .leftJoin(projects, eq(indexingSweeps.projectId, projects.id)) + .orderBy(asc(indexingSweeps.createdAt)) + .limit(limit) + .offset(offset) + .all() + return reply.send(rows.map(r => ({ ...formatSweep(r), projectName: r.projectName }))) + }) +} + +function formatSweep(row: { + id: string + projectId: string + status: string + trigger: string + startedAt?: string | null + finishedAt?: string | null + error?: string | null + createdAt: string +}) { + return { + id: row.id, + projectId: row.projectId, + status: row.status, + trigger: row.trigger, + startedAt: row.startedAt ?? null, + finishedAt: row.finishedAt ?? null, + error: row.error ?? null, + createdAt: row.createdAt, + } +} + +function tryParseJson(value: string, fallback: T): T { + try { + return JSON.parse(value) as T + } catch { + return fallback + } +} + +function resolveProjectSafe( + app: FastifyInstance, + name: string, + reply: { status: (code: number) => { send: (body: unknown) => unknown } }, +) { + try { + return resolveProject(app.db, name) + } catch (e: unknown) { + if (e && typeof e === 'object' && 'statusCode' in e && 'toJSON' in e) { + const err = e as { statusCode: number; toJSON(): unknown } + reply.status(err.statusCode).send(err.toJSON()) + return null + } + throw e + } +} diff --git a/packages/canonry/ainyc-canonry-1.15.3.tgz b/packages/canonry/ainyc-canonry-1.15.3.tgz new file mode 100644 index 0000000..655679b Binary files /dev/null and b/packages/canonry/ainyc-canonry-1.15.3.tgz differ diff --git a/packages/canonry/ainyc-canonry-1.16.0.tgz b/packages/canonry/ainyc-canonry-1.16.0.tgz new file mode 100644 index 0000000..eef616a Binary files /dev/null and b/packages/canonry/ainyc-canonry-1.16.0.tgz differ diff --git a/packages/canonry/ainyc-canonry-1.19.3.tgz b/packages/canonry/ainyc-canonry-1.19.3.tgz new file mode 100644 index 0000000..3ac44dc Binary files /dev/null and b/packages/canonry/ainyc-canonry-1.19.3.tgz differ diff --git a/packages/canonry/package.json b/packages/canonry/package.json index 3a81648..22ca6ca 100644 --- a/packages/canonry/package.json +++ b/packages/canonry/package.json @@ -64,6 +64,7 @@ "@ainyc/canonry-provider-local": "workspace:*", "@ainyc/canonry-integration-google": "workspace:*", "@ainyc/canonry-provider-openai": "workspace:*", + "@ainyc/canonry-provider-web-search": "workspace:*", "@types/better-sqlite3": "^7.6.13", "@types/node-cron": "^3.0.11", "tsup": "^8.5.1", diff --git a/packages/canonry/src/cli.ts b/packages/canonry/src/cli.ts index 3af9cbd..5db7bee 100644 --- a/packages/canonry/src/cli.ts +++ b/packages/canonry/src/cli.ts @@ -17,6 +17,7 @@ import { showSettings, setProvider, setGoogleAuth } from './commands/settings.js import { setSchedule, showSchedule, enableSchedule, disableSchedule, removeSchedule } from './commands/schedule.js' import { addNotification, listNotifications, removeNotification, testNotification, listEvents } from './commands/notify.js' import { telemetryCommand } from './commands/telemetry.js' +import { triggerSweep, listSweeps, showSweep } from './commands/sweep.js' import { googleConnect, googleDisconnect, googleStatus, googleProperties, googleSetProperty, googleSetSitemap, googleListSitemaps, googleSync, googlePerformance, googleInspect, @@ -90,8 +91,13 @@ Usage: canonry google coverage Show index coverage summary canonry google inspections Show URL inspection history (--url ) canonry google deindexed Show pages that lost indexing + canonry sweep Run indexing sweep (site: queries) for all keywords + canonry sweep --keyword Run sweep for a specific keyword + canonry sweep show Show sweep details and results + canonry sweeps List past indexing sweeps canonry settings Show active provider and quota settings - canonry settings provider Update a provider config + canonry settings provider Update a provider config (name: gemini, openai, claude, local, web-search) + canonry settings provider web-search --api-key --backend serper|google-cse canonry settings google Update Google OAuth credentials canonry telemetry status Show telemetry status canonry telemetry enable Enable anonymous telemetry @@ -1091,6 +1097,55 @@ async function main() { break } + // ── canonry sweep / sweeps ──────────────────────────────────────────── + case 'sweep': { + const subcommand = args[1] + + if (subcommand === 'show') { + const id = args[2] + if (!id) { + console.error('Usage: canonry sweep show ') + process.exit(1) + } + await showSweep(id, format) + break + } + + // `canonry sweep ` — trigger or default to list if no project + const sweepProject = subcommand + if (!sweepProject) { + console.error('Usage: canonry sweep [--keyword "..."] [--wait] [--format json]') + process.exit(1) + } + + const { values: sweepValues } = parseArgs({ + args: args.slice(2), + options: { + keyword: { type: 'string' }, + wait: { type: 'boolean', default: false }, + format: { type: 'string' }, + }, + allowPositionals: true, + }) + + await triggerSweep(sweepProject, { + keyword: sweepValues.keyword, + wait: sweepValues.wait ?? false, + format: sweepValues.format === 'json' ? 'json' : format, + }) + break + } + + case 'sweeps': { + const sweepsProject = args[1] + if (!sweepsProject) { + console.error('Usage: canonry sweeps ') + process.exit(1) + } + await listSweeps(sweepsProject, format) + break + } + default: console.error(`Unknown command: ${command}`) console.log('Run "canonry --help" for usage.') diff --git a/packages/canonry/src/client.ts b/packages/canonry/src/client.ts index 3d945ad..075e82d 100644 --- a/packages/canonry/src/client.ts +++ b/packages/canonry/src/client.ts @@ -267,4 +267,22 @@ export class ApiClient { async gscDiscoverSitemaps(project: string): Promise { return this.request('POST', `/projects/${encodeURIComponent(project)}/google/gsc/discover-sitemaps`, {}) } + + // ── Indexing sweeps ─────────────────────────────────────────────────────── + + async triggerSweep(project: string, body?: { keyword?: string }): Promise { + return this.request('POST', `/projects/${encodeURIComponent(project)}/sweeps`, body ?? {}) + } + + async listSweeps(project: string): Promise { + return this.request('GET', `/projects/${encodeURIComponent(project)}/sweeps`) + } + + async getSweep(id: string): Promise { + return this.request('GET', `/sweeps/${encodeURIComponent(id)}`) + } + + async setWebSearchProvider(opts: { apiKey: string; backend?: string; cx?: string }): Promise { + return this.request('PUT', '/settings/providers/web-search', opts) + } } diff --git a/packages/canonry/src/commands/serve.ts b/packages/canonry/src/commands/serve.ts index 866e084..76a1b50 100644 --- a/packages/canonry/src/commands/serve.ts +++ b/packages/canonry/src/commands/serve.ts @@ -21,9 +21,11 @@ export async function serveCommand(): Promise { console.log(`\nCanonry server running at http://${host === '0.0.0.0' ? 'localhost' : host}:${port}`) console.log('Press Ctrl+C to stop.\n') - const providerNames = Object.keys(config.providers ?? {}).filter( - k => config.providers?.[k as keyof typeof config.providers]?.apiKey || config.providers?.[k as keyof typeof config.providers]?.baseUrl, - ) + const providerNames = Object.keys(config.providers ?? {}).filter(k => { + if (k === 'webSearch') return Boolean(config.providers?.webSearch?.apiKey) + const p = config.providers?.[k as Exclude, 'webSearch'>] + return p?.apiKey || p?.baseUrl + }) trackEvent('serve.started', { providerCount: providerNames.length, providers: providerNames, diff --git a/packages/canonry/src/commands/sweep.ts b/packages/canonry/src/commands/sweep.ts new file mode 100644 index 0000000..3d5e9d6 --- /dev/null +++ b/packages/canonry/src/commands/sweep.ts @@ -0,0 +1,165 @@ +import { loadConfig } from '../config.js' +import { ApiClient } from '../client.js' + +function getClient(): ApiClient { + const config = loadConfig() + return new ApiClient(config.apiUrl, config.apiKey) +} + +const TERMINAL_STATUSES = new Set(['completed', 'failed']) + +interface SweepRow { + id: string + projectId: string + status: string + trigger: string + startedAt: string | null + finishedAt: string | null + createdAt: string +} + +interface SweepResultRow { + id: string + sweepId: string + keywordId: string + keyword?: string | null + domain: string + domainRole: string + indexedPageCount: number + topPages: Array<{ url: string; title: string }> + createdAt: string +} + +/** + * `canonry sweep ` + * Trigger an indexing sweep for all keywords in the project. + */ +export async function triggerSweep( + project: string, + opts?: { keyword?: string; wait?: boolean; format?: string }, +): Promise { + const client = getClient() + const body: Record = {} + if (opts?.keyword) { + body.keyword = opts.keyword + } + + const sweep = (await client.triggerSweep(project, body)) as SweepRow + + if (opts?.format === 'json') { + if (opts?.wait) { + const result = await pollSweep(client, sweep.id) + console.log(JSON.stringify(result, null, 2)) + } else { + console.log(JSON.stringify(sweep, null, 2)) + } + return + } + + console.log(`Sweep created: ${sweep.id}`) + console.log(` Status: ${sweep.status}`) + console.log(` Trigger: ${sweep.trigger}`) + + if (opts?.wait) { + process.stderr.write(`Waiting for sweep ${sweep.id}`) + const result = await pollSweep(client, sweep.id) + process.stderr.write('\n') + printSweepDetail(result as SweepRow) + } +} + +/** + * `canonry sweeps ` + * List past indexing sweeps for a project. + */ +export async function listSweeps(project: string, format?: string): Promise { + const client = getClient() + const sweeps = (await client.listSweeps(project)) as SweepRow[] + + if (format === 'json') { + console.log(JSON.stringify(sweeps, null, 2)) + return + } + + if (sweeps.length === 0) { + console.log(`No indexing sweeps found for "${project}".`) + return + } + + console.log(`Indexing sweeps for "${project}" (${sweeps.length}):\n`) + console.log(' ID STATUS TRIGGER CREATED') + console.log(' ──────────────────────────────────── ────────── ───────── ──────────────────────') + + for (const s of sweeps) { + console.log( + ` ${s.id} ${s.status.padEnd(10)} ${s.trigger.padEnd(9)} ${s.createdAt}`, + ) + } +} + +/** + * `canonry sweep show ` + * Show details (including results) of a specific sweep. + */ +export async function showSweep(id: string, format?: string): Promise { + const client = getClient() + const sweep = (await client.getSweep(id)) as SweepRow & { results?: SweepResultRow[] } + + if (format === 'json') { + console.log(JSON.stringify(sweep, null, 2)) + return + } + + printSweepDetail(sweep) + + const results = sweep.results ?? [] + if (results.length === 0) { + console.log('\n No results recorded yet.') + return + } + + // Group by keyword + const byKeyword = new Map() + for (const r of results) { + const kw = r.keyword ?? r.keywordId + if (!byKeyword.has(kw)) byKeyword.set(kw, []) + byKeyword.get(kw)!.push(r) + } + + console.log('\n Content Coverage:\n') + for (const [kw, rows] of byKeyword) { + console.log(` Keyword: "${kw}"`) + for (const r of rows) { + const role = r.domainRole === 'client' ? ' (client) ' : ' (competitor)' + console.log(` ${role} ${r.domain.padEnd(40)} ${r.indexedPageCount} indexed pages`) + for (const page of (r.topPages ?? []).slice(0, 3)) { + console.log(` - ${page.title} — ${page.url}`) + } + } + console.log() + } +} + +async function pollSweep(client: ApiClient, sweepId: string): Promise { + const deadline = Date.now() + 10 * 60 * 1000 // 10 minutes + for (;;) { + await new Promise(r => setTimeout(r, 2000)) + if (Date.now() > deadline) { + throw new Error(`Timed out waiting for sweep ${sweepId}`) + } + const sweep = (await client.getSweep(sweepId)) as { status: string } + process.stderr.write('.') + if (TERMINAL_STATUSES.has(sweep.status)) { + return sweep + } + } +} + +function printSweepDetail(sweep: SweepRow): void { + console.log(`Sweep: ${sweep.id}`) + console.log(` Status: ${sweep.status}`) + console.log(` Trigger: ${sweep.trigger}`) + if (sweep.startedAt) console.log(` Started: ${sweep.startedAt}`) + if (sweep.finishedAt) console.log(` Finished: ${sweep.finishedAt}`) + console.log(` Created: ${sweep.createdAt}`) +} diff --git a/packages/canonry/src/config.ts b/packages/canonry/src/config.ts index 43de106..4cc16e1 100644 --- a/packages/canonry/src/config.ts +++ b/packages/canonry/src/config.ts @@ -48,6 +48,13 @@ export interface CanonryConfig { openai?: ProviderConfigEntry claude?: ProviderConfigEntry local?: ProviderConfigEntry + /** Web search provider for indexing sweeps */ + webSearch?: { + apiKey: string + backend?: 'serper' | 'google-cse' + /** Google CSE search engine ID (cx parameter) */ + cx?: string + } } google?: GoogleConfigEntry // Telemetry (opt-out: undefined/true = enabled, false = disabled) diff --git a/packages/canonry/src/server.ts b/packages/canonry/src/server.ts index ae36155..0ccabcb 100644 --- a/packages/canonry/src/server.ts +++ b/packages/canonry/src/server.ts @@ -34,6 +34,7 @@ import { ProviderRegistry } from './provider-registry.js' import { Scheduler } from './scheduler.js' import { Notifier } from './notifier.js' import { fetchSiteText } from './site-fetch.js' +import { SweepRunner } from './sweep-runner.js' const DEFAULT_QUOTA = { maxConcurrency: 2, @@ -93,7 +94,8 @@ export async function createServer(opts: { } console.log('[Server] Configured providers:', Object.keys(providers).filter(k => { - const p = providers[k as keyof typeof providers] + if (k === 'webSearch') return Boolean(providers.webSearch?.apiKey) + const p = providers[k as Exclude] return p?.apiKey || p?.baseUrl })) @@ -139,6 +141,16 @@ export async function createServer(opts: { const notifier = new Notifier(opts.db, serverUrl) jobRunner.onRunCompleted = (runId, projectId) => notifier.onRunCompleted(runId, projectId) + // Build sweep runner (reads web-search settings lazily so it picks up live updates) + const sweepRunner = new SweepRunner(opts.db, () => { + const ws = opts.config.providers?.webSearch ?? null + const envKey = process.env.WEB_SEARCH_API_KEY + if (envKey) { + return { apiKey: envKey, backend: (process.env.WEB_SEARCH_BACKEND as 'serper' | 'google-cse' | undefined) ?? 'serper' } + } + return ws?.apiKey ? { apiKey: ws.apiKey, backend: ws.backend ?? 'serper', cx: ws.cx } : null + }) + const scheduler = new Scheduler(opts.db, { onRunCreated: (runId, projectId, providers) => { jobRunner.executeRun(runId, projectId, providers).catch((err: unknown) => { @@ -148,12 +160,19 @@ export async function createServer(opts: { }) // Build provider summary for API routes + const webSearchConfig = opts.config.providers?.webSearch const providerSummary = (['gemini', 'openai', 'claude', 'local'] as const).map(name => ({ name, model: registry.get(name)?.config.model, configured: !!registry.get(name), quota: registry.get(name)?.config.quotaPolicy, })) + + // Add web-search to provider summary + ;(providerSummary as Array<{ name: string; configured: boolean; model?: string; quota?: object }>).push({ + name: 'web-search', + configured: Boolean(webSearchConfig?.apiKey || process.env.WEB_SEARCH_API_KEY), + }) const googleSettingsSummary = { configured: Boolean(opts.config.google?.clientId && opts.config.google?.clientSecret), } @@ -239,6 +258,11 @@ export async function createServer(opts: { app.log.error({ runId, err }, 'Inspect sitemap failed') }) }, + onSweepCreated: (sweepId: string, projectId: string, keyword?: string) => { + sweepRunner.executeSweep(sweepId, projectId, keyword).catch((err: unknown) => { + app.log.error({ sweepId, err }, 'Sweep runner failed') + }) + }, openApiInfo: { title: 'Canonry API', version: PKG_VERSION, @@ -251,7 +275,29 @@ export async function createServer(opts: { app.log.error({ runId, err }, 'Job runner failed') }) }, - onProviderUpdate: (providerName: string, apiKey: string, model?: string, baseUrl?: string, incomingQuota?: Partial) => { + onProviderUpdate: (providerName: string, apiKey: string, model?: string, baseUrl?: string, incomingQuota?: Partial, meta?: Record) => { + // Handle web-search provider separately — it isn't an LLM adapter + if (providerName === 'web-search') { + if (!opts.config.providers) opts.config.providers = {} + const cx = typeof meta?.cx === 'string' ? meta.cx : undefined + opts.config.providers.webSearch = { + apiKey, + // baseUrl carries the backend name (serper | google-cse) from the route handler + backend: (baseUrl ?? 'serper') as 'serper' | 'google-cse', + ...(cx ? { cx } : {}), + } + try { + saveConfig(opts.config) + } catch (err) { + app.log.error({ err }, 'Failed to save web-search config') + return null + } + const wsEntry = (providerSummary as Array<{ name: string; configured: boolean; model?: string }>) + .find(p => p.name === 'web-search') + if (wsEntry) wsEntry.configured = true + return { name: 'web-search', configured: true } + } + const name = providerName as keyof typeof adapterMap if (!(name in adapterMap)) return null diff --git a/packages/canonry/src/sweep-runner.ts b/packages/canonry/src/sweep-runner.ts new file mode 100644 index 0000000..718767f --- /dev/null +++ b/packages/canonry/src/sweep-runner.ts @@ -0,0 +1,158 @@ +/** + * SweepRunner — executes indexing sweeps for a project. + * + * For each keyword × domain (client + competitors), fires a + * `site: ` query via the configured web_search provider + * and stores the results in the indexing_sweep_results table. + */ + +import crypto from 'node:crypto' +import { eq } from 'drizzle-orm' +import type { DatabaseClient } from '@ainyc/canonry-db' +import { indexingSweeps, indexingSweepResults, keywords, competitors, projects } from '@ainyc/canonry-db' +import { createWebSearchAdapter } from '@ainyc/canonry-provider-web-search' +import type { WebSearchBackend } from '@ainyc/canonry-contracts' + +export interface WebSearchProviderSettings { + apiKey: string + backend?: WebSearchBackend + cx?: string +} + +export class SweepRunner { + private db: DatabaseClient + private getWebSearchSettings: () => WebSearchProviderSettings | null + + onSweepCompleted?: (sweepId: string, projectId: string) => Promise + + constructor( + db: DatabaseClient, + getWebSearchSettings: () => WebSearchProviderSettings | null, + ) { + this.db = db + this.getWebSearchSettings = getWebSearchSettings + } + + async executeSweep(sweepId: string, projectId: string, keywordFilter?: string): Promise { + const now = new Date().toISOString() + + try { + // Mark sweep as running + this.db + .update(indexingSweeps) + .set({ status: 'running', startedAt: now }) + .where(eq(indexingSweeps.id, sweepId)) + .run() + + // Fetch project + const project = this.db + .select() + .from(projects) + .where(eq(projects.id, projectId)) + .get() + + if (!project) { + throw new Error(`Project ${projectId} not found`) + } + + // Load web search settings + const settings = this.getWebSearchSettings() + if (!settings?.apiKey) { + throw new Error( + 'web_search provider not configured. ' + + 'Set WEB_SEARCH_API_KEY env var or run: canonry settings provider web-search --api-key ', + ) + } + + const adapter = createWebSearchAdapter(settings.apiKey, settings.backend ?? 'serper', settings.cx) + + // Load keywords — differentiate "none configured" from "filter matched nothing" + const allKws = this.db.select().from(keywords).where(eq(keywords.projectId, projectId)).all() + if (allKws.length === 0) { + throw new Error('No keywords configured for this project') + } + const kws = keywordFilter ? allKws.filter(k => k.keyword === keywordFilter) : allKws + if (kws.length === 0) { + throw new Error(`No keyword matching '${keywordFilter}' found in this project`) + } + + // Load competitors + const comps = this.db + .select() + .from(competitors) + .where(eq(competitors.projectId, projectId)) + .all() + + const clientDomain = project.canonicalDomain + + // Process each keyword × domain pair + for (const kw of kws) { + const domains: Array<{ domain: string; role: 'client' | 'competitor' }> = [ + { domain: clientDomain, role: 'client' }, + ...comps.map(c => ({ domain: c.domain, role: 'competitor' as const })), + ] + + for (const { domain, role } of domains) { + // Brief delay to avoid rate-limiting (Serper free tier: 2,500 req/month) + await new Promise(r => setTimeout(r, 300)) + try { + const result = await adapter.siteQuery(domain, kw.keyword) + const createdAt = new Date().toISOString() + + this.db + .insert(indexingSweepResults) + .values({ + id: crypto.randomUUID(), + sweepId, + keywordId: kw.id, + domain, + domainRole: role, + indexedPageCount: result.indexedPageCount, + topPages: JSON.stringify(result.topPages), + createdAt, + }) + .onConflictDoUpdate({ + target: [ + indexingSweepResults.sweepId, + indexingSweepResults.keywordId, + indexingSweepResults.domain, + ], + set: { + indexedPageCount: result.indexedPageCount, + topPages: JSON.stringify(result.topPages), + }, + }) + .run() + } catch (err) { + // Log but don't abort the sweep — partial results are useful + console.error( + `[SweepRunner] Failed site:${domain} "${kw.keyword}":`, + err instanceof Error ? err.message : String(err), + ) + } + } + } + + // Mark completed + const finishedAt = new Date().toISOString() + this.db + .update(indexingSweeps) + .set({ status: 'completed', finishedAt }) + .where(eq(indexingSweeps.id, sweepId)) + .run() + + if (this.onSweepCompleted) { + await this.onSweepCompleted(sweepId, projectId) + } + } catch (err) { + const finishedAt = new Date().toISOString() + const message = err instanceof Error ? err.message : String(err) + this.db + .update(indexingSweeps) + .set({ status: 'failed', finishedAt, error: message }) + .where(eq(indexingSweeps.id, sweepId)) + .run() + console.error(`[SweepRunner] Sweep ${sweepId} failed:`, message) + } + } +} diff --git a/packages/contracts/src/index.ts b/packages/contracts/src/index.ts index f8f8162..ccb900a 100644 --- a/packages/contracts/src/index.ts +++ b/packages/contracts/src/index.ts @@ -7,3 +7,4 @@ export * from './project.js' export * from './provider.js' export * from './run.js' export * from './schedule.js' +export * from './sweep.js' diff --git a/packages/contracts/src/sweep.ts b/packages/contracts/src/sweep.ts new file mode 100644 index 0000000..2e0685a --- /dev/null +++ b/packages/contracts/src/sweep.ts @@ -0,0 +1,53 @@ +import { z } from 'zod' + +// ── Indexing sweep contracts ─────────────────────────────────────────────── + +export const topPageSchema = z.object({ + url: z.string(), + title: z.string(), +}) + +export const sweepResultSchema = z.object({ + id: z.string(), + sweepId: z.string(), + keywordId: z.string(), + keyword: z.string().optional(), + domain: z.string(), + domainRole: z.enum(['client', 'competitor']), + indexedPageCount: z.number().int().nonnegative(), + topPages: z.array(topPageSchema), + createdAt: z.string(), +}) + +export const indexingSweepSchema = z.object({ + id: z.string(), + projectId: z.string(), + status: z.enum(['queued', 'running', 'completed', 'failed']), + trigger: z.string(), + startedAt: z.string().nullable(), + finishedAt: z.string().nullable(), + error: z.string().nullable(), + createdAt: z.string(), +}) + +export const indexingSweepWithResultsSchema = indexingSweepSchema.extend({ + results: z.array(sweepResultSchema), +}) + +export type TopPage = z.infer +export type SweepResult = z.infer +export type IndexingSweep = z.infer +export type IndexingSweepWithResults = z.infer + +// ── Web search provider config ───────────────────────────────────────────── + +export const webSearchBackendSchema = z.enum(['serper', 'google-cse']) +export type WebSearchBackend = z.infer + +export const webSearchProviderConfigSchema = z.object({ + apiKey: z.string().min(1), + backend: webSearchBackendSchema.default('serper'), + /** For Google CSE — the search engine ID (cx parameter) */ + cx: z.string().optional(), +}) +export type WebSearchProviderConfig = z.infer diff --git a/packages/db/src/migrate.ts b/packages/db/src/migrate.ts index de3e931..42ec6d2 100644 --- a/packages/db/src/migrate.ts +++ b/packages/db/src/migrate.ts @@ -238,4 +238,50 @@ export function migrate(db: DatabaseClient) { // Column already exists — ignore } } + + // Run indexing sweep migrations + migrateSweeps(db) +} + +// Appended by indexing sweep feature (issue #75) +const SWEEP_MIGRATIONS = [ + `CREATE TABLE IF NOT EXISTS indexing_sweeps ( + id TEXT PRIMARY KEY, + project_id TEXT NOT NULL REFERENCES projects(id) ON DELETE CASCADE, + status TEXT NOT NULL DEFAULT 'queued', + trigger TEXT NOT NULL DEFAULT 'manual', + started_at TEXT, + finished_at TEXT, + error TEXT, + created_at TEXT NOT NULL + )`, + `CREATE INDEX IF NOT EXISTS idx_indexing_sweeps_project ON indexing_sweeps(project_id)`, + `CREATE INDEX IF NOT EXISTS idx_indexing_sweeps_status ON indexing_sweeps(status)`, + `CREATE TABLE IF NOT EXISTS indexing_sweep_results ( + id TEXT PRIMARY KEY, + sweep_id TEXT NOT NULL REFERENCES indexing_sweeps(id) ON DELETE CASCADE, + keyword_id TEXT NOT NULL REFERENCES keywords(id) ON DELETE CASCADE, + domain TEXT NOT NULL, + domain_role TEXT NOT NULL DEFAULT 'client', + indexed_page_count INTEGER NOT NULL DEFAULT 0, + top_pages TEXT NOT NULL DEFAULT '[]', + created_at TEXT NOT NULL, + UNIQUE(sweep_id, keyword_id, domain) + )`, + `CREATE INDEX IF NOT EXISTS idx_sweep_results_sweep ON indexing_sweep_results(sweep_id)`, + `CREATE INDEX IF NOT EXISTS idx_sweep_results_keyword ON indexing_sweep_results(keyword_id)`, +] + +export function migrateSweeps(db: DatabaseClient) { + for (const migration of SWEEP_MIGRATIONS) { + try { + db.run(sql.raw(migration)) + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err) + if (!msg.includes('already exists')) { + throw err // re-throw unexpected errors (disk full, permission error, etc.) + } + // Table already exists — safe to ignore + } + } } diff --git a/packages/db/src/schema.ts b/packages/db/src/schema.ts index 3b0e893..b53d6db 100644 --- a/packages/db/src/schema.ts +++ b/packages/db/src/schema.ts @@ -213,3 +213,36 @@ export const usageCounters = sqliteTable('usage_counters', { uniqueIndex('idx_usage_scope_period_metric').on(table.scope, table.period, table.metric), index('idx_usage_scope_period').on(table.scope, table.period), ]) + +// ── Indexing sweep tables ────────────────────────────────────────────────── + +export const indexingSweeps = sqliteTable('indexing_sweeps', { + id: text('id').primaryKey(), + projectId: text('project_id').notNull().references(() => projects.id, { onDelete: 'cascade' }), + status: text('status').notNull().default('queued'), + trigger: text('trigger').notNull().default('manual'), + startedAt: text('started_at'), + finishedAt: text('finished_at'), + error: text('error'), + createdAt: text('created_at').notNull(), +}, (table) => [ + index('idx_indexing_sweeps_project').on(table.projectId), + index('idx_indexing_sweeps_status').on(table.status), +]) + +export const indexingSweepResults = sqliteTable('indexing_sweep_results', { + id: text('id').primaryKey(), + sweepId: text('sweep_id').notNull().references(() => indexingSweeps.id, { onDelete: 'cascade' }), + keywordId: text('keyword_id').notNull().references(() => keywords.id, { onDelete: 'cascade' }), + domain: text('domain').notNull(), + /** 'client' | 'competitor' */ + domainRole: text('domain_role').notNull().default('client'), + indexedPageCount: integer('indexed_page_count').notNull().default(0), + /** JSON: Array<{ url: string; title: string }> */ + topPages: text('top_pages').notNull().default('[]'), + createdAt: text('created_at').notNull(), +}, (table) => [ + index('idx_sweep_results_sweep').on(table.sweepId), + index('idx_sweep_results_keyword').on(table.keywordId), + uniqueIndex('idx_sweep_results_sweep_kw_domain').on(table.sweepId, table.keywordId, table.domain), +]) diff --git a/packages/provider-web-search/package.json b/packages/provider-web-search/package.json new file mode 100644 index 0000000..403d22d --- /dev/null +++ b/packages/provider-web-search/package.json @@ -0,0 +1,22 @@ +{ + "name": "@ainyc/canonry-provider-web-search", + "version": "0.0.0", + "private": true, + "type": "module", + "license": "FSL-1.1-ALv2", + "exports": { + ".": { + "types": "./src/index.ts", + "default": "./src/index.ts" + } + }, + "types": "./src/index.ts", + "scripts": { + "typecheck": "tsc --noEmit -p tsconfig.json", + "test": "tsx --test test/*.test.ts", + "lint": "eslint src/" + }, + "dependencies": { + "@ainyc/canonry-contracts": "workspace:*" + } +} diff --git a/packages/provider-web-search/src/adapter.ts b/packages/provider-web-search/src/adapter.ts new file mode 100644 index 0000000..22cdea1 --- /dev/null +++ b/packages/provider-web-search/src/adapter.ts @@ -0,0 +1,143 @@ +/** + * Web search adapter for indexing sweep queries. + * + * Supports two backends: + * - serper → https://serper.dev (simple JSON POST, fast) + * - google-cse → Google Custom Search JSON API (requires cx) + * + * Usage: + * const adapter = new WebSearchAdapter({ apiKey: '...', backend: 'serper' }) + * const result = await adapter.siteQuery('example.com', 'keyword phrase') + */ + +import type { WebSearchConfig, SiteQueryResult, IndexPage } from './types.js' + +const SERPER_ENDPOINT = 'https://google.serper.dev/search' +const GOOGLE_CSE_ENDPOINT = 'https://www.googleapis.com/customsearch/v1' + +export class WebSearchAdapter { + private config: WebSearchConfig + + constructor(config: WebSearchConfig) { + if (!config.apiKey) { + throw new Error('WebSearchAdapter: apiKey is required') + } + if (config.backend === 'google-cse' && !config.cx) { + throw new Error('WebSearchAdapter: cx (search engine ID) is required for google-cse backend') + } + this.config = config + } + + /** + * Execute a `site: ` query and return indexed page data. + * + * @param domain The domain to scope the query to (e.g. "example.com") + * @param keyword The keyword phrase to search for + * @param maxResults Maximum number of top pages to return (default 10) + */ + async siteQuery(domain: string, keyword: string, maxResults = 10): Promise { + const query = `site:${domain} ${keyword}` + + if (this.config.backend === 'serper') { + return this.serperQuery(domain, keyword, query, maxResults) + } + return this.googleCseQuery(domain, keyword, query, maxResults) + } + + private async serperQuery( + domain: string, + keyword: string, + query: string, + maxResults: number, + ): Promise { + const res = await fetch(SERPER_ENDPOINT, { + method: 'POST', + headers: { + 'X-API-KEY': this.config.apiKey, + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ q: query, num: Math.min(maxResults, 10) }), + }) + + if (!res.ok) { + throw new Error(`Serper API error: ${res.status} ${res.statusText}`) + } + + const data = (await res.json()) as { + organic?: Array<{ title?: string; link?: string }> + searchInformation?: { totalResults?: string } + } + + const organic = data.organic ?? [] + const topPages: IndexPage[] = organic.slice(0, maxResults).map(r => ({ + url: r.link ?? '', + title: r.title ?? '', + })) + + // Serper returns the estimated total result count in searchInformation + const totalStr = data.searchInformation?.totalResults + const indexedPageCount = totalStr ? parseInt(totalStr.replace(/,/g, ''), 10) || topPages.length : topPages.length + + return { domain, keyword, indexedPageCount, topPages } + } + + private async googleCseQuery( + domain: string, + keyword: string, + query: string, + maxResults: number, + ): Promise { + const params = new URLSearchParams({ + key: this.config.apiKey, + cx: this.config.cx!, + q: query, + num: String(Math.min(maxResults, 10)), + }) + + const res = await fetch(`${GOOGLE_CSE_ENDPOINT}?${params.toString()}`) + + if (!res.ok) { + // NOTE: Google CSE does not support bearer-token auth; the API key is in the query string. + // Redact the key from error messages to avoid leaking it in logs. + throw new Error(`Google CSE API error: ${res.status} ${res.statusText} (key redacted from URL)`) + } + + const data = (await res.json()) as { + items?: Array<{ title?: string; link?: string }> + searchInformation?: { totalResults?: string } + } + + const items = data.items ?? [] + const topPages: IndexPage[] = items.slice(0, maxResults).map(r => ({ + url: r.link ?? '', + title: r.title ?? '', + })) + + // Google CSE returns formatted strings like "1,230" — strip commas before parsing + const totalStr = data.searchInformation?.totalResults + const indexedPageCount = totalStr + ? parseInt(totalStr.replace(/,/g, ''), 10) || topPages.length + : topPages.length + + return { domain, keyword, indexedPageCount, topPages } + } + + /** Validate configuration without making an API call */ + validateConfig(): { ok: boolean; message: string } { + if (!this.config.apiKey) { + return { ok: false, message: 'Missing apiKey for web_search provider' } + } + if (this.config.backend === 'google-cse' && !this.config.cx) { + return { ok: false, message: 'Missing cx (search engine ID) for google-cse backend' } + } + return { ok: true, message: `web_search (${this.config.backend}) configured` } + } +} + +export function createWebSearchAdapter( + apiKey: string, + backend: WebSearchConfig['backend'] = 'serper', + cx?: string, +): WebSearchAdapter { + return new WebSearchAdapter({ apiKey, backend, cx }) +} diff --git a/packages/provider-web-search/src/index.ts b/packages/provider-web-search/src/index.ts new file mode 100644 index 0000000..35c0ed0 --- /dev/null +++ b/packages/provider-web-search/src/index.ts @@ -0,0 +1,2 @@ +export * from './types.js' +export { WebSearchAdapter, createWebSearchAdapter } from './adapter.js' diff --git a/packages/provider-web-search/src/types.ts b/packages/provider-web-search/src/types.ts new file mode 100644 index 0000000..ab76991 --- /dev/null +++ b/packages/provider-web-search/src/types.ts @@ -0,0 +1,20 @@ +import type { WebSearchBackend } from '@ainyc/canonry-contracts' + +export interface WebSearchConfig { + apiKey: string + backend: WebSearchBackend + /** Google CSE search engine ID (cx parameter) — required when backend is 'google-cse' */ + cx?: string +} + +export interface IndexPage { + url: string + title: string +} + +export interface SiteQueryResult { + domain: string + keyword: string + indexedPageCount: number + topPages: IndexPage[] +} diff --git a/packages/provider-web-search/test/adapter.test.ts b/packages/provider-web-search/test/adapter.test.ts new file mode 100644 index 0000000..70f8788 --- /dev/null +++ b/packages/provider-web-search/test/adapter.test.ts @@ -0,0 +1,246 @@ +/** + * Unit tests for WebSearchAdapter + * Uses Node.js built-in test runner (tsx --test) + */ + +import { describe, it, before, after } from 'node:test' +import assert from 'node:assert/strict' +import { WebSearchAdapter } from '../src/adapter.js' + +// --------------------------------------------------------------------------- +// Helpers — minimal fetch mock +// --------------------------------------------------------------------------- + +type FetchMock = (input: RequestInfo | URL, init?: RequestInit) => Promise + +let currentFetchMock: FetchMock | null = null + +const originalFetch = globalThis.fetch + +before(() => { + // Replace globalThis.fetch with our interceptor + ;(globalThis as unknown as Record).fetch = ( + input: RequestInfo | URL, + init?: RequestInit, + ) => { + if (currentFetchMock) return currentFetchMock(input, init) + return originalFetch(input, init) + } +}) + +after(() => { + ;(globalThis as unknown as Record).fetch = originalFetch +}) + +function mockFetch(status: number, body: unknown): void { + currentFetchMock = async () => + new Response(JSON.stringify(body), { + status, + headers: { 'Content-Type': 'application/json' }, + }) +} + +function clearFetchMock(): void { + currentFetchMock = null +} + +// --------------------------------------------------------------------------- +// Constructor validation +// --------------------------------------------------------------------------- + +describe('WebSearchAdapter constructor', () => { + it('throws when apiKey is missing', () => { + assert.throws( + () => new WebSearchAdapter({ apiKey: '', backend: 'serper' }), + /apiKey is required/, + ) + }) + + it('throws when backend is google-cse but cx is missing', () => { + assert.throws( + () => new WebSearchAdapter({ apiKey: 'key', backend: 'google-cse' }), + /cx \(search engine ID\) is required/, + ) + }) + + it('constructs successfully for serper', () => { + assert.doesNotThrow(() => new WebSearchAdapter({ apiKey: 'key', backend: 'serper' })) + }) + + it('constructs successfully for google-cse with cx', () => { + assert.doesNotThrow( + () => new WebSearchAdapter({ apiKey: 'key', backend: 'google-cse', cx: 'engine-id' }), + ) + }) +}) + +// --------------------------------------------------------------------------- +// Serper backend — siteQuery +// --------------------------------------------------------------------------- + +describe('WebSearchAdapter.siteQuery (serper backend)', () => { + after(clearFetchMock) + + it('returns parsed results for successful response', async () => { + mockFetch(200, { + organic: [ + { title: 'Page A', link: 'https://example.com/a' }, + { title: 'Page B', link: 'https://example.com/b' }, + ], + searchInformation: { totalResults: '1,230' }, + }) + + const adapter = new WebSearchAdapter({ apiKey: 'test-key', backend: 'serper' }) + const result = await adapter.siteQuery('example.com', 'test keyword') + + assert.equal(result.domain, 'example.com') + assert.equal(result.keyword, 'test keyword') + assert.equal(result.indexedPageCount, 1230, 'should strip commas from totalResults') + assert.equal(result.topPages.length, 2) + assert.equal(result.topPages[0].url, 'https://example.com/a') + assert.equal(result.topPages[0].title, 'Page A') + }) + + it('falls back to topPages.length when totalResults is missing', async () => { + mockFetch(200, { + organic: [{ title: 'Only Page', link: 'https://example.com/only' }], + }) + + const adapter = new WebSearchAdapter({ apiKey: 'test-key', backend: 'serper' }) + const result = await adapter.siteQuery('example.com', 'keyword') + + assert.equal(result.indexedPageCount, 1) + }) + + it('handles missing organic field gracefully', async () => { + mockFetch(200, { searchInformation: { totalResults: '0' } }) + + const adapter = new WebSearchAdapter({ apiKey: 'test-key', backend: 'serper' }) + const result = await adapter.siteQuery('example.com', 'keyword') + + assert.equal(result.indexedPageCount, 0) + assert.deepEqual(result.topPages, []) + }) + + it('throws on non-OK HTTP response', async () => { + mockFetch(403, { message: 'Forbidden' }) + + const adapter = new WebSearchAdapter({ apiKey: 'test-key', backend: 'serper' }) + await assert.rejects(adapter.siteQuery('example.com', 'keyword'), /Serper API error: 403/) + }) + + it('surfaces 429 status code in error message for rate-limit response', async () => { + mockFetch(429, { message: 'Too Many Requests' }) + + const adapter = new WebSearchAdapter({ apiKey: 'test-key', backend: 'serper' }) + await assert.rejects( + adapter.siteQuery('example.com', 'keyword'), + /Serper API error: 429/, + 'error message should include the HTTP 429 status code', + ) + }) + + it('surfaces 5xx status code in error message for server error response', async () => { + mockFetch(500, { message: 'Internal Server Error' }) + + const adapter = new WebSearchAdapter({ apiKey: 'test-key', backend: 'serper' }) + await assert.rejects( + adapter.siteQuery('example.com', 'keyword'), + /Serper API error: 500/, + 'error message should include the HTTP 500 status code', + ) + }) +}) + +// --------------------------------------------------------------------------- +// Google CSE backend — siteQuery +// --------------------------------------------------------------------------- + +describe('WebSearchAdapter.siteQuery (google-cse backend)', () => { + after(clearFetchMock) + + it('returns parsed results for successful response', async () => { + mockFetch(200, { + items: [ + { title: 'CSE Page A', link: 'https://example.com/a' }, + { title: 'CSE Page B', link: 'https://example.com/b' }, + ], + searchInformation: { totalResults: '5,678' }, + }) + + const adapter = new WebSearchAdapter({ + apiKey: 'test-key', + backend: 'google-cse', + cx: 'engine-id', + }) + const result = await adapter.siteQuery('example.com', 'test keyword') + + assert.equal(result.indexedPageCount, 5678, 'should strip commas from totalResults') + assert.equal(result.topPages.length, 2) + }) + + it('handles missing items field gracefully', async () => { + mockFetch(200, { searchInformation: { totalResults: '0' } }) + + const adapter = new WebSearchAdapter({ + apiKey: 'test-key', + backend: 'google-cse', + cx: 'engine-id', + }) + const result = await adapter.siteQuery('example.com', 'keyword') + + assert.deepEqual(result.topPages, []) + }) + + it('throws with redacted key on non-OK HTTP response', async () => { + mockFetch(429, { error: { message: 'quota exceeded' } }) + + const adapter = new WebSearchAdapter({ + apiKey: 'my-secret-api-key', + backend: 'google-cse', + cx: 'engine-id', + }) + await assert.rejects( + adapter.siteQuery('example.com', 'keyword'), + /key redacted from URL/, + 'error message should not contain the raw API key', + ) + }) + + it('does not include raw API key in thrown error', async () => { + const secretKey = 'super-secret-key-12345' + mockFetch(500, {}) + + const adapter = new WebSearchAdapter({ + apiKey: secretKey, + backend: 'google-cse', + cx: 'engine-id', + }) + + try { + await adapter.siteQuery('example.com', 'keyword') + assert.fail('Expected error to be thrown') + } catch (err) { + const msg = err instanceof Error ? err.message : String(err) + assert.ok(!msg.includes(secretKey), `Error message must not contain the API key: "${msg}"`) + } + }) +}) + +// --------------------------------------------------------------------------- +// validateConfig +// --------------------------------------------------------------------------- + +describe('WebSearchAdapter.validateConfig', () => { + it('returns ok for valid serper config', () => { + const adapter = new WebSearchAdapter({ apiKey: 'key', backend: 'serper' }) + const result = adapter.validateConfig() + assert.equal(result.ok, true) + }) + + it('returns ok for valid google-cse config', () => { + const adapter = new WebSearchAdapter({ apiKey: 'key', backend: 'google-cse', cx: 'cx-id' }) + const result = adapter.validateConfig() + assert.equal(result.ok, true) + }) +}) diff --git a/packages/provider-web-search/tsconfig.json b/packages/provider-web-search/tsconfig.json new file mode 100644 index 0000000..65aaab0 --- /dev/null +++ b/packages/provider-web-search/tsconfig.json @@ -0,0 +1,7 @@ +{ + "extends": "../../tsconfig.base.json", + "compilerOptions": { + "rootDir": "./src" + }, + "include": ["src/**/*.ts"] +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 86a14dc..4dae320 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -212,6 +212,9 @@ importers: '@ainyc/canonry-provider-openai': specifier: workspace:* version: link:../provider-openai + '@ainyc/canonry-provider-web-search': + specifier: workspace:* + version: link:../provider-web-search '@types/better-sqlite3': specifier: ^7.6.13 version: 7.6.13 @@ -298,6 +301,12 @@ importers: specifier: ^6.0.0 version: 6.27.0(zod@4.3.6) + packages/provider-web-search: + dependencies: + '@ainyc/canonry-contracts': + specifier: workspace:* + version: link:../contracts + packages: '@ainyc/aeo-audit@1.3.2':