Skip to content

Commit 6f14af0

Browse files
committed
fix: code-verify retry logic + forced verdict followup
- Add 3x retry with linear backoff on LLM calls in the code-verify agent loop
- Add 3x retry on screenshot-analysis LLM calls
- When the agent doesn't call deliver_code_verdict, nudge it with a follow-up message
- After max iterations, force tool_choice=deliver_code_verdict as a last resort
- Throw on total failure so the queue retries instead of silently returning a `plausible: false` verdict
1 parent 8531303 commit 6f14af0

File tree

1 file changed

+124
-56
lines changed

1 file changed

+124
-56
lines changed

src/detection/code-verify.ts

Lines changed: 124 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -248,40 +248,61 @@ async function toolAnalyzeScreenshot(
248248
});
249249
}
250250

251-
const stream = await openai.chat.completions.create({
252-
model: VISION_MODEL,
253-
messages: [
254-
{ role: "system", content: SCREENSHOT_SYSTEM_PROMPT },
255-
{
256-
role: "user",
257-
content: [
258-
{
259-
type: "text",
260-
text: `Bug description: ${bugDescription.slice(0, 500)}\n\nAnalyze this screenshot:`,
261-
},
251+
const SCREENSHOT_MAX_RETRIES = 3;
252+
const SCREENSHOT_BASE_DELAY_MS = 1000;
253+
254+
for (let attempt = 1; attempt <= SCREENSHOT_MAX_RETRIES; attempt++) {
255+
try {
256+
const stream = await openai.chat.completions.create({
257+
model: VISION_MODEL,
258+
messages: [
259+
{ role: "system", content: SCREENSHOT_SYSTEM_PROMPT },
262260
{
263-
type: "image_url",
264-
image_url: { url: base64Url, detail: "high" },
261+
role: "user",
262+
content: [
263+
{
264+
type: "text",
265+
text: `Bug description: ${bugDescription.slice(0, 500)}\n\nAnalyze this screenshot:`,
266+
},
267+
{
268+
type: "image_url",
269+
image_url: { url: base64Url, detail: "high" },
270+
},
271+
],
265272
},
266273
],
267-
},
268-
],
269-
max_tokens: 300,
270-
temperature: 0,
271-
stream: true,
272-
});
274+
max_tokens: 300,
275+
temperature: 0,
276+
stream: true,
277+
});
273278

274-
const response = await collectStream(stream);
275-
const content = response.message.content ?? "";
276-
const jsonMatch = content.match(/\{[\s\S]*\}/);
277-
if (jsonMatch) {
278-
return jsonMatch[0];
279+
const response = await collectStream(stream);
280+
const content = response.message.content ?? "";
281+
const jsonMatch = content.match(/\{[\s\S]*\}/);
282+
if (jsonMatch) {
283+
return jsonMatch[0];
284+
}
285+
logger.warn({ url, attempt }, "Code-verify: could not parse vision response — retrying");
286+
if (attempt < SCREENSHOT_MAX_RETRIES) {
287+
await new Promise((r) => setTimeout(r, SCREENSHOT_BASE_DELAY_MS * attempt));
288+
continue;
289+
}
290+
return JSON.stringify({ valid: true, reasoning: "Could not parse vision response after retries", shows: "unknown" });
291+
} catch (err) {
292+
const msg = err instanceof Error ? err.message : String(err);
293+
logger.warn({ err: msg, url, attempt }, "Code-verify: screenshot analysis failed");
294+
if (attempt < SCREENSHOT_MAX_RETRIES) {
295+
await new Promise((r) => setTimeout(r, SCREENSHOT_BASE_DELAY_MS * attempt));
296+
continue;
297+
}
298+
return JSON.stringify({ valid: true, reasoning: `Vision analysis failed after retries: ${msg.slice(0, 100)}`, shows: "unknown" });
299+
}
279300
}
280-
return JSON.stringify({ valid: true, reasoning: "Could not parse vision response", shows: "unknown" });
301+
return JSON.stringify({ valid: true, reasoning: "Screenshot analysis exhausted retries", shows: "unknown" });
281302
} catch (err) {
282303
const msg = err instanceof Error ? err.message : String(err);
283-
logger.warn({ err: msg, url }, "Code-verify: screenshot analysis failed");
284-
return JSON.stringify({ valid: true, reasoning: `Vision analysis failed: ${msg.slice(0, 100)}`, shows: "unknown" });
304+
logger.warn({ err: msg, url }, "Code-verify: screenshot download/processing failed");
305+
return JSON.stringify({ valid: false, reasoning: `Screenshot processing failed: ${msg.slice(0, 100)}`, shows: "error" });
285306
}
286307
}
287308

@@ -473,22 +494,42 @@ Verify this bug: explore the code AND analyze all screenshots. Then call deliver
473494
{ role: "user", content: userMessage },
474495
];
475496

497+
const LLM_MAX_RETRIES = 3;
498+
const LLM_BASE_DELAY_MS = 1000;
499+
476500
for (let i = 0; i < CODE_VERIFY_MAX_ITERATIONS; i++) {
477-
let assembled: Awaited<ReturnType<typeof collectStream>>;
478-
try {
479-
const stream = await openai.chat.completions.create({
480-
model: LLM_SCORING_MODEL,
481-
messages,
482-
tools,
483-
tool_choice: "auto",
484-
temperature: 0,
485-
max_tokens: 2000,
486-
stream: true,
487-
});
488-
assembled = await collectStream(stream);
489-
} catch (err) {
490-
logger.error({ err, issueNumber, iteration: i }, "Code-verify: LLM call failed");
491-
return { plausible: false, confidence: 0.8, reasoning: "Code verification LLM call failed. Cannot confirm bug." };
501+
let assembled!: Awaited<ReturnType<typeof collectStream>>;
502+
503+
let llmSuccess = false;
504+
for (let attempt = 1; attempt <= LLM_MAX_RETRIES; attempt++) {
505+
try {
506+
const stream = await openai.chat.completions.create({
507+
model: LLM_SCORING_MODEL,
508+
messages,
509+
tools,
510+
tool_choice: "auto",
511+
temperature: 0,
512+
max_tokens: 2000,
513+
stream: true,
514+
});
515+
assembled = await collectStream(stream);
516+
llmSuccess = true;
517+
break;
518+
} catch (err) {
519+
const msg = err instanceof Error ? err.message : String(err);
520+
logger.warn(
521+
{ err: msg, issueNumber, iteration: i, attempt, maxRetries: LLM_MAX_RETRIES },
522+
`Code-verify: LLM call failed (attempt ${attempt}/${LLM_MAX_RETRIES})`,
523+
);
524+
if (attempt < LLM_MAX_RETRIES) {
525+
await new Promise((r) => setTimeout(r, LLM_BASE_DELAY_MS * attempt));
526+
}
527+
}
528+
}
529+
530+
if (!llmSuccess) {
531+
logger.error({ issueNumber, iteration: i }, "Code-verify: LLM call failed after all retries");
532+
throw new Error(`Code verification LLM call failed after ${LLM_MAX_RETRIES} retries for issue #${issueNumber}`);
492533
}
493534

494535
const msg = assembled.message;
@@ -566,21 +607,48 @@ Verify this bug: explore the code AND analyze all screenshots. Then call deliver
566607
continue;
567608
}
568609

569-
if (!msg.content) {
570-
messages.push({
571-
role: "user",
572-
content: "Continue investigating. Call deliver_code_verdict when ready.",
573-
});
574-
continue;
575-
}
610+
// Agent responded with text but no tool call — nudge it to deliver verdict
611+
messages.push({
612+
role: "user",
613+
content:
614+
"You MUST now call deliver_code_verdict with your findings. " +
615+
"Do not explain further — call the tool immediately.",
616+
});
617+
}
618+
619+
logger.warn({ issueNumber }, "Code-verify: agent did not deliver verdict — retrying with forced tool_choice");
576620

577-
break;
621+
// Final forced attempt: explicitly require deliver_code_verdict
622+
try {
623+
const stream = await openai.chat.completions.create({
624+
model: LLM_SCORING_MODEL,
625+
messages,
626+
tools,
627+
tool_choice: { type: "function", function: { name: "deliver_code_verdict" } },
628+
temperature: 0,
629+
max_tokens: 1500,
630+
stream: true,
631+
});
632+
const forced = await collectStream(stream);
633+
const tc = forced.message.tool_calls?.[0];
634+
if (tc && tc.function.name === "deliver_code_verdict") {
635+
let fnArgs: Record<string, unknown>;
636+
try { fnArgs = JSON.parse(tc.function.arguments); } catch { fnArgs = {}; }
637+
const result: CodeVerifyResult = {
638+
plausible: (fnArgs.plausible as boolean) ?? false,
639+
confidence: (fnArgs.confidence as number) ?? 0.5,
640+
reasoning: (fnArgs.reasoning as string) ?? "No reasoning (forced verdict).",
641+
codeEvidence: fnArgs.code_evidence as string | undefined,
642+
screenshotValid: fnArgs.screenshot_valid as boolean | undefined,
643+
screenshotReasoning: fnArgs.screenshot_reasoning as string | undefined,
644+
};
645+
logger.info({ issueNumber, plausible: result.plausible, confidence: result.confidence }, "Code-verify: forced verdict delivered");
646+
return result;
647+
}
648+
} catch (err) {
649+
const errMsg = err instanceof Error ? err.message : String(err);
650+
logger.error({ err: errMsg, issueNumber }, "Code-verify: forced verdict call failed");
578651
}
579652

580-
logger.warn({ issueNumber }, "Code-verify: agent did not deliver verdict");
581-
return {
582-
plausible: false,
583-
confidence: 0.8,
584-
reasoning: "Verification agent exhausted iterations without delivering a verdict. Bug unverified.",
585-
};
653+
throw new Error(`Code verification agent failed to deliver verdict for issue #${issueNumber} after ${CODE_VERIFY_MAX_ITERATIONS} iterations`);
586654
}

0 commit comments

Comments
 (0)