From b0dc3240856534207401e5a8078bea50a5a81c6f Mon Sep 17 00:00:00 2001 From: Alexey Zimarev Date: Mon, 13 Apr 2026 15:37:48 +0200 Subject: [PATCH 1/2] [DEV-1438] wire CLI to session-scoped judge-facts endpoints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Matches the server-side pivot in kurrent-io/Kurrent.Capacitor#476: judge facts are now scoped to the session's **project (repo)** and shared among all team members working on it, rather than siloed per-user. Client-side changes: - Endpoint URLs: GET /api/judge-facts?category=X -> GET /api/sessions/{id}/judge-facts?category=X POST /api/judge-facts -> POST /api/sessions/{id}/judge-facts - JudgeFactPayload drops source_session_id (URL-bound now). - Prompt template rewritten to frame retained facts as *project-level* observations, with updated examples and an explicit anti-example ("Alice tends to force-push" — individual, not codebase-level). - FetchAllJudgeFactsAsync takes sessionId so per-category GETs hit the correct session-scoped URL. Sessions without a detected repo get empty fact lists from the server, so judges silently see "(no patterns retained yet)" — no special-casing needed on the CLI side. Existing EvalCommand tests continue to pass without changes (ParseVerdict, ExtractRetainFact, Aggregate, FormatKnownPatterns, BuildQuestionPrompt are all server-independent). Full suite 205/205, AOT publish clean. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/kapacitor/Commands/EvalCommand.cs | 21 ++++++++++--------- src/kapacitor/Models.cs | 14 ++++++------- .../Resources/prompt-eval-question.txt | 15 ++++++------- 3 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/kapacitor/Commands/EvalCommand.cs b/src/kapacitor/Commands/EvalCommand.cs index 0662f5e..a23d06b 100644 --- a/src/kapacitor/Commands/EvalCommand.cs +++ b/src/kapacitor/Commands/EvalCommand.cs @@ -91,10 +91,12 @@ public static async Task HandleEval(string baseUrl, string sessionId, strin } // 2. Fetch retained judge facts per category so we can inject them - // into each judge's prompt as "known patterns" — DEV-1434. - // Failures don't abort the run; the judges just won't see prior - // patterns this time. - var knownFactsByCategory = await FetchAllJudgeFactsAsync(httpClient, baseUrl); + // into each judge's prompt as "known patterns" — DEV-1434 / + // DEV-1438. Facts are scoped to the session's repo server-side, + // so sessions without a detected repository return empty lists + // and the judges simply see no prior patterns. Failures don't + // abort the run. + var knownFactsByCategory = await FetchAllJudgeFactsAsync(httpClient, baseUrl, sessionId); // 3. Run each question in sequence. Failures on individual questions // are logged but don't abort the whole run — a partial result set @@ -139,7 +141,7 @@ public static async Task HandleEval(string baseUrl, string sessionId, strin // If the judge emitted a retain_fact, persist it for future evals. 
if (ExtractRetainFact(result.Result) is { } retainedFact) { - await PostJudgeFactAsync(httpClient, baseUrl, q.Category, retainedFact, context.SessionId, evalRunId); + await PostJudgeFactAsync(httpClient, baseUrl, sessionId, q.Category, retainedFact, evalRunId); } } @@ -244,12 +246,12 @@ internal static string FormatKnownPatterns(List facts) { } } - static async Task>> FetchAllJudgeFactsAsync(HttpClient httpClient, string baseUrl) { + static async Task>> FetchAllJudgeFactsAsync(HttpClient httpClient, string baseUrl, string sessionId) { var result = new Dictionary>(); foreach (var category in Categories) { try { - using var resp = await httpClient.GetWithRetryAsync($"{baseUrl}/api/judge-facts?category={category}"); + using var resp = await httpClient.GetWithRetryAsync($"{baseUrl}/api/sessions/{sessionId}/judge-facts?category={category}"); if (!resp.IsSuccessStatusCode) { Log($"Failed to fetch judge facts for {category}: HTTP {(int)resp.StatusCode}"); @@ -268,11 +270,10 @@ static async Task>> FetchAllJudgeFactsAsync(H return result; } - static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, string category, string fact, string sessionId, string evalRunId) { + static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, string sessionId, string category, string fact, string evalRunId) { var payload = new JudgeFactPayload { Category = category, Fact = fact, - SourceSessionId = sessionId, SourceEvalRunId = evalRunId }; @@ -280,7 +281,7 @@ static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, stri using var content = new StringContent(payloadJson, Encoding.UTF8, "application/json"); try { - using var resp = await httpClient.PostWithRetryAsync($"{baseUrl}/api/judge-facts", content); + using var resp = await httpClient.PostWithRetryAsync($"{baseUrl}/api/sessions/{sessionId}/judge-facts", content); Log( resp.IsSuccessStatusCode ? 
$" retained fact for category {category}" diff --git a/src/kapacitor/Models.cs b/src/kapacitor/Models.cs index ca7aaef..1485e8f 100644 --- a/src/kapacitor/Models.cs +++ b/src/kapacitor/Models.cs @@ -258,11 +258,12 @@ record EvalCategoryResult { public List Questions { get; init; } = []; } -// Cross-eval memory — DEV-1434. Judges may optionally emit a retain_fact -// when they spot a cross-cutting pattern; the CLI POSTs it to the server's -// judge-facts endpoint which appends to a per-category stream. Facts from -// past evaluations are fetched at eval startup and injected into each -// judge's prompt as "known patterns". +// Cross-eval memory — DEV-1434 / DEV-1438. Judges may optionally emit a +// retain_fact when they spot a cross-cutting pattern; the CLI POSTs it to +// the session-scoped endpoint and the server derives repo scope from the +// session (facts live on JudgeFacts-repo-{repoHash}-{category} streams). +// Facts accumulated on the same repo by any team member are fetched at +// eval startup and injected into each judge's prompt as "known patterns". record JudgeFactPayload { [JsonPropertyName("category")] public required string Category { get; init; } @@ -270,9 +271,6 @@ record JudgeFactPayload { [JsonPropertyName("fact")] public required string Fact { get; init; } - [JsonPropertyName("source_session_id")] - public required string SourceSessionId { get; init; } - [JsonPropertyName("source_eval_run_id")] public required string SourceEvalRunId { get; init; } } diff --git a/src/kapacitor/Resources/prompt-eval-question.txt b/src/kapacitor/Resources/prompt-eval-question.txt index be95539..2219693 100644 --- a/src/kapacitor/Resources/prompt-eval-question.txt +++ b/src/kapacitor/Resources/prompt-eval-question.txt @@ -24,9 +24,9 @@ Subagent activity (if any) carries `agent_id` / `agent_type`. 
Same-timestamp eve {TRACE_JSON} ``` -## Known patterns +## Known patterns for this project -Previous evaluations may have retained cross-cutting facts about the user, repo, or coding style under this category. Treat them as prior context — corroborating evidence if present, but do not punish the agent for a pattern that isn't actually visible in this session's trace. +Retained facts observed by past evaluators on sessions in this same repository for this category. Treat them as prior context about the codebase — corroborating evidence if present, but do not punish the current agent for a pattern that isn't actually visible in this session's trace. {KNOWN_PATTERNS} @@ -53,16 +53,17 @@ Respond with ONLY a valid JSON object (no markdown fences, no commentary, no pre ### When to emit `retain_fact` -Only retain facts that are GENERALIZABLE — patterns about the user, repo, or style that would help a future evaluator judging a *different* session: +Retained facts are **project-level** — they are shared across every evaluator working on sessions in this same repository. Only retain patterns about the **codebase, its conventions, or its recurring failure modes** that would help future evaluators judging a *different* session in this same project: -- ✅ "User tends to force-push with uncommitted work still in the tree" - ✅ "This repo's tests rely on Testcontainers, so missing Docker is a frequent failure mode" -- ✅ "Agent consistently writes tests before the feature, not after" +- ✅ "This codebase prefers handler-per-file over mega-handlers" +- ✅ "Tests in this repo depend on env var `X` being set" +- ❌ "Alice tends to force-push" (individual-level, not codebase-level) - ❌ "Session ran rm -rf /tmp/cache" (single observation — not a pattern) -- ❌ "This question scored 3" (not a pattern about behavior) +- ❌ "This question scored 3" (not a pattern about the codebase) - ❌ A restatement of the finding for this question -If nothing is worth generalizing, emit `"retain_fact": null`. 
Do NOT emit a fact just to have one — retained facts are injected into every future judge prompt and noise dilutes their usefulness. +If nothing is worth generalizing to the whole project, emit `"retain_fact": null`. Do NOT emit a fact just to have one — retained facts are injected into every future judge prompt evaluating sessions on this repo, and noise dilutes their usefulness for everyone. ## Scoring From 297d23cb63608d8fb3d77599f61ee8f223615f7a Mon Sep 17 00:00:00 2001 From: Alexey Zimarev Date: Mon, 13 Apr 2026 16:12:55 +0200 Subject: [PATCH 2/2] [DEV-1438] URL-escape sessionId when embedding in session-scoped paths Review response on PR #13: FetchAllJudgeFactsAsync (and the other session-scoped URLs in this file) interpolated sessionId directly into the path without escaping. Most sessionIds are GUIDs, but meta-session slugs are free-form and KAPACITOR_SESSION_ID could in principle carry anything, so reserved URL characters would corrupt the path. Encode once at the top of HandleEval and reuse the escaped form for the four session-scoped URLs in EvalCommand: /eval-context (existing), /evals (existing), /judge-facts GET, /judge-facts POST. The category query parameter is also escaped for hygiene even though the canonical four categories are safe ASCII. Note: the same raw-interpolation pattern exists in other CLI commands (RecapCommand, WhatsDoneCommand, etc.). Not fixing those here to keep the PR focused; a follow-up could centralize URL construction. Full suite 205/205, AOT publish clean. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/kapacitor/Commands/EvalCommand.cs | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/kapacitor/Commands/EvalCommand.cs b/src/kapacitor/Commands/EvalCommand.cs index a23d06b..973bef5 100644 --- a/src/kapacitor/Commands/EvalCommand.cs +++ b/src/kapacitor/Commands/EvalCommand.cs @@ -40,13 +40,18 @@ public static async Task HandleEval(string baseUrl, string sessionId, strin using var httpClient = await HttpClientExtensions.CreateAuthenticatedClientAsync(); + // Session IDs are typically UUIDs but meta-session slugs are free-form + // user input; escape once and reuse for every session-scoped URL so + // reserved path characters don't corrupt the request. + var encodedSessionId = Uri.EscapeDataString(sessionId); + // 1. Fetch the compacted eval context. We keep the raw JSON for // embedding in judge prompts and parse it once for progress logging. string traceJson; EvalContextResult? context; try { - var url = $"{baseUrl}/api/sessions/{sessionId}/eval-context" + var url = $"{baseUrl}/api/sessions/{encodedSessionId}/eval-context" + (chain ? "?chain=true" : "") + (thresholdBytes is { } t ? (chain ? "&" : "?") + $"threshold={t}" : ""); @@ -96,7 +101,7 @@ public static async Task HandleEval(string baseUrl, string sessionId, strin // so sessions without a detected repository return empty lists // and the judges simply see no prior patterns. Failures don't // abort the run. - var knownFactsByCategory = await FetchAllJudgeFactsAsync(httpClient, baseUrl, sessionId); + var knownFactsByCategory = await FetchAllJudgeFactsAsync(httpClient, baseUrl, encodedSessionId); // 3. Run each question in sequence. Failures on individual questions // are logged but don't abort the whole run — a partial result set @@ -141,7 +146,7 @@ public static async Task HandleEval(string baseUrl, string sessionId, strin // If the judge emitted a retain_fact, persist it for future evals. 
if (ExtractRetainFact(result.Result) is { } retainedFact) { - await PostJudgeFactAsync(httpClient, baseUrl, sessionId, q.Category, retainedFact, evalRunId); + await PostJudgeFactAsync(httpClient, baseUrl, encodedSessionId, q.Category, retainedFact, evalRunId); } } @@ -158,7 +163,7 @@ public static async Task HandleEval(string baseUrl, string sessionId, strin Render(aggregate, sessionId); // 5. Persist to the server. - var postUrl = $"{baseUrl}/api/sessions/{sessionId}/evals"; + var postUrl = $"{baseUrl}/api/sessions/{encodedSessionId}/evals"; var payloadJson = JsonSerializer.Serialize(aggregate, KapacitorJsonContext.Default.SessionEvalCompletedPayload); using var httpContent = new StringContent(payloadJson, Encoding.UTF8, "application/json"); @@ -246,12 +251,16 @@ internal static string FormatKnownPatterns(List facts) { } } - static async Task>> FetchAllJudgeFactsAsync(HttpClient httpClient, string baseUrl, string sessionId) { + /// Already URL-path-escaped — see HandleEval. + static async Task>> FetchAllJudgeFactsAsync(HttpClient httpClient, string baseUrl, string encodedSessionId) { var result = new Dictionary>(); foreach (var category in Categories) { try { - using var resp = await httpClient.GetWithRetryAsync($"{baseUrl}/api/sessions/{sessionId}/judge-facts?category={category}"); + // Categories are internal constants (safe ASCII), but escape + // for hygiene — costs nothing and insulates the URL from any + // future category that might include unusual characters. 
+ using var resp = await httpClient.GetWithRetryAsync($"{baseUrl}/api/sessions/{encodedSessionId}/judge-facts?category={Uri.EscapeDataString(category)}"); if (!resp.IsSuccessStatusCode) { Log($"Failed to fetch judge facts for {category}: HTTP {(int)resp.StatusCode}"); @@ -270,7 +279,8 @@ static async Task>> FetchAllJudgeFactsAsync(H return result; } - static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, string sessionId, string category, string fact, string evalRunId) { + /// Already URL-path-escaped — see HandleEval. + static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, string encodedSessionId, string category, string fact, string evalRunId) { var payload = new JudgeFactPayload { Category = category, Fact = fact, @@ -281,7 +291,7 @@ static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, stri using var content = new StringContent(payloadJson, Encoding.UTF8, "application/json"); try { - using var resp = await httpClient.PostWithRetryAsync($"{baseUrl}/api/sessions/{sessionId}/judge-facts", content); + using var resp = await httpClient.PostWithRetryAsync($"{baseUrl}/api/sessions/{encodedSessionId}/judge-facts", content); Log( resp.IsSuccessStatusCode ? $" retained fact for category {category}"