Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 23 additions & 12 deletions src/kapacitor/Commands/EvalCommand.cs
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,18 @@ public static async Task<int> HandleEval(string baseUrl, string sessionId, strin

using var httpClient = await HttpClientExtensions.CreateAuthenticatedClientAsync();

// Session IDs are typically UUIDs but meta-session slugs are free-form
// user input; escape once and reuse for every session-scoped URL so
// reserved path characters don't corrupt the request.
var encodedSessionId = Uri.EscapeDataString(sessionId);

// 1. Fetch the compacted eval context. We keep the raw JSON for
// embedding in judge prompts and parse it once for progress logging.
string traceJson;
EvalContextResult? context;

try {
var url = $"{baseUrl}/api/sessions/{sessionId}/eval-context"
var url = $"{baseUrl}/api/sessions/{encodedSessionId}/eval-context"
+ (chain ? "?chain=true" : "")
+ (thresholdBytes is { } t ? (chain ? "&" : "?") + $"threshold={t}" : "");

Expand Down Expand Up @@ -91,10 +96,12 @@ public static async Task<int> HandleEval(string baseUrl, string sessionId, strin
}

// 2. Fetch retained judge facts per category so we can inject them
// into each judge's prompt as "known patterns" — DEV-1434.
// Failures don't abort the run; the judges just won't see prior
// patterns this time.
var knownFactsByCategory = await FetchAllJudgeFactsAsync(httpClient, baseUrl);
// into each judge's prompt as "known patterns" — DEV-1434 /
// DEV-1438. Facts are scoped to the session's repo server-side,
// so sessions without a detected repository return empty lists
// and the judges simply see no prior patterns. Failures don't
// abort the run.
var knownFactsByCategory = await FetchAllJudgeFactsAsync(httpClient, baseUrl, encodedSessionId);

// 3. Run each question in sequence. Failures on individual questions
// are logged but don't abort the whole run — a partial result set
Expand Down Expand Up @@ -139,7 +146,7 @@ public static async Task<int> HandleEval(string baseUrl, string sessionId, strin

// If the judge emitted a retain_fact, persist it for future evals.
if (ExtractRetainFact(result.Result) is { } retainedFact) {
await PostJudgeFactAsync(httpClient, baseUrl, q.Category, retainedFact, context.SessionId, evalRunId);
await PostJudgeFactAsync(httpClient, baseUrl, encodedSessionId, q.Category, retainedFact, evalRunId);
}
}

Expand All @@ -156,7 +163,7 @@ public static async Task<int> HandleEval(string baseUrl, string sessionId, strin
Render(aggregate, sessionId);

// 5. Persist to the server.
var postUrl = $"{baseUrl}/api/sessions/{sessionId}/evals";
var postUrl = $"{baseUrl}/api/sessions/{encodedSessionId}/evals";
var payloadJson = JsonSerializer.Serialize(aggregate, KapacitorJsonContext.Default.SessionEvalCompletedPayload);
using var httpContent = new StringContent(payloadJson, Encoding.UTF8, "application/json");

Expand Down Expand Up @@ -244,12 +251,16 @@ internal static string FormatKnownPatterns(List<JudgeFact> facts) {
}
}

static async Task<Dictionary<string, List<JudgeFact>>> FetchAllJudgeFactsAsync(HttpClient httpClient, string baseUrl) {
/// <param name="encodedSessionId">Already URL-path-escaped — see HandleEval.</param>
static async Task<Dictionary<string, List<JudgeFact>>> FetchAllJudgeFactsAsync(HttpClient httpClient, string baseUrl, string encodedSessionId) {
var result = new Dictionary<string, List<JudgeFact>>();

foreach (var category in Categories) {
try {
using var resp = await httpClient.GetWithRetryAsync($"{baseUrl}/api/judge-facts?category={category}");
// Categories are internal constants (safe ASCII), but escape
// for hygiene — costs nothing and insulates the URL from any
// future category that might include unusual characters.
using var resp = await httpClient.GetWithRetryAsync($"{baseUrl}/api/sessions/{encodedSessionId}/judge-facts?category={Uri.EscapeDataString(category)}");
if (!resp.IsSuccessStatusCode) {
Log($"Failed to fetch judge facts for {category}: HTTP {(int)resp.StatusCode}");

Expand All @@ -268,19 +279,19 @@ static async Task<Dictionary<string, List<JudgeFact>>> FetchAllJudgeFactsAsync(H
return result;
}

static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, string category, string fact, string sessionId, string evalRunId) {
/// <param name="encodedSessionId">Already URL-path-escaped — see HandleEval.</param>
static async Task PostJudgeFactAsync(HttpClient httpClient, string baseUrl, string encodedSessionId, string category, string fact, string evalRunId) {
var payload = new JudgeFactPayload {
Category = category,
Fact = fact,
SourceSessionId = sessionId,
SourceEvalRunId = evalRunId
};

var payloadJson = JsonSerializer.Serialize(payload, KapacitorJsonContext.Default.JudgeFactPayload);
using var content = new StringContent(payloadJson, Encoding.UTF8, "application/json");

try {
using var resp = await httpClient.PostWithRetryAsync($"{baseUrl}/api/judge-facts", content);
using var resp = await httpClient.PostWithRetryAsync($"{baseUrl}/api/sessions/{encodedSessionId}/judge-facts", content);
Log(
resp.IsSuccessStatusCode
? $" retained fact for category {category}"
Expand Down
14 changes: 6 additions & 8 deletions src/kapacitor/Models.cs
Original file line number Diff line number Diff line change
Expand Up @@ -258,21 +258,19 @@ record EvalCategoryResult {
public List<EvalQuestionVerdict> Questions { get; init; } = [];
}

// Cross-eval memory — DEV-1434. Judges may optionally emit a retain_fact
// when they spot a cross-cutting pattern; the CLI POSTs it to the server's
// judge-facts endpoint which appends to a per-category stream. Facts from
// past evaluations are fetched at eval startup and injected into each
// judge's prompt as "known patterns".
// Cross-eval memory — DEV-1434 / DEV-1438. Judges may optionally emit a
// retain_fact when they spot a cross-cutting pattern; the CLI POSTs it to
// the session-scoped endpoint and the server derives repo scope from the
// session (facts live on JudgeFacts-repo-{repoHash}-{category} streams).
// Facts accumulated on the same repo by any team member are fetched at
// eval startup and injected into each judge's prompt as "known patterns".
record JudgeFactPayload {
[JsonPropertyName("category")]
public required string Category { get; init; }

[JsonPropertyName("fact")]
public required string Fact { get; init; }

[JsonPropertyName("source_session_id")]
public required string SourceSessionId { get; init; }

[JsonPropertyName("source_eval_run_id")]
public required string SourceEvalRunId { get; init; }
}
Expand Down
15 changes: 8 additions & 7 deletions src/kapacitor/Resources/prompt-eval-question.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ Subagent activity (if any) carries `agent_id` / `agent_type`. Same-timestamp eve
{TRACE_JSON}
```

## Known patterns
## Known patterns for this project

Previous evaluations may have retained cross-cutting facts about the user, repo, or coding style under this category. Treat them as prior context — corroborating evidence if present, but do not punish the agent for a pattern that isn't actually visible in this session's trace.
Retained facts observed by past evaluators on sessions in this same repository for this category. Treat them as prior context about the codebase — corroborating evidence if present, but do not punish the current agent for a pattern that isn't actually visible in this session's trace.

{KNOWN_PATTERNS}

Expand All @@ -53,16 +53,17 @@ Respond with ONLY a valid JSON object (no markdown fences, no commentary, no pre

### When to emit `retain_fact`

Only retain facts that are GENERALIZABLE — patterns about the user, repo, or style that would help a future evaluator judging a *different* session:
Retained facts are **project-level** — they are shared across every evaluator working on sessions in this same repository. Only retain patterns about the **codebase, its conventions, or its recurring failure modes** that would help future evaluators judging a *different* session in this same project:

- ✅ "User tends to force-push with uncommitted work still in the tree"
- ✅ "This repo's tests rely on Testcontainers, so missing Docker is a frequent failure mode"
- ✅ "Agent consistently writes tests before the feature, not after"
- ✅ "This codebase prefers handler-per-file over mega-handlers"
- ✅ "Tests in this repo depend on env var `X` being set"
- ❌ "Alice tends to force-push" (individual-level, not codebase-level)
- ❌ "Session ran rm -rf /tmp/cache" (single observation — not a pattern)
- ❌ "This question scored 3" (not a pattern about behavior)
- ❌ "This question scored 3" (not a pattern about the codebase)
- ❌ A restatement of the finding for this question

If nothing is worth generalizing, emit `"retain_fact": null`. Do NOT emit a fact just to have one — retained facts are injected into every future judge prompt and noise dilutes their usefulness.
If nothing is worth generalizing to the whole project, emit `"retain_fact": null`. Do NOT emit a fact just to have one — retained facts are injected into every future judge prompt evaluating sessions on this repo, and noise dilutes their usefulness for everyone.

## Scoring

Expand Down
Loading