Skip to content

Commit c0ed534

Browse files
committed
Add baseline-vs-MCP IR aggregates and per-suite cost tables
1 parent d600dda commit c0ed534

File tree

5 files changed

+568
-0
lines changed

5 files changed

+568
-0
lines changed

docs/BLOG_POST.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,13 @@ I built an information retrieval evaluation pipeline alongside the task scoring
155155

156156
The refreshed retrieval pipeline run confirms moderate retrieval quality overall (file recall 0.460, MRR 0.364), but a large fraction of traces still lack mapped ground truth files (488/799), which limits configuration-level retrieval comparisons.
157157

158+
On the computable subset, aggregated baseline vs MCP retrieval metrics are:
159+
160+
| Config Type | n | File Recall | MRR | MAP | Context Efficiency |
161+
|-------------|---|-------------|-----|-----|--------------------|
162+
| baseline | 132 | 0.330 | 0.346 | 0.231 | 0.184 |
163+
| mcp | 179 | 0.556 | 0.378 | 0.267 | 0.204 |
164+
158165
But better retrieval doesn't always mean better outcomes. I'm still investigating this, but it is likely that finding the right files is necessary but not sufficient. The agent still has to correctly apply what it finds, and removing local code availability from the MCP run environment hurts the local code modification step more in some tasks than in others.
159166

160167
## Patterns in the Retrieval-Outcome Pairing Data
@@ -175,6 +182,8 @@ Let's take a break from whatever voodoo variables control reward outcomes and ta
175182

176183
This updated snapshot indicates MCP token/tool usage overhead is currently dominating cost in the analysis set.
177184

185+
Suite-level cost is mixed: MCP is cheaper on most Org suites (8 of 11; for example crossorg −$0.062/task and incident −$0.048/task) but more expensive on nearly all SDLC suites (8 of 9; for example refactor +$0.398/task and feature +$0.211/task). The full per-suite cost table is in the technical report.
186+
178187
Speed tells an even cleaner story:
179188

180189
| Metric | Baseline Mean | MCP Mean | Delta |

docs/analysis/analysis_refresh_tables_20260303.json

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,5 +433,179 @@
433433
"mean_reward_delta": 0.048,
434434
"reward_delta_variance": 0.059717
435435
}
436+
},
437+
"ir_by_type": {
438+
"baseline": {
439+
"n": 132,
440+
"file_recall": 0.3295,
441+
"mrr": 0.3462,
442+
"map_score": 0.2307,
443+
"context_efficiency": 0.1843
444+
},
445+
"mcp": {
446+
"n": 179,
447+
"file_recall": 0.5558,
448+
"mrr": 0.3778,
449+
"map_score": 0.2667,
450+
"context_efficiency": 0.2043
451+
}
452+
},
453+
"ir_by_config": {
454+
"baseline-local-direct": {
455+
"n": 132,
456+
"file_recall": 0.3295,
457+
"mrr": 0.3462,
458+
"map_score": 0.2307,
459+
"context_efficiency": 0.1843
460+
},
461+
"mcp-remote-direct": {
462+
"n": 179,
463+
"file_recall": 0.5558,
464+
"mrr": 0.3778,
465+
"map_score": 0.2667,
466+
"context_efficiency": 0.2043
467+
}
468+
},
469+
"cost_by_suite": {
470+
"csb_org_compliance": {
471+
"n": 18,
472+
"baseline_cost_mean_usd": 0.2679,
473+
"mcp_cost_mean_usd": 0.2521,
474+
"delta_cost_mean_usd": -0.0158,
475+
"delta_cost_variance": 0.003486
476+
},
477+
"csb_org_crossorg": {
478+
"n": 15,
479+
"baseline_cost_mean_usd": 0.2756,
480+
"mcp_cost_mean_usd": 0.2136,
481+
"delta_cost_mean_usd": -0.062,
482+
"delta_cost_variance": 0.017902
483+
},
484+
"csb_org_crossrepo": {
485+
"n": 14,
486+
"baseline_cost_mean_usd": 0.2575,
487+
"mcp_cost_mean_usd": 0.2523,
488+
"delta_cost_mean_usd": -0.0052,
489+
"delta_cost_variance": 0.005375
490+
},
491+
"csb_org_crossrepo_tracing": {
492+
"n": 22,
493+
"baseline_cost_mean_usd": 0.2478,
494+
"mcp_cost_mean_usd": 0.2187,
495+
"delta_cost_mean_usd": -0.0292,
496+
"delta_cost_variance": 0.003282
497+
},
498+
"csb_org_domain": {
499+
"n": 20,
500+
"baseline_cost_mean_usd": 0.2108,
501+
"mcp_cost_mean_usd": 0.2258,
502+
"delta_cost_mean_usd": 0.015,
503+
"delta_cost_variance": 0.003268
504+
},
505+
"csb_org_incident": {
506+
"n": 20,
507+
"baseline_cost_mean_usd": 0.2465,
508+
"mcp_cost_mean_usd": 0.1989,
509+
"delta_cost_mean_usd": -0.0476,
510+
"delta_cost_variance": 0.007914
511+
},
512+
"csb_org_migration": {
513+
"n": 26,
514+
"baseline_cost_mean_usd": 0.2534,
515+
"mcp_cost_mean_usd": 0.2501,
516+
"delta_cost_mean_usd": -0.0033,
517+
"delta_cost_variance": 0.009546
518+
},
519+
"csb_org_onboarding": {
520+
"n": 28,
521+
"baseline_cost_mean_usd": 0.1029,
522+
"mcp_cost_mean_usd": 0.1049,
523+
"delta_cost_mean_usd": 0.002,
524+
"delta_cost_variance": 0.00086
525+
},
526+
"csb_org_org": {
527+
"n": 15,
528+
"baseline_cost_mean_usd": 0.2362,
529+
"mcp_cost_mean_usd": 0.2193,
530+
"delta_cost_mean_usd": -0.0169,
531+
"delta_cost_variance": 0.00171
532+
},
533+
"csb_org_platform": {
534+
"n": 18,
535+
"baseline_cost_mean_usd": 0.194,
536+
"mcp_cost_mean_usd": 0.2149,
537+
"delta_cost_mean_usd": 0.0209,
538+
"delta_cost_variance": 0.001999
539+
},
540+
"csb_org_security": {
541+
"n": 24,
542+
"baseline_cost_mean_usd": 0.2167,
543+
"mcp_cost_mean_usd": 0.2146,
544+
"delta_cost_mean_usd": -0.002,
545+
"delta_cost_variance": 0.003105
546+
},
547+
"csb_sdlc_debug": {
548+
"n": 18,
549+
"baseline_cost_mean_usd": 0.3669,
550+
"mcp_cost_mean_usd": 0.4569,
551+
"delta_cost_mean_usd": 0.0901,
552+
"delta_cost_variance": 0.02381
553+
},
554+
"csb_sdlc_design": {
555+
"n": 14,
556+
"baseline_cost_mean_usd": 0.41,
557+
"mcp_cost_mean_usd": 0.359,
558+
"delta_cost_mean_usd": -0.051,
559+
"delta_cost_variance": 0.097988
560+
},
561+
"csb_sdlc_document": {
562+
"n": 13,
563+
"baseline_cost_mean_usd": 0.2669,
564+
"mcp_cost_mean_usd": 0.2974,
565+
"delta_cost_mean_usd": 0.0305,
566+
"delta_cost_variance": 0.01439
567+
},
568+
"csb_sdlc_feature": {
569+
"n": 23,
570+
"baseline_cost_mean_usd": 0.4965,
571+
"mcp_cost_mean_usd": 0.7079,
572+
"delta_cost_mean_usd": 0.2114,
573+
"delta_cost_variance": 0.183988
574+
},
575+
"csb_sdlc_fix": {
576+
"n": 26,
577+
"baseline_cost_mean_usd": 0.5997,
578+
"mcp_cost_mean_usd": 0.7057,
579+
"delta_cost_mean_usd": 0.1059,
580+
"delta_cost_variance": 0.06587
581+
},
582+
"csb_sdlc_refactor": {
583+
"n": 15,
584+
"baseline_cost_mean_usd": 0.3194,
585+
"mcp_cost_mean_usd": 0.7173,
586+
"delta_cost_mean_usd": 0.398,
587+
"delta_cost_variance": 0.147469
588+
},
589+
"csb_sdlc_secure": {
590+
"n": 12,
591+
"baseline_cost_mean_usd": 0.4825,
592+
"mcp_cost_mean_usd": 0.5657,
593+
"delta_cost_mean_usd": 0.0832,
594+
"delta_cost_variance": 0.030859
595+
},
596+
"csb_sdlc_test": {
597+
"n": 18,
598+
"baseline_cost_mean_usd": 0.2641,
599+
"mcp_cost_mean_usd": 0.2976,
600+
"delta_cost_mean_usd": 0.0335,
601+
"delta_cost_variance": 0.015625
602+
},
603+
"csb_sdlc_understand": {
604+
"n": 10,
605+
"baseline_cost_mean_usd": 0.3519,
606+
"mcp_cost_mean_usd": 0.4475,
607+
"delta_cost_mean_usd": 0.0956,
608+
"delta_cost_variance": 0.022037
609+
}
436610
}
437611
}

docs/analysis/analysis_set_metrics_20260303.json

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,5 +433,179 @@
433433
"mean_reward_delta": 0.048,
434434
"reward_delta_variance": 0.059717
435435
}
436+
},
437+
"ir_by_type": {
438+
"baseline": {
439+
"n": 132,
440+
"file_recall": 0.3295,
441+
"mrr": 0.3462,
442+
"map_score": 0.2307,
443+
"context_efficiency": 0.1843
444+
},
445+
"mcp": {
446+
"n": 179,
447+
"file_recall": 0.5558,
448+
"mrr": 0.3778,
449+
"map_score": 0.2667,
450+
"context_efficiency": 0.2043
451+
}
452+
},
453+
"ir_by_config": {
454+
"baseline-local-direct": {
455+
"n": 132,
456+
"file_recall": 0.3295,
457+
"mrr": 0.3462,
458+
"map_score": 0.2307,
459+
"context_efficiency": 0.1843
460+
},
461+
"mcp-remote-direct": {
462+
"n": 179,
463+
"file_recall": 0.5558,
464+
"mrr": 0.3778,
465+
"map_score": 0.2667,
466+
"context_efficiency": 0.2043
467+
}
468+
},
469+
"cost_by_suite": {
470+
"csb_org_compliance": {
471+
"n": 18,
472+
"baseline_cost_mean_usd": 0.2679,
473+
"mcp_cost_mean_usd": 0.2521,
474+
"delta_cost_mean_usd": -0.0158,
475+
"delta_cost_variance": 0.003486
476+
},
477+
"csb_org_crossorg": {
478+
"n": 15,
479+
"baseline_cost_mean_usd": 0.2756,
480+
"mcp_cost_mean_usd": 0.2136,
481+
"delta_cost_mean_usd": -0.062,
482+
"delta_cost_variance": 0.017902
483+
},
484+
"csb_org_crossrepo": {
485+
"n": 14,
486+
"baseline_cost_mean_usd": 0.2575,
487+
"mcp_cost_mean_usd": 0.2523,
488+
"delta_cost_mean_usd": -0.0052,
489+
"delta_cost_variance": 0.005375
490+
},
491+
"csb_org_crossrepo_tracing": {
492+
"n": 22,
493+
"baseline_cost_mean_usd": 0.2478,
494+
"mcp_cost_mean_usd": 0.2187,
495+
"delta_cost_mean_usd": -0.0292,
496+
"delta_cost_variance": 0.003282
497+
},
498+
"csb_org_domain": {
499+
"n": 20,
500+
"baseline_cost_mean_usd": 0.2108,
501+
"mcp_cost_mean_usd": 0.2258,
502+
"delta_cost_mean_usd": 0.015,
503+
"delta_cost_variance": 0.003268
504+
},
505+
"csb_org_incident": {
506+
"n": 20,
507+
"baseline_cost_mean_usd": 0.2465,
508+
"mcp_cost_mean_usd": 0.1989,
509+
"delta_cost_mean_usd": -0.0476,
510+
"delta_cost_variance": 0.007914
511+
},
512+
"csb_org_migration": {
513+
"n": 26,
514+
"baseline_cost_mean_usd": 0.2534,
515+
"mcp_cost_mean_usd": 0.2501,
516+
"delta_cost_mean_usd": -0.0033,
517+
"delta_cost_variance": 0.009546
518+
},
519+
"csb_org_onboarding": {
520+
"n": 28,
521+
"baseline_cost_mean_usd": 0.1029,
522+
"mcp_cost_mean_usd": 0.1049,
523+
"delta_cost_mean_usd": 0.002,
524+
"delta_cost_variance": 0.00086
525+
},
526+
"csb_org_org": {
527+
"n": 15,
528+
"baseline_cost_mean_usd": 0.2362,
529+
"mcp_cost_mean_usd": 0.2193,
530+
"delta_cost_mean_usd": -0.0169,
531+
"delta_cost_variance": 0.00171
532+
},
533+
"csb_org_platform": {
534+
"n": 18,
535+
"baseline_cost_mean_usd": 0.194,
536+
"mcp_cost_mean_usd": 0.2149,
537+
"delta_cost_mean_usd": 0.0209,
538+
"delta_cost_variance": 0.001999
539+
},
540+
"csb_org_security": {
541+
"n": 24,
542+
"baseline_cost_mean_usd": 0.2167,
543+
"mcp_cost_mean_usd": 0.2146,
544+
"delta_cost_mean_usd": -0.002,
545+
"delta_cost_variance": 0.003105
546+
},
547+
"csb_sdlc_debug": {
548+
"n": 18,
549+
"baseline_cost_mean_usd": 0.3669,
550+
"mcp_cost_mean_usd": 0.4569,
551+
"delta_cost_mean_usd": 0.0901,
552+
"delta_cost_variance": 0.02381
553+
},
554+
"csb_sdlc_design": {
555+
"n": 14,
556+
"baseline_cost_mean_usd": 0.41,
557+
"mcp_cost_mean_usd": 0.359,
558+
"delta_cost_mean_usd": -0.051,
559+
"delta_cost_variance": 0.097988
560+
},
561+
"csb_sdlc_document": {
562+
"n": 13,
563+
"baseline_cost_mean_usd": 0.2669,
564+
"mcp_cost_mean_usd": 0.2974,
565+
"delta_cost_mean_usd": 0.0305,
566+
"delta_cost_variance": 0.01439
567+
},
568+
"csb_sdlc_feature": {
569+
"n": 23,
570+
"baseline_cost_mean_usd": 0.4965,
571+
"mcp_cost_mean_usd": 0.7079,
572+
"delta_cost_mean_usd": 0.2114,
573+
"delta_cost_variance": 0.183988
574+
},
575+
"csb_sdlc_fix": {
576+
"n": 26,
577+
"baseline_cost_mean_usd": 0.5997,
578+
"mcp_cost_mean_usd": 0.7057,
579+
"delta_cost_mean_usd": 0.1059,
580+
"delta_cost_variance": 0.06587
581+
},
582+
"csb_sdlc_refactor": {
583+
"n": 15,
584+
"baseline_cost_mean_usd": 0.3194,
585+
"mcp_cost_mean_usd": 0.7173,
586+
"delta_cost_mean_usd": 0.398,
587+
"delta_cost_variance": 0.147469
588+
},
589+
"csb_sdlc_secure": {
590+
"n": 12,
591+
"baseline_cost_mean_usd": 0.4825,
592+
"mcp_cost_mean_usd": 0.5657,
593+
"delta_cost_mean_usd": 0.0832,
594+
"delta_cost_variance": 0.030859
595+
},
596+
"csb_sdlc_test": {
597+
"n": 18,
598+
"baseline_cost_mean_usd": 0.2641,
599+
"mcp_cost_mean_usd": 0.2976,
600+
"delta_cost_mean_usd": 0.0335,
601+
"delta_cost_variance": 0.015625
602+
},
603+
"csb_sdlc_understand": {
604+
"n": 10,
605+
"baseline_cost_mean_usd": 0.3519,
606+
"mcp_cost_mean_usd": 0.4475,
607+
"delta_cost_mean_usd": 0.0956,
608+
"delta_cost_variance": 0.022037
609+
}
436610
}
437611
}

0 commit comments

Comments
 (0)