CodeTracer/src/codetracer/config/default.yaml at main · NJU-LINK/CodeTracer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
# Connection settings for the LLM backend.
llm:
  api_base: 'https://cloudgpt-openai.azure-api.net/openai/v1/'
  # Empty in this file; presumably supplied at runtime (environment or user
  # config) — verify against the config loader.
  api_key: ''
  model_name: 'gpt-4.1-mini-20250414'
  # NOTE(review): looks like an Azure AD resource URI used when acquiring a
  # token for api_base — confirm.
  azure_ad_resource: 'api://feb7b661-cac7-44a8-8dc1-163b63c23df2'
  # Empty mapping by default; presumably extra keyword arguments forwarded to
  # the model call — confirm.
  model_kwargs: {}

# Limits and prompt templates for the trace (trajectory diagnosis) agent.
trace:
  cost_limit: 3.0
  # NOTE(review): 0 presumably means "no step limit" — confirm against the
  # consumer.
  step_limit: 0
  # NOTE(review): presumably seconds — confirm.
  timeout: 60
  # Captures the body of a ```bash fenced block. Single-quoted so every
  # backslash is a literal regex escape rather than a YAML escape.
  action_regex: '```bash\s*\n([\s\S]*?)\n?```'

  # System prompt for the trace agent. It asks for a THOUGHT section followed
  # by exactly one bash code block holding a single command (no && / ||
  # chaining, no heredocs, closing fence on its own line).
  system_template: |
    You are CodeTracer, a trajectory diagnosis agent. You analyze agent run directories to identify failure-relevant steps.

    Your response must contain exactly ONE bash code block with ONE command:

    ```bash
    <single command>
    ```

    Include a THOUGHT section before the command explaining your reasoning.

    Rules:
    - One command per response
    - No && or || chaining
    - No heredocs
    - The closing ``` must be on its own line

  # Jinja-style task prompt for the trace agent.
  #
  # Variables referenced: exploration_instructions, task_name, task_dir,
  # problem_statement, task_description, tree_md, stage_ranges_json,
  # skill_name, skill_doc, work_dir, output_file, profile_finalize_instruction.
  #
  # The Workflow list branches on the same condition as the Phase 1 section
  # (exploration_instructions), so "Phase 1 above" is only mentioned when that
  # section is actually rendered, and each branch is numbered contiguously.
  #
  # NOTE(review): the workflow seeds the output file with "[]" while the
  # `detailed` output profile asks for a JSON object — confirm the placeholder
  # is always overwritten before finalizing.
  instance_template: |
    Analyze this agent trajectory and label failure-relevant steps.

    {% if exploration_instructions %}
    ============================================================
    PHASE 1 — MANDATORY TASK EXPLORATION (do this FIRST)
    ============================================================

    Task: {{ task_name }}
    Task directory: {{ task_dir }}

    Before analyzing ANY trajectory steps, you MUST explore the task
    context by executing the following commands (one per response).
    Do NOT skip this phase.

    {{ exploration_instructions }}

    {% if problem_statement %}
    Problem Statement:
    {{ problem_statement }}
    {% endif %}

    After you have completed all exploration commands and understand
    the task, proceed to Phase 2.

    ============================================================
    PHASE 2 — TRAJECTORY ANALYSIS
    ============================================================
    {% else %}
    <task>
    {{ task_description }}
    </task>
    {% endif %}

    Tree index (navigation aid):
    <tree>
    {{ tree_md }}
    </tree>

    Stage ranges:
    <stages>
    {{ stage_ranges_json }}
    </stages>

    Format context ({{ skill_name }}):
    <format_doc>
    {{ skill_doc }}
    </format_doc>

    Your working directory: {{ work_dir }}

    Available files:
    - steps.json: full trajectory (large, query by step_id)
    - task.md: task specification
    - tree.md: tree index
    - stage_ranges.json: stage spans

    Step inspection one-liner:
    ```bash
    python -c 'import json; sid=N; s=next(x for x in json.load(open("steps.json")) if x.get("step_id")==sid); print(json.dumps(s,indent=2,ensure_ascii=False))'
    ```

    Labels:
    - incorrect: wrong state-changing action (mislocalized edit, bad hypothesis, regression)
    - unuseful: redundant exploration (repeated reads/searches without new evidence)

    Workflow:
    {% if exploration_instructions %}
    1. EXPLORE the task context (Phase 1 above — mandatory)
    2. Initialize output file: python -c 'import json; open("{{ output_file }}","w").write("[]")'
    3. Use tree.md to spot suspicious areas
    4. INSPECT suspicious step_ids (one per response)
    5. WRITE labels after inspecting evidence
    6. FINALIZE with: echo TRACER_FINAL_OUTPUT
    {% else %}
    1. Initialize output file: python -c 'import json; open("{{ output_file }}","w").write("[]")'
    2. Use tree.md to spot suspicious areas
    3. INSPECT suspicious step_ids (one per response)
    4. WRITE labels after inspecting evidence
    5. FINALIZE with: echo TRACER_FINAL_OUTPUT
    {% endif %}

    Anti-cheat rules:
    - Every labeled step_id MUST have been explicitly inspected in this run
    - Inspect at least 3 distinct steps spread across the trajectory before finalizing
    - Do NOT auto-label without evidence

    CRITICAL — Output file and format:
    {{ profile_finalize_instruction }}
    Write your analysis to "{{ output_file }}" using a bash python one-liner.
    You MUST follow the output format above exactly.

    Finalize last:
    ```bash
    echo TRACER_FINAL_OUTPUT
    ```

  # Wraps a command's return code and output in <returncode>/<output> tags.
  observation_template: |
    <returncode>{{ output.returncode }}</returncode>
    <output>
    {{ output.output -}}
    </output>

  # Reports how many bash blocks were found (`actions | length`) and asks for
  # exactly one; presumably sent when a response is malformed — confirm.
  format_error_template: 'Please provide EXACTLY ONE bash code block. Found {{ actions | length }} blocks.'

  # Reports the timed-out command together with the output collected for it.
  timeout_template: |
    Command timed out: {{ action['action'] }}
    Output: {{ output }}

# Limits and prompts for discovering the format of an unknown run directory.
discovery:
  max_attempts: 3
  max_depth: 10
  # Directory names to skip: VCS metadata, dependency folders, virtualenvs
  # and tool caches. Quoted explicitly so each entry is always a string.
  skip_dirs:
    - '.git'
    - 'node_modules'
    - '__pycache__'
    - '.venv'
    - 'env'
    - 'venv'
    - '.tox'
    - '.mypy_cache'
    - '.pytest_cache'
    - '.ruff_cache'

  # System prompt for parser generation. It asks the model for two outputs:
  # a SKILL.md (in a ```markdown block) and a parser.py (in a ```python block)
  # implementing the GeneratedParser interface shown below.
  # NOTE(review): the embedded Python example contains literal braces, so this
  # text presumably must not be passed through brace-style formatting —
  # confirm against the consumer.
  system_template: |
    You are a code agent specializing in parsing agent trajectory files.
    You will be given the file layout of an unknown run directory and must produce TWO outputs:

    1. A SKILL.md file (in a ```markdown block) with YAML frontmatter containing name,
       description, fingerprints, and metadata fields, followed by documentation of
       the directory layout and extraction logic.
    2. A parser.py file (in a ```python block) implementing the parser.

    The parser must follow this interface:

    ```python
    from pathlib import Path
    from codetracer.core.models import FileRef, NormalizedTrajectory, StepRecord

    class GeneratedParser:
        format_id = "<unique_format_id>"

        def can_parse(self, run_dir: Path) -> bool: ...
        def parse(self, run_dir: Path) -> NormalizedTrajectory:
            ...
            return NormalizedTrajectory(
                steps=steps,
                task_description=instruction,
                metadata={"format": self.format_id, "run_dir": str(run_dir)},
            )

    parser = GeneratedParser()
    ```

    Rules:
    - format_id must be a short lowercase slug
    - Each StepRecord must have step_id (1-indexed int), action (str), observation (str|None)
    - The file must end with `parser = GeneratedParser()`
    - parse() must extract task_description from the run data (e.g. results.json
      "instruction" field) and pass it to NormalizedTrajectory

  # Per-run prompt with placeholders run_dir, listing, samples and skill_index.
  # NOTE(review): these use single-brace placeholders, unlike the Jinja-style
  # {{ ... }} templates elsewhere in this file — presumably filled with
  # str.format or similar; confirm against the consumer.
  instance_template: |
    Run directory: {run_dir}

    Directory listing (first 200 files):
    {listing}

    Sample file contents:
    {samples}

    Existing skills for reference:
    {skill_index}

    Write a SKILL.md and parser.py for this trajectory format.

  # Follow-up prompt carrying the validation error text in the single-brace
  # `error` placeholder and asking for corrected markdown and python blocks.
  validation_error_template: |
    Your parser raised an error during validation:

    {error}

    Fix the parser and return corrected ```markdown and ```python blocks.

# Prompts asking the model to classify each step as 'change' or 'explore'
# and to give it a short label, returned as a JSON array.
tree:
  system_template: 'You are a trajectory analysis assistant. Respond only with the requested JSON.'
  # Jinja-style prompt; `steps_json` is substituted into the body.
  instance_template: |
    Classify each step as 'change' (modifies state: writes files, installs packages,
    edits code) or 'explore' (reads, searches, tests without state change).
    Also give a short 3-6 word label per step.

    Steps:
    {{ steps_json }}

    Return JSON array: [{"step_id": N, "node_type": "change"|"explore", "label": "..."}]

# Environment variables; both pager variables are set to `cat`.
# NOTE(review): presumably applied to executed commands so that they never
# block on an interactive pager — confirm against the consumer.
environment:
  env:
    PAGER: 'cat'
    MANPAGER: 'cat'

# Limits and prompts for the replay agent, which resumes a failed run at a
# breakpoint.
replay:
  max_replay_steps: 30
  # NOTE(review): presumably seconds — confirm.
  timeout: 120

  # System prompt for the replay agent: continue the task with a corrected
  # strategy and answer with exactly one bash code block per message.
  system_template: |
    You are CodeTracer Replay Agent. You are resuming a failed agent run at a
    specific breakpoint where errors were detected. Your job is to continue
    the task using a corrected strategy informed by the error analysis provided.

    Respond with exactly ONE bash code block per message:
    ```bash
    <single command>
    ```

  # Jinja-style prompt shown when a run is resumed at a breakpoint.
  #
  # Variables referenced: task_description, preceding_count, preceding_steps
  # (items with step_id and action), target_step_id, analysis_summary,
  # error_labels (items with step_id, verdict, reasoning).
  #
  # The step listing and the "Do NOT repeat" line are guarded by
  # preceding_count so that a breakpoint with no preceding steps does not
  # render an empty range such as "1 to 0" or "steps 1-0". The whitespace
  # control markers (-) keep the rendered text for preceding_count > 0 the
  # same as before under default Jinja whitespace handling.
  breakpoint_template: |
    You are resuming an agent trajectory that was previously analyzed for errors.

    == Trajectory Context ==
    Task: {{ task_description }}

    {% if preceding_count -%}
    Steps executed so far (1 to {{ preceding_count }}):
    {% for s in preceding_steps -%}
    [Step {{ s.step_id }}] {{ s.action[:120] }}
    {% endfor %}
    {%- else -%}
    No steps were executed before this breakpoint.
    {% endif %}

    == Error Analysis at Step {{ target_step_id }} ==
    {{ analysis_summary }}

    {% for label in error_labels -%}
    - Step {{ label.step_id }} ({{ label.verdict }}): {{ label.reasoning }}
    {% endfor %}

    == Instructions ==
    The trajectory went wrong at step {{ target_step_id }}.
    You are now at that exact point. The environment has been restored to
    the state right before step {{ target_step_id }} was executed.

    Review the error analysis above and continue from here with a corrected approach.
    {% if preceding_count -%}
    Do NOT repeat the commands from steps 1-{{ preceding_count }}.
    {%- endif %}

    When finished, run:
    ```bash
    echo TRACER_FINAL_OUTPUT
    ```

# Benchmark settings.
# NOTE(review): the meaning of a null user_bench_dir is not visible in this
# file (presumably "use a built-in default location") — confirm against the
# consumer.
bench:
  user_bench_dir: null
  auto_generate: true
  generator_max_attempts: 3

# Classification settings.
# NOTE(review): store_path is null here; presumably a default location is
# chosen at runtime — confirm against the consumer.
classification:
  store_path: null

# Output profiles. Each profile declares a schema_ref, the finalize
# instruction text, and an output_file name.
output:
  # NOTE(review): presumably the profile used when none is selected
  # explicitly — confirm.
  default_profile: detailed
  profiles:
    tracebench:
      schema_ref: tracebench_labels
      finalize_instruction: 'Output codetracer_labels.json with stage-level labels as a JSON array.'
      output_file: codetracer_labels.json
    detailed:
      schema_ref: deep_analysis
      # Folded scalar: the lines below are joined with single spaces into one
      # line, with no trailing newline.
      finalize_instruction: >-
        Output a comprehensive root cause analysis as a JSON OBJECT (not array) in codetracer_analysis.json.
        The JSON object must have these top-level keys: root_cause_chain (array of strings describing the
        failure chain from final failure back to initial error), critical_decision_points (array of objects
        with keys step_id, decision, should_have), correct_strategy (string), stage_labels (array of objects
        with keys stage_id, incorrect_step_ids, unuseful_step_ids, reasoning), and summary (string).
      output_file: codetracer_analysis.json
    rl_feedback:
      schema_ref: rl_feedback
      finalize_instruction: 'Output codetracer_rl_feedback.json with per-step deviation analysis for RL training.'
      output_file: codetracer_rl_feedback.json

# Memory settings.
# NOTE(review): memory_dir is null here (presumably a default directory is
# chosen at runtime), and the units of the two online_* thresholds are
# inferred from their names only (steps / tokens) — confirm against the
# consumer.
memory:
  enabled: true
  memory_dir: null
  online_step_interval: 8
  online_token_threshold: 30000

# Plugin settings: no adapters are configured by default (empty mapping) and
# hooks are enabled.
plugin:
  adapters: {}
  hooks_enabled: true