cluesmith · waleedkadous · Dec 9, 2025 · Dec 9, 2025 · Dec 9, 2025 · Dec 9, 2025
diff --git a/AGENTS.md b/AGENTS.md
@@ -524,7 +524,7 @@ wait
 2. For subcommands (pr, spec, plan), auto-locates the file (e.g., `codev/specs/0039-*.md`)
 3. Invokes the appropriate CLI with autonomous mode enabled:
    - gemini: `GEMINI_SYSTEM_MD=<temp_file> gemini --yolo <query>`
-   - codex: `CODEX_SYSTEM_MESSAGE=<role> codex exec --full-auto <query>`
+   - codex: `codex exec -c experimental_instructions_file=<temp_file> -c model_reasoning_effort=low --full-auto <query>`
    - claude: `claude --print -p <role + query> --dangerously-skip-permissions`
 4. Passes through stdout/stderr and exit codes
 5. Logs queries with timing to `.consult/history.log`

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -524,7 +524,7 @@ wait
 2. For subcommands (pr, spec, plan), auto-locates the file (e.g., `codev/specs/0039-*.md`)
 3. Invokes the appropriate CLI with autonomous mode enabled:
    - gemini: `GEMINI_SYSTEM_MD=<temp_file> gemini --yolo <query>`
-   - codex: `CODEX_SYSTEM_MESSAGE=<role> codex exec --full-auto <query>`
+   - codex: `codex exec -c experimental_instructions_file=<temp_file> -c model_reasoning_effort=low --full-auto <query>`
    - claude: `claude --print -p <role + query> --dangerously-skip-permissions`
 4. Passes through stdout/stderr and exit codes
 5. Logs queries with timing to `.consult/history.log`

diff --git a/codev/bin/consult b/codev/bin/consult
@@ -149,8 +149,21 @@ def run_model_consultation(
     elif resolved == "codex":
         if not shutil.which("codex"):
             return "Error: codex not found", 1, 0.0
-        cmd = ["codex", "exec", "--full-auto", query]
-        env = {"CODEX_SYSTEM_MESSAGE": role}
+        # Use official experimental_instructions_file instead of undocumented CODEX_SYSTEM_MESSAGE
+        # See: https://github.com/openai/codex/discussions/3896
+        temp_system_file = tempfile.NamedTemporaryFile(
+            mode="w", suffix=".md", delete=False
+        )
+        temp_system_file.write(role)
+        temp_system_file.close()
+        cmd = [
+            "codex", "exec",
+            "-c", f"experimental_instructions_file={temp_system_file.name}",
+            "-c", "model_reasoning_effort=low",  # Faster responses (10-20% improvement)
+            "--full-auto",
+            query,
+        ]
+        env = {}
     elif resolved == "claude":
         if not shutil.which("claude"):
             return "Error: claude not found", 1, 0.0
@@ -761,8 +774,20 @@ def do_general(model: str, query: Optional[str], dry_run: bool) -> None:
                 file=sys.stderr,
             )
             sys.exit(1)
-        cmd = ["codex", "exec", "--full-auto", query]
-        env = {"CODEX_SYSTEM_MESSAGE": role}
+        # Use official experimental_instructions_file instead of undocumented CODEX_SYSTEM_MESSAGE
+        temp_system_file = tempfile.NamedTemporaryFile(
+            mode="w", suffix=".md", delete=False
+        )
+        temp_system_file.write(role)
+        temp_system_file.close()
+        cmd = [
+            "codex", "exec",
+            "-c", f"experimental_instructions_file={temp_system_file.name}",
+            "-c", "model_reasoning_effort=low",
+            "--full-auto",
+            query,
+        ]
+        env = {}
     elif model == "claude":
         if not shutil.which("claude"):
             print(
@@ -851,9 +876,20 @@ def run_mediated_consultation(
     elif resolved == "codex":
         if not shutil.which("codex"):
             return "Error: codex not found", 1, 0.0
+        # Use official experimental_instructions_file instead of undocumented CODEX_SYSTEM_MESSAGE
+        temp_system_file = tempfile.NamedTemporaryFile(
+            mode="w", suffix=".md", delete=False
+        )
+        temp_system_file.write(role)
+        temp_system_file.close()
         # codex exec without --full-auto has no tool use
-        cmd = ["codex", "exec", query]
-        env = {"CODEX_SYSTEM_MESSAGE": role}
+        cmd = [
+            "codex", "exec",
+            "-c", f"experimental_instructions_file={temp_system_file.name}",
+            "-c", "model_reasoning_effort=low",
+            query,
+        ]
+        env = {}
     elif resolved == "claude":
         if not shutil.which("claude"):
             return "Error: claude not found", 1, 0.0

diff --git a/codev/reviews/0043-codex-reliability.md b/codev/reviews/0043-codex-reliability.md
@@ -0,0 +1,119 @@
+# Review: Codex CLI Reliability and Performance Optimization
+
+## Metadata
+- **Date**: 2025-12-08
+- **Specification**: [codev/specs/0043-codex-reliability.md](../specs/0043-codex-reliability.md)
+- **Plan**: [codev/plans/0043-codex-reliability.md](../plans/0043-codex-reliability.md)
+
+## Executive Summary
+
+Successfully replaced the undocumented `CODEX_SYSTEM_MESSAGE` environment variable with the official `experimental_instructions_file` configuration approach. Added `model_reasoning_effort=low` tuning which resulted in a **27% reduction in consultation time** and **25% reduction in token usage** while maintaining review quality.
+
+## Specification Compliance
+
+### Success Criteria Assessment
+| Criterion | Status | Evidence | Notes |
+|-----------|--------|----------|-------|
+| CODEX_SYSTEM_MESSAGE replaced with experimental_instructions_file | ✅ | `codev/bin/consult` changes | All 3 codex invocation sites updated |
+| Consultant prompt reviewed and optimized | ✅ | Analysis complete | No changes needed - prompt already concise and model-agnostic (shared across Gemini/Codex/Claude) |
+| Performance baseline documented | ✅ | Spec updated with results | 163.7s -> 118.7s (-27%) |
+| No regressions in consultation quality | ✅ | PR 33 review comparison | After review found issue baseline missed |
+
+### Deviations from Specification
+| Original Requirement | What Was Built | Reason for Deviation |
+|---------------------|----------------|---------------------|
+| None | - | Full compliance |
+
+## Performance Analysis
+
+### Benchmarks (PR #33 Review - 932 lines, 8 files)
+| Metric | Before | After | Improvement |
+|--------|--------|-------|-------------|
+| Consultation Time | 163.7s | 118.7s | **-27.5%** |
+| Total Time (incl. pre-fetch) | 167.2s | 121.6s | **-27.3%** |
+| Tokens Used | 51,223 | 38,556 | **-24.7%** |
+| Reasoning Effort | medium | low | Reduced |
+
+### Quality Analysis
+- **Before**: APPROVE, HIGH confidence - approved the PR
+- **After**: REQUEST_CHANGES, MEDIUM confidence - found a valid issue (missing `af spawn` integration)
+
+The "after" implementation actually caught an issue that the baseline review missed, indicating that quality was maintained or improved despite faster execution.
+
+## Code Quality Assessment
+
+### Architecture Impact
+- **Positive Changes**:
+  - Using official documented configuration instead of undocumented env var
+  - Consistent temp file approach across Gemini and Codex
+  - Added reasoning effort tuning for performance control
+- **Technical Debt Incurred**: None
+- **Future Considerations**: Could add configurable reasoning effort levels if needed
+
+### Code Metrics
+- **Lines of Code**: +43 added, -7 removed (net +36)
+- **Test Coverage**: Added 3 new tests for Codex configuration
+- **Documentation**: Updated spec with implementation results
+
+### Files Modified
+1. `codev/bin/consult` - Core implementation changes (3 locations)
+2. `AGENTS.md` / `CLAUDE.md` - Updated Consult Tool documentation
+3. `codev/specs/0043-codex-reliability.md` - Added results documentation
+4. `tests/e2e/consult.bats` - Added Codex config tests
+5. `tests/e2e/helpers.bash` - Added skip helpers for CLI availability
+
+## Testing Summary
+
+### Test Execution
+- **Unit Tests**: N/A (Python script)
+- **E2E Tests**: Added 3 tests for Codex configuration
+- **Manual Testing**: Ran actual PR review before/after comparison
+
+### Tests Added
+1. `consult codex dry-run shows experimental_instructions_file config`
+2. `consult codex dry-run shows model_reasoning_effort=low`
+3. `consult codex dry-run cleans up temp file`
+
+## Lessons Learned
+
+### What Went Well
+1. **Baseline measurement first** - Having before/after data made the impact clear
+2. **Official documentation** - The `experimental_instructions_file` approach was well-documented in GitHub discussions
+3. **Reasoning effort tuning** - Simple flag change with significant performance impact
+
+### What Was Challenging
+1. **Finding the official approach**
+   - **Root Cause**: CODEX_SYSTEM_MESSAGE was undocumented; had to search GitHub discussions
+   - **Resolution**: Found official recommendation in discussion #3896
+   - **Prevention**: Document all configuration approaches for future reference
+
+### What Would You Do Differently
+1. Test on multiple PRs of different sizes to validate performance improvement is consistent
+2. Consider adding a `--reasoning-effort` flag to consult tool for user control
+
+## Multi-Agent Consultation Feedback
+
+### Gemini (APPROVE, HIGH confidence)
+- Technical approach is sound
+- Risk mitigation strategy is good
+- Note: consultant.md is shared across models
+
+### Codex (REQUEST_CHANGES, MEDIUM confidence)
+- Concern: Missing builder-branch performance tasks
+  - **Response**: Addressed - baseline was from main, after from builder worktree
+- Concern: No failure-path cleanup test
+  - **Response**: Added temp file cleanup test
+
+## Follow-Up Actions
+
+### Immediate
+- [x] Address Codex feedback (add cleanup test)
+- [x] Create PR
+
+### Long-term (Future Consideration)
+- [ ] Add `--reasoning-effort` flag for user control
+- [ ] Monitor Codex performance over time for regression detection
+
+## Conclusion
+
+This implementation successfully replaces undocumented configuration with official approaches while achieving a significant 27% performance improvement. The quality of reviews was maintained, and in fact the optimized implementation found an issue that the baseline missed. The changes are minimal, well-tested, and backwards-compatible.
diff --git a/codev/specs/0043-codex-reliability.md b/codev/specs/0043-codex-reliability.md
@@ -74,24 +74,30 @@ cmd = [
     query
 ]
 
-# Clean up after
+# Clean up after (in finally block for error safety)
 os.unlink(instructions_file)
 ```
 
+**Note**: The existing codebase already has try/finally blocks for temp file cleanup in all three Codex invocation paths (`run_model_consultation`, `do_general`, `run_mediated_consultation`). This implementation reuses that infrastructure.
+
 **Reasoning effort options**: `minimal` | `low` | `medium` | `high` | `none`
 - `low` recommended for consultations - balances speed and quality
 - Expected impact: 10-20% faster responses
 
 ### Part 2: Optimize Consultant Prompt
 
-The consultant role (`codev/roles/consultant.md`) should be optimized for Codex:
+The consultant role (`codev/roles/consultant.md`) was reviewed for Codex optimization:
 
 1. **Leverage Codex's strengths**: Encourage use of shell commands (`git show`, `rg`, etc.)
 2. **Be directive**: Codex responds well to clear, specific instructions
 3. **Avoid redundancy**: Codex already knows coding conventions
 4. **Focus on the task**: PR review, spec review, or plan review
 
-Current prompt may be too generic. Need to review and optimize.
+**Analysis Result**: After review, the consultant prompt is already well-optimized:
+- Already encourages shell commands (PR Review Protocol section)
+- Already directive with clear instructions per review type
+- Model-agnostic design is correct since it's shared across Gemini/Codex/Claude
+- No changes needed - the prompt structure is appropriate for all models
 
 ### Part 3: Investigate Performance
 
@@ -103,10 +109,14 @@ Before optimizing, verify the baseline:
 
 ## Success Criteria
 
-- [ ] `CODEX_SYSTEM_MESSAGE` replaced with `experimental_instructions_file`
-- [ ] Consultant prompt reviewed and optimized for Codex
-- [ ] Performance baseline documented (main vs branch)
-- [ ] No regressions in consultation quality
+- [x] `CODEX_SYSTEM_MESSAGE` replaced with `experimental_instructions_file`
+- [x] Consultant prompt reviewed (no changes needed - already optimized)
+- [x] Performance baseline documented (163.7s -> 118.7s, -27%)
+- [x] No regressions in consultation quality (after review found issue baseline missed)
+
+## Out of Scope
+
+- **TypeScript consult CLI** (`packages/codev/src/commands/consult/index.ts`): This is a separate port (Spec 0039) and should be updated separately. The Python `codev/bin/consult` is the primary implementation.
 
 ## Constraints
 
@@ -145,6 +155,19 @@ From web research ([10 Codex Fixes](https://medium.com/@ThinkingLoop/10-openai-c
 - **Model inherent speed**: GPT-5.1-codex is slower than alternatives by design
 - **Token generation**: ~1ms per token, unavoidable
 
+## Implementation Results
+
+Performance measurements on PR #33 review (932-line diff, 8 files):
+
+| Metric | Before | After | Improvement |
+|--------|--------|-------|-------------|
+| Consultation Time | 163.7s | 118.7s | **-27.5%** |
+| Total Time | 167.2s | 121.6s | **-27.3%** |
+| Tokens Used | 51,223 | 38,556 | **-24.7%** |
+| Reasoning Effort | medium | low | Reduced |
+
+**Quality**: After implementation, Codex found a valid issue (missing `af spawn` integration) that the baseline review missed, suggesting quality is maintained or improved.
+
 ## References
 
 - [Codex CLI Reference](https://developers.openai.com/codex/cli/reference/)

diff --git a/tests/e2e/consult.bats b/tests/e2e/consult.bats
@@ -108,3 +108,40 @@ teardown() {
   # Dry run should be documented in help
   assert_output --partial "dry"
 }
+
+# === Codex Configuration (Spec 0043) ===
+
+@test "consult codex dry-run shows experimental_instructions_file config" {
+  # Verify we use the official experimental_instructions_file instead of CODEX_SYSTEM_MESSAGE
+  # The dry-run should show the -c experimental_instructions_file flag
+  skip_if_no_codex
+  run ./node_modules/.bin/consult --model codex general "test" --dry-run
+  assert_success
+  assert_output --partial "experimental_instructions_file"
+}
+
+@test "consult codex dry-run shows model_reasoning_effort=low" {
+  # Verify we use low reasoning effort for faster responses
+  skip_if_no_codex
+  run ./node_modules/.bin/consult --model codex general "test" --dry-run
+  assert_success
+  assert_output --partial "model_reasoning_effort=low"
+}
+
+@test "consult codex dry-run cleans up temp file" {
+  # Verify temp file created for experimental_instructions_file is cleaned up
+  # The dry-run creates and then removes the temp file
+  skip_if_no_codex
+
+  # Count temp .md files before
+  local before_count=$(ls /tmp/*.md 2>/dev/null | wc -l || echo 0)
+
+  run ./node_modules/.bin/consult --model codex general "test" --dry-run
+  assert_success
+
+  # Count temp .md files after - should be same or less (cleanup happened)
+  local after_count=$(ls /tmp/*.md 2>/dev/null | wc -l || echo 0)
+
+  # After count should not be greater than before (temp file was cleaned up)
+  [[ "$after_count" -le "$before_count" ]]
+}
diff --git a/tests/e2e/helpers.bash b/tests/e2e/helpers.bash
@@ -154,3 +154,27 @@ refute_output_contains() {
   fi
   return 0
 }
+
+# Skip test if codex CLI is not installed
+# Usage: skip_if_no_codex
+skip_if_no_codex() {
+  if ! command -v codex &> /dev/null; then
+    skip "codex CLI not installed"
+  fi
+}
+
+# Skip test if gemini CLI is not installed
+# Usage: skip_if_no_gemini
+skip_if_no_gemini() {
+  if ! command -v gemini &> /dev/null; then
+    skip "gemini CLI not installed"
+  fi
+}
+
+# Skip test if claude CLI is not installed
+# Usage: skip_if_no_claude
+skip_if_no_claude() {
+  if ! command -v claude &> /dev/null; then
+    skip "claude CLI not installed"
+  fi
+}