def parse_numeric_scalar(value):
    """Parse a YAML scalar that should contain a finite numeric value.

    Args:
        value: Raw scalar as read from the spec. May be None, a number, or a
            string that is optionally wrapped in single or double quotes.

    Returns:
        The value as a float, or None when the scalar is empty, one of the
        placeholders "null" / "{}", not parseable as a number, or not finite.
    """
    import math  # local import: the module-level import block is not visible here

    if value in (None, "", "null", "{}"):
        return None
    try:
        number = float(str(value).strip().strip('"').strip("'"))
    except (TypeError, ValueError):
        return None
    # float() also accepts "nan", "inf" and "infinity"; none of them is a
    # usable score and they would poison comparisons and aggregates downstream.
    return number if math.isfinite(number) else None


def extract_self_eval_score(text):
    """Read the recorded self-eval score from supported spec shapes.

    Lookup order, first numeric hit wins:
      1. ``self_eval.total``, ``self_eval.score``, then a scalar ``self_eval``
      2. ``perf_eval.total``, ``perf_eval.score``, then a scalar ``perf_eval``
      3. a plain ``score`` field

    Args:
        text: Full text of the spec file.

    Returns:
        The score as a float, or None when no supported field holds a number.

    NOTE(review): relies on yaml_read_nested / yaml_read_field, which are
    defined elsewhere in this file; presumably they return the raw scalar
    string or None - confirm against their definitions.
    """
    for parent in ("self_eval", "perf_eval"):
        for field in ("total", "score"):
            score = parse_numeric_scalar(yaml_read_nested(text, parent, field))
            if score is not None:
                return score
        score = parse_numeric_scalar(yaml_read_field(text, parent))
        if score is not None:
            return score

    return parse_numeric_scalar(yaml_read_field(text, "score"))


def parse_phase_statuses(text):
    """Parse phase statuses from the phases block without counting unrelated statuses.

    Only the top-level ``phases:`` block is scanned, and scanning stops at the
    next top-level key. Every ``- id: phaseN`` item contributes one entry.
    A phase's status is taken from a ``status:`` key at the same column as its
    ``id:`` key, so ``status:`` lines nested deeper inside the item (for
    example in a list of checks) cannot overwrite it.

    Args:
        text: Full text of the spec file.

    Returns:
        A list with one status string per phase, in file order. A phase with
        no recognised status of its own is reported as "pending".
    """
    phases_header = re.compile(r'^phases:\s*$')
    # group(1): indent of the "-"; group(2): gap between "-" and the "id" key.
    item_re = re.compile(r'^(\s+)-(\s+)id:\s*["\']?phase\d+["\']?')
    status_re = re.compile(
        r'^\s+status:\s*["\']?(pending|in_progress|completed|failed|skipped)["\']?'
    )

    statuses = []
    in_phases = False
    item_indent = None  # column of the current item's "-"; None outside an item
    key_indent = None   # column of the current item's own mapping keys

    for line in text.splitlines():
        stripped = line.strip()
        indent = len(line) - len(line.lstrip())

        if not in_phases:
            in_phases = phases_header.match(line) is not None
            continue

        # A non-blank line at column 0 is the next top-level key: block is over.
        if stripped and indent == 0:
            break

        # Blank lines and anything indented past the "-" belong to the open item.
        if item_indent is not None and (not stripped or indent > item_indent):
            if indent == key_indent:
                status_match = status_re.match(line)
                if status_match:
                    statuses[-1] = status_match.group(1)
            continue

        # The line closes the open item (if any); it may start the next phase.
        item_indent = key_indent = None
        item_match = item_re.match(line)
        if item_match:
            item_indent = len(item_match.group(1))
            key_indent = item_indent + 1 + len(item_match.group(2))
            statuses.append("pending")

    return statuses
def count_phases(text): """Count phases and their statuses.""" - total = 0 - completed = 0 - failed = 0 - in_progress = 0 - for m in re.finditer(r'^\s+-\s+id:\s*"?phase\d+"?', text, re.MULTILINE): - total += 1 - for m in re.finditer(r'^\s+status:\s*"?completed"?', text, re.MULTILINE): - completed += 1 - for m in re.finditer(r'^\s+status:\s*"?failed"?', text, re.MULTILINE): - failed += 1 - for m in re.finditer(r'^\s+status:\s*"?in_progress"?', text, re.MULTILINE): - in_progress += 1 - # Subtract the top-level status from phase counts - top_status = yaml_read_field(text, "status") - if top_status == "completed": - completed = max(0, completed - 1) - elif top_status == "failed": - failed = max(0, failed - 1) - elif top_status == "in_progress": - in_progress = max(0, in_progress - 1) + phase_statuses = parse_phase_statuses(text) + total = len(phase_statuses) + completed = sum(1 for status in phase_statuses if status == "completed") + failed = sum(1 for status in phase_statuses if status == "failed") + in_progress = sum(1 for status in phase_statuses if status == "in_progress") return total, completed, failed, in_progress @@ -1073,18 +1134,16 @@ def cmd_start(args): def check_self_eval(text, task_id): """Warn if self-eval looks like a rubber stamp.""" - # Look for self_eval or perf_eval score - score_match = re.search(r'(?:self_eval|perf_eval|score):\s*(\d+)', text) - if not score_match: + score = extract_self_eval_score(text) + if score is None: print(f" {c(C_YELLOW, 'warn')}: no self-eval score found in spec") return - - score = int(score_match.group(1)) has_deviations = bool(re.search(r'deviations:', text, re.MULTILINE)) has_improvements = bool(re.search(r'improvements:', text, re.MULTILINE)) + score_display = int(score) if float(score).is_integer() else score if score >= 9 and not has_deviations and not has_improvements: - print(f" {c(C_YELLOW, 'warn')}: self-eval {score}/10 with no deviations or improvements noted") + print(f" {c(C_YELLOW, 'warn')}: self-eval 
{score_display}/10 with no deviations or improvements noted") print(f" scores above 8 should document at least one deviation or improvement") elif score == 10: print(f" {c(C_YELLOW, 'note')}: perfect 10/10 - are you sure? 10 means flawless with improvements beyond spec") @@ -1112,7 +1171,7 @@ def cmd_complete(args): text = spec.read_text() # Check for exec results - warn if no criteria were executed - has_results = bool(re.search(r'result:\s*"?(pass|fail)"?', text, re.MULTILINE)) + has_results = any(ac.get("result") in ("pass", "fail") for ac in parse_acceptance_criteria(text)) if not has_results: print(f" {c(C_YELLOW, 'warn')}: no exec results recorded. Run '{c(C_BOLD, f'trellis exec {args.task_id}')}' first") @@ -2333,13 +2392,14 @@ def cmd_report(args): risks[risk] = risks.get(risk, 0) + 1 # Self-eval scores - score_match = re.search(r'total:\s*(\d+)', text) - if score_match: - self_eval_scores.append(int(score_match.group(1))) + score = extract_self_eval_score(text) + if score is not None: + self_eval_scores.append(score) # Exec results - passes = len(re.findall(r'result:\s*"?pass"?', text)) - fails = len(re.findall(r'result:\s*"?fail"?', text)) + criteria = parse_acceptance_criteria(text) + passes = sum(1 for ac in criteria if ac.get("result") == "pass") + fails = sum(1 for ac in criteria if ac.get("result") == "fail") if passes or fails: exec_pass += passes exec_fail += fails diff --git a/tests/review_gate_smoke.sh b/tests/review_gate_smoke.sh index 5f989c2..a3f14ad 100755 --- a/tests/review_gate_smoke.sh +++ b/tests/review_gate_smoke.sh @@ -739,6 +739,218 @@ EOF assert_contains "$spec_text" 'Command timed out after 1s' "spec should record the configured timeout in result_output" } +case_complete_nested_exec_and_self_eval() { + local repo task_id output archive_path spec_text + repo="$(new_repo)" + task_id="complete-nested-exec-and-self-eval" + write_changed_file "$repo" + + cat > "$repo/.ai/specs/active/$task_id.yaml" < 
"$repo/.ai/specs/archive/2026-03/$task_id.yaml" < "$repo/.ai/specs/archive/2026-03/$task_id.yaml" <