Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 35 additions & 13 deletions src/upskill/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import asyncio
import json
import sys
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from importlib import resources
from pathlib import Path
from typing import AsyncIterator, TypedDict
from typing import TypedDict

import click
from dotenv import load_dotenv
Expand Down Expand Up @@ -279,9 +280,6 @@ async def _generate_async(
console.print(f"Logging runs to: {batch_folder}", style="dim")


skill: Skill | None = None
results = None

async with _fast_agent_context() as agent:

# Generate from trace file
Expand Down Expand Up @@ -759,8 +757,13 @@ async def _eval_async(

if is_benchmark_mode:
# Benchmark mode: multiple models and/or runs
console.print(f"\nEvaluating [bold]{skill.name}[/bold] across {len(models)} model(s)")
console.print(f" {len(test_cases)} test case(s), {num_runs} run(s) per model{provider_info}\n")
console.print(
f"\nEvaluating [bold]{skill.name}[/bold] across {len(models)} model(s)"
)
console.print(
f" {len(test_cases)} test case(s), "
f"{num_runs} run(s) per model{provider_info}\n"
)

model_results: dict[str, list[RunResult]] = {m: [] for m in models}
all_run_results: list[RunResult] = []
Expand All @@ -783,7 +786,10 @@ async def _eval_async(

for tc_idx, tc in enumerate(test_cases, 1):
if verbose:
console.print(f" Running test {tc_idx}/{len(test_cases)}...", style="dim")
console.print(
f" Running test {tc_idx}/{len(test_cases)}...",
style="dim",
)

try:
result = await run_test(
Expand Down Expand Up @@ -864,7 +870,12 @@ async def _eval_async(

pass_rate = passed_runs / total_runs if total_runs else 0
pass_rate_str = f"{pass_rate:.0%}"
pass_rate_style = "green" if pass_rate > 0.5 else "yellow" if pass_rate > 0 else "red"
if pass_rate > 0.5:
pass_rate_style = "green"
elif pass_rate > 0:
pass_rate_style = "yellow"
else:
pass_rate_style = "red"

console.print(f"[bold]{model}[/bold]")
console.print(
Expand Down Expand Up @@ -993,10 +1004,12 @@ async def _eval_async(
savings_str = f"-{savings:.0%}" if savings >= 0 else f"+{-savings:.0%}"
savings_style = "green" if savings > 0 else "red" if savings < 0 else "dim"
console.print()
console.print(
f" tokens: {results.baseline_total_tokens} → {results.with_skill_total_tokens} "
token_line = (
f" tokens: {results.baseline_total_tokens} → "
f"{results.with_skill_total_tokens} "
f"[{savings_style}]({savings_str})[/{savings_style}]"
)
console.print(token_line)
else:
with_skill_rate = results.with_skill_success_rate
with_skill_bar = _render_bar(with_skill_rate)
Expand Down Expand Up @@ -1383,7 +1396,8 @@ def runs_cmd(
content_lines.append(_format_comparison_bars(r, metric))

panel_content = "\n".join(content_lines)
console.print(Panel(panel_content, title=f"Evaluation History: {skill_name}", border_style="blue"))
panel_title = f"Evaluation History: {skill_name}"
console.print(Panel(panel_content, title=panel_title, border_style="blue"))

elif len(unique_models) == 1 and len(unique_skills) >= 1:
# Single model, multiple skills - use Panel
Expand All @@ -1394,7 +1408,8 @@ def runs_cmd(
content_lines.append(_format_comparison_bars(r, metric, label_field="skill_name"))

panel_content = "\n".join(content_lines)
console.print(Panel(panel_content, title=f"Evaluation History: {model_name}", border_style="blue"))
panel_title = f"Evaluation History: {model_name}"
console.print(Panel(panel_content, title=panel_title, border_style="blue"))

else:
# Multiple skills and models - matrix view with Panel
Expand Down Expand Up @@ -1423,7 +1438,14 @@ def plot_cmd(
):
"""[Deprecated] Use 'upskill runs' instead."""
console.print("[yellow]Note: 'plot' is deprecated. Use 'upskill runs' instead.[/yellow]\n")
ctx.invoke(runs_cmd, runs_dir=runs_dir, skills=skills, models=models, csv_output=None, metric=metric)
ctx.invoke(
runs_cmd,
runs_dir=runs_dir,
skills=skills,
models=models,
csv_output=None,
metric=metric,
)


def _format_comparison_bars(
Expand Down
2 changes: 2 additions & 0 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
def test_smoke() -> None:
    """Trivial smoke test that always passes."""
    # Placeholder assertion: it exercises no project code.
    assert True