diff --git a/docs/evals.md b/docs/evals.md
index e80234b..19aac4c 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -90,8 +90,8 @@ Validate existing logs without running new tests:
 
 **AppWorld:**
 
 ```bash
-# Validate logs (auto-detects from results/{model}/{dataset}/)
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
+# Validate logs (auto-detects from results/{model}/{datasets}/)
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
 ```
 
 ### AppWorld Results Organization
@@ -100,7 +100,7 @@ AppWorld tests automatically organize results during execution:
 
 ```bash
 # Run tests - results automatically organized
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --model gpt-4o
 
 # Results automatically written to:
 # - results/gpt-4o/train/outputs/raw/ (conversation logs)
@@ -113,9 +113,10 @@ rm -rf experiments/outputs/gpt-4o/  # Frees ~15GB
 
 **AppWorld-specific options:**
 
 ```bash
---dataset DATASET     # Dataset: train, dev, test_normal, test_challenge (default: train)
+--datasets DATASETS   # Comma-separated datasets (default: train,dev)
 --limit N             # Run only first N tasks from dataset
 --start-from TASK_ID  # Resume from specific task ID
+--default-few-shot    # Include few-shot examples (default: zero-shot)
 ```
 
 ### Parallel Execution
@@ -127,12 +128,39 @@ Run tests in parallel using multiple workers:
 
 .venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n 4
 
 # Run with 8 workers
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 8
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train -n 8
 
 # Auto-detect number of CPUs
 .venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n auto
 ```
 
+### AppWorld Dataset Selection
+
+Run tests on one or more datasets (default: `train,dev`):
+
+```bash
+# Run on default datasets (train,dev)
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py
+
+# Run on specific dataset
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train
+
+# Run on multiple datasets
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train,dev,test_normal
+```
+
+### AppWorld Prompt Mode
+
+By default, AppWorld tests run in **zero-shot mode** (no examples in the prompt). Use `--default-few-shot` to include worked-out examples:
+
+```bash
+# Zero-shot (default) - no examples in prompt
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train
+
+# Few-shot - include demo examples in system prompt
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --default-few-shot
+```
+
 ## Further Reading
 
 ### BFCL
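The `results/{model}/{datasets}/` layout referenced throughout these docs resolves to ordinary directories once the dataset list is joined with underscores. A minimal sketch of the documented tree, assuming `--model gpt-4o --datasets train` (paths illustrative):

```python
from pathlib import Path

# Documented results layout for --model gpt-4o --datasets train (illustrative).
base = Path("results") / "gpt-4o" / "train"
conversation_logs = base / "outputs" / "raw"   # per-task *_complete.json logs
failure_reports = base / "failure_reports"     # generated for failing tasks
print(conversation_logs)
print(failure_reports)
```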
diff --git a/tests/README.md b/tests/README.md
index 481edce..538271e 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -99,14 +99,17 @@ UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
 appworld install
 appworld download data
 
-# Run all train tasks (results automatically organized to results/{model}/{dataset}/)
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o
+# Run default datasets (train,dev) - results organized to results/{model}/{datasets}/
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --model gpt-4o
+
+# Run specific dataset
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --model gpt-4o
 
 # Run specific task
 .venv/bin/pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[train_001]'
 
-# Validate existing results (auto-detects from results/{model}/{dataset}/)
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
+# Validate existing results (auto-detects from results/{model}/{datasets}/)
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
 ```
 
 ### 5. Smoke Tests (`smoke/`)
@@ -237,11 +240,12 @@ Registered markers:
 
 **AppWorld-specific options:**
 
 ```bash
---dataset DATASET     # Dataset: train, dev, test_normal, test_challenge (default: train)
---limit N             # Run only first N tasks from dataset
---api-mode MODE       # API prediction mode (default: app_oracle)
---experiment-dir DIR  # Custom experiment directory name
---start-from TASK_ID  # Resume from specific task ID
+--datasets DATASETS              # Comma-separated datasets (default: train,dev)
+--limit N                        # Run only first N tasks from dataset
+--api-mode MODE                  # API prediction mode (default: app_oracle)
+--start-from TASK_ID             # Resume from specific task ID
+--default-few-shot               # Include few-shot examples (default: zero-shot)
+--appworld-experiment-name NAME  # Custom experiment name (default: {model}/{datasets})
 ```
 
 ## Common Commands
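As the validation commands above suggest, `--validate-only` does not re-run any tasks; it only needs the `*_complete.json` files from a previous run. A standalone sketch of that discovery step (directory names illustrative; the real logic lives in `test_appworld.py` at the end of this diff):

```python
from pathlib import Path

# Discover prior results the way --validate-only does (paths illustrative).
log_dir = Path("results") / "gpt-4o" / "train" / "outputs" / "raw"
log_files = sorted(log_dir.glob("*_complete.json")) if log_dir.exists() else []
task_ids = [f.name.removesuffix("_complete.json") for f in log_files]
print(f"{len(task_ids)} completed task(s) available for validation")
```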
diff --git a/tests/benchmarks/appworld/README.md b/tests/benchmarks/appworld/README.md
index 7229206..64b2ee9 100644
--- a/tests/benchmarks/appworld/README.md
+++ b/tests/benchmarks/appworld/README.md
@@ -16,27 +16,32 @@ appworld download data
 
 ## Run Tests
 
 ```bash
-# Run first task from train dataset (automatically organized to results/gpt-4o/train/)
-pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 1 --model gpt-4o
+# Run first task (uses default: train,dev datasets, organized to results/gpt-4o/train_dev/)
+pytest tests/benchmarks/appworld/test_appworld.py --limit 1 --model gpt-4o
 
-# Run first 5 train tasks
-pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 5 --model gpt-4o
+# Run only train dataset
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train --limit 5 --model gpt-4o
 
-# Run all dev tasks
-pytest tests/benchmarks/appworld/test_appworld.py --dataset dev --model gpt-4o
+# Run only dev dataset
+pytest tests/benchmarks/appworld/test_appworld.py --datasets dev --model gpt-4o
+
+# Run multiple datasets
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train,dev,test_normal
 
 # Run specific task (use actual task IDs like 82e2fac_1, not train_001)
 pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[82e2fac_1]'
 
 # Validate existing results (auto-detects from results/gpt-4o/train/)
-pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
+pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
 ```
 
 ## CLI Options
 
 ### Dataset Selection
 
 ```bash
---dataset {train,dev,test_normal,test_challenge}  # Default: train
+--datasets train                  # Single dataset
+--datasets train,dev              # Multiple datasets (default)
+--datasets train,dev,test_normal  # Run train, dev, and test_normal
 ```
 
 - `train`: Training dataset (90 tasks)
 - `dev`: Development/validation dataset
@@ -54,6 +59,15 @@ pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o
 --temperature 0.001  # Temperature for sampling (default: 0.001)
 ```
 
+### Prompt Mode
+
+```bash
+# Default: zero-shot (no examples in prompt)
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train
+
+# Few-shot: include worked-out examples in system prompt
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train --default-few-shot
+```
+
 ### Parallel Execution
 
 ```bash
 -n 4     # Run with 4 workers
 -n auto  # Auto-detect number of CPUs
 ```
 
-Example: `pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 4`
+Example: `pytest tests/benchmarks/appworld/test_appworld.py --datasets train -n 4`
 
 ## File Structure
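The conftest changes below replace the single-choice `--dataset` option with a comma-separated `--datasets` string. The intended end-to-end flow, reduced to a few lines (values illustrative; the real helpers are `parse_datasets`, `get_datasets_dir`, and `get_experiment_name` in the next diff):

```python
# Reduced sketch of the --datasets flow (illustrative values).
VALID = {"train", "dev", "test_normal", "test_challenge"}

datasets_str = "train,dev"  # raw --datasets value from the CLI
datasets = [d.strip() for d in datasets_str.split(",") if d.strip()]
assert not set(datasets) - VALID            # unknown names are rejected
datasets_dir = "_".join(datasets)           # "train_dev": directory-safe name
experiment_name = f"gpt-4o/{datasets_dir}"  # default {model}/{datasets} naming
assert experiment_name == "gpt-4o/train_dev"
```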
diff --git a/tests/benchmarks/appworld/conftest.py b/tests/benchmarks/appworld/conftest.py
index bd1bee5..f519db4 100644
--- a/tests/benchmarks/appworld/conftest.py
+++ b/tests/benchmarks/appworld/conftest.py
@@ -4,19 +4,26 @@ import pytest
 
+VALID_DATASETS = {"train", "dev", "test_normal", "test_challenge"}
+
 @pytest.fixture
 def output_dir(request: pytest.FixtureRequest) -> Path:
-    """AppWorld-specific output directory.
+    """AppWorld output directory.
 
-    Overrides the global output_dir fixture to write directly to
-    results/{model}/{dataset}/outputs/ for organized storage.
+    Uses --output-dir if specified, otherwise auto-infers as
+    results/{experiment_name}/outputs/
     """
-    model = str(request.config.getoption("--model"))
-    dataset = str(request.config.getoption("--dataset"))
+    output_dir_opt = str(request.config.getoption("--output-dir"))
+
+    # If not default "outputs", use the specified path directly
+    if output_dir_opt != "outputs":
+        path = Path(output_dir_opt)
+    else:
+        # Use experiment_name for consistent directory structure
+        exp_name = get_experiment_name(request.config)
+        path = Path("results") / exp_name / "outputs"
 
-    # Write directly to results directory
-    path = Path("results") / model / dataset / "outputs"
     path.mkdir(parents=True, exist_ok=True)
     return path
 
@@ -24,10 +31,9 @@
 def pytest_addoption(parser: pytest.Parser) -> None:
     """Add AppWorld-specific CLI options."""
     parser.addoption(
-        "--dataset",
-        default="train",
-        choices=["train", "dev", "test_normal", "test_challenge"],
-        help="AppWorld dataset to use (default: train)",
+        "--datasets",
+        default="train,dev",
+        help="Comma-separated AppWorld datasets: train,dev,test_normal,test_challenge (default: train,dev)",
     )
     parser.addoption(
         "--limit",
@@ -45,30 +51,61 @@ def pytest_addoption(parser: pytest.Parser) -> None:
         ),
     )
     parser.addoption(
-        "--experiment-dir",
+        "--start-from",
         default=None,
         type=str,
         help=(
-            "Experiment directory name (e.g., 'gpt-5/train' or 'claude-sonnet-4-5/dev'). "
-            "If not specified, auto-generates timestamp-based name. "
-            "Results will be saved to experiments/outputs/{experiment-dir}/"
+            "Start from specified task_id (skip all tests before it). "
+            "Example: --start-from 692c77d_1. Useful for resuming interrupted benchmark runs."
         ),
     )
     parser.addoption(
-        "--start-from",
+        "--default-few-shot",
+        action="store_true",
+        default=False,
+        help="Include few-shot examples in system prompt (default: zero-shot, no examples)",
+    )
+    parser.addoption(
+        "--appworld-experiment-name",
         default=None,
         type=str,
-        help=(
-            "Start from specified task_id (skip all tests before it). "
-            "Example: --start-from 692c77d_1. Useful for resuming interrupted benchmark runs."
-        ),
+        help="Experiment name for AppWorld data (default: auto-inferred as {model}/{datasets})",
     )
 
 
+def parse_datasets(datasets_str: str) -> list[str]:
+    """Parse comma-separated datasets string and validate."""
+    datasets = [d.strip() for d in datasets_str.split(",") if d.strip()]
+    invalid = set(datasets) - VALID_DATASETS
+    if invalid:
+        raise ValueError(f"Invalid datasets: {invalid}. Valid options: {VALID_DATASETS}")
+    return datasets
+
+
+def get_datasets_dir(datasets_str: str) -> str:
+    """Parse datasets and return underscore-joined directory name."""
+    datasets = parse_datasets(datasets_str)
+    return "_".join(datasets)
+
+
+def get_experiment_name(config: pytest.Config) -> str:
+    """Get experiment name from config (helper for use outside fixtures)."""
+    name = config.getoption("--appworld-experiment-name", None)
+    if name:
+        return str(name)
+
+    # Auto-infer from model/datasets
+    model = str(config.getoption("--model"))
+    datasets_str = str(config.getoption("--datasets"))
+    datasets_dir = get_datasets_dir(datasets_str)
+    return f"{model}/{datasets_dir}"
+
+
 @pytest.fixture
-def appworld_dataset(request: pytest.FixtureRequest) -> str:
-    """Get the AppWorld dataset name from CLI."""
-    return str(request.config.getoption("--dataset"))
+def appworld_datasets(request: pytest.FixtureRequest) -> list[str]:
+    """Get the AppWorld dataset names from CLI."""
+    datasets_str = str(request.config.getoption("--datasets"))
+    return parse_datasets(datasets_str)
 
 
 @pytest.fixture
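The helpers above are pure functions, so their behavior is easy to sanity-check; for example (import path as `test_appworld.py` uses later in this diff):

```python
from tests.benchmarks.appworld.conftest import get_datasets_dir, parse_datasets

assert parse_datasets(" train , dev ") == ["train", "dev"]  # whitespace tolerated
assert get_datasets_dir("train,dev") == "train_dev"         # underscore-joined
# parse_datasets("train,prod") raises ValueError, so a typo in --datasets
# fails collection loudly instead of silently selecting nothing.
```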
@@ -92,25 +129,25 @@ def api_mode(request: pytest.FixtureRequest) -> str:
     return str(request.config.getoption("--api-mode"))
 
 
-@pytest.fixture(scope="session")
-def experiment_name(request: pytest.FixtureRequest) -> str:
+@pytest.fixture
+def use_few_shot(request: pytest.FixtureRequest) -> bool:
     """
-    Get or generate experiment directory name for the test session.
-
-    All tests in this session will write to the same experiment directory,
-    organized by task_id in subdirectories: experiments/outputs/{experiment_name}/tasks/{task_id}/
+    Get few-shot mode from CLI.
 
-    Automatically uses {model}/{dataset} pattern for organized experiment tracking.
+    Returns:
+        True if --default-few-shot flag is set (include examples in prompt)
+        False by default (zero-shot, no examples)
     """
+    return bool(request.config.getoption("--default-few-shot"))
 
-    experiment_dir = request.config.getoption("--experiment-dir", None)
-    if experiment_dir:
-        # Use specified experiment directory
-        return str(experiment_dir)
-    else:
-        # Use model/dataset pattern for organized experiment tracking
-        # This works for both normal runs and validation
-        model = str(request.config.getoption("--model"))
-        dataset = str(request.config.getoption("--dataset"))
-        return f"{model}/{dataset}"
 
+@pytest.fixture(scope="session")
+def experiment_name(request: pytest.FixtureRequest) -> str:
+    """
+    Experiment name for AppWorld evaluation data.
+
+    AppWorld saves to: experiments/outputs/{experiment_name}/tasks/{task_id}/
+    Results saved to: results/{experiment_name}/outputs/
+    Can be specified via --appworld-experiment-name or auto-inferred as {model}/{datasets}.
+    """
+    return get_experiment_name(request.config)
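The `use_few_shot` flag feeds directly into prompt assembly: the next diff makes `load_system_instruction` zero-shot by default and only appends the tutorial intro and demos when the flag is set. Schematically (names mirror `prompts.py`; the bodies here are an illustrative reduction):

```python
def build_instruction(base: str, demos: str, use_few_shot: bool = False) -> str:
    """Illustrative reduction of load_system_instruction's branching."""
    if not use_few_shot:
        return base  # zero-shot: base rules only, no examples
    intro = (
        "\n\nNext, I will show you some worked-out examples "
        "as a tutorial before we proceed with the real task instruction."
    )
    return base + intro + demos  # few-shot: tutorial intro, then demos

assert build_instruction("rules", "<demos>") == "rules"
assert build_instruction("rules", "<demos>", use_few_shot=True).endswith("<demos>")
```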
diff --git a/tests/benchmarks/appworld/prompts.py b/tests/benchmarks/appworld/prompts.py
index aee66f1..1ff3d81 100644
--- a/tests/benchmarks/appworld/prompts.py
+++ b/tests/benchmarks/appworld/prompts.py
@@ -14,15 +14,17 @@
 EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent
 
 
-def load_system_instruction(task: Task) -> str:
+def load_system_instruction(task: Task, use_few_shot: bool = False) -> str:
     """
-    Load and render system instruction from AppWorld's template with demo examples.
+    Load and render system instruction from AppWorld's template.
 
     Args:
         task: AppWorld Task object
+        use_few_shot: If True, include demo examples in prompt. Default is False (zero-shot).
 
     Returns:
-        Rendered system instruction with supervisor info, rules, and demos
+        Rendered system instruction with supervisor info and rules.
+        If use_few_shot=True, also includes worked-out demo examples.
     """
     # Load and render base system instruction template
     template_path = Path(__file__).parent / "system_instruction.txt"
@@ -42,6 +44,17 @@ def load_system_instruction(task: Task) -> str:
         app_descriptions=app_descriptions_yaml,
     )
 
+    # Zero-shot mode: return base instruction as-is
+    if not use_few_shot:
+        return base_instruction
+
+    # Few-shot mode: Add intro line and demo examples
+    examples_intro = (
+        "\n\nNext, I will show you some worked-out examples "
+        "as a tutorial before we proceed with the real task instruction."
+    )
+    base_instruction += examples_intro
+
     # Load demo messages and format them
     demos_path = EXPERIMENTS_PATH / "prompts/function_calling_agent/demos.json"
     demo_messages = read_json(str(demos_path))
diff --git a/tests/benchmarks/appworld/system_instruction.txt b/tests/benchmarks/appworld/system_instruction.txt
index f777a4f..833baf5 100644
--- a/tests/benchmarks/appworld/system_instruction.txt
+++ b/tests/benchmarks/appworld/system_instruction.txt
@@ -45,5 +45,3 @@ When the answer is given:
   E.g., for the song title of the current playing track, return just the title.
 - Numbers must be numeric and not in words. E.g., for the number of songs in the queue, return "10", not "ten".
-
-Next, I will show you some worked-out examples as a tutorial before we proceed with the real task instruction.
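The `test_appworld.py` changes below implement multi-dataset collection: task IDs from each requested dataset are concatenated in order, so `--start-from` and `--limit` apply to the combined list. A standalone sketch (the stub loader and IDs are illustrative; the real `load_task_ids` reads AppWorld data):

```python
def load_task_ids(dataset: str) -> list[str]:
    """Stub standing in for the real AppWorld loader (IDs illustrative)."""
    return {"train": ["82e2fac_1", "692c77d_1"], "dev": ["a1b2c3d_1"]}[dataset]

datasets = ["train", "dev"]  # e.g. parse_datasets("train,dev")
task_ids: list[str] = []
for dataset in datasets:
    task_ids.extend(load_task_ids(dataset))

assert task_ids == ["82e2fac_1", "692c77d_1", "a1b2c3d_1"]
task_ids = task_ids[:2]  # --limit 2 trims the combined list, not per dataset
```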
diff --git a/tests/benchmarks/appworld/test_appworld.py b/tests/benchmarks/appworld/test_appworld.py
index 5c8dfa9..b4d8fbc 100644
--- a/tests/benchmarks/appworld/test_appworld.py
+++ b/tests/benchmarks/appworld/test_appworld.py
@@ -12,6 +12,7 @@ from fast_agent.llm.request_params import RequestParams
 
 from tests.benchmarks.appworld import api_predictor, prompts
+from tests.benchmarks.appworld.conftest import get_experiment_name, parse_datasets
 from tests.benchmarks.appworld.reporting import (
     find_evaluation_report,
     generate_failure_report,
@@ -27,17 +28,16 @@
 
 
 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
-    """Dynamically generate test cases from AppWorld dataset."""
+    """Dynamically generate test cases from AppWorld dataset(s)."""
     if "task_id" not in metafunc.fixturenames:
         return
 
     validate_only = metafunc.config.getoption("--validate-only", False)
 
     if validate_only:
-        # Auto-detect log directory from model/dataset
-        model = metafunc.config.getoption("--model")
-        dataset = metafunc.config.getoption("--dataset", "train")
-        log_dir = Path("results") / model / dataset / "outputs" / "raw"
+        # Auto-detect log directory from experiment_name
+        exp_name = get_experiment_name(metafunc.config)
+        log_dir = Path("results") / exp_name / "outputs" / "raw"
 
         # Find existing log files to validate
         log_files = list(log_dir.glob("*_complete.json")) if log_dir.exists() else []
@@ -47,12 +47,17 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             pytest.exit(
                 f"\nError: No test results found in {log_dir}\n"
                 f"Expected to find *_complete.json files for validation.\n"
-                f"Make sure you've run tests for --model {model} --dataset {dataset} first."
+                f"Run tests first or check --appworld-experiment-name."
             )
     else:
-        # Load task IDs from AppWorld dataset
-        dataset = metafunc.config.getoption("--dataset", "train")
-        task_ids = load_task_ids(dataset)
+        # Load task IDs from AppWorld dataset(s)
+        datasets_str = metafunc.config.getoption("--datasets", "train,dev")
+        datasets = parse_datasets(datasets_str)
+
+        # Collect task IDs from all specified datasets
+        task_ids = []
+        for dataset in datasets:
+            task_ids.extend(load_task_ids(dataset))
 
         # Apply --start-from filter first (before --limit)
         start_from = metafunc.config.getoption("--start-from", None)
@@ -64,10 +69,10 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             except ValueError:
                 # Task ID not found - provide helpful error
                 pytest.exit(
-                    f"\nError: Task ID '{start_from}' not found in {dataset} dataset.\n"
+                    f"\nError: Task ID '{start_from}' not found in datasets {datasets}.\n"
                     f"Available task IDs (first 10): {', '.join(task_ids[:10])}\n"
-                    f"Total tasks in dataset: {len(task_ids)}\n"
-                    f"Use: pytest tests/benchmarks/appworld/test_appworld.py --dataset {dataset} "
+                    f"Total tasks: {len(task_ids)}\n"
+                    f"Use: pytest tests/benchmarks/appworld/test_appworld.py --datasets {datasets_str} "
                     f"--collect-only to see all task IDs."
                 )
 
@@ -92,6 +97,7 @@ async def test_appworld(
     output_dir: Path,
     api_mode: str,
     experiment_name: str,
+    use_few_shot: bool,
     request: pytest.FixtureRequest,
 ) -> None:
     """Run or validate an AppWorld test."""
@@ -99,7 +105,7 @@ async def test_appworld(
 
     # Run test if not in validate-only mode
     if not validate_only:
-        await _run_appworld_test(task_id, model, temperature, output_dir, api_mode, experiment_name)
+        await _run_appworld_test(task_id, model, temperature, output_dir, api_mode, experiment_name, use_few_shot)
 
     # Get complete.json path (always in output_dir/raw now)
     complete_path = output_dir / "raw" / f"{task_id}_complete.json"
@@ -190,6 +196,7 @@ async def _run_appworld_test(
     output_dir: Path,
     api_mode: str,
     experiment_name: str,
+    use_few_shot: bool,
 ) -> None:
     """Run AppWorld test using the provided experiment name."""
 
@@ -213,7 +220,7 @@ async def _run_appworld_test(
     # Create and run FastAgent
     config_path = Path(__file__).parent / "fastagent.config.yaml"
     agent = FastAgent("AppWorld Test", config_path=str(config_path), ignore_unknown_args=True)
-    system_instruction = prompts.load_system_instruction(task)
+    system_instruction = prompts.load_system_instruction(task, use_few_shot=use_few_shot)
 
     @agent.agent(
         name="test_agent",
@@ -279,9 +286,10 @@ def _generate_failure_report_inline(
     # Load complete.json
     complete_data = load_complete_json(output_dir, task_id)
 
-    # Determine output path
-    dataset = request.config.getoption("--dataset", "train")
-    failure_report_dir = Path("results") / model / dataset / "failure_reports"
+    # Derive failure_report_dir from output_dir (same parent directory)
+    # output_dir: results/{model}/{datasets}/outputs
+    # failure_report_dir: results/{model}/{datasets}/failure_reports
+    failure_report_dir = output_dir.parent / "failure_reports"
     failure_report_dir.mkdir(parents=True, exist_ok=True)
 
     # Generate failure report
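A consequence of the final hunk worth noting: because `failure_report_dir` is now derived from `output_dir` rather than rebuilt from CLI options, reports land next to the logs even when a custom `--output-dir` is used. A quick sketch (paths illustrative):

```python
from pathlib import Path

output_dir = Path("results/gpt-4o/train_dev/outputs")  # default layout
assert output_dir.parent / "failure_reports" == Path("results/gpt-4o/train_dev/failure_reports")

custom = Path("/tmp/my-run/outputs")  # a custom --output-dir stays self-contained
assert custom.parent / "failure_reports" == Path("/tmp/my-run/failure_reports")
```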