38 changes: 33 additions & 5 deletions docs/evals.md
@@ -90,8 +90,8 @@ Validate existing logs without running new tests:

**AppWorld:**
```bash
# Validate logs (auto-detects from results/{model}/{dataset}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
# Validate logs (auto-detects from results/{model}/{datasets}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
```

### AppWorld Results Organization
@@ -100,7 +100,7 @@ AppWorld tests automatically organize results during execution:

```bash
# Run tests - results automatically organized
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --model gpt-4o

# Results automatically written to:
# - results/gpt-4o/train/outputs/raw/ (conversation logs)
@@ -113,9 +113,10 @@ rm -rf experiments/outputs/gpt-4o/  # Frees ~15GB

**AppWorld-specific options:**
```bash
--dataset DATASET # Dataset: train, dev, test_normal, test_challenge (default: train)
--datasets DATASETS # Comma-separated datasets (default: train,dev)
--limit N # Run only first N tasks from dataset
--start-from TASK_ID # Resume from specific task ID
--default-few-shot # Include few-shot examples (default: zero-shot)
```
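
For illustration only (this example is not part of the diff), a run combining several of the flags above might look like the sketch below; the model name and task ID are placeholder assumptions:

```bash
# Resume an interrupted few-shot run on the dev split
# (model name and task ID are illustrative placeholders)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py \
  --datasets dev \
  --model gpt-4o \
  --start-from 692c77d_1 \
  --default-few-shot
```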

### Parallel Execution
@@ -127,12 +128,39 @@ Run tests in parallel using multiple workers:
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n 4

# Run with 8 workers
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 8
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train -n 8

# Auto-detect number of CPUs
.venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n auto
```

### AppWorld Dataset Selection

Run tests on one or more datasets (default: `train,dev`):

```bash
# Run on default datasets (train,dev)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py

# Run on specific dataset
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train

# Run on multiple datasets
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train,dev,test_normal
```

### AppWorld Prompt Mode

By default, AppWorld tests run in **zero-shot mode** (no examples in prompt). Use `--default-few-shot` to include worked-out examples:

```bash
# Zero-shot (default) - no examples in prompt
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train

# Few-shot - include demo examples in system prompt
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --default-few-shot
```

## Further Reading

### BFCL
22 changes: 13 additions & 9 deletions tests/README.md
@@ -99,14 +99,17 @@ UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
appworld install
appworld download data

# Run all train tasks (results automatically organized to results/{model}/{dataset}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o
# Run default datasets (train,dev) - results organized to results/{model}/{datasets}/
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --model gpt-4o

# Run specific dataset
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --model gpt-4o

# Run specific task
.venv/bin/pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[train_001]'

# Validate existing results (auto-detects from results/{model}/{dataset}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
# Validate existing results (auto-detects from results/{model}/{datasets}/)
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
```

### 5. Smoke Tests (`smoke/`)
@@ -237,11 +240,12 @@ Registered markers:

**AppWorld-specific options:**
```bash
--dataset DATASET # Dataset: train, dev, test_normal, test_challenge (default: train)
--limit N # Run only first N tasks from dataset
--api-mode MODE # API prediction mode (default: app_oracle)
--experiment-dir DIR # Custom experiment directory name
--start-from TASK_ID # Resume from specific task ID
--datasets DATASETS # Comma-separated datasets (default: train,dev)
--limit N # Run only first N tasks from dataset
--api-mode MODE # API prediction mode (default: app_oracle)
--start-from TASK_ID # Resume from specific task ID
--default-few-shot # Include few-shot examples (default: zero-shot)
--appworld-experiment-name NAME # Custom experiment name (default: {model}/{datasets})
```
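
As a sketch (not part of the diff), the new options can be combined in a single invocation; the experiment name below is an arbitrary illustrative value:

```bash
# Run train+dev under a custom experiment name, keeping the default app_oracle API mode
.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py \
  --model gpt-4o \
  --datasets train,dev \
  --api-mode app_oracle \
  --appworld-experiment-name my-baseline-run
```

With the conftest changes in this PR, results for such a run would land under `results/my-baseline-run/outputs/` instead of the auto-inferred `results/gpt-4o/train_dev/outputs/`.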

## Common Commands
32 changes: 23 additions & 9 deletions tests/benchmarks/appworld/README.md
@@ -16,27 +16,32 @@ appworld download data
## Run Tests

```bash
# Run first task from train dataset (automatically organized to results/gpt-4o/train/)
pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 1 --model gpt-4o
# Run first task (uses default: train,dev datasets, organized to results/gpt-4o/train_dev/)
pytest tests/benchmarks/appworld/test_appworld.py --limit 1 --model gpt-4o

# Run first 5 train tasks
pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 5 --model gpt-4o
# Run only train dataset
pytest tests/benchmarks/appworld/test_appworld.py --datasets train --limit 5 --model gpt-4o

# Run all dev tasks
pytest tests/benchmarks/appworld/test_appworld.py --dataset dev --model gpt-4o
# Run only dev dataset
pytest tests/benchmarks/appworld/test_appworld.py --datasets dev --model gpt-4o

# Run multiple datasets
pytest tests/benchmarks/appworld/test_appworld.py --datasets train,dev,test_normal

# Run specific task (use actual task IDs like 82e2fac_1, not train_001)
pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[82e2fac_1]'

# Validate existing results (auto-detects from results/gpt-4o/train/)
pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
```

## CLI Options

### Dataset Selection
```bash
--dataset {train,dev,test_normal,test_challenge} # Default: train
--datasets train # Single dataset
--datasets train,dev # Multiple datasets (default)
--datasets train,dev,test_normal # Run train, dev, and test_normal
```
- `train`: Training dataset (90 tasks)
- `dev`: Development/validation dataset
@@ -54,14 +59,23 @@ pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o
--temperature 0.001 # Temperature for sampling (default: 0.001)
```
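
As an illustration (not part of the diff), the sampling temperature can be combined with the dataset flags; the value here is arbitrary:

```bash
# Evaluate the dev split with a non-default sampling temperature (0.2 is an arbitrary example)
pytest tests/benchmarks/appworld/test_appworld.py --datasets dev --model gpt-4o --temperature 0.2
```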

### Prompt Mode
```bash
# Default: zero-shot (no examples in prompt)
pytest tests/benchmarks/appworld/test_appworld.py --datasets train

# Few-shot: include worked-out examples in system prompt
pytest tests/benchmarks/appworld/test_appworld.py --datasets train --default-few-shot
```

### Parallel Execution
```bash
-n 4 # Run with 4 workers
-n 8 # Run with 8 workers
-n auto # Auto-detect number of CPUs
```

Example: `pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 4`
Example: `pytest tests/benchmarks/appworld/test_appworld.py --datasets train -n 4`

## File Structure

117 changes: 77 additions & 40 deletions tests/benchmarks/appworld/conftest.py
@@ -4,30 +4,36 @@

import pytest

VALID_DATASETS = {"train", "dev", "test_normal", "test_challenge"}


@pytest.fixture
def output_dir(request: pytest.FixtureRequest) -> Path:
"""AppWorld-specific output directory.
"""AppWorld output directory.

Overrides the global output_dir fixture to write directly to
results/{model}/{dataset}/outputs/ for organized storage.
Uses --output-dir if specified, otherwise auto-infers as
results/{experiment_name}/outputs/
"""
model = str(request.config.getoption("--model"))
dataset = str(request.config.getoption("--dataset"))
output_dir_opt = str(request.config.getoption("--output-dir"))

# If not default "outputs", use the specified path directly
if output_dir_opt != "outputs":
path = Path(output_dir_opt)
else:
# Use experiment_name for consistent directory structure
exp_name = get_experiment_name(request.config)
path = Path("results") / exp_name / "outputs"

# Write directly to results directory
path = Path("results") / model / dataset / "outputs"
path.mkdir(parents=True, exist_ok=True)
return path


def pytest_addoption(parser: pytest.Parser) -> None:
"""Add AppWorld-specific CLI options."""
parser.addoption(
"--dataset",
default="train",
choices=["train", "dev", "test_normal", "test_challenge"],
help="AppWorld dataset to use (default: train)",
"--datasets",
default="train,dev",
help="Comma-separated AppWorld datasets: train,dev,test_normal,test_challenge (default: train,dev)",
)
parser.addoption(
"--limit",
@@ -45,30 +51,61 @@ def pytest_addoption(parser: pytest.Parser) -> None:
),
)
parser.addoption(
"--experiment-dir",
"--start-from",
default=None,
type=str,
help=(
"Experiment directory name (e.g., 'gpt-5/train' or 'claude-sonnet-4-5/dev'). "
"If not specified, auto-generates timestamp-based name. "
"Results will be saved to experiments/outputs/{experiment-dir}/"
"Start from specified task_id (skip all tests before it). "
"Example: --start-from 692c77d_1. Useful for resuming interrupted benchmark runs."
),
)
parser.addoption(
"--start-from",
"--default-few-shot",
action="store_true",
default=False,
help="Include few-shot examples in system prompt (default: zero-shot, no examples)",
)
parser.addoption(
"--appworld-experiment-name",
default=None,
type=str,
help=(
"Start from specified task_id (skip all tests before it). "
"Example: --start-from 692c77d_1. Useful for resuming interrupted benchmark runs."
),
help="Experiment name for AppWorld data (default: auto-inferred as {model}/{datasets})",
)


def parse_datasets(datasets_str: str) -> list[str]:
"""Parse comma-separated datasets string and validate."""
datasets = [d.strip() for d in datasets_str.split(",") if d.strip()]
invalid = set(datasets) - VALID_DATASETS
if invalid:
raise ValueError(f"Invalid datasets: {invalid}. Valid options: {VALID_DATASETS}")
return datasets


def get_datasets_dir(datasets_str: str) -> str:
"""Parse datasets and return underscore-joined directory name."""
datasets = parse_datasets(datasets_str)
return "_".join(datasets)


def get_experiment_name(config: pytest.Config) -> str:
"""Get experiment name from config (helper for use outside fixtures)."""
name = config.getoption("--appworld-experiment-name", None)
if name:
return str(name)

# Auto-infer from model/datasets
model = str(config.getoption("--model"))
datasets_str = str(config.getoption("--datasets"))
datasets_dir = get_datasets_dir(datasets_str)
return f"{model}/{datasets_dir}"


@pytest.fixture
def appworld_dataset(request: pytest.FixtureRequest) -> str:
"""Get the AppWorld dataset name from CLI."""
return str(request.config.getoption("--dataset"))
def appworld_datasets(request: pytest.FixtureRequest) -> list[str]:
"""Get the AppWorld dataset names from CLI."""
datasets_str = str(request.config.getoption("--datasets"))
return parse_datasets(datasets_str)


@pytest.fixture
@@ -92,25 +129,25 @@ def api_mode(request: pytest.FixtureRequest) -> str:
return str(request.config.getoption("--api-mode"))


@pytest.fixture(scope="session")
def experiment_name(request: pytest.FixtureRequest) -> str:
@pytest.fixture
def use_few_shot(request: pytest.FixtureRequest) -> bool:
"""
Get or generate experiment directory name for the test session.

All tests in this session will write to the same experiment directory,
organized by task_id in subdirectories: experiments/outputs/{experiment_name}/tasks/{task_id}/
Get few-shot mode from CLI.

Automatically uses {model}/{dataset} pattern for organized experiment tracking.
Returns:
True if --default-few-shot flag is set (include examples in prompt)
False by default (zero-shot, no examples)
"""
return bool(request.config.getoption("--default-few-shot"))

experiment_dir = request.config.getoption("--experiment-dir", None)

if experiment_dir:
# Use specified experiment directory
return str(experiment_dir)
else:
# Use model/dataset pattern for organized experiment tracking
# This works for both normal runs and validation
model = str(request.config.getoption("--model"))
dataset = str(request.config.getoption("--dataset"))
return f"{model}/{dataset}"
@pytest.fixture(scope="session")
def experiment_name(request: pytest.FixtureRequest) -> str:
"""
Experiment name for AppWorld evaluation data.

AppWorld saves to: experiments/outputs/{experiment_name}/tasks/{task_id}/
Results saved to: results/{experiment_name}/outputs/
Can be specified via --appworld-experiment-name or auto-inferred as {model}/{datasets}.
"""
return get_experiment_name(request.config)
19 changes: 16 additions & 3 deletions tests/benchmarks/appworld/prompts.py
@@ -14,15 +14,17 @@
EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent


def load_system_instruction(task: Task) -> str:
def load_system_instruction(task: Task, use_few_shot: bool = False) -> str:
"""
Load and render system instruction from AppWorld's template with demo examples.
Load and render system instruction from AppWorld's template.

Args:
task: AppWorld Task object
use_few_shot: If True, include demo examples in prompt. Default is False (zero-shot).

Returns:
Rendered system instruction with supervisor info, rules, and demos
Rendered system instruction with supervisor info and rules.
If use_few_shot=True, also includes worked-out demo examples.
"""
# Load and render base system instruction template
template_path = Path(__file__).parent / "system_instruction.txt"
@@ -42,6 +44,17 @@ def load_system_instruction(task: Task) -> str:
app_descriptions=app_descriptions_yaml,
)

# Zero-shot mode: return base instruction as-is
if not use_few_shot:
return base_instruction

# Few-shot mode: Add intro line and demo examples
examples_intro = (
"\n\nNext, I will show you some worked-out examples "
"as a tutorial before we proceed with the real task instruction."
)
base_instruction += examples_intro

# Load demo messages and format them
demos_path = EXPERIMENTS_PATH / "prompts/function_calling_agent/demos.json"
demo_messages = read_json(str(demos_path))
2 changes: 0 additions & 2 deletions tests/benchmarks/appworld/system_instruction.txt
@@ -45,5 +45,3 @@ When the answer is given:
E.g., for the song title of the current playing track, return just the title.
- Numbers must be numeric and not in words.
E.g., for the number of songs in the queue, return "10", not "ten".

Next, I will show you some worked-out examples as a tutorial before we proceed with the real task instruction.