diff --git a/docs/evals.md b/docs/evals.md
index e80234b..19aac4c 100644
--- a/docs/evals.md
+++ b/docs/evals.md
@@ -90,8 +90,8 @@ Validate existing logs without running new tests:
 
 **AppWorld:**
 
 ```bash
-# Validate logs (auto-detects from results/{model}/{dataset}/)
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
+# Validate logs (auto-detects from results/{model}/{datasets}/)
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
 ```
 
 ### AppWorld Results Organization
@@ -100,7 +100,7 @@ AppWorld tests automatically organize results during execution:
 
 ```bash
 # Run tests - results automatically organized
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --model gpt-4o
 
 # Results automatically written to:
 # - results/gpt-4o/train/outputs/raw/ (conversation logs)
@@ -113,9 +113,10 @@ rm -rf experiments/outputs/gpt-4o/  # Frees ~15GB
 
 **AppWorld-specific options:**
 
 ```bash
---dataset DATASET     # Dataset: train, dev, test_normal, test_challenge (default: train)
+--datasets DATASETS   # Comma-separated datasets (default: train,dev)
 --limit N             # Run only first N tasks from dataset
 --start-from TASK_ID  # Resume from specific task ID
+--default-few-shot    # Include few-shot examples (default: zero-shot)
 ```
 
 ### Parallel Execution
@@ -127,12 +128,39 @@ Run tests in parallel using multiple workers:
 
 .venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n 4
 
 # Run with 8 workers
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 8
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train -n 8
 
 # Auto-detect number of CPUs
 .venv/bin/pytest tests/benchmarks/bfcl/test_bfcl.py -n auto
 ```
 
+### AppWorld Dataset Selection
+
+Run tests on one or more datasets (default: `train,dev`):
+
+```bash
+# Run on default datasets (train,dev)
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py
+
+# Run on specific dataset
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train
+
+# Run on multiple datasets
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train,dev,test_normal
+```
+
+### AppWorld Prompt Mode
+
+By default, AppWorld tests run in **zero-shot mode** (no examples in the prompt). Use `--default-few-shot` to include worked-out examples:
+
+```bash
+# Zero-shot (default) - no examples in prompt
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train
+
+# Few-shot - include demo examples in system prompt
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --default-few-shot
+```
+
 ## Further Reading
 
 ### BFCL
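The `results/{model}/{datasets}/` layout referenced throughout these docs resolves to ordinary directories once the dataset list is joined with underscores. A minimal sketch of the documented tree, assuming `--model gpt-4o --datasets train` (paths illustrative):

```python
from pathlib import Path

# Documented results layout for --model gpt-4o --datasets train (illustrative).
base = Path("results") / "gpt-4o" / "train"
conversation_logs = base / "outputs" / "raw"   # per-task *_complete.json logs
failure_reports = base / "failure_reports"     # generated for failing tasks
print(conversation_logs)
print(failure_reports)
```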
diff --git a/tests/README.md b/tests/README.md
index 481edce..538271e 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -99,14 +99,17 @@ UV_GIT_LFS=1 uv pip install -e ".[dev,evals]"
 appworld install
 appworld download data
 
-# Run all train tasks (results automatically organized to results/{model}/{dataset}/)
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --dataset train --model gpt-4o
+# Run default datasets (train,dev) - results organized to results/{model}/{datasets}/
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --model gpt-4o
+
+# Run specific dataset
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --datasets train --model gpt-4o
 
 # Run specific task
 .venv/bin/pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[train_001]'
 
-# Validate existing results (auto-detects from results/{model}/{dataset}/)
-.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
+# Validate existing results (auto-detects from results/{model}/{datasets}/)
+.venv/bin/pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
 ```
 
 ### 5. Smoke Tests (`smoke/`)
@@ -237,11 +240,12 @@ Registered markers:
 
 **AppWorld-specific options:**
 
 ```bash
---dataset DATASET     # Dataset: train, dev, test_normal, test_challenge (default: train)
---limit N             # Run only first N tasks from dataset
---api-mode MODE       # API prediction mode (default: app_oracle)
---experiment-dir DIR  # Custom experiment directory name
---start-from TASK_ID  # Resume from specific task ID
+--datasets DATASETS              # Comma-separated datasets (default: train,dev)
+--limit N                        # Run only first N tasks from dataset
+--api-mode MODE                  # API prediction mode (default: app_oracle)
+--start-from TASK_ID             # Resume from specific task ID
+--default-few-shot               # Include few-shot examples (default: zero-shot)
+--appworld-experiment-name NAME  # Custom experiment name (default: {model}/{datasets})
 ```
 
 ## Common Commands
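As the validation commands above suggest, `--validate-only` does not re-run any tasks; it only needs the `*_complete.json` files from a previous run. A standalone sketch of that discovery step (directory names illustrative; the real logic lives in `test_appworld.py` at the end of this diff):

```python
from pathlib import Path

# Discover prior results the way --validate-only does (paths illustrative).
log_dir = Path("results") / "gpt-4o" / "train" / "outputs" / "raw"
log_files = sorted(log_dir.glob("*_complete.json")) if log_dir.exists() else []
task_ids = [f.name.removesuffix("_complete.json") for f in log_files]
print(f"{len(task_ids)} completed task(s) available for validation")
```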
diff --git a/tests/benchmarks/appworld/README.md b/tests/benchmarks/appworld/README.md
index 7229206..64b2ee9 100644
--- a/tests/benchmarks/appworld/README.md
+++ b/tests/benchmarks/appworld/README.md
@@ -16,27 +16,32 @@ appworld download data
 
 ## Run Tests
 
 ```bash
-# Run first task from train dataset (automatically organized to results/gpt-4o/train/)
-pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 1 --model gpt-4o
+# Run first task (uses default: train,dev datasets, organized to results/gpt-4o/train_dev/)
+pytest tests/benchmarks/appworld/test_appworld.py --limit 1 --model gpt-4o
 
-# Run first 5 train tasks
-pytest tests/benchmarks/appworld/test_appworld.py --dataset train --limit 5 --model gpt-4o
+# Run only train dataset
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train --limit 5 --model gpt-4o
 
-# Run all dev tasks
-pytest tests/benchmarks/appworld/test_appworld.py --dataset dev --model gpt-4o
+# Run only dev dataset
+pytest tests/benchmarks/appworld/test_appworld.py --datasets dev --model gpt-4o
+
+# Run multiple datasets
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train,dev,test_normal
 
 # Run specific task (use actual task IDs like 82e2fac_1, not train_001)
 pytest 'tests/benchmarks/appworld/test_appworld.py::test_appworld[82e2fac_1]'
 
 # Validate existing results (auto-detects from results/gpt-4o/train/)
-pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --dataset train
+pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o --datasets train
 ```
 
 ## CLI Options
 
 ### Dataset Selection
 
 ```bash
---dataset {train,dev,test_normal,test_challenge}  # Default: train
+--datasets train                  # Single dataset
+--datasets train,dev              # Multiple datasets (default)
+--datasets train,dev,test_normal  # Run train, dev, and test_normal
 ```
 
 - `train`: Training dataset (90 tasks)
 - `dev`: Development/validation dataset
@@ -54,6 +59,15 @@ pytest tests/benchmarks/appworld/test_appworld.py --validate-only --model gpt-4o
 --temperature 0.001  # Temperature for sampling (default: 0.001)
 ```
 
+### Prompt Mode
+
+```bash
+# Default: zero-shot (no examples in prompt)
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train
+
+# Few-shot: include worked-out examples in system prompt
+pytest tests/benchmarks/appworld/test_appworld.py --datasets train --default-few-shot
+```
+
 ### Parallel Execution
 
 ```bash
 -n 4     # Run with 4 workers
 -n auto  # Auto-detect number of CPUs
 ```
 
-Example: `pytest tests/benchmarks/appworld/test_appworld.py --dataset train -n 4`
+Example: `pytest tests/benchmarks/appworld/test_appworld.py --datasets train -n 4`
 
 ## File Structure
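The conftest changes below replace the single-choice `--dataset` option with a comma-separated `--datasets` string. The intended end-to-end flow, reduced to a few lines (values illustrative; the real helpers are `parse_datasets`, `get_datasets_dir`, and `get_experiment_name` in the next diff):

```python
# Reduced sketch of the --datasets flow (illustrative values).
VALID = {"train", "dev", "test_normal", "test_challenge"}

datasets_str = "train,dev"  # raw --datasets value from the CLI
datasets = [d.strip() for d in datasets_str.split(",") if d.strip()]
assert not set(datasets) - VALID            # unknown names are rejected
datasets_dir = "_".join(datasets)           # "train_dev": directory-safe name
experiment_name = f"gpt-4o/{datasets_dir}"  # default {model}/{datasets} naming
assert experiment_name == "gpt-4o/train_dev"
```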
diff --git a/tests/benchmarks/appworld/conftest.py b/tests/benchmarks/appworld/conftest.py
index bd1bee5..f519db4 100644
--- a/tests/benchmarks/appworld/conftest.py
+++ b/tests/benchmarks/appworld/conftest.py
@@ -4,19 +4,26 @@ import pytest
 
+VALID_DATASETS = {"train", "dev", "test_normal", "test_challenge"}
+
 @pytest.fixture
 def output_dir(request: pytest.FixtureRequest) -> Path:
-    """AppWorld-specific output directory.
+    """AppWorld output directory.
 
-    Overrides the global output_dir fixture to write directly to
-    results/{model}/{dataset}/outputs/ for organized storage.
+    Uses --output-dir if specified, otherwise auto-infers as
+    results/{experiment_name}/outputs/
     """
-    model = str(request.config.getoption("--model"))
-    dataset = str(request.config.getoption("--dataset"))
+    output_dir_opt = str(request.config.getoption("--output-dir"))
+
+    # If not default "outputs", use the specified path directly
+    if output_dir_opt != "outputs":
+        path = Path(output_dir_opt)
+    else:
+        # Use experiment_name for consistent directory structure
+        exp_name = get_experiment_name(request.config)
+        path = Path("results") / exp_name / "outputs"
 
-    # Write directly to results directory
-    path = Path("results") / model / dataset / "outputs"
     path.mkdir(parents=True, exist_ok=True)
     return path
 
@@ -24,10 +31,9 @@
 def pytest_addoption(parser: pytest.Parser) -> None:
     """Add AppWorld-specific CLI options."""
     parser.addoption(
-        "--dataset",
-        default="train",
-        choices=["train", "dev", "test_normal", "test_challenge"],
-        help="AppWorld dataset to use (default: train)",
+        "--datasets",
+        default="train,dev",
+        help="Comma-separated AppWorld datasets: train,dev,test_normal,test_challenge (default: train,dev)",
     )
     parser.addoption(
         "--limit",
@@ -45,30 +51,61 @@ def pytest_addoption(parser: pytest.Parser) -> None:
         ),
     )
     parser.addoption(
-        "--experiment-dir",
+        "--start-from",
         default=None,
         type=str,
         help=(
-            "Experiment directory name (e.g., 'gpt-5/train' or 'claude-sonnet-4-5/dev'). "
-            "If not specified, auto-generates timestamp-based name. "
-            "Results will be saved to experiments/outputs/{experiment-dir}/"
+            "Start from specified task_id (skip all tests before it). "
+            "Example: --start-from 692c77d_1. Useful for resuming interrupted benchmark runs."
         ),
     )
     parser.addoption(
-        "--start-from",
+        "--default-few-shot",
+        action="store_true",
+        default=False,
+        help="Include few-shot examples in system prompt (default: zero-shot, no examples)",
+    )
+    parser.addoption(
+        "--appworld-experiment-name",
         default=None,
         type=str,
-        help=(
-            "Start from specified task_id (skip all tests before it). "
-            "Example: --start-from 692c77d_1. Useful for resuming interrupted benchmark runs."
-        ),
+        help="Experiment name for AppWorld data (default: auto-inferred as {model}/{datasets})",
     )
 
 
+def parse_datasets(datasets_str: str) -> list[str]:
+    """Parse comma-separated datasets string and validate."""
+    datasets = [d.strip() for d in datasets_str.split(",") if d.strip()]
+    invalid = set(datasets) - VALID_DATASETS
+    if invalid:
+        raise ValueError(f"Invalid datasets: {invalid}. Valid options: {VALID_DATASETS}")
+    return datasets
+
+
+def get_datasets_dir(datasets_str: str) -> str:
+    """Parse datasets and return underscore-joined directory name."""
+    datasets = parse_datasets(datasets_str)
+    return "_".join(datasets)
+
+
+def get_experiment_name(config: pytest.Config) -> str:
+    """Get experiment name from config (helper for use outside fixtures)."""
+    name = config.getoption("--appworld-experiment-name", None)
+    if name:
+        return str(name)
+
+    # Auto-infer from model/datasets
+    model = str(config.getoption("--model"))
+    datasets_str = str(config.getoption("--datasets"))
+    datasets_dir = get_datasets_dir(datasets_str)
+    return f"{model}/{datasets_dir}"
+
+
 @pytest.fixture
-def appworld_dataset(request: pytest.FixtureRequest) -> str:
-    """Get the AppWorld dataset name from CLI."""
-    return str(request.config.getoption("--dataset"))
+def appworld_datasets(request: pytest.FixtureRequest) -> list[str]:
+    """Get the AppWorld dataset names from CLI."""
+    datasets_str = str(request.config.getoption("--datasets"))
+    return parse_datasets(datasets_str)
 
 
 @pytest.fixture
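The helpers above are pure functions, so their behavior is easy to sanity-check; for example (import path as `test_appworld.py` uses later in this diff):

```python
from tests.benchmarks.appworld.conftest import get_datasets_dir, parse_datasets

assert parse_datasets(" train , dev ") == ["train", "dev"]  # whitespace tolerated
assert get_datasets_dir("train,dev") == "train_dev"         # underscore-joined
# parse_datasets("train,prod") raises ValueError, so a typo in --datasets
# fails collection loudly instead of silently selecting nothing.
```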
@@ -92,25 +129,25 @@ def api_mode(request: pytest.FixtureRequest) -> str:
     return str(request.config.getoption("--api-mode"))
 
 
-@pytest.fixture(scope="session")
-def experiment_name(request: pytest.FixtureRequest) -> str:
+@pytest.fixture
+def use_few_shot(request: pytest.FixtureRequest) -> bool:
     """
-    Get or generate experiment directory name for the test session.
-
-    All tests in this session will write to the same experiment directory,
-    organized by task_id in subdirectories: experiments/outputs/{experiment_name}/tasks/{task_id}/
+    Get few-shot mode from CLI.
 
-    Automatically uses {model}/{dataset} pattern for organized experiment tracking.
+    Returns:
+        True if --default-few-shot flag is set (include examples in prompt)
+        False by default (zero-shot, no examples)
     """
+    return bool(request.config.getoption("--default-few-shot"))
 
-    experiment_dir = request.config.getoption("--experiment-dir", None)
-    if experiment_dir:
-        # Use specified experiment directory
-        return str(experiment_dir)
-    else:
-        # Use model/dataset pattern for organized experiment tracking
-        # This works for both normal runs and validation
-        model = str(request.config.getoption("--model"))
-        dataset = str(request.config.getoption("--dataset"))
-        return f"{model}/{dataset}"
 
+@pytest.fixture(scope="session")
+def experiment_name(request: pytest.FixtureRequest) -> str:
+    """
+    Experiment name for AppWorld evaluation data.
+
+    AppWorld saves to: experiments/outputs/{experiment_name}/tasks/{task_id}/
+    Results saved to: results/{experiment_name}/outputs/
+    Can be specified via --appworld-experiment-name or auto-inferred as {model}/{datasets}.
+    """
+    return get_experiment_name(request.config)
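The `use_few_shot` flag feeds directly into prompt assembly: the next diff makes `load_system_instruction` zero-shot by default and only appends the tutorial intro and demos when the flag is set. Schematically (names mirror `prompts.py`; the bodies here are an illustrative reduction):

```python
def build_instruction(base: str, demos: str, use_few_shot: bool = False) -> str:
    """Illustrative reduction of load_system_instruction's branching."""
    if not use_few_shot:
        return base  # zero-shot: base rules only, no examples
    intro = (
        "\n\nNext, I will show you some worked-out examples "
        "as a tutorial before we proceed with the real task instruction."
    )
    return base + intro + demos  # few-shot: tutorial intro, then demos

assert build_instruction("rules", "<demos>") == "rules"
assert build_instruction("rules", "<demos>", use_few_shot=True).endswith("<demos>")
```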
diff --git a/tests/benchmarks/appworld/prompts.py b/tests/benchmarks/appworld/prompts.py
index aee66f1..1ff3d81 100644
--- a/tests/benchmarks/appworld/prompts.py
+++ b/tests/benchmarks/appworld/prompts.py
@@ -14,15 +14,17 @@
 EXPERIMENTS_PATH = Path(appworld_experiments.__file__).parent
 
 
-def load_system_instruction(task: Task) -> str:
+def load_system_instruction(task: Task, use_few_shot: bool = False) -> str:
     """
-    Load and render system instruction from AppWorld's template with demo examples.
+    Load and render system instruction from AppWorld's template.
 
     Args:
         task: AppWorld Task object
+        use_few_shot: If True, include demo examples in prompt. Default is False (zero-shot).
 
     Returns:
-        Rendered system instruction with supervisor info, rules, and demos
+        Rendered system instruction with supervisor info and rules.
+        If use_few_shot=True, also includes worked-out demo examples.
     """
     # Load and render base system instruction template
     template_path = Path(__file__).parent / "system_instruction.txt"
@@ -42,6 +44,17 @@ def load_system_instruction(task: Task) -> str:
         app_descriptions=app_descriptions_yaml,
     )
 
+    # Zero-shot mode: return base instruction as-is
+    if not use_few_shot:
+        return base_instruction
+
+    # Few-shot mode: Add intro line and demo examples
+    examples_intro = (
+        "\n\nNext, I will show you some worked-out examples "
+        "as a tutorial before we proceed with the real task instruction."
+    )
+    base_instruction += examples_intro
+
     # Load demo messages and format them
     demos_path = EXPERIMENTS_PATH / "prompts/function_calling_agent/demos.json"
     demo_messages = read_json(str(demos_path))
diff --git a/tests/benchmarks/appworld/system_instruction.txt b/tests/benchmarks/appworld/system_instruction.txt
index f777a4f..833baf5 100644
--- a/tests/benchmarks/appworld/system_instruction.txt
+++ b/tests/benchmarks/appworld/system_instruction.txt
@@ -45,5 +45,3 @@ When the answer is given:
   E.g., for the song title of the current playing track, return just the title.
 - Numbers must be numeric and not in words. E.g., for the number of songs in the queue, return "10", not "ten".
-
-Next, I will show you some worked-out examples as a tutorial before we proceed with the real task instruction.
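The `test_appworld.py` changes below implement multi-dataset collection: task IDs from each requested dataset are concatenated in order, so `--start-from` and `--limit` apply to the combined list. A standalone sketch (the stub loader and IDs are illustrative; the real `load_task_ids` reads AppWorld data):

```python
def load_task_ids(dataset: str) -> list[str]:
    """Stub standing in for the real AppWorld loader (IDs illustrative)."""
    return {"train": ["82e2fac_1", "692c77d_1"], "dev": ["a1b2c3d_1"]}[dataset]

datasets = ["train", "dev"]  # e.g. parse_datasets("train,dev")
task_ids: list[str] = []
for dataset in datasets:
    task_ids.extend(load_task_ids(dataset))

assert task_ids == ["82e2fac_1", "692c77d_1", "a1b2c3d_1"]
task_ids = task_ids[:2]  # --limit 2 trims the combined list, not per dataset
```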
diff --git a/tests/benchmarks/appworld/test_appworld.py b/tests/benchmarks/appworld/test_appworld.py
index 5c8dfa9..b4d8fbc 100644
--- a/tests/benchmarks/appworld/test_appworld.py
+++ b/tests/benchmarks/appworld/test_appworld.py
@@ -12,6 +12,7 @@ from fast_agent.llm.request_params import RequestParams
 
 from tests.benchmarks.appworld import api_predictor, prompts
+from tests.benchmarks.appworld.conftest import get_experiment_name, parse_datasets
 from tests.benchmarks.appworld.reporting import (
     find_evaluation_report,
     generate_failure_report,
@@ -27,17 +28,16 @@
 
 
 def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
-    """Dynamically generate test cases from AppWorld dataset."""
+    """Dynamically generate test cases from AppWorld dataset(s)."""
     if "task_id" not in metafunc.fixturenames:
         return
 
     validate_only = metafunc.config.getoption("--validate-only", False)
 
     if validate_only:
-        # Auto-detect log directory from model/dataset
-        model = metafunc.config.getoption("--model")
-        dataset = metafunc.config.getoption("--dataset", "train")
-        log_dir = Path("results") / model / dataset / "outputs" / "raw"
+        # Auto-detect log directory from experiment_name
+        exp_name = get_experiment_name(metafunc.config)
+        log_dir = Path("results") / exp_name / "outputs" / "raw"
 
         # Find existing log files to validate
         log_files = list(log_dir.glob("*_complete.json")) if log_dir.exists() else []
@@ -47,12 +47,17 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             pytest.exit(
                 f"\nError: No test results found in {log_dir}\n"
                 f"Expected to find *_complete.json files for validation.\n"
-                f"Make sure you've run tests for --model {model} --dataset {dataset} first."
+                f"Run tests first or check --appworld-experiment-name."
             )
     else:
-        # Load task IDs from AppWorld dataset
-        dataset = metafunc.config.getoption("--dataset", "train")
-        task_ids = load_task_ids(dataset)
+        # Load task IDs from AppWorld dataset(s)
+        datasets_str = metafunc.config.getoption("--datasets", "train,dev")
+        datasets = parse_datasets(datasets_str)
+
+        # Collect task IDs from all specified datasets
+        task_ids = []
+        for dataset in datasets:
+            task_ids.extend(load_task_ids(dataset))
 
         # Apply --start-from filter first (before --limit)
         start_from = metafunc.config.getoption("--start-from", None)
@@ -64,10 +69,10 @@ def pytest_generate_tests(metafunc: pytest.Metafunc) -> None:
             except ValueError:
                 # Task ID not found - provide helpful error
                 pytest.exit(
-                    f"\nError: Task ID '{start_from}' not found in {dataset} dataset.\n"
+                    f"\nError: Task ID '{start_from}' not found in datasets {datasets}.\n"
                     f"Available task IDs (first 10): {', '.join(task_ids[:10])}\n"
-                    f"Total tasks in dataset: {len(task_ids)}\n"
-                    f"Use: pytest tests/benchmarks/appworld/test_appworld.py --dataset {dataset} "
+                    f"Total tasks: {len(task_ids)}\n"
+                    f"Use: pytest tests/benchmarks/appworld/test_appworld.py --datasets {datasets_str} "
                     f"--collect-only to see all task IDs."
                 )
 
@@ -92,6 +97,7 @@ async def test_appworld(
     output_dir: Path,
     api_mode: str,
     experiment_name: str,
+    use_few_shot: bool,
     request: pytest.FixtureRequest,
 ) -> None:
     """Run or validate an AppWorld test."""
@@ -99,7 +105,7 @@ async def test_appworld(
 
     # Run test if not in validate-only mode
     if not validate_only:
-        await _run_appworld_test(task_id, model, temperature, output_dir, api_mode, experiment_name)
+        await _run_appworld_test(task_id, model, temperature, output_dir, api_mode, experiment_name, use_few_shot)
 
     # Get complete.json path (always in output_dir/raw now)
     complete_path = output_dir / "raw" / f"{task_id}_complete.json"
@@ -190,6 +196,7 @@ async def _run_appworld_test(
     output_dir: Path,
     api_mode: str,
     experiment_name: str,
+    use_few_shot: bool,
 ) -> None:
     """Run AppWorld test using the provided experiment name."""
 
@@ -213,7 +220,7 @@ async def _run_appworld_test(
     # Create and run FastAgent
     config_path = Path(__file__).parent / "fastagent.config.yaml"
     agent = FastAgent("AppWorld Test", config_path=str(config_path), ignore_unknown_args=True)
-    system_instruction = prompts.load_system_instruction(task)
+    system_instruction = prompts.load_system_instruction(task, use_few_shot=use_few_shot)
 
     @agent.agent(
         name="test_agent",
@@ -279,9 +286,10 @@ def _generate_failure_report_inline(
     # Load complete.json
     complete_data = load_complete_json(output_dir, task_id)
 
-    # Determine output path
-    dataset = request.config.getoption("--dataset", "train")
-    failure_report_dir = Path("results") / model / dataset / "failure_reports"
+    # Derive failure_report_dir from output_dir (same parent directory)
+    # output_dir: results/{model}/{datasets}/outputs
+    # failure_report_dir: results/{model}/{datasets}/failure_reports
+    failure_report_dir = output_dir.parent / "failure_reports"
     failure_report_dir.mkdir(parents=True, exist_ok=True)
 
     # Generate failure report
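A consequence of the final hunk worth noting: because `failure_report_dir` is now derived from `output_dir` rather than rebuilt from CLI options, reports land next to the logs even when a custom `--output-dir` is used. A quick sketch (paths illustrative):

```python
from pathlib import Path

output_dir = Path("results/gpt-4o/train_dev/outputs")  # default layout
assert output_dir.parent / "failure_reports" == Path("results/gpt-4o/train_dev/failure_reports")

custom = Path("/tmp/my-run/outputs")  # a custom --output-dir stays self-contained
assert custom.parent / "failure_reports" == Path("/tmp/my-run/failure_reports")
```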