From eb4eeaa831e5fbee4ca3995f2f90c416de913afd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= <seb@redhat.com>
Date: Wed, 25 Mar 2026 15:32:44 +0100
Subject: [PATCH 1/2] feat: remove deprecated Eval API and related APIs
 (datasets, datasetio, scoring, scoring_functions, benchmarks)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove the Eval API and all connected APIs that were already marked as
deprecated in the spec. This includes the datasets, datasetio, scoring,
scoring_functions, and benchmarks APIs along with all their provider
implementations, routing tables, routers, registry entries, distribution
configs, and tests. The OpenAPI spec files under client-sdks/stainless
and docs/static are updated accordingly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Sébastien Han <seb@redhat.com>
---
 client-sdks/stainless/openapi.yml             | 1998 +---------
 docs/static/deprecated-llama-stack-spec.yaml  | 1555 +-------
 .../static/experimental-llama-stack-spec.yaml | 1608 +-------
 docs/static/llama-stack-spec.yaml             | 1376 +------
 docs/static/stainless-llama-stack-spec.yaml   | 1998 +---------
 src/llama_stack/core/datatypes.py             |   36 +-
 src/llama_stack/core/distribution.py          |   12 -
 src/llama_stack/core/resolver.py              |   22 -
 src/llama_stack/core/routers/__init__.py      |   11 -
 src/llama_stack/core/routers/datasets.py      |   73 -
 src/llama_stack/core/routers/eval_scoring.py  |  248 --
 .../core/routing_tables/benchmarks.py         |   66 -
 src/llama_stack/core/routing_tables/common.py |   30 -
 .../core/routing_tables/datasets.py           |   91 -
 .../core/routing_tables/scoring_functions.py  |   66 -
 .../core/server/fastapi_router_registry.py    |   12 -
 src/llama_stack/core/stack.py                 |   27 -
 .../distributions/ci-tests/build.yaml         |    9 -
 .../distributions/ci-tests/config.yaml        |   35 -
 .../ci-tests/run-with-postgres-store.yaml     |   35 -
 .../distributions/dell/config.yaml            |   35 -
 src/llama_stack/distributions/dell/dell.py    |   10 -
 .../distributions/dell/run-with-safety.yaml   |   35 -
 .../distributions/nvidia/config.yaml          |   22 -
 .../distributions/nvidia/nvidia.py            |   35 +-
 .../distributions/nvidia/run-with-safety.yaml |   28 -
 src/llama_stack/distributions/oci/config.yaml |   35 -
 src/llama_stack/distributions/oci/oci.py      |   10 -
 .../distributions/open-benchmark/config.yaml  |  101 -
 .../open-benchmark/open_benchmark.py          |   93 +-
 .../distributions/postgres-demo/config.yaml   |    3 -
 .../distributions/starter-gpu/build.yaml      |    7 -
 .../distributions/starter-gpu/config.yaml     |   35 -
 .../starter-gpu/run-with-postgres-store.yaml  |   35 -
 .../distributions/starter/build.yaml          |    7 -
 .../distributions/starter/config.yaml         |   35 -
 .../starter/run-with-postgres-store.yaml      |   35 -
 .../distributions/starter/starter.py          |   10 -
 src/llama_stack/distributions/template.py     |    7 -
 .../distributions/watsonx/config.yaml         |   35 -
 .../distributions/watsonx/watsonx.py          |   10 -
 .../providers/inline/datasetio/__init__.py    |    5 -
 .../inline/datasetio/localfs/__init__.py      |   20 -
 .../inline/datasetio/localfs/config.py        |   23 -
 .../inline/datasetio/localfs/datasetio.py     |  110 -
 .../providers/inline/eval/__init__.py         |    5 -
 .../providers/inline/eval/builtin/__init__.py |   28 -
 .../providers/inline/eval/builtin/config.py   |   23 -
 .../providers/inline/eval/builtin/eval.py     |  222 --
 .../providers/inline/scoring/__init__.py      |    5 -
 .../inline/scoring/basic/__init__.py          |   25 -
 .../providers/inline/scoring/basic/config.py  |   14 -
 .../providers/inline/scoring/basic/scoring.py |  115 -
 .../scoring/basic/scoring_fn/__init__.py      |    5 -
 .../basic/scoring_fn/docvqa_scoring_fn.py     |  239 --
 .../basic/scoring_fn/equality_scoring_fn.py   |   40 -
 .../basic/scoring_fn/fn_defs/__init__.py      |    5 -
 .../basic/scoring_fn/fn_defs/docvqa.py        |   21 -
 .../basic/scoring_fn/fn_defs/equality.py      |   21 -
 .../basic/scoring_fn/fn_defs/ifeval.py        |   23 -
 .../fn_defs/regex_parser_math_response.py     |   27 -
 .../regex_parser_multiple_choice_answer.py    |   71 -
 .../basic/scoring_fn/fn_defs/subset_of.py     |   21 -
 .../basic/scoring_fn/ifeval_scoring_fn.py     |   79 -
 .../regex_parser_math_response_scoring_fn.py  |   65 -
 .../scoring_fn/regex_parser_scoring_fn.py     |   57 -
 .../basic/scoring_fn/subset_of_scoring_fn.py  |   37 -
 .../inline/scoring/basic/utils/__init__.py    |    5 -
 .../scoring/basic/utils/ifeval_utils.py       | 3319 -----------------
 .../inline/scoring/basic/utils/math_utils.py  |  330 --
 .../inline/scoring/braintrust/__init__.py     |   27 -
 .../inline/scoring/braintrust/braintrust.py   |  212 --
 .../inline/scoring/braintrust/config.py       |   21 -
 .../scoring/braintrust/scoring_fn/__init__.py |    5 -
 .../braintrust/scoring_fn/fn_defs/__init__.py |    5 -
 .../scoring_fn/fn_defs/answer_correctness.py  |   24 -
 .../scoring_fn/fn_defs/answer_relevancy.py    |   24 -
 .../scoring_fn/fn_defs/answer_similarity.py   |   24 -
 .../fn_defs/context_entity_recall.py          |   24 -
 .../scoring_fn/fn_defs/context_precision.py   |   24 -
 .../scoring_fn/fn_defs/context_recall.py      |   24 -
 .../scoring_fn/fn_defs/context_relevancy.py   |   23 -
 .../scoring_fn/fn_defs/factuality.py          |   24 -
 .../scoring_fn/fn_defs/faithfulness.py        |   24 -
 .../inline/scoring/llm_as_judge/__init__.py   |   21 -
 .../inline/scoring/llm_as_judge/config.py     |   14 -
 .../inline/scoring/llm_as_judge/scoring.py    |  102 -
 .../llm_as_judge/scoring_fn/__init__.py       |    5 -
 .../scoring_fn/fn_defs/__init__.py            |    5 -
 .../fn_defs/llm_as_judge_405b_simpleqa.py     |   96 -
 .../scoring_fn/fn_defs/llm_as_judge_base.py   |   19 -
 .../scoring_fn/llm_as_judge_scoring_fn.py     |   79 -
 .../providers/registry/datasetio.py           |   49 -
 src/llama_stack/providers/registry/eval.py    |   46 -
 src/llama_stack/providers/registry/scoring.py |   51 -
 .../providers/remote/datasetio/__init__.py    |    5 -
 .../remote/datasetio/huggingface/__init__.py  |   18 -
 .../remote/datasetio/huggingface/config.py    |   23 -
 .../datasetio/huggingface/huggingface.py      |   96 -
 .../remote/datasetio/nvidia/README.md         |   74 -
 .../remote/datasetio/nvidia/__init__.py       |   23 -
 .../remote/datasetio/nvidia/config.py         |   61 -
 .../remote/datasetio/nvidia/datasetio.py      |  113 -
 .../providers/remote/eval/__init__.py         |    5 -
 .../providers/remote/eval/nvidia/README.md    |  142 -
 .../providers/remote/eval/nvidia/__init__.py  |   31 -
 .../providers/remote/eval/nvidia/config.py    |   29 -
 .../providers/remote/eval/nvidia/eval.py      |  179 -
 .../utils/common/data_schema_validator.py     |   99 -
 .../providers/utils/datasetio/__init__.py     |    5 -
 .../providers/utils/datasetio/url_utils.py    |   47 -
 .../providers/utils/scoring/__init__.py       |    5 -
 .../utils/scoring/aggregation_utils.py        |   74 -
 .../utils/scoring/base_scoring_fn.py          |  113 -
 .../utils/scoring/basic_scoring_utils.py      |   26 -
 src/llama_stack_api/__init__.py               |  152 -
 src/llama_stack_api/benchmarks/__init__.py    |   43 -
 src/llama_stack_api/benchmarks/api.py         |   39 -
 .../benchmarks/fastapi_routes.py              |  109 -
 src/llama_stack_api/benchmarks/models.py      |  109 -
 src/llama_stack_api/datasetio/__init__.py     |   36 -
 src/llama_stack_api/datasetio/api.py          |   42 -
 .../datasetio/fastapi_routes.py               |   95 -
 src/llama_stack_api/datasetio/models.py       |   55 -
 src/llama_stack_api/datasets/__init__.py      |   61 -
 src/llama_stack_api/datasets/api.py           |   35 -
 .../datasets/fastapi_routes.py                |  104 -
 src/llama_stack_api/datasets/models.py        |  150 -
 src/llama_stack_api/datatypes.py              |   31 -
 src/llama_stack_api/eval/__init__.py          |   55 -
 src/llama_stack_api/eval/api.py               |   51 -
 src/llama_stack_api/eval/compat.py            |  300 --
 src/llama_stack_api/eval/fastapi_routes.py    |  126 -
 src/llama_stack_api/eval/models.py            |  141 -
 src/llama_stack_api/pyproject.toml            |    7 -
 src/llama_stack_api/resource.py               |    3 -
 src/llama_stack_api/scoring/__init__.py       |   66 -
 src/llama_stack_api/scoring/api.py            |   35 -
 src/llama_stack_api/scoring/fastapi_routes.py |   67 -
 src/llama_stack_api/scoring/models.py         |   81 -
 .../scoring_functions/__init__.py             |   50 -
 src/llama_stack_api/scoring_functions/api.py  |   39 -
 .../scoring_functions/fastapi_routes.py       |  108 -
 .../scoring_functions/models.py               |  214 --
 tests/backward_compat/test_eval_compat.py     |  533 ---
 .../llama-stack-provider-lmeval/config.yaml   |    8 -
 tests/integration/datasets/__init__.py        |    5 -
 tests/integration/datasets/test_dataset.csv   |    6 -
 tests/integration/datasets/test_datasets.py   |   95 -
 .../integration/datasets/test_rag_dataset.csv |    6 -
 tests/integration/eval/__init__.py            |    5 -
 tests/integration/eval/constants.py           |   20 -
 ...4bed06bcbaa03d13b228f61e2b36e23093469.json |   58 -
 ...d4626e70673c405ae1cd72b8dd0617104263e.json |   57 -
 ...ab45674040195c15013c9ea43bc6331e1a831.json |   57 -
 ...2fc61593f964148d6c05df5c4a387a5389e6b.json |   57 -
 ...fe465b66d8436754b30ff4da28c7c03c094a4.json |   58 -
 ...1453f12cd31ce2e294d20868c0c498b7d9136.json |   57 -
 ...4e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json |   58 -
 ...61fe8e95d98692c189e7308724338f918678d.json |   58 -
 ...c99669c7e19b3d551090eb6bec83b33de2a18.json |   57 -
 ...44a231426700a772b8dc64abf05d8b126a736.json |   57 -
 ...280591f73cf26c00b7308dde7d19a1ced016c.json |   57 -
 ...36b1e7f4a9d4f7f8ba8bd844d50265067f417.json |   58 -
 ...0acc212cf2ac3bdd4192aabb5f98359236572.json |   57 -
 ...9e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json |   58 -
 ...ebe742743cd3042654efefa86714e357b86f6.json |   58 -
 ...169a1235798c2b3ad9abbb29acf1f1b2952fa.json |   58 -
 tests/integration/eval/test_eval.py           |  104 -
 tests/integration/scoring/__init__.py         |    5 -
 tests/integration/scoring/test_scoring.py     |  251 --
 .../routers/test_routing_tables.py            |  244 --
 tests/unit/providers/nvidia/test_eval.py      |  234 --
 tests/unit/test_eval_models.py                |   85 -
 174 files changed, 664 insertions(+), 21292 deletions(-)
 delete mode 100644 src/llama_stack/core/routers/datasets.py
 delete mode 100644 src/llama_stack/core/routers/eval_scoring.py
 delete mode 100644 src/llama_stack/core/routing_tables/benchmarks.py
 delete mode 100644 src/llama_stack/core/routing_tables/datasets.py
 delete mode 100644 src/llama_stack/core/routing_tables/scoring_functions.py
 delete mode 100644 src/llama_stack/providers/inline/datasetio/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/datasetio/localfs/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/datasetio/localfs/config.py
 delete mode 100644 src/llama_stack/providers/inline/datasetio/localfs/datasetio.py
 delete mode 100644 src/llama_stack/providers/inline/eval/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/eval/builtin/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/eval/builtin/config.py
 delete mode 100644 src/llama_stack/providers/inline/eval/builtin/eval.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/config.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/utils/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/braintrust.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/config.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/config.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py
 delete mode 100644 src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/registry/datasetio.py
 delete mode 100644 src/llama_stack/providers/registry/eval.py
 delete mode 100644 src/llama_stack/providers/registry/scoring.py
 delete mode 100644 src/llama_stack/providers/remote/datasetio/__init__.py
 delete mode 100644 src/llama_stack/providers/remote/datasetio/huggingface/__init__.py
 delete mode 100644 src/llama_stack/providers/remote/datasetio/huggingface/config.py
 delete mode 100644 src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
 delete mode 100644 src/llama_stack/providers/remote/datasetio/nvidia/README.md
 delete mode 100644 src/llama_stack/providers/remote/datasetio/nvidia/__init__.py
 delete mode 100644 src/llama_stack/providers/remote/datasetio/nvidia/config.py
 delete mode 100644 src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
 delete mode 100644 src/llama_stack/providers/remote/eval/__init__.py
 delete mode 100644 src/llama_stack/providers/remote/eval/nvidia/README.md
 delete mode 100644 src/llama_stack/providers/remote/eval/nvidia/__init__.py
 delete mode 100644 src/llama_stack/providers/remote/eval/nvidia/config.py
 delete mode 100644 src/llama_stack/providers/remote/eval/nvidia/eval.py
 delete mode 100644 src/llama_stack/providers/utils/common/data_schema_validator.py
 delete mode 100644 src/llama_stack/providers/utils/datasetio/__init__.py
 delete mode 100644 src/llama_stack/providers/utils/datasetio/url_utils.py
 delete mode 100644 src/llama_stack/providers/utils/scoring/__init__.py
 delete mode 100644 src/llama_stack/providers/utils/scoring/aggregation_utils.py
 delete mode 100644 src/llama_stack/providers/utils/scoring/base_scoring_fn.py
 delete mode 100644 src/llama_stack/providers/utils/scoring/basic_scoring_utils.py
 delete mode 100644 src/llama_stack_api/benchmarks/__init__.py
 delete mode 100644 src/llama_stack_api/benchmarks/api.py
 delete mode 100644 src/llama_stack_api/benchmarks/fastapi_routes.py
 delete mode 100644 src/llama_stack_api/benchmarks/models.py
 delete mode 100644 src/llama_stack_api/datasetio/__init__.py
 delete mode 100644 src/llama_stack_api/datasetio/api.py
 delete mode 100644 src/llama_stack_api/datasetio/fastapi_routes.py
 delete mode 100644 src/llama_stack_api/datasetio/models.py
 delete mode 100644 src/llama_stack_api/datasets/__init__.py
 delete mode 100644 src/llama_stack_api/datasets/api.py
 delete mode 100644 src/llama_stack_api/datasets/fastapi_routes.py
 delete mode 100644 src/llama_stack_api/datasets/models.py
 delete mode 100644 src/llama_stack_api/eval/__init__.py
 delete mode 100644 src/llama_stack_api/eval/api.py
 delete mode 100644 src/llama_stack_api/eval/compat.py
 delete mode 100644 src/llama_stack_api/eval/fastapi_routes.py
 delete mode 100644 src/llama_stack_api/eval/models.py
 delete mode 100644 src/llama_stack_api/scoring/__init__.py
 delete mode 100644 src/llama_stack_api/scoring/api.py
 delete mode 100644 src/llama_stack_api/scoring/fastapi_routes.py
 delete mode 100644 src/llama_stack_api/scoring/models.py
 delete mode 100644 src/llama_stack_api/scoring_functions/__init__.py
 delete mode 100644 src/llama_stack_api/scoring_functions/api.py
 delete mode 100644 src/llama_stack_api/scoring_functions/fastapi_routes.py
 delete mode 100644 src/llama_stack_api/scoring_functions/models.py
 delete mode 100644 tests/backward_compat/test_eval_compat.py
 delete mode 100644 tests/integration/datasets/__init__.py
 delete mode 100644 tests/integration/datasets/test_dataset.csv
 delete mode 100644 tests/integration/datasets/test_datasets.py
 delete mode 100644 tests/integration/datasets/test_rag_dataset.csv
 delete mode 100644 tests/integration/eval/__init__.py
 delete mode 100644 tests/integration/eval/constants.py
 delete mode 100644 tests/integration/eval/recordings/0a2ea52bcc4c7e04d0b4b844ad94bed06bcbaa03d13b228f61e2b36e23093469.json
 delete mode 100644 tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json
 delete mode 100644 tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json
 delete mode 100644 tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json
 delete mode 100644 tests/integration/eval/recordings/58177cd1c0d7d8de9e20515c3e8fe465b66d8436754b30ff4da28c7c03c094a4.json
 delete mode 100644 tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json
 delete mode 100644 tests/integration/eval/recordings/752abf1ef7f71bbe7028eae85814e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json
 delete mode 100644 tests/integration/eval/recordings/94e3d8dba56da92e1014a6ee81b61fe8e95d98692c189e7308724338f918678d.json
 delete mode 100644 tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json
 delete mode 100644 tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json
 delete mode 100644 tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json
 delete mode 100644 tests/integration/eval/recordings/bf6b37511a044df8ad1c6113d3936b1e7f4a9d4f7f8ba8bd844d50265067f417.json
 delete mode 100644 tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json
 delete mode 100644 tests/integration/eval/recordings/c4ef767672c890e77ceaa15b6239e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json
 delete mode 100644 tests/integration/eval/recordings/cbf92825593fd79fe76e0ad0193ebe742743cd3042654efefa86714e357b86f6.json
 delete mode 100644 tests/integration/eval/recordings/dcf3c9afad420e66c3cc7434a48169a1235798c2b3ad9abbb29acf1f1b2952fa.json
 delete mode 100644 tests/integration/eval/test_eval.py
 delete mode 100644 tests/integration/scoring/__init__.py
 delete mode 100644 tests/integration/scoring/test_scoring.py
 delete mode 100644 tests/unit/providers/nvidia/test_eval.py
 delete mode 100644 tests/unit/test_eval_models.py

diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 84d18c9718..b0fd085fc3 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -1759,190 +1759,6 @@ paths:
             schema:
               $ref: '#/components/schemas/RunShieldRequest'
         required: true
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: A ListScoringFunctionsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring Functions
-      summary: List all scoring functions.
-      description: List all scoring functions.
-      operationId: list_scoring_functions_v1_scoring_functions_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The scoring function was successfully registered.
-      tags:
-      - Scoring Functions
-      summary: Register a scoring function.
-      description: Register a scoring function.
-      operationId: register_scoring_function_v1_scoring_functions_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
-        required: true
-      deprecated: true
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: A ScoringFn.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoringFn'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Scoring Functions
-      summary: Get a scoring function by its ID.
-      description: Get a scoring function by its ID.
-      operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The scoring function was successfully unregistered.
-      tags:
-      - Scoring Functions
-      summary: Unregister a scoring function.
-      description: Unregister a scoring function.
-      operationId: unregister_scoring_function_v1_scoring_functions__scoring_fn_id__delete
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to unregister.
-      deprecated: true
-  /v1/scoring/score:
-    post:
-      responses:
-        '200':
-          description: A ScoreResponse object containing rows and aggregated results.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a list of rows.
-      description: Score a list of rows.
-      operationId: score_v1_scoring_score_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
-  /v1/scoring/score-batch:
-    post:
-      responses:
-        '200':
-          description: A ScoreBatchResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreBatchResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a batch of rows.
-      description: Score a batch of rows.
-      operationId: score_batch_v1_scoring_score_batch_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
-        required: true
   /v1/shields:
     get:
       responses:
@@ -2999,116 +2815,15 @@ paths:
       description: Get the version of the service.
       operationId: version_v1_version_get
       x-public: true
-  /v1beta/datasetio/append-rows/{dataset_id}:
+  /v1alpha/inference/rerank:
     post:
-      responses:
-        '204':
-          description: Rows were successfully appended.
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Append rows to a dataset.
-      description: Append rows to a dataset.
-      operationId: append_rows_v1beta_datasetio_append_rows__dataset_id__post
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to append the rows to.
-          title: Dataset Id
-        description: The ID of the dataset to append the rows to.
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/AppendRowsRequest'
-  /v1beta/datasetio/iterrows/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: A PaginatedResponse containing the rows.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/PaginatedResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Get a paginated list of rows from a dataset.
-      description: |-
-        Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-      operationId: iterrows_v1beta_datasetio_iterrows__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-        description: The ID of the dataset to get the rows from.
-      - name: start_index
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          title: Start Index
-        description: Index into dataset for the first row to get. Get all rows if None.
-      - name: limit
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          title: Limit
-        description: The number of rows to get.
-  /v1beta/datasets:
-    get:
       responses:
         '200':
-          description: A list of dataset objects.
+          description: RerankResponse with indices sorted by relevance score (descending).
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/ListDatasetsResponse'
+                $ref: '#/components/schemas/RerankResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3122,18 +2837,25 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: List all datasets.
-      description: List all datasets.
-      operationId: list_datasets_v1beta_datasets_get
-    post:
+      - Inference
+      summary: Rerank documents based on relevance to a query.
+      description: Rerank a list of documents based on their relevance to a query.
+      operationId: rerank_v1alpha_inference_rerank_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RerankRequest'
+        required: true
+  /v1alpha/admin/providers:
+    get:
       responses:
         '200':
-          description: The registered dataset object.
+          description: A list of provider information objects.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ListProvidersResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3147,26 +2869,19 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: Register a new dataset.
-      description: Register a new dataset.
-      operationId: register_dataset_v1beta_datasets_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterDatasetRequest'
-        required: true
-      deprecated: true
-  /v1beta/datasets/{dataset_id}:
+      - Admin
+      summary: List all available providers
+      description: List all available providers with their configuration and health status.
+      operationId: list_providers_v1alpha_admin_providers_get
+  /v1alpha/admin/providers/{provider_id}:
     get:
       responses:
         '200':
-          description: The dataset object.
+          description: The provider information object.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ProviderInfo'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -3179,469 +2894,31 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
           description: Default Response
+        '404':
+          description: Provider not found.
       tags:
-      - Datasets
-      summary: Get a dataset by its ID.
-      description: Get a dataset by its ID.
-      operationId: get_dataset_v1beta_datasets__dataset_id__get
+      - Admin
+      summary: Get provider details
+      description: Get detailed information about a specific provider.
+      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
       parameters:
-      - name: dataset_id
+      - name: provider_id
         in: path
         required: true
         schema:
           type: string
-          description: The ID of the dataset to get.
-          title: Dataset Id
-        description: The ID of the dataset to get.
-    delete:
+          description: The ID of the provider to inspect.
+          title: Provider Id
+        description: The ID of the provider to inspect.
+  /v1alpha/admin/inspect/routes:
+    get:
       responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The dataset was successfully unregistered.
-      tags:
-      - Datasets
-      summary: Unregister a dataset by its ID.
-      description: Unregister a dataset by its ID.
-      operationId: unregister_dataset_v1beta_datasets__dataset_id__delete
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-        description: The ID of the dataset to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks:
-    get:
-      responses:
-        '200':
-          description: A ListBenchmarksResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Benchmarks
-      summary: List all benchmarks.
-      description: List all benchmarks.
-      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The benchmark was successfully registered.
-      tags:
-      - Benchmarks
-      summary: Register a benchmark.
-      description: Register a benchmark.
-      operationId: register_benchmark_v1alpha_eval_benchmarks_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: A Benchmark.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Benchmark'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Benchmarks
-      summary: Get a benchmark by its ID.
-      description: Get a benchmark by its ID.
-      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-        description: The ID of the benchmark to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The benchmark was successfully unregistered.
-      tags:
-      - Benchmarks
-      summary: Unregister a benchmark.
-      description: Unregister a benchmark.
-      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-        description: The ID of the benchmark to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
-    post:
-      responses:
-        '200':
-          description: EvaluateResponse object containing generations and scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Evaluate Rows
-      description: Evaluate a list of rows on a benchmark.
-      operationId: evaluate_rows_v1alpha_eval_benchmarks__benchmark_id__evaluations_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateRowsBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs:
-    post:
-      responses:
-        '200':
-          description: The job that was created to run the evaluation.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Run Eval
-      description: Run an evaluation on a benchmark.
-      operationId: run_eval_v1alpha_eval_benchmarks__benchmark_id__jobs_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunEvalBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluation job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Status
-      description: Get the status of a job.
-      operationId: job_status_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: Successful Response
-      tags:
-      - Eval
-      summary: Job Cancel
-      description: Cancel a job.
-      operationId: job_cancel_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Result
-      description: Get the result of a job.
-      operationId: job_result_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__result_get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/inference/rerank:
-    post:
-      responses:
-        '200':
-          description: RerankResponse with indices sorted by relevance score (descending).
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RerankResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Inference
-      summary: Rerank documents based on relevance to a query.
-      description: Rerank a list of documents based on their relevance to a query.
-      operationId: rerank_v1alpha_inference_rerank_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RerankRequest'
-        required: true
-  /v1alpha/admin/providers:
-    get:
-      responses:
-        '200':
-          description: A list of provider information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListProvidersResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Admin
-      summary: List all available providers
-      description: List all available providers with their configuration and health status.
-      operationId: list_providers_v1alpha_admin_providers_get
-  /v1alpha/admin/providers/{provider_id}:
-    get:
-      responses:
-        '200':
-          description: The provider information object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ProviderInfo'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '404':
-          description: Provider not found.
-      tags:
-      - Admin
-      summary: Get provider details
-      description: Get detailed information about a specific provider.
-      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
-      parameters:
-      - name: provider_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the provider to inspect.
-          title: Provider Id
-        description: The ID of the provider to inspect.
-  /v1alpha/admin/inspect/routes:
-    get:
-      responses:
-        '200':
-          description: A list of route information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListRoutesResponse'
+        '200':
+          description: A list of route information objects.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListRoutesResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -8933,408 +8210,87 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           const: array
-          title: Type
           default: array
-      title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: basic
           title: Type
-          default: basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
+          type: string
+      title: ArrayType
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           const: boolean
-          title: Type
           default: boolean
+          title: Type
+          type: string
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           const: chat_completion_input
-          title: Type
           default: chat_completion_input
+          title: Type
+          type: string
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           const: completion_input
-          title: Type
           default: completion_input
+          title: Type
+          type: string
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           const: json
-          title: Type
           default: json
-      title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: llm_as_judge
           title: Type
-          default: llm_as_judge
-        judge_model:
           type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
+      title: JsonType
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           const: number
-          title: Type
           default: number
+          title: Type
+          type: string
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
+      description: Parameter type for object values.
       properties:
         type:
-          type: string
           const: object
-          title: Type
           default: object
-      title: ObjectType
-      description: Parameter type for object values.
-    RegexParserScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: regex_parser
           title: Type
-          default: regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
-      properties:
-        identifier:
           type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: scoring_function
-          title: Type
-          default: scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
-      required:
-      - identifier
-      - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
+      title: ObjectType
     StringType:
+      description: Parameter type for string values.
       properties:
         type:
-          type: string
           const: string
-          title: Type
           default: string
+          title: Type
+          type: string
       title: StringType
-      description: Parameter type for string values.
     UnionType:
+      description: Parameter type for union values.
       properties:
         type:
-          type: string
           const: union
-          title: Type
           default: union
-      title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-          type: array
-          title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
+          title: Type
           type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
+      title: UnionType
     Shield:
       properties:
         identifier:
@@ -10365,264 +9321,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: dataset
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          default: dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          const: rows
-          title: Type
-          description: The type of data source.
-          default: rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          const: uri
-          title: Type
-          description: The type of data source.
-          default: uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: benchmark
-          title: Type
-          description: The resource type, always benchmark.
-          default: benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
           const: greedy
-          title: Type
-          description: Must be 'greedy' to identify this sampling strategy.
           default: greedy
-      title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          const: model
+          description: Must be 'greedy' to identify this sampling strategy.
           title: Type
-          default: model
-        model:
           type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
+      title: GreedySamplingStrategy
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -10631,200 +9371,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
           const: system
-          title: Role
-          description: Must be 'system' to identify this as a system message.
           default: system
+          description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
-          - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
+          - items:
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
           const: top_k
-          title: Type
-          description: Must be 'top_k' to identify this sampling strategy.
           default: top_k
+          description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
           const: top_p
-          title: Type
-          description: Must be 'top_p' to identify this sampling strategy.
           default: top_p
+          description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -10964,85 +9631,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -11069,90 +9657,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -11622,13 +10126,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -11767,32 +10264,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -11835,16 +10306,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -12680,15 +11141,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -13073,50 +11525,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -13149,39 +11557,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          const: dialog
-          default: dialog
-          title: Type
-          type: string
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -13280,16 +11655,10 @@ components:
       - agents
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -14086,6 +12455,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          const: dialog
+          default: dialog
+          title: Type
+          type: string
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -14316,90 +12704,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index 6d68f9fbf1..242bed4a4a 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -137,126 +137,6 @@ paths:
           title: Model Id
         description: The ID of the model to unregister.
       deprecated: true
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: A ListScoringFunctionsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring Functions
-      summary: List all scoring functions.
-      description: List all scoring functions.
-      operationId: list_scoring_functions_v1_scoring_functions_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The scoring function was successfully registered.
-      tags:
-      - Scoring Functions
-      summary: Register a scoring function.
-      description: Register a scoring function.
-      operationId: register_scoring_function_v1_scoring_functions_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
-        required: true
-      deprecated: true
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: A ScoringFn.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoringFn'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Scoring Functions
-      summary: Get a scoring function by its ID.
-      description: Get a scoring function by its ID.
-      operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The scoring function was successfully unregistered.
-      tags:
-      - Scoring Functions
-      summary: Unregister a scoring function.
-      description: Unregister a scoring function.
-      operationId: unregister_scoring_function_v1_scoring_functions__scoring_fn_id__delete
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to unregister.
-      deprecated: true
   /v1/shields:
     get:
       responses:
@@ -381,250 +261,6 @@ paths:
           title: Identifier
         description: The identifier of the shield to unregister.
       deprecated: true
-  /v1beta/datasets:
-    get:
-      responses:
-        '200':
-          description: A list of dataset objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListDatasetsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Datasets
-      summary: List all datasets.
-      description: List all datasets.
-      operationId: list_datasets_v1beta_datasets_get
-    post:
-      responses:
-        '200':
-          description: The registered dataset object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Dataset'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Datasets
-      summary: Register a new dataset.
-      description: Register a new dataset.
-      operationId: register_dataset_v1beta_datasets_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterDatasetRequest'
-        required: true
-      deprecated: true
-  /v1beta/datasets/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: The dataset object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Dataset'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Datasets
-      summary: Get a dataset by its ID.
-      description: Get a dataset by its ID.
-      operationId: get_dataset_v1beta_datasets__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get.
-          title: Dataset Id
-        description: The ID of the dataset to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The dataset was successfully unregistered.
-      tags:
-      - Datasets
-      summary: Unregister a dataset by its ID.
-      description: Unregister a dataset by its ID.
-      operationId: unregister_dataset_v1beta_datasets__dataset_id__delete
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-        description: The ID of the dataset to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks:
-    get:
-      responses:
-        '200':
-          description: A ListBenchmarksResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Benchmarks
-      summary: List all benchmarks.
-      description: List all benchmarks.
-      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The benchmark was successfully registered.
-      tags:
-      - Benchmarks
-      summary: Register a benchmark.
-      description: Register a benchmark.
-      operationId: register_benchmark_v1alpha_eval_benchmarks_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: A Benchmark.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Benchmark'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Benchmarks
-      summary: Get a benchmark by its ID.
-      description: Get a benchmark by its ID.
-      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-        description: The ID of the benchmark to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The benchmark was successfully unregistered.
-      tags:
-      - Benchmarks
-      summary: Unregister a benchmark.
-      description: Unregister a benchmark.
-      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-        description: The ID of the benchmark to unregister.
-      deprecated: true
 components:
   schemas:
     Error:
@@ -5634,408 +5270,87 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           const: array
-          title: Type
           default: array
-      title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: basic
           title: Type
-          default: basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
+          type: string
+      title: ArrayType
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           const: boolean
-          title: Type
           default: boolean
+          title: Type
+          type: string
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           const: chat_completion_input
-          title: Type
           default: chat_completion_input
+          title: Type
+          type: string
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           const: completion_input
-          title: Type
           default: completion_input
+          title: Type
+          type: string
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           const: json
-          title: Type
           default: json
-      title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: llm_as_judge
           title: Type
-          default: llm_as_judge
-        judge_model:
           type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
+      title: JsonType
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           const: number
-          title: Type
           default: number
+          title: Type
+          type: string
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
+      description: Parameter type for object values.
       properties:
         type:
-          type: string
           const: object
-          title: Type
           default: object
-      title: ObjectType
-      description: Parameter type for object values.
-    RegexParserScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: regex_parser
           title: Type
-          default: regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
-      properties:
-        identifier:
           type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: scoring_function
-          title: Type
-          default: scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
-      required:
-      - identifier
-      - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
+      title: ObjectType
     StringType:
+      description: Parameter type for string values.
       properties:
         type:
-          type: string
           const: string
-          title: Type
           default: string
+          title: Type
+          type: string
       title: StringType
-      description: Parameter type for string values.
     UnionType:
+      description: Parameter type for union values.
       properties:
         type:
-          type: string
           const: union
-          title: Type
           default: union
-      title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-          type: array
-          title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
+          title: Type
           type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
+      title: UnionType
     Shield:
       properties:
         identifier:
@@ -7066,264 +6381,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: dataset
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          default: dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          const: rows
-          title: Type
-          description: The type of data source.
-          default: rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          const: uri
-          title: Type
-          description: The type of data source.
-          default: uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: benchmark
-          title: Type
-          description: The resource type, always benchmark.
-          default: benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
           const: greedy
-          title: Type
-          description: Must be 'greedy' to identify this sampling strategy.
           default: greedy
-      title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          const: model
+          description: Must be 'greedy' to identify this sampling strategy.
           title: Type
-          default: model
-        model:
           type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
+      title: GreedySamplingStrategy
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -7332,200 +6431,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
           const: system
-          title: Role
-          description: Must be 'system' to identify this as a system message.
           default: system
+          description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
-          - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
+          - items:
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
           const: top_k
-          title: Type
-          description: Must be 'top_k' to identify this sampling strategy.
           default: top_k
+          description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
           const: top_p
-          title: Type
-          description: Must be 'top_p' to identify this sampling strategy.
           default: top_p
+          description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -7665,85 +6691,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -7770,90 +6717,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -8325,13 +7188,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -8470,32 +7326,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -8538,16 +7368,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -9383,15 +8203,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -9776,50 +8587,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -9852,39 +8619,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          const: dialog
-          default: dialog
-          title: Type
-          type: string
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -9983,16 +8717,10 @@ components:
       - agents
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -10789,6 +9517,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          const: dialog
+          default: dialog
+          title: Type
+          type: string
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -11019,90 +9766,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/static/experimental-llama-stack-spec.yaml b/docs/static/experimental-llama-stack-spec.yaml
index b3daad8698..6f5f182246 100644
--- a/docs/static/experimental-llama-stack-spec.yaml
+++ b/docs/static/experimental-llama-stack-spec.yaml
@@ -13,423 +13,6 @@ info:
 servers:
 - url: http://any-hosted-llama-stack.com
 paths:
-  /v1beta/datasetio/append-rows/{dataset_id}:
-    post:
-      responses:
-        '204':
-          description: Rows were successfully appended.
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Append rows to a dataset.
-      description: Append rows to a dataset.
-      operationId: append_rows_v1beta_datasetio_append_rows__dataset_id__post
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to append the rows to.
-          title: Dataset Id
-        description: The ID of the dataset to append the rows to.
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/AppendRowsRequest'
-  /v1beta/datasetio/iterrows/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: A PaginatedResponse containing the rows.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/PaginatedResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Get a paginated list of rows from a dataset.
-      description: |-
-        Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-      operationId: iterrows_v1beta_datasetio_iterrows__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-        description: The ID of the dataset to get the rows from.
-      - name: start_index
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          title: Start Index
-        description: Index into dataset for the first row to get. Get all rows if None.
-      - name: limit
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          title: Limit
-        description: The number of rows to get.
-  /v1beta/datasets:
-    get:
-      responses:
-        '200':
-          description: A list of dataset objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListDatasetsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Datasets
-      summary: List all datasets.
-      description: List all datasets.
-      operationId: list_datasets_v1beta_datasets_get
-  /v1beta/datasets/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: The dataset object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Dataset'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Datasets
-      summary: Get a dataset by its ID.
-      description: Get a dataset by its ID.
-      operationId: get_dataset_v1beta_datasets__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get.
-          title: Dataset Id
-        description: The ID of the dataset to get.
-  /v1alpha/eval/benchmarks:
-    get:
-      responses:
-        '200':
-          description: A ListBenchmarksResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Benchmarks
-      summary: List all benchmarks.
-      description: List all benchmarks.
-      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: A Benchmark.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Benchmark'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Benchmarks
-      summary: Get a benchmark by its ID.
-      description: Get a benchmark by its ID.
-      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-        description: The ID of the benchmark to get.
-  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
-    post:
-      responses:
-        '200':
-          description: EvaluateResponse object containing generations and scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Evaluate Rows
-      description: Evaluate a list of rows on a benchmark.
-      operationId: evaluate_rows_v1alpha_eval_benchmarks__benchmark_id__evaluations_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateRowsBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs:
-    post:
-      responses:
-        '200':
-          description: The job that was created to run the evaluation.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Run Eval
-      description: Run an evaluation on a benchmark.
-      operationId: run_eval_v1alpha_eval_benchmarks__benchmark_id__jobs_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunEvalBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluation job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Status
-      description: Get the status of a job.
-      operationId: job_status_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: Successful Response
-      tags:
-      - Eval
-      summary: Job Cancel
-      description: Cancel a job.
-      operationId: job_cancel_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Result
-      description: Get the result of a job.
-      operationId: job_result_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__result_get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
   /v1alpha/inference/rerank:
     post:
       responses:
@@ -5815,408 +5398,87 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           const: array
-          title: Type
           default: array
-      title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: basic
           title: Type
-          default: basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
+          type: string
+      title: ArrayType
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           const: boolean
-          title: Type
           default: boolean
+          title: Type
+          type: string
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           const: chat_completion_input
-          title: Type
           default: chat_completion_input
+          title: Type
+          type: string
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           const: completion_input
-          title: Type
           default: completion_input
+          title: Type
+          type: string
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           const: json
-          title: Type
           default: json
-      title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: llm_as_judge
           title: Type
-          default: llm_as_judge
-        judge_model:
           type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
+      title: JsonType
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           const: number
-          title: Type
           default: number
+          title: Type
+          type: string
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
+      description: Parameter type for object values.
       properties:
         type:
-          type: string
           const: object
-          title: Type
           default: object
-      title: ObjectType
-      description: Parameter type for object values.
-    RegexParserScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: regex_parser
           title: Type
-          default: regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
           type: string
-          const: scoring_function
-          title: Type
-          default: scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
-      required:
-      - identifier
-      - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
+      title: ObjectType
     StringType:
+      description: Parameter type for string values.
       properties:
         type:
-          type: string
           const: string
-          title: Type
           default: string
+          title: Type
+          type: string
       title: StringType
-      description: Parameter type for string values.
     UnionType:
+      description: Parameter type for union values.
       properties:
         type:
-          type: string
           const: union
-          title: Type
           default: union
-      title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-          type: array
-          title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
+          title: Type
           type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
+      title: UnionType
     Shield:
       properties:
         identifier:
@@ -7247,264 +6509,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: dataset
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          default: dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          const: rows
-          title: Type
-          description: The type of data source.
-          default: rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          const: uri
-          title: Type
-          description: The type of data source.
-          default: uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: benchmark
-          title: Type
-          description: The resource type, always benchmark.
-          default: benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
           const: greedy
-          title: Type
-          description: Must be 'greedy' to identify this sampling strategy.
           default: greedy
-      title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          const: model
+          description: Must be 'greedy' to identify this sampling strategy.
           title: Type
-          default: model
-        model:
           type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
+      title: GreedySamplingStrategy
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -7513,200 +6559,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
           const: system
-          title: Role
-          description: Must be 'system' to identify this as a system message.
           default: system
+          description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
-          - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
+          - items:
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
           const: top_k
-          title: Type
-          description: Must be 'top_k' to identify this sampling strategy.
           default: top_k
+          description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
           const: top_p
-          title: Type
-          description: Must be 'top_p' to identify this sampling strategy.
           default: top_p
+          description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -7846,85 +6819,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -7951,90 +6845,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -8202,13 +7012,6 @@ components:
       - reasoning.encrypted_content
       title: ConversationItemInclude
       description: Specify additional output data to include in the model response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -8347,32 +7150,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -8415,16 +7192,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -9231,15 +7998,6 @@ components:
 
         Returns a list of chunks ready for storage in vector databases.
         Each chunk contains the content and metadata.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -9624,50 +8382,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -9700,39 +8414,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          const: dialog
-          default: dialog
-          title: Type
-          type: string
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -9831,16 +8512,10 @@ components:
       - agents
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -10637,6 +9312,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          const: dialog
+          default: dialog
+          title: Type
+          type: string
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -10867,90 +9561,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index a0e5bd36cc..9b03dd1f0d 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -1694,131 +1694,6 @@ paths:
             schema:
               $ref: '#/components/schemas/RunShieldRequest'
         required: true
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: A ListScoringFunctionsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring Functions
-      summary: List all scoring functions.
-      description: List all scoring functions.
-      operationId: list_scoring_functions_v1_scoring_functions_get
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: A ScoringFn.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoringFn'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Scoring Functions
-      summary: Get a scoring function by its ID.
-      description: Get a scoring function by its ID.
-      operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to get.
-  /v1/scoring/score:
-    post:
-      responses:
-        '200':
-          description: A ScoreResponse object containing rows and aggregated results.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a list of rows.
-      description: Score a list of rows.
-      operationId: score_v1_scoring_score_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
-  /v1/scoring/score-batch:
-    post:
-      responses:
-        '200':
-          description: A ScoreBatchResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreBatchResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a batch of rows.
-      description: Score a batch of rows.
-      operationId: score_batch_v1_scoring_score_batch_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
-        required: true
   /v1/shields:
     get:
       responses:
@@ -7818,146 +7693,88 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           const: array
-          title: Type
           default: array
-      title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: basic
           title: Type
-          default: basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
+          type: string
+      title: ArrayType
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           const: boolean
-          title: Type
           default: boolean
+          title: Type
+          type: string
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           const: chat_completion_input
-          title: Type
           default: chat_completion_input
+          title: Type
+          type: string
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           const: completion_input
-          title: Type
           default: completion_input
+          title: Type
+          type: string
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           const: json
-          title: Type
           default: json
-      title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: llm_as_judge
           title: Type
-          default: llm_as_judge
-        judge_model:
           type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
+      title: JsonType
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           const: number
-          title: Type
           default: number
+          title: Type
+          type: string
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
+      description: Parameter type for object values.
       properties:
         type:
-          type: string
           const: object
-          title: Type
           default: object
+          title: Type
+          type: string
       title: ObjectType
-      description: Parameter type for object values.
-    RegexParserScoringFnParams:
+    StringType:
+      description: Parameter type for string values.
       properties:
         type:
+          const: string
+          default: string
+          title: Type
           type: string
-          const: regex_parser
+      title: StringType
+    UnionType:
+      description: Parameter type for union values.
+      properties:
+        type:
+          const: union
+          default: union
           title: Type
-          default: regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
+          type: string
+      title: UnionType
+    Shield:
       properties:
         identifier:
           type: string
@@ -7974,304 +7791,41 @@ components:
           description: ID of the provider that owns this resource
         type:
           type: string
-          const: scoring_function
+          const: shield
           title: Type
-          default: scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
+          default: shield
         params:
           anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
+          - additionalProperties: true
+            type: object
           - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
       required:
       - identifier
       - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
-    StringType:
-      properties:
-        type:
-          type: string
-          const: string
-          title: Type
-          default: string
-      title: StringType
-      description: Parameter type for string values.
-    UnionType:
-      properties:
-        type:
-          type: string
-          const: union
-          title: Type
-          default: union
-      title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
+      title: Shield
+      description: A safety shield resource that can be used to check content.
+    ListShieldsResponse:
       properties:
         data:
           items:
-            $ref: '#/components/schemas/ScoringFn'
+            $ref: '#/components/schemas/Shield'
           type: array
           title: Data
-          description: List of scoring function objects.
+          description: List of shield objects
       required:
       - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
+      title: ListShieldsResponse
+      description: Response containing a list of all shields.
+    ImageContentItem:
+      description: A image content item
       properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
-    Shield:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: shield
-          title: Type
-          default: shield
-        params:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-      required:
-      - identifier
-      - provider_id
-      title: Shield
-      description: A safety shield resource that can be used to check content.
-    ListShieldsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Shield'
-          type: array
-          title: Data
-          description: List of shield objects
-      required:
-      - data
-      title: ListShieldsResponse
-      description: Response containing a list of all shields.
-    ImageContentItem:
-      description: A image content item
-      properties:
-        type:
-          const: image
-          default: image
-          title: Type
-          type: string
-        image:
-          $ref: '#/components/schemas/_URLOrData'
+        type:
+          const: image
+          default: image
+          title: Type
+          type: string
+        image:
+          $ref: '#/components/schemas/_URLOrData'
       required:
       - image
       title: ImageContentItem
@@ -9250,264 +8804,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: dataset
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          default: dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          const: rows
-          title: Type
-          description: The type of data source.
-          default: rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          const: uri
-          title: Type
-          description: The type of data source.
-          default: uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: benchmark
-          title: Type
-          description: The resource type, always benchmark.
-          default: benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
           const: greedy
-          title: Type
-          description: Must be 'greedy' to identify this sampling strategy.
           default: greedy
-      title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          const: model
+          description: Must be 'greedy' to identify this sampling strategy.
           title: Type
-          default: model
-        model:
           type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
+      title: GreedySamplingStrategy
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -9516,200 +8854,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
           const: system
-          title: Role
-          description: Must be 'system' to identify this as a system message.
           default: system
+          description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
-          - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
+          - items:
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
           const: top_k
-          title: Type
-          description: Must be 'top_k' to identify this sampling strategy.
           default: top_k
+          description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
           const: top_p
-          title: Type
-          description: Must be 'top_p' to identify this sampling strategy.
           default: top_p
+          description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -9849,85 +9114,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -9954,90 +9140,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -10484,13 +9586,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -10629,32 +9724,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -10697,16 +9766,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -11542,15 +10601,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -11935,50 +10985,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -12011,39 +11017,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          const: dialog
-          default: dialog
-          title: Type
-          type: string
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -12142,16 +11115,10 @@ components:
       - agents
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -12948,6 +11915,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          const: dialog
+          default: dialog
+          title: Type
+          type: string
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -13178,90 +12164,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 84d18c9718..b0fd085fc3 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -1759,190 +1759,6 @@ paths:
             schema:
               $ref: '#/components/schemas/RunShieldRequest'
         required: true
-  /v1/scoring-functions:
-    get:
-      responses:
-        '200':
-          description: A ListScoringFunctionsResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListScoringFunctionsResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring Functions
-      summary: List all scoring functions.
-      description: List all scoring functions.
-      operationId: list_scoring_functions_v1_scoring_functions_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The scoring function was successfully registered.
-      tags:
-      - Scoring Functions
-      summary: Register a scoring function.
-      description: Register a scoring function.
-      operationId: register_scoring_function_v1_scoring_functions_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterScoringFunctionRequest'
-        required: true
-      deprecated: true
-  /v1/scoring-functions/{scoring_fn_id}:
-    get:
-      responses:
-        '200':
-          description: A ScoringFn.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoringFn'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Scoring Functions
-      summary: Get a scoring function by its ID.
-      description: Get a scoring function by its ID.
-      operationId: get_scoring_function_v1_scoring_functions__scoring_fn_id__get
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The scoring function was successfully unregistered.
-      tags:
-      - Scoring Functions
-      summary: Unregister a scoring function.
-      description: Unregister a scoring function.
-      operationId: unregister_scoring_function_v1_scoring_functions__scoring_fn_id__delete
-      parameters:
-      - name: scoring_fn_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-        description: The ID of the scoring function to unregister.
-      deprecated: true
-  /v1/scoring/score:
-    post:
-      responses:
-        '200':
-          description: A ScoreResponse object containing rows and aggregated results.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a list of rows.
-      description: Score a list of rows.
-      operationId: score_v1_scoring_score_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreRequest'
-        required: true
-  /v1/scoring/score-batch:
-    post:
-      responses:
-        '200':
-          description: A ScoreBatchResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ScoreBatchResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Scoring
-      summary: Score a batch of rows.
-      description: Score a batch of rows.
-      operationId: score_batch_v1_scoring_score_batch_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/ScoreBatchRequest'
-        required: true
   /v1/shields:
     get:
       responses:
@@ -2999,116 +2815,15 @@ paths:
       description: Get the version of the service.
       operationId: version_v1_version_get
       x-public: true
-  /v1beta/datasetio/append-rows/{dataset_id}:
+  /v1alpha/inference/rerank:
     post:
-      responses:
-        '204':
-          description: Rows were successfully appended.
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Append rows to a dataset.
-      description: Append rows to a dataset.
-      operationId: append_rows_v1beta_datasetio_append_rows__dataset_id__post
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to append the rows to.
-          title: Dataset Id
-        description: The ID of the dataset to append the rows to.
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/AppendRowsRequest'
-  /v1beta/datasetio/iterrows/{dataset_id}:
-    get:
-      responses:
-        '200':
-          description: A PaginatedResponse containing the rows.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/PaginatedResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - DatasetIO
-      summary: Get a paginated list of rows from a dataset.
-      description: |-
-        Get a paginated list of rows from a dataset.
-
-        Uses offset-based pagination where:
-        - start_index: The starting index (0-based). If None, starts from beginning.
-        - limit: Number of items to return. If None or -1, returns all items.
-
-        The response includes:
-        - data: List of items for the current page.
-        - has_more: Whether there are more items available after this set.
-      operationId: iterrows_v1beta_datasetio_iterrows__dataset_id__get
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-        description: The ID of the dataset to get the rows from.
-      - name: start_index
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          title: Start Index
-        description: Index into dataset for the first row to get. Get all rows if None.
-      - name: limit
-        in: query
-        required: false
-        schema:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          title: Limit
-        description: The number of rows to get.
-  /v1beta/datasets:
-    get:
       responses:
         '200':
-          description: A list of dataset objects.
+          description: RerankResponse with indices sorted by relevance score (descending).
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/ListDatasetsResponse'
+                $ref: '#/components/schemas/RerankResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3122,18 +2837,25 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: List all datasets.
-      description: List all datasets.
-      operationId: list_datasets_v1beta_datasets_get
-    post:
+      - Inference
+      summary: Rerank documents based on relevance to a query.
+      description: Rerank a list of documents based on their relevance to a query.
+      operationId: rerank_v1alpha_inference_rerank_post
+      requestBody:
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/RerankRequest'
+        required: true
+  /v1alpha/admin/providers:
+    get:
       responses:
         '200':
-          description: The registered dataset object.
+          description: A list of provider information objects.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ListProvidersResponse'
         '400':
           description: Bad Request
           $ref: '#/components/responses/BadRequest400'
@@ -3147,26 +2869,19 @@ paths:
           description: Default Response
           $ref: '#/components/responses/DefaultError'
       tags:
-      - Datasets
-      summary: Register a new dataset.
-      description: Register a new dataset.
-      operationId: register_dataset_v1beta_datasets_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterDatasetRequest'
-        required: true
-      deprecated: true
-  /v1beta/datasets/{dataset_id}:
+      - Admin
+      summary: List all available providers
+      description: List all available providers with their configuration and health status.
+      operationId: list_providers_v1alpha_admin_providers_get
+  /v1alpha/admin/providers/{provider_id}:
     get:
       responses:
         '200':
-          description: The dataset object.
+          description: The provider information object.
           content:
             application/json:
               schema:
-                $ref: '#/components/schemas/Dataset'
+                $ref: '#/components/schemas/ProviderInfo'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -3179,469 +2894,31 @@ paths:
         default:
           $ref: '#/components/responses/DefaultError'
           description: Default Response
+        '404':
+          description: Provider not found.
       tags:
-      - Datasets
-      summary: Get a dataset by its ID.
-      description: Get a dataset by its ID.
-      operationId: get_dataset_v1beta_datasets__dataset_id__get
+      - Admin
+      summary: Get provider details
+      description: Get detailed information about a specific provider.
+      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
       parameters:
-      - name: dataset_id
+      - name: provider_id
         in: path
         required: true
         schema:
           type: string
-          description: The ID of the dataset to get.
-          title: Dataset Id
-        description: The ID of the dataset to get.
-    delete:
+          description: The ID of the provider to inspect.
+          title: Provider Id
+        description: The ID of the provider to inspect.
+  /v1alpha/admin/inspect/routes:
+    get:
       responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The dataset was successfully unregistered.
-      tags:
-      - Datasets
-      summary: Unregister a dataset by its ID.
-      description: Unregister a dataset by its ID.
-      operationId: unregister_dataset_v1beta_datasets__dataset_id__delete
-      parameters:
-      - name: dataset_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-        description: The ID of the dataset to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks:
-    get:
-      responses:
-        '200':
-          description: A ListBenchmarksResponse.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListBenchmarksResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Benchmarks
-      summary: List all benchmarks.
-      description: List all benchmarks.
-      operationId: list_benchmarks_v1alpha_eval_benchmarks_get
-    post:
-      responses:
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-        '204':
-          description: The benchmark was successfully registered.
-      tags:
-      - Benchmarks
-      summary: Register a benchmark.
-      description: Register a benchmark.
-      operationId: register_benchmark_v1alpha_eval_benchmarks_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RegisterBenchmarkRequest'
-        required: true
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}:
-    get:
-      responses:
-        '200':
-          description: A Benchmark.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Benchmark'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Benchmarks
-      summary: Get a benchmark by its ID.
-      description: Get a benchmark by its ID.
-      operationId: get_benchmark_v1alpha_eval_benchmarks__benchmark_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-        description: The ID of the benchmark to get.
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: The benchmark was successfully unregistered.
-      tags:
-      - Benchmarks
-      summary: Unregister a benchmark.
-      description: Unregister a benchmark.
-      operationId: unregister_benchmark_v1alpha_eval_benchmarks__benchmark_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-        description: The ID of the benchmark to unregister.
-      deprecated: true
-  /v1alpha/eval/benchmarks/{benchmark_id}/evaluations:
-    post:
-      responses:
-        '200':
-          description: EvaluateResponse object containing generations and scores.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Evaluate Rows
-      description: Evaluate a list of rows on a benchmark.
-      operationId: evaluate_rows_v1alpha_eval_benchmarks__benchmark_id__evaluations_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/EvaluateRowsBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs:
-    post:
-      responses:
-        '200':
-          description: The job that was created to run the evaluation.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Run Eval
-      description: Run an evaluation on a benchmark.
-      operationId: run_eval_v1alpha_eval_benchmarks__benchmark_id__jobs_post
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the benchmark
-          title: Benchmark Id
-        description: The ID of the benchmark
-      requestBody:
-        required: true
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RunEvalBodyRequest'
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}:
-    get:
-      responses:
-        '200':
-          description: The status of the evaluation job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/Job'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Status
-      description: Get the status of a job.
-      operationId: job_status_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-    delete:
-      responses:
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '204':
-          description: Successful Response
-      tags:
-      - Eval
-      summary: Job Cancel
-      description: Cancel a job.
-      operationId: job_cancel_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__delete
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result:
-    get:
-      responses:
-        '200':
-          description: The result of the job.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/EvaluateResponse'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-      tags:
-      - Eval
-      summary: Job Result
-      description: Get the result of a job.
-      operationId: job_result_v1alpha_eval_benchmarks__benchmark_id__jobs__job_id__result_get
-      parameters:
-      - name: benchmark_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Benchmark Id
-      - name: job_id
-        in: path
-        required: true
-        schema:
-          type: string
-          title: Job Id
-  /v1alpha/inference/rerank:
-    post:
-      responses:
-        '200':
-          description: RerankResponse with indices sorted by relevance score (descending).
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/RerankResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Inference
-      summary: Rerank documents based on relevance to a query.
-      description: Rerank a list of documents based on their relevance to a query.
-      operationId: rerank_v1alpha_inference_rerank_post
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: '#/components/schemas/RerankRequest'
-        required: true
-  /v1alpha/admin/providers:
-    get:
-      responses:
-        '200':
-          description: A list of provider information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListProvidersResponse'
-        '400':
-          description: Bad Request
-          $ref: '#/components/responses/BadRequest400'
-        '429':
-          description: Too Many Requests
-          $ref: '#/components/responses/TooManyRequests429'
-        '500':
-          description: Internal Server Error
-          $ref: '#/components/responses/InternalServerError500'
-        default:
-          description: Default Response
-          $ref: '#/components/responses/DefaultError'
-      tags:
-      - Admin
-      summary: List all available providers
-      description: List all available providers with their configuration and health status.
-      operationId: list_providers_v1alpha_admin_providers_get
-  /v1alpha/admin/providers/{provider_id}:
-    get:
-      responses:
-        '200':
-          description: The provider information object.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ProviderInfo'
-        '400':
-          $ref: '#/components/responses/BadRequest400'
-          description: Bad Request
-        '429':
-          $ref: '#/components/responses/TooManyRequests429'
-          description: Too Many Requests
-        '500':
-          $ref: '#/components/responses/InternalServerError500'
-          description: Internal Server Error
-        default:
-          $ref: '#/components/responses/DefaultError'
-          description: Default Response
-        '404':
-          description: Provider not found.
-      tags:
-      - Admin
-      summary: Get provider details
-      description: Get detailed information about a specific provider.
-      operationId: inspect_provider_v1alpha_admin_providers__provider_id__get
-      parameters:
-      - name: provider_id
-        in: path
-        required: true
-        schema:
-          type: string
-          description: The ID of the provider to inspect.
-          title: Provider Id
-        description: The ID of the provider to inspect.
-  /v1alpha/admin/inspect/routes:
-    get:
-      responses:
-        '200':
-          description: A list of route information objects.
-          content:
-            application/json:
-              schema:
-                $ref: '#/components/schemas/ListRoutesResponse'
+        '200':
+          description: A list of route information objects.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ListRoutesResponse'
         '400':
           $ref: '#/components/responses/BadRequest400'
           description: Bad Request
@@ -8933,408 +8210,87 @@ components:
       - error
       title: ViolationLevel
       description: Severity level of a safety violation.
-    AggregationFunctionType:
-      type: string
-      enum:
-      - average
-      - weighted_average
-      - median
-      - categorical_count
-      - accuracy
-      title: AggregationFunctionType
-      description: Types of aggregation functions for scoring results.
     ArrayType:
+      description: Parameter type for array values.
       properties:
         type:
-          type: string
           const: array
-          title: Type
           default: array
-      title: ArrayType
-      description: Parameter type for array values.
-    BasicScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: basic
           title: Type
-          default: basic
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: BasicScoringFnParams
-      description: Parameters for basic scoring function configuration.
+          type: string
+      title: ArrayType
     BooleanType:
+      description: Parameter type for boolean values.
       properties:
         type:
-          type: string
           const: boolean
-          title: Type
           default: boolean
+          title: Type
+          type: string
       title: BooleanType
-      description: Parameter type for boolean values.
     ChatCompletionInputType:
+      description: Parameter type for chat completion input.
       properties:
         type:
-          type: string
           const: chat_completion_input
-          title: Type
           default: chat_completion_input
+          title: Type
+          type: string
       title: ChatCompletionInputType
-      description: Parameter type for chat completion input.
     CompletionInputType:
+      description: Parameter type for completion input.
       properties:
         type:
-          type: string
           const: completion_input
-          title: Type
           default: completion_input
+          title: Type
+          type: string
       title: CompletionInputType
-      description: Parameter type for completion input.
     JsonType:
+      description: Parameter type for JSON values.
       properties:
         type:
-          type: string
           const: json
-          title: Type
           default: json
-      title: JsonType
-      description: Parameter type for JSON values.
-    LLMAsJudgeScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: llm_as_judge
           title: Type
-          default: llm_as_judge
-        judge_model:
           type: string
-          title: Judge Model
-        prompt_template:
-          anyOf:
-          - type: string
-          - type: 'null'
-        judge_score_regexes:
-          items:
-            type: string
-          type: array
-          title: Judge Score Regexes
-          description: Regexes to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      required:
-      - judge_model
-      title: LLMAsJudgeScoringFnParams
-      description: Parameters for LLM-as-judge scoring function configuration.
+      title: JsonType
     NumberType:
+      description: Parameter type for numeric values.
       properties:
         type:
-          type: string
           const: number
-          title: Type
           default: number
+          title: Type
+          type: string
       title: NumberType
-      description: Parameter type for numeric values.
     ObjectType:
+      description: Parameter type for object values.
       properties:
         type:
-          type: string
           const: object
-          title: Type
           default: object
-      title: ObjectType
-      description: Parameter type for object values.
-    RegexParserScoringFnParams:
-      properties:
-        type:
-          type: string
-          const: regex_parser
           title: Type
-          default: regex_parser
-        parsing_regexes:
-          items:
-            type: string
-          type: array
-          title: Parsing Regexes
-          description: Regex to extract the answer from generated response
-        aggregation_functions:
-          items:
-            $ref: '#/components/schemas/AggregationFunctionType'
-          type: array
-          title: Aggregation Functions
-          description: Aggregation functions to apply to the scores of each row
-      title: RegexParserScoringFnParams
-      description: Parameters for regex parser scoring function configuration.
-    ScoringFn:
-      properties:
-        identifier:
           type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: scoring_function
-          title: Type
-          default: scoring_function
-        description:
-          anyOf:
-          - type: string
-          - type: 'null'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this definition
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the deterministic function
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval
-      required:
-      - identifier
-      - provider_id
-      - return_type
-      title: ScoringFn
-      description: A scoring function resource for evaluating model outputs.
-    ScoringFnParams:
-      discriminator:
-        mapping:
-          basic: '#/components/schemas/BasicScoringFnParams'
-          llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-          regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-        title: LLMAsJudgeScoringFnParams
-      - $ref: '#/components/schemas/RegexParserScoringFnParams'
-        title: RegexParserScoringFnParams
-      - $ref: '#/components/schemas/BasicScoringFnParams'
-        title: BasicScoringFnParams
-      title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-    ScoringFnParamsType:
-      description: Types of scoring function parameter configurations.
-      enum:
-      - llm_as_judge
-      - regex_parser
-      - basic
-      title: ScoringFnParamsType
-      type: string
+      title: ObjectType
     StringType:
+      description: Parameter type for string values.
       properties:
         type:
-          type: string
           const: string
-          title: Type
           default: string
+          title: Type
+          type: string
       title: StringType
-      description: Parameter type for string values.
     UnionType:
+      description: Parameter type for union values.
       properties:
         type:
-          type: string
           const: union
-          title: Type
           default: union
-      title: UnionType
-      description: Parameter type for union values.
-    ListScoringFunctionsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/ScoringFn'
-          type: array
-          title: Data
-          description: List of scoring function objects.
-      required:
-      - data
-      title: ListScoringFunctionsResponse
-      description: Response containing a list of scoring function objects.
-    ScoreRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Input Rows
-          description: The rows to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-      required:
-      - input_rows
-      - scoring_functions
-      title: ScoreRequest
-      description: Request model for scoring a list of rows.
-    ScoreResponse:
-      properties:
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult.
-      required:
-      - results
-      title: ScoreResponse
-      description: The response from scoring.
-    ScoringResult:
-      properties:
-        score_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Score Rows
-          description: The scoring result for each row. Each row is a map of column name to value.
-        aggregated_results:
-          additionalProperties: true
-          type: object
-          title: Aggregated Results
-          description: Map of metric name to aggregated value
-      required:
-      - score_rows
-      - aggregated_results
-      title: ScoringResult
-      description: A scoring result for a single row.
-    ScoreBatchRequest:
-      properties:
-        dataset_id:
+          title: Type
           type: string
-          title: Dataset Id
-          description: The ID of the dataset to score.
-        scoring_functions:
-          additionalProperties:
-            anyOf:
-            - oneOf:
-              - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                title: LLMAsJudgeScoringFnParams
-              - $ref: '#/components/schemas/RegexParserScoringFnParams'
-                title: RegexParserScoringFnParams
-              - $ref: '#/components/schemas/BasicScoringFnParams'
-                title: BasicScoringFnParams
-              discriminator:
-                propertyName: type
-                mapping:
-                  basic: '#/components/schemas/BasicScoringFnParams'
-                  llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                  regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-              title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-            - type: 'null'
-            title: AdditionalpropertiesUnion
-          type: object
-          title: Scoring Functions
-          description: The scoring functions to use for the scoring.
-        save_results_dataset:
-          type: boolean
-          title: Save Results Dataset
-          description: Whether to save the results to a dataset.
-          default: false
-      required:
-      - dataset_id
-      - scoring_functions
-      title: ScoreBatchRequest
-      description: Request model for scoring a batch of rows from a dataset.
-    ScoreBatchResponse:
-      properties:
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: (Optional) The identifier of the dataset that was scored
-        results:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Results
-          description: A map of scoring function name to ScoringResult
-      required:
-      - results
-      title: ScoreBatchResponse
-      description: Response from batch scoring operations on datasets.
+      title: UnionType
     Shield:
       properties:
         identifier:
@@ -10365,264 +9321,48 @@ components:
       - version
       title: VersionInfo
       description: Version information for the service.
-    AppendRowsRequest:
-      properties:
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: The rows to append to the dataset.
-      required:
-      - rows
-      title: AppendRowsRequest
-      description: Request body for appending rows to a dataset.
     PaginatedResponse:
+      description: A generic paginated response that follows a simple format.
       properties:
         data:
           items:
             additionalProperties: true
             type: object
-          type: array
           title: Data
+          type: array
         has_more:
-          type: boolean
           title: Has More
+          type: boolean
         url:
           anyOf:
           - type: string
           - type: 'null'
+          nullable: true
       required:
       - data
       - has_more
       title: PaginatedResponse
-      description: A generic paginated response that follows a simple format.
-    Dataset:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: dataset
-          title: Type
-          description: Type of resource, always 'dataset' for datasets
-          default: dataset
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: Purpose of the dataset indicating its intended use
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: Data source configuration for the dataset
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Any additional metadata for this dataset
-      required:
-      - identifier
-      - provider_id
-      - purpose
-      - source
-      title: Dataset
-      description: Dataset resource for storing and accessing training or evaluation data.
-    RowsDataSource:
-      properties:
-        type:
-          type: string
-          const: rows
-          title: Type
-          description: The type of data source.
-          default: rows
-        rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Rows
-          description: 'The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]'
-      required:
-      - rows
-      title: RowsDataSource
-      description: A dataset stored in rows.
-    URIDataSource:
-      properties:
-        type:
-          type: string
-          const: uri
-          title: Type
-          description: The type of data source.
-          default: uri
-        uri:
-          type: string
-          title: Uri
-          description: The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"
-      required:
-      - uri
-      title: URIDataSource
-      description: A dataset that can be obtained from a URI.
-    ListDatasetsResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Dataset'
-          type: array
-          title: Data
-          description: List of datasets
-      required:
-      - data
-      title: ListDatasetsResponse
-      description: Response from listing datasets.
-    Benchmark:
-      properties:
-        identifier:
-          type: string
-          title: Identifier
-          description: Unique identifier for this resource in llama stack
-        provider_resource_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: Unique identifier for this resource in the provider
-        provider_id:
-          type: string
-          title: Provider Id
-          description: ID of the provider that owns this resource
-        type:
-          type: string
-          const: benchmark
-          title: Type
-          description: The resource type, always benchmark.
-          default: benchmark
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: Identifier of the dataset to use for the benchmark evaluation.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: List of scoring function identifiers to apply during evaluation.
-        metadata:
-          additionalProperties: true
-          type: object
-          title: Metadata
-          description: Metadata for this evaluation task.
-      required:
-      - identifier
-      - provider_id
-      - dataset_id
-      - scoring_functions
-      title: Benchmark
-      description: A benchmark resource for evaluating model performance.
-    ListBenchmarksResponse:
-      properties:
-        data:
-          items:
-            $ref: '#/components/schemas/Benchmark'
-          type: array
-          title: Data
-          description: List of benchmark objects.
-      required:
-      - data
-      title: ListBenchmarksResponse
-      description: Response containing a list of benchmark objects.
-    BenchmarkConfig:
-      properties:
-        eval_candidate:
-          $ref: '#/components/schemas/ModelCandidate'
-          description: The candidate to evaluate
-        scoring_params:
-          additionalProperties:
-            oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          type: object
-          title: Scoring Params
-          description: Map between scoring function id and parameters for each scoring function you want to run
-        num_examples:
-          anyOf:
-          - type: integer
-            minimum: 1.0
-          - type: 'null'
-          description: Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated
-      required:
-      - eval_candidate
-      title: BenchmarkConfig
-      description: A benchmark configuration for evaluation.
     GreedySamplingStrategy:
+      description: Greedy sampling strategy that selects the highest probability token at each step.
       properties:
         type:
-          type: string
           const: greedy
-          title: Type
-          description: Must be 'greedy' to identify this sampling strategy.
           default: greedy
-      title: GreedySamplingStrategy
-      description: Greedy sampling strategy that selects the highest probability token at each step.
-    ModelCandidate:
-      properties:
-        type:
-          type: string
-          const: model
+          description: Must be 'greedy' to identify this sampling strategy.
           title: Type
-          default: model
-        model:
           type: string
-          minLength: 1
-          title: Model
-          description: The model ID to evaluate
-        sampling_params:
-          $ref: '#/components/schemas/SamplingParams'
-          description: The sampling parameters for the model
-        system_message:
-          anyOf:
-          - $ref: '#/components/schemas/SystemMessage'
-            title: SystemMessage
-          - type: 'null'
-          description: The system message providing instructions or context to the model
-          title: SystemMessage
-      required:
-      - model
-      - sampling_params
-      title: ModelCandidate
-      description: A model candidate for evaluation.
+      title: GreedySamplingStrategy
     SamplingParams:
+      description: Sampling parameters for text generation.
       properties:
         strategy:
+          description: The sampling strategy to use.
+          discriminator:
+            mapping:
+              greedy: '#/components/schemas/GreedySamplingStrategy'
+              top_k: '#/components/schemas/TopKSamplingStrategy'
+              top_p: '#/components/schemas/TopPSamplingStrategy'
+            propertyName: type
           oneOf:
           - $ref: '#/components/schemas/GreedySamplingStrategy'
             title: GreedySamplingStrategy
@@ -10631,200 +9371,127 @@ components:
           - $ref: '#/components/schemas/TopKSamplingStrategy'
             title: TopKSamplingStrategy
           title: GreedySamplingStrategy | TopPSamplingStrategy | TopKSamplingStrategy
-          description: The sampling strategy to use.
-          discriminator:
-            propertyName: type
-            mapping:
-              greedy: '#/components/schemas/GreedySamplingStrategy'
-              top_k: '#/components/schemas/TopKSamplingStrategy'
-              top_p: '#/components/schemas/TopPSamplingStrategy'
         max_tokens:
           anyOf:
-          - type: integer
-            minimum: 1.0
+          - minimum: 1
+            type: integer
           - type: 'null'
           description: The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length.
+          nullable: true
         repetition_penalty:
           anyOf:
-          - type: number
-            maximum: 2.0
+          - maximum: 2.0
             minimum: -2.0
+            type: number
           - type: 'null'
-          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
           default: 1.0
+          description: Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far.
         stop:
           anyOf:
           - items:
               type: string
-            type: array
             maxItems: 4
+            type: array
           - type: 'null'
           description: Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.
+          nullable: true
       title: SamplingParams
-      description: Sampling parameters for text generation.
     SystemMessage:
+      description: A system message providing instructions or context to the model.
       properties:
         role:
-          type: string
           const: system
-          title: Role
-          description: Must be 'system' to identify this as a system message.
           default: system
+          description: Must be 'system' to identify this as a system message.
+          title: Role
+          type: string
         content:
           anyOf:
           - type: string
-          - oneOf:
-            - $ref: '#/components/schemas/ImageContentItem-Input'
-              title: ImageContentItem-Input
-            - $ref: '#/components/schemas/TextContentItem'
-              title: TextContentItem
-            discriminator:
-              propertyName: type
+          - discriminator:
               mapping:
-                image: '#/components/schemas/ImageContentItem-Input'
+                image: '#/components/schemas/ImageContentItem'
                 text: '#/components/schemas/TextContentItem'
-            title: ImageContentItem-Input | TextContentItem
-          - items:
-              oneOf:
-              - $ref: '#/components/schemas/ImageContentItem-Input'
-                title: ImageContentItem-Input
-              - $ref: '#/components/schemas/TextContentItem'
-                title: TextContentItem
+              propertyName: type
+            oneOf:
+            - $ref: '#/components/schemas/ImageContentItem'
+              title: ImageContentItem
+            - $ref: '#/components/schemas/TextContentItem'
+              title: TextContentItem
+            title: ImageContentItem | TextContentItem
+          - items:
               discriminator:
-                propertyName: type
                 mapping:
-                  image: '#/components/schemas/ImageContentItem-Input'
+                  image: '#/components/schemas/ImageContentItem'
                   text: '#/components/schemas/TextContentItem'
-              title: ImageContentItem-Input | TextContentItem
+                propertyName: type
+              oneOf:
+              - $ref: '#/components/schemas/ImageContentItem'
+                title: ImageContentItem
+              - $ref: '#/components/schemas/TextContentItem'
+                title: TextContentItem
+              title: ImageContentItem | TextContentItem
             type: array
-            title: list[ImageContentItem-Input | TextContentItem]
-          title: string | list[ImageContentItem-Input | TextContentItem]
+            title: list[ImageContentItem | TextContentItem]
           description: The content of the 'system prompt'. If multiple system messages are provided, they are concatenated. The underlying Llama Stack code may also add other system messages.
+          title: string | list[ImageContentItem | TextContentItem]
       required:
       - content
       title: SystemMessage
-      description: A system message providing instructions or context to the model.
     TopKSamplingStrategy:
+      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
       properties:
         type:
-          type: string
           const: top_k
-          title: Type
-          description: Must be 'top_k' to identify this sampling strategy.
           default: top_k
+          description: Must be 'top_k' to identify this sampling strategy.
+          title: Type
+          type: string
         top_k:
-          type: integer
-          minimum: 1.0
-          title: Top K
           description: Number of top tokens to consider for sampling. Must be at least 1.
+          minimum: 1
+          title: Top K
+          type: integer
       required:
       - top_k
       title: TopKSamplingStrategy
-      description: Top-k sampling strategy that restricts sampling to the k most likely tokens.
     TopPSamplingStrategy:
+      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
       properties:
         type:
-          type: string
           const: top_p
-          title: Type
-          description: Must be 'top_p' to identify this sampling strategy.
           default: top_p
+          description: Must be 'top_p' to identify this sampling strategy.
+          title: Type
+          type: string
         temperature:
-          type: number
+          description: Controls randomness in sampling. Higher values increase randomness.
           maximum: 2.0
           title: Temperature
-          description: Controls randomness in sampling. Higher values increase randomness.
+          type: number
           minimum: 0.0
         top_p:
-          type: number
+          default: 0.95
+          description: Cumulative probability threshold for nucleus sampling.
           maximum: 1.0
           minimum: 0.0
           title: Top P
-          description: Cumulative probability threshold for nucleus sampling.
-          default: 0.95
+          type: number
       required:
       - temperature
       title: TopPSamplingStrategy
-      description: Top-p (nucleus) sampling strategy that samples from the smallest set of tokens with cumulative probability >= p.
-    EvaluateRowsRequest:
-      description: Request model for evaluating a list of rows on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        input_rows:
-          description: The rows to evaluate
-          items:
-            additionalProperties: true
-            type: object
-          minItems: 1
-          title: Input Rows
-          type: array
-        scoring_functions:
-          description: The scoring functions to use for the evaluation
-          items:
-            type: string
-          minItems: 1
-          title: Scoring Functions
-          type: array
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsRequest
-    EvaluateResponse:
-      properties:
-        generations:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          title: Generations
-          description: The generations from the evaluation
-        scores:
-          additionalProperties:
-            $ref: '#/components/schemas/ScoringResult'
-          type: object
-          title: Scores
-          description: The scores from the evaluation. Each key in the dict is a scoring function name
-      required:
-      - generations
-      - scores
-      title: EvaluateResponse
-      description: The response from an evaluation.
-    RunEvalRequest:
-      description: Request model for running an evaluation on a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to run the evaluation on
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_id
-      - benchmark_config
-      title: RunEvalRequest
     Job:
+      description: A job execution instance with status tracking.
       properties:
         job_id:
-          type: string
           title: Job Id
+          type: string
         status:
           $ref: '#/components/schemas/JobStatus'
       required:
       - job_id
       - status
       title: Job
-      description: A job execution instance with status tracking.
     RerankRequest:
       properties:
         model:
@@ -10964,85 +9631,6 @@ components:
       - $ref: '#/components/schemas/CompletionInputType'
         title: CompletionInputType
       title: StringType | ... (9 variants)
-    RegisterScoringFunctionRequest:
-      properties:
-        scoring_fn_id:
-          type: string
-          title: Scoring Fn Id
-          description: The ID of the scoring function to register.
-        description:
-          type: string
-          title: Description
-          description: The description of the scoring function.
-        return_type:
-          oneOf:
-          - $ref: '#/components/schemas/StringType'
-            title: StringType
-          - $ref: '#/components/schemas/NumberType'
-            title: NumberType
-          - $ref: '#/components/schemas/BooleanType'
-            title: BooleanType
-          - $ref: '#/components/schemas/ArrayType'
-            title: ArrayType
-          - $ref: '#/components/schemas/ObjectType'
-            title: ObjectType
-          - $ref: '#/components/schemas/JsonType'
-            title: JsonType
-          - $ref: '#/components/schemas/UnionType'
-            title: UnionType
-          - $ref: '#/components/schemas/ChatCompletionInputType'
-            title: ChatCompletionInputType
-          - $ref: '#/components/schemas/CompletionInputType'
-            title: CompletionInputType
-          title: StringType | ... (9 variants)
-          description: The return type of the scoring function.
-          discriminator:
-            propertyName: type
-            mapping:
-              array: '#/components/schemas/ArrayType'
-              boolean: '#/components/schemas/BooleanType'
-              chat_completion_input: '#/components/schemas/ChatCompletionInputType'
-              completion_input: '#/components/schemas/CompletionInputType'
-              json: '#/components/schemas/JsonType'
-              number: '#/components/schemas/NumberType'
-              object: '#/components/schemas/ObjectType'
-              string: '#/components/schemas/StringType'
-              union: '#/components/schemas/UnionType'
-        provider_scoring_fn_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider scoring function to use for the scoring function.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the scoring function.
-        params:
-          anyOf:
-          - oneOf:
-            - $ref: '#/components/schemas/LLMAsJudgeScoringFnParams'
-              title: LLMAsJudgeScoringFnParams
-            - $ref: '#/components/schemas/RegexParserScoringFnParams'
-              title: RegexParserScoringFnParams
-            - $ref: '#/components/schemas/BasicScoringFnParams'
-              title: BasicScoringFnParams
-            discriminator:
-              propertyName: type
-              mapping:
-                basic: '#/components/schemas/BasicScoringFnParams'
-                llm_as_judge: '#/components/schemas/LLMAsJudgeScoringFnParams'
-                regex_parser: '#/components/schemas/RegexParserScoringFnParams'
-            title: LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams
-          - type: 'null'
-          title: Params
-          description: The parameters for the scoring function for benchmark eval, these can be overridden for app eval.
-      required:
-      - scoring_fn_id
-      - description
-      - return_type
-      title: RegisterScoringFunctionRequest
-      description: Request model for registering a scoring function.
     RegisterShieldRequest:
       properties:
         shield_id:
@@ -11069,90 +9657,6 @@ components:
       - shield_id
       title: RegisterShieldRequest
       description: Request model for registering a shield.
-    DataSource:
-      discriminator:
-        mapping:
-          rows: '#/components/schemas/RowsDataSource'
-          uri: '#/components/schemas/URIDataSource'
-        propertyName: type
-      oneOf:
-      - $ref: '#/components/schemas/URIDataSource'
-        title: URIDataSource
-      - $ref: '#/components/schemas/RowsDataSource'
-        title: RowsDataSource
-      title: URIDataSource | RowsDataSource
-    RegisterDatasetRequest:
-      properties:
-        purpose:
-          $ref: '#/components/schemas/DatasetPurpose'
-          description: The purpose of the dataset.
-        source:
-          oneOf:
-          - $ref: '#/components/schemas/URIDataSource'
-            title: URIDataSource
-          - $ref: '#/components/schemas/RowsDataSource'
-            title: RowsDataSource
-          title: URIDataSource | RowsDataSource
-          description: The data source of the dataset.
-          discriminator:
-            propertyName: type
-            mapping:
-              rows: '#/components/schemas/RowsDataSource'
-              uri: '#/components/schemas/URIDataSource'
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata for the dataset.
-        dataset_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the dataset. If not provided, an ID will be generated.
-      required:
-      - purpose
-      - source
-      title: RegisterDatasetRequest
-      description: Request model for registering a dataset.
-    RegisterBenchmarkRequest:
-      properties:
-        benchmark_id:
-          type: string
-          title: Benchmark Id
-          description: The ID of the benchmark to register.
-        dataset_id:
-          type: string
-          title: Dataset Id
-          description: The ID of the dataset to use for the benchmark.
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          title: Scoring Functions
-          description: The scoring functions to use for the benchmark.
-        provider_benchmark_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider benchmark to use for the benchmark.
-        provider_id:
-          anyOf:
-          - type: string
-          - type: 'null'
-          description: The ID of the provider to use for the benchmark.
-        metadata:
-          anyOf:
-          - additionalProperties: true
-            type: object
-          - type: 'null'
-          description: The metadata to use for the benchmark.
-      required:
-      - benchmark_id
-      - dataset_id
-      - scoring_functions
-      title: RegisterBenchmarkRequest
-      description: Request model for registering a benchmark.
     AllowedToolsFilter:
       properties:
         tool_names:
@@ -11622,13 +10126,6 @@ components:
       - model
       title: CreateResponseRequest
       description: Request model for creating a response.
-    DatasetPurpose:
-      type: string
-      enum:
-      - eval/question-answer
-      - eval/messages-answer
-      title: DatasetPurpose
-      description: Purpose of the dataset. Each purpose has a required input data schema.
     EmbeddedChunk-Input:
       properties:
         content:
@@ -11767,32 +10264,6 @@ components:
           - type: 'null'
       additionalProperties: true
       title: Errors
-    EvaluateRowsBodyRequest:
-      properties:
-        input_rows:
-          items:
-            additionalProperties: true
-            type: object
-          type: array
-          minItems: 1
-          title: Input Rows
-          description: The rows to evaluate
-        scoring_functions:
-          items:
-            type: string
-          type: array
-          minItems: 1
-          title: Scoring Functions
-          description: The scoring functions to use for the evaluation
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - input_rows
-      - scoring_functions
-      - benchmark_config
-      title: EvaluateRowsBodyRequest
-      description: Request body model for evaluating rows (without path parameter).
     HealthStatus:
       type: string
       enum:
@@ -11835,16 +10306,6 @@ components:
       required:
       - cached_tokens
       title: InputTokensDetails
-    JobStatus:
-      type: string
-      enum:
-      - completed
-      - in_progress
-      - failed
-      - scheduled
-      - cancelled
-      title: JobStatus
-      description: Status of a job execution.
     ListConnectorsResponse:
       properties:
         data:
@@ -12680,15 +11141,6 @@ components:
       - disabled
       title: ResponseTruncation
       description: Controls how the service truncates input when it exceeds the model context window.
-    RunEvalBodyRequest:
-      properties:
-        benchmark_config:
-          $ref: '#/components/schemas/BenchmarkConfig'
-          description: The configuration for the benchmark
-      required:
-      - benchmark_config
-      title: RunEvalBodyRequest
-      description: Request body model for running an evaluation (without path parameter).
     SearchRankingOptions:
       properties:
         ranker:
@@ -13073,50 +11525,6 @@ components:
       - $ref: '#/components/schemas/OpenAIResponseContentPartReasoningText'
         title: OpenAIResponseContentPartReasoningText
       title: OpenAIResponseContentPartOutputText | OpenAIResponseContentPartRefusal | OpenAIResponseContentPartReasoningText
-    ListBenchmarksRequest:
-      description: Request model for listing benchmarks.
-      properties: {}
-      title: ListBenchmarksRequest
-    GetBenchmarkRequest:
-      description: Request model for getting a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to get.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: GetBenchmarkRequest
-    UnregisterBenchmarkRequest:
-      description: Request model for unregistering a benchmark.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark to unregister.
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: UnregisterBenchmarkRequest
-    GetDatasetRequest:
-      description: Request model for getting a dataset by ID.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: GetDatasetRequest
-    UnregisterDatasetRequest:
-      description: Request model for unregistering a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to unregister.
-          title: Dataset Id
-          type: string
-      required:
-      - dataset_id
-      title: UnregisterDatasetRequest
     ListModelsResponse:
       description: Response containing a list of model objects.
       properties:
@@ -13149,39 +11557,6 @@ components:
       required:
       - model_id
       title: UnregisterModelRequest
-    DialogType:
-      description: Parameter type for dialog data with semantic output labels.
-      properties:
-        type:
-          const: dialog
-          default: dialog
-          title: Type
-          type: string
-      title: DialogType
-    ListScoringFunctionsRequest:
-      description: Request model for listing scoring functions.
-      properties: {}
-      title: ListScoringFunctionsRequest
-    GetScoringFunctionRequest:
-      description: Request model for getting a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to get.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: GetScoringFunctionRequest
-    UnregisterScoringFunctionRequest:
-      description: Request model for unregistering a scoring function.
-      properties:
-        scoring_fn_id:
-          description: The ID of the scoring function to unregister.
-          title: Scoring Fn Id
-          type: string
-      required:
-      - scoring_fn_id
-      title: UnregisterScoringFunctionRequest
     GetShieldRequest:
       description: Request model for getting a shield by identifier.
       properties:
@@ -13280,16 +11655,10 @@ components:
       - agents
       - batches
       - vector_io
-      - datasetio
-      - scoring
-      - eval
       - tool_runtime
       - models
       - shields
       - vector_stores
-      - datasets
-      - scoring_functions
-      - benchmarks
       - tool_groups
       - files
       - file_processors
@@ -14086,6 +12455,25 @@ components:
       required:
       - batch_id
       title: CancelBatchRequest
+    JobStatus:
+      description: Status of a job execution.
+      enum:
+      - completed
+      - in_progress
+      - failed
+      - scheduled
+      - cancelled
+      title: JobStatus
+      type: string
+    DialogType:
+      description: Parameter type for dialog data with semantic output labels.
+      properties:
+        type:
+          const: dialog
+          default: dialog
+          title: Type
+          type: string
+      title: DialogType
     ConnectorInput:
       description: Input for creating a connector
       properties:
@@ -14316,90 +12704,6 @@ components:
       - conversation_id
       - item_id
       title: DeleteItemRequest
-    IterRowsRequest:
-      description: Request model for iterating over rows in a dataset.
-      properties:
-        dataset_id:
-          description: The ID of the dataset to get the rows from.
-          title: Dataset Id
-          type: string
-        start_index:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: Index into dataset for the first row to get. Get all rows if None.
-          nullable: true
-        limit:
-          anyOf:
-          - type: integer
-          - type: 'null'
-          description: The number of rows to get.
-          nullable: true
-      required:
-      - dataset_id
-      title: IterRowsRequest
-    BenchmarkIdRequest:
-      description: Request model containing benchmark_id path parameter.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark
-          minLength: 1
-          title: Benchmark Id
-          type: string
-      required:
-      - benchmark_id
-      title: BenchmarkIdRequest
-    JobStatusRequest:
-      description: Request model for getting the status of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the status of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobStatusRequest
-    JobCancelRequest:
-      description: Request model for canceling a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to cancel
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobCancelRequest
-    JobResultRequest:
-      description: Request model for getting the result of a job.
-      properties:
-        benchmark_id:
-          description: The ID of the benchmark associated with the job
-          minLength: 1
-          title: Benchmark Id
-          type: string
-        job_id:
-          description: The ID of the job to get the result of
-          minLength: 1
-          title: Job Id
-          type: string
-      required:
-      - benchmark_id
-      - job_id
-      title: JobResultRequest
     ProcessFileRequest:
       description: |-
         Request model for file processing operation.
diff --git a/src/llama_stack/core/datatypes.py b/src/llama_stack/core/datatypes.py
index a67743fa41..85c574a84f 100644
--- a/src/llama_stack/core/datatypes.py
+++ b/src/llama_stack/core/datatypes.py
@@ -21,22 +21,13 @@
 from llama_stack.log import LoggingConfig
 from llama_stack_api import (
     Api,
-    Benchmark,
-    BenchmarkInput,
     ConnectorInput,
-    Dataset,
-    DatasetInput,
-    DatasetIO,
-    Eval,
     Inference,
     Model,
     ModelInput,
     ProviderSpec,
     Resource,
     Safety,
-    Scoring,
-    ScoringFn,
-    ScoringFnInput,
     Shield,
     ShieldInput,
     ToolGroup,
@@ -89,36 +80,18 @@ class VectorStoreWithOwner(VectorStore, ResourceWithOwner):
     pass
 
 
-class DatasetWithOwner(Dataset, ResourceWithOwner):
-    pass
-
-
-class ScoringFnWithOwner(ScoringFn, ResourceWithOwner):
-    pass
-
-
-class BenchmarkWithOwner(Benchmark, ResourceWithOwner):
-    pass
-
-
 class ToolGroupWithOwner(ToolGroup, ResourceWithOwner):
     pass
 
 
-RoutableObject = Model | Shield | VectorStore | Dataset | ScoringFn | Benchmark | ToolGroup
+RoutableObject = Model | Shield | VectorStore | ToolGroup
 
 RoutableObjectWithProvider = Annotated[
-    ModelWithOwner
-    | ShieldWithOwner
-    | VectorStoreWithOwner
-    | DatasetWithOwner
-    | ScoringFnWithOwner
-    | BenchmarkWithOwner
-    | ToolGroupWithOwner,
+    ModelWithOwner | ShieldWithOwner | VectorStoreWithOwner | ToolGroupWithOwner,
     Field(discriminator="type"),
 ]
 
-RoutedProtocol = Inference | Safety | VectorIO | DatasetIO | Scoring | Eval | ToolRuntime
+RoutedProtocol = Inference | Safety | VectorIO | ToolRuntime
 
 
 # Example: /inference, /safety
@@ -697,9 +670,6 @@ class RegisteredResources(BaseModel):
     models: list[ModelInput] = Field(default_factory=list)
     shields: list[ShieldInput] = Field(default_factory=list)
     vector_stores: list[VectorStoreInput] = Field(default_factory=list)
-    datasets: list[DatasetInput] = Field(default_factory=list)
-    scoring_fns: list[ScoringFnInput] = Field(default_factory=list)
-    benchmarks: list[BenchmarkInput] = Field(default_factory=list)
     tool_groups: list[ToolGroupInput] = Field(default_factory=list, deprecated=True)
 
     @model_validator(mode="after")
diff --git a/src/llama_stack/core/distribution.py b/src/llama_stack/core/distribution.py
index 12f7eeea4a..3dd93ef9b8 100644
--- a/src/llama_stack/core/distribution.py
+++ b/src/llama_stack/core/distribution.py
@@ -47,18 +47,6 @@ def builtin_automatically_routed_apis() -> list[AutoRoutedApiInfo]:
             routing_table_api=Api.shields,
             router_api=Api.safety,
         ),
-        AutoRoutedApiInfo(
-            routing_table_api=Api.datasets,
-            router_api=Api.datasetio,
-        ),
-        AutoRoutedApiInfo(
-            routing_table_api=Api.scoring_functions,
-            router_api=Api.scoring,
-        ),
-        AutoRoutedApiInfo(
-            routing_table_api=Api.benchmarks,
-            router_api=Api.eval,
-        ),
         AutoRoutedApiInfo(
             routing_table_api=Api.tool_groups,
             router_api=Api.tool_runtime,
diff --git a/src/llama_stack/core/resolver.py b/src/llama_stack/core/resolver.py
index e5bbbdf875..e4c5817e8a 100644
--- a/src/llama_stack/core/resolver.py
+++ b/src/llama_stack/core/resolver.py
@@ -28,14 +28,8 @@
     Agents,
     Api,
     Batches,
-    Benchmarks,
-    BenchmarksProtocolPrivate,
     Connectors,
     Conversations,
-    DatasetIO,
-    Datasets,
-    DatasetsProtocolPrivate,
-    Eval,
     ExternalApiSpec,
     FileProcessors,
     Files,
@@ -49,9 +43,6 @@
     RemoteProviderConfig,
     RemoteProviderSpec,
     Safety,
-    Scoring,
-    ScoringFunctions,
-    ScoringFunctionsProtocolPrivate,
     Shields,
     ShieldsProtocolPrivate,
     ToolGroups,
@@ -92,12 +83,6 @@ def api_protocol_map(external_apis: dict[Api, ExternalApiSpec] | None = None) ->
         Api.models: Models,
         Api.safety: Safety,
         Api.shields: Shields,
-        Api.datasetio: DatasetIO,
-        Api.datasets: Datasets,
-        Api.scoring: Scoring,
-        Api.scoring_functions: ScoringFunctions,
-        Api.eval: Eval,
-        Api.benchmarks: Benchmarks,
         Api.tool_groups: ToolGroups,
         Api.tool_runtime: ToolRuntime,
         Api.files: Files,
@@ -133,13 +118,6 @@ def additional_protocols_map() -> dict[Api, Any]:
         Api.inference: (ModelsProtocolPrivate, Models, Api.models),
         Api.tool_groups: (ToolGroupsProtocolPrivate, ToolGroups, Api.tool_groups),
         Api.safety: (ShieldsProtocolPrivate, Shields, Api.shields),
-        Api.datasetio: (DatasetsProtocolPrivate, Datasets, Api.datasets),
-        Api.scoring: (
-            ScoringFunctionsProtocolPrivate,
-            ScoringFunctions,
-            Api.scoring_functions,
-        ),
-        Api.eval: (BenchmarksProtocolPrivate, Benchmarks, Api.benchmarks),
     }
 
 
diff --git a/src/llama_stack/core/routers/__init__.py b/src/llama_stack/core/routers/__init__.py
index 8cef9e3514..5fe983603b 100644
--- a/src/llama_stack/core/routers/__init__.py
+++ b/src/llama_stack/core/routers/__init__.py
@@ -23,10 +23,7 @@ async def get_routing_table_impl(
     dist_registry: DistributionRegistry,
     policy: list[AccessRule],
 ) -> Any:
-    from ..routing_tables.benchmarks import BenchmarksRoutingTable
-    from ..routing_tables.datasets import DatasetsRoutingTable
     from ..routing_tables.models import ModelsRoutingTable
-    from ..routing_tables.scoring_functions import ScoringFunctionsRoutingTable
     from ..routing_tables.shields import ShieldsRoutingTable
     from ..routing_tables.toolgroups import ToolGroupsRoutingTable
     from ..routing_tables.vector_stores import VectorStoresRoutingTable
@@ -34,9 +31,6 @@ async def get_routing_table_impl(
     api_to_tables = {
         "models": ModelsRoutingTable,
         "shields": ShieldsRoutingTable,
-        "datasets": DatasetsRoutingTable,
-        "scoring_functions": ScoringFunctionsRoutingTable,
-        "benchmarks": BenchmarksRoutingTable,
         "tool_groups": ToolGroupsRoutingTable,
         "vector_stores": VectorStoresRoutingTable,
     }
@@ -53,8 +47,6 @@ async def get_routing_table_impl(
 async def get_auto_router_impl(
     api: Api, routing_table: RoutingTable, deps: dict[str, Any], run_config: StackConfig, policy: list[AccessRule]
 ) -> Any:
-    from .datasets import DatasetIORouter
-    from .eval_scoring import EvalRouter, ScoringRouter
     from .inference import InferenceRouter
     from .safety import SafetyRouter
     from .tool_runtime import ToolRuntimeRouter
@@ -64,9 +56,6 @@ async def get_auto_router_impl(
         "vector_io": VectorIORouter,
         "inference": InferenceRouter,
         "safety": SafetyRouter,
-        "datasetio": DatasetIORouter,
-        "scoring": ScoringRouter,
-        "eval": EvalRouter,
         "tool_runtime": ToolRuntimeRouter,
     }
     if api.value not in api_to_routers:
diff --git a/src/llama_stack/core/routers/datasets.py b/src/llama_stack/core/routers/datasets.py
deleted file mode 100644
index 2ae1c08fb0..0000000000
--- a/src/llama_stack/core/routers/datasets.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    AppendRowsParams,
-    DatasetIO,
-    DatasetPurpose,
-    DataSource,
-    IterRowsRequest,
-    PaginatedResponse,
-    RoutingTable,
-)
-
-logger = get_logger(name=__name__, category="core::routers")
-
-
-class DatasetIORouter(DatasetIO):
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing DatasetIORouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("DatasetIORouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("DatasetIORouter.shutdown")
-        pass
-
-    async def register_dataset(
-        self,
-        purpose: DatasetPurpose,
-        source: DataSource,
-        metadata: dict[str, Any] | None = None,
-        dataset_id: str | None = None,
-    ) -> None:
-        logger.debug(
-            f"DatasetIORouter.register_dataset: {purpose=} {source=} {metadata=} {dataset_id=}",
-        )
-        await self.routing_table.register_dataset(
-            purpose=purpose,
-            source=source,
-            metadata=metadata,
-            dataset_id=dataset_id,
-        )
-
-    async def iterrows(self, request: IterRowsRequest) -> PaginatedResponse:
-        logger.debug(
-            f"DatasetIORouter.iterrows: {request.dataset_id}, start_index={request.start_index} limit={request.limit}",
-        )
-        provider = await self.routing_table.get_provider_impl(request.dataset_id)
-        return await provider.iterrows(
-            dataset_id=request.dataset_id,
-            start_index=request.start_index,
-            limit=request.limit,
-        )
-
-    async def append_rows(self, params: AppendRowsParams) -> None:
-        logger.debug(f"DatasetIORouter.append_rows: {params.dataset_id}, {len(params.rows)} rows")
-        provider = await self.routing_table.get_provider_impl(params.dataset_id)
-        return await provider.append_rows(
-            dataset_id=params.dataset_id,
-            rows=params.rows,
-        )
diff --git a/src/llama_stack/core/routers/eval_scoring.py b/src/llama_stack/core/routers/eval_scoring.py
deleted file mode 100644
index 9fe28b5a29..0000000000
--- a/src/llama_stack/core/routers/eval_scoring.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    BenchmarkConfig,
-    Eval,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    Job,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RoutingTable,
-    RunEvalRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
-
-logger = get_logger(name=__name__, category="core::routers")
-
-
-class ScoringRouter(Scoring):
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing ScoringRouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("ScoringRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("ScoringRouter.shutdown")
-        pass
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        logger.debug(f"ScoringRouter.score_batch: {request.dataset_id}")
-        res = {}
-        for fn_identifier in request.scoring_functions.keys():
-            provider = await self.routing_table.get_provider_impl(fn_identifier)
-            # Create a request for this specific scoring function
-            single_fn_request = ScoreBatchRequest(
-                dataset_id=request.dataset_id,
-                scoring_functions={fn_identifier: request.scoring_functions[fn_identifier]},
-                save_results_dataset=request.save_results_dataset,
-            )
-            score_response = await provider.score_batch(single_fn_request)
-            res.update(score_response.results)
-
-        if request.save_results_dataset:
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res,
-        )
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        logger.debug(f"ScoringRouter.score: {len(request.input_rows)} rows, {len(request.scoring_functions)} functions")
-        res = {}
-        # look up and map each scoring function to its provider impl
-        for fn_identifier in request.scoring_functions.keys():
-            provider = await self.routing_table.get_provider_impl(fn_identifier)
-            # Create a request for this specific scoring function
-            single_fn_request = ScoreRequest(
-                input_rows=request.input_rows,
-                scoring_functions={fn_identifier: request.scoring_functions[fn_identifier]},
-            )
-            score_response = await provider.score(single_fn_request)
-            res.update(score_response.results)
-
-        return ScoreResponse(results=res)
-
-
-class EvalRouter(Eval):
-    def __init__(
-        self,
-        routing_table: RoutingTable,
-    ) -> None:
-        logger.debug("Initializing EvalRouter")
-        self.routing_table = routing_table
-
-    async def initialize(self) -> None:
-        logger.debug("EvalRouter.initialize")
-        pass
-
-    async def shutdown(self) -> None:
-        logger.debug("EvalRouter.shutdown")
-        pass
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        benchmark_config: BenchmarkConfig | None = None,
-    ) -> Job:
-        """Run an evaluation on a benchmark.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            benchmark_config: (Deprecated) The benchmark configuration
-
-        Returns:
-            Job object representing the evaluation job
-        """
-        resolved_request = resolve_run_eval_request(
-            request, benchmark_id=benchmark_id, benchmark_config=benchmark_config
-        )
-        logger.debug(f"EvalRouter.run_eval: {resolved_request.benchmark_id}")
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.run_eval(resolved_request)
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        input_rows: list[dict[str, Any]] | None = None,
-        scoring_functions: list[str] | None = None,
-        benchmark_config: BenchmarkConfig | None = None,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            input_rows: (Deprecated) The rows to evaluate
-            scoring_functions: (Deprecated) The scoring functions to use
-            benchmark_config: (Deprecated) The benchmark configuration
-
-        Returns:
-            EvaluateResponse object containing generations and scores
-        """
-        resolved_request = resolve_evaluate_rows_request(
-            request,
-            benchmark_id=benchmark_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            benchmark_config=benchmark_config,
-        )
-        logger.debug(
-            f"EvalRouter.evaluate_rows: {resolved_request.benchmark_id}, {len(resolved_request.input_rows)} rows"
-        )
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.evaluate_rows(resolved_request)
-
-    async def job_status(
-        self,
-        request: JobStatusRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        job_id: str | None = None,
-    ) -> Job:
-        """Get the status of a job.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            job_id: (Deprecated) The job ID
-
-        Returns:
-            Job object with the current status
-        """
-        resolved_request = resolve_job_status_request(request, benchmark_id=benchmark_id, job_id=job_id)
-        logger.debug(f"EvalRouter.job_status: {resolved_request.benchmark_id}, {resolved_request.job_id}")
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.job_status(resolved_request)
-
-    async def job_cancel(
-        self,
-        request: JobCancelRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        job_id: str | None = None,
-    ) -> None:
-        """Cancel a job.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            job_id: (Deprecated) The job ID
-
-        Returns:
-            None
-        """
-        resolved_request = resolve_job_cancel_request(request, benchmark_id=benchmark_id, job_id=job_id)
-        logger.debug(f"EvalRouter.job_cancel: {resolved_request.benchmark_id}, {resolved_request.job_id}")
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        await provider.job_cancel(resolved_request)
-
-    async def job_result(
-        self,
-        request: JobResultRequest | None = None,
-        *,
-        benchmark_id: str | None = None,
-        job_id: str | None = None,
-    ) -> EvaluateResponse:
-        """Get the result of a job.
-
-        Supports both new-style (request object) and old-style (individual parameters).
-        Old-style usage is deprecated and will emit a DeprecationWarning.
-
-        Args:
-            request: The new-style request object (preferred)
-            benchmark_id: (Deprecated) The benchmark ID
-            job_id: (Deprecated) The job ID
-
-        Returns:
-            EvaluateResponse object with the job results
-        """
-        resolved_request = resolve_job_result_request(request, benchmark_id=benchmark_id, job_id=job_id)
-        logger.debug(f"EvalRouter.job_result: {resolved_request.benchmark_id}, {resolved_request.job_id}")
-        provider = await self.routing_table.get_provider_impl(resolved_request.benchmark_id)
-        return await provider.job_result(resolved_request)
diff --git a/src/llama_stack/core/routing_tables/benchmarks.py b/src/llama_stack/core/routing_tables/benchmarks.py
deleted file mode 100644
index d5e3799bab..0000000000
--- a/src/llama_stack/core/routing_tables/benchmarks.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack.core.datatypes import (
-    BenchmarkWithOwner,
-)
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    Benchmark,
-    Benchmarks,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-from .common import CommonRoutingTableImpl
-
-logger = get_logger(name=__name__, category="core::routing_tables")
-
-
-class BenchmarksRoutingTable(CommonRoutingTableImpl, Benchmarks):
-    async def list_benchmarks(self, request: ListBenchmarksRequest) -> ListBenchmarksResponse:
-        return ListBenchmarksResponse(data=await self.get_all_with_type("benchmark"))
-
-    async def get_benchmark(self, request: GetBenchmarkRequest) -> Benchmark:
-        benchmark = await self.get_object_by_identifier("benchmark", request.benchmark_id)
-        if benchmark is None:
-            raise ValueError(f"Benchmark '{request.benchmark_id}' not found")
-        return benchmark
-
-    async def register_benchmark(
-        self,
-        request: RegisterBenchmarkRequest,
-    ) -> None:
-        metadata = request.metadata if request.metadata is not None else {}
-        provider_id = request.provider_id
-        if provider_id is None:
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    "No provider specified and multiple providers available. Please specify a provider_id."
-                )
-        provider_benchmark_id = request.provider_benchmark_id
-        if provider_benchmark_id is None:
-            provider_benchmark_id = request.benchmark_id
-        benchmark = BenchmarkWithOwner(
-            identifier=request.benchmark_id,
-            dataset_id=request.dataset_id,
-            scoring_functions=request.scoring_functions,
-            metadata=metadata,
-            provider_id=provider_id,
-            provider_resource_id=provider_benchmark_id,
-        )
-        await self.register_object(benchmark)
-
-    async def unregister_benchmark(self, request: UnregisterBenchmarkRequest) -> None:
-        get_request = GetBenchmarkRequest(benchmark_id=request.benchmark_id)
-        existing_benchmark = await self.get_benchmark(get_request)
-        await self.unregister_object(existing_benchmark)
diff --git a/src/llama_stack/core/routing_tables/common.py b/src/llama_stack/core/routing_tables/common.py
index e61d1ee221..becad21549 100644
--- a/src/llama_stack/core/routing_tables/common.py
+++ b/src/llama_stack/core/routing_tables/common.py
@@ -13,7 +13,6 @@
     RoutableObject,
     RoutableObjectWithProvider,
     RoutedProtocol,
-    ScoringFnWithOwner,
 )
 from llama_stack.core.request_headers import get_authenticated_user
 from llama_stack.core.store import DistributionRegistry
@@ -39,12 +38,6 @@ async def register_object_with_provider(obj: RoutableObject, p: Any) -> Routable
         return await p.register_shield(obj)
     elif api == Api.vector_io:
         return await p.register_vector_store(obj)
-    elif api == Api.datasetio:
-        return await p.register_dataset(obj)
-    elif api == Api.scoring:
-        return await p.register_scoring_function(obj)
-    elif api == Api.eval:
-        return await p.register_benchmark(obj)
     elif api == Api.tool_runtime:
         return await p.register_toolgroup(obj)
     else:
@@ -59,12 +52,6 @@ async def unregister_object_from_provider(obj: RoutableObject, p: Any) -> None:
         return await p.unregister_model(obj.identifier)
     elif api == Api.safety:
         return await p.unregister_shield(obj.identifier)
-    elif api == Api.datasetio:
-        return await p.unregister_dataset(obj.identifier)
-    elif api == Api.eval:
-        return await p.unregister_benchmark(obj.identifier)
-    elif api == Api.scoring:
-        return await p.unregister_scoring_function(obj.identifier)
     elif api == Api.tool_runtime:
         return await p.unregister_toolgroup(obj.identifier)
     else:
@@ -106,14 +93,6 @@ async def add_objects(objs: list[RoutableObjectWithProvider], provider_id: str,
                 p.shield_store = self
             elif api == Api.vector_io:
                 p.vector_store_store = self
-            elif api == Api.datasetio:
-                p.dataset_store = self
-            elif api == Api.scoring:
-                p.scoring_function_store = self
-                scoring_functions = await p.list_scoring_functions()
-                await add_objects(scoring_functions, pid, ScoringFnWithOwner)
-            elif api == Api.eval:
-                p.benchmark_store = self
             elif api == Api.tool_runtime:
                 p.tool_store = self
 
@@ -125,10 +104,7 @@ async def refresh(self) -> None:
         pass
 
     async def get_provider_impl(self, routing_key: str, provider_id: str | None = None) -> Any:
-        from .benchmarks import BenchmarksRoutingTable
-        from .datasets import DatasetsRoutingTable
         from .models import ModelsRoutingTable
-        from .scoring_functions import ScoringFunctionsRoutingTable
         from .shields import ShieldsRoutingTable
         from .toolgroups import ToolGroupsRoutingTable
         from .vector_stores import VectorStoresRoutingTable
@@ -140,12 +116,6 @@ def apiname_object():
                 return ("Safety", "shield")
             elif isinstance(self, VectorStoresRoutingTable):
                 return ("VectorIO", "vector_store")
-            elif isinstance(self, DatasetsRoutingTable):
-                return ("DatasetIO", "dataset")
-            elif isinstance(self, ScoringFunctionsRoutingTable):
-                return ("Scoring", "scoring_function")
-            elif isinstance(self, BenchmarksRoutingTable):
-                return ("Eval", "benchmark")
             elif isinstance(self, ToolGroupsRoutingTable):
                 return ("ToolGroups", "tool_group")
             else:
diff --git a/src/llama_stack/core/routing_tables/datasets.py b/src/llama_stack/core/routing_tables/datasets.py
deleted file mode 100644
index 43aa0692c1..0000000000
--- a/src/llama_stack/core/routing_tables/datasets.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import uuid
-
-from llama_stack.core.datatypes import (
-    DatasetWithOwner,
-)
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    Dataset,
-    DatasetNotFoundError,
-    DatasetType,
-    ListDatasetsResponse,
-    ResourceType,
-    RowsDataSource,
-    URIDataSource,
-)
-from llama_stack_api.datasets.api import (
-    Datasets,
-    GetDatasetRequest,
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
-)
-
-from .common import CommonRoutingTableImpl
-
-logger = get_logger(name=__name__, category="core::routing_tables")
-
-
-class DatasetsRoutingTable(CommonRoutingTableImpl, Datasets):
-    async def list_datasets(self) -> ListDatasetsResponse:
-        return ListDatasetsResponse(data=await self.get_all_with_type(ResourceType.dataset.value))
-
-    async def get_dataset(self, request: GetDatasetRequest) -> Dataset:
-        dataset = await self.get_object_by_identifier("dataset", request.dataset_id)
-        if dataset is None:
-            raise DatasetNotFoundError(request.dataset_id)
-        return dataset
-
-    async def register_dataset(self, request: RegisterDatasetRequest) -> Dataset:
-        purpose = request.purpose
-        source = request.source
-        metadata = request.metadata
-        dataset_id = request.dataset_id
-        if isinstance(source, dict):
-            if source["type"] == "uri":
-                source = URIDataSource.parse_obj(source)
-            elif source["type"] == "rows":
-                source = RowsDataSource.parse_obj(source)
-
-        if not dataset_id:
-            dataset_id = f"dataset-{str(uuid.uuid4())}"
-
-        provider_dataset_id = dataset_id
-
-        # infer provider from source
-        if metadata and metadata.get("provider_id"):
-            provider_id = metadata.get("provider_id")  # pass through from nvidia datasetio
-        elif source.type == DatasetType.rows.value:
-            provider_id = "localfs"
-        elif source.type == DatasetType.uri.value:
-            # infer provider from uri
-            if source.uri.startswith("huggingface"):
-                provider_id = "huggingface"
-            else:
-                provider_id = "localfs"
-        else:
-            raise ValueError(f"Unknown data source type: {source.type}")
-
-        if metadata is None:
-            metadata = {}
-
-        dataset = DatasetWithOwner(
-            identifier=dataset_id,
-            provider_resource_id=provider_dataset_id,
-            provider_id=provider_id,
-            purpose=purpose,
-            source=source,
-            metadata=metadata,
-        )
-
-        await self.register_object(dataset)
-        return dataset
-
-    async def unregister_dataset(self, request: UnregisterDatasetRequest) -> None:
-        dataset = await self.get_dataset(GetDatasetRequest(dataset_id=request.dataset_id))
-        await self.unregister_object(dataset)
diff --git a/src/llama_stack/core/routing_tables/scoring_functions.py b/src/llama_stack/core/routing_tables/scoring_functions.py
deleted file mode 100644
index b9f2855905..0000000000
--- a/src/llama_stack/core/routing_tables/scoring_functions.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack.core.datatypes import (
-    ScoringFnWithOwner,
-)
-from llama_stack.log import get_logger
-from llama_stack_api import (
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    RegisterScoringFunctionRequest,
-    ResourceType,
-    ScoringFn,
-    ScoringFunctions,
-    UnregisterScoringFunctionRequest,
-)
-
-from .common import CommonRoutingTableImpl
-
-logger = get_logger(name=__name__, category="core::routing_tables")
-
-
-class ScoringFunctionsRoutingTable(CommonRoutingTableImpl, ScoringFunctions):
-    async def list_scoring_functions(self, request: ListScoringFunctionsRequest) -> ListScoringFunctionsResponse:
-        return ListScoringFunctionsResponse(data=await self.get_all_with_type(ResourceType.scoring_function.value))
-
-    async def get_scoring_function(self, request: GetScoringFunctionRequest) -> ScoringFn:
-        scoring_fn = await self.get_object_by_identifier("scoring_function", request.scoring_fn_id)
-        if scoring_fn is None:
-            raise ValueError(f"Scoring function '{request.scoring_fn_id}' not found")
-        return scoring_fn
-
-    async def register_scoring_function(
-        self,
-        request: RegisterScoringFunctionRequest,
-    ) -> None:
-        provider_scoring_fn_id = request.provider_scoring_fn_id
-        if provider_scoring_fn_id is None:
-            provider_scoring_fn_id = request.scoring_fn_id
-        provider_id = request.provider_id
-        if provider_id is None:
-            if len(self.impls_by_provider_id) == 1:
-                provider_id = list(self.impls_by_provider_id.keys())[0]
-            else:
-                raise ValueError(
-                    "No provider specified and multiple providers available. Please specify a provider_id."
-                )
-        scoring_fn = ScoringFnWithOwner(
-            identifier=request.scoring_fn_id,
-            description=request.description,
-            return_type=request.return_type,
-            provider_resource_id=provider_scoring_fn_id,
-            provider_id=provider_id,
-            params=request.params,
-        )
-        scoring_fn.provider_id = provider_id
-        await self.register_object(scoring_fn)
-
-    async def unregister_scoring_function(self, request: UnregisterScoringFunctionRequest) -> None:
-        get_request = GetScoringFunctionRequest(scoring_fn_id=request.scoring_fn_id)
-        existing_scoring_fn = await self.get_scoring_function(get_request)
-        await self.unregister_object(existing_scoring_fn)
diff --git a/src/llama_stack/core/server/fastapi_router_registry.py b/src/llama_stack/core/server/fastapi_router_registry.py
index 01f6881e9b..b4789f50a0 100644
--- a/src/llama_stack/core/server/fastapi_router_registry.py
+++ b/src/llama_stack/core/server/fastapi_router_registry.py
@@ -20,12 +20,8 @@
     admin,
     agents,
     batches,
-    benchmarks,
     connectors,
     conversations,
-    datasetio,
-    datasets,
-    eval,
     file_processors,
     files,
     inference,
@@ -34,8 +30,6 @@
     prompts,
     providers,
     safety,
-    scoring,
-    scoring_functions,
     shields,
     tools,
     vector_io,
@@ -48,12 +42,8 @@
     "admin": admin.fastapi_routes.create_router,
     "agents": agents.fastapi_routes.create_router,
     "batches": batches.fastapi_routes.create_router,
-    "benchmarks": benchmarks.fastapi_routes.create_router,
     "connectors": connectors.fastapi_routes.create_router,
     "conversations": conversations.fastapi_routes.create_router,
-    "datasetio": datasetio.fastapi_routes.create_router,
-    "datasets": datasets.fastapi_routes.create_router,
-    "eval": eval.fastapi_routes.create_router,
     "file_processors": file_processors.fastapi_routes.create_router,
     "files": files.fastapi_routes.create_router,
     "inference": inference.fastapi_routes.create_router,
@@ -62,8 +52,6 @@
     "prompts": prompts.fastapi_routes.create_router,
     "providers": providers.fastapi_routes.create_router,
     "safety": safety.fastapi_routes.create_router,
-    "scoring": scoring.fastapi_routes.create_router,
-    "scoring_functions": scoring_functions.fastapi_routes.create_router,
     "shields": shields.fastapi_routes.create_router,
     "tool_groups": tools.fastapi_routes.create_router,
     "vector_io": vector_io.fastapi_routes.create_router,
diff --git a/src/llama_stack/core/stack.py b/src/llama_stack/core/stack.py
index 0d44fb97d7..5ebc4a4a11 100644
--- a/src/llama_stack/core/stack.py
+++ b/src/llama_stack/core/stack.py
@@ -50,12 +50,8 @@
     Agents,
     Api,
     Batches,
-    Benchmarks,
     Connectors,
     Conversations,
-    DatasetIO,
-    Datasets,
-    Eval,
     Files,
     Inference,
     Inspect,
@@ -63,18 +59,13 @@
     ModelType,
     Prompts,
     Providers,
-    RegisterBenchmarkRequest,
     RegisterModelRequest,
-    RegisterScoringFunctionRequest,
     RegisterShieldRequest,
     Safety,
-    Scoring,
-    ScoringFunctions,
     Shields,
     ToolGroupNotFoundError,
     VectorIO,
 )
-from llama_stack_api.datasets import RegisterDatasetRequest
 
 logger = get_logger(name=__name__, category="core")
 
@@ -85,13 +76,7 @@ class LlamaStack(
     Agents,
     Batches,
     Safety,
-    Datasets,
     VectorIO,
-    Eval,
-    Benchmarks,
-    Scoring,
-    ScoringFunctions,
-    DatasetIO,
     Models,
     Shields,
     Inspect,
@@ -108,15 +93,6 @@ class LlamaStack(
 RESOURCES = [
     ("models", Api.models, "register_model", "list_models", RegisterModelRequest),
     ("shields", Api.shields, "register_shield", "list_shields", RegisterShieldRequest),
-    ("datasets", Api.datasets, "register_dataset", "list_datasets", RegisterDatasetRequest),
-    (
-        "scoring_fns",
-        Api.scoring_functions,
-        "register_scoring_function",
-        "list_scoring_functions",
-        RegisterScoringFunctionRequest,
-    ),
-    ("benchmarks", Api.benchmarks, "register_benchmark", "list_benchmarks", RegisterBenchmarkRequest),
     ("vector_stores", Api.vector_stores, "register_vector_store", "list_vector_stores", None),
 ]
 
@@ -131,9 +107,6 @@ class LlamaStack(
     "vector_store_id",
     "model_id",
     "shield_id",
-    "dataset_id",
-    "scoring_fn_id",
-    "benchmark_id",
 ]
 
 
diff --git a/src/llama_stack/distributions/ci-tests/build.yaml b/src/llama_stack/distributions/ci-tests/build.yaml
index 40aa0d5a3b..01bd087c37 100644
--- a/src/llama_stack/distributions/ci-tests/build.yaml
+++ b/src/llama_stack/distributions/ci-tests/build.yaml
@@ -38,15 +38,6 @@ distribution_spec:
     - provider_type: inline::code-scanner
     agents:
     - provider_type: inline::builtin
-    eval:
-    - provider_type: inline::builtin
-    datasetio:
-    - provider_type: remote::huggingface
-    - provider_type: inline::localfs
-    scoring:
-    - provider_type: inline::basic
-    - provider_type: inline::llm-as-judge
-    - provider_type: inline::braintrust
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
diff --git a/src/llama_stack/distributions/ci-tests/config.yaml b/src/llama_stack/distributions/ci-tests/config.yaml
index 46242331f0..598d13a6db 100644
--- a/src/llama_stack/distributions/ci-tests/config.yaml
+++ b/src/llama_stack/distributions/ci-tests/config.yaml
@@ -3,13 +3,10 @@ distro_name: ci-tests
 apis:
 - agents
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -214,35 +211,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -316,9 +284,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
   auth:
diff --git a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
index 5d0d19a940..5eca7a294a 100644
--- a/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/ci-tests/run-with-postgres-store.yaml
@@ -3,13 +3,10 @@ distro_name: ci-tests
 apis:
 - agents
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -214,35 +211,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -329,9 +297,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
   auth:
diff --git a/src/llama_stack/distributions/dell/config.yaml b/src/llama_stack/distributions/dell/config.yaml
index dbb1ae7dac..9c8f618735 100644
--- a/src/llama_stack/distributions/dell/config.yaml
+++ b/src/llama_stack/distributions/dell/config.yaml
@@ -2,11 +2,8 @@ version: 2
 distro_name: dell
 apis:
 - agents
-- datasetio
-- eval
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -47,35 +44,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -132,8 +100,5 @@ registered_resources:
     model_type: rerank
   shields: []
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/dell/dell.py b/src/llama_stack/distributions/dell/dell.py
index 271955f15c..6732479936 100644
--- a/src/llama_stack/distributions/dell/dell.py
+++ b/src/llama_stack/distributions/dell/dell.py
@@ -35,16 +35,6 @@ def get_distribution_template() -> DistributionTemplate:
         ],
         "safety": [BuildProvider(provider_type="inline::llama-guard")],
         "agents": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
diff --git a/src/llama_stack/distributions/dell/run-with-safety.yaml b/src/llama_stack/distributions/dell/run-with-safety.yaml
index 2365b1b99c..05fde3372f 100644
--- a/src/llama_stack/distributions/dell/run-with-safety.yaml
+++ b/src/llama_stack/distributions/dell/run-with-safety.yaml
@@ -2,11 +2,8 @@ version: 2
 distro_name: dell
 apis:
 - agents
-- datasetio
-- eval
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -51,35 +48,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -141,8 +109,5 @@ registered_resources:
   shields:
   - shield_id: ${env.SAFETY_MODEL}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/nvidia/config.yaml b/src/llama_stack/distributions/nvidia/config.yaml
index e8de16609c..e3883f513b 100644
--- a/src/llama_stack/distributions/nvidia/config.yaml
+++ b/src/llama_stack/distributions/nvidia/config.yaml
@@ -2,12 +2,9 @@ version: 2
 distro_name: nvidia
 apis:
 - agents
-- datasetio
-- eval
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -43,22 +40,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-  datasetio:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:=}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
   tool_runtime:
   - provider_id: file-search
     provider_type: inline::file-search
@@ -100,8 +81,5 @@ registered_resources:
   models: []
   shields: []
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/nvidia/nvidia.py b/src/llama_stack/distributions/nvidia/nvidia.py
index fa0c5cdc5a..b9f3659dbc 100644
--- a/src/llama_stack/distributions/nvidia/nvidia.py
+++ b/src/llama_stack/distributions/nvidia/nvidia.py
@@ -9,8 +9,6 @@
 from llama_stack.core.datatypes import BuildProvider, ModelInput, Provider, ShieldInput
 from llama_stack.distributions.template import DistributionTemplate, RunConfigSettings
 from llama_stack.providers.inline.files.localfs.config import LocalfsFilesImplConfig
-from llama_stack.providers.remote.datasetio.nvidia import NvidiaDatasetIOConfig
-from llama_stack.providers.remote.eval.nvidia import NVIDIAEvalConfig
 from llama_stack.providers.remote.inference.nvidia import NVIDIAConfig
 from llama_stack.providers.remote.safety.nvidia import NVIDIASafetyConfig
 
@@ -21,12 +19,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
         "vector_io": [BuildProvider(provider_type="inline::faiss")],
         "safety": [BuildProvider(provider_type="remote::nvidia")],
         "agents": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="remote::nvidia")],
-        "datasetio": [
-            BuildProvider(provider_type="inline::localfs"),
-            BuildProvider(provider_type="remote::nvidia"),
-        ],
-        "scoring": [BuildProvider(provider_type="inline::basic")],
         "tool_runtime": [BuildProvider(provider_type="inline::file-search")],
         "files": [BuildProvider(provider_type="inline::localfs")],
     }
@@ -41,16 +33,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
         provider_type="remote::nvidia",
         config=NVIDIASafetyConfig.sample_run_config(),
     )
-    datasetio_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NvidiaDatasetIOConfig.sample_run_config(),
-    )
-    eval_provider = Provider(
-        provider_id="nvidia",
-        provider_type="remote::nvidia",
-        config=NVIDIAEvalConfig.sample_run_config(),
-    )
     files_provider = Provider(
         provider_id="builtin-files",
         provider_type="inline::localfs",
@@ -68,7 +50,7 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
-        description="Use NVIDIA NIM for running LLM inference, evaluation and safety",
+        description="Use NVIDIA NIM for running LLM inference and safety",
         container_image=None,
         template_path=Path(__file__).parent / "doc_template.md",
         providers=providers,
@@ -76,8 +58,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
             "config.yaml": RunConfigSettings(
                 provider_overrides={
                     "inference": [inference_provider],
-                    "datasetio": [datasetio_provider],
-                    "eval": [eval_provider],
                     "files": [files_provider],
                 },
             ),
@@ -87,7 +67,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
                         inference_provider,
                         safety_provider,
                     ],
-                    "eval": [eval_provider],
                     "files": [files_provider],
                 },
                 default_models=[inference_model, safety_model],
@@ -103,14 +82,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
                 "True",
                 "Whether to append the API version to the base_url",
             ),
-            "NVIDIA_DATASET_NAMESPACE": (
-                "default",
-                "NVIDIA Dataset Namespace",
-            ),
-            "NVIDIA_PROJECT_ID": (
-                "test-project",
-                "NVIDIA Project ID",
-            ),
             "GUARDRAILS_SERVICE_URL": (
                 "http://0.0.0.0:7331",
                 "URL for the NeMo Guardrails Service",
@@ -119,10 +90,6 @@ def get_distribution_template(name: str = "nvidia") -> DistributionTemplate:
                 "self-check",
                 "NVIDIA Guardrail Configuration ID",
             ),
-            "NVIDIA_EVALUATOR_URL": (
-                "http://0.0.0.0:7331",
-                "URL for the NeMo Evaluator Service",
-            ),
             "INFERENCE_MODEL": (
                 "Llama3.1-8B-Instruct",
                 "Inference model",
diff --git a/src/llama_stack/distributions/nvidia/run-with-safety.yaml b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
index 756cc3fc70..db754758a0 100644
--- a/src/llama_stack/distributions/nvidia/run-with-safety.yaml
+++ b/src/llama_stack/distributions/nvidia/run-with-safety.yaml
@@ -2,12 +2,9 @@ version: 2
 distro_name: nvidia
 apis:
 - agents
-- datasetio
-- eval
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -48,28 +45,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      evaluator_url: ${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}
-  datasetio:
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  - provider_id: nvidia
-    provider_type: remote::nvidia
-    config:
-      api_key: ${env.NVIDIA_API_KEY:=}
-      dataset_namespace: ${env.NVIDIA_DATASET_NAMESPACE:=default}
-      project_id: ${env.NVIDIA_PROJECT_ID:=test-project}
-      datasets_url: ${env.NVIDIA_DATASETS_URL:=http://nemo.test}
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
   tool_runtime:
   - provider_id: file-search
     provider_type: inline::file-search
@@ -121,8 +96,5 @@ registered_resources:
   - shield_id: ${env.SAFETY_MODEL}
     provider_id: nvidia
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/oci/config.yaml b/src/llama_stack/distributions/oci/config.yaml
index 7e54e16e18..d5128a6423 100644
--- a/src/llama_stack/distributions/oci/config.yaml
+++ b/src/llama_stack/distributions/oci/config.yaml
@@ -2,12 +2,9 @@ version: 2
 distro_name: oci
 apis:
 - agents
-- datasetio
-- eval
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -45,35 +42,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -127,8 +95,5 @@ registered_resources:
   models: []
   shields: []
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/oci/oci.py b/src/llama_stack/distributions/oci/oci.py
index 651e2cc747..0fe72f1d76 100644
--- a/src/llama_stack/distributions/oci/oci.py
+++ b/src/llama_stack/distributions/oci/oci.py
@@ -23,16 +23,6 @@ def get_distribution_template(name: str = "oci") -> DistributionTemplate:
         ],
         "safety": [BuildProvider(provider_type="inline::llama-guard")],
         "agents": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
diff --git a/src/llama_stack/distributions/open-benchmark/config.yaml b/src/llama_stack/distributions/open-benchmark/config.yaml
index f21e8987af..0938350234 100644
--- a/src/llama_stack/distributions/open-benchmark/config.yaml
+++ b/src/llama_stack/distributions/open-benchmark/config.yaml
@@ -2,11 +2,8 @@ version: 2
 distro_name: open-benchmark
 apis:
 - agents
-- datasetio
-- eval
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -84,35 +81,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -184,74 +152,5 @@ registered_resources:
   shields:
   - shield_id: meta-llama/Llama-Guard-3-8B
   vector_dbs: []
-  datasets:
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/simpleqa?split=train
-    metadata: {}
-    dataset_id: simpleqa
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/mmlu_cot?split=test&name=all
-    metadata: {}
-    dataset_id: mmlu_cot
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main
-    metadata: {}
-    dataset_id: gpqa_cot
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/math_500?split=test
-    metadata: {}
-    dataset_id: math_500
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/IfEval?split=train
-    metadata: {}
-    dataset_id: ifeval
-  - purpose: eval/messages-answer
-    source:
-      type: uri
-      uri: huggingface://datasets/llamastack/docvqa?split=val
-    metadata: {}
-    dataset_id: docvqa
-  scoring_fns: []
-  benchmarks:
-  - dataset_id: simpleqa
-    scoring_functions:
-    - llm-as-judge::405b-simpleqa
-    metadata: {}
-    benchmark_id: builtin-simpleqa
-  - dataset_id: mmlu_cot
-    scoring_functions:
-    - basic::regex_parser_multiple_choice_answer
-    metadata: {}
-    benchmark_id: builtin-mmlu-cot
-  - dataset_id: gpqa_cot
-    scoring_functions:
-    - basic::regex_parser_multiple_choice_answer
-    metadata: {}
-    benchmark_id: builtin-gpqa-cot
-  - dataset_id: math_500
-    scoring_functions:
-    - basic::regex_parser_math_response
-    metadata: {}
-    benchmark_id: builtin-math-500
-  - dataset_id: ifeval
-    scoring_functions:
-    - basic::ifeval
-    metadata: {}
-    benchmark_id: builtin-ifeval
-  - dataset_id: docvqa
-    scoring_functions:
-    - basic::docvqa
-    metadata: {}
-    benchmark_id: builtin-docvqa
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/open-benchmark/open_benchmark.py b/src/llama_stack/distributions/open-benchmark/open_benchmark.py
index 8b782e4dad..daf3e64d44 100644
--- a/src/llama_stack/distributions/open-benchmark/open_benchmark.py
+++ b/src/llama_stack/distributions/open-benchmark/open_benchmark.py
@@ -6,9 +6,7 @@
 
 
 from llama_stack.core.datatypes import (
-    BenchmarkInput,
     BuildProvider,
-    DatasetInput,
     ModelInput,
     Provider,
     ShieldInput,
@@ -31,7 +29,7 @@
     PGVectorVectorIOConfig,
 )
 from llama_stack.providers.utils.inference.model_registry import ProviderModelEntry
-from llama_stack_api import DatasetPurpose, ModelType, URIDataSource
+from llama_stack_api import ModelType
 
 
 def get_inference_providers() -> tuple[list[Provider], dict[str, list[ProviderModelEntry]]]:
@@ -103,16 +101,6 @@ def get_distribution_template() -> DistributionTemplate:
         ],
         "safety": [BuildProvider(provider_type="inline::llama-guard")],
         "agents": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
@@ -163,83 +151,6 @@ def get_distribution_template() -> DistributionTemplate:
         ),
     ]
 
-    default_datasets = [
-        DatasetInput(
-            dataset_id="simpleqa",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/simpleqa?split=train",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="mmlu_cot",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/mmlu_cot?split=test&name=all",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="gpqa_cot",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/gpqa_0shot_cot?split=test&name=gpqa_main",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="math_500",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/math_500?split=test",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="ifeval",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/IfEval?split=train",
-            ),
-        ),
-        DatasetInput(
-            dataset_id="docvqa",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(
-                uri="huggingface://datasets/llamastack/docvqa?split=val",
-            ),
-        ),
-    ]
-
-    default_benchmarks = [
-        BenchmarkInput(
-            benchmark_id="builtin-simpleqa",
-            dataset_id="simpleqa",
-            scoring_functions=["llm-as-judge::405b-simpleqa"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-mmlu-cot",
-            dataset_id="mmlu_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-gpqa-cot",
-            dataset_id="gpqa_cot",
-            scoring_functions=["basic::regex_parser_multiple_choice_answer"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-math-500",
-            dataset_id="math_500",
-            scoring_functions=["basic::regex_parser_math_response"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-ifeval",
-            dataset_id="ifeval",
-            scoring_functions=["basic::ifeval"],
-        ),
-        BenchmarkInput(
-            benchmark_id="builtin-docvqa",
-            dataset_id="docvqa",
-            scoring_functions=["basic::docvqa"],
-        ),
-    ]
     return DistributionTemplate(
         name=name,
         distro_type="self_hosted",
@@ -256,8 +167,6 @@ def get_distribution_template() -> DistributionTemplate:
                 },
                 default_models=default_models,
                 default_shields=[ShieldInput(shield_id="meta-llama/Llama-Guard-3-8B")],
-                default_datasets=default_datasets,
-                default_benchmarks=default_benchmarks,
             ),
         },
         run_config_env_vars={
diff --git a/src/llama_stack/distributions/postgres-demo/config.yaml b/src/llama_stack/distributions/postgres-demo/config.yaml
index a864bab4b5..baa139da8c 100644
--- a/src/llama_stack/distributions/postgres-demo/config.yaml
+++ b/src/llama_stack/distributions/postgres-demo/config.yaml
@@ -110,8 +110,5 @@ registered_resources:
   shields:
   - shield_id: meta-llama/Llama-Guard-3-8B
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/starter-gpu/build.yaml b/src/llama_stack/distributions/starter-gpu/build.yaml
index a599104355..a097bb02b8 100644
--- a/src/llama_stack/distributions/starter-gpu/build.yaml
+++ b/src/llama_stack/distributions/starter-gpu/build.yaml
@@ -39,16 +39,7 @@ distribution_spec:
     - provider_type: inline::code-scanner
     agents:
     - provider_type: inline::builtin
-    eval:
-    - provider_type: inline::builtin
-    datasetio:
-    - provider_type: remote::huggingface
-    - provider_type: inline::localfs
-    scoring:
-    - provider_type: inline::basic
-    - provider_type: inline::llm-as-judge
-    - provider_type: inline::braintrust
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
     - provider_type: inline::file-search
diff --git a/src/llama_stack/distributions/starter-gpu/config.yaml b/src/llama_stack/distributions/starter-gpu/config.yaml
index b8a422f949..f83fc2f116 100644
--- a/src/llama_stack/distributions/starter-gpu/config.yaml
+++ b/src/llama_stack/distributions/starter-gpu/config.yaml
@@ -3,13 +3,10 @@ distro_name: starter-gpu
 apis:
 - agents
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -208,35 +205,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -295,9 +263,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
 vector_stores:
diff --git a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
index 1f7874df12..7e0b4b493b 100644
--- a/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter-gpu/run-with-postgres-store.yaml
@@ -3,13 +3,10 @@ distro_name: starter-gpu
 apis:
 - agents
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -208,35 +205,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -308,9 +276,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
 vector_stores:
diff --git a/src/llama_stack/distributions/starter/build.yaml b/src/llama_stack/distributions/starter/build.yaml
index 27e6fc0bf6..acf54a13ee 100644
--- a/src/llama_stack/distributions/starter/build.yaml
+++ b/src/llama_stack/distributions/starter/build.yaml
@@ -39,16 +39,7 @@ distribution_spec:
     - provider_type: inline::code-scanner
     agents:
     - provider_type: inline::builtin
-    eval:
-    - provider_type: inline::builtin
-    datasetio:
-    - provider_type: remote::huggingface
-    - provider_type: inline::localfs
-    scoring:
-    - provider_type: inline::basic
-    - provider_type: inline::llm-as-judge
-    - provider_type: inline::braintrust
     tool_runtime:
     - provider_type: remote::brave-search
     - provider_type: remote::tavily-search
     - provider_type: inline::file-search
diff --git a/src/llama_stack/distributions/starter/config.yaml b/src/llama_stack/distributions/starter/config.yaml
index 7f757401ca..f661d03be3 100644
--- a/src/llama_stack/distributions/starter/config.yaml
+++ b/src/llama_stack/distributions/starter/config.yaml
@@ -3,13 +3,10 @@ distro_name: starter
 apis:
 - agents
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -208,35 +205,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -295,9 +263,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
 vector_stores:
diff --git a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
index aecf3eb99c..928b8bc6da 100644
--- a/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
+++ b/src/llama_stack/distributions/starter/run-with-postgres-store.yaml
@@ -3,13 +3,10 @@ distro_name: starter
 apis:
 - agents
 - batches
-- datasetio
-- eval
 - file_processors
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -208,35 +205,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -308,9 +276,6 @@ registered_resources:
     provider_id: ${env.CODE_SCANNER_MODEL:+code-scanner}
     provider_shield_id: ${env.CODE_SCANNER_MODEL:=}
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
 vector_stores:
diff --git a/src/llama_stack/distributions/starter/starter.py b/src/llama_stack/distributions/starter/starter.py
index 1b790883db..24a8b2dd3c 100644
--- a/src/llama_stack/distributions/starter/starter.py
+++ b/src/llama_stack/distributions/starter/starter.py
@@ -142,16 +142,6 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
             BuildProvider(provider_type="inline::code-scanner"),
         ],
         "agents": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
diff --git a/src/llama_stack/distributions/template.py b/src/llama_stack/distributions/template.py
index ae29a1c024..81df611cd0 100644
--- a/src/llama_stack/distributions/template.py
+++ b/src/llama_stack/distributions/template.py
@@ -15,9 +15,7 @@
 from llama_stack.core.datatypes import (
     LLAMA_STACK_RUN_CONFIG_VERSION,
     Api,
-    BenchmarkInput,
     BuildProvider,
-    DatasetInput,
     ModelInput,
     Provider,
     SafetyConfig,
@@ -177,8 +175,6 @@ class RunConfigSettings(BaseModel):
     provider_overrides: dict[str, list[Provider]] = Field(default_factory=dict)
     default_models: list[ModelInput] | None = None
     default_shields: list[ShieldInput] | None = None
-    default_datasets: list[DatasetInput] | None = None
-    default_benchmarks: list[BenchmarkInput] | None = None
     default_connectors: list[ConnectorInput] | None = None
     vector_stores_config: VectorStoresConfig | None = None
     safety_config: SafetyConfig | None = None
@@ -278,9 +274,6 @@ def run_config(
                 "models": [m.model_dump(exclude_none=True) for m in (self.default_models or [])],
                 "shields": [s.model_dump(exclude_none=True) for s in (self.default_shields or [])],
                 "vector_dbs": [],
-                "datasets": [d.model_dump(exclude_none=True) for d in (self.default_datasets or [])],
-                "scoring_fns": [],
-                "benchmarks": [b.model_dump(exclude_none=True) for b in (self.default_benchmarks or [])],
             },
             "server": {
                 "port": 8321,
diff --git a/src/llama_stack/distributions/watsonx/config.yaml b/src/llama_stack/distributions/watsonx/config.yaml
index 2d2e206218..c3c5e134f2 100644
--- a/src/llama_stack/distributions/watsonx/config.yaml
+++ b/src/llama_stack/distributions/watsonx/config.yaml
@@ -2,12 +2,9 @@ version: 2
 distro_name: watsonx
 apis:
 - agents
-- datasetio
-- eval
 - files
 - inference
 - safety
-- scoring
 - tool_runtime
 - vector_io
 providers:
@@ -43,35 +40,6 @@ providers:
           backend: sql_default
           max_write_queue_size: 10000
           num_writers: 4
-  eval:
-  - provider_id: builtin
-    provider_type: inline::builtin
-    config:
-      kvstore:
-        namespace: eval
-        backend: kv_default
-  datasetio:
-  - provider_id: huggingface
-    provider_type: remote::huggingface
-    config:
-      kvstore:
-        namespace: datasetio::huggingface
-        backend: kv_default
-  - provider_id: localfs
-    provider_type: inline::localfs
-    config:
-      kvstore:
-        namespace: datasetio::localfs
-        backend: kv_default
-  scoring:
-  - provider_id: basic
-    provider_type: inline::basic
-  - provider_id: llm-as-judge
-    provider_type: inline::llm-as-judge
-  - provider_id: braintrust
-    provider_type: inline::braintrust
-    config:
-      openai_api_key: ${env.OPENAI_API_KEY:=}
   tool_runtime:
   - provider_id: brave-search
     provider_type: remote::brave-search
@@ -125,8 +93,5 @@ registered_resources:
   models: []
   shields: []
   vector_dbs: []
-  datasets: []
-  scoring_fns: []
-  benchmarks: []
 server:
   port: 8321
diff --git a/src/llama_stack/distributions/watsonx/watsonx.py b/src/llama_stack/distributions/watsonx/watsonx.py
index 6fe9e2e30a..f4eb2de614 100644
--- a/src/llama_stack/distributions/watsonx/watsonx.py
+++ b/src/llama_stack/distributions/watsonx/watsonx.py
@@ -20,16 +20,6 @@ def get_distribution_template(name: str = "watsonx") -> DistributionTemplate:
         "vector_io": [BuildProvider(provider_type="inline::faiss")],
         "safety": [BuildProvider(provider_type="inline::llama-guard")],
         "agents": [BuildProvider(provider_type="inline::builtin")],
-        "eval": [BuildProvider(provider_type="inline::builtin")],
-        "datasetio": [
-            BuildProvider(provider_type="remote::huggingface"),
-            BuildProvider(provider_type="inline::localfs"),
-        ],
-        "scoring": [
-            BuildProvider(provider_type="inline::basic"),
-            BuildProvider(provider_type="inline::llm-as-judge"),
-            BuildProvider(provider_type="inline::braintrust"),
-        ],
         "tool_runtime": [
             BuildProvider(provider_type="remote::brave-search"),
             BuildProvider(provider_type="remote::tavily-search"),
diff --git a/src/llama_stack/providers/inline/datasetio/__init__.py b/src/llama_stack/providers/inline/datasetio/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/datasetio/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/datasetio/localfs/__init__.py b/src/llama_stack/providers/inline/datasetio/localfs/__init__.py
deleted file mode 100644
index 58aa6ffaf3..0000000000
--- a/src/llama_stack/providers/inline/datasetio/localfs/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from .config import LocalFSDatasetIOConfig
-
-
-async def get_provider_impl(
-    config: LocalFSDatasetIOConfig,
-    _deps: dict[str, Any],
-):
-    from .datasetio import LocalFSDatasetIOImpl
-
-    impl = LocalFSDatasetIOImpl(config)
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/datasetio/localfs/config.py b/src/llama_stack/providers/inline/datasetio/localfs/config.py
deleted file mode 100644
index 6e878df629..0000000000
--- a/src/llama_stack/providers/inline/datasetio/localfs/config.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.core.storage.datatypes import KVStoreReference
-
-
-class LocalFSDatasetIOConfig(BaseModel):
-    kvstore: KVStoreReference
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {
-            "kvstore": KVStoreReference(
-                backend="kv_default",
-                namespace="datasetio::localfs",
-            ).model_dump(exclude_none=True)
-        }
diff --git a/src/llama_stack/providers/inline/datasetio/localfs/datasetio.py b/src/llama_stack/providers/inline/datasetio/localfs/datasetio.py
deleted file mode 100644
index 85c7cff3e8..0000000000
--- a/src/llama_stack/providers/inline/datasetio/localfs/datasetio.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.storage.kvstore import kvstore_impl
-from llama_stack.providers.utils.datasetio.url_utils import get_dataframe_from_uri
-from llama_stack.providers.utils.pagination import paginate_records
-from llama_stack_api import Dataset, DatasetIO, DatasetsProtocolPrivate, PaginatedResponse
-
-from .config import LocalFSDatasetIOConfig
-
-DATASETS_PREFIX = "localfs_datasets:"
-
-
-class PandasDataframeDataset:
-    def __init__(self, dataset_def: Dataset, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.dataset_def = dataset_def
-        self.df = None
-
-    def __len__(self) -> int:
-        assert self.df is not None, "Dataset not loaded. Please call .load() first"
-        return len(self.df)
-
-    def __getitem__(self, idx):
-        assert self.df is not None, "Dataset not loaded. Please call .load() first"
-        if isinstance(idx, slice):
-            return self.df.iloc[idx].to_dict(orient="records")
-        else:
-            return self.df.iloc[idx].to_dict()
-
-    async def load(self) -> None:
-        if self.df is not None:
-            return
-
-        if self.dataset_def.source.type == "uri":
-            self.df = await get_dataframe_from_uri(self.dataset_def.source.uri)
-        elif self.dataset_def.source.type == "rows":
-            import pandas
-
-            self.df = pandas.DataFrame(self.dataset_def.source.rows)
-        else:
-            raise ValueError(f"Unsupported dataset source type: {self.dataset_def.source.type}")
-
-        if self.df is None:
-            raise ValueError(f"Failed to load dataset from {self.dataset_def.url}")
-
-
-class LocalFSDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
-    def __init__(self, config: LocalFSDatasetIOConfig) -> None:
-        self.config = config
-        # local registry for keeping track of datasets within the provider
-        self.dataset_infos = {}
-        self.kvstore = None
-
-    async def initialize(self) -> None:
-        self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing datasets from kvstore
-        start_key = DATASETS_PREFIX
-        end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
-
-        for dataset in stored_datasets:
-            dataset = Dataset.model_validate_json(dataset)
-            self.dataset_infos[dataset.identifier] = dataset
-
-    async def shutdown(self) -> None: ...
-
-    async def register_dataset(
-        self,
-        dataset_def: Dataset,
-    ) -> None:
-        # Store in kvstore
-        key = f"{DATASETS_PREFIX}{dataset_def.identifier}"
-        await self.kvstore.set(
-            key=key,
-            value=dataset_def.model_dump_json(),
-        )
-        self.dataset_infos[dataset_def.identifier] = dataset_def
-
-    async def unregister_dataset(self, dataset_id: str) -> None:
-        key = f"{DATASETS_PREFIX}{dataset_id}"
-        await self.kvstore.delete(key=key)
-        del self.dataset_infos[dataset_id]
-
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        dataset_def = self.dataset_infos[dataset_id]
-        dataset_impl = PandasDataframeDataset(dataset_def)
-        await dataset_impl.load()
-
-        records = dataset_impl.df.to_dict("records")
-        return paginate_records(records, start_index, limit)
-
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        import pandas
-
-        dataset_def = self.dataset_infos[dataset_id]
-        dataset_impl = PandasDataframeDataset(dataset_def)
-        await dataset_impl.load()
-
-        new_rows_df = pandas.DataFrame(rows)
-        dataset_impl.df = pandas.concat([dataset_impl.df, new_rows_df], ignore_index=True)
diff --git a/src/llama_stack/providers/inline/eval/__init__.py b/src/llama_stack/providers/inline/eval/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/eval/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/eval/builtin/__init__.py b/src/llama_stack/providers/inline/eval/builtin/__init__.py
deleted file mode 100644
index 4ec94b4c27..0000000000
--- a/src/llama_stack/providers/inline/eval/builtin/__init__.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import BuiltinEvalConfig
-
-
-async def get_provider_impl(
-    config: BuiltinEvalConfig,
-    deps: dict[Api, Any],
-):
-    from .eval import BuiltinEvalImpl
-
-    impl = BuiltinEvalImpl(
-        config,
-        deps[Api.datasetio],
-        deps[Api.datasets],
-        deps[Api.scoring],
-        deps[Api.inference],
-        deps[Api.agents],
-    )
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/eval/builtin/config.py b/src/llama_stack/providers/inline/eval/builtin/config.py
deleted file mode 100644
index c26e5e02f8..0000000000
--- a/src/llama_stack/providers/inline/eval/builtin/config.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.core.storage.datatypes import KVStoreReference
-
-
-class BuiltinEvalConfig(BaseModel):
-    kvstore: KVStoreReference
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {
-            "kvstore": KVStoreReference(
-                backend="kv_default",
-                namespace="eval",
-            ).model_dump(exclude_none=True)
-        }
diff --git a/src/llama_stack/providers/inline/eval/builtin/eval.py b/src/llama_stack/providers/inline/eval/builtin/eval.py
deleted file mode 100644
index bd242ae059..0000000000
--- a/src/llama_stack/providers/inline/eval/builtin/eval.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import json
-from typing import Any
-
-from tqdm import tqdm
-
-from llama_stack.core.storage.kvstore import kvstore_impl
-from llama_stack.providers.utils.common.data_schema_validator import ColumnName
-from llama_stack_api import (
-    Agents,
-    Benchmark,
-    BenchmarksProtocolPrivate,
-    DatasetIO,
-    Datasets,
-    Eval,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    Inference,
-    IterRowsRequest,
-    Job,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatus,
-    JobStatusRequest,
-    OpenAIChatCompletionRequestWithExtraBody,
-    OpenAICompletionRequestWithExtraBody,
-    OpenAISystemMessageParam,
-    OpenAIUserMessageParam,
-    RunEvalRequest,
-    ScoreRequest,
-    Scoring,
-)
-
-from .config import BuiltinEvalConfig
-
-EVAL_TASKS_PREFIX = "benchmarks:"
-
-
-class BuiltinEvalImpl(
-    Eval,
-    BenchmarksProtocolPrivate,
-):
-    def __init__(
-        self,
-        config: BuiltinEvalConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-        scoring_api: Scoring,
-        inference_api: Inference,
-        agents_api: Agents,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.scoring_api = scoring_api
-        self.inference_api = inference_api
-        self.agents_api = agents_api
-
-        # TODO: assume sync job, will need jobs API for async scheduling
-        self.jobs = {}
-
-        self.benchmarks = {}
-
-    async def initialize(self) -> None:
-        self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing benchmarks from kvstore
-        start_key = EVAL_TASKS_PREFIX
-        end_key = f"{EVAL_TASKS_PREFIX}\xff"
-        stored_benchmarks = await self.kvstore.values_in_range(start_key, end_key)
-
-        for benchmark in stored_benchmarks:
-            benchmark = Benchmark.model_validate_json(benchmark)
-            self.benchmarks[benchmark.identifier] = benchmark
-
-    async def shutdown(self) -> None: ...
-
-    async def register_benchmark(self, task_def: Benchmark) -> None:
-        # Store in kvstore
-        key = f"{EVAL_TASKS_PREFIX}{task_def.identifier}"
-        await self.kvstore.set(
-            key=key,
-            value=task_def.model_dump_json(),
-        )
-        self.benchmarks[task_def.identifier] = task_def
-
-    async def unregister_benchmark(self, benchmark_id: str) -> None:
-        if benchmark_id in self.benchmarks:
-            del self.benchmarks[benchmark_id]
-
-        key = f"{EVAL_TASKS_PREFIX}{benchmark_id}"
-        await self.kvstore.delete(key)
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest,
-    ) -> Job:
-        task_def = self.benchmarks[request.benchmark_id]
-        dataset_id = task_def.dataset_id
-        scoring_functions = task_def.scoring_functions
-
-        # TODO (xiyan): validate dataset schema
-        # dataset_def = await self.datasets_api.get_dataset(dataset_id=dataset_id)
-
-        all_rows = await self.datasetio_api.iterrows(
-            IterRowsRequest(
-                dataset_id=dataset_id,
-                limit=(-1 if request.benchmark_config.num_examples is None else request.benchmark_config.num_examples),
-            )
-        )
-        eval_rows_request = EvaluateRowsRequest(
-            benchmark_id=request.benchmark_id,
-            input_rows=all_rows.data,
-            scoring_functions=scoring_functions,
-            benchmark_config=request.benchmark_config,
-        )
-        res = await self.evaluate_rows(eval_rows_request)
-
-        # TODO: currently needs to wait for generation before returning
-        # need job scheduler queue (ray/celery) w/ jobs api
-        job_id = str(len(self.jobs))
-        self.jobs[job_id] = res
-        return Job(job_id=job_id, status=JobStatus.completed)
-
-    async def _run_model_generation(
-        self, input_rows: list[dict[str, Any]], request: EvaluateRowsRequest
-    ) -> list[dict[str, Any]]:
-        candidate = request.benchmark_config.eval_candidate
-        assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
-        sampling_params = {"max_tokens": candidate.sampling_params.max_tokens}
-
-        generations = []
-        for x in tqdm(input_rows):
-            if ColumnName.completion_input.value in x:
-                if candidate.sampling_params.stop:
-                    sampling_params["stop"] = candidate.sampling_params.stop
-
-                input_content = json.loads(x[ColumnName.completion_input.value])
-                params = OpenAICompletionRequestWithExtraBody(
-                    model=candidate.model,
-                    prompt=input_content,
-                    **sampling_params,
-                )
-                response = await self.inference_api.openai_completion(params)
-                generations.append({ColumnName.generated_answer.value: response.choices[0].text})
-            elif ColumnName.chat_completion_input.value in x:
-                chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
-                input_messages = [
-                    OpenAIUserMessageParam(**x) for x in chat_completion_input_json if x["role"] == "user"
-                ]
-
-                messages = []
-                if candidate.system_message:
-                    messages.append(candidate.system_message)
-
-                messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"]
-
-                messages += input_messages
-                params = OpenAIChatCompletionRequestWithExtraBody(
-                    model=candidate.model,
-                    messages=messages,
-                    **sampling_params,
-                )
-                response = await self.inference_api.openai_chat_completion(params)
-                generations.append({ColumnName.generated_answer.value: response.choices[0].message.content})
-            else:
-                raise ValueError("Invalid input row")
-
-        return generations
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest,
-    ) -> EvaluateResponse:
-        candidate = request.benchmark_config.eval_candidate
-        # Agent evaluation removed
-        if candidate.type == "model":
-            generations = await self._run_model_generation(request.input_rows, request)
-        else:
-            raise ValueError(f"Invalid candidate type: {candidate.type}")
-
-        # scoring with generated_answer
-        score_input_rows = [
-            input_r | generated_r for input_r, generated_r in zip(request.input_rows, generations, strict=False)
-        ]
-
-        if request.benchmark_config.scoring_params is not None:
-            scoring_functions_dict = {
-                scoring_fn_id: request.benchmark_config.scoring_params.get(scoring_fn_id, None)
-                for scoring_fn_id in request.scoring_functions
-            }
-        else:
-            scoring_functions_dict = dict.fromkeys(request.scoring_functions)
-
-        score_request = ScoreRequest(
-            input_rows=score_input_rows,
-            scoring_functions=scoring_functions_dict,
-        )
-        score_response = await self.scoring_api.score(score_request)
-
-        return EvaluateResponse(generations=generations, scores=score_response.results)
-
-    async def job_status(self, request: JobStatusRequest) -> Job:
-        if request.job_id in self.jobs:
-            return Job(job_id=request.job_id, status=JobStatus.completed)
-
-        raise ValueError(f"Job {request.job_id} not found")
-
-    async def job_cancel(self, request: JobCancelRequest) -> None:
-        raise NotImplementedError("Job cancel is not implemented yet")
-
-    async def job_result(self, request: JobResultRequest) -> EvaluateResponse:
-        job_status_request = JobStatusRequest(benchmark_id=request.benchmark_id, job_id=request.job_id)
-        job = await self.job_status(job_status_request)
-        status = job.status
-        if not status or status != JobStatus.completed:
-            raise ValueError(f"Job is not completed, Status: {status.value}")
-
-        return self.jobs[request.job_id]
diff --git a/src/llama_stack/providers/inline/scoring/__init__.py b/src/llama_stack/providers/inline/scoring/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/__init__.py b/src/llama_stack/providers/inline/scoring/basic/__init__.py
deleted file mode 100644
index c996b9c2db..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import BasicScoringConfig
-
-
-async def get_provider_impl(
-    config: BasicScoringConfig,
-    deps: dict[Api, Any],
-):
-    from .scoring import BasicScoringImpl
-
-    impl = BasicScoringImpl(
-        config,
-        deps[Api.datasetio],
-        deps[Api.datasets],
-    )
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/scoring/basic/config.py b/src/llama_stack/providers/inline/scoring/basic/config.py
deleted file mode 100644
index e9c7fb4516..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/config.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-
-class BasicScoringConfig(BaseModel):
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {}
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring.py b/src/llama_stack/providers/inline/scoring/basic/scoring.py
deleted file mode 100644
index d4f57aed8d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    DatasetIO,
-    Datasets,
-    IterRowsRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFn,
-    ScoringFunctionsProtocolPrivate,
-    ScoringResult,
-)
-
-from .config import BasicScoringConfig
-from .scoring_fn.docvqa_scoring_fn import DocVQAScoringFn
-from .scoring_fn.equality_scoring_fn import EqualityScoringFn
-from .scoring_fn.ifeval_scoring_fn import IfEvalScoringFn
-from .scoring_fn.regex_parser_math_response_scoring_fn import (
-    RegexParserMathResponseScoringFn,
-)
-from .scoring_fn.regex_parser_scoring_fn import RegexParserScoringFn
-from .scoring_fn.subset_of_scoring_fn import SubsetOfScoringFn
-
-FIXED_FNS = [
-    EqualityScoringFn,
-    SubsetOfScoringFn,
-    RegexParserScoringFn,
-    RegexParserMathResponseScoringFn,
-    IfEvalScoringFn,
-    DocVQAScoringFn,
-]
-
-
-class BasicScoringImpl(
-    Scoring,
-    ScoringFunctionsProtocolPrivate,
-):
-    def __init__(
-        self,
-        config: BasicScoringConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.scoring_fn_id_impls = {}
-
-    async def initialize(self) -> None:
-        for fn in FIXED_FNS:
-            impl = fn()
-            for fn_defs in impl.get_supported_scoring_fn_defs():
-                self.scoring_fn_id_impls[fn_defs.identifier] = impl
-
-    async def shutdown(self) -> None: ...
-
-    async def list_scoring_functions(self) -> list[ScoringFn]:
-        scoring_fn_defs_list = [
-            fn_def for impl in self.scoring_fn_id_impls.values() for fn_def in impl.get_supported_scoring_fn_defs()
-        ]
-
-        for f in scoring_fn_defs_list:
-            assert f.identifier.startswith("basic"), "All basic scoring fn must have identifier prefixed with 'basic'! "
-
-        return scoring_fn_defs_list
-
-    async def register_scoring_function(self, function_def: ScoringFn) -> None:
-        raise NotImplementedError("Register scoring function not implemented yet")
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        all_rows = await self.datasetio_api.iterrows(IterRowsRequest(dataset_id=request.dataset_id, limit=-1))
-        score_request = ScoreRequest(
-            input_rows=all_rows.data,
-            scoring_functions=request.scoring_functions,
-        )
-        res = await self.score(score_request)
-        if request.save_results_dataset:
-            # TODO: persist and register dataset on to server for reading
-            # self.datasets_api.register_dataset()
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res.results,
-        )
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        res = {}
-        for scoring_fn_id in request.scoring_functions.keys():
-            if scoring_fn_id not in self.scoring_fn_id_impls:
-                raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
-            scoring_fn = self.scoring_fn_id_impls[scoring_fn_id]
-            scoring_fn_params = request.scoring_functions.get(scoring_fn_id, None)
-            score_results = await scoring_fn.score(request.input_rows, scoring_fn_id, scoring_fn_params)
-            agg_results = await scoring_fn.aggregate(score_results, scoring_fn_id, scoring_fn_params)
-            res[scoring_fn_id] = ScoringResult(
-                score_rows=score_results,
-                aggregated_results=agg_results,
-            )
-
-        return ScoreResponse(
-            results=res,
-        )
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
deleted file mode 100644
index e48bab8fa0..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/docvqa_scoring_fn.py
+++ /dev/null
@@ -1,239 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import json
-import re
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.docvqa import docvqa
-
-CONTRACTIONS = {
-    "aint": "ain't",
-    "arent": "aren't",
-    "cant": "can't",
-    "couldve": "could've",
-    "couldnt": "couldn't",
-    "couldn'tve": "couldn't've",
-    "couldnt've": "couldn't've",
-    "didnt": "didn't",
-    "doesnt": "doesn't",
-    "dont": "don't",
-    "hadnt": "hadn't",
-    "hadnt've": "hadn't've",
-    "hadn'tve": "hadn't've",
-    "hasnt": "hasn't",
-    "havent": "haven't",
-    "hed": "he'd",
-    "hed've": "he'd've",
-    "he'dve": "he'd've",
-    "hes": "he's",
-    "howd": "how'd",
-    "howll": "how'll",
-    "hows": "how's",
-    "Id've": "I'd've",
-    "I'dve": "I'd've",
-    "Im": "I'm",
-    "Ive": "I've",
-    "isnt": "isn't",
-    "itd": "it'd",
-    "itd've": "it'd've",
-    "it'dve": "it'd've",
-    "itll": "it'll",
-    "let's": "let's",
-    "maam": "ma'am",
-    "mightnt": "mightn't",
-    "mightnt've": "mightn't've",
-    "mightn'tve": "mightn't've",
-    "mightve": "might've",
-    "mustnt": "mustn't",
-    "mustve": "must've",
-    "neednt": "needn't",
-    "notve": "not've",
-    "oclock": "o'clock",
-    "oughtnt": "oughtn't",
-    "ow's'at": "'ow's'at",
-    "'ows'at": "'ow's'at",
-    "'ow'sat": "'ow's'at",
-    "shant": "shan't",
-    "shed've": "she'd've",
-    "she'dve": "she'd've",
-    "she's": "she's",
-    "shouldve": "should've",
-    "shouldnt": "shouldn't",
-    "shouldnt've": "shouldn't've",
-    "shouldn'tve": "shouldn't've",
-    "somebody'd": "somebodyd",
-    "somebodyd've": "somebody'd've",
-    "somebody'dve": "somebody'd've",
-    "somebodyll": "somebody'll",
-    "somebodys": "somebody's",
-    "someoned": "someone'd",
-    "someoned've": "someone'd've",
-    "someone'dve": "someone'd've",
-    "someonell": "someone'll",
-    "someones": "someone's",
-    "somethingd": "something'd",
-    "somethingd've": "something'd've",
-    "something'dve": "something'd've",
-    "somethingll": "something'll",
-    "thats": "that's",
-    "thered": "there'd",
-    "thered've": "there'd've",
-    "there'dve": "there'd've",
-    "therere": "there're",
-    "theres": "there's",
-    "theyd": "they'd",
-    "theyd've": "they'd've",
-    "they'dve": "they'd've",
-    "theyll": "they'll",
-    "theyre": "they're",
-    "theyve": "they've",
-    "twas": "'twas",
-    "wasnt": "wasn't",
-    "wed've": "we'd've",
-    "we'dve": "we'd've",
-    "weve": "we've",
-    "werent": "weren't",
-    "whatll": "what'll",
-    "whatre": "what're",
-    "whats": "what's",
-    "whatve": "what've",
-    "whens": "when's",
-    "whered": "where'd",
-    "wheres": "where's",
-    "whereve": "where've",
-    "whod": "who'd",
-    "whod've": "who'd've",
-    "who'dve": "who'd've",
-    "wholl": "who'll",
-    "whos": "who's",
-    "whove": "who've",
-    "whyll": "why'll",
-    "whyre": "why're",
-    "whys": "why's",
-    "wont": "won't",
-    "wouldve": "would've",
-    "wouldnt": "wouldn't",
-    "wouldnt've": "wouldn't've",
-    "wouldn'tve": "wouldn't've",
-    "yall": "y'all",
-    "yall'll": "y'all'll",
-    "y'allll": "y'all'll",
-    "yall'd've": "y'all'd've",
-    "y'alld've": "y'all'd've",
-    "y'all'dve": "y'all'd've",
-    "youd": "you'd",
-    "youd've": "you'd've",
-    "you'dve": "you'd've",
-    "youll": "you'll",
-    "youre": "you're",
-    "youve": "you've",
-    "1st": "first",
-    "2nd": "second",
-    "3rd": "third",
-}
-NUMBERS = {
-    "none": "0",
-    "zero": "0",
-    "one": "1",
-    "two": "2",
-    "three": "3",
-    "four": "4",
-    "five": "5",
-    "six": "6",
-    "seven": "7",
-    "eight": "8",
-    "nine": "9",
-    "ten": "10",
-}
-ARTICLES = [
-    "a",
-    "an",
-    "the",
-    "to",
-    "in",
-    "from",
-    "by",
-]  # Contains a bit more than just articles, but we want to get rid of these elements influencing the accuracy
-PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
-COMMA_STRIP = re.compile(r"(\d)(\,)(\d)")
-PUNCTUATION = [
-    ";",
-    r"/",
-    "[",
-    "]",
-    '"',
-    "{",
-    "}",
-    "(",
-    ")",
-    "=",
-    "+",
-    "\\",
-    "_",
-    "-",
-    ">",
-    "<",
-    "@",
-    "`",
-    ",",
-    "?",
-    "!",
-]
-
-
-def normalize_answer(s: str) -> str:
-    # process punctuation
-    for p in PUNCTUATION:
-        if (p + " " in s or " " + p in s) or (re.search(COMMA_STRIP, s) is not None):
-            s = s.replace(p, "")
-        else:
-            s = s.replace(p, " ")
-        s = PERIOD_STRIP.sub("", s, re.UNICODE)
-
-    # process digits and articles
-    temp_text = s.lower().split()
-    out_text = []
-    for word in temp_text:
-        word = NUMBERS.setdefault(word, word)
-        if word not in ARTICLES:
-            out_text.append(word)
-
-    # standardize contractions
-    for word_id, word in enumerate(out_text):
-        if word in CONTRACTIONS:
-            out_text[word_id] = CONTRACTIONS[word]
-    return " ".join(out_text)
-
-
-class DocVQAScoringFn(RegisteredBaseScoringFn):
-    """
-    docvqa basically matches the generated answer against several allowed
-    choices, but we need to normalize the answer to avoid penalizing
-    trivial differences
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            docvqa.identifier: docvqa,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "docvqa",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        expected_answers = json.loads(input_row["expected_answer"])
-        generated_answer = input_row["generated_answer"]
-        score = 1.0 if normalize_answer(generated_answer) in [normalize_answer(s) for s in expected_answers] else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
deleted file mode 100644
index 2e79240bec..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/equality_scoring_fn.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.equality import equality
-
-
-class EqualityScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that assigns a score of 1.0 if the input string matches the target string, and 0.0 otherwise.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            equality.identifier: equality,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "equality",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert "expected_answer" in input_row, "Expected answer not found in input row."
-        assert "generated_answer" in input_row, "Generated answer not found in input row."
-
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-        score = 1.0 if expected_answer == generated_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py
deleted file mode 100644
index a7305d13aa..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/docvqa.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-docvqa = ScoringFn(
-    identifier="basic::docvqa",
-    description="DocVQA Visual Question & Answer scoring function",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="docvqa",
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py
deleted file mode 100644
index f7d2f32ae3..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/equality.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-equality = ScoringFn(
-    identifier="basic::equality",
-    description="Returns 1.0 if the input is equal to the target, 0.0 otherwise.",
-    provider_id="basic",
-    provider_resource_id="equality",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py
deleted file mode 100644
index a2ed1d695d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/ifeval.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-ifeval = ScoringFn(
-    identifier="basic::ifeval",
-    description="Eval intruction follow capacity by checkping how many instructions can be followed in each example",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="ifeval",
-    params=BasicScoringFnParams(
-        aggregation_functions=[AggregationFunctionType.weighted_average],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
deleted file mode 100644
index 4e2b49a1fd..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_math_response.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    NumberType,
-    RegexParserScoringFnParams,
-    ScoringFn,
-)
-
-MATH_ANSWER_REGEXES = [r".*final answer is:?\s*\$\\boxed{(?P<X>.*)}\$"]
-
-
-regex_parser_math_response = ScoringFn(
-    identifier="basic::regex_parser_math_response",
-    description="For math related benchmarks, extract answer from the generated response and expected_answer and see if they match",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="regex-parser-math-response",
-    params=RegexParserScoringFnParams(
-        parsing_regexes=MATH_ANSWER_REGEXES,
-        aggregation_functions=[AggregationFunctionType.accuracy],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
deleted file mode 100644
index df0cf52d9d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/regex_parser_multiple_choice_answer.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    NumberType,
-    RegexParserScoringFnParams,
-    ScoringFn,
-)
-
-MULTILINGUAL_ANSWER_REGEXES = [
-    r"The best answer is ",
-    r"Answer\s*:",
-    r"Answer\s*:​​​​​​",  # Korean invisible character
-    r"উত্তর\s*:",
-    r"उत्तर\s*:",
-    r"উত্তরঃ",
-    r"উত্তর\s*:",
-    r"Antwort\s*:",
-    r"답변\s*:",
-    r"정답\s*:",
-    r"답\s*:",
-    r"答案\s*：",
-    r"答案\s*:",
-    r"答\s*：",
-    r"答\s*:",
-    r"答复\s*：",
-    r"答曰\s*：",
-    r"الإجابة:",
-    r"الجواب:",
-    r"إجابة:",
-    r"الإجابة النهائية:",
-    r"الإجابة الصحيحة:",
-    r"الإجابة الصحيحة هي:",
-    r"الإجابة هي:",
-    r"Respuesta\s*:",
-    r"Risposta\s*:",
-    r"答え\s*:",
-    r"答え\s*：",
-    r"回答\s*:",
-    r"回答\s*：",
-    r"解答\s*:",
-    r"Jawaban\s*:",
-    r"Réponse\s*:",
-    r"Resposta\s*:",
-    r"Jibu\s*:",
-    r"Idahun\s*:",
-    r"Ìdáhùn\s*:",
-    r"Idáhùn\s*:",
-    r"Àmọ̀nà\s*:",
-    r"Àdáhùn\s*:",
-    r"Ànúgọ\s*:",
-    r"Àṣàyàn\s*:",
-]
-
-MULTILINGUAL_ANSWER_PATTERN_TEMPLATE = r"(?i){}\s*([A-D]|[أ-د]|[অ]|[ব]|[ড]|[ঢ]|[Ａ]|[Ｂ]|[Ｃ]|[Ｄ])"
-
-regex_parser_multiple_choice_answer = ScoringFn(
-    identifier="basic::regex_parser_multiple_choice_answer",
-    description="Extract answer from response matching Answer: [the_answer_letter], and compare with expected result",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="regex-parser-multiple-choice-answer",
-    params=RegexParserScoringFnParams(
-        parsing_regexes=[MULTILINGUAL_ANSWER_PATTERN_TEMPLATE.format(x) for x in MULTILINGUAL_ANSWER_REGEXES],
-        aggregation_functions=[AggregationFunctionType.accuracy],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py
deleted file mode 100644
index 1f143c4a62..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/fn_defs/subset_of.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-subset_of = ScoringFn(
-    identifier="basic::subset_of",
-    description="Returns 1.0 if the expected is included in generated, 0.0 otherwise.",
-    return_type=NumberType(),
-    provider_id="basic",
-    provider_resource_id="subset-of",
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.accuracy]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
deleted file mode 100644
index 33b1c5a312..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/ifeval_scoring_fn.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.ifeval import (
-    ifeval,
-)
-
-
-class IfEvalScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn Instruction-Following Eval (IFEval) benchmark
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            ifeval.identifier: ifeval,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        from ..utils.ifeval_utils import INSTRUCTION_DICT, INSTRUCTION_LIST
-
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        instruction_list = input_row["instruction_id_list"]
-        generated_answer = input_row["generated_answer"].strip()
-
-        is_following_list = []
-        results = dict(
-            {k + "_correct": 0.0 for k in INSTRUCTION_LIST},
-            **{k + "_total": 0.0 for k in INSTRUCTION_LIST},
-        )
-
-        for index, instruction_id in enumerate(instruction_list):
-            instruction_cls = INSTRUCTION_DICT[instruction_id]
-            instruction = instruction_cls(instruction_id)
-            results[instruction_id + "_total"] += 1.0
-            results[instruction_id.split(":")[0] + "_total"] += 1.0
-
-            clean_input_row = {k: v for k, v in input_row["kwargs"][index].items() if v is not None}
-            print(clean_input_row)
-            instruction.build_description(**clean_input_row)
-            args = instruction.get_instruction_args()
-            if args and "prompt" in args:
-                instruction.build_description(prompt=input_row["prompt"])
-
-            if generated_answer and instruction.check_following(generated_answer):
-                is_following_list.append(True)
-                results[instruction_id + "_correct"] += 1.0
-                results[instruction_id.split(":")[0] + "_correct"] += 1.0
-            else:
-                is_following_list.append(False)
-
-        if len(is_following_list) == 0:
-            return {
-                "score": 0.0,
-                "weight": 0.0,
-            }
-
-        return {
-            "score": float(sum(is_following_list)) / float(len(is_following_list)),
-            "weight": float(len(is_following_list)),
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
deleted file mode 100644
index 1f4f2f9794..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_math_response_scoring_fn.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringFnParamsType, ScoringResultRow
-
-from ..utils.math_utils import first_answer, normalize_final_answer, try_evaluate_frac, try_evaluate_latex
-from .fn_defs.regex_parser_math_response import (
-    regex_parser_math_response,
-)
-
-
-class RegexParserMathResponseScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn for math benchamrks that parses answer from generated response according to context and check match with expected_answer.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            regex_parser_math_response.identifier: regex_parser_math_response,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.regex_parser.value, (
-            f"RegexParserScoringFnParams not found for {fn_def}."
-        )
-
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-
-        parsing_regexes = fn_def.params.parsing_regexes
-        assert len(parsing_regexes) == 1, (
-            "Only one parsing regex is supported for regex_parser_math_response scoring function."
-        )
-        parsing_regexes = fn_def.params.parsing_regexes[0]
-
-        normalized_generated_answer = normalize_final_answer(
-            first_answer(generated_answer),
-            parsing_regexes,
-            match_first=True,
-        )
-        normalized_generated_answer = try_evaluate_frac(try_evaluate_latex(normalized_generated_answer))
-
-        normalized_expected_answer = normalize_final_answer(expected_answer, r".*")
-        normalized_expected_answer = try_evaluate_frac(try_evaluate_latex(normalized_expected_answer))
-
-        score = 1.0 if normalized_generated_answer == normalized_expected_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
deleted file mode 100644
index 1cc74f8746..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/regex_parser_scoring_fn.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import re
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringFnParamsType, ScoringResultRow
-
-from .fn_defs.regex_parser_multiple_choice_answer import (
-    regex_parser_multiple_choice_answer,
-)
-
-
-class RegexParserScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that parses answer from generated response according to context and check match with expected_answer.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            regex_parser_multiple_choice_answer.identifier: regex_parser_multiple_choice_answer,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        assert fn_def.params is not None and fn_def.params.type == ScoringFnParamsType.regex_parser.value, (
-            f"RegexParserScoringFnParams not found for {fn_def}."
-        )
-
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-
-        # parse answer according to regex
-        parsed_answer = None
-        for regex in fn_def.params.parsing_regexes:
-            match = re.search(regex, generated_answer)
-            if match:
-                parsed_answer = match.group(1)
-                break
-
-        score = 1.0 if parsed_answer and parsed_answer == expected_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py b/src/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
deleted file mode 100644
index fe15a4972d..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/scoring_fn/subset_of_scoring_fn.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import ScoringFnParams, ScoringResultRow
-
-from .fn_defs.subset_of import subset_of
-
-
-class SubsetOfScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that assigns a score of 1.0 if the expected string is included in the generated string, and 0.0 otherwise.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {
-            subset_of.identifier: subset_of,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = "subset_of",
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-        score = 1.0 if expected_answer in generated_answer else 0.0
-        return {
-            "score": score,
-        }
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/__init__.py b/src/llama_stack/providers/inline/scoring/basic/utils/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py b/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
deleted file mode 100644
index c9358101d0..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/ifeval_utils.py
+++ /dev/null
@@ -1,3319 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import collections
-import functools
-import json
-import random
-import re
-import string
-from collections.abc import Iterable, Sequence
-from types import MappingProxyType
-
-import emoji
-import langdetect
-import nltk
-from pythainlp.tokenize import sent_tokenize as sent_tokenize_thai
-from pythainlp.tokenize import word_tokenize as word_tokenize_thai
-
-from llama_stack.log import get_logger
-
-logger = get_logger(name=__name__, category="scoring")
-
-WORD_LIST = [
-    "western",
-    "sentence",
-    "signal",
-    "dump",
-    "spot",
-    "opposite",
-    "bottom",
-    "potato",
-    "administration",
-    "working",
-    "welcome",
-    "morning",
-    "good",
-    "agency",
-    "primary",
-    "wish",
-    "responsibility",
-    "press",
-    "problem",
-    "president",
-    "steal",
-    "brush",
-    "read",
-    "type",
-    "beat",
-    "trainer",
-    "growth",
-    "lock",
-    "bone",
-    "case",
-    "equal",
-    "comfortable",
-    "region",
-    "replacement",
-    "performance",
-    "mate",
-    "walk",
-    "medicine",
-    "film",
-    "thing",
-    "rock",
-    "tap",
-    "total",
-    "competition",
-    "ease",
-    "south",
-    "establishment",
-    "gather",
-    "parking",
-    "world",
-    "plenty",
-    "breath",
-    "claim",
-    "alcohol",
-    "trade",
-    "dear",
-    "highlight",
-    "street",
-    "matter",
-    "decision",
-    "mess",
-    "agreement",
-    "studio",
-    "coach",
-    "assist",
-    "brain",
-    "wing",
-    "style",
-    "private",
-    "top",
-    "brown",
-    "leg",
-    "buy",
-    "procedure",
-    "method",
-    "speed",
-    "high",
-    "company",
-    "valuable",
-    "pie",
-    "analyst",
-    "session",
-    "pattern",
-    "district",
-    "pleasure",
-    "dinner",
-    "swimming",
-    "joke",
-    "order",
-    "plate",
-    "department",
-    "motor",
-    "cell",
-    "spend",
-    "cabinet",
-    "difference",
-    "power",
-    "examination",
-    "engine",
-    "horse",
-    "dimension",
-    "pay",
-    "toe",
-    "curve",
-    "literature",
-    "bother",
-    "fire",
-    "possibility",
-    "debate",
-    "activity",
-    "passage",
-    "hello",
-    "cycle",
-    "background",
-    "quiet",
-    "author",
-    "effect",
-    "actor",
-    "page",
-    "bicycle",
-    "error",
-    "throat",
-    "attack",
-    "character",
-    "phone",
-    "tea",
-    "increase",
-    "outcome",
-    "file",
-    "specific",
-    "inspector",
-    "internal",
-    "potential",
-    "staff",
-    "building",
-    "employer",
-    "shoe",
-    "hand",
-    "direction",
-    "garden",
-    "purchase",
-    "interview",
-    "study",
-    "recognition",
-    "member",
-    "spiritual",
-    "oven",
-    "sandwich",
-    "weird",
-    "passenger",
-    "particular",
-    "response",
-    "reaction",
-    "size",
-    "variation",
-    "a",
-    "cancel",
-    "candy",
-    "exit",
-    "guest",
-    "condition",
-    "fly",
-    "price",
-    "weakness",
-    "convert",
-    "hotel",
-    "great",
-    "mouth",
-    "mind",
-    "song",
-    "sugar",
-    "suspect",
-    "telephone",
-    "ear",
-    "roof",
-    "paint",
-    "refrigerator",
-    "organization",
-    "jury",
-    "reward",
-    "engineering",
-    "day",
-    "possession",
-    "crew",
-    "bar",
-    "road",
-    "description",
-    "celebration",
-    "score",
-    "mark",
-    "letter",
-    "shower",
-    "suggestion",
-    "sir",
-    "luck",
-    "national",
-    "progress",
-    "hall",
-    "stroke",
-    "theory",
-    "offer",
-    "story",
-    "tax",
-    "definition",
-    "history",
-    "ride",
-    "medium",
-    "opening",
-    "glass",
-    "elevator",
-    "stomach",
-    "question",
-    "ability",
-    "leading",
-    "village",
-    "computer",
-    "city",
-    "grand",
-    "confidence",
-    "candle",
-    "priest",
-    "recommendation",
-    "point",
-    "necessary",
-    "body",
-    "desk",
-    "secret",
-    "horror",
-    "noise",
-    "culture",
-    "warning",
-    "water",
-    "round",
-    "diet",
-    "flower",
-    "bus",
-    "tough",
-    "permission",
-    "week",
-    "prompt",
-    "connection",
-    "abuse",
-    "height",
-    "save",
-    "corner",
-    "border",
-    "stress",
-    "drive",
-    "stop",
-    "rip",
-    "meal",
-    "listen",
-    "confusion",
-    "girlfriend",
-    "living",
-    "relation",
-    "significance",
-    "plan",
-    "creative",
-    "atmosphere",
-    "blame",
-    "invite",
-    "housing",
-    "paper",
-    "drink",
-    "roll",
-    "silver",
-    "drunk",
-    "age",
-    "damage",
-    "smoke",
-    "environment",
-    "pack",
-    "savings",
-    "influence",
-    "tourist",
-    "rain",
-    "post",
-    "sign",
-    "grandmother",
-    "run",
-    "profit",
-    "push",
-    "clerk",
-    "final",
-    "wine",
-    "swim",
-    "pause",
-    "stuff",
-    "singer",
-    "funeral",
-    "average",
-    "source",
-    "scene",
-    "tradition",
-    "personal",
-    "snow",
-    "nobody",
-    "distance",
-    "sort",
-    "sensitive",
-    "animal",
-    "major",
-    "negotiation",
-    "click",
-    "mood",
-    "period",
-    "arrival",
-    "expression",
-    "holiday",
-    "repeat",
-    "dust",
-    "closet",
-    "gold",
-    "bad",
-    "sail",
-    "combination",
-    "clothes",
-    "emphasis",
-    "duty",
-    "black",
-    "step",
-    "school",
-    "jump",
-    "document",
-    "professional",
-    "lip",
-    "chemical",
-    "front",
-    "wake",
-    "while",
-    "inside",
-    "watch",
-    "row",
-    "subject",
-    "penalty",
-    "balance",
-    "possible",
-    "adult",
-    "aside",
-    "sample",
-    "appeal",
-    "wedding",
-    "depth",
-    "king",
-    "award",
-    "wife",
-    "blow",
-    "site",
-    "camp",
-    "music",
-    "safe",
-    "gift",
-    "fault",
-    "guess",
-    "act",
-    "shame",
-    "drama",
-    "capital",
-    "exam",
-    "stupid",
-    "record",
-    "sound",
-    "swing",
-    "novel",
-    "minimum",
-    "ratio",
-    "machine",
-    "shape",
-    "lead",
-    "operation",
-    "salary",
-    "cloud",
-    "affair",
-    "hit",
-    "chapter",
-    "stage",
-    "quantity",
-    "access",
-    "army",
-    "chain",
-    "traffic",
-    "kick",
-    "analysis",
-    "airport",
-    "time",
-    "vacation",
-    "philosophy",
-    "ball",
-    "chest",
-    "thanks",
-    "place",
-    "mountain",
-    "advertising",
-    "red",
-    "past",
-    "rent",
-    "return",
-    "tour",
-    "house",
-    "construction",
-    "net",
-    "native",
-    "war",
-    "figure",
-    "fee",
-    "spray",
-    "user",
-    "dirt",
-    "shot",
-    "task",
-    "stick",
-    "friend",
-    "software",
-    "promotion",
-    "interaction",
-    "surround",
-    "block",
-    "purpose",
-    "practice",
-    "conflict",
-    "routine",
-    "requirement",
-    "bonus",
-    "hole",
-    "state",
-    "junior",
-    "sweet",
-    "catch",
-    "tear",
-    "fold",
-    "wall",
-    "editor",
-    "life",
-    "position",
-    "pound",
-    "respect",
-    "bathroom",
-    "coat",
-    "script",
-    "job",
-    "teach",
-    "birth",
-    "view",
-    "resolve",
-    "theme",
-    "employee",
-    "doubt",
-    "market",
-    "education",
-    "serve",
-    "recover",
-    "tone",
-    "harm",
-    "miss",
-    "union",
-    "understanding",
-    "cow",
-    "river",
-    "association",
-    "concept",
-    "training",
-    "recipe",
-    "relationship",
-    "reserve",
-    "depression",
-    "proof",
-    "hair",
-    "revenue",
-    "independent",
-    "lift",
-    "assignment",
-    "temporary",
-    "amount",
-    "loss",
-    "edge",
-    "track",
-    "check",
-    "rope",
-    "estimate",
-    "pollution",
-    "stable",
-    "message",
-    "delivery",
-    "perspective",
-    "mirror",
-    "assistant",
-    "representative",
-    "witness",
-    "nature",
-    "judge",
-    "fruit",
-    "tip",
-    "devil",
-    "town",
-    "emergency",
-    "upper",
-    "drop",
-    "stay",
-    "human",
-    "neck",
-    "speaker",
-    "network",
-    "sing",
-    "resist",
-    "league",
-    "trip",
-    "signature",
-    "lawyer",
-    "importance",
-    "gas",
-    "choice",
-    "engineer",
-    "success",
-    "part",
-    "external",
-    "worker",
-    "simple",
-    "quarter",
-    "student",
-    "heart",
-    "pass",
-    "spite",
-    "shift",
-    "rough",
-    "lady",
-    "grass",
-    "community",
-    "garage",
-    "youth",
-    "standard",
-    "skirt",
-    "promise",
-    "blind",
-    "television",
-    "disease",
-    "commission",
-    "positive",
-    "energy",
-    "calm",
-    "presence",
-    "tune",
-    "basis",
-    "preference",
-    "head",
-    "common",
-    "cut",
-    "somewhere",
-    "presentation",
-    "current",
-    "thought",
-    "revolution",
-    "effort",
-    "master",
-    "implement",
-    "republic",
-    "floor",
-    "principle",
-    "stranger",
-    "shoulder",
-    "grade",
-    "button",
-    "tennis",
-    "police",
-    "collection",
-    "account",
-    "register",
-    "glove",
-    "divide",
-    "professor",
-    "chair",
-    "priority",
-    "combine",
-    "peace",
-    "extension",
-    "maybe",
-    "evening",
-    "frame",
-    "sister",
-    "wave",
-    "code",
-    "application",
-    "mouse",
-    "match",
-    "counter",
-    "bottle",
-    "half",
-    "cheek",
-    "resolution",
-    "back",
-    "knowledge",
-    "make",
-    "discussion",
-    "screw",
-    "length",
-    "accident",
-    "battle",
-    "dress",
-    "knee",
-    "log",
-    "package",
-    "it",
-    "turn",
-    "hearing",
-    "newspaper",
-    "layer",
-    "wealth",
-    "profile",
-    "imagination",
-    "answer",
-    "weekend",
-    "teacher",
-    "appearance",
-    "meet",
-    "bike",
-    "rise",
-    "belt",
-    "crash",
-    "bowl",
-    "equivalent",
-    "support",
-    "image",
-    "poem",
-    "risk",
-    "excitement",
-    "remote",
-    "secretary",
-    "public",
-    "produce",
-    "plane",
-    "display",
-    "money",
-    "sand",
-    "situation",
-    "punch",
-    "customer",
-    "title",
-    "shake",
-    "mortgage",
-    "option",
-    "number",
-    "pop",
-    "window",
-    "extent",
-    "nothing",
-    "experience",
-    "opinion",
-    "departure",
-    "dance",
-    "indication",
-    "boy",
-    "material",
-    "band",
-    "leader",
-    "sun",
-    "beautiful",
-    "muscle",
-    "farmer",
-    "variety",
-    "fat",
-    "handle",
-    "director",
-    "opportunity",
-    "calendar",
-    "outside",
-    "pace",
-    "bath",
-    "fish",
-    "consequence",
-    "put",
-    "owner",
-    "go",
-    "doctor",
-    "information",
-    "share",
-    "hurt",
-    "protection",
-    "career",
-    "finance",
-    "force",
-    "golf",
-    "garbage",
-    "aspect",
-    "kid",
-    "food",
-    "boot",
-    "milk",
-    "respond",
-    "objective",
-    "reality",
-    "raw",
-    "ring",
-    "mall",
-    "one",
-    "impact",
-    "area",
-    "news",
-    "international",
-    "series",
-    "impress",
-    "mother",
-    "shelter",
-    "strike",
-    "loan",
-    "month",
-    "seat",
-    "anything",
-    "entertainment",
-    "familiar",
-    "clue",
-    "year",
-    "glad",
-    "supermarket",
-    "natural",
-    "god",
-    "cost",
-    "conversation",
-    "tie",
-    "ruin",
-    "comfort",
-    "earth",
-    "storm",
-    "percentage",
-    "assistance",
-    "budget",
-    "strength",
-    "beginning",
-    "sleep",
-    "other",
-    "young",
-    "unit",
-    "fill",
-    "store",
-    "desire",
-    "hide",
-    "value",
-    "cup",
-    "maintenance",
-    "nurse",
-    "function",
-    "tower",
-    "role",
-    "class",
-    "camera",
-    "database",
-    "panic",
-    "nation",
-    "basket",
-    "ice",
-    "art",
-    "spirit",
-    "chart",
-    "exchange",
-    "feedback",
-    "statement",
-    "reputation",
-    "search",
-    "hunt",
-    "exercise",
-    "nasty",
-    "notice",
-    "male",
-    "yard",
-    "annual",
-    "collar",
-    "date",
-    "platform",
-    "plant",
-    "fortune",
-    "passion",
-    "friendship",
-    "spread",
-    "cancer",
-    "ticket",
-    "attitude",
-    "island",
-    "active",
-    "object",
-    "service",
-    "buyer",
-    "bite",
-    "card",
-    "face",
-    "steak",
-    "proposal",
-    "patient",
-    "heat",
-    "rule",
-    "resident",
-    "broad",
-    "politics",
-    "west",
-    "knife",
-    "expert",
-    "girl",
-    "design",
-    "salt",
-    "baseball",
-    "grab",
-    "inspection",
-    "cousin",
-    "couple",
-    "magazine",
-    "cook",
-    "dependent",
-    "security",
-    "chicken",
-    "version",
-    "currency",
-    "ladder",
-    "scheme",
-    "kitchen",
-    "employment",
-    "local",
-    "attention",
-    "manager",
-    "fact",
-    "cover",
-    "sad",
-    "guard",
-    "relative",
-    "county",
-    "rate",
-    "lunch",
-    "program",
-    "initiative",
-    "gear",
-    "bridge",
-    "breast",
-    "talk",
-    "dish",
-    "guarantee",
-    "beer",
-    "vehicle",
-    "reception",
-    "woman",
-    "substance",
-    "copy",
-    "lecture",
-    "advantage",
-    "park",
-    "cold",
-    "death",
-    "mix",
-    "hold",
-    "scale",
-    "tomorrow",
-    "blood",
-    "request",
-    "green",
-    "cookie",
-    "church",
-    "strip",
-    "forever",
-    "beyond",
-    "debt",
-    "tackle",
-    "wash",
-    "following",
-    "feel",
-    "maximum",
-    "sector",
-    "sea",
-    "property",
-    "economics",
-    "menu",
-    "bench",
-    "try",
-    "language",
-    "start",
-    "call",
-    "solid",
-    "address",
-    "income",
-    "foot",
-    "senior",
-    "honey",
-    "few",
-    "mixture",
-    "cash",
-    "grocery",
-    "link",
-    "map",
-    "form",
-    "factor",
-    "pot",
-    "model",
-    "writer",
-    "farm",
-    "winter",
-    "skill",
-    "anywhere",
-    "birthday",
-    "policy",
-    "release",
-    "husband",
-    "lab",
-    "hurry",
-    "mail",
-    "equipment",
-    "sink",
-    "pair",
-    "driver",
-    "consideration",
-    "leather",
-    "skin",
-    "blue",
-    "boat",
-    "sale",
-    "brick",
-    "two",
-    "feed",
-    "square",
-    "dot",
-    "rush",
-    "dream",
-    "location",
-    "afternoon",
-    "manufacturer",
-    "control",
-    "occasion",
-    "trouble",
-    "introduction",
-    "advice",
-    "bet",
-    "eat",
-    "kill",
-    "category",
-    "manner",
-    "office",
-    "estate",
-    "pride",
-    "awareness",
-    "slip",
-    "crack",
-    "client",
-    "nail",
-    "shoot",
-    "membership",
-    "soft",
-    "anybody",
-    "web",
-    "official",
-    "individual",
-    "pizza",
-    "interest",
-    "bag",
-    "spell",
-    "profession",
-    "queen",
-    "deal",
-    "resource",
-    "ship",
-    "guy",
-    "chocolate",
-    "joint",
-    "formal",
-    "upstairs",
-    "car",
-    "resort",
-    "abroad",
-    "dealer",
-    "associate",
-    "finger",
-    "surgery",
-    "comment",
-    "team",
-    "detail",
-    "crazy",
-    "path",
-    "tale",
-    "initial",
-    "arm",
-    "radio",
-    "demand",
-    "single",
-    "draw",
-    "yellow",
-    "contest",
-    "piece",
-    "quote",
-    "pull",
-    "commercial",
-    "shirt",
-    "contribution",
-    "cream",
-    "channel",
-    "suit",
-    "discipline",
-    "instruction",
-    "concert",
-    "speech",
-    "low",
-    "effective",
-    "hang",
-    "scratch",
-    "industry",
-    "breakfast",
-    "lay",
-    "join",
-    "metal",
-    "bedroom",
-    "minute",
-    "product",
-    "rest",
-    "temperature",
-    "many",
-    "give",
-    "argument",
-    "print",
-    "purple",
-    "laugh",
-    "health",
-    "credit",
-    "investment",
-    "sell",
-    "setting",
-    "lesson",
-    "egg",
-    "middle",
-    "marriage",
-    "level",
-    "evidence",
-    "phrase",
-    "love",
-    "self",
-    "benefit",
-    "guidance",
-    "affect",
-    "you",
-    "dad",
-    "anxiety",
-    "special",
-    "boyfriend",
-    "test",
-    "blank",
-    "payment",
-    "soup",
-    "obligation",
-    "reply",
-    "smile",
-    "deep",
-    "complaint",
-    "addition",
-    "review",
-    "box",
-    "towel",
-    "minor",
-    "fun",
-    "soil",
-    "issue",
-    "cigarette",
-    "internet",
-    "gain",
-    "tell",
-    "entry",
-    "spare",
-    "incident",
-    "family",
-    "refuse",
-    "branch",
-    "can",
-    "pen",
-    "grandfather",
-    "constant",
-    "tank",
-    "uncle",
-    "climate",
-    "ground",
-    "volume",
-    "communication",
-    "kind",
-    "poet",
-    "child",
-    "screen",
-    "mine",
-    "quit",
-    "gene",
-    "lack",
-    "charity",
-    "memory",
-    "tooth",
-    "fear",
-    "mention",
-    "marketing",
-    "reveal",
-    "reason",
-    "court",
-    "season",
-    "freedom",
-    "land",
-    "sport",
-    "audience",
-    "classroom",
-    "law",
-    "hook",
-    "win",
-    "carry",
-    "eye",
-    "smell",
-    "distribution",
-    "research",
-    "country",
-    "dare",
-    "hope",
-    "whereas",
-    "stretch",
-    "library",
-    "if",
-    "delay",
-    "college",
-    "plastic",
-    "book",
-    "present",
-    "use",
-    "worry",
-    "champion",
-    "goal",
-    "economy",
-    "march",
-    "election",
-    "reflection",
-    "midnight",
-    "slide",
-    "inflation",
-    "action",
-    "challenge",
-    "guitar",
-    "coast",
-    "apple",
-    "campaign",
-    "field",
-    "jacket",
-    "sense",
-    "way",
-    "visual",
-    "remove",
-    "weather",
-    "trash",
-    "cable",
-    "regret",
-    "buddy",
-    "beach",
-    "historian",
-    "courage",
-    "sympathy",
-    "truck",
-    "tension",
-    "permit",
-    "nose",
-    "bed",
-    "son",
-    "person",
-    "base",
-    "meat",
-    "usual",
-    "air",
-    "meeting",
-    "worth",
-    "game",
-    "independence",
-    "physical",
-    "brief",
-    "play",
-    "raise",
-    "board",
-    "she",
-    "key",
-    "writing",
-    "pick",
-    "command",
-    "party",
-    "yesterday",
-    "spring",
-    "candidate",
-    "physics",
-    "university",
-    "concern",
-    "development",
-    "change",
-    "string",
-    "target",
-    "instance",
-    "room",
-    "bitter",
-    "bird",
-    "football",
-    "normal",
-    "split",
-    "impression",
-    "wood",
-    "long",
-    "meaning",
-    "stock",
-    "cap",
-    "leadership",
-    "media",
-    "ambition",
-    "fishing",
-    "essay",
-    "salad",
-    "repair",
-    "today",
-    "designer",
-    "night",
-    "bank",
-    "drawing",
-    "inevitable",
-    "phase",
-    "vast",
-    "chip",
-    "anger",
-    "switch",
-    "cry",
-    "twist",
-    "personality",
-    "attempt",
-    "storage",
-    "being",
-    "preparation",
-    "bat",
-    "selection",
-    "white",
-    "technology",
-    "contract",
-    "side",
-    "section",
-    "station",
-    "till",
-    "structure",
-    "tongue",
-    "taste",
-    "truth",
-    "difficulty",
-    "group",
-    "limit",
-    "main",
-    "move",
-    "feeling",
-    "light",
-    "example",
-    "mission",
-    "might",
-    "wait",
-    "wheel",
-    "shop",
-    "host",
-    "classic",
-    "alternative",
-    "cause",
-    "agent",
-    "consist",
-    "table",
-    "airline",
-    "text",
-    "pool",
-    "craft",
-    "range",
-    "fuel",
-    "tool",
-    "partner",
-    "load",
-    "entrance",
-    "deposit",
-    "hate",
-    "article",
-    "video",
-    "summer",
-    "feature",
-    "extreme",
-    "mobile",
-    "hospital",
-    "flight",
-    "fall",
-    "pension",
-    "piano",
-    "fail",
-    "result",
-    "rub",
-    "gap",
-    "system",
-    "report",
-    "suck",
-    "ordinary",
-    "wind",
-    "nerve",
-    "ask",
-    "shine",
-    "note",
-    "line",
-    "mom",
-    "perception",
-    "brother",
-    "reference",
-    "bend",
-    "charge",
-    "treat",
-    "trick",
-    "term",
-    "homework",
-    "bake",
-    "bid",
-    "status",
-    "project",
-    "strategy",
-    "orange",
-    "let",
-    "enthusiasm",
-    "parent",
-    "concentrate",
-    "device",
-    "travel",
-    "poetry",
-    "business",
-    "society",
-    "kiss",
-    "end",
-    "vegetable",
-    "employ",
-    "schedule",
-    "hour",
-    "brave",
-    "focus",
-    "process",
-    "movie",
-    "illegal",
-    "general",
-    "coffee",
-    "ad",
-    "highway",
-    "chemistry",
-    "psychology",
-    "hire",
-    "bell",
-    "conference",
-    "relief",
-    "show",
-    "neat",
-    "funny",
-    "weight",
-    "quality",
-    "club",
-    "daughter",
-    "zone",
-    "touch",
-    "tonight",
-    "shock",
-    "burn",
-    "excuse",
-    "name",
-    "survey",
-    "landscape",
-    "advance",
-    "satisfaction",
-    "bread",
-    "disaster",
-    "item",
-    "hat",
-    "prior",
-    "shopping",
-    "visit",
-    "east",
-    "photo",
-    "home",
-    "idea",
-    "father",
-    "comparison",
-    "cat",
-    "pipe",
-    "winner",
-    "count",
-    "lake",
-    "fight",
-    "prize",
-    "foundation",
-    "dog",
-    "keep",
-    "ideal",
-    "fan",
-    "struggle",
-    "peak",
-    "safety",
-    "solution",
-    "hell",
-    "conclusion",
-    "population",
-    "strain",
-    "alarm",
-    "measurement",
-    "second",
-    "train",
-    "race",
-    "due",
-    "insurance",
-    "boss",
-    "tree",
-    "monitor",
-    "sick",
-    "course",
-    "drag",
-    "appointment",
-    "slice",
-    "still",
-    "care",
-    "patience",
-    "rich",
-    "escape",
-    "emotion",
-    "royal",
-    "female",
-    "childhood",
-    "government",
-    "picture",
-    "will",
-    "sock",
-    "big",
-    "gate",
-    "oil",
-    "cross",
-    "pin",
-    "improvement",
-    "championship",
-    "silly",
-    "help",
-    "sky",
-    "pitch",
-    "man",
-    "diamond",
-    "most",
-    "transition",
-    "work",
-    "science",
-    "committee",
-    "moment",
-    "fix",
-    "teaching",
-    "dig",
-    "specialist",
-    "complex",
-    "guide",
-    "people",
-    "dead",
-    "voice",
-    "original",
-    "break",
-    "topic",
-    "data",
-    "degree",
-    "reading",
-    "recording",
-    "bunch",
-    "reach",
-    "judgment",
-    "lie",
-    "regular",
-    "set",
-    "painting",
-    "mode",
-    "list",
-    "player",
-    "bear",
-    "north",
-    "wonder",
-    "carpet",
-    "heavy",
-    "officer",
-    "negative",
-    "clock",
-    "unique",
-    "baby",
-    "pain",
-    "assumption",
-    "disk",
-    "iron",
-    "bill",
-    "drawer",
-    "look",
-    "double",
-    "mistake",
-    "finish",
-    "future",
-    "brilliant",
-    "contact",
-    "math",
-    "rice",
-    "leave",
-    "restaurant",
-    "discount",
-    "sex",
-    "virus",
-    "bit",
-    "trust",
-    "event",
-    "wear",
-    "juice",
-    "failure",
-    "bug",
-    "context",
-    "mud",
-    "whole",
-    "wrap",
-    "intention",
-    "draft",
-    "pressure",
-    "cake",
-    "dark",
-    "explanation",
-    "space",
-    "angle",
-    "word",
-    "efficiency",
-    "management",
-    "habit",
-    "star",
-    "chance",
-    "finding",
-    "transportation",
-    "stand",
-    "criticism",
-    "flow",
-    "door",
-    "injury",
-    "insect",
-    "surprise",
-    "apartment",
-]  # pylint: disable=line-too-long
-
-# ISO 639-1 codes to language names.
-LANGUAGE_CODES = MappingProxyType(
-    {
-        "en": "English",
-        "es": "Spanish",
-        "pt": "Portuguese",
-        "ar": "Arabic",
-        "hi": "Hindi",
-        "fr": "French",
-        "ru": "Russian",
-        "de": "German",
-        "ja": "Japanese",
-        "it": "Italian",
-        "bn": "Bengali",
-        "uk": "Ukrainian",
-        "th": "Thai",
-        "ur": "Urdu",
-        "ta": "Tamil",
-        "te": "Telugu",
-        "bg": "Bulgarian",
-        "ko": "Korean",
-        "pl": "Polish",
-        "he": "Hebrew",
-        "fa": "Persian",
-        "vi": "Vietnamese",
-        "ne": "Nepali",
-        "sw": "Swahili",
-        "kn": "Kannada",
-        "mr": "Marathi",
-        "gu": "Gujarati",
-        "pa": "Punjabi",
-        "ml": "Malayalam",
-        "fi": "Finnish",
-    }
-)
-
-# Chinese characters
-_CHINESE_CHARS_PATTERN = r"[\u4E00-\u9FFF\u3400-\u4DBF]"
-# Japanese Hiragana & Katakana
-_JAPANESE_CHARS_PATTERN = r"[\u3040-\u309f\u30a0-\u30ff]"
-# Korean (Hangul Syllables)
-_KOREAN_CHARS_PATTERN = r"[\uAC00-\uD7AF]"
-_ALPHABETS = "([A-Za-z])"
-_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
-_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
-_STARTERS = (
-    r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
-)
-_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
-_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
-_DIGITS = "([0-9])"
-_MULTIPLE_DOTS = r"\.{2,}"
-
-
-# Util functions
-def split_into_sentences(text):
-    """Split the text into sentences.
-
-    Args:
-      text: A string that consists of more than or equal to one sentences.
-
-    Returns:
-      A list of strings where each string is a sentence.
-    """
-    text = " " + text + "  "
-    text = text.replace("\n", " ")
-    text = re.sub(_PREFIXES, "\\1<prd>", text)
-    text = re.sub(_WEBSITES, "<prd>\\1", text)
-    text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
-    text = re.sub(
-        _MULTIPLE_DOTS,
-        lambda match: "<prd>" * len(match.group(0)) + "<stop>",
-        text,
-    )
-    if "Ph.D" in text:
-        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
-    text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
-    text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
-    text = re.sub(
-        _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
-        "\\1<prd>\\2<prd>\\3<prd>",
-        text,
-    )
-    text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text)
-    text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
-    text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
-    text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
-    if "”" in text:
-        text = text.replace(".”", "”.")
-    if '"' in text:
-        text = text.replace('."', '".')
-    if "!" in text:
-        text = text.replace('!"', '"!')
-    if "?" in text:
-        text = text.replace('?"', '"?')
-    text = text.replace(".", ".<stop>")
-    text = text.replace("?", "?<stop>")
-    text = text.replace("!", "!<stop>")
-    text = text.replace("<prd>", ".")
-    sentences = text.split("<stop>")
-    sentences = [s.strip() for s in sentences]
-    if sentences and not sentences[-1]:
-        sentences = sentences[:-1]
-    return sentences
-
-
-def count_words(text):
-    """Counts the number of words."""
-    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
-    tokens = tokenizer.tokenize(text)
-    num_words = len(tokens)
-    return num_words
-
-
-def split_chinese_japanese_hindi(lines: str) -> Iterable[str]:
-    """
-    Split Chinese and Japanese text into sentences.
-    From https://stackoverflow.com/questions/27441191/splitting-chinese-document-into-sentences
-    Special question/exclamation marks were added upon inspection of our raw data,
-    Also supports multiple lines.
-    The separator for hindi is '।'
-    """
-    for line in lines.splitlines():
-        yield from re.findall(
-            r"[^!?。\.\!\?\！\？\．\n।]+[!?。\.\!\?\！\？\．\n।]?",
-            line.strip(),
-            flags=re.U,
-        )
-
-
-def count_words_cjk(text: str) -> int:
-    """Counts the number of words for Chinese and Japanese and Korean.
-    Can be extended to additional languages.
-    Source: https://stackoverflow.com/questions/49164507/how-to-count-the-number-of-chinese-korean-and-english-words withadditional modifications
-    Example:
-        >In: count_words_cjk('こんにちは、ジェイソンさん、Jason? Nice to meet you☺ ❤')
-        >Out: 19
-    """
-    # Non alpha numeric patterns in latin and asian languages.
-    non_alphanumeric_patterns = (
-        r"[\\.\!\?\．\/_,\{\}<>:;$%^&*(+\"\'+——！，。？、`~@#￥……（）：；《）《》“”()\[\]«»〔〕\-「」]+"
-    )
-    text = re.sub(non_alphanumeric_patterns, "", text)
-
-    emoji_cnt = emoji.emoji_count(text)  # count emojis
-    text = emoji.replace_emoji(text, "")  # remove emojis
-
-    foreign_chars_patterns = "|".join([_CHINESE_CHARS_PATTERN, _JAPANESE_CHARS_PATTERN, _KOREAN_CHARS_PATTERN])
-    asian_chars = re.findall(foreign_chars_patterns, text)
-    asian_chars_cnt = len(asian_chars)
-    non_asian_chars = re.sub(foreign_chars_patterns, " ", text)
-    non_asian_words_cnt = len(non_asian_chars.split())
-
-    return non_asian_words_cnt + asian_chars_cnt + emoji_cnt
-
-
-@functools.cache
-def _get_sentence_tokenizer():
-    return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
-
-
-def count_sentences(text):
-    """Count the number of sentences."""
-    tokenizer = _get_sentence_tokenizer()
-    tokenized_sentences = tokenizer.tokenize(text)
-    return len(tokenized_sentences)
-
-
-def get_langid(text: str, lid_path: str | None = None) -> str:
-    line_langs: list[str] = []
-    lines = [line.strip() for line in text.split("\n") if len(line.strip()) >= 4]
-
-    for line in lines:
-        try:
-            line_langs.append(langdetect.detect(line))
-        except langdetect.LangDetectException as e:
-            logger.info("Unable to detect language for text %s due to %s", line, e)  # refex: disable=pytotw.037
-
-    if len(line_langs) == 0:
-        return "en"
-    # select the text language to be the most commonly predicted language of the lines.
-    return collections.Counter(line_langs).most_common(1)[0][0]
-
-
-def generate_keywords(num_keywords):
-    """Randomly generates a few keywords."""
-    return random.sample(WORD_LIST, k=num_keywords)
-
-
-"""Library of instructions"""
-_InstructionArgsDtype = dict[str, int | str | Sequence[str]] | None
-
-_LANGUAGES = LANGUAGE_CODES
-
-# The relational operation for comparison.
-_COMPARISON_RELATION = ("less than", "at least")
-
-# The maximum number of sentences.
-_MAX_NUM_SENTENCES = 20
-
-# The number of placeholders.
-_NUM_PLACEHOLDERS = 4
-
-# The number of bullet lists.
-_NUM_BULLETS = 5
-
-# The options of constrained response.
-_CONSTRAINED_RESPONSE_OPTIONS = (
-    "My answer is yes.",
-    "My answer is no.",
-    "My answer is maybe.",
-)
-
-# The options of starter keywords.
-_STARTER_OPTIONS = (
-    "I would say",
-    "My answer is",
-    "I believe",
-    "In my opinion",
-    "I think",
-    "I reckon",
-    "I feel",
-    "From my perspective",
-    "As I see it",
-    "According to me",
-    "As far as I'm concerned",
-    "To my understanding",
-    "In my view",
-    "My take on it is",
-    "As per my perception",
-)
-
-# The options of ending keywords.
-# TODO(jeffreyzhou) add more ending options
-_ENDING_OPTIONS = ("Any other questions?", "Is there anything else I can help with?")
-
-# The number of highlighted sections.
-_NUM_HIGHLIGHTED_SECTIONS = 4
-
-# The section spliter.
-_SECTION_SPLITER = ("Section", "SECTION")
-
-# The number of sections.
-_NUM_SECTIONS = 5
-
-# The number of paragraphs.
-_NUM_PARAGRAPHS = 5
-
-# The postscript marker.
-_POSTSCRIPT_MARKER = ("P.S.", "P.P.S")
-
-# The number of keywords.
-_NUM_KEYWORDS = 2
-
-# The occurrences of a single keyword.
-_KEYWORD_FREQUENCY = 3
-
-# The occurrences of a single letter.
-_LETTER_FREQUENCY = 10
-
-# The occurrences of words with all capital letters.
-_ALL_CAPITAL_WORD_FREQUENCY = 20
-
-# The number of words in the response.
-_NUM_WORDS_LOWER_LIMIT = 100
-_NUM_WORDS_UPPER_LIMIT = 500
-
-
-class Instruction:
-    """An instruction template."""
-
-    def __init__(self, instruction_id):
-        self.id = instruction_id
-
-    def build_description(self, **kwargs):
-        raise NotImplementedError("`build_description` not implemented.")
-
-    def get_instruction_args(self):
-        raise NotImplementedError("`get_instruction_args` not implemented.")
-
-    def get_instruction_args_keys(self):
-        raise NotImplementedError("`get_instruction_args_keys` not implemented.")
-
-    def check_following(self, value):
-        raise NotImplementedError("`check_following` not implemented.")
-
-
-class ResponseLanguageChecker(Instruction):
-    """Check the language of the entire response."""
-
-    def build_description(self, *, language=None):
-        """Build the instruction description.
-
-        Args:
-          language: A string representing the expected language of the response. The
-            language has to comply to the 97 types defined in
-            `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows
-            ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes);
-            for example, `en` for English, `zh` for Chinese, `fr` for French.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._language = language
-        if self._language is None:
-            self._language = random.choice(list(_LANGUAGES.keys()))
-
-        self._description_pattern = (
-            "Your ENTIRE response should be in {language} language, no other " + "language is allowed."
-        )
-        return self._description_pattern.format(language=_LANGUAGES[self._language])
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"language": self._language}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["language"]
-
-    def check_following(self, value):
-        """Check if the language of the entire response follows the instruction.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the language of `value` follows instruction; otherwise False.
-        """
-        assert isinstance(value, str)
-
-        try:
-            return langdetect.detect(value) == self._language
-        except langdetect.LangDetectException as e:
-            # Count as instruction is followed.
-            logger.info("Unable to detect language for text %s due to %s", value, e)  # refex: disable=pytotw.037
-            return True
-
-
-class NumberOfSentences(Instruction):
-    """Check the number of sentences."""
-
-    def build_description(self, *, num_sentences=None, relation=None):
-        """Build the instruction description.
-
-        Args:
-          num_sentences: An integer specifying the number of sentences as a
-            threshold.
-          relation: A string in (`less than`, `at least`), defining the relational
-            operator for comparison.
-            Two relational comparisons are supported for now:
-            if 'less than', the actual number of sentences < the threshold;
-            if 'at least', the actual number of sentences >= the threshold.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        # The number of sentences as a threshold for comparison.
-        self._num_sentences_threshold = num_sentences
-        if self._num_sentences_threshold is None or self._num_sentences_threshold < 0:
-            self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES)
-
-        if relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
-            )
-        else:
-            self._comparison_relation = relation
-
-        self._description_pattern = "Your response should contain {relation} {num_sentences} sentences."
-        return self._description_pattern.format(
-            relation=self._comparison_relation,
-            num_sentences=self._num_sentences_threshold,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "num_sentences": self._num_sentences_threshold,
-            "relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_sentences", "relation"]
-
-    def check_following(self, value):
-        """Check if the number of sentences follows the instruction.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the response follows the instruction.
-
-        Raise:
-            ValueError if the string in `instruction_args` is not in
-            [`less_than`, `at_least`].
-        """
-        lang = get_langid(value)
-        if lang == "th":
-            # Counting Newline also as a new sentence:
-            num_sentences = sum([len(sent_tokenize_thai(line)) for line in value.splitlines()])
-        elif lang in ["zh", "zh-cn", "zh-tw", "ja", "hi"]:
-            num_sentences = len(list(split_chinese_japanese_hindi(value)))
-        else:
-            num_sentences = count_sentences(value)
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return num_sentences < self._num_sentences_threshold
-        elif self._comparison_relation == _COMPARISON_RELATION[1]:
-            return num_sentences >= self._num_sentences_threshold
-
-
-class PlaceholderChecker(Instruction):
-    """Check the placeholders in template writing."""
-
-    def build_description(self, *, num_placeholders=None):
-        """Build the instruction description.
-
-        Args:
-          num_placeholders: An integer denoting the minimum number of
-            placeholders required in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_placeholders = num_placeholders
-        if self._num_placeholders is None or self._num_placeholders < 0:
-            self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS)
-        self._description_pattern = (
-            "The response must contain at least {num_placeholders} placeholders "
-            + "represented by square brackets, such as [address]."
-        )
-        return self._description_pattern.format(num_placeholders=self._num_placeholders)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_placeholders": self._num_placeholders}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_placeholders"]
-
-    def check_following(self, value):
-        """Check if the number of placeholders follows the instruction.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the actual number of placeholders in the response is greater than
-          or equal to `num_placeholders`; otherwise, False.
-        """
-        placeholders = re.findall(r"\[.*?\]", value)
-        num_placeholders = len(placeholders)
-        return num_placeholders >= self._num_placeholders
-
-
-class BulletListChecker(Instruction):
-    """Checks the bullet list in the prompt."""
-
-    def build_description(self, *, num_bullets=None):
-        """Build the instruction description.
-
-        Args:
-          num_bullets: An integer specifying the exact number of bullet lists
-            that is required to appear in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_bullets = num_bullets
-        if self._num_bullets is None or self._num_bullets < 0:
-            self._num_bullets = random.randint(1, _NUM_BULLETS)
-        self._description_pattern = (
-            "Your answer must contain exactly {num_bullets} bullet points. "
-            + "Use the markdown bullet points such as:\n"
-            + "* This is point 1. \n"
-            + "* This is point 2"
-        )
-        return self._description_pattern.format(num_bullets=self._num_bullets)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_bullets": self._num_bullets}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_bullets"]
-
-    def check_following(self, value):
-        r"""Check if the number of bullet lists meets the requirement.
-
-        Args:
-          value: A string representing the response. The response is expected to
-            contain some bullet lists that start with `\*`.
-
-        Returns:
-          True if the actual number of bullet lists in the response meets the
-          requirement.
-        """
-        bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE)
-        bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE)
-        num_bullet_lists = len(bullet_lists) + len(bullet_lists_2)
-        return num_bullet_lists == self._num_bullets
-
-
-class ConstrainedResponseChecker(Instruction):
-    """Checks the constrained response."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        # A sequence of string(s) representing the options of the expected response.
-        self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS
-        self._description_pattern = "Answer with one of the following options: {response_options}"
-        return self._description_pattern.format(response_options=self._constrained_responses)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response matches the constrained options.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the actual response contains one of the options in the constrained
-          responses; otherwise False.
-        """
-        value = value.strip()
-        for constrained_response in self._constrained_responses:
-            if constrained_response in value:
-                return True
-        return False
-
-
-class ConstrainedStartChecker(Instruction):
-    """Checks the response start."""
-
-    def build_description(self, *, starter=None):
-        """Build the instruction description.
-
-        Args:
-          starter: A string representing the keyward that the response should start
-            with.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._starter = starter.strip() if isinstance(starter, str) else starter
-        if self._starter is None:
-            self._starter = random.choice(_STARTER_OPTIONS)
-        self._description_pattern = (
-            "During the conversation, when it is your turn, " + "please always start with {starter}"
-        )
-        return self._description_pattern.format(starter=self._starter)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"starter": self._starter}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["starter"]
-
-    def check_following(self, value):
-        """Checks if the response starts with the constrained keyword or phrase.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if the response starts with the given phrase or keyword that is
-          contained in `instruction_args`; otherwise, False.
-        """
-        response_pattern = r"^\s*" + self._starter + r".*$"
-        response_with_constrained_start = re.search(response_pattern, value, flags=re.MULTILINE)
-        return True if response_with_constrained_start else False
-
-
-class HighlightSectionChecker(Instruction):
-    """Checks the highlighted section."""
-
-    def build_description(self, *, num_highlights=None):
-        """Build the instruction description.
-
-        Args:
-          num_highlights: An integer specifying the minimum number of highlighted
-            sections.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_highlights = num_highlights
-        if self._num_highlights is None or self._num_highlights < 0:
-            self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS)
-
-        self._description_pattern = (
-            "Highlight at least {num_highlights} sections in your answer with "
-            + "markdown, i.e. *highlighted section*."
-        )
-
-        return self._description_pattern.format(num_highlights=self._num_highlights)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_highlights": self._num_highlights}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_highlights"]
-
-    def check_following(self, value):
-        """Checks if the number of highlighted sections meets the requirement.
-
-        Args:
-          value: a string repesenting the response. The response is expected to
-            contain highlighted sections in the format of *highlighted*.
-
-        Returns:
-          True if the actual number of highlighted sections in the format of
-          *highlighed sections* meets the minimum requirement; otherwise False.
-        """
-        num_highlights = 0
-        highlights = re.findall(r"\*[^\n\*]*\*", value)
-        double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value)
-        for highlight in highlights:
-            if highlight.strip("*").strip():
-                num_highlights += 1
-        for highlight in double_highlights:
-            if highlight.removeprefix("**").removesuffix("**").strip():
-                num_highlights += 1
-
-        return num_highlights >= self._num_highlights
-
-
-class SectionChecker(Instruction):
-    """Checks the sections."""
-
-    def build_description(self, *, section_spliter=None, num_sections=None):
-        """Build the instruction description.
-
-        Args:
-          section_spliter: A string represents the section spliter keyword that
-            marks a new section, i.e., `Section` or `SECTION`.
-          num_sections: An integer specifying the number of sections.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._section_spliter = section_spliter.strip() if isinstance(section_spliter, str) else section_spliter
-        if self._section_spliter is None:
-            self._section_spliter = random.choice(_SECTION_SPLITER)
-
-        self._num_sections = num_sections
-        if self._num_sections is None or self._num_sections < 0:
-            self._num_sections = random.randint(1, _NUM_SECTIONS)
-
-        self._description_pattern = (
-            "Your response must have {num_sections} sections. Mark the beginning "
-            + "of each section with {section_spliter} X, such as:\n"
-            + "{section_spliter} 1\n"
-            + "[content of section 1]\n"
-            + "{section_spliter} 2\n"
-            + "[content of section 2]"
-        )
-
-        return self._description_pattern.format(num_sections=self._num_sections, section_spliter=self._section_spliter)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "section_spliter": self._section_spliter,
-            "num_sections": self._num_sections,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["section_spliter", "num_sections"]
-
-    def check_following(self, value):
-        """Checks the response contains multiple sections.
-
-        Args:
-          value: A string representing the response. The response is expected
-            to contain multiple sections (number of sections is greater than 1).
-            A new section starts with `Section 1`, where the number denotes the
-            section index.
-
-        Returns:
-          True if the number of sections in the response is greater than or equal to
-          the minimum number of sections; otherwise, False.
-        """
-        section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?"
-        sections = re.split(section_splitter_patten, value)
-        num_sections = len(sections) - 1
-        return num_sections >= self._num_sections
-
-
-class ParagraphChecker(Instruction):
-    """Checks the paragraphs."""
-
-    def build_description(self, *, num_paragraphs=None):
-        """Build the instruction description.
-
-        Args:
-          num_paragraphs: An integer specifying the number of paragraphs.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_paragraphs = num_paragraphs
-        if self._num_paragraphs is None or self._num_paragraphs < 0:
-            self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
-
-        self._description_pattern = (
-            "There should be {num_paragraphs} paragraphs. " + "Paragraphs are separated with the markdown divider: ***"
-        )
-
-        return self._description_pattern.format(num_paragraphs=self._num_paragraphs)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_paragraphs": self._num_paragraphs}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_paragraphs"]
-
-    def check_following(self, value):
-        """Checks the response contains required number of paragraphs.
-
-        Args:
-          value: A string representing the response. The response may contain
-            paragraphs that are separated by the markdown divider: `***`.
-
-        Returns:
-          True if the actual number of paragraphs is the same as required;
-          otherwise, False.
-        """
-        paragraphs = re.split(r"\s?\*\*\*\s?", value)
-        num_paragraphs = len(paragraphs)
-
-        for index, paragraph in enumerate(paragraphs):
-            if not paragraph.strip():
-                if index == 0 or index == len(paragraphs) - 1:
-                    num_paragraphs -= 1
-                else:
-                    return False
-
-        return num_paragraphs == self._num_paragraphs
-
-
-class PostscriptChecker(Instruction):
-    """Checks the postscript."""
-
-    def build_description(self, *, postscript_marker=None):
-        """Build the instruction description.
-
-        Args:
-          postscript_marker: A string containing the keyword that marks the start
-            of the postscript section.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._postscript_marker = postscript_marker.strip() if isinstance(postscript_marker, str) else postscript_marker
-        if self._postscript_marker is None:
-            self._postscript_marker = random.choice(_POSTSCRIPT_MARKER)
-
-        self._description_pattern = (
-            "At the end of your response, please explicitly add a postscript " + "starting with {postscript}"
-        )
-
-        return self._description_pattern.format(postscript=self._postscript_marker)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"postscript_marker": self._postscript_marker}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["postscript_marker"]
-
-    def check_following(self, value):
-        """Checks if the response follows the postscript format.
-
-        Args:
-          value: a string representing the response. The response is expected to
-            contain a postscript section.
-
-        Returns:
-          True if the response contains a postscript section starting with
-          the keyword containing in the `instruction_args`; otherwise False.
-        """
-        value = value.lower()
-        if self._postscript_marker == "P.P.S":
-            postscript_pattern = r"\s*p\.\s?p\.\s?s.*$"
-        elif self._postscript_marker == "P.S.":
-            postscript_pattern = r"\s*p\.\s?s\..*$"
-        else:
-            postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$"
-        postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE)
-        return True if postscript else False
-
-
-class RephraseChecker(Instruction):
-    """Checks the repharse."""
-
-    def build_description(self, *, original_message):
-        """Build the instruction description.
-
-        Args:
-          original_message: A string representing the original message. The
-            rephrased response should only change its words/sentences in between
-            its two asterisks, for example, *change me*. Both original and rephrased
-            messages should contain the changes in the form of *change me*.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not self.is_change(original_message):
-            raise ValueError(f"Message {original_message} does not contain changes in the form of *change me*.")
-
-        self._reference_without_change = original_message
-        self._description = (
-            "Rephrasing: Your rephrased response should only"
-            + "change the words/sentences in between two asterisks"
-            + "such as *change me*."
-        )
-        return self._description
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"original_message": self._reference_without_change}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["original_message"]
-
-    def check_following(self, value):
-        r"""Checks if the rephrasing follows the instruction.
-
-        Args:
-          value: A string representing the response, which is expected to rephras
-            the string of `instruction_args`.
-
-        Returns:
-          True if `value` and `instruction_args` only differ by the words/sentences
-          in between two asterisks such as *change me*; otherwise, False.
-        """
-
-        if not self.is_change(value):
-            raise ValueError(f"value {value} does not contain changes in the form of *change me*.")
-
-        response_without_changes = self.strip_changes(value)
-        reference_without_changes = self.strip_changes(self._reference_without_change)
-
-        return response_without_changes == reference_without_changes
-
-    def is_change(self, response):
-        """Check if there is change in the response in the form of *change me*."""
-        return re.search(r"\*.*\*", response)
-
-    def strip_changes(self, response):
-        """Strips off the changes."""
-        return re.sub(r"\*.*\*", "", response)
-
-
-class KeywordChecker(Instruction):
-    """Check the exisitence of certain keywords."""
-
-    def build_description(self, *, keywords=None):
-        """Build the instruction description.
-
-        Args:
-          keywords: A sequence of strings representing the keywords that are
-            expected in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        if not keywords:
-            self._keywords = generate_keywords(num_keywords=_NUM_KEYWORDS)
-        else:
-            self._keywords = keywords
-        self._keywords = sorted(self._keywords)
-
-        self._description_pattern = "Include keywords {keywords} in the response."
-
-        return self._description_pattern.format(keywords=self._keywords)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"keywords": self._keywords}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["keywords"]
-
-    def check_following(self, value):
-        """Check if the response contain the expected keywords."""
-        for keyword in self._keywords:
-            if not re.search(keyword, value, flags=re.IGNORECASE):
-                return False
-        return True
-
-
-class KeywordFrequencyChecker(Instruction):
-    """Check the keyword frequency."""
-
-    def build_description(self, *, keyword=None, frequency=None, relation=None):
-        """Build the instruction description.
-
-        Args:
-          keyword: A string representing a keyword that is expected in the response.
-          frequency: An integer specifying the number of times `keyword` is expected
-            to appear in the response.
-          relation: A string in (`less than`, `at least`), defining the relational
-            operator for comparison.
-            Two relational comparisons are supported for now:
-            if 'less than', the actual number of occurrences < frequency;
-            if 'at least', the actual number of occurrences >= frequency.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not keyword:
-            self._keyword = generate_keywords(num_keywords=1)[0]
-        else:
-            self._keyword = keyword.strip()
-
-        self._frequency = frequency
-        if self._frequency is None or self._frequency < 0:
-            self._frequency = random.randint(1, _KEYWORD_FREQUENCY)
-
-        if relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
-            )
-        else:
-            self._comparison_relation = relation
-
-        self._description_pattern = (
-            "In your response, the word {keyword} should appear {relation} " + "{frequency} times."
-        )
-
-        return self._description_pattern.format(
-            keyword=self._keyword,
-            relation=self._comparison_relation,
-            frequency=self._frequency,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "keyword": self._keyword,
-            "frequency": self._frequency,
-            "relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["keyword", "frequency", "relation"]
-
-    def check_following(self, value):
-        """Checks if the response contain the keyword with required frequency."""
-        actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE))
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return actual_occurrences < self._frequency
-        elif self._comparison_relation == _COMPARISON_RELATION[1]:
-            return actual_occurrences >= self._frequency
-
-
-class NumberOfWords(Instruction):
-    """Checks the number of words."""
-
-    def build_description(self, *, num_words=None, relation=None):
-        """Build the instruction description.
-
-        Args:
-          num_words: An integer specifying the number of words contained in the
-            response.
-          relation: A string in (`less than`, `at least`), defining the relational
-            operator for comparison.
-            Two relational comparisons are supported for now:
-            if 'less than', the actual number of words < num_words;
-            if 'at least', the actual number of words >= num_words.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        self._num_words = num_words
-        if self._num_words is None or self._num_words < 0:
-            self._num_words = random.randint(_NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT)
-
-        if relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {relation} is given."
-            )
-        else:
-            self._comparison_relation = relation
-
-        self._description_pattern = "Answer with {relation} {num_words} words."
-
-        return self._description_pattern.format(relation=self._comparison_relation, num_words=self._num_words)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"num_words": self._num_words, "relation": self._comparison_relation}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_words", "relation"]
-
-    def check_following(self, value):
-        """Checks if the response contains the expected number of words."""
-        lang = get_langid(value)
-        if lang == "th":
-            num_words = len(word_tokenize_thai(value))
-        elif lang in ["zh", "zh-cn", "zh-tw", "ja", "ko"]:
-            num_words = count_words_cjk(value)
-        else:
-            num_words = count_words(value)
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return num_words < self._num_words
-        elif self._comparison_relation == _COMPARISON_RELATION[1]:
-            return num_words >= self._num_words
-
-
-class JsonFormat(Instruction):
-    """Check the Json format."""
-
-    def build_description(self):
-        self._description_pattern = (
-            "Entire output should be wrapped in JSON format. You can use markdown ticks such as ```."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        value = (
-            value.strip()
-            .removeprefix("```json")
-            .removeprefix("```Json")
-            .removeprefix("```JSON")
-            .removeprefix("```")
-            .removesuffix("```")
-            .strip()
-        )
-        try:
-            json.loads(value)
-        except ValueError as _:
-            return False
-        return True
-
-
-class ParagraphFirstWordCheck(Instruction):
-    """Check the paragraph and the first word of the nth paragraph."""
-
-    def build_description(self, num_paragraphs=None, nth_paragraph=None, first_word=None):
-        r"""Build the instruction description.
-
-        Args:
-          num_paragraphs: An integer indicating the number of paragraphs expected
-            in the response. A paragraph is a subset of the string that is
-            expected to be separated by '\n\n'.
-          nth_paragraph: An integer indicating the paragraph number that we look at.
-            Note that n starts from 1.
-          first_word: A string that represent the first word of the bth paragraph.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._num_paragraphs = num_paragraphs
-        if self._num_paragraphs is None or self._num_paragraphs < 0:
-            self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS)
-
-        self._nth_paragraph = nth_paragraph
-        if self._nth_paragraph is None or self._nth_paragraph <= 0 or self._nth_paragraph > self._num_paragraphs:
-            self._nth_paragraph = random.randint(1, self._num_paragraphs + 1)
-
-        self._first_word = first_word
-        if self._first_word is None:
-            self._first_word = generate_keywords(num_keywords=1)[0]
-        self._first_word = self._first_word.lower()
-
-        self._description_pattern = (
-            "There should be {num_paragraphs} paragraphs. "
-            + "Paragraphs and only paragraphs are separated with each other by two "
-            + "new lines as if it was '\\n\\n' in python. "
-            + "Paragraph {nth_paragraph} must start with word {first_word}."
-        )
-
-        return self._description_pattern.format(
-            num_paragraphs=self._num_paragraphs,
-            nth_paragraph=self._nth_paragraph,
-            first_word=self._first_word,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "num_paragraphs": self._num_paragraphs,
-            "nth_paragraph": self._nth_paragraph,
-            "first_word": self._first_word,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_paragraphs", "nth_paragraph", "first_word"]
-
-    def check_following(self, value):
-        """Checks for required number of paragraphs and correct first word.
-
-        Args:
-          value: a string representing the response. The response may contain
-            paragraphs that are separated by two new lines and the first word of
-            the nth paragraph will have to match a specified word.
-
-        Returns:
-          True if the number of paragraphs is the same as required and the first
-          word of the specified paragraph is the same as required. Otherwise, false.
-        """
-
-        paragraphs = re.split(r"\n\n", value)
-        num_paragraphs = len(paragraphs)
-
-        for paragraph in paragraphs:
-            if not paragraph.strip():
-                num_paragraphs -= 1
-
-        # check that index doesn't go out of bounds
-        if self._nth_paragraph <= num_paragraphs:
-            paragraph = paragraphs[self._nth_paragraph - 1].strip()
-            if not paragraph:
-                return False
-        else:
-            return False
-
-        first_word = ""
-        punctuation = {".", ",", "?", "!", "'", '"'}
-
-        # get first word and remove punctuation
-        word = paragraph.split()[0].strip()
-        word = word.lstrip("'")
-        word = word.lstrip('"')
-
-        for letter in word:
-            if letter in punctuation:
-                break
-            first_word += letter.lower()
-
-        return num_paragraphs == self._num_paragraphs and first_word == self._first_word
-
-
-class KeySentenceChecker(Instruction):
-    """Check the existence of certain key sentences."""
-
-    def build_description(self, key_sentences=None, num_sentences=None):
-        """Build the instruction description.
-
-        Args:
-          key_sentences: A sequences of strings representing the key sentences that
-            are expected in the response.
-          num_sentences: The number of key sentences that are expected to be seen in
-            the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        if not key_sentences:
-            self._key_sentences = {["For now, this is fine."]}
-        else:
-            self._key_sentences = key_sentences
-
-        if not num_sentences:
-            self._num_sentences = random.randint(1, len(self._key_sentences))
-        else:
-            self._num_sentences = num_sentences
-
-        self._description_pattern = "Include {num_sentences} of the following sentences {key_sentences}"
-
-        return self._description_pattern.format(num_sentences=self._num_sentences, key_sentences=self._key_sentences)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "num_sentences": self._num_sentences,
-            "key_sentences": list(self._key_sentences),
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["num_sentences", "key_sentences"]
-
-    def check_following(self, value):
-        """Checks if the response contains the expected key sentences."""
-        count = 0
-        sentences = split_into_sentences(value)
-        for sentence in self._key_sentences:
-            if sentence in sentences:
-                count += 1
-
-        return count == self._num_sentences
-
-
-class ForbiddenWords(Instruction):
-    """Checks that specified words are not used in response."""
-
-    def build_description(self, forbidden_words=None):
-        """Build the instruction description.
-
-        Args:
-          forbidden_words: A sequences of strings respresenting words that are not
-            allowed in the response.
-
-        Returns:
-          A string representing the instruction description.
-        """
-
-        if not forbidden_words:
-            self._forbidden_words = generate_keywords(num_keywords=_NUM_KEYWORDS)
-        else:
-            self._forbidden_words = list(set(forbidden_words))
-        self._forbidden_words = sorted(self._forbidden_words)
-        self._description_pattern = "Do not include keywords {forbidden_words} in the response."
-
-        return self._description_pattern.format(forbidden_words=self._forbidden_words)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {"forbidden_words": self._forbidden_words}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["forbidden_words"]
-
-    def check_following(self, value):
-        """Check if the response does not contain the expected keywords."""
-        for word in self._forbidden_words:
-            if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE):
-                return False
-        return True
-
-
-class RephraseParagraph(Instruction):
-    """Checks that the paragraph is rephrased."""
-
-    def build_description(self, *, original_paragraph, low, high):
-        """Builds the instruction description.
-
-        Args:
-          original_paragraph: A string presenting the original paragraph. The
-            rephrases response should have betweeb low-high words in common.
-          low: An integer presenting the lower bound of similar words.
-          high: An integer representing the upper bound of similar words.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._original_paragraph = original_paragraph
-        self._low = low
-        self._high = high
-
-        self._description = (
-            "Rephrase the following paragraph: "
-            + "{original_paragraph}\nYour response should have "
-            + "between {low} and {high} of the same words. "
-            + "Words are the same if and only if all of the "
-            + "letters, ignoring cases, are the same. For "
-            + "example, 'run' is the same as 'Run' but different "
-            + "to 'ran'."
-        )
-
-        return self._description.format(original_paragraph=original_paragraph, low=self._low, high=self._high)
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return {
-            "original_paragraph": self._original_paragraph,
-            "low": self._low,
-            "high": self._high,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["original_paragraph", "low", "high"]
-
-    def check_following(self, value):
-        val_words = re.findall(r"\w+", value.lower())
-        original_words = re.findall(r"\w+", self._original_paragraph.lower())
-        similar_words = 0
-
-        dict_val = collections.Counter(val_words)
-        dict_original = collections.Counter(original_words)
-
-        for word in dict_original:
-            similar_words += min(dict_original[word], dict_val[word])
-
-        return similar_words >= self._low and similar_words <= self._high
-
-
-class TwoResponsesChecker(Instruction):
-    """Check that two responses were given."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = (
-            "Give two different responses. Responses and only responses should"
-            " be separated by 6 asterisk symbols: ******."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        """Returns the keyward args of `build_description`."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response has two different answers.
-
-        Args:
-          value: A string representing the response.
-
-        Returns:
-          True if two responses are detected and false otherwise.
-        """
-        valid_responses = list()
-        responses = value.split("******")
-        for index, response in enumerate(responses):
-            if not response.strip():
-                if index != 0 and index != len(responses) - 1:
-                    return False
-            else:
-                valid_responses.append(response)
-        return len(valid_responses) == 2 and valid_responses[0].strip() != valid_responses[1].strip()
-
-
-class RepeatPromptThenAnswer(Instruction):
-    """Checks that Prompt is first repeated then answered."""
-
-    def build_description(self, *, prompt_to_repeat=None):
-        """Build the instruction description.
-
-        Args:
-          prompt_to_repeat: The prompt that is meant to be repeated.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not prompt_to_repeat:
-            raise ValueError("prompt_to_repeat must be set.")
-        else:
-            self._prompt_to_repeat = prompt_to_repeat
-        self._description_pattern = (
-            "First repeat the request word for word without change,"
-            " then give your answer (1. do not say any words or characters"
-            " before repeating the request; 2. the request you need to repeat"
-            " does not include this sentence)"
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return {"prompt_to_repeat": self._prompt_to_repeat}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["prompt_to_repeat"]
-
-    def check_following(self, value):
-        if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()):
-            return True
-        return False
-
-
-class EndChecker(Instruction):
-    """Checks that the prompt ends with a given phrase."""
-
-    def build_description(self, *, end_phrase=None):
-        """Build the instruction description.
-
-        Args:
-          end_phrase: A string representing the phrase the response should end with.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._end_phrase = end_phrase.strip() if isinstance(end_phrase, str) else end_phrase
-        if self._end_phrase is None:
-            self._end_phrase = random.choice(_ENDING_OPTIONS)
-        self._description_pattern = (
-            "Finish your response with this exact phrase {ender}. No other words should follow this phrase."
-        )
-        return self._description_pattern.format(ender=self._end_phrase)
-
-    def get_instruction_args(self):
-        return {"end_phrase": self._end_phrase}
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["end_phrase"]
-
-    def check_following(self, value):
-        """Checks if the response ends with the expected phrase."""
-        value = value.strip().strip('"').lower()
-        self._end_phrase = self._end_phrase.strip().lower()
-        return value.endswith(self._end_phrase)
-
-
-class TitleChecker(Instruction):
-    """Checks the response for a title."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = (
-            "Your answer must contain a title, wrapped in double angular brackets, such as <<poem of joy>>."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response contains a title."""
-        pattern = r"<<[^\n]+>>"
-        re_pattern = re.compile(pattern)
-        titles = re.findall(re_pattern, value)
-
-        for title in titles:
-            if title.lstrip("<").rstrip(">").strip():
-                return True
-        return False
-
-
-class LetterFrequencyChecker(Instruction):
-    """Checks letter frequency."""
-
-    def build_description(self, *, letter=None, let_frequency=None, let_relation=None):
-        """Build the instruction description.
-
-        Args:
-          letter: A string representing a letter that is expected in the response.
-          let_frequency: An integer specifying the number of times `keyword` is
-            expected to appear in the response.
-          let_relation: A string in (`less than`, `at least`), defining the
-            relational operator for comparison. Two relational comparisons are
-            supported for now; if 'less than', the actual number of
-            occurrences < frequency; if 'at least', the actual number of
-            occurrences >= frequency.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        if not letter or len(letter) > 1 or ord(letter.lower()) < 97 or ord(letter.lower()) > 122:
-            self._letter = random.choice(list(string.ascii_letters))
-        else:
-            self._letter = letter.strip()
-        self._letter = self._letter.lower()
-
-        self._frequency = let_frequency
-        if self._frequency is None or self._frequency < 0:
-            self._frequency = random.randint(1, _LETTER_FREQUENCY)
-
-        if let_relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif let_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                f"The supported relation for comparison must be in {_COMPARISON_RELATION}, but {let_relation} is given."
-            )
-        else:
-            self._comparison_relation = let_relation
-
-        self._description_pattern = (
-            "In your response, the letter {letter} should appear {let_relation} {let_frequency} times."
-        )
-
-        return self._description_pattern.format(
-            letter=self._letter,
-            let_frequency=self._frequency,
-            let_relation=self._comparison_relation,
-        )
-
-    def get_instruction_args(self):
-        """Returns the keyword args of build description."""
-        return {
-            "letter": self._letter,
-            "let_frequency": self._frequency,
-            "let_relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["letter", "let_frequency", "let_relation"]
-
-    def check_following(self, value):
-        """Checks that the response contains the letter at the right frequency."""
-        value = value.lower()
-        letters = collections.Counter(value)
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return letters[self._letter] < self._frequency
-        else:
-            return letters[self._letter] >= self._frequency
-
-
-class CapitalLettersEnglishChecker(Instruction):
-    """Checks that the response is in english and is in all capital letters."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = "Your entire response should be in English, and in all capital letters."
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks that the response is in English and in all capital letters."""
-        assert isinstance(value, str)
-
-        try:
-            return value.isupper() and langdetect.detect(value) == "en"
-        except langdetect.LangDetectException as e:
-            # Count as instruction is followed.
-            logger.info("Unable to detect language for text %s due to %s", value, e)  # refex: disable=pytotw.037
-            return True
-
-
-class LowercaseLettersEnglishChecker(Instruction):
-    """Checks that the response is in english and is in all lowercase letters."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = (
-            "Your entire response should be in English, and in all lowercase letters. No capital letters are allowed."
-        )
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks that the response is in English and in all lowercase letters."""
-        assert isinstance(value, str)
-
-        try:
-            return value.islower() and langdetect.detect(value) == "en"
-        except langdetect.LangDetectException as e:
-            # Count as instruction is followed.
-            logger.info("Unable to detect language for text %s due to %s", value, e)  # refex: disable=pytotw.037
-            return True
-
-
-class CommaChecker(Instruction):
-    """Checks the response for no commas."""
-
-    def build_description(self, **kwargs):
-        """Build the instruction description."""
-        self._description_pattern = "In your entire response, refrain from the use of any commas."
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks that the response does not contain commas."""
-        return not re.search(r"\,", value)
-
-
-class CapitalWordFrequencyChecker(Instruction):
-    """Checks frequency of words with all capital letters."""
-
-    def build_description(
-        self,
-        capital_frequency=None,
-        capital_relation=None,
-    ):
-        """Build the instruction description.
-
-        Args:
-          capital_frequency: An integer that represents the number of words that
-            should be in all capital letters.
-          capital_relation: A string that is 'at least' or 'at most' that refers to
-            the frequency.
-
-        Returns:
-          A string representing the instruction description.
-        """
-        self._frequency = capital_frequency
-        if self._frequency is None:
-            self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY)
-
-        self._comparison_relation = capital_relation
-        if capital_relation is None:
-            self._comparison_relation = random.choice(_COMPARISON_RELATION)
-        elif capital_relation not in _COMPARISON_RELATION:
-            raise ValueError(
-                "The supported relation for comparison must be in "
-                f"{_COMPARISON_RELATION}, but {capital_relation} is given."
-            )
-
-        self._description_pattern = (
-            "In your response, words with all capital letters should appear {relation} {frequency} times."
-        )
-
-        return self._description_pattern.format(frequency=self._frequency, relation=self._comparison_relation)
-
-    def get_instruction_args(self):
-        """Returns the keyword args of build description."""
-        return {
-            "capital_frequency": self._frequency,
-            "capital_relation": self._comparison_relation,
-        }
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return ["capital_frequency", "capital_relation"]
-
-    def check_following(self, value):
-        """Checks the frequency of words with all capital letters."""
-        # Hyphenated words will count as one word
-        nltk.download("punkt_tab")
-        words = nltk.word_tokenize(value)
-        capital_words = [word for word in words if word.isupper()]
-
-        capital_words = len(capital_words)
-
-        if self._comparison_relation == _COMPARISON_RELATION[0]:
-            return capital_words < self._frequency
-        else:
-            return capital_words >= self._frequency
-
-
-class QuotationChecker(Instruction):
-    """Checks response is wrapped with double quotation marks."""
-
-    def build_description(self):
-        """Build the instruction description."""
-        self._description_pattern = "Wrap your entire response with double quotation marks."
-        return self._description_pattern
-
-    def get_instruction_args(self):
-        """Returns the keyword args of build description."""
-        return None
-
-    def get_instruction_args_keys(self):
-        """Returns the args keys of `build_description`."""
-        return []
-
-    def check_following(self, value):
-        """Checks if the response is wrapped with double quotation marks."""
-        quotations_map = {
-            "ja": "「」",
-            "ru": "«»",
-            "th": "“”",
-            "zh": "“”",
-            "zh-cn": "“”",
-            "zh-tw": "“”",
-        }
-        value = value.strip()
-        lang = get_langid(value)
-        quotes = quotations_map.get(lang, '""')
-        # TODO: We may wanna revisit this logic in new generations to only check of the response language's quotes.
-        return len(value) > 1 and value[0] in [quotes[0], '"'] and value[-1] in [quotes[1], '"']
-
-
-# Define instruction dicts
-_KEYWORD = "keywords:"
-_LANGUAGE = "language:"
-_LENGTH = "length_constraints:"
-_CONTENT = "detectable_content:"
-_FORMAT = "detectable_format:"
-_MULTITURN = "multi-turn:"
-_COMBINATION = "combination:"
-_STARTEND = "startend:"
-_CHANGE_CASES = "change_case:"
-_PUNCTUATION = "punctuation:"
-
-INSTRUCTION_DICT = {
-    _KEYWORD + "existence": KeywordChecker,
-    _KEYWORD + "frequency": KeywordFrequencyChecker,
-    # _KEYWORD + "key_sentences": KeySentenceChecker,
-    _KEYWORD + "forbidden_words": ForbiddenWords,
-    _KEYWORD + "letter_frequency": LetterFrequencyChecker,
-    _LANGUAGE + "response_language": ResponseLanguageChecker,
-    _LENGTH + "number_sentences": NumberOfSentences,
-    _LENGTH + "number_paragraphs": ParagraphChecker,
-    _LENGTH + "number_words": NumberOfWords,
-    _LENGTH + "nth_paragraph_first_word": ParagraphFirstWordCheck,
-    _CONTENT + "number_placeholders": PlaceholderChecker,
-    _CONTENT + "postscript": PostscriptChecker,
-    _FORMAT + "number_bullet_lists": BulletListChecker,
-    # _CONTENT + "rephrase_paragraph": RephraseParagraph,
-    _FORMAT + "constrained_response": ConstrainedResponseChecker,
-    _FORMAT + "number_highlighted_sections": (HighlightSectionChecker),
-    _FORMAT + "multiple_sections": SectionChecker,
-    # _FORMAT + "rephrase": RephraseChecker,
-    _FORMAT + "json_format": JsonFormat,
-    _FORMAT + "title": TitleChecker,
-    # _MULTITURN + "constrained_start": ConstrainedStartChecker,
-    _COMBINATION + "two_responses": TwoResponsesChecker,
-    _COMBINATION + "repeat_prompt": RepeatPromptThenAnswer,
-    _STARTEND + "end_checker": EndChecker,
-    _CHANGE_CASES + "capital_word_frequency": CapitalWordFrequencyChecker,
-    _CHANGE_CASES + "english_capital": CapitalLettersEnglishChecker,
-    _CHANGE_CASES + "english_lowercase": LowercaseLettersEnglishChecker,
-    _PUNCTUATION + "no_comma": CommaChecker,
-    _STARTEND + "quotation": QuotationChecker,
-}
-
-INSTRUCTION_LIST = list(INSTRUCTION_DICT.keys()) + [
-    _KEYWORD[:-1],
-    _LANGUAGE[:-1],
-    _LENGTH[:-1],
-    _CONTENT[:-1],
-    _FORMAT[:-1],
-    _MULTITURN[:-1],
-    _COMBINATION[:-1],
-    _STARTEND[:-1],
-    _CHANGE_CASES[:-1],
-    _PUNCTUATION[:-1],
-]
diff --git a/src/llama_stack/providers/inline/scoring/basic/utils/math_utils.py b/src/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
deleted file mode 100644
index 6840aad148..0000000000
--- a/src/llama_stack/providers/inline/scoring/basic/utils/math_utils.py
+++ /dev/null
@@ -1,330 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import re
-from collections.abc import Sequence
-
-from llama_stack.providers.utils.scoring.basic_scoring_utils import time_limit
-
-# from minerva
-SUBSTITUTIONS = [
-    ("an ", ""),
-    ("a ", ""),
-    (".$", "$"),
-    ("\\$", ""),
-    (r"\ ", ""),
-    (" ", ""),
-    ("mbox", "text"),
-    (",\\text{and}", ","),
-    ("\\text{and}", ","),
-    ("\\text{m}", "\\text{}"),
-]
-
-REMOVED_EXPRESSIONS = [
-    "square",
-    "ways",
-    "integers",
-    "dollars",
-    "mph",
-    "inches",
-    "ft",
-    "hours",
-    "km",
-    "units",
-    "\\ldots",
-    "sue",
-    "points",
-    "feet",
-    "minutes",
-    "digits",
-    "cents",
-    "degrees",
-    "cm",
-    "gm",
-    "pounds",
-    "meters",
-    "meals",
-    "edges",
-    "students",
-    "childrentickets",
-    "multiples",
-    "\\text{s}",
-    "\\text{.}",
-    "\\text{\ns}",
-    "\\text{}^2",
-    "\\text{}^3",
-    "\\text{\n}",
-    "\\text{}",
-    r"\mathrm{th}",
-    r"^\circ",
-    r"^{\circ}",
-    r"\;",
-    r",\!",
-    "{,}",
-    '"',
-    "\\dots",
-]
-
-
-def try_evaluate_frac(expression: str, fmt: str = "0.2e") -> str:
-    if isinstance(expression, float):
-        return expression
-    new_expression = f"{expression}"
-    regex = re.compile(r"\\frac{([^}]+)}{([^}]+)}")
-    for match in re.finditer(regex, expression):
-        try:
-            value = float(match.group(1)) / float(match.group(2))
-            new_expression = new_expression.replace(
-                match.group(),
-                f"{{value:{fmt}}}".format(value=value),
-                1,
-            )
-        except Exception:
-            continue
-    return new_expression
-
-
-def try_evaluate_latex(expression: str, fmt: str = ".2e") -> str:
-    try:
-        with time_limit(seconds=5):
-            from sympy.parsing.latex import parse_latex
-
-            value = parse_latex(expression).evalf()  # type: ignore
-            return f"{{value:{fmt}}}".format(value=value)
-    except Exception:
-        return expression
-
-
-def first_answer(text: str, markers: Sequence[str] = ("Q:", "A:")) -> str:
-    for marker in markers:
-        text = text.split(marker)[0]
-    return text
-
-
-def extract_result_from_boxed(answer: str) -> str:
-    box_start = "\\boxed"
-    # format is `\\boxed <value>$` or `\\boxed{<value>}`, with potential white spaces framing `<value>`
-    start = answer.rfind(box_start)
-    if start < 0:
-        return ""
-    answer = answer[start + len(box_start) :].strip()
-    ends_with_curly = answer.startswith("{")
-    i = 0
-    open_braces = 0
-    while i < len(answer):
-        if answer[i] == "{":
-            open_braces += 1
-        elif answer[i] == "}":
-            open_braces -= 1
-        if open_braces == 0:
-            if ends_with_curly:
-                answer = answer[: i + 1].strip()
-                break
-            elif answer[i] == "$":
-                answer = answer[:i].strip()
-                break
-        i += 1
-    else:
-        return ""
-    # remove extra curly braces
-    while True:
-        if answer.startswith("{") and answer.endswith("}"):
-            answer = answer[1:-1].strip()
-        else:
-            break
-    return answer
-
-
-# from minerva paper + _normalise_result from xavierm
-def normalize_final_answer(final_answer: str, regex_pattern: str, match_first: bool = True) -> str:
-    """Extract and normalize a final answer to a quantitative reasoning question."""
-    match = re.findall(regex_pattern, final_answer)
-    extraction: str
-    if len(match) > 0:
-        if match_first:
-            extraction = match[0]
-        else:
-            extraction = match[-1]
-    else:
-        extraction = extract_result_from_boxed(final_answer)
-
-    if len(extraction) == 0:
-        return final_answer
-    else:
-        final_answer = extraction
-    final_answer = final_answer.split("=")[-1]
-    for before, after in SUBSTITUTIONS:
-        final_answer = final_answer.replace(before, after)
-    for expr in REMOVED_EXPRESSIONS:
-        final_answer = final_answer.replace(expr, "")
-    # Extract answer that is in LaTeX math, is bold,
-    # is surrounded by a box, etc.
-    final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer)
-    final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer)
-    final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer)
-    final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer)
-    final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer)
-    # Normalize shorthand TeX:
-    # \fracab -> \frac{a}{b}
-    # \frac{abc}{bef} -> \frac{abc}{bef}
-    # \fracabc -> \frac{a}{b}c
-    # \sqrta -> \sqrt{a}
-    # \sqrtab -> sqrt{a}b
-    final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer)
-    final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer)
-    final_answer = final_answer.replace("$", "")
-    # Normalize 100,000 -> 100000
-    if final_answer.replace(",", "").isdigit():
-        final_answer = final_answer.replace(",", "")
-    # If the final answer is a single letter in parentheses, remove the parentheses
-    # Example: (a) -> a (but not (ab) -> ab)
-    if re.match(r"\([a-zA-Z]\)", final_answer):
-        final_answer = final_answer[1]
-    return _normalise_result(final_answer)
-
-
-def _normalise_result(string: str) -> str:
-    # linebreaks
-    string = string.replace("\n", "")
-
-    # remove inverse spaces
-    string = string.replace("\\!", "")
-
-    # replace \\ with \
-    string = string.replace("\\\\", "\\")
-
-    # replace tfrac and dfrac with frac
-    string = string.replace("cfrac", "frac")
-    string = string.replace("tfrac", "frac")
-    string = string.replace("dfrac", "frac")
-
-    # remove \left and \right
-    string = string.replace("\\left", "")
-    string = string.replace("\\le", "")
-    string = string.replace("\\right", "")
-
-    # Remove circ (degrees)
-    string = string.replace("^{\\circ}", "")
-    string = string.replace("^\\circ", "")
-
-    # remove dollar signs
-    string = string.replace("\\$", "")
-
-    # remove units (on the right)
-    string = _remove_right_units(string)
-
-    # remove percentage
-    string = string.replace("\\%", "")
-    string = string.replace(r"\%", "")
-
-    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
-    string = string.replace(" .", " 0.")
-    string = string.replace("{.", "{0.")
-    # if empty, return empty string
-    if len(string) == 0:
-        return string
-    if string[0] == ".":
-        string = "0" + string
-
-    # to consider: get rid of e.g. "k = " or "q = " at beginning
-    string = string.split("=")[-1]
-
-    # fix sqrt3 --> sqrt{3}
-    string = _fix_sqrt(string)
-
-    # remove spaces
-    string = string.replace(" ", "")
-
-    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
-    string = _fix_fracs(string)
-
-    # manually change 0.5 --> \frac{1}{2}
-    if string == "0.5":
-        string = "\\frac{1}{2}"
-
-    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
-    string = _fix_a_slash_b(string)
-
-    return string
-
-
-def _remove_right_units(string: str) -> str:
-    # "\\text{ " only ever occurs (at least in the val set) when describing units
-    try:
-        if "\\text{ " in string:
-            splits = string.split("\\text{ ")
-            assert len(splits) == 2
-            return splits[0]
-        else:
-            return string
-    except AssertionError:
-        return string
-
-
-def _fix_sqrt(string: str) -> str:
-    if "\\sqrt" not in string:
-        return string
-    splits = string.split("\\sqrt")
-    new_string = splits[0]
-    for split in splits[1:]:
-        if len(split) == 0:
-            return string
-        if split[0] != "{":
-            a = split[0]
-            new_substr = "\\sqrt{" + a + "}" + split[1:]
-        else:
-            new_substr = "\\sqrt" + split
-        new_string += new_substr
-    return new_string
-
-
-def _fix_fracs(string: str) -> str:
-    substrs = string.split("\\frac")
-    new_str = substrs[0]
-    if len(substrs) > 1:
-        substrs = substrs[1:]
-        for substr in substrs:
-            new_str += "\\frac"
-            if len(substr) == 0:
-                return string
-            if substr[0] == "{":
-                new_str += substr
-            else:
-                try:
-                    assert len(substr) >= 2
-                except AssertionError:
-                    return string
-                a = substr[0]
-                b = substr[1]
-                if b != "{":
-                    if len(substr) > 2:
-                        post_substr = substr[2:]
-                        new_str += "{" + a + "}{" + b + "}" + post_substr
-                    else:
-                        new_str += "{" + a + "}{" + b + "}"
-                else:
-                    if len(substr) > 2:
-                        post_substr = substr[2:]
-                        new_str += "{" + a + "}" + b + post_substr
-                    else:
-                        new_str += "{" + a + "}" + b
-    string = new_str
-    return string
-
-
-def _fix_a_slash_b(string: str) -> str:
-    if len(string.split("/")) != 2:
-        return string
-    a = string.split("/")[0]
-    b = string.split("/")[1]
-    try:
-        ia = int(a)
-        ib = int(b)
-        assert string == f"{ia}/{ib}"
-        new_string = "\\frac{" + str(ia) + "}{" + str(ib) + "}"
-        return new_string
-    except (ValueError, AssertionError):
-        return string
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/__init__.py b/src/llama_stack/providers/inline/scoring/braintrust/__init__.py
deleted file mode 100644
index 2f3dce9666..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel, SecretStr
-
-from llama_stack.core.datatypes import Api
-
-from .config import BraintrustScoringConfig
-
-
-class BraintrustProviderDataValidator(BaseModel):
-    openai_api_key: SecretStr
-
-
-async def get_provider_impl(
-    config: BraintrustScoringConfig,
-    deps: dict[Api, Any],
-):
-    from .braintrust import BraintrustScoringImpl
-
-    impl = BraintrustScoringImpl(config, deps[Api.datasetio], deps[Api.datasets])
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/braintrust.py b/src/llama_stack/providers/inline/scoring/braintrust/braintrust.py
deleted file mode 100644
index ace3d08f41..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/braintrust.py
+++ /dev/null
@@ -1,212 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import os
-import threading
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-from llama_stack.core.request_headers import NeedsRequestProviderData
-from llama_stack.providers.utils.common.data_schema_validator import (
-    get_valid_schemas,
-    validate_row_schema,
-)
-from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
-from llama_stack_api import (
-    DatasetIO,
-    Datasets,
-    IterRowsRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFn,
-    ScoringFunctionsProtocolPrivate,
-    ScoringResult,
-    ScoringResultRow,
-)
-
-from .config import BraintrustScoringConfig
-from .scoring_fn.fn_defs.answer_correctness import answer_correctness_fn_def
-from .scoring_fn.fn_defs.answer_relevancy import answer_relevancy_fn_def
-from .scoring_fn.fn_defs.answer_similarity import answer_similarity_fn_def
-from .scoring_fn.fn_defs.context_entity_recall import context_entity_recall_fn_def
-from .scoring_fn.fn_defs.context_precision import context_precision_fn_def
-from .scoring_fn.fn_defs.context_recall import context_recall_fn_def
-from .scoring_fn.fn_defs.context_relevancy import context_relevancy_fn_def
-from .scoring_fn.fn_defs.factuality import factuality_fn_def
-from .scoring_fn.fn_defs.faithfulness import faithfulness_fn_def
-
-# Mapping of scoring function identifiers to their definitions (lightweight, no heavy imports)
-SUPPORTED_BRAINTRUST_SCORING_FN_DEFS: dict[str, ScoringFn] = {
-    "braintrust::factuality": factuality_fn_def,
-    "braintrust::answer-correctness": answer_correctness_fn_def,
-    "braintrust::answer-relevancy": answer_relevancy_fn_def,
-    "braintrust::answer-similarity": answer_similarity_fn_def,
-    "braintrust::faithfulness": faithfulness_fn_def,
-    "braintrust::context-entity-recall": context_entity_recall_fn_def,
-    "braintrust::context-precision": context_precision_fn_def,
-    "braintrust::context-recall": context_recall_fn_def,
-    "braintrust::context-relevancy": context_relevancy_fn_def,
-}
-
-# Lazy-loaded evaluators (defers loading autoevals and its pyarrow dependency)
-_braintrust_evaluators: dict[str, Any] | None = None
-_braintrust_evaluators_lock = threading.Lock()
-
-
-def _get_braintrust_evaluators() -> dict[str, Any]:
-    """Lazily load autoevals evaluators on first use.
-
-    This defers importing autoevals (and its pyarrow dependency) until
-    braintrust scoring is actually needed, saving ~63MB of memory at startup.
-    """
-    global _braintrust_evaluators
-    if _braintrust_evaluators is not None:
-        return _braintrust_evaluators
-
-    with _braintrust_evaluators_lock:
-        if _braintrust_evaluators is not None:
-            return _braintrust_evaluators
-
-        from autoevals.llm import Factuality
-        from autoevals.ragas import (
-            AnswerCorrectness,
-            AnswerRelevancy,
-            AnswerSimilarity,
-            ContextEntityRecall,
-            ContextPrecision,
-            ContextRecall,
-            ContextRelevancy,
-            Faithfulness,
-        )
-
-        _braintrust_evaluators = {
-            "braintrust::factuality": Factuality(),
-            "braintrust::answer-correctness": AnswerCorrectness(),
-            "braintrust::answer-relevancy": AnswerRelevancy(),
-            "braintrust::answer-similarity": AnswerSimilarity(),
-            "braintrust::faithfulness": Faithfulness(),
-            "braintrust::context-entity-recall": ContextEntityRecall(),
-            "braintrust::context-precision": ContextPrecision(),
-            "braintrust::context-recall": ContextRecall(),
-            "braintrust::context-relevancy": ContextRelevancy(),
-        }
-        return _braintrust_evaluators
-
-
-class BraintrustScoringImpl(
-    Scoring,
-    ScoringFunctionsProtocolPrivate,
-    NeedsRequestProviderData,
-):
-    def __init__(
-        self,
-        config: BraintrustScoringConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.supported_fn_defs_registry = SUPPORTED_BRAINTRUST_SCORING_FN_DEFS
-
-    async def initialize(self) -> None: ...
-
-    async def shutdown(self) -> None: ...
-
-    async def list_scoring_functions(self) -> list[ScoringFn]:
-        scoring_fn_defs_list = list(self.supported_fn_defs_registry.values())
-        for f in scoring_fn_defs_list:
-            assert f.identifier.startswith("braintrust"), (
-                "All braintrust scoring fn must have identifier prefixed with 'braintrust'! "
-            )
-
-        return scoring_fn_defs_list
-
-    async def register_scoring_function(self, scoring_fn: ScoringFn) -> None:
-        raise NotImplementedError("Registering scoring function not allowed for braintrust provider")
-
-    async def set_api_key(self) -> None:
-        # api key is in the request headers
-        if not self.config.openai_api_key:
-            provider_data = self.get_request_provider_data()
-            if provider_data is None or not provider_data.openai_api_key:
-                raise ValueError(
-                    'Pass OpenAI API Key in the header X-LlamaStack-Provider-Data as { "openai_api_key": <your api key>}'
-                )
-            self.config.openai_api_key = provider_data.openai_api_key
-
-        os.environ["OPENAI_API_KEY"] = self.config.openai_api_key
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        await self.set_api_key()
-
-        all_rows = await self.datasetio_api.iterrows(IterRowsRequest(dataset_id=request.dataset_id, limit=-1))
-        score_request = ScoreRequest(
-            input_rows=all_rows.data,
-            scoring_functions=request.scoring_functions,
-        )
-        res = await self.score(score_request)
-        if request.save_results_dataset:
-            # TODO: persist and register dataset on to server for reading
-            # self.datasets_api.register_dataset()
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res.results,
-        )
-
-    async def score_row(self, input_row: dict[str, Any], scoring_fn_identifier: str | None = None) -> ScoringResultRow:
-        validate_row_schema(input_row, get_valid_schemas(Api.scoring.value))
-        await self.set_api_key()
-        assert scoring_fn_identifier is not None, "scoring_fn_identifier cannot be None"
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-        input_query = input_row["input_query"]
-        evaluators = _get_braintrust_evaluators()
-        evaluator = evaluators[scoring_fn_identifier]
-
-        result = evaluator(
-            generated_answer,
-            expected_answer,
-            input=input_query,
-            context=input_row["context"] if "context" in input_row else None,
-        )
-        score = result.score
-        return {"score": score, "metadata": result.metadata}
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        await self.set_api_key()
-        res = {}
-        for scoring_fn_id in request.scoring_functions:
-            if scoring_fn_id not in self.supported_fn_defs_registry:
-                raise ValueError(f"Scoring function {scoring_fn_id} is not supported.")
-
-            score_results = [await self.score_row(input_row, scoring_fn_id) for input_row in request.input_rows]
-            aggregation_functions = self.supported_fn_defs_registry[scoring_fn_id].params.aggregation_functions
-
-            # override scoring_fn params if provided
-            if request.scoring_functions[scoring_fn_id] is not None:
-                override_params = request.scoring_functions[scoring_fn_id]
-                if override_params.aggregation_functions:
-                    aggregation_functions = override_params.aggregation_functions
-
-            agg_results = aggregate_metrics(score_results, aggregation_functions)
-            res[scoring_fn_id] = ScoringResult(
-                score_rows=score_results,
-                aggregated_results=agg_results,
-            )
-
-        return ScoreResponse(
-            results=res,
-        )
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/config.py b/src/llama_stack/providers/inline/scoring/braintrust/config.py
deleted file mode 100644
index 057f0ba5db..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/config.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-
-class BraintrustScoringConfig(BaseModel):
-    openai_api_key: str | None = Field(
-        default=None,
-        description="The OpenAI API Key",
-    )
-
-    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
-        return {
-            "openai_api_key": "${env.OPENAI_API_KEY:=}",
-        }
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
deleted file mode 100644
index b058305b45..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_correctness.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-answer_correctness_fn_def = ScoringFn(
-    identifier="braintrust::answer-correctness",
-    description=(
-        "Scores the correctness of the answer based on the ground truth. "
-        "Uses Braintrust LLM-based scorer from autoevals library."
-    ),
-    provider_id="braintrust",
-    provider_resource_id="answer-correctness",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py
deleted file mode 100644
index d619d38a80..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_relevancy.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-answer_relevancy_fn_def = ScoringFn(
-    identifier="braintrust::answer-relevancy",
-    description=(
-        "Test output relevancy against the input query using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="answer-relevancy",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py
deleted file mode 100644
index 34354a1fc2..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/answer_similarity.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-answer_similarity_fn_def = ScoringFn(
-    identifier="braintrust::answer-similarity",
-    description=(
-        "Test output similarity against expected value using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="answer-similarity",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
deleted file mode 100644
index 4092ccc4ad..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_entity_recall.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_entity_recall_fn_def = ScoringFn(
-    identifier="braintrust::context-entity-recall",
-    description=(
-        "Evaluates how well the context captures the named entities present in the "
-        "reference answer. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-entity-recall",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
deleted file mode 100644
index 2b32b9eec8..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_precision.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_precision_fn_def = ScoringFn(
-    identifier="braintrust::context-precision",
-    description=(
-        "Measures how much of the provided context is actually relevant to answering the "
-        "question. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-precision",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
deleted file mode 100644
index 4d6547002d..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_recall.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_recall_fn_def = ScoringFn(
-    identifier="braintrust::context-recall",
-    description=(
-        "Evaluates how well the context covers the information needed to answer the "
-        "question. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-recall",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
deleted file mode 100644
index 739dfd7bdb..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/context_relevancy.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-context_relevancy_fn_def = ScoringFn(
-    identifier="braintrust::context-relevancy",
-    description=(
-        "Assesses how relevant the provided context is to the given question. See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="context-relevancy",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
deleted file mode 100644
index 59ed5949bc..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/factuality.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-factuality_fn_def = ScoringFn(
-    identifier="braintrust::factuality",
-    description=(
-        "Test output factuality against expected value using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="factuality",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py b/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py
deleted file mode 100644
index 96c36d226a..0000000000
--- a/src/llama_stack/providers/inline/scoring/braintrust/scoring_fn/fn_defs/faithfulness.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-faithfulness_fn_def = ScoringFn(
-    identifier="braintrust::faithfulness",
-    description=(
-        "Test output faithfulness to the input query using Braintrust LLM scorer. "
-        "See: github.com/braintrustdata/autoevals"
-    ),
-    provider_id="braintrust",
-    provider_resource_id="faithfulness",
-    return_type=NumberType(),
-    params=BasicScoringFnParams(aggregation_functions=[AggregationFunctionType.average]),
-)
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
deleted file mode 100644
index 76735fcb34..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import LlmAsJudgeScoringConfig
-
-
-async def get_provider_impl(
-    config: LlmAsJudgeScoringConfig,
-    deps: dict[Api, Any],
-):
-    from .scoring import LlmAsJudgeScoringImpl
-
-    impl = LlmAsJudgeScoringImpl(config, deps[Api.datasetio], deps[Api.datasets], deps[Api.inference])
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/config.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/config.py
deleted file mode 100644
index b150ef54ce..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/config.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-
-class LlmAsJudgeScoringConfig(BaseModel):
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {}
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
deleted file mode 100644
index f15c80364a..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    DatasetIO,
-    Datasets,
-    Inference,
-    IterRowsRequest,
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFn,
-    ScoringFunctionsProtocolPrivate,
-    ScoringResult,
-)
-
-from .config import LlmAsJudgeScoringConfig
-from .scoring_fn.llm_as_judge_scoring_fn import LlmAsJudgeScoringFn
-
-LLM_JUDGE_FN = LlmAsJudgeScoringFn
-
-
-class LlmAsJudgeScoringImpl(
-    Scoring,
-    ScoringFunctionsProtocolPrivate,
-):
-    def __init__(
-        self,
-        config: LlmAsJudgeScoringConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-        inference_api: Inference,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.inference_api = inference_api
-
-    async def initialize(self) -> None:
-        impl = LLM_JUDGE_FN(inference_api=self.inference_api)
-        self.llm_as_judge_fn = impl
-
-    async def shutdown(self) -> None: ...
-
-    async def list_scoring_functions(self) -> list[ScoringFn]:
-        scoring_fn_defs_list = self.llm_as_judge_fn.get_supported_scoring_fn_defs()
-
-        for f in self.llm_as_judge_fn.get_supported_scoring_fn_defs():
-            assert f.identifier.startswith("llm-as-judge"), (
-                "All llm-as-judge scoring fn must have identifier prefixed with 'llm-as-judge'! "
-            )
-
-        return scoring_fn_defs_list
-
-    async def register_scoring_function(self, function_def: ScoringFn) -> None:
-        self.llm_as_judge_fn.register_scoring_fn_def(function_def)
-
-    async def unregister_scoring_function(self, scoring_fn_id: str) -> None:
-        self.llm_as_judge_fn.unregister_scoring_fn_def(scoring_fn_id)
-
-    async def score_batch(
-        self,
-        request: ScoreBatchRequest,
-    ) -> ScoreBatchResponse:
-        all_rows = await self.datasetio_api.iterrows(IterRowsRequest(dataset_id=request.dataset_id, limit=-1))
-        score_request = ScoreRequest(
-            input_rows=all_rows.data,
-            scoring_functions=request.scoring_functions,
-        )
-        res = await self.score(score_request)
-        if request.save_results_dataset:
-            # TODO: persist and register dataset on to server for reading
-            # self.datasets_api.register_dataset()
-            raise NotImplementedError("Save results dataset not implemented yet")
-
-        return ScoreBatchResponse(
-            results=res.results,
-        )
-
-    async def score(
-        self,
-        request: ScoreRequest,
-    ) -> ScoreResponse:
-        res = {}
-        for scoring_fn_id in request.scoring_functions.keys():
-            scoring_fn = self.llm_as_judge_fn
-            scoring_fn_params = request.scoring_functions.get(scoring_fn_id, None)
-            score_results = await scoring_fn.score(request.input_rows, scoring_fn_id, scoring_fn_params)
-            agg_results = await scoring_fn.aggregate(score_results, scoring_fn_id, scoring_fn_params)
-            res[scoring_fn_id] = ScoringResult(
-                score_rows=score_results,
-                aggregated_results=agg_results,
-            )
-
-        return ScoreResponse(
-            results=res,
-        )
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
deleted file mode 100644
index ed26169a5a..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_405b_simpleqa.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import (
-    AggregationFunctionType,
-    LLMAsJudgeScoringFnParams,
-    NumberType,
-    ScoringFn,
-)
-
-GRADER_TEMPLATE = """
-Your job is to look at a question, a gold target, and a predicted answer, and then assign a grade of either ["CORRECT", "INCORRECT", "NOT_ATTEMPTED"].
-First, I will give examples of each grade, and then you will grade a new example.
-The following are examples of CORRECT predicted answers.
-```
-Question: What are the names of Barack Obama's children?
-Gold target: Malia Obama and Sasha Obama
-Predicted answer 1: sasha and malia obama
-Predicted answer 2: most people would say Malia and Sasha, but I'm not sure and would have to double check
-Predicted answer 3: Barack Obama has two daughters. Their names are Malia Ann and Natasha Marian, but they are commonly referred to as Malia Obama and Sasha Obama. Malia was born on July 4, 1998, and Sasha was born on June 10, 2001.
-```
-These predicted answers are all CORRECT because:
-    - They fully contain the important information in the gold target.
-    - They do not contain any information that contradicts the gold target.
-    - Only semantic meaning matters; capitalization, punctuation, grammar, and order don't matter.
-    - Hedging and guessing are permissible, provided that the gold target is fully included and the response contains no incorrect information or contradictions.
-The following are examples of INCORRECT predicted answers.
-```
-Question: What are the names of Barack Obama's children?
-Gold target: Malia and Sasha
-Predicted answer 1: Malia.
-Predicted answer 2: Malia, Sasha, and Susan.
-Predicted answer 3: Barack Obama does not have any children.
-Predicted answer 4: I think it's either Malia and Sasha. Or it could be Malia and Jackie. Or it could be Joey and Malia.
-Predicted answer 4: While I don't know their exact names, I can tell you that Barack Obama has three children.
-Predicted answer 5: It's possible you may mean Betsy and Olivia. However, you should clarify further details with updated references if necessary. Is that the correct answer?
-Predicted answer 6: It may be the case that Obama's child is named James. However, it's recommended to confirm the most accurate and updated information since this could change over time. This model may not always reflect the most current information.
-```
-These predicted answers are all INCORRECT because:
-    - A factual statement in the answer contradicts the gold target. Incorrect statements that have some hedging (e.g., "it is possible that", "although i'm not sure, i think") are also considered incorrect.
-The following are examples of NOT_ATTEMPTED predicted answers.
-```
-Question: What are the names of Barack Obama's children?
-Gold target: Malia and Sasha
-Predicted answer 1: I don't know.
-Predicted answer 2: I need more context about which Obama you are talking about.
-Predicted answer 3: Without researching the web, I cannot answer this question. However, I can tell you that Barack Obama has two children.
-Predicted answer 4: Barack Obama has two children. I know that one of them is Malia, but I'm not sure about the other one.
-```
-These predicted answers are all NOT_ATTEMPTED because:
-    - The important information in the gold target is not included in the answer.
-    - No statements in the answer contradict the gold target.
-Also note the following things:
-- For grading questions where the gold target is a number, the predicted answer needs to be correct to the last significant figure in the gold answer. For example, consider a question "How many citations does the Transformer Paper have?" with gold target "120k".
-    - Predicted answers "120k", "124k", and 115k" are all CORRECT.
-    - Predicted answers "100k" and "113k" are INCORRECT.
-    - Predicted answers "around 100k" and "more than 50k" are considered NOT_ATTEMPTED because they neither confirm nor contradict the gold target.
-- The gold target may contain more information than the question. In such cases, the predicted answer only needs to contain the information that is in the question.
-    - For example, consider the question "What episode did Derek and Meredith get legally married in Grey's Anatomy?" with gold target "Season 7, Episode 20: White Wedding". Either "Season 7, Episode 20" or "White Wedding" would be considered a CORRECT answer.
-- Do not punish predicted answers if they omit information that would be clearly inferred from the question.
-    - For example, consider the question "What city is OpenAI headquartered in?" and the gold target "San Francisco, California". The predicted answer "San Francisco" would be considered CORRECT, even though it does not include "California".
-    - Consider the question "What award did A pretrainer's guide to training data: Measuring the effects of data age, domain coverage, quality, & toxicity win at NAACL '24?", the gold target is "Outstanding Paper Award". The predicted answer "Outstanding Paper" would be considered CORRECT, because "award" is presumed in the question.
-    - For the question "What is the height of Jason Wei in meters?", the gold target is "1.73 m". The predicted answer "1.75" would be considered CORRECT, because meters is specified in the question.
-    - For the question "What is the name of Barack Obama's wife?", the gold target is "Michelle Obama". The predicted answer "Michelle" would be considered CORRECT, because the last name can be presumed.
-- Do not punish for typos in people's name if it's clearly the same name.
-    - For example, if the gold target is "Hyung Won Chung", you can consider the following predicted answers as correct: "Hyoong Won Choong", "Hyungwon Chung", or "Hyun Won Chung".
-Here is a new example. Simply reply with either CORRECT, INCORRECT, NOT ATTEMPTED. Don't apologize or correct yourself if there was a mistake; we are just trying to grade the answer.
-```
-Question: {input_query}
-Gold target: {expected_answer}
-Predicted answer: {generated_answer}
-```
-Grade the predicted answer of this new question as one of:
-A: CORRECT
-B: INCORRECT
-C: NOT_ATTEMPTED
-Just return the letters "A", "B", or "C", with no text around it.
-""".strip()
-
-
-llm_as_judge_405b_simpleqa = ScoringFn(
-    identifier="llm-as-judge::405b-simpleqa",
-    description="Llm As Judge Scoring Function for SimpleQA Benchmark (https://github.com/openai/simple-evals/blob/main/simpleqa_eval.py)",
-    return_type=NumberType(),
-    provider_id="llm-as-judge",
-    provider_resource_id="llm-as-judge-405b-simpleqa",
-    params=LLMAsJudgeScoringFnParams(
-        judge_model="meta-llama/Llama-3.1-405B-Instruct",
-        prompt_template=GRADER_TEMPLATE,
-        judge_score_regexes=[r"(A|B|C)"],
-        aggregation_functions=[AggregationFunctionType.categorical_count.value],
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py
deleted file mode 100644
index bffffd878c..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/fn_defs/llm_as_judge_base.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api import LLMAsJudgeScoringFnParams, NumberType, ScoringFn
-
-llm_as_judge_base = ScoringFn(
-    identifier="llm-as-judge::base",
-    description="Llm As Judge Scoring Function",
-    return_type=NumberType(),
-    provider_id="llm-as-judge",
-    provider_resource_id="llm-as-judge-base",
-    params=LLMAsJudgeScoringFnParams(
-        judge_model="meta-llama/Llama-3.1-405B-Instruct",
-        prompt_template="Enter custom LLM as Judge Prompt Template",
-    ),
-)
diff --git a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py b/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
deleted file mode 100644
index 73ce82cda2..0000000000
--- a/src/llama_stack/providers/inline/scoring/llm_as_judge/scoring_fn/llm_as_judge_scoring_fn.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import re
-from typing import Any
-
-from llama_stack.providers.utils.scoring.base_scoring_fn import RegisteredBaseScoringFn
-from llama_stack_api import Inference, OpenAIChatCompletionRequestWithExtraBody, ScoringFnParams, ScoringResultRow
-
-from .fn_defs.llm_as_judge_405b_simpleqa import llm_as_judge_405b_simpleqa
-from .fn_defs.llm_as_judge_base import llm_as_judge_base
-
-
-class LlmAsJudgeScoringFn(RegisteredBaseScoringFn):
-    """
-    A scoring_fn that assigns
-    """
-
-    def __init__(self, inference_api: Inference, *arg, **kwargs) -> None:
-        super().__init__(*arg, **kwargs)
-        self.inference_api = inference_api
-        self.supported_fn_defs_registry = {
-            llm_as_judge_base.identifier: llm_as_judge_base,
-            llm_as_judge_405b_simpleqa.identifier: llm_as_judge_405b_simpleqa,
-        }
-
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        assert scoring_fn_identifier is not None, "Scoring function identifier not found."
-        fn_def = self.supported_fn_defs_registry[scoring_fn_identifier]
-
-        # override params if scoring_params is provided
-        if scoring_params is not None:
-            fn_def.params = scoring_params
-
-        assert fn_def.params is not None, f"LLMAsJudgeparams not found for {fn_def}."
-        assert fn_def.params.prompt_template is not None, "LLM Judge prompt_template not found."
-        assert fn_def.params.judge_score_regexes is not None, "LLM Judge judge_score_regexes not found."
-
-        input_query = input_row["input_query"]
-        expected_answer = input_row["expected_answer"]
-        generated_answer = input_row["generated_answer"]
-
-        judge_input_msg = fn_def.params.prompt_template.format(
-            input_query=input_query,
-            expected_answer=expected_answer,
-            generated_answer=generated_answer,
-        )
-
-        params = OpenAIChatCompletionRequestWithExtraBody(
-            model=fn_def.params.judge_model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": judge_input_msg,
-                }
-            ],
-        )
-        judge_response = await self.inference_api.openai_chat_completion(params)
-        content = judge_response.choices[0].message.content
-        rating_regexes = fn_def.params.judge_score_regexes
-
-        judge_rating = None
-        for regex in rating_regexes:
-            match = re.search(regex, content)
-            if match:
-                judge_rating = match.group(1)
-                break
-
-        return {
-            "score": judge_rating,
-            "judge_feedback": content,
-        }
diff --git a/src/llama_stack/providers/registry/datasetio.py b/src/llama_stack/providers/registry/datasetio.py
deleted file mode 100644
index bfd7ede3c6..0000000000
--- a/src/llama_stack/providers/registry/datasetio.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack_api import (
-    Api,
-    InlineProviderSpec,
-    ProviderSpec,
-    RemoteProviderSpec,
-)
-
-
-def available_providers() -> list[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.datasetio,
-            provider_type="inline::localfs",
-            pip_packages=["pandas"],
-            module="llama_stack.providers.inline.datasetio.localfs",
-            config_class="llama_stack.providers.inline.datasetio.localfs.LocalFSDatasetIOConfig",
-            api_dependencies=[],
-            description="Local filesystem-based dataset I/O provider for reading and writing datasets to local storage.",
-        ),
-        RemoteProviderSpec(
-            api=Api.datasetio,
-            adapter_type="huggingface",
-            provider_type="remote::huggingface",
-            pip_packages=[
-                "datasets>=4.0.0",
-            ],
-            module="llama_stack.providers.remote.datasetio.huggingface",
-            config_class="llama_stack.providers.remote.datasetio.huggingface.HuggingfaceDatasetIOConfig",
-            description="HuggingFace datasets provider for accessing and managing datasets from the HuggingFace Hub.",
-        ),
-        RemoteProviderSpec(
-            api=Api.datasetio,
-            adapter_type="nvidia",
-            provider_type="remote::nvidia",
-            module="llama_stack.providers.remote.datasetio.nvidia",
-            config_class="llama_stack.providers.remote.datasetio.nvidia.NvidiaDatasetIOConfig",
-            pip_packages=[
-                "datasets>=4.0.0",
-            ],
-            description="NVIDIA's dataset I/O provider for accessing datasets from NVIDIA's data platform.",
-        ),
-    ]
diff --git a/src/llama_stack/providers/registry/eval.py b/src/llama_stack/providers/registry/eval.py
deleted file mode 100644
index 3d80b9aa0f..0000000000
--- a/src/llama_stack/providers/registry/eval.py
+++ /dev/null
@@ -1,46 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack_api import Api, InlineProviderSpec, ProviderSpec, RemoteProviderSpec
-
-
-def available_providers() -> list[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.eval,
-            provider_type="inline::builtin",
-            pip_packages=["tree_sitter", "pythainlp", "langdetect", "emoji", "nltk>=3.9.4"],
-            module="llama_stack.providers.inline.eval.builtin",
-            config_class="llama_stack.providers.inline.eval.builtin.BuiltinEvalConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.scoring,
-                Api.inference,
-                Api.agents,
-            ],
-            description="Meta's reference implementation of evaluation tasks with support for multiple languages and evaluation metrics.",
-        ),
-        RemoteProviderSpec(
-            api=Api.eval,
-            adapter_type="nvidia",
-            pip_packages=[
-                "requests",
-            ],
-            provider_type="remote::nvidia",
-            module="llama_stack.providers.remote.eval.nvidia",
-            config_class="llama_stack.providers.remote.eval.nvidia.NVIDIAEvalConfig",
-            description="NVIDIA's evaluation provider for running evaluation tasks on NVIDIA's platform.",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.scoring,
-                Api.inference,
-                Api.agents,
-            ],
-        ),
-    ]
diff --git a/src/llama_stack/providers/registry/scoring.py b/src/llama_stack/providers/registry/scoring.py
deleted file mode 100644
index 45c5dbed79..0000000000
--- a/src/llama_stack/providers/registry/scoring.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from llama_stack_api import Api, InlineProviderSpec, ProviderSpec
-
-
-def available_providers() -> list[ProviderSpec]:
-    return [
-        InlineProviderSpec(
-            api=Api.scoring,
-            provider_type="inline::basic",
-            pip_packages=["requests"],
-            module="llama_stack.providers.inline.scoring.basic",
-            config_class="llama_stack.providers.inline.scoring.basic.BasicScoringConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-            ],
-            description="Basic scoring provider for simple evaluation metrics and scoring functions.",
-        ),
-        InlineProviderSpec(
-            api=Api.scoring,
-            provider_type="inline::llm-as-judge",
-            pip_packages=[],
-            module="llama_stack.providers.inline.scoring.llm_as_judge",
-            config_class="llama_stack.providers.inline.scoring.llm_as_judge.LlmAsJudgeScoringConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-                Api.inference,
-            ],
-            description="LLM-as-judge scoring provider that uses language models to evaluate and score responses.",
-        ),
-        InlineProviderSpec(
-            api=Api.scoring,
-            provider_type="inline::braintrust",
-            pip_packages=["autoevals"],
-            module="llama_stack.providers.inline.scoring.braintrust",
-            config_class="llama_stack.providers.inline.scoring.braintrust.BraintrustScoringConfig",
-            api_dependencies=[
-                Api.datasetio,
-                Api.datasets,
-            ],
-            provider_data_validator="llama_stack.providers.inline.scoring.braintrust.BraintrustProviderDataValidator",
-            description="Braintrust scoring provider for evaluation and scoring using the Braintrust platform.",
-        ),
-    ]
diff --git a/src/llama_stack/providers/remote/datasetio/__init__.py b/src/llama_stack/providers/remote/datasetio/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/remote/datasetio/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/remote/datasetio/huggingface/__init__.py b/src/llama_stack/providers/remote/datasetio/huggingface/__init__.py
deleted file mode 100644
index db803d1838..0000000000
--- a/src/llama_stack/providers/remote/datasetio/huggingface/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .config import HuggingfaceDatasetIOConfig
-
-
-async def get_adapter_impl(
-    config: HuggingfaceDatasetIOConfig,
-    _deps,
-):
-    from .huggingface import HuggingfaceDatasetIOImpl
-
-    impl = HuggingfaceDatasetIOImpl(config)
-    await impl.initialize()
-    return impl
diff --git a/src/llama_stack/providers/remote/datasetio/huggingface/config.py b/src/llama_stack/providers/remote/datasetio/huggingface/config.py
deleted file mode 100644
index 35297cb585..0000000000
--- a/src/llama_stack/providers/remote/datasetio/huggingface/config.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from pydantic import BaseModel
-
-from llama_stack.core.storage.datatypes import KVStoreReference
-
-
-class HuggingfaceDatasetIOConfig(BaseModel):
-    kvstore: KVStoreReference
-
-    @classmethod
-    def sample_run_config(cls, __distro_dir__: str, **kwargs: Any) -> dict[str, Any]:
-        return {
-            "kvstore": KVStoreReference(
-                backend="kv_default",
-                namespace="datasetio::huggingface",
-            ).model_dump(exclude_none=True)
-        }
diff --git a/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py b/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
deleted file mode 100644
index 26390a63b6..0000000000
--- a/src/llama_stack/providers/remote/datasetio/huggingface/huggingface.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-from urllib.parse import parse_qs, urlparse
-
-from llama_stack.core.storage.kvstore import kvstore_impl
-from llama_stack.providers.utils.pagination import paginate_records
-from llama_stack_api import Dataset, DatasetIO, DatasetsProtocolPrivate, PaginatedResponse
-
-from .config import HuggingfaceDatasetIOConfig
-
-DATASETS_PREFIX = "datasets:"
-
-
-def parse_hf_params(dataset_def: Dataset):
-    uri = dataset_def.source.uri
-    parsed_uri = urlparse(uri)
-    params = parse_qs(parsed_uri.query)
-    params = {k: v[0] for k, v in params.items()}
-    path = parsed_uri.path.lstrip("/")
-
-    return path, params
-
-
-class HuggingfaceDatasetIOImpl(DatasetIO, DatasetsProtocolPrivate):
-    def __init__(self, config: HuggingfaceDatasetIOConfig) -> None:
-        self.config = config
-        # local registry for keeping track of datasets within the provider
-        self.dataset_infos = {}
-        self.kvstore = None
-
-    async def initialize(self) -> None:
-        self.kvstore = await kvstore_impl(self.config.kvstore)
-        # Load existing datasets from kvstore
-        start_key = DATASETS_PREFIX
-        end_key = f"{DATASETS_PREFIX}\xff"
-        stored_datasets = await self.kvstore.values_in_range(start_key, end_key)
-
-        for dataset in stored_datasets:
-            dataset = Dataset.model_validate_json(dataset)
-            self.dataset_infos[dataset.identifier] = dataset
-
-    async def shutdown(self) -> None: ...
-
-    async def register_dataset(
-        self,
-        dataset_def: Dataset,
-    ) -> None:
-        # Store in kvstore
-        key = f"{DATASETS_PREFIX}{dataset_def.identifier}"
-        await self.kvstore.set(
-            key=key,
-            value=dataset_def.model_dump_json(),
-        )
-        self.dataset_infos[dataset_def.identifier] = dataset_def
-
-    async def unregister_dataset(self, dataset_id: str) -> None:
-        key = f"{DATASETS_PREFIX}{dataset_id}"
-        await self.kvstore.delete(key=key)
-        del self.dataset_infos[dataset_id]
-
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        import datasets as hf_datasets
-
-        dataset_def = self.dataset_infos[dataset_id]
-        path, params = parse_hf_params(dataset_def)
-        loaded_dataset = hf_datasets.load_dataset(path, **params)
-
-        records = [loaded_dataset[i] for i in range(len(loaded_dataset))]
-        return paginate_records(records, start_index, limit)
-
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        import datasets as hf_datasets
-
-        dataset_def = self.dataset_infos[dataset_id]
-        path, params = parse_hf_params(dataset_def)
-        loaded_dataset = hf_datasets.load_dataset(path, **params)
-
-        # Convert rows to HF Dataset format
-        new_dataset = hf_datasets.Dataset.from_list(rows)
-
-        # Concatenate the new rows with existing dataset
-        updated_dataset = hf_datasets.concatenate_datasets([loaded_dataset, new_dataset])
-
-        if dataset_def.metadata.get("path", None):
-            updated_dataset.push_to_hub(dataset_def.metadata["path"])
-        else:
-            raise NotImplementedError("Uploading to URL-based datasets is not supported yet")
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/README.md b/src/llama_stack/providers/remote/datasetio/nvidia/README.md
deleted file mode 100644
index a872c61303..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/README.md
+++ /dev/null
@@ -1,74 +0,0 @@
-# NVIDIA DatasetIO Provider for LlamaStack
-
-This provider enables dataset management using NVIDIA's NeMo Customizer service.
-
-## Features
-
-- Register datasets for fine-tuning LLMs
-- Unregister datasets
-
-## Getting Started
-
-### Prerequisites
-
-- LlamaStack with NVIDIA configuration
-- Access to Hosted NVIDIA NeMo Microservice
-- API key for authentication with the NVIDIA service
-
-### Setup
-
-Build the NVIDIA environment:
-
-```bash
-uv pip install llama-stack-client
-uv run llama stack list-deps nvidia | xargs -L1 uv pip install
-```
-
-### Basic Usage using the LlamaStack Python Client
-
-#### Initialize the client
-
-```python
-import os
-
-os.environ["NVIDIA_API_KEY"] = "your-api-key"
-os.environ["NVIDIA_CUSTOMIZER_URL"] = "http://nemo.test"
-os.environ["NVIDIA_DATASET_NAMESPACE"] = "default"
-os.environ["NVIDIA_PROJECT_ID"] = "test-project"
-from llama_stack.core.library_client import LlamaStackAsLibraryClient
-
-client = LlamaStackAsLibraryClient("nvidia")
-client.initialize()
-```
-
-#### Register a dataset
-
-```python
-client.datasets.register(
-    purpose="eval/question-answer",
-    dataset_id="my-eval-dataset",
-    source={"type": "uri", "uri": "hf://datasets/default/sample-dataset"},
-    metadata={
-        "format": "json",
-        "description": "Dataset for evaluation",
-        "provider": "nvidia",
-    },
-)
-```
-
-#### Get a list of all registered datasets
-
-```python
-datasets = client.datasets.list()
-for dataset in datasets:
-    print(f"Dataset ID: {dataset.identifier}")
-    print(f"Description: {dataset.metadata.get('description', '')}")
-    print(f"Source: {dataset.source.uri}")
-    print("---")
-```
-
-#### Unregister a dataset
-
-```python
-client.datasets.unregister(dataset_id="my-training-dataset")
-```
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/__init__.py b/src/llama_stack/providers/remote/datasetio/nvidia/__init__.py
deleted file mode 100644
index 418daec8d8..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from .config import NvidiaDatasetIOConfig
-
-
-async def get_adapter_impl(
-    config: NvidiaDatasetIOConfig,
-    _deps,
-):
-    from .datasetio import NvidiaDatasetIOAdapter
-
-    if not isinstance(config, NvidiaDatasetIOConfig):
-        raise RuntimeError(f"Unexpected config type: {type(config)}")
-
-    impl = NvidiaDatasetIOAdapter(config)
-    return impl
-
-
-__all__ = ["get_adapter_impl", "NvidiaDatasetIOAdapter"]
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/config.py b/src/llama_stack/providers/remote/datasetio/nvidia/config.py
deleted file mode 100644
index addce6c1f0..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/config.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-import warnings
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-
-class NvidiaDatasetIOConfig(BaseModel):
-    """Configuration for NVIDIA DatasetIO implementation."""
-
-    api_key: str | None = Field(
-        default_factory=lambda: os.getenv("NVIDIA_API_KEY"),
-        description="The NVIDIA API key.",
-    )
-
-    dataset_namespace: str | None = Field(
-        default_factory=lambda: os.getenv("NVIDIA_DATASET_NAMESPACE", "default"),
-        description="The NVIDIA dataset namespace.",
-    )
-
-    project_id: str | None = Field(
-        default_factory=lambda: os.getenv("NVIDIA_PROJECT_ID", "test-project"),
-        description="The NVIDIA project ID.",
-    )
-
-    datasets_url: str = Field(
-        default_factory=lambda: os.getenv("NVIDIA_DATASETS_URL", "http://nemo.test"),
-        description="Base URL for the NeMo Dataset API",
-    )
-
-    # warning for default values
-    def __post_init__(self):
-        default_values = []
-        if os.getenv("NVIDIA_PROJECT_ID") is None:
-            default_values.append("project_id='test-project'")
-        if os.getenv("NVIDIA_DATASET_NAMESPACE") is None:
-            default_values.append("dataset_namespace='default'")
-        if os.getenv("NVIDIA_DATASETS_URL") is None:
-            default_values.append("datasets_url='http://nemo.test'")
-
-        if default_values:
-            warnings.warn(
-                f"Using default values: {', '.join(default_values)}. \
-                          Please set the environment variables to avoid this default behavior.",
-                stacklevel=2,
-            )
-
-    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
-        return {
-            "api_key": "${env.NVIDIA_API_KEY:=}",
-            "dataset_namespace": "${env.NVIDIA_DATASET_NAMESPACE:=default}",
-            "project_id": "${env.NVIDIA_PROJECT_ID:=test-project}",
-            "datasets_url": "${env.NVIDIA_DATASETS_URL:=http://nemo.test}",
-        }
diff --git a/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py b/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
deleted file mode 100644
index 2f5548fa96..0000000000
--- a/src/llama_stack/providers/remote/datasetio/nvidia/datasetio.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any
-
-import aiohttp
-
-from llama_stack_api import URL, Dataset, PaginatedResponse, ParamType
-
-from .config import NvidiaDatasetIOConfig
-
-
-class NvidiaDatasetIOAdapter:
-    """Nvidia NeMo DatasetIO API."""
-
-    def __init__(self, config: NvidiaDatasetIOConfig):
-        self.config = config
-        self.headers = {}
-
-    async def _make_request(
-        self,
-        method: str,
-        path: str,
-        headers: dict[str, Any] | None = None,
-        params: dict[str, Any] | None = None,
-        json: dict[str, Any] | None = None,
-        **kwargs,
-    ) -> dict[str, Any]:
-        """Helper method to make HTTP requests to the Customizer API."""
-        url = f"{self.config.datasets_url}{path}"
-        request_headers = self.headers.copy()
-
-        # Set default Content-Type for JSON requests
-        if json is not None:
-            request_headers["Content-Type"] = "application/json"
-
-        if headers:
-            request_headers.update(headers)
-
-        async with aiohttp.ClientSession(headers=request_headers) as session:
-            async with session.request(method, url, params=params, json=json, **kwargs) as response:
-                if response.status != 200:
-                    error_data = await response.json()
-                    raise Exception(f"API request failed: {error_data}")
-                return await response.json()
-
-    async def register_dataset(
-        self,
-        dataset_def: Dataset,
-    ) -> Dataset:
-        """Register a new dataset.
-
-        Args:
-            dataset_def [Dataset]: The dataset definition.
-                dataset_id [str]: The ID of the dataset.
-                source [DataSource]: The source of the dataset.
-                metadata [Dict[str, Any]]: The metadata of the dataset.
-                    format [str]: The format of the dataset.
-                    description [str]: The description of the dataset.
-        Returns:
-            Dataset
-        """
-        # add warnings for unsupported params
-        request_body = {
-            "name": dataset_def.identifier,
-            "namespace": self.config.dataset_namespace,
-            "files_url": dataset_def.source.uri,
-            "project": self.config.project_id,
-        }
-        if dataset_def.metadata:
-            request_body["format"] = dataset_def.metadata.get("format")
-            request_body["description"] = dataset_def.metadata.get("description")
-        await self._make_request(
-            "POST",
-            "/v1/datasets",
-            json=request_body,
-        )
-        return dataset_def
-
-    async def update_dataset(
-        self,
-        dataset_id: str,
-        dataset_schema: dict[str, ParamType],
-        url: URL,
-        provider_dataset_id: str | None = None,
-        provider_id: str | None = None,
-        metadata: dict[str, Any] | None = None,
-    ) -> None:
-        raise NotImplementedError("Not implemented")
-
-    async def unregister_dataset(
-        self,
-        dataset_id: str,
-    ) -> None:
-        await self._make_request(
-            "DELETE",
-            f"/v1/datasets/{self.config.dataset_namespace}/{dataset_id}",
-            headers={"Accept": "application/json", "Content-Type": "application/json"},
-        )
-
-    async def iterrows(
-        self,
-        dataset_id: str,
-        start_index: int | None = None,
-        limit: int | None = None,
-    ) -> PaginatedResponse:
-        raise NotImplementedError("Not implemented")
-
-    async def append_rows(self, dataset_id: str, rows: list[dict[str, Any]]) -> None:
-        raise NotImplementedError("Not implemented")
diff --git a/src/llama_stack/providers/remote/eval/__init__.py b/src/llama_stack/providers/remote/eval/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/remote/eval/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/remote/eval/nvidia/README.md b/src/llama_stack/providers/remote/eval/nvidia/README.md
deleted file mode 100644
index 4443d484be..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/README.md
+++ /dev/null
@@ -1,142 +0,0 @@
-# NVIDIA NeMo Evaluator Eval Provider
-
-## Overview
-
-For the first integration, Benchmarks are mapped to Evaluation Configs on in the NeMo Evaluator. The full evaluation config object is provided as part of the meta-data. The `dataset_id` and `scoring_functions` are not used.
-
-Below are a few examples of how to register a benchmark, which in turn will create an evaluation config in NeMo Evaluator and how to trigger an evaluation.
-
-### Example for register an academic benchmark
-
-```text
-POST /eval/benchmarks
-```
-
-```json
-{
-  "benchmark_id": "mmlu",
-  "dataset_id": "",
-  "scoring_functions": [],
-  "metadata": {
-    "type": "mmlu"
-  }
-}
-```
-
-### Example for register a custom evaluation
-
-```text
-POST /eval/benchmarks
-```
-
-```json
-{
-  "benchmark_id": "my-custom-benchmark",
-  "dataset_id": "",
-  "scoring_functions": [],
-  "metadata": {
-    "type": "custom",
-    "params": {
-      "parallelism": 8
-    },
-    "tasks": {
-      "qa": {
-        "type": "completion",
-        "params": {
-          "template": {
-            "prompt": "{{prompt}}",
-            "max_tokens": 200
-          }
-        },
-        "dataset": {
-          "files_url": "hf://datasets/default/sample-basic-test/testing/testing.jsonl"
-        },
-        "metrics": {
-          "bleu": {
-            "type": "bleu",
-            "params": {
-              "references": [
-                "{{ideal_response}}"
-              ]
-            }
-          }
-        }
-      }
-    }
-  }
-}
-```
-
-### Example for triggering a benchmark/custom evaluation
-
-```text
-POST /eval/benchmarks/{benchmark_id}/jobs
-```
-
-```json
-{
-  "benchmark_id": "my-custom-benchmark",
-  "benchmark_config": {
-    "eval_candidate": {
-      "type": "model",
-      "model": "meta-llama/Llama3.1-8B-Instruct",
-      "sampling_params": {
-        "max_tokens": 100,
-        "temperature": 0.7
-      }
-    },
-    "scoring_params": {}
-  }
-}
-```
-
-Response example:
-
-```json
-{
-    "job_id": "eval-1234",
-    "status": "in_progress"
-}
-```
-
-### Example for getting the status of a job
-
-```text
-GET /eval/benchmarks/{benchmark_id}/jobs/{job_id}
-```
-
-Response example:
-
-```json
-{
-  "job_id": "eval-1234",
-  "status": "in_progress"
-}
-```
-
-### Example for cancelling a job
-
-```text
-POST /eval/benchmarks/{benchmark_id}/jobs/{job_id}/cancel
-```
-
-### Example for getting the results
-
-```text
-GET /eval/benchmarks/{benchmark_id}/results
-```
-
-```json
-{
-  "generations": [],
-  "scores": {
-    "{benchmark_id}": {
-      "score_rows": [],
-      "aggregated_results": {
-        "tasks": {},
-        "groups": {}
-      }
-    }
-  }
-}
-```
diff --git a/src/llama_stack/providers/remote/eval/nvidia/__init__.py b/src/llama_stack/providers/remote/eval/nvidia/__init__.py
deleted file mode 100644
index 1314fdb83e..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/__init__.py
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-
-from .config import NVIDIAEvalConfig
-
-
-async def get_adapter_impl(
-    config: NVIDIAEvalConfig,
-    deps: dict[Api, Any],
-):
-    from .eval import NVIDIAEvalImpl
-
-    impl = NVIDIAEvalImpl(
-        config,
-        deps[Api.datasetio],
-        deps[Api.datasets],
-        deps[Api.scoring],
-        deps[Api.inference],
-        deps[Api.agents],
-    )
-    await impl.initialize()
-    return impl
-
-
-__all__ = ["get_adapter_impl", "NVIDIAEvalImpl"]
diff --git a/src/llama_stack/providers/remote/eval/nvidia/config.py b/src/llama_stack/providers/remote/eval/nvidia/config.py
deleted file mode 100644
index 7a1c04304f..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/config.py
+++ /dev/null
@@ -1,29 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import os
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-
-class NVIDIAEvalConfig(BaseModel):
-    """
-     Configuration for the NVIDIA NeMo Evaluator microservice endpoint.
-
-    Attributes:
-        evaluator_url (str): A base url for accessing the NVIDIA evaluation endpoint, e.g. http://localhost:8000.
-    """
-
-    evaluator_url: str = Field(
-        default_factory=lambda: os.getenv("NVIDIA_EVALUATOR_URL", "http://0.0.0.0:7331"),
-        description="The url for accessing the evaluator service",
-    )
-
-    @classmethod
-    def sample_run_config(cls, **kwargs) -> dict[str, Any]:
-        return {
-            "evaluator_url": "${env.NVIDIA_EVALUATOR_URL:=http://localhost:7331}",
-        }
diff --git a/src/llama_stack/providers/remote/eval/nvidia/eval.py b/src/llama_stack/providers/remote/eval/nvidia/eval.py
deleted file mode 100644
index d8cdd78ce5..0000000000
--- a/src/llama_stack/providers/remote/eval/nvidia/eval.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from typing import Any
-
-import httpx
-
-from llama_stack.providers.utils.inference.model_registry import ModelRegistryHelper
-from llama_stack_api import (
-    Agents,
-    Benchmark,
-    BenchmarksProtocolPrivate,
-    DatasetIO,
-    Datasets,
-    Eval,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    Inference,
-    Job,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatus,
-    JobStatusRequest,
-    RunEvalRequest,
-    Scoring,
-    ScoringResult,
-)
-
-from .config import NVIDIAEvalConfig
-
-DEFAULT_NAMESPACE = "nvidia"
-
-
-class NVIDIAEvalImpl(
-    Eval,
-    BenchmarksProtocolPrivate,
-    ModelRegistryHelper,
-):
-    def __init__(
-        self,
-        config: NVIDIAEvalConfig,
-        datasetio_api: DatasetIO,
-        datasets_api: Datasets,
-        scoring_api: Scoring,
-        inference_api: Inference,
-        agents_api: Agents,
-    ) -> None:
-        self.config = config
-        self.datasetio_api = datasetio_api
-        self.datasets_api = datasets_api
-        self.scoring_api = scoring_api
-        self.inference_api = inference_api
-        self.agents_api = agents_api
-
-        ModelRegistryHelper.__init__(self)
-        self._client: httpx.AsyncClient | None = None
-
-    @property
-    def client(self) -> httpx.AsyncClient:
-        if self._client is None:
-            raise RuntimeError("Client not initialized. Call initialize() first.")
-        return self._client
-
-    async def initialize(self) -> None:
-        self._client = httpx.AsyncClient(timeout=httpx.Timeout(30.0))
-
-    async def shutdown(self) -> None:
-        if self._client:
-            await self._client.aclose()
-
-    async def _evaluator_get(self, path: str):
-        """Helper for making GET requests to the evaluator service."""
-        response = await self.client.get(url=f"{self.config.evaluator_url}{path}")
-        response.raise_for_status()
-        return response.json()
-
-    async def _evaluator_post(self, path: str, data: dict[str, Any]):
-        """Helper for making POST requests to the evaluator service."""
-        response = await self.client.post(url=f"{self.config.evaluator_url}{path}", json=data)
-        response.raise_for_status()
-        return response.json()
-
-    async def _evaluator_delete(self, path: str) -> None:
-        """Helper for making DELETE requests to the evaluator service."""
-        response = await self.client.delete(url=f"{self.config.evaluator_url}{path}")
-        response.raise_for_status()
-
-    async def register_benchmark(self, task_def: Benchmark) -> None:
-        """Register a benchmark as an evaluation configuration."""
-        await self._evaluator_post(
-            "/v1/evaluation/configs",
-            {
-                "namespace": DEFAULT_NAMESPACE,
-                "name": task_def.benchmark_id,
-                # metadata is copied to request body as-is
-                **task_def.metadata,
-            },
-        )
-
-    async def unregister_benchmark(self, benchmark_id: str) -> None:
-        """Unregister a benchmark evaluation configuration from NeMo Evaluator."""
-        await self._evaluator_delete(f"/v1/evaluation/configs/{DEFAULT_NAMESPACE}/{benchmark_id}")
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest,
-    ) -> Job:
-        """Run an evaluation job for a benchmark."""
-        model = (
-            request.benchmark_config.eval_candidate.model
-            if request.benchmark_config.eval_candidate.type == "model"
-            else request.benchmark_config.eval_candidate.config.model
-        )
-        nvidia_model = self.get_provider_model_id(model) or model
-
-        result = await self._evaluator_post(
-            "/v1/evaluation/jobs",
-            {
-                "config": f"{DEFAULT_NAMESPACE}/{request.benchmark_id}",
-                "target": {"type": "model", "model": nvidia_model},
-            },
-        )
-
-        return Job(job_id=result["id"], status=JobStatus.in_progress)
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest,
-    ) -> EvaluateResponse:
-        raise NotImplementedError()
-
-    async def job_status(self, request: JobStatusRequest) -> Job:
-        """Get the status of an evaluation job.
-
-        EvaluatorStatus: "created", "pending", "running", "cancelled", "cancelling", "failed", "completed".
-        JobStatus: "scheduled", "in_progress", "completed", "cancelled", "failed"
-        """
-        result = await self._evaluator_get(f"/v1/evaluation/jobs/{request.job_id}")
-        result_status = result["status"]
-
-        job_status = JobStatus.failed
-        if result_status in ["created", "pending"]:
-            job_status = JobStatus.scheduled
-        elif result_status in ["running"]:
-            job_status = JobStatus.in_progress
-        elif result_status in ["completed"]:
-            job_status = JobStatus.completed
-        elif result_status in ["cancelled"]:
-            job_status = JobStatus.cancelled
-
-        return Job(job_id=request.job_id, status=job_status)
-
-    async def job_cancel(self, request: JobCancelRequest) -> None:
-        """Cancel the evaluation job."""
-        await self._evaluator_post(f"/v1/evaluation/jobs/{request.job_id}/cancel", {})
-
-    async def job_result(self, request: JobResultRequest) -> EvaluateResponse:
-        """Returns the results of the evaluation job."""
-
-        job_status_request = JobStatusRequest(benchmark_id=request.benchmark_id, job_id=request.job_id)
-        job = await self.job_status(job_status_request)
-        status = job.status
-        if not status or status != JobStatus.completed:
-            raise ValueError(f"Job {request.job_id} not completed. Status: {status.value}")
-
-        result = await self._evaluator_get(f"/v1/evaluation/jobs/{request.job_id}/results")
-
-        return EvaluateResponse(
-            # TODO: these are stored in detailed results on NeMo Evaluator side; can be added
-            generations=[],
-            scores={
-                request.benchmark_id: ScoringResult(
-                    score_rows=[],
-                    aggregated_results=result,
-                )
-            },
-        )
diff --git a/src/llama_stack/providers/utils/common/data_schema_validator.py b/src/llama_stack/providers/utils/common/data_schema_validator.py
deleted file mode 100644
index c9a3b0920f..0000000000
--- a/src/llama_stack/providers/utils/common/data_schema_validator.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from enum import Enum
-from typing import Any
-
-from llama_stack.core.datatypes import Api
-from llama_stack_api import ChatCompletionInputType, CompletionInputType, StringType
-
-
-class ColumnName(Enum):
-    input_query = "input_query"
-    expected_answer = "expected_answer"
-    chat_completion_input = "chat_completion_input"
-    completion_input = "completion_input"
-    generated_answer = "generated_answer"
-    context = "context"
-    dialog = "dialog"
-    function = "function"
-    language = "language"
-    id = "id"
-    ground_truth = "ground_truth"
-
-
-VALID_SCHEMAS_FOR_SCORING = [
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-        ColumnName.context.value: StringType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-        ColumnName.function.value: StringType(),
-        ColumnName.language.value: StringType(),
-        ColumnName.id.value: StringType(),
-        ColumnName.ground_truth.value: StringType(),
-    },
-]
-
-VALID_SCHEMAS_FOR_EVAL = [
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.chat_completion_input.value: ChatCompletionInputType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.completion_input.value: CompletionInputType(),
-    },
-    {
-        ColumnName.input_query.value: StringType(),
-        ColumnName.expected_answer.value: StringType(),
-        ColumnName.generated_answer.value: StringType(),
-        ColumnName.function.value: StringType(),
-        ColumnName.language.value: StringType(),
-        ColumnName.id.value: StringType(),
-        ColumnName.ground_truth.value: StringType(),
-    },
-]
-
-
-def get_valid_schemas(api_str: str):
-    if api_str == Api.scoring.value:
-        return VALID_SCHEMAS_FOR_SCORING
-    elif api_str == Api.eval.value:
-        return VALID_SCHEMAS_FOR_EVAL
-    else:
-        raise ValueError(f"Invalid API string: {api_str}")
-
-
-def validate_dataset_schema(
-    dataset_schema: dict[str, Any],
-    expected_schemas: list[dict[str, Any]],
-):
-    if dataset_schema not in expected_schemas:
-        raise ValueError(f"Dataset {dataset_schema} does not have a correct input schema in {expected_schemas}")
-
-
-def validate_row_schema(
-    input_row: dict[str, Any],
-    expected_schemas: list[dict[str, Any]],
-):
-    for schema in expected_schemas:
-        if all(key in input_row for key in schema):
-            return
-
-    raise ValueError(f"Input row {input_row} does not match any of the expected schemas in {expected_schemas}")
diff --git a/src/llama_stack/providers/utils/datasetio/__init__.py b/src/llama_stack/providers/utils/datasetio/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/utils/datasetio/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/utils/datasetio/url_utils.py b/src/llama_stack/providers/utils/datasetio/url_utils.py
deleted file mode 100644
index fe78366414..0000000000
--- a/src/llama_stack/providers/utils/datasetio/url_utils.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import asyncio
-import base64
-import io
-from urllib.parse import unquote
-
-from llama_stack.providers.utils.common.data_url import parse_data_url
-
-
-async def get_dataframe_from_uri(uri: str):
-    import pandas
-
-    df = None
-    if uri.endswith(".csv"):
-        # Moving to its own thread to avoid io from blocking the eventloop
-        # This isn't ideal as it moves more then just the IO to a new thread
-        # but it is as close as we can easly get
-        df = await asyncio.to_thread(pandas.read_csv, uri)
-    elif uri.endswith(".xlsx"):
-        df = await asyncio.to_thread(pandas.read_excel, uri)
-    elif uri.startswith("data:"):
-        parts = parse_data_url(uri)
-        data = parts["data"]
-        if parts["is_base64"]:
-            data = base64.b64decode(data)
-        else:
-            data = unquote(data)
-            encoding = parts["encoding"] or "utf-8"
-            data = data.encode(encoding)
-
-        mime_type = parts["mimetype"]
-        mime_category = mime_type.split("/")[0]
-        data_bytes = io.BytesIO(data)
-
-        if mime_category == "text":
-            df = pandas.read_csv(data_bytes)
-        else:
-            df = pandas.read_excel(data_bytes)
-    else:
-        raise ValueError(f"Unsupported file type: {uri}")
-
-    return df
diff --git a/src/llama_stack/providers/utils/scoring/__init__.py b/src/llama_stack/providers/utils/scoring/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/src/llama_stack/providers/utils/scoring/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/src/llama_stack/providers/utils/scoring/aggregation_utils.py b/src/llama_stack/providers/utils/scoring/aggregation_utils.py
deleted file mode 100644
index aa6fe7248f..0000000000
--- a/src/llama_stack/providers/utils/scoring/aggregation_utils.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import statistics
-from typing import Any
-
-from llama_stack_api import AggregationFunctionType, ScoringResultRow
-
-
-def aggregate_accuracy(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    num_correct = sum(result["score"] for result in scoring_results)
-    avg_score = num_correct / len(scoring_results)
-
-    return {
-        "accuracy": avg_score,
-        "num_correct": num_correct,
-        "num_total": len(scoring_results),
-    }
-
-
-def aggregate_average(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    return {
-        "average": sum(result["score"] for result in scoring_results if result["score"] is not None)
-        / len([_ for _ in scoring_results if _["score"] is not None]),
-    }
-
-
-def aggregate_weighted_average(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    return {
-        "weighted_average": sum(
-            result["score"] * result["weight"]
-            for result in scoring_results
-            if result["score"] is not None and result["weight"] is not None
-        )
-        / sum(result["weight"] for result in scoring_results if result["weight"] is not None),
-    }
-
-
-def aggregate_categorical_count(
-    scoring_results: list[ScoringResultRow],
-) -> dict[str, Any]:
-    scores = [str(r["score"]) for r in scoring_results]
-    unique_scores = sorted(set(scores))
-    return {"categorical_count": {s: scores.count(s) for s in unique_scores}}
-
-
-def aggregate_median(scoring_results: list[ScoringResultRow]) -> dict[str, Any]:
-    scores = [r["score"] for r in scoring_results if r["score"] is not None]
-    median = statistics.median(scores) if scores else None
-    return {"median": median}
-
-
-# TODO: decide whether we want to make aggregation functions as a registerable resource
-AGGREGATION_FUNCTIONS = {
-    AggregationFunctionType.accuracy: aggregate_accuracy,
-    AggregationFunctionType.average: aggregate_average,
-    AggregationFunctionType.weighted_average: aggregate_weighted_average,
-    AggregationFunctionType.categorical_count: aggregate_categorical_count,
-    AggregationFunctionType.median: aggregate_median,
-}
-
-
-def aggregate_metrics(
-    scoring_results: list[ScoringResultRow], metrics: list[AggregationFunctionType]
-) -> dict[str, Any]:
-    agg_results = {}
-    for metric in metrics:
-        if metric not in AGGREGATION_FUNCTIONS:
-            raise ValueError(f"Aggregation function {metric} not found")
-        agg_fn = AGGREGATION_FUNCTIONS[metric]
-        agg_results[metric] = agg_fn(scoring_results)
-    return agg_results
diff --git a/src/llama_stack/providers/utils/scoring/base_scoring_fn.py b/src/llama_stack/providers/utils/scoring/base_scoring_fn.py
deleted file mode 100644
index f372db8b52..0000000000
--- a/src/llama_stack/providers/utils/scoring/base_scoring_fn.py
+++ /dev/null
@@ -1,113 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-from abc import ABC, abstractmethod
-from typing import Any
-
-from llama_stack.providers.utils.scoring.aggregation_utils import aggregate_metrics
-from llama_stack_api import ScoringFn, ScoringFnParams, ScoringResultRow
-
-
-class BaseScoringFn(ABC):
-    """
-    Base interface class for Scoring Functions.
-    Each scoring function needs to implement the following methods:
-    - score_row(self, row)
-    - aggregate(self, scoring_fn_results)
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-    def __str__(self) -> str:
-        return self.__class__.__name__
-
-    @abstractmethod
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        raise NotImplementedError()
-
-    @abstractmethod
-    async def aggregate(
-        self,
-        scoring_results: list[ScoringResultRow],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> dict[str, Any]:
-        raise NotImplementedError()
-
-    @abstractmethod
-    async def score(
-        self,
-        input_rows: list[dict[str, Any]],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> list[ScoringResultRow]:
-        raise NotImplementedError()
-
-
-class RegisteredBaseScoringFn(BaseScoringFn):
-    """
-    Interface for native scoring functions that are registered in LlamaStack.
-    """
-
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-        self.supported_fn_defs_registry = {}
-
-    def __str__(self) -> str:
-        return self.__class__.__name__
-
-    def get_supported_scoring_fn_defs(self) -> list[ScoringFn]:
-        return list(self.supported_fn_defs_registry.values())
-
-    def register_scoring_fn_def(self, scoring_fn: ScoringFn) -> None:
-        if scoring_fn.identifier in self.supported_fn_defs_registry:
-            raise ValueError(f"Scoring function def with identifier {scoring_fn.identifier} already exists.")
-        self.supported_fn_defs_registry[scoring_fn.identifier] = scoring_fn
-
-    def unregister_scoring_fn_def(self, scoring_fn_id: str) -> None:
-        if scoring_fn_id not in self.supported_fn_defs_registry:
-            raise ValueError(f"Scoring function def with identifier {scoring_fn_id} does not exist.")
-        del self.supported_fn_defs_registry[scoring_fn_id]
-
-    @abstractmethod
-    async def score_row(
-        self,
-        input_row: dict[str, Any],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> ScoringResultRow:
-        raise NotImplementedError()
-
-    async def aggregate(
-        self,
-        scoring_results: list[ScoringResultRow],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> dict[str, Any]:
-        params = self.supported_fn_defs_registry[scoring_fn_identifier].params
-        if scoring_params is not None:
-            if params is None:
-                params = scoring_params
-            else:
-                params.aggregation_functions = scoring_params.aggregation_functions
-
-        aggregation_functions = []
-        if params and hasattr(params, "aggregation_functions") and params.aggregation_functions:
-            aggregation_functions.extend(params.aggregation_functions)
-        return aggregate_metrics(scoring_results, aggregation_functions)
-
-    async def score(
-        self,
-        input_rows: list[dict[str, Any]],
-        scoring_fn_identifier: str | None = None,
-        scoring_params: ScoringFnParams | None = None,
-    ) -> list[ScoringResultRow]:
-        return [await self.score_row(input_row, scoring_fn_identifier, scoring_params) for input_row in input_rows]
diff --git a/src/llama_stack/providers/utils/scoring/basic_scoring_utils.py b/src/llama_stack/providers/utils/scoring/basic_scoring_utils.py
deleted file mode 100644
index 7372a521c1..0000000000
--- a/src/llama_stack/providers/utils/scoring/basic_scoring_utils.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import contextlib
-import signal
-from collections.abc import Iterator
-from types import FrameType
-
-
-class TimeoutError(Exception):
-    pass
-
-
-@contextlib.contextmanager
-def time_limit(seconds: float) -> Iterator[None]:
-    def signal_handler(signum: int, frame: FrameType | None) -> None:
-        raise TimeoutError("Timed out!")
-
-    signal.setitimer(signal.ITIMER_REAL, seconds)
-    signal.signal(signal.SIGALRM, signal_handler)
-    try:
-        yield
-    finally:
-        signal.setitimer(signal.ITIMER_REAL, 0)
diff --git a/src/llama_stack_api/__init__.py b/src/llama_stack_api/__init__.py
index d4bdd6df8d..0be5fffe78 100644
--- a/src/llama_stack_api/__init__.py
+++ b/src/llama_stack_api/__init__.py
@@ -73,17 +73,6 @@
     ListBatchesResponse,
     RetrieveBatchRequest,
 )
-from .benchmarks import (
-    Benchmark,
-    BenchmarkInput,
-    Benchmarks,
-    CommonBenchmarkFields,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
 
 # Import commonly used types from common submodule
 from .common.content_types import (
@@ -101,7 +90,6 @@
     ConnectorToolNotFoundError,
     ConversationItemNotFoundError,
     ConversationNotFoundError,
-    DatasetNotFoundError,
     InternalServerError,
     InvalidParameterError,
     ModelNotFoundError,
@@ -156,29 +144,8 @@
     RetrieveItemRequest,
     UpdateConversationRequest,
 )
-from .datasetio import (
-    AppendRowsParams,
-    AppendRowsRequest,
-    DatasetIO,
-    DatasetStore,
-    IterRowsRequest,
-)
-from .datasets import (
-    CommonDatasetFields,
-    Dataset,
-    DatasetInput,
-    DatasetPurpose,
-    Datasets,
-    DatasetType,
-    DataSource,
-    ListDatasetsResponse,
-    RowsDataSource,
-    URIDataSource,
-)
 from .datatypes import (
     Api,
-    BenchmarksProtocolPrivate,
-    DatasetsProtocolPrivate,
     DynamicApiMeta,
     Error,
     ExternalApiSpec,
@@ -190,32 +157,10 @@
     RemoteProviderConfig,
     RemoteProviderSpec,
     RoutingTable,
-    ScoringFunctionsProtocolPrivate,
     ShieldsProtocolPrivate,
     ToolGroupsProtocolPrivate,
     VectorStoresProtocolPrivate,
 )
-from .eval import (
-    BenchmarkConfig,
-    BenchmarkIdRequest,
-    Eval,
-    EvalCandidate,
-    EvaluateResponse,
-    EvaluateRowsBodyRequest,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    ModelCandidate,
-    RunEvalBodyRequest,
-    RunEvalRequest,
-    # Backward compatibility helpers
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
 from .file_processors import FileProcessors, ProcessFileRequest, ProcessFileResponse
 from .filters import COMPARISON_FILTER_TYPES, COMPOUND_FILTER_TYPES, ComparisonFilter, CompoundFilter, Filter
 from .files import (
@@ -470,33 +415,6 @@
     ViolationLevel,
 )
 
-from .scoring import (
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    Scoring,
-    ScoringFunctionStore,
-    ScoringResult,
-    ScoringResultRow,
-)
-from .scoring_functions import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    CommonScoringFnFields,
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    LLMAsJudgeScoringFnParams,
-    RegexParserScoringFnParams,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    ScoringFnInput,
-    ScoringFnParams,
-    ScoringFnParamsType,
-    ScoringFunctions,
-    UnregisterScoringFunctionRequest,
-)
 from .shields import (
     CommonShieldFields,
     GetShieldRequest,
@@ -586,7 +504,6 @@
     "LLAMA_STACK_API_V1BETA",
     # API Symbols
     "Agents",
-    "AggregationFunctionType",
     # Agents Request Models
     "CreateResponseRequest",
     "DeleteResponseRequest",
@@ -597,18 +514,12 @@
     "Api",
     "ApiFilter",
     "ApprovalFilter",
-    "BasicScoringFnParams",
     "Batches",
     "BatchNotFoundError",
     "BatchObject",
     "CancelBatchRequest",
     "CreateBatchRequest",
     "ListBatchesRequest",
-    "Benchmark",
-    "BenchmarkConfig",
-    "BenchmarkInput",
-    "Benchmarks",
-    "BenchmarksProtocolPrivate",
     "Bf16QuantizationConfig",
     "CallableT",
     "ChatCompletionInputType",
@@ -620,11 +531,8 @@
     "DEFAULT_CHUNK_SIZE_TOKENS",
     "DeleteChunksRequest",
     "EmbeddedChunk",
-    "CommonBenchmarkFields",
     "ConflictError",
-    "CommonDatasetFields",
     "CommonModelFields",
-    "CommonScoringFnFields",
     "CommonShieldFields",
     "CompletionInputType",
     "CompletionRequest",
@@ -653,19 +561,6 @@
     "ListItemsRequest",
     "RetrieveItemRequest",
     "UpdateConversationRequest",
-    "DataSource",
-    "Dataset",
-    "DatasetIO",
-    "DatasetInput",
-    "DatasetPurpose",
-    "DatasetNotFoundError",
-    "DatasetStore",
-    "DatasetType",
-    "AppendRowsParams",
-    "AppendRowsRequest",
-    "IterRowsRequest",
-    "Datasets",
-    "DatasetsProtocolPrivate",
     "DefaultRAGQueryGeneratorConfig",
     "DeleteFileRequest",
     "Docstring",
@@ -673,23 +568,6 @@
     "EmbeddingTaskType",
     "EmbeddingsResponse",
     "Error",
-    "Eval",
-    "EvalCandidate",
-    "EvaluateResponse",
-    "EvaluateRowsBodyRequest",
-    "EvaluateRowsRequest",
-    "BenchmarkIdRequest",
-    "JobCancelRequest",
-    "JobResultRequest",
-    "JobStatusRequest",
-    "RunEvalBodyRequest",
-    "RunEvalRequest",
-    # Backward compatibility helpers
-    "resolve_run_eval_request",
-    "resolve_evaluate_rows_request",
-    "resolve_job_status_request",
-    "resolve_job_cancel_request",
-    "resolve_job_result_request",
     "ExpiresAfter",
     "ExternalApiSpec",
     "ExtraBodyField",
@@ -737,20 +615,13 @@
     "JsonSchemaGenerator",
     "JsonSchemaResponseFormat",
     "JsonType",
-    "LLMAsJudgeScoringFnParams",
     "LLMRAGQueryGeneratorConfig",
     "ListBatchesResponse",
     "RetrieveBatchRequest",
-    "GetBenchmarkRequest",
-    "ListBenchmarksRequest",
-    "ListBenchmarksResponse",
-    "RegisterBenchmarkRequest",
-    "UnregisterBenchmarkRequest",
     "GetConnectorRequest",
     "GetConnectorToolRequest",
     "ListConnectorToolsRequest",
     "ListConnectorsResponse",
-    "ListDatasetsResponse",
     "ListFilesRequest",
     "ListModelsResponse",
     "GetChatCompletionRequest",
@@ -763,7 +634,6 @@
     "ListProvidersResponse",
     "ListRoutesRequest",
     "ListRoutesResponse",
-    "ListScoringFunctionsResponse",
     "ListShieldsResponse",
     "ListToolDefsResponse",
     "ListToolGroupsResponse",
@@ -774,7 +644,6 @@
     "MCPListToolsTool",
     "Metadata",
     "Model",
-    "ModelCandidate",
     "ModelInput",
     "ModelNotFoundError",
     "ModelStore",
@@ -976,7 +845,6 @@
     "register_schema",
     "RRFRanker",
     "Ranker",
-    "RegexParserScoringFnParams",
     "RemoteProviderConfig",
     "RemoteProviderSpec",
     "RerankData",
@@ -998,7 +866,6 @@
     "RetrieveFileRequest",
     "RouteInfo",
     "RoutingTable",
-    "RowsDataSource",
     "RunModerationRequest",
     "RunShieldRequest",
     "RunShieldResponse",
@@ -1006,24 +873,6 @@
     "SafetyViolation",
     "SamplingParams",
     "SamplingStrategy",
-    "ScoreBatchRequest",
-    "ScoreBatchResponse",
-    "ScoreRequest",
-    "ScoreResponse",
-    "Scoring",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ScoringFnParams",
-    "ScoringFnParamsType",
-    "ScoringFunctionStore",
-    "ScoringFunctions",
-    "ScoringFunctionsProtocolPrivate",
-    "ScoringResult",
-    "GetScoringFunctionRequest",
-    "ListScoringFunctionsRequest",
-    "RegisterScoringFunctionRequest",
-    "UnregisterScoringFunctionRequest",
-    "ScoringResultRow",
     "Schema",
     "SchemaInfo",
     "SchemaOptions",
@@ -1064,7 +913,6 @@
     "unwrap_optional_type",
     "unwrap_union_types",
     "UploadFileRequest",
-    "URIDataSource",
     "URL",
     "_URLOrData",
     "UserMessage",
diff --git a/src/llama_stack_api/benchmarks/__init__.py b/src/llama_stack_api/benchmarks/__init__.py
deleted file mode 100644
index 9c5652dce2..0000000000
--- a/src/llama_stack_api/benchmarks/__init__.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Benchmarks API protocol and models.
-
-This module contains the Benchmarks protocol definition.
-Pydantic models are defined in llama_stack_api.benchmarks.models.
-The FastAPI router is defined in llama_stack_api.benchmarks.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-from . import fastapi_routes
-
-# Import protocol for re-export
-from .api import Benchmarks
-
-# Import models for re-export
-from .models import (
-    Benchmark,
-    BenchmarkInput,
-    CommonBenchmarkFields,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-__all__ = [
-    "Benchmarks",
-    "Benchmark",
-    "BenchmarkInput",
-    "CommonBenchmarkFields",
-    "ListBenchmarksResponse",
-    "ListBenchmarksRequest",
-    "GetBenchmarkRequest",
-    "RegisterBenchmarkRequest",
-    "UnregisterBenchmarkRequest",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/benchmarks/api.py b/src/llama_stack_api/benchmarks/api.py
deleted file mode 100644
index 26f88dbe29..0000000000
--- a/src/llama_stack_api/benchmarks/api.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from .models import (
-    Benchmark,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-
-@runtime_checkable
-class Benchmarks(Protocol):
-    async def list_benchmarks(
-        self,
-        request: ListBenchmarksRequest,
-    ) -> ListBenchmarksResponse: ...
-
-    async def get_benchmark(
-        self,
-        request: GetBenchmarkRequest,
-    ) -> Benchmark: ...
-
-    async def register_benchmark(
-        self,
-        request: RegisterBenchmarkRequest,
-    ) -> None: ...
-
-    async def unregister_benchmark(
-        self,
-        request: UnregisterBenchmarkRequest,
-    ) -> None: ...
diff --git a/src/llama_stack_api/benchmarks/fastapi_routes.py b/src/llama_stack_api/benchmarks/fastapi_routes.py
deleted file mode 100644
index 461939ab95..0000000000
--- a/src/llama_stack_api/benchmarks/fastapi_routes.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the Benchmarks API.
-
-This module defines the FastAPI router for the Benchmarks API using standard
-FastAPI route decorators. The router is defined in the API package to keep
-all API-related code together.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
-
-from .api import Benchmarks
-from .models import (
-    Benchmark,
-    GetBenchmarkRequest,
-    ListBenchmarksRequest,
-    ListBenchmarksResponse,
-    RegisterBenchmarkRequest,
-    UnregisterBenchmarkRequest,
-)
-
-# Automatically generate dependency functions from Pydantic models
-# This ensures the models are the single source of truth for descriptions
-get_list_benchmarks_request = create_query_dependency(ListBenchmarksRequest)
-get_get_benchmark_request = create_path_dependency(GetBenchmarkRequest)
-get_unregister_benchmark_request = create_path_dependency(UnregisterBenchmarkRequest)
-
-
-def create_router(impl: Benchmarks) -> APIRouter:
-    """Create a FastAPI router for the Benchmarks API.
-
-    Args:
-        impl: The Benchmarks implementation instance
-
-    Returns:
-        APIRouter configured for the Benchmarks API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
-        tags=["Benchmarks"],
-        responses=standard_responses,
-    )
-
-    @router.get(
-        "/eval/benchmarks",
-        response_model=ListBenchmarksResponse,
-        summary="List all benchmarks.",
-        description="List all benchmarks.",
-        responses={
-            200: {"description": "A ListBenchmarksResponse."},
-        },
-    )
-    async def list_benchmarks(
-        request: Annotated[ListBenchmarksRequest, Depends(get_list_benchmarks_request)],
-    ) -> ListBenchmarksResponse:
-        return await impl.list_benchmarks(request)
-
-    @router.get(
-        "/eval/benchmarks/{benchmark_id}",
-        response_model=Benchmark,
-        summary="Get a benchmark by its ID.",
-        description="Get a benchmark by its ID.",
-        responses={
-            200: {"description": "A Benchmark."},
-        },
-    )
-    async def get_benchmark(
-        request: Annotated[GetBenchmarkRequest, Depends(get_get_benchmark_request)],
-    ) -> Benchmark:
-        return await impl.get_benchmark(request)
-
-    @router.post(
-        "/eval/benchmarks",
-        summary="Register a benchmark.",
-        description="Register a benchmark.",
-        responses={
-            200: {"description": "The benchmark was successfully registered."},
-        },
-        deprecated=True,
-    )
-    async def register_benchmark(
-        request: Annotated[RegisterBenchmarkRequest, Body(...)],
-    ) -> None:
-        return await impl.register_benchmark(request)
-
-    @router.delete(
-        "/eval/benchmarks/{benchmark_id}",
-        summary="Unregister a benchmark.",
-        description="Unregister a benchmark.",
-        responses={
-            200: {"description": "The benchmark was successfully unregistered."},
-        },
-        deprecated=True,
-    )
-    async def unregister_benchmark(
-        request: Annotated[UnregisterBenchmarkRequest, Depends(get_unregister_benchmark_request)],
-    ) -> None:
-        return await impl.unregister_benchmark(request)
-
-    return router
diff --git a/src/llama_stack_api/benchmarks/models.py b/src/llama_stack_api/benchmarks/models.py
deleted file mode 100644
index 4d9eeb8c80..0000000000
--- a/src/llama_stack_api/benchmarks/models.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for Benchmarks API requests and responses.
-
-This module defines the request and response models for the Benchmarks API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from typing import Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.resource import Resource, ResourceType
-from llama_stack_api.schema_utils import json_schema_type
-
-
-@json_schema_type
-class ListBenchmarksRequest(BaseModel):
-    """Request model for listing benchmarks."""
-
-    pass
-
-
-@json_schema_type
-class GetBenchmarkRequest(BaseModel):
-    """Request model for getting a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to get.")
-
-
-@json_schema_type
-class RegisterBenchmarkRequest(BaseModel):
-    """Request model for registering a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to register.")
-    dataset_id: str = Field(..., description="The ID of the dataset to use for the benchmark.")
-    scoring_functions: list[str] = Field(..., description="The scoring functions to use for the benchmark.")
-    provider_benchmark_id: str | None = Field(
-        default=None, description="The ID of the provider benchmark to use for the benchmark."
-    )
-    provider_id: str | None = Field(default=None, description="The ID of the provider to use for the benchmark.")
-    metadata: dict[str, Any] | None = Field(default=None, description="The metadata to use for the benchmark.")
-
-
-@json_schema_type
-class UnregisterBenchmarkRequest(BaseModel):
-    """Request model for unregistering a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to unregister.")
-
-
-class CommonBenchmarkFields(BaseModel):
-    dataset_id: str = Field(..., description="Identifier of the dataset to use for the benchmark evaluation.")
-    scoring_functions: list[str] = Field(
-        ..., description="List of scoring function identifiers to apply during evaluation."
-    )
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Metadata for this evaluation task.",
-    )
-
-
-@json_schema_type
-class Benchmark(CommonBenchmarkFields, Resource):
-    """A benchmark resource for evaluating model performance."""
-
-    type: Literal[ResourceType.benchmark] = Field(
-        default=ResourceType.benchmark,
-        description="The resource type, always benchmark.",
-    )
-
-    @property
-    def benchmark_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_benchmark_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class BenchmarkInput(CommonBenchmarkFields, BaseModel):
-    benchmark_id: str = Field(..., description="The ID of the benchmark.")
-    provider_id: str | None = Field(default=None, description="The ID of the provider to use for the benchmark.")
-    provider_benchmark_id: str | None = Field(
-        default=None, description="The ID of the provider benchmark to use for the benchmark."
-    )
-
-
-@json_schema_type
-class ListBenchmarksResponse(BaseModel):
-    """Response containing a list of benchmark objects."""
-
-    data: list[Benchmark] = Field(..., description="List of benchmark objects.")
-
-
-__all__ = [
-    "ListBenchmarksRequest",
-    "GetBenchmarkRequest",
-    "RegisterBenchmarkRequest",
-    "UnregisterBenchmarkRequest",
-    "CommonBenchmarkFields",
-    "Benchmark",
-    "BenchmarkInput",
-    "ListBenchmarksResponse",
-]
diff --git a/src/llama_stack_api/datasetio/__init__.py b/src/llama_stack_api/datasetio/__init__.py
deleted file mode 100644
index e696d14145..0000000000
--- a/src/llama_stack_api/datasetio/__init__.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""DatasetIO API protocol and models.
-
-This module contains the DatasetIO protocol definition.
-Pydantic models are defined in llama_stack_api.datasetio.models.
-The FastAPI router is defined in llama_stack_api.datasetio.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-from . import fastapi_routes
-
-# Import protocol for FastAPI router
-from .api import DatasetIO, DatasetStore
-
-# Import models for re-export
-from .models import (
-    AppendRowsParams,
-    AppendRowsRequest,
-    IterRowsRequest,
-    PaginatedResponse,
-)
-
-__all__ = [
-    "DatasetIO",
-    "DatasetStore",
-    "AppendRowsParams",
-    "AppendRowsRequest",
-    "IterRowsRequest",
-    "PaginatedResponse",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/datasetio/api.py b/src/llama_stack_api/datasetio/api.py
deleted file mode 100644
index ab71fee6fd..0000000000
--- a/src/llama_stack_api/datasetio/api.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""DatasetIO API protocol definition.
-
-This module contains the DatasetIO protocol definition.
-Pydantic models are defined in llama_stack_api.datasetio.models.
-The FastAPI router is defined in llama_stack_api.datasetio.fastapi_routes.
-"""
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack_api.datasets import Dataset
-
-from .models import (
-    AppendRowsParams,
-    IterRowsRequest,
-    PaginatedResponse,
-)
-
-
-class DatasetStore(Protocol):
-    def get_dataset(self, dataset_id: str) -> Dataset: ...
-
-
-@runtime_checkable
-class DatasetIO(Protocol):
-    """Protocol for dataset I/O operations.
-
-    The DatasetIO API provides operations for reading and writing data to datasets.
-    This includes iterating over rows and appending new rows to existing datasets.
-    """
-
-    # keeping for aligning with inference/safety, but this is not used
-    dataset_store: DatasetStore
-
-    async def iterrows(self, request: IterRowsRequest) -> PaginatedResponse: ...
-
-    async def append_rows(self, params: AppendRowsParams) -> None: ...
diff --git a/src/llama_stack_api/datasetio/fastapi_routes.py b/src/llama_stack_api/datasetio/fastapi_routes.py
deleted file mode 100644
index 040c8e9b3c..0000000000
--- a/src/llama_stack_api/datasetio/fastapi_routes.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the DatasetIO API.
-
-This module defines the FastAPI router for the DatasetIO API using standard
-FastAPI route decorators.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Path, Query
-
-from llama_stack_api.common.responses import PaginatedResponse
-from llama_stack_api.router_utils import standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1BETA
-
-from .api import DatasetIO
-from .models import (
-    AppendRowsParams,
-    AppendRowsRequest,
-    IterRowsRequest,
-)
-
-
-def create_router(impl: DatasetIO) -> APIRouter:
-    """Create a FastAPI router for the DatasetIO API.
-
-    Args:
-        impl: The DatasetIO implementation instance
-
-    Returns:
-        APIRouter configured for the DatasetIO API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1BETA}",
-        tags=["DatasetIO"],
-        responses=standard_responses,
-    )
-
-    @router.get(
-        "/datasetio/iterrows/{dataset_id:path}",
-        response_model=PaginatedResponse,
-        summary="Get a paginated list of rows from a dataset.",
-        description="""Get a paginated list of rows from a dataset.
-
-Uses offset-based pagination where:
-- start_index: The starting index (0-based). If None, starts from beginning.
-- limit: Number of items to return. If None or -1, returns all items.
-
-The response includes:
-- data: List of items for the current page.
-- has_more: Whether there are more items available after this set.""",
-        responses={
-            200: {"description": "A PaginatedResponse containing the rows."},
-        },
-    )
-    async def iterrows(
-        dataset_id: Annotated[str, Path(description="The ID of the dataset to get the rows from.")],
-        start_index: Annotated[
-            int | None, Query(description="Index into dataset for the first row to get. Get all rows if None.")
-        ] = None,
-        limit: Annotated[int | None, Query(description="The number of rows to get.")] = None,
-    ) -> PaginatedResponse:
-        request = IterRowsRequest(
-            dataset_id=dataset_id,
-            start_index=start_index,
-            limit=limit,
-        )
-        return await impl.iterrows(request)
-
-    @router.post(
-        "/datasetio/append-rows/{dataset_id:path}",
-        status_code=204,
-        summary="Append rows to a dataset.",
-        description="Append rows to a dataset.",
-        responses={
-            204: {"description": "Rows were successfully appended."},
-        },
-    )
-    async def append_rows(
-        dataset_id: Annotated[str, Path(description="The ID of the dataset to append the rows to.")],
-        body: Annotated[AppendRowsRequest, Body(...)],
-    ) -> None:
-        # Combine path parameter with request body
-        params = AppendRowsParams(
-            dataset_id=dataset_id,
-            rows=body.rows,
-        )
-        return await impl.append_rows(params)
-
-    return router
diff --git a/src/llama_stack_api/datasetio/models.py b/src/llama_stack_api/datasetio/models.py
deleted file mode 100644
index 62e941ed9a..0000000000
--- a/src/llama_stack_api/datasetio/models.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for DatasetIO API requests and responses.
-
-This module defines the request and response models for the DatasetIO API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.common.responses import PaginatedResponse
-from llama_stack_api.schema_utils import json_schema_type
-
-
-@json_schema_type
-class IterRowsRequest(BaseModel):
-    """Request model for iterating over rows in a dataset."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to get the rows from.")
-    start_index: int | None = Field(
-        default=None,
-        description="Index into dataset for the first row to get. Get all rows if None.",
-    )
-    limit: int | None = Field(
-        default=None,
-        description="The number of rows to get.",
-    )
-
-
-@json_schema_type
-class AppendRowsRequest(BaseModel):
-    """Request body for appending rows to a dataset."""
-
-    rows: list[dict[str, Any]] = Field(..., description="The rows to append to the dataset.")
-
-
-class AppendRowsParams(BaseModel):
-    """Internal parameters for appending rows to a dataset (includes dataset_id)."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to append the rows to.")
-    rows: list[dict[str, Any]] = Field(..., description="The rows to append to the dataset.")
-
-
-__all__ = [
-    "AppendRowsRequest",
-    "AppendRowsParams",
-    "IterRowsRequest",
-    "PaginatedResponse",
-]
diff --git a/src/llama_stack_api/datasets/__init__.py b/src/llama_stack_api/datasets/__init__.py
deleted file mode 100644
index cff53476e8..0000000000
--- a/src/llama_stack_api/datasets/__init__.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Datasets API protocol and models.
-
-This module contains the Datasets protocol definition.
-Pydantic models are defined in llama_stack_api.datasets.models.
-The FastAPI router is defined in llama_stack_api.datasets.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-from . import fastapi_routes
-
-# Import new protocol for FastAPI router
-from .api import Datasets
-
-# Import models for re-export
-from .models import (
-    CommonDatasetFields,
-    Dataset,
-    DatasetPurpose,
-    DatasetType,
-    DataSource,
-    GetDatasetRequest,
-    ListDatasetsResponse,
-    RegisterDatasetRequest,
-    RowsDataSource,
-    UnregisterDatasetRequest,
-    URIDataSource,
-)
-
-
-# Define DatasetInput for backward compatibility
-class DatasetInput(CommonDatasetFields):
-    """Input parameters for dataset operations.
-
-    :param dataset_id: Unique identifier for the dataset
-    """
-
-    dataset_id: str
-
-
-__all__ = [
-    "Datasets",
-    "Dataset",
-    "CommonDatasetFields",
-    "DatasetPurpose",
-    "DataSource",
-    "DatasetInput",
-    "DatasetType",
-    "RowsDataSource",
-    "URIDataSource",
-    "ListDatasetsResponse",
-    "RegisterDatasetRequest",
-    "GetDatasetRequest",
-    "UnregisterDatasetRequest",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/datasets/api.py b/src/llama_stack_api/datasets/api.py
deleted file mode 100644
index 981b438f0c..0000000000
--- a/src/llama_stack_api/datasets/api.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Datasets API protocol definition.
-
-This module contains the Datasets protocol definition.
-Pydantic models are defined in llama_stack_api.datasets.models.
-The FastAPI router is defined in llama_stack_api.datasets.fastapi_routes.
-"""
-
-from typing import Protocol, runtime_checkable
-
-from .models import (
-    Dataset,
-    GetDatasetRequest,
-    ListDatasetsResponse,
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
-)
-
-
-@runtime_checkable
-class Datasets(Protocol):
-    """Protocol for dataset management operations."""
-
-    async def register_dataset(self, request: RegisterDatasetRequest) -> Dataset: ...
-
-    async def get_dataset(self, request: GetDatasetRequest) -> Dataset: ...
-
-    async def list_datasets(self) -> ListDatasetsResponse: ...
-
-    async def unregister_dataset(self, request: UnregisterDatasetRequest) -> None: ...
diff --git a/src/llama_stack_api/datasets/fastapi_routes.py b/src/llama_stack_api/datasets/fastapi_routes.py
deleted file mode 100644
index 07a32a59f4..0000000000
--- a/src/llama_stack_api/datasets/fastapi_routes.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the Datasets API.
-
-This module defines the FastAPI router for the Datasets API using standard
-FastAPI route decorators.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.router_utils import create_path_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1BETA
-
-from .api import Datasets
-from .models import (
-    Dataset,
-    GetDatasetRequest,
-    ListDatasetsResponse,
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
-)
-
-# Path parameter dependencies for single-field models
-get_dataset_request = create_path_dependency(GetDatasetRequest)
-unregister_dataset_request = create_path_dependency(UnregisterDatasetRequest)
-
-
-def create_router(impl: Datasets) -> APIRouter:
-    """Create a FastAPI router for the Datasets API.
-
-    Args:
-        impl: The Datasets implementation instance
-
-    Returns:
-        APIRouter configured for the Datasets API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1BETA}",
-        tags=["Datasets"],
-        responses=standard_responses,
-    )
-
-    @router.post(
-        "/datasets",
-        response_model=Dataset,
-        summary="Register a new dataset.",
-        description="Register a new dataset.",
-        responses={
-            200: {"description": "The registered dataset object."},
-        },
-        deprecated=True,
-    )
-    async def register_dataset(
-        request: Annotated[RegisterDatasetRequest, Body(...)],
-    ) -> Dataset:
-        return await impl.register_dataset(request)
-
-    @router.get(
-        "/datasets/{dataset_id:path}",
-        response_model=Dataset,
-        summary="Get a dataset by its ID.",
-        description="Get a dataset by its ID.",
-        responses={
-            200: {"description": "The dataset object."},
-        },
-    )
-    async def get_dataset(
-        request: Annotated[GetDatasetRequest, Depends(get_dataset_request)],
-    ) -> Dataset:
-        return await impl.get_dataset(request)
-
-    @router.get(
-        "/datasets",
-        response_model=ListDatasetsResponse,
-        summary="List all datasets.",
-        description="List all datasets.",
-        responses={
-            200: {"description": "A list of dataset objects."},
-        },
-    )
-    async def list_datasets() -> ListDatasetsResponse:
-        return await impl.list_datasets()
-
-    @router.delete(
-        "/datasets/{dataset_id:path}",
-        summary="Unregister a dataset by its ID.",
-        description="Unregister a dataset by its ID.",
-        responses={
-            200: {"description": "The dataset was successfully unregistered."},
-        },
-        deprecated=True,
-    )
-    async def unregister_dataset(
-        request: Annotated[UnregisterDatasetRequest, Depends(unregister_dataset_request)],
-    ) -> None:
-        return await impl.unregister_dataset(request)
-
-    return router
diff --git a/src/llama_stack_api/datasets/models.py b/src/llama_stack_api/datasets/models.py
deleted file mode 100644
index 6fda228e43..0000000000
--- a/src/llama_stack_api/datasets/models.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for Datasets API requests and responses.
-
-This module defines the request and response models for the Datasets API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from enum import Enum, StrEnum
-from typing import Annotated, Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.resource import Resource, ResourceType
-from llama_stack_api.schema_utils import json_schema_type, register_schema
-
-
-class DatasetPurpose(StrEnum):
-    """Purpose of the dataset. Each purpose has a required input data schema."""
-
-    eval_question_answer = "eval/question-answer"
-    """The dataset contains a question column and an answer column."""
-    eval_messages_answer = "eval/messages-answer"
-    """The dataset contains a messages column with list of messages and an answer column."""
-
-
-class DatasetType(Enum):
-    """Type of the dataset source."""
-
-    uri = "uri"
-    """The dataset can be obtained from a URI."""
-    rows = "rows"
-    """The dataset is stored in rows."""
-
-
-@json_schema_type
-class URIDataSource(BaseModel):
-    """A dataset that can be obtained from a URI."""
-
-    type: Literal["uri"] = Field(default="uri", description="The type of data source.")
-    uri: str = Field(
-        ...,
-        description='The dataset can be obtained from a URI. E.g. "https://mywebsite.com/mydata.jsonl", "lsfs://mydata.jsonl", "data:csv;base64,{base64_content}"',
-    )
-
-
-@json_schema_type
-class RowsDataSource(BaseModel):
-    """A dataset stored in rows."""
-
-    type: Literal["rows"] = Field(default="rows", description="The type of data source.")
-    rows: list[dict[str, Any]] = Field(
-        ...,
-        description='The dataset is stored in rows. E.g. [{"messages": [{"role": "user", "content": "Hello, world!"}, {"role": "assistant", "content": "Hello, world!"}]}]',
-    )
-
-
-DataSource = Annotated[
-    URIDataSource | RowsDataSource,
-    Field(discriminator="type"),
-]
-register_schema(DataSource, name="DataSource")
-
-
-class CommonDatasetFields(BaseModel):
-    """Common fields for a dataset."""
-
-    purpose: DatasetPurpose = Field(..., description="Purpose of the dataset indicating its intended use")
-    source: DataSource = Field(..., description="Data source configuration for the dataset")
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this dataset",
-    )
-
-
-@json_schema_type
-class Dataset(CommonDatasetFields, Resource):
-    """Dataset resource for storing and accessing training or evaluation data."""
-
-    type: Literal[ResourceType.dataset] = Field(
-        default=ResourceType.dataset,
-        description="Type of resource, always 'dataset' for datasets",
-    )
-
-    @property
-    def dataset_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_dataset_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-@json_schema_type
-class ListDatasetsResponse(BaseModel):
-    """Response from listing datasets."""
-
-    data: list[Dataset] = Field(..., description="List of datasets")
-
-
-# Request models for each endpoint
-
-
-@json_schema_type
-class RegisterDatasetRequest(BaseModel):
-    """Request model for registering a dataset."""
-
-    purpose: DatasetPurpose = Field(..., description="The purpose of the dataset.")
-    source: DataSource = Field(..., description="The data source of the dataset.")
-    metadata: dict[str, Any] | None = Field(
-        default=None,
-        description="The metadata for the dataset.",
-    )
-    dataset_id: str | None = Field(
-        default=None,
-        description="The ID of the dataset. If not provided, an ID will be generated.",
-    )
-
-
-@json_schema_type
-class GetDatasetRequest(BaseModel):
-    """Request model for getting a dataset by ID."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to get.")
-
-
-@json_schema_type
-class UnregisterDatasetRequest(BaseModel):
-    """Request model for unregistering a dataset."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to unregister.")
-
-
-__all__ = [
-    "CommonDatasetFields",
-    "Dataset",
-    "DatasetPurpose",
-    "DatasetType",
-    "DataSource",
-    "RowsDataSource",
-    "URIDataSource",
-    "ListDatasetsResponse",
-    "RegisterDatasetRequest",
-    "GetDatasetRequest",
-    "UnregisterDatasetRequest",
-]
diff --git a/src/llama_stack_api/datatypes.py b/src/llama_stack_api/datatypes.py
index 7480866326..552e996a89 100644
--- a/src/llama_stack_api/datatypes.py
+++ b/src/llama_stack_api/datatypes.py
@@ -10,11 +10,8 @@
 
 from pydantic import BaseModel, Field
 
-from llama_stack_api.benchmarks import Benchmark
-from llama_stack_api.datasets import Dataset
 from llama_stack_api.models import Model
 from llama_stack_api.schema_utils import json_schema_type
-from llama_stack_api.scoring_functions import ScoringFn
 from llama_stack_api.shields import Shield
 from llama_stack_api.tools import ToolGroup
 from llama_stack_api.vector_stores import VectorStore
@@ -97,16 +94,10 @@ class Api(Enum, metaclass=DynamicApiMeta):
     :cvar agents: Agent orchestration and execution
     :cvar batches: Batch processing for asynchronous API requests
     :cvar vector_io: Vector database operations and queries
-    :cvar datasetio: Dataset input/output operations
-    :cvar scoring: Model output evaluation and scoring
-    :cvar eval: Model evaluation and benchmarking framework
     :cvar tool_runtime: Tool execution and management
     :cvar telemetry: Observability and system monitoring
     :cvar models: Model metadata and management
     :cvar shields: Safety shield implementations
-    :cvar datasets: Dataset creation and management
-    :cvar scoring_functions: Scoring function definitions
-    :cvar benchmarks: Benchmark suite management
     :cvar tool_groups: Tool group organization
     :cvar files: File storage and management
     :cvar file_processors: File parsing and processing operations
@@ -121,17 +112,11 @@ class Api(Enum, metaclass=DynamicApiMeta):
     agents = "agents"
     batches = "batches"
     vector_io = "vector_io"
-    datasetio = "datasetio"
-    scoring = "scoring"
-    eval = "eval"
     tool_runtime = "tool_runtime"
 
     models = "models"
     shields = "shields"
     vector_stores = "vector_stores"  # only used for routing table
-    datasets = "datasets"
-    scoring_functions = "scoring_functions"
-    benchmarks = "benchmarks"
     tool_groups = "tool_groups"
     files = "files"
     file_processors = "file_processors"
@@ -228,22 +213,6 @@ async def register_vector_store(self, vector_store: VectorStore) -> None: ...
     async def unregister_vector_store(self, vector_store_id: str) -> None: ...
 
 
-class DatasetsProtocolPrivate(Protocol):
-    async def register_dataset(self, dataset: Dataset) -> None: ...
-
-    async def unregister_dataset(self, dataset_id: str) -> None: ...
-
-
-class ScoringFunctionsProtocolPrivate(Protocol):
-    async def list_scoring_functions(self) -> list[ScoringFn]: ...
-
-    async def register_scoring_function(self, scoring_fn: ScoringFn) -> None: ...
-
-
-class BenchmarksProtocolPrivate(Protocol):
-    async def register_benchmark(self, benchmark: Benchmark) -> None: ...
-
-
 class ToolGroupsProtocolPrivate(Protocol):
     async def register_toolgroup(self, toolgroup: ToolGroup) -> None: ...
 
diff --git a/src/llama_stack_api/eval/__init__.py b/src/llama_stack_api/eval/__init__.py
deleted file mode 100644
index 0f97a1d244..0000000000
--- a/src/llama_stack_api/eval/__init__.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from llama_stack_api.common.job_types import Job
-
-from . import fastapi_routes
-from .api import Eval
-from .compat import (
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
-from .models import (
-    BenchmarkConfig,
-    BenchmarkIdRequest,
-    EvalCandidate,
-    EvaluateResponse,
-    EvaluateRowsBodyRequest,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    ModelCandidate,
-    RunEvalBodyRequest,
-    RunEvalRequest,
-)
-
-__all__ = [
-    "Eval",
-    "BenchmarkConfig",
-    "BenchmarkIdRequest",
-    "EvalCandidate",
-    "EvaluateResponse",
-    "EvaluateRowsBodyRequest",
-    "EvaluateRowsRequest",
-    "Job",
-    "JobCancelRequest",
-    "JobResultRequest",
-    "JobStatusRequest",
-    "ModelCandidate",
-    "RunEvalBodyRequest",
-    "RunEvalRequest",
-    "fastapi_routes",
-    # Backward compatibility helpers
-    "resolve_run_eval_request",
-    "resolve_evaluate_rows_request",
-    "resolve_job_status_request",
-    "resolve_job_cancel_request",
-    "resolve_job_result_request",
-]
diff --git a/src/llama_stack_api/eval/api.py b/src/llama_stack_api/eval/api.py
deleted file mode 100644
index 547b0f3757..0000000000
--- a/src/llama_stack_api/eval/api.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack_api.common.job_types import Job
-
-from .models import (
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalRequest,
-)
-
-
-@runtime_checkable
-class Eval(Protocol):
-    """Evaluations
-
-    Llama Stack Evaluation API for running evaluations on model and agent candidates."""
-
-    async def run_eval(
-        self,
-        request: RunEvalRequest,
-    ) -> Job:
-        """Run an evaluation on a benchmark."""
-        ...
-
-    async def evaluate_rows(
-        self,
-        request: EvaluateRowsRequest,
-    ) -> EvaluateResponse:
-        """Evaluate a list of rows on a benchmark."""
-        ...
-
-    async def job_status(self, request: JobStatusRequest) -> Job:
-        """Get the status of a job."""
-        ...
-
-    async def job_cancel(self, request: JobCancelRequest) -> None:
-        """Cancel a job."""
-        ...
-
-    async def job_result(self, request: JobResultRequest) -> EvaluateResponse:
-        """Get the result of a job."""
-        ...
diff --git a/src/llama_stack_api/eval/compat.py b/src/llama_stack_api/eval/compat.py
deleted file mode 100644
index 81ff485803..0000000000
--- a/src/llama_stack_api/eval/compat.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-Backward compatibility helpers for the Eval API.
-
-This module provides utilities to support both the old-style (individual parameters)
-and new-style (request objects) calling conventions for Eval API methods.
-
-The old-style parameters are deprecated and will be removed in a future release.
-
-Note: When both a request object AND individual parameters are provided, the request
-object takes precedence and individual parameters are ignored.
-"""
-
-import warnings
-from typing import Any
-
-from .models import (
-    BenchmarkConfig,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalRequest,
-)
-
-_DEPRECATION_TARGET = "0.6.0"
-
-_DEPRECATION_MESSAGE = (
-    "Passing individual parameters to {method_name}() is deprecated. "
-    "Please use {request_class}(benchmark_id=..., ...) instead. "
-    "This will be removed in version {target}."
-)
-
-
-def _emit_deprecation_warning(method_name: str, request_class: str) -> None:
-    """Emit a deprecation warning for old-style parameter usage."""
-    warnings.warn(
-        _DEPRECATION_MESSAGE.format(method_name=method_name, request_class=request_class, target=_DEPRECATION_TARGET),
-        DeprecationWarning,
-        stacklevel=4,
-    )
-
-
-def _format_missing_params(required: list[str], provided: dict[str, Any]) -> str:
-    """Format error message showing which parameters are missing."""
-    missing = [p for p in required if provided.get(p) is None]
-    provided_names = [p for p in required if provided.get(p) is not None]
-
-    parts = []
-    if missing:
-        parts.append(f"missing: {', '.join(missing)}")
-    if provided_names:
-        parts.append(f"provided: {', '.join(provided_names)}")
-
-    return "; ".join(parts)
-
-
-def _validate_not_empty(value: Any, name: str) -> None:
-    """Validate that a value is not None, empty string, or empty list."""
-    if not value:
-        raise ValueError(f"'{name}' cannot be None or empty. Provided: {value}")
-
-
-def resolve_run_eval_request(
-    request: RunEvalRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    benchmark_config: BenchmarkConfig | None = None,
-) -> RunEvalRequest:
-    """
-    Resolve run_eval parameters to a RunEvalRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        benchmark_config: (Deprecated) The benchmark configuration
-
-    Returns:
-        RunEvalRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.benchmark_config, "benchmark_config")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and benchmark_config:
-        _emit_deprecation_warning("run_eval", "RunEvalRequest")
-        return RunEvalRequest(
-            benchmark_id=benchmark_id,
-            benchmark_config=benchmark_config,
-        )
-
-    required = ["benchmark_id", "benchmark_config"]
-    provided = {"benchmark_id": benchmark_id, "benchmark_config": benchmark_config}
-    raise ValueError(
-        f"Either 'request' (RunEvalRequest) or both 'benchmark_id' and 'benchmark_config' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_evaluate_rows_request(
-    request: EvaluateRowsRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    input_rows: list[dict[str, Any]] | None = None,
-    scoring_functions: list[str] | None = None,
-    benchmark_config: BenchmarkConfig | None = None,
-) -> EvaluateRowsRequest:
-    """
-    Resolve evaluate_rows parameters to an EvaluateRowsRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        input_rows: (Deprecated) The rows to evaluate
-        scoring_functions: (Deprecated) The scoring functions to use
-        benchmark_config: (Deprecated) The benchmark configuration
-
-    Returns:
-        EvaluateRowsRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.input_rows, "input_rows")
-        _validate_not_empty(request.scoring_functions, "scoring_functions")
-        _validate_not_empty(request.benchmark_config, "benchmark_config")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and input_rows and scoring_functions and benchmark_config:
-        _emit_deprecation_warning("evaluate_rows", "EvaluateRowsRequest")
-        return EvaluateRowsRequest(
-            benchmark_id=benchmark_id,
-            input_rows=input_rows,
-            scoring_functions=scoring_functions,
-            benchmark_config=benchmark_config,
-        )
-
-    required = ["benchmark_id", "input_rows", "scoring_functions", "benchmark_config"]
-    provided = {
-        "benchmark_id": benchmark_id,
-        "input_rows": input_rows,
-        "scoring_functions": scoring_functions,
-        "benchmark_config": benchmark_config,
-    }
-    raise ValueError(
-        f"Either 'request' (EvaluateRowsRequest) or all of 'benchmark_id', 'input_rows', "
-        f"'scoring_functions', and 'benchmark_config' must be provided. "
-        f"{_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_job_status_request(
-    request: JobStatusRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    job_id: str | None = None,
-) -> JobStatusRequest:
-    """
-    Resolve job_status parameters to a JobStatusRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        job_id: (Deprecated) The job ID
-
-    Returns:
-        JobStatusRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.job_id, "job_id")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and job_id:
-        _emit_deprecation_warning("job_status", "JobStatusRequest")
-        return JobStatusRequest(
-            benchmark_id=benchmark_id,
-            job_id=job_id,
-        )
-
-    required = ["benchmark_id", "job_id"]
-    provided = {"benchmark_id": benchmark_id, "job_id": job_id}
-    raise ValueError(
-        f"Either 'request' (JobStatusRequest) or both 'benchmark_id' and 'job_id' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_job_cancel_request(
-    request: JobCancelRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    job_id: str | None = None,
-) -> JobCancelRequest:
-    """
-    Resolve job_cancel parameters to a JobCancelRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        job_id: (Deprecated) The job ID
-
-    Returns:
-        JobCancelRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.job_id, "job_id")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and job_id:
-        _emit_deprecation_warning("job_cancel", "JobCancelRequest")
-        return JobCancelRequest(
-            benchmark_id=benchmark_id,
-            job_id=job_id,
-        )
-
-    required = ["benchmark_id", "job_id"]
-    provided = {"benchmark_id": benchmark_id, "job_id": job_id}
-    raise ValueError(
-        f"Either 'request' (JobCancelRequest) or both 'benchmark_id' and 'job_id' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
-
-
-def resolve_job_result_request(
-    request: JobResultRequest | None = None,
-    *,
-    benchmark_id: str | None = None,
-    job_id: str | None = None,
-) -> JobResultRequest:
-    """
-    Resolve job_result parameters to a JobResultRequest object.
-
-    Supports both new-style (request object) and old-style (individual parameters).
-    Old-style usage emits a DeprecationWarning.
-
-    Note: If both request object and individual parameters are provided, the request
-    object takes precedence and individual parameters are ignored.
-
-    Args:
-        request: The new-style request object (preferred)
-        benchmark_id: (Deprecated) The benchmark ID
-        job_id: (Deprecated) The job ID
-
-    Returns:
-        JobResultRequest object
-    """
-    if request is not None:
-        _validate_not_empty(request.benchmark_id, "benchmark_id")
-        _validate_not_empty(request.job_id, "job_id")
-        return request
-
-    # Old-style parameters
-    if benchmark_id and job_id:
-        _emit_deprecation_warning("job_result", "JobResultRequest")
-        return JobResultRequest(
-            benchmark_id=benchmark_id,
-            job_id=job_id,
-        )
-
-    required = ["benchmark_id", "job_id"]
-    provided = {"benchmark_id": benchmark_id, "job_id": job_id}
-    raise ValueError(
-        f"Either 'request' (JobResultRequest) or both 'benchmark_id' and 'job_id' "
-        f"must be provided. {_format_missing_params(required, provided)}"
-    )
diff --git a/src/llama_stack_api/eval/fastapi_routes.py b/src/llama_stack_api/eval/fastapi_routes.py
deleted file mode 100644
index b6e2b812d4..0000000000
--- a/src/llama_stack_api/eval/fastapi_routes.py
+++ /dev/null
@@ -1,126 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.common.job_types import Job
-from llama_stack_api.router_utils import create_path_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1ALPHA
-
-from .api import Eval
-from .models import (
-    BenchmarkIdRequest,
-    EvaluateResponse,
-    EvaluateRowsBodyRequest,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalBodyRequest,
-    RunEvalRequest,
-)
-
-get_benchmark_id_request = create_path_dependency(BenchmarkIdRequest)
-
-
-def create_router(impl: Eval) -> APIRouter:
-    """Create a FastAPI router for the Eval API."""
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1ALPHA}",
-        tags=["Eval"],
-        responses=standard_responses,
-    )
-
-    @router.post(
-        "/eval/benchmarks/{benchmark_id}/jobs",
-        response_model=Job,
-        summary="Run Eval",
-        description="Run an evaluation on a benchmark.",
-        responses={
-            200: {"description": "The job that was created to run the evaluation."},
-        },
-    )
-    async def run_eval(
-        benchmark_id_request: Annotated[BenchmarkIdRequest, Depends(get_benchmark_id_request)],
-        body_request: Annotated[RunEvalBodyRequest, Body(...)],
-    ) -> Job:
-        request = RunEvalRequest(
-            benchmark_id=benchmark_id_request.benchmark_id,
-            benchmark_config=body_request.benchmark_config,
-        )
-        return await impl.run_eval(request)
-
-    @router.post(
-        "/eval/benchmarks/{benchmark_id}/evaluations",
-        response_model=EvaluateResponse,
-        summary="Evaluate Rows",
-        description="Evaluate a list of rows on a benchmark.",
-        responses={
-            200: {"description": "EvaluateResponse object containing generations and scores."},
-        },
-    )
-    async def evaluate_rows(
-        benchmark_id_request: Annotated[BenchmarkIdRequest, Depends(get_benchmark_id_request)],
-        body_request: Annotated[EvaluateRowsBodyRequest, Body(...)],
-    ) -> EvaluateResponse:
-        request = EvaluateRowsRequest(
-            benchmark_id=benchmark_id_request.benchmark_id,
-            input_rows=body_request.input_rows,
-            scoring_functions=body_request.scoring_functions,
-            benchmark_config=body_request.benchmark_config,
-        )
-        return await impl.evaluate_rows(request)
-
-    @router.get(
-        "/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
-        response_model=Job,
-        summary="Job Status",
-        description="Get the status of a job.",
-        responses={
-            200: {"description": "The status of the evaluation job."},
-        },
-    )
-    async def job_status(
-        benchmark_id: str,
-        job_id: str,
-    ) -> Job:
-        request = JobStatusRequest(benchmark_id=benchmark_id, job_id=job_id)
-        return await impl.job_status(request)
-
-    @router.delete(
-        "/eval/benchmarks/{benchmark_id}/jobs/{job_id}",
-        summary="Job Cancel",
-        description="Cancel a job.",
-        responses={
-            200: {"description": "Successful Response"},
-        },
-    )
-    async def job_cancel(
-        benchmark_id: str,
-        job_id: str,
-    ) -> None:
-        request = JobCancelRequest(benchmark_id=benchmark_id, job_id=job_id)
-        return await impl.job_cancel(request)
-
-    @router.get(
-        "/eval/benchmarks/{benchmark_id}/jobs/{job_id}/result",
-        response_model=EvaluateResponse,
-        summary="Job Result",
-        description="Get the result of a job.",
-        responses={
-            200: {"description": "The result of the job."},
-        },
-    )
-    async def job_result(
-        benchmark_id: str,
-        job_id: str,
-    ) -> EvaluateResponse:
-        request = JobResultRequest(benchmark_id=benchmark_id, job_id=job_id)
-        return await impl.job_result(request)
-
-    return router
diff --git a/src/llama_stack_api/eval/models.py b/src/llama_stack_api/eval/models.py
deleted file mode 100644
index ec5db00b7f..0000000000
--- a/src/llama_stack_api/eval/models.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.inference import SamplingParams, SystemMessage
-from llama_stack_api.schema_utils import json_schema_type
-from llama_stack_api.scoring import ScoringResult
-from llama_stack_api.scoring_functions import ScoringFnParams
-
-
-@json_schema_type
-class ModelCandidate(BaseModel):
-    """A model candidate for evaluation."""
-
-    type: Literal["model"] = "model"
-    model: str = Field(..., description="The model ID to evaluate", min_length=1)
-    sampling_params: SamplingParams = Field(..., description="The sampling parameters for the model")
-    system_message: SystemMessage | None = Field(
-        None, description="The system message providing instructions or context to the model"
-    )
-
-
-EvalCandidate = ModelCandidate
-
-
-@json_schema_type
-class BenchmarkConfig(BaseModel):
-    """A benchmark configuration for evaluation."""
-
-    eval_candidate: EvalCandidate = Field(..., description="The candidate to evaluate")
-    scoring_params: dict[str, ScoringFnParams] = Field(
-        default_factory=dict,
-        description="Map between scoring function id and parameters for each scoring function you want to run",
-    )
-    num_examples: int | None = Field(
-        None,
-        description="Number of examples to evaluate (useful for testing), if not provided, all examples in the dataset will be evaluated",
-        ge=1,
-    )
-    # we could optinally add any specific dataset config here
-
-
-@json_schema_type
-class EvaluateResponse(BaseModel):
-    """The response from an evaluation."""
-
-    generations: list[dict[str, Any]] = Field(..., description="The generations from the evaluation")
-    scores: dict[str, ScoringResult] = Field(
-        ..., description="The scores from the evaluation. Each key in the dict is a scoring function name"
-    )
-
-
-@json_schema_type
-class BenchmarkIdRequest(BaseModel):
-    """Request model containing benchmark_id path parameter."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark", min_length=1)
-
-
-@json_schema_type
-class RunEvalRequest(BaseModel):
-    """Request model for running an evaluation on a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to run the evaluation on", min_length=1)
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class RunEvalBodyRequest(BaseModel):
-    """Request body model for running an evaluation (without path parameter)."""
-
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class EvaluateRowsRequest(BaseModel):
-    """Request model for evaluating a list of rows on a benchmark."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark to run the evaluation on", min_length=1)
-    input_rows: list[dict[str, Any]] = Field(..., description="The rows to evaluate", min_length=1)
-    scoring_functions: list[str] = Field(
-        ..., description="The scoring functions to use for the evaluation", min_length=1
-    )
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class EvaluateRowsBodyRequest(BaseModel):
-    """Request body model for evaluating rows (without path parameter)."""
-
-    input_rows: list[dict[str, Any]] = Field(..., description="The rows to evaluate", min_length=1)
-    scoring_functions: list[str] = Field(
-        ..., description="The scoring functions to use for the evaluation", min_length=1
-    )
-    benchmark_config: BenchmarkConfig = Field(..., description="The configuration for the benchmark")
-
-
-@json_schema_type
-class JobStatusRequest(BaseModel):
-    """Request model for getting the status of a job."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark associated with the job", min_length=1)
-    job_id: str = Field(..., description="The ID of the job to get the status of", min_length=1)
-
-
-@json_schema_type
-class JobCancelRequest(BaseModel):
-    """Request model for canceling a job."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark associated with the job", min_length=1)
-    job_id: str = Field(..., description="The ID of the job to cancel", min_length=1)
-
-
-@json_schema_type
-class JobResultRequest(BaseModel):
-    """Request model for getting the result of a job."""
-
-    benchmark_id: str = Field(..., description="The ID of the benchmark associated with the job", min_length=1)
-    job_id: str = Field(..., description="The ID of the job to get the result of", min_length=1)
-
-
-__all__ = [
-    "ModelCandidate",
-    "EvalCandidate",
-    "BenchmarkConfig",
-    "EvaluateResponse",
-    "BenchmarkIdRequest",
-    "RunEvalRequest",
-    "RunEvalBodyRequest",
-    "EvaluateRowsRequest",
-    "EvaluateRowsBodyRequest",
-    "JobStatusRequest",
-    "JobCancelRequest",
-    "JobResultRequest",
-]
diff --git a/src/llama_stack_api/pyproject.toml b/src/llama_stack_api/pyproject.toml
index 45d92fb98e..1b98b1bd7b 100644
--- a/src/llama_stack_api/pyproject.toml
+++ b/src/llama_stack_api/pyproject.toml
@@ -46,12 +46,8 @@ packages = [
     "llama_stack_api.admin",
     "llama_stack_api.agents",
     "llama_stack_api.batches",
-    "llama_stack_api.benchmarks",
     "llama_stack_api.common",
     "llama_stack_api.conversations",
-    "llama_stack_api.datasetio",
-    "llama_stack_api.datasets",
-    "llama_stack_api.eval",
     "llama_stack_api.file_processors",
     "llama_stack_api.files",
     "llama_stack_api.inspect_api",
@@ -61,9 +57,7 @@ packages = [
 
     "llama_stack_api.providers",
     "llama_stack_api.shields",
-    "llama_stack_api.scoring_functions",
     "llama_stack_api.prompts",
-    "llama_stack_api.scoring",
     "llama_stack_api.safety",
     "llama_stack_api.tools",
     "llama_stack_api.vector_io",
@@ -84,7 +78,6 @@ py-modules = [
     "llama_stack_api.vector_stores",
     "llama_stack_api.version",
     "llama_stack_api.validators",
-    "llama_stack_api.helpers",
 ]
 
 [tool.setuptools.package-data]
diff --git a/src/llama_stack_api/resource.py b/src/llama_stack_api/resource.py
index 246333cd3d..a3af95ff9f 100644
--- a/src/llama_stack_api/resource.py
+++ b/src/llama_stack_api/resource.py
@@ -13,9 +13,6 @@ class ResourceType(StrEnum):
     model = "model"
     shield = "shield"
     vector_store = "vector_store"
-    dataset = "dataset"
-    scoring_function = "scoring_function"
-    benchmark = "benchmark"
     tool = "tool"
     tool_group = "tool_group"
     prompt = "prompt"
diff --git a/src/llama_stack_api/scoring/__init__.py b/src/llama_stack_api/scoring/__init__.py
deleted file mode 100644
index db5fcab4d1..0000000000
--- a/src/llama_stack_api/scoring/__init__.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Scoring API protocol and models.
-
-This module contains the Scoring protocol definition.
-Pydantic models are defined in llama_stack_api.scoring.models.
-The FastAPI router is defined in llama_stack_api.scoring.fastapi_routes.
-"""
-
-# Import fastapi_routes for router factory access
-# Import scoring_functions for re-export
-from llama_stack_api.scoring_functions import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    CommonScoringFnFields,
-    ListScoringFunctionsResponse,
-    LLMAsJudgeScoringFnParams,
-    RegexParserScoringFnParams,
-    ScoringFn,
-    ScoringFnInput,
-    ScoringFnParams,
-    ScoringFnParamsType,
-    ScoringFunctions,
-)
-
-from . import fastapi_routes
-
-# Import protocol for FastAPI router
-from .api import Scoring, ScoringFunctionStore
-
-# Import models for re-export
-from .models import (
-    ScoreBatchRequest,
-    ScoreBatchResponse,
-    ScoreRequest,
-    ScoreResponse,
-    ScoringResult,
-    ScoringResultRow,
-)
-
-__all__ = [
-    "Scoring",
-    "ScoringFunctionStore",
-    "ScoringResult",
-    "ScoringResultRow",
-    "ScoreBatchResponse",
-    "ScoreResponse",
-    "ScoreRequest",
-    "ScoreBatchRequest",
-    "AggregationFunctionType",
-    "BasicScoringFnParams",
-    "CommonScoringFnFields",
-    "LLMAsJudgeScoringFnParams",
-    "ListScoringFunctionsResponse",
-    "RegexParserScoringFnParams",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ScoringFnParams",
-    "ScoringFnParamsType",
-    "ScoringFunctions",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/scoring/api.py b/src/llama_stack_api/scoring/api.py
deleted file mode 100644
index 9263eb06cd..0000000000
--- a/src/llama_stack_api/scoring/api.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Scoring API protocol definition.
-
-This module contains the Scoring protocol definition.
-Pydantic models are defined in llama_stack_api.scoring.models.
-The FastAPI router is defined in llama_stack_api.scoring.fastapi_routes.
-"""
-
-from typing import Protocol, runtime_checkable
-
-from llama_stack_api.scoring_functions import ScoringFn
-
-from .models import ScoreBatchRequest, ScoreBatchResponse, ScoreRequest, ScoreResponse
-
-
-class ScoringFunctionStore(Protocol):
-    """Protocol for storing and retrieving scoring functions."""
-
-    def get_scoring_function(self, scoring_fn_id: str) -> ScoringFn: ...
-
-
-@runtime_checkable
-class Scoring(Protocol):
-    """Protocol for scoring operations."""
-
-    scoring_function_store: ScoringFunctionStore
-
-    async def score_batch(self, request: ScoreBatchRequest) -> ScoreBatchResponse: ...
-
-    async def score(self, request: ScoreRequest) -> ScoreResponse: ...
diff --git a/src/llama_stack_api/scoring/fastapi_routes.py b/src/llama_stack_api/scoring/fastapi_routes.py
deleted file mode 100644
index fe1df9a289..0000000000
--- a/src/llama_stack_api/scoring/fastapi_routes.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the Scoring API.
-
-This module defines the FastAPI router for the Scoring API using standard
-FastAPI route decorators.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body
-
-from llama_stack_api.router_utils import standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1
-
-from .api import Scoring
-from .models import ScoreBatchRequest, ScoreBatchResponse, ScoreRequest, ScoreResponse
-
-
-def create_router(impl: Scoring) -> APIRouter:
-    """Create a FastAPI router for the Scoring API.
-
-    Args:
-        impl: The Scoring implementation instance
-
-    Returns:
-        APIRouter configured for the Scoring API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1}",
-        tags=["Scoring"],
-        responses=standard_responses,
-    )
-
-    @router.post(
-        "/scoring/score",
-        response_model=ScoreResponse,
-        summary="Score a list of rows.",
-        description="Score a list of rows.",
-        responses={
-            200: {"description": "A ScoreResponse object containing rows and aggregated results."},
-        },
-    )
-    async def score(
-        request: Annotated[ScoreRequest, Body(...)],
-    ) -> ScoreResponse:
-        return await impl.score(request)
-
-    @router.post(
-        "/scoring/score-batch",
-        response_model=ScoreBatchResponse,
-        summary="Score a batch of rows.",
-        description="Score a batch of rows.",
-        responses={
-            200: {"description": "A ScoreBatchResponse."},
-        },
-    )
-    async def score_batch(
-        request: Annotated[ScoreBatchRequest, Body(...)],
-    ) -> ScoreBatchResponse:
-        return await impl.score_batch(request)
-
-    return router
diff --git a/src/llama_stack_api/scoring/models.py b/src/llama_stack_api/scoring/models.py
deleted file mode 100644
index 77edfc74d1..0000000000
--- a/src/llama_stack_api/scoring/models.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for Scoring API requests and responses.
-
-This module defines the request and response models for the Scoring API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.schema_utils import json_schema_type
-from llama_stack_api.scoring_functions import ScoringFnParams
-
-# mapping of metric to value
-ScoringResultRow = dict[str, Any]
-
-
-@json_schema_type
-class ScoringResult(BaseModel):
-    """
-    A scoring result for a single row.
-    """
-
-    score_rows: list[ScoringResultRow] = Field(
-        ..., description="The scoring result for each row. Each row is a map of column name to value."
-    )
-    aggregated_results: dict[str, Any] = Field(..., description="Map of metric name to aggregated value")
-
-
-@json_schema_type
-class ScoreBatchResponse(BaseModel):
-    """Response from batch scoring operations on datasets."""
-
-    dataset_id: str | None = Field(default=None, description="(Optional) The identifier of the dataset that was scored")
-    results: dict[str, ScoringResult] = Field(..., description="A map of scoring function name to ScoringResult")
-
-
-@json_schema_type
-class ScoreResponse(BaseModel):
-    """
-    The response from scoring.
-    """
-
-    results: dict[str, ScoringResult] = Field(..., description="A map of scoring function name to ScoringResult.")
-
-
-@json_schema_type
-class ScoreRequest(BaseModel):
-    """Request model for scoring a list of rows."""
-
-    input_rows: list[dict[str, Any]] = Field(..., description="The rows to score.")
-    scoring_functions: dict[str, ScoringFnParams | None] = Field(
-        ..., description="The scoring functions to use for the scoring."
-    )
-
-
-@json_schema_type
-class ScoreBatchRequest(BaseModel):
-    """Request model for scoring a batch of rows from a dataset."""
-
-    dataset_id: str = Field(..., description="The ID of the dataset to score.")
-    scoring_functions: dict[str, ScoringFnParams | None] = Field(
-        ..., description="The scoring functions to use for the scoring."
-    )
-    save_results_dataset: bool = Field(default=False, description="Whether to save the results to a dataset.")
-
-
-__all__ = [
-    "ScoringResult",
-    "ScoringResultRow",
-    "ScoreBatchResponse",
-    "ScoreResponse",
-    "ScoreRequest",
-    "ScoreBatchRequest",
-]
diff --git a/src/llama_stack_api/scoring_functions/__init__.py b/src/llama_stack_api/scoring_functions/__init__.py
deleted file mode 100644
index db9047e26f..0000000000
--- a/src/llama_stack_api/scoring_functions/__init__.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""ScoringFunctions API protocol and models.
-
-This module contains the ScoringFunctions protocol definition.
-Pydantic models are defined in llama_stack_api.scoring_functions.models.
-The FastAPI router is defined in llama_stack_api.scoring_functions.fastapi_routes.
-"""
-
-from . import fastapi_routes
-from .api import ScoringFunctions
-from .models import (
-    AggregationFunctionType,
-    BasicScoringFnParams,
-    CommonScoringFnFields,
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    LLMAsJudgeScoringFnParams,
-    RegexParserScoringFnParams,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    ScoringFnInput,
-    ScoringFnParams,
-    ScoringFnParamsType,
-    UnregisterScoringFunctionRequest,
-)
-
-__all__ = [
-    "ScoringFunctions",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ScoringFnParams",
-    "ScoringFnParamsType",
-    "AggregationFunctionType",
-    "LLMAsJudgeScoringFnParams",
-    "RegexParserScoringFnParams",
-    "BasicScoringFnParams",
-    "CommonScoringFnFields",
-    "ListScoringFunctionsResponse",
-    "ListScoringFunctionsRequest",
-    "GetScoringFunctionRequest",
-    "RegisterScoringFunctionRequest",
-    "UnregisterScoringFunctionRequest",
-    "fastapi_routes",
-]
diff --git a/src/llama_stack_api/scoring_functions/api.py b/src/llama_stack_api/scoring_functions/api.py
deleted file mode 100644
index 6ca8bcc9d2..0000000000
--- a/src/llama_stack_api/scoring_functions/api.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-from typing import Protocol, runtime_checkable
-
-from .models import (
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    UnregisterScoringFunctionRequest,
-)
-
-
-@runtime_checkable
-class ScoringFunctions(Protocol):
-    async def list_scoring_functions(
-        self,
-        request: ListScoringFunctionsRequest,
-    ) -> ListScoringFunctionsResponse: ...
-
-    async def get_scoring_function(
-        self,
-        request: GetScoringFunctionRequest,
-    ) -> ScoringFn: ...
-
-    async def register_scoring_function(
-        self,
-        request: RegisterScoringFunctionRequest,
-    ) -> None: ...
-
-    async def unregister_scoring_function(
-        self,
-        request: UnregisterScoringFunctionRequest,
-    ) -> None: ...
diff --git a/src/llama_stack_api/scoring_functions/fastapi_routes.py b/src/llama_stack_api/scoring_functions/fastapi_routes.py
deleted file mode 100644
index 4d85d7b358..0000000000
--- a/src/llama_stack_api/scoring_functions/fastapi_routes.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""FastAPI router for the ScoringFunctions API.
-
-This module defines the FastAPI router for the ScoringFunctions API using standard
-FastAPI route decorators.
-
-The router is defined in the API package to keep all API-related code together.
-"""
-
-from typing import Annotated
-
-from fastapi import APIRouter, Body, Depends
-
-from llama_stack_api.router_utils import create_path_dependency, create_query_dependency, standard_responses
-from llama_stack_api.version import LLAMA_STACK_API_V1
-
-from .api import ScoringFunctions
-from .models import (
-    GetScoringFunctionRequest,
-    ListScoringFunctionsRequest,
-    ListScoringFunctionsResponse,
-    RegisterScoringFunctionRequest,
-    ScoringFn,
-    UnregisterScoringFunctionRequest,
-)
-
-get_list_scoring_functions_request = create_query_dependency(ListScoringFunctionsRequest)
-get_get_scoring_function_request = create_path_dependency(GetScoringFunctionRequest)
-get_unregister_scoring_function_request = create_path_dependency(UnregisterScoringFunctionRequest)
-
-
-def create_router(impl: ScoringFunctions) -> APIRouter:
-    """Create a FastAPI router for the ScoringFunctions API.
-
-    Args:
-        impl: The ScoringFunctions implementation instance
-
-    Returns:
-        APIRouter configured for the ScoringFunctions API
-    """
-    router = APIRouter(
-        prefix=f"/{LLAMA_STACK_API_V1}",
-        tags=["Scoring Functions"],
-        responses=standard_responses,
-    )
-
-    @router.get(
-        "/scoring-functions",
-        response_model=ListScoringFunctionsResponse,
-        summary="List all scoring functions.",
-        description="List all scoring functions.",
-        responses={
-            200: {"description": "A ListScoringFunctionsResponse."},
-        },
-    )
-    async def list_scoring_functions(
-        request: Annotated[ListScoringFunctionsRequest, Depends(get_list_scoring_functions_request)],
-    ) -> ListScoringFunctionsResponse:
-        return await impl.list_scoring_functions(request)
-
-    @router.get(
-        "/scoring-functions/{scoring_fn_id:path}",
-        response_model=ScoringFn,
-        summary="Get a scoring function by its ID.",
-        description="Get a scoring function by its ID.",
-        responses={
-            200: {"description": "A ScoringFn."},
-        },
-    )
-    async def get_scoring_function(
-        request: Annotated[GetScoringFunctionRequest, Depends(get_get_scoring_function_request)],
-    ) -> ScoringFn:
-        return await impl.get_scoring_function(request)
-
-    @router.post(
-        "/scoring-functions",
-        summary="Register a scoring function.",
-        description="Register a scoring function.",
-        responses={
-            200: {"description": "The scoring function was successfully registered."},
-        },
-        deprecated=True,
-    )
-    async def register_scoring_function(
-        request: Annotated[RegisterScoringFunctionRequest, Body(...)],
-    ) -> None:
-        return await impl.register_scoring_function(request)
-
-    @router.delete(
-        "/scoring-functions/{scoring_fn_id:path}",
-        summary="Unregister a scoring function.",
-        description="Unregister a scoring function.",
-        responses={
-            200: {"description": "The scoring function was successfully unregistered."},
-        },
-        deprecated=True,
-    )
-    async def unregister_scoring_function(
-        request: Annotated[UnregisterScoringFunctionRequest, Depends(get_unregister_scoring_function_request)],
-    ) -> None:
-        return await impl.unregister_scoring_function(request)
-
-    return router
diff --git a/src/llama_stack_api/scoring_functions/models.py b/src/llama_stack_api/scoring_functions/models.py
deleted file mode 100644
index f821f9ecb7..0000000000
--- a/src/llama_stack_api/scoring_functions/models.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""Pydantic models for ScoringFunctions API requests and responses.
-
-This module defines the request and response models for the ScoringFunctions API
-using Pydantic with Field descriptions for OpenAPI schema generation.
-"""
-
-from enum import StrEnum
-from typing import Annotated, Any, Literal
-
-from pydantic import BaseModel, Field
-
-from llama_stack_api.common.type_system import ParamType
-from llama_stack_api.resource import Resource, ResourceType
-from llama_stack_api.schema_utils import json_schema_type, register_schema
-
-
-@json_schema_type
-class ScoringFnParamsType(StrEnum):
-    """Types of scoring function parameter configurations.
-    :cvar llm_as_judge: Use an LLM model to evaluate and score responses
-    :cvar regex_parser: Use regex patterns to extract and score specific parts of responses
-    :cvar basic: Basic scoring with simple aggregation functions
-    """
-
-    llm_as_judge = "llm_as_judge"
-    regex_parser = "regex_parser"
-    basic = "basic"
-
-
-@json_schema_type
-class AggregationFunctionType(StrEnum):
-    """Types of aggregation functions for scoring results.
-    :cvar average: Calculate the arithmetic mean of scores
-    :cvar weighted_average: Calculate a weighted average of scores
-    :cvar median: Calculate the median value of scores
-    :cvar categorical_count: Count occurrences of categorical values
-    :cvar accuracy: Calculate accuracy as the proportion of correct answers
-    """
-
-    average = "average"
-    weighted_average = "weighted_average"
-    median = "median"
-    categorical_count = "categorical_count"
-    accuracy = "accuracy"
-
-
-@json_schema_type
-class LLMAsJudgeScoringFnParams(BaseModel):
-    """Parameters for LLM-as-judge scoring function configuration.
-    :param type: The type of scoring function parameters, always llm_as_judge
-    :param judge_model: Identifier of the LLM model to use as a judge for scoring
-    :param prompt_template: (Optional) Custom prompt template for the judge model
-    :param judge_score_regexes: Regexes to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.llm_as_judge] = ScoringFnParamsType.llm_as_judge
-    judge_model: str
-    prompt_template: str | None = None
-    judge_score_regexes: list[str] = Field(
-        description="Regexes to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class RegexParserScoringFnParams(BaseModel):
-    """Parameters for regex parser scoring function configuration.
-    :param type: The type of scoring function parameters, always regex_parser
-    :param parsing_regexes: Regex to extract the answer from generated response
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.regex_parser] = ScoringFnParamsType.regex_parser
-    parsing_regexes: list[str] = Field(
-        description="Regex to extract the answer from generated response",
-        default_factory=lambda: [],
-    )
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=lambda: [],
-    )
-
-
-@json_schema_type
-class BasicScoringFnParams(BaseModel):
-    """Parameters for basic scoring function configuration.
-    :param type: The type of scoring function parameters, always basic
-    :param aggregation_functions: Aggregation functions to apply to the scores of each row
-    """
-
-    type: Literal[ScoringFnParamsType.basic] = ScoringFnParamsType.basic
-    aggregation_functions: list[AggregationFunctionType] = Field(
-        description="Aggregation functions to apply to the scores of each row",
-        default_factory=list,
-    )
-
-
-ScoringFnParams = Annotated[
-    LLMAsJudgeScoringFnParams | RegexParserScoringFnParams | BasicScoringFnParams,
-    Field(discriminator="type"),
-]
-register_schema(ScoringFnParams, name="ScoringFnParams")
-
-
-@json_schema_type
-class ListScoringFunctionsRequest(BaseModel):
-    """Request model for listing scoring functions."""
-
-    pass
-
-
-@json_schema_type
-class GetScoringFunctionRequest(BaseModel):
-    """Request model for getting a scoring function."""
-
-    scoring_fn_id: str = Field(..., description="The ID of the scoring function to get.")
-
-
-@json_schema_type
-class RegisterScoringFunctionRequest(BaseModel):
-    """Request model for registering a scoring function."""
-
-    scoring_fn_id: str = Field(..., description="The ID of the scoring function to register.")
-    description: str = Field(..., description="The description of the scoring function.")
-    return_type: ParamType = Field(..., description="The return type of the scoring function.")
-    provider_scoring_fn_id: str | None = Field(
-        default=None, description="The ID of the provider scoring function to use for the scoring function."
-    )
-    provider_id: str | None = Field(default=None, description="The ID of the provider to use for the scoring function.")
-    params: ScoringFnParams | None = Field(
-        default=None,
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval.",
-    )
-
-
-@json_schema_type
-class UnregisterScoringFunctionRequest(BaseModel):
-    """Request model for unregistering a scoring function."""
-
-    scoring_fn_id: str = Field(..., description="The ID of the scoring function to unregister.")
-
-
-class CommonScoringFnFields(BaseModel):
-    description: str | None = None
-    metadata: dict[str, Any] = Field(
-        default_factory=dict,
-        description="Any additional metadata for this definition",
-    )
-    return_type: ParamType = Field(
-        description="The return type of the deterministic function",
-    )
-    params: ScoringFnParams | None = Field(
-        description="The parameters for the scoring function for benchmark eval, these can be overridden for app eval",
-        default=None,
-    )
-
-
-@json_schema_type
-class ScoringFn(CommonScoringFnFields, Resource):
-    """A scoring function resource for evaluating model outputs.
-    :param type: The resource type, always scoring_function
-    """
-
-    type: Literal[ResourceType.scoring_function] = ResourceType.scoring_function
-
-    @property
-    def scoring_fn_id(self) -> str:
-        return self.identifier
-
-    @property
-    def provider_scoring_fn_id(self) -> str | None:
-        return self.provider_resource_id
-
-
-class ScoringFnInput(CommonScoringFnFields, BaseModel):
-    scoring_fn_id: str
-    provider_id: str | None = None
-    provider_scoring_fn_id: str | None = None
-
-
-@json_schema_type
-class ListScoringFunctionsResponse(BaseModel):
-    """Response containing a list of scoring function objects."""
-
-    data: list[ScoringFn] = Field(..., description="List of scoring function objects.")
-
-
-__all__ = [
-    "ScoringFnParamsType",
-    "AggregationFunctionType",
-    "LLMAsJudgeScoringFnParams",
-    "RegexParserScoringFnParams",
-    "BasicScoringFnParams",
-    "ScoringFnParams",
-    "ListScoringFunctionsRequest",
-    "GetScoringFunctionRequest",
-    "RegisterScoringFunctionRequest",
-    "UnregisterScoringFunctionRequest",
-    "CommonScoringFnFields",
-    "ScoringFn",
-    "ScoringFnInput",
-    "ListScoringFunctionsResponse",
-]
diff --git a/tests/backward_compat/test_eval_compat.py b/tests/backward_compat/test_eval_compat.py
deleted file mode 100644
index fa15045671..0000000000
--- a/tests/backward_compat/test_eval_compat.py
+++ /dev/null
@@ -1,533 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-"""
-Tests for Eval API backward compatibility.
-
-These tests verify that both old-style (individual parameters) and new-style
-(request objects) calling conventions work correctly, and that old-style usage
-emits appropriate deprecation warnings.
-"""
-
-import warnings
-
-import pytest
-
-from llama_stack_api import (
-    BenchmarkConfig,
-    EvaluateRowsRequest,
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    ModelCandidate,
-    RunEvalRequest,
-    resolve_evaluate_rows_request,
-    resolve_job_cancel_request,
-    resolve_job_result_request,
-    resolve_job_status_request,
-    resolve_run_eval_request,
-)
-from llama_stack_api.inference import SamplingParams, TopPSamplingStrategy
-
-
-@pytest.fixture
-def sample_benchmark_config():
-    return BenchmarkConfig(
-        eval_candidate=ModelCandidate(
-            model="test-model",
-            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-        )
-    )
-
-
-class TestResolveRunEvalRequest:
-    """Tests for resolve_run_eval_request."""
-
-    def test_new_style_with_request_object(self, sample_benchmark_config):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = RunEvalRequest(benchmark_id="bench-123", benchmark_config=sample_benchmark_config)
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_run_eval_request(request)
-
-            # No deprecation warning should be emitted
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.benchmark_config == sample_benchmark_config
-
-    def test_old_style_with_individual_params(self, sample_benchmark_config):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_run_eval_request(
-                benchmark_id="bench-123",
-                benchmark_config=sample_benchmark_config,
-            )
-
-            # Deprecation warning should be emitted
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "run_eval" in str(deprecation_warnings[0].message)
-            assert "RunEvalRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.benchmark_config == sample_benchmark_config
-
-    def test_request_object_takes_precedence_over_individual_params(self, sample_benchmark_config):
-        """Test that request object takes precedence when both are provided."""
-        request = RunEvalRequest(benchmark_id="from-request", benchmark_config=sample_benchmark_config)
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_run_eval_request(
-                request,
-                benchmark_id="from-param",  # Should be ignored
-                benchmark_config=sample_benchmark_config,
-            )
-
-            # No deprecation warning since request object is used
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        # Request object values should be used
-        assert result.benchmark_id == "from-request"
-
-    def test_missing_parameters_raises_error(self, sample_benchmark_config):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request()
-        assert "Either 'request'" in str(exc_info.value)
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(benchmark_id="bench-123")  # missing benchmark_config
-        assert "missing: benchmark_config" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(benchmark_config=sample_benchmark_config)  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: benchmark_config" in str(exc_info.value)
-
-
-class TestResolveEvaluateRowsRequest:
-    """Tests for resolve_evaluate_rows_request."""
-
-    def test_new_style_with_request_object(self, sample_benchmark_config):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = EvaluateRowsRequest(
-            benchmark_id="bench-123",
-            input_rows=[{"test": "data"}],
-            scoring_functions=["func1"],
-            benchmark_config=sample_benchmark_config,
-        )
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_evaluate_rows_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.input_rows == [{"test": "data"}]
-        assert result.scoring_functions == ["func1"]
-
-    def test_old_style_with_individual_params(self, sample_benchmark_config):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[{"test": "data"}],
-                scoring_functions=["func1"],
-                benchmark_config=sample_benchmark_config,
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "evaluate_rows" in str(deprecation_warnings[0].message)
-            assert "EvaluateRowsRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.input_rows == [{"test": "data"}]
-        assert result.scoring_functions == ["func1"]
-
-    def test_request_object_takes_precedence_over_individual_params(self, sample_benchmark_config):
-        """Test that request object takes precedence when both are provided."""
-        request = EvaluateRowsRequest(
-            benchmark_id="from-request",
-            input_rows=[{"from": "request"}],
-            scoring_functions=["request-func"],
-            benchmark_config=sample_benchmark_config,
-        )
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_evaluate_rows_request(
-                request,
-                benchmark_id="from-param",
-                input_rows=[{"from": "param"}],
-                scoring_functions=["param-func"],
-                benchmark_config=sample_benchmark_config,
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.input_rows == [{"from": "request"}]
-
-    def test_missing_parameters_raises_error(self, sample_benchmark_config):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[{"test": "data"}],
-                # missing scoring_functions and benchmark_config
-            )
-        assert "missing: scoring_functions, benchmark_config" in str(exc_info.value)
-
-
-class TestResolveJobStatusRequest:
-    """Tests for resolve_job_status_request."""
-
-    def test_new_style_with_request_object(self):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = JobStatusRequest(benchmark_id="bench-123", job_id="job-456")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_status_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_old_style_with_individual_params(self):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_status_request(benchmark_id="bench-123", job_id="job-456")
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "job_status" in str(deprecation_warnings[0].message)
-            assert "JobStatusRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_request_object_takes_precedence_over_individual_params(self):
-        """Test that request object takes precedence when both are provided."""
-        request = JobStatusRequest(benchmark_id="from-request", job_id="job-from-request")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_status_request(
-                request,
-                benchmark_id="from-param",
-                job_id="job-from-param",
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.job_id == "job-from-request"
-
-    def test_missing_parameters_raises_error(self):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_status_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_status_request(benchmark_id="bench-123")  # missing job_id
-        assert "missing: job_id" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_status_request(job_id="job-456")  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: job_id" in str(exc_info.value)
-
-
-class TestResolveJobCancelRequest:
-    """Tests for resolve_job_cancel_request."""
-
-    def test_new_style_with_request_object(self):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = JobCancelRequest(benchmark_id="bench-123", job_id="job-456")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_cancel_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_old_style_with_individual_params(self):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_cancel_request(benchmark_id="bench-123", job_id="job-456")
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "job_cancel" in str(deprecation_warnings[0].message)
-            assert "JobCancelRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_request_object_takes_precedence_over_individual_params(self):
-        """Test that request object takes precedence when both are provided."""
-        request = JobCancelRequest(benchmark_id="from-request", job_id="job-from-request")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_cancel_request(
-                request,
-                benchmark_id="from-param",
-                job_id="job-from-param",
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.job_id == "job-from-request"
-
-    def test_missing_parameters_raises_error(self):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_cancel_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_cancel_request(benchmark_id="bench-123")  # missing job_id
-        assert "missing: job_id" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_cancel_request(job_id="job-456")  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: job_id" in str(exc_info.value)
-
-
-class TestResolveJobResultRequest:
-    """Tests for resolve_job_result_request."""
-
-    def test_new_style_with_request_object(self):
-        """Test that new-style (request object) works without deprecation warning."""
-        request = JobResultRequest(benchmark_id="bench-123", job_id="job-456")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_result_request(request)
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_old_style_with_individual_params(self):
-        """Test that old-style (individual parameters) works and emits deprecation warning."""
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_result_request(benchmark_id="bench-123", job_id="job-456")
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 1
-            assert "job_result" in str(deprecation_warnings[0].message)
-            assert "JobResultRequest" in str(deprecation_warnings[0].message)
-
-        assert result.benchmark_id == "bench-123"
-        assert result.job_id == "job-456"
-
-    def test_request_object_takes_precedence_over_individual_params(self):
-        """Test that request object takes precedence when both are provided."""
-        request = JobResultRequest(benchmark_id="from-request", job_id="job-from-request")
-
-        with warnings.catch_warnings(record=True) as w:
-            warnings.simplefilter("always")
-            result = resolve_job_result_request(
-                request,
-                benchmark_id="from-param",
-                job_id="job-from-param",
-            )
-
-            deprecation_warnings = [x for x in w if issubclass(x.category, DeprecationWarning)]
-            assert len(deprecation_warnings) == 0
-
-        assert result.benchmark_id == "from-request"
-        assert result.job_id == "job-from-request"
-
-    def test_missing_parameters_raises_error(self):
-        """Test that missing parameters raises ValueError with helpful message."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_result_request()
-        assert "missing:" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_result_request(benchmark_id="bench-123")  # missing job_id
-        assert "missing: job_id" in str(exc_info.value)
-        assert "provided: benchmark_id" in str(exc_info.value)
-
-        with pytest.raises(ValueError) as exc_info:
-            resolve_job_result_request(job_id="job-456")  # missing benchmark_id
-        assert "missing: benchmark_id" in str(exc_info.value)
-        assert "provided: job_id" in str(exc_info.value)
-
-
-class TestEmptyValueValidation:
-    """Tests for validation of None, empty strings, and empty lists."""
-
-    def test_empty_benchmark_id_old_style(self, sample_benchmark_config):
-        """Empty benchmark_id is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(benchmark_id="", benchmark_config=sample_benchmark_config)
-        # Empty string is falsy, so it's treated as missing
-        assert "benchmark_id" in str(exc_info.value)
-
-    def test_empty_benchmark_id_in_request_object(self, sample_benchmark_config):
-        """Empty benchmark_id in request object (via model_construct) is rejected."""
-        request = RunEvalRequest.model_construct(
-            benchmark_id="",
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(request)
-        assert "benchmark_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    def test_none_benchmark_id_in_request_object(self, sample_benchmark_config):
-        """None benchmark_id in request object (via model_construct) is rejected."""
-        request = RunEvalRequest.model_construct(
-            benchmark_id=None,
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_run_eval_request(request)
-        assert "benchmark_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    @pytest.mark.parametrize(
-        "resolver,request_class",
-        [
-            (resolve_job_status_request, JobStatusRequest),
-            (resolve_job_cancel_request, JobCancelRequest),
-            (resolve_job_result_request, JobResultRequest),
-        ],
-    )
-    def test_empty_job_id_old_style(self, resolver, request_class):
-        """Empty job_id is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolver(benchmark_id="bench-123", job_id="")
-        # Empty string is falsy, so it's treated as missing
-        assert "job_id" in str(exc_info.value)
-
-    @pytest.mark.parametrize(
-        "resolver,request_class",
-        [
-            (resolve_job_status_request, JobStatusRequest),
-            (resolve_job_cancel_request, JobCancelRequest),
-            (resolve_job_result_request, JobResultRequest),
-        ],
-    )
-    def test_empty_job_id_in_request_object(self, resolver, request_class):
-        """Empty job_id in request object (via model_construct) is rejected."""
-        request = request_class.model_construct(
-            benchmark_id="bench-123",
-            job_id="",
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolver(request)
-        assert "job_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    @pytest.mark.parametrize(
-        "resolver,request_class",
-        [
-            (resolve_job_status_request, JobStatusRequest),
-            (resolve_job_cancel_request, JobCancelRequest),
-            (resolve_job_result_request, JobResultRequest),
-        ],
-    )
-    def test_none_job_id_in_request_object(self, resolver, request_class):
-        """None job_id in request object (via model_construct) is rejected."""
-        request = request_class.model_construct(
-            benchmark_id="bench-123",
-            job_id=None,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolver(request)
-        assert "job_id" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    def test_empty_input_rows_old_style(self, sample_benchmark_config):
-        """Empty input_rows is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[],
-                scoring_functions=["func1"],
-                benchmark_config=sample_benchmark_config,
-            )
-        # Empty list is falsy, so it's treated as missing
-        assert "input_rows" in str(exc_info.value)
-
-    def test_empty_scoring_functions_old_style(self, sample_benchmark_config):
-        """Empty scoring_functions is rejected when using old-style parameters (treated as missing)."""
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(
-                benchmark_id="bench-123",
-                input_rows=[{"test": "data"}],
-                scoring_functions=[],
-                benchmark_config=sample_benchmark_config,
-            )
-        # Empty list is falsy, so it's treated as missing
-        assert "scoring_functions" in str(exc_info.value)
-
-    def test_empty_input_rows_in_request_object(self, sample_benchmark_config):
-        """Empty input_rows in request object (via model_construct) is rejected."""
-        request = EvaluateRowsRequest.model_construct(
-            benchmark_id="bench-123",
-            input_rows=[],
-            scoring_functions=["func1"],
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(request)
-        assert "input_rows" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
-
-    def test_none_input_rows_in_request_object(self, sample_benchmark_config):
-        """None input_rows in request object (via model_construct) is rejected."""
-        request = EvaluateRowsRequest.model_construct(
-            benchmark_id="bench-123",
-            input_rows=None,
-            scoring_functions=["func1"],
-            benchmark_config=sample_benchmark_config,
-        )
-        with pytest.raises(ValueError) as exc_info:
-            resolve_evaluate_rows_request(request)
-        assert "input_rows" in str(exc_info.value)
-        assert "cannot be None or empty" in str(exc_info.value)
diff --git a/tests/external/llama-stack-provider-lmeval/config.yaml b/tests/external/llama-stack-provider-lmeval/config.yaml
index 966689bae7..ea2fe77f40 100644
--- a/tests/external/llama-stack-provider-lmeval/config.yaml
+++ b/tests/external/llama-stack-provider-lmeval/config.yaml
@@ -2,19 +2,11 @@ version: 2
 distro_name: external-provider-test
 apis:
 - inference
-- eval
 providers:
   inference:
   - provider_id: ollama
     provider_type: remote::ollama
     config:
       base_url: ${env.OLLAMA_URL:=http://localhost:11434}
-  eval:
-  - provider_id: trustyai_lmeval
-    provider_type: remote::trustyai_lmeval
-    module: llama_stack_provider_lmeval
-    config:
-      use_k8s: ${env.TRUSTYAI_LMEVAL_USE_K8S:=false}
-      base_url: ${env.OLLAMA_URL:=http://localhost:11434}
 server:
   port: 8321
diff --git a/tests/integration/datasets/__init__.py b/tests/integration/datasets/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/tests/integration/datasets/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/tests/integration/datasets/test_dataset.csv b/tests/integration/datasets/test_dataset.csv
deleted file mode 100644
index 7fc1c3623a..0000000000
--- a/tests/integration/datasets/test_dataset.csv
+++ /dev/null
@@ -1,6 +0,0 @@
-input_query,generated_answer,expected_answer,chat_completion_input
-What is the capital of France?,London,Paris,"[{""role"": ""user"", ""content"": ""What is the capital of France?""}]"
-Who is the CEO of Meta?,Mark Zuckerberg,Mark Zuckerberg,"[{""role"": ""user"", ""content"": ""Who is the CEO of Meta?""}]"
-What is the largest planet in our solar system?,Jupiter,Jupiter,"[{""role"": ""user"", ""content"": ""What is the largest planet in our solar system?""}]"
-What is the smallest country in the world?,China,Vatican City,"[{""role"": ""user"", ""content"": ""What is the smallest country in the world?""}]"
-What is the currency of Japan?,Yen,Yen,"[{""role"": ""user"", ""content"": ""What is the currency of Japan?""}]"
diff --git a/tests/integration/datasets/test_datasets.py b/tests/integration/datasets/test_datasets.py
deleted file mode 100644
index 3ad5570f07..0000000000
--- a/tests/integration/datasets/test_datasets.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-import base64
-import mimetypes
-import os
-
-import pytest
-
-# How to run this test:
-#
-# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/datasets
-
-
-def data_url_from_file(file_path: str) -> str:
-    if not os.path.exists(file_path):
-        raise FileNotFoundError(f"File not found: {file_path}")
-
-    with open(file_path, "rb") as file:
-        file_content = file.read()
-
-    base64_content = base64.b64encode(file_content).decode("utf-8")
-    mime_type, _ = mimetypes.guess_type(file_path)
-
-    data_url = f"data:{mime_type};base64,{base64_content}"
-
-    return data_url
-
-
-@pytest.mark.parametrize(
-    "purpose, source, provider_id, limit",
-    [
-        (
-            "eval/messages-answer",
-            {
-                "type": "uri",
-                "uri": "huggingface://datasets/llamastack/simpleqa?split=train",
-            },
-            "huggingface",
-            10,
-        ),
-        (
-            "eval/messages-answer",
-            {
-                "type": "rows",
-                "rows": [
-                    {
-                        "messages": [{"role": "user", "content": "Hello, world!"}],
-                        "answer": "Hello, world!",
-                    },
-                    {
-                        "messages": [
-                            {
-                                "role": "user",
-                                "content": "What is the capital of France?",
-                            }
-                        ],
-                        "answer": "Paris",
-                    },
-                ],
-            },
-            "localfs",
-            2,
-        ),
-        (
-            "eval/messages-answer",
-            {
-                "type": "uri",
-                "uri": data_url_from_file(os.path.join(os.path.dirname(__file__), "test_dataset.csv")),
-            },
-            "localfs",
-            5,
-        ),
-    ],
-)
-def test_register_and_iterrows(llama_stack_client, purpose, source, provider_id, limit):
-    dataset = llama_stack_client.beta.datasets.register(
-        purpose=purpose,
-        source=source,
-    )
-    assert dataset.identifier is not None
-    assert dataset.provider_id == provider_id
-    iterrow_response = llama_stack_client.beta.datasets.iterrows(dataset.identifier, limit=limit)
-    assert len(iterrow_response.data) == limit
-
-    dataset_list = llama_stack_client.beta.datasets.list()
-    assert dataset.identifier in [d.identifier for d in dataset_list]
-
-    llama_stack_client.beta.datasets.unregister(dataset.identifier)
-    dataset_list = llama_stack_client.beta.datasets.list()
-    assert dataset.identifier not in [d.identifier for d in dataset_list]
diff --git a/tests/integration/datasets/test_rag_dataset.csv b/tests/integration/datasets/test_rag_dataset.csv
deleted file mode 100644
index a0e1fce72b..0000000000
--- a/tests/integration/datasets/test_rag_dataset.csv
+++ /dev/null
@@ -1,6 +0,0 @@
-input_query,context,generated_answer,expected_answer
-What is the capital of France?,"France is a country in Western Europe with a population of about 67 million people. Its capital city has been a major European cultural center since the 17th century and is known for landmarks like the Eiffel Tower and the Louvre Museum.",London,Paris
-Who is the CEO of Meta?,"Meta Platforms, formerly known as Facebook, is one of the world's largest technology companies. Founded by Mark Zuckerberg in 2004, the company has expanded to include platforms like Instagram, WhatsApp, and virtual reality technologies.",Mark Zuckerberg,Mark Zuckerberg
-What is the largest planet in our solar system?,"The solar system consists of eight planets orbiting around the Sun. These planets, in order from the Sun, are Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Gas giants are significantly larger than terrestrial planets.",Jupiter,Jupiter
-What is the smallest country in the world?,"Independent city-states and micronations are among the world's smallest sovereign territories. Some notable examples include Monaco, San Marino, and Vatican City, which is an enclave within Rome, Italy.",China,Vatican City
-What is the currency of Japan?,"Japan is an island country in East Asia with a rich cultural heritage and one of the world's largest economies. Its financial system has been established since the Meiji period, with its modern currency being introduced in 1871.",Yen,Yen
diff --git a/tests/integration/eval/__init__.py b/tests/integration/eval/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/tests/integration/eval/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/tests/integration/eval/constants.py b/tests/integration/eval/constants.py
deleted file mode 100644
index 0fb1a44c49..0000000000
--- a/tests/integration/eval/constants.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-JUDGE_PROMPT = """
-You will be given a question, a expected_answer, and a system_answer.
-Your task is to provide a 'total rating' scoring how well the system_answer answers compared with ground truth in expected_answer in terms of factual correctness to the question.
-Give your answer as a integer on a scale of 0 to 5, where 0 means that the system_answer is not correct at all compared with expected_answer, and 5 means that the answer completely and correctly answers the question.
-Provide your feedback as follows:
-Feedback:::
-Total rating: (your rating, as a int between 0 and 5)
-Now here are the question, expected_answer, system_answer.
-Question: {input_query}
-Expected Answer: {expected_answer}
-System Answer: {generated_answer}
-Feedback:::
-Total rating:
-"""
diff --git a/tests/integration/eval/recordings/0a2ea52bcc4c7e04d0b4b844ad94bed06bcbaa03d13b228f61e2b36e23093469.json b/tests/integration/eval/recordings/0a2ea52bcc4c7e04d0b4b844ad94bed06bcbaa03d13b228f61e2b36e23093469.json
deleted file mode 100644
index e9e69b231d..0000000000
--- a/tests/integration/eval/recordings/0a2ea52bcc4c7e04d0b4b844ad94bed06bcbaa03d13b228f61e2b36e23093469.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-0a2ea52bcc4c",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The largest planet in our solar system is Jupiter. It has a diameter of approximately 142,984 kilometers (88,846 miles), which is more than 11 times the diameter of Earth and about 2.5 times the mass of all the other planets in our solar system combined.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 59,
-          "prompt_tokens": 35,
-          "total_tokens": 94,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json b/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json
deleted file mode 100644
index 3ef9e1e026..0000000000
--- a/tests/integration/eval/recordings/171c4dcb3dc848196f5d7fd87efd4626e70673c405ae1cd72b8dd0617104263e.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-171c4dcb3dc8",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json b/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json
deleted file mode 100644
index 86c415a4d1..0000000000
--- a/tests/integration/eval/recordings/1b2720589d2a4273b5eb2c06b50ab45674040195c15013c9ea43bc6331e1a831.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the currency of Japan?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-1b2720589d2a",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The currency of Japan is the Japanese yen (\u00a5).",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 12,
-          "prompt_tokens": 32,
-          "total_tokens": 44,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json b/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json
deleted file mode 100644
index c7c8576425..0000000000
--- a/tests/integration/eval/recordings/3e5ea35cb3dc92835d230456b6e2fc61593f964148d6c05df5c4a387a5389e6b.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the smallest country in the world?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-3e5ea35cb3dc",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The smallest country in the world is the Vatican City, with an area of approximately 0.44 km\u00b2 (0.17 square miles). It is an independent city-state located within Rome, Italy, and serves as the headquarters of the Catholic Church. The Vatican City has a population of around 800 people and is home to numerous iconic landmarks, including St. Peter's Basilica and the Sistine Chapel.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 84,
-          "prompt_tokens": 34,
-          "total_tokens": 118,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/58177cd1c0d7d8de9e20515c3e8fe465b66d8436754b30ff4da28c7c03c094a4.json b/tests/integration/eval/recordings/58177cd1c0d7d8de9e20515c3e8fe465b66d8436754b30ff4da28c7c03c094a4.json
deleted file mode 100644
index df2f664e76..0000000000
--- a/tests/integration/eval/recordings/58177cd1c0d7d8de9e20515c3e8fe465b66d8436754b30ff4da28c7c03c094a4.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-58177cd1c0d7",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json b/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json
deleted file mode 100644
index 5fadb9186d..0000000000
--- a/tests/integration/eval/recordings/6de6d1ebc3128dfaba1efe654ca1453f12cd31ce2e294d20868c0c498b7d9136.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-6de6d1ebc312",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/752abf1ef7f71bbe7028eae85814e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json b/tests/integration/eval/recordings/752abf1ef7f71bbe7028eae85814e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json
deleted file mode 100644
index a9affde521..0000000000
--- a/tests/integration/eval/recordings/752abf1ef7f71bbe7028eae85814e6d567d1e8243e9b0d18f8803cb9b7c8f92f.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-752abf1ef7f7",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/94e3d8dba56da92e1014a6ee81b61fe8e95d98692c189e7308724338f918678d.json b/tests/integration/eval/recordings/94e3d8dba56da92e1014a6ee81b61fe8e95d98692c189e7308724338f918678d.json
deleted file mode 100644
index ae2fe160cb..0000000000
--- a/tests/integration/eval/recordings/94e3d8dba56da92e1014a6ee81b61fe8e95d98692c189e7308724338f918678d.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-94e3d8dba56d",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json b/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json
deleted file mode 100644
index cc55f2a777..0000000000
--- a/tests/integration/eval/recordings/9ebe1e04fc3a8d41f88992428a7c99669c7e19b3d551090eb6bec83b33de2a18.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-9ebe1e04fc3a",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json b/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json
deleted file mode 100644
index 56746ef9e4..0000000000
--- a/tests/integration/eval/recordings/aa20023c358a0dc718355082cc244a231426700a772b8dc64abf05d8b126a736.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-aa20023c358a",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The largest planet in our solar system is Jupiter. It has a diameter of approximately 142,984 kilometers (88,846 miles), which is more than 11 times the diameter of the Earth! Jupiter is a gas giant and is known for its massive size, stormy atmosphere, and numerous moons.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 62,
-          "prompt_tokens": 35,
-          "total_tokens": 97,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json b/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json
deleted file mode 100644
index f6290abcd8..0000000000
--- a/tests/integration/eval/recordings/b52a054b314c8b42634c4a9ef76280591f73cf26c00b7308dde7d19a1ced016c.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the capital of France?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-b52a054b314c",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The capital of France is Paris.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 8,
-          "prompt_tokens": 32,
-          "total_tokens": 40,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/bf6b37511a044df8ad1c6113d3936b1e7f4a9d4f7f8ba8bd844d50265067f417.json b/tests/integration/eval/recordings/bf6b37511a044df8ad1c6113d3936b1e7f4a9d4f7f8ba8bd844d50265067f417.json
deleted file mode 100644
index 1a95b7cadd..0000000000
--- a/tests/integration/eval/recordings/bf6b37511a044df8ad1c6113d3936b1e7f4a9d4f7f8ba8bd844d50265067f417.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the smallest country in the world?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-bf6b37511a04",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The smallest country in the world is the Vatican City, with an area of approximately 0.44 km\u00b2 (0.17 sq mi). It is an independent city-state located within Rome, Italy, and is home to the Pope and the central government of the Catholic Church.\n\nVatican City is so small that it's actually the smallest internationally recognized sovereign state in the world, with a population of just over 800 people. Despite its tiny size, it has its own government, currency, postal system, and even its own branch of the military, known as the Pontifical Swiss Guard.\n\nInterestingly, Vatican City is also home to numerous famous landmarks, including St. Peter's Basilica, the Sistine Chapel, and the Vatican Museums, which attract millions of visitors each year.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 160,
-          "prompt_tokens": 34,
-          "total_tokens": 194,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json b/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json
deleted file mode 100644
index 532a7d6b3f..0000000000
--- a/tests/integration/eval/recordings/c07b01fe99467efcfa99f6ac9c60acc212cf2ac3bdd4192aabb5f98359236572.json
+++ /dev/null
@@ -1,57 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 0
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-c07b01fe9946",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Jupiter is the largest planet in our solar system, with a diameter of approximately 142,984 kilometers (88,846 miles). It is a gas giant and composed mainly of hydrogen and helium. Jupiter's large size and mass are more than 2.5 times that of all the other planets in our solar system combined.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 67,
-          "prompt_tokens": 35,
-          "total_tokens": 102,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  }
-}
diff --git a/tests/integration/eval/recordings/c4ef767672c890e77ceaa15b6239e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json b/tests/integration/eval/recordings/c4ef767672c890e77ceaa15b6239e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json
deleted file mode 100644
index 0663e23c22..0000000000
--- a/tests/integration/eval/recordings/c4ef767672c890e77ceaa15b6239e9d5a9a5ad6ee7bcad0b12853979b1e43ede.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_benchmark[txt=ollama/llama3.2:3b-instruct-fp16-basic::subset_of]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the currency of Japan?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-c4ef767672c8",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The currency of Japan is the Yen (JPY). It is divided into 100 sen, but the sen is no longer in circulation. The yen is widely accepted and used for most transactions in Japan.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 42,
-          "prompt_tokens": 32,
-          "total_tokens": 74,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/cbf92825593fd79fe76e0ad0193ebe742743cd3042654efefa86714e357b86f6.json b/tests/integration/eval/recordings/cbf92825593fd79fe76e0ad0193ebe742743cd3042654efefa86714e357b86f6.json
deleted file mode 100644
index ace935a78b..0000000000
--- a/tests/integration/eval/recordings/cbf92825593fd79fe76e0ad0193ebe742743cd3042654efefa86714e357b86f6.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "What is the largest planet in our solar system?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-cbf92825593f",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "The largest planet in our solar system is Jupiter. It is a gas giant, with a diameter of approximately 142,984 kilometers (88,846 miles). This is more than 11 times the diameter of Earth and is the fifth-largest object in the solar system overall. Despite its large size, Jupiter is relatively lightweight compared to solid objects, due to its composition of mostly hydrogen and helium gases.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 81,
-          "prompt_tokens": 35,
-          "total_tokens": 116,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/recordings/dcf3c9afad420e66c3cc7434a48169a1235798c2b3ad9abbb29acf1f1b2952fa.json b/tests/integration/eval/recordings/dcf3c9afad420e66c3cc7434a48169a1235798c2b3ad9abbb29acf1f1b2952fa.json
deleted file mode 100644
index 92d07571b2..0000000000
--- a/tests/integration/eval/recordings/dcf3c9afad420e66c3cc7434a48169a1235798c2b3ad9abbb29acf1f1b2952fa.json
+++ /dev/null
@@ -1,58 +0,0 @@
-{
-  "test_id": "tests/integration/eval/test_eval.py::test_evaluate_rows[txt=ollama/llama3.2:3b-instruct-fp16-basic::equality]",
-  "request": {
-    "method": "POST",
-    "url": "http://0.0.0.0:11434/v1/v1/chat/completions",
-    "headers": {},
-    "body": {
-      "model": "llama3.2:3b-instruct-fp16",
-      "messages": [
-        {
-          "role": "user",
-          "content": "Who is the CEO of Meta?"
-        }
-      ],
-      "max_tokens": 512
-    },
-    "endpoint": "/v1/chat/completions",
-    "model": "llama3.2:3b-instruct-fp16"
-  },
-  "response": {
-    "body": {
-      "__type__": "openai.types.chat.chat_completion.ChatCompletion",
-      "__data__": {
-        "id": "rec-dcf3c9afad42",
-        "choices": [
-          {
-            "finish_reason": "stop",
-            "index": 0,
-            "logprobs": null,
-            "message": {
-              "content": "Mark Zuckerberg is the founder, chairman and CEO of Meta, which he originally founded as Facebook in 2004.",
-              "refusal": null,
-              "role": "assistant",
-              "annotations": null,
-              "audio": null,
-              "function_call": null,
-              "tool_calls": null
-            }
-          }
-        ],
-        "created": 0,
-        "model": "llama3.2:3b-instruct-fp16",
-        "object": "chat.completion",
-        "service_tier": null,
-        "system_fingerprint": "fp_ollama",
-        "usage": {
-          "completion_tokens": 24,
-          "prompt_tokens": 32,
-          "total_tokens": 56,
-          "completion_tokens_details": null,
-          "prompt_tokens_details": null
-        }
-      }
-    },
-    "is_streaming": false
-  },
-  "id_normalization_mapping": {}
-}
diff --git a/tests/integration/eval/test_eval.py b/tests/integration/eval/test_eval.py
deleted file mode 100644
index e042008dd2..0000000000
--- a/tests/integration/eval/test_eval.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-import uuid
-from pathlib import Path
-
-import pytest
-
-from ..datasets.test_datasets import data_url_from_file
-
-# How to run this test:
-#
-# LLAMA_STACK_CONFIG="template-name" pytest -v tests/integration/eval
-
-
-@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
-def test_evaluate_rows(llama_stack_client, text_model_id, scoring_fn_id):
-    dataset = llama_stack_client.beta.datasets.register(
-        purpose="eval/messages-answer",
-        source={
-            "type": "uri",
-            "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
-        },
-    )
-    response = llama_stack_client.beta.datasets.list()
-    assert any(x.identifier == dataset.identifier for x in response)
-
-    rows = llama_stack_client.beta.datasets.iterrows(
-        dataset_id=dataset.identifier,
-        limit=3,
-    )
-    assert len(rows.data) == 3
-
-    scoring_functions = [
-        scoring_fn_id,
-    ]
-    benchmark_id = str(uuid.uuid4())
-    llama_stack_client.alpha.benchmarks.register(
-        benchmark_id=benchmark_id,
-        dataset_id=dataset.identifier,
-        scoring_functions=scoring_functions,
-    )
-    list_benchmarks = llama_stack_client.alpha.benchmarks.list()
-    assert any(x.identifier == benchmark_id for x in list_benchmarks)
-
-    response = llama_stack_client.alpha.eval.evaluate_rows(
-        benchmark_id=benchmark_id,
-        input_rows=rows.data,
-        scoring_functions=scoring_functions,
-        benchmark_config={
-            "eval_candidate": {
-                "type": "model",
-                "model": text_model_id,
-                "sampling_params": {
-                    "temperature": 0.0,
-                    "max_tokens": 512,
-                },
-            },
-        },
-    )
-
-    assert len(response.generations) == 3
-    assert scoring_fn_id in response.scores
-
-
-@pytest.mark.parametrize("scoring_fn_id", ["basic::subset_of"])
-def test_evaluate_benchmark(llama_stack_client, text_model_id, scoring_fn_id):
-    dataset = llama_stack_client.beta.datasets.register(
-        purpose="eval/messages-answer",
-        source={
-            "type": "uri",
-            "uri": data_url_from_file(Path(__file__).parent.parent / "datasets" / "test_dataset.csv"),
-        },
-    )
-    benchmark_id = str(uuid.uuid4())
-    llama_stack_client.alpha.benchmarks.register(
-        benchmark_id=benchmark_id,
-        dataset_id=dataset.identifier,
-        scoring_functions=[scoring_fn_id],
-    )
-
-    response = llama_stack_client.alpha.eval.run_eval(
-        benchmark_id=benchmark_id,
-        benchmark_config={
-            "eval_candidate": {
-                "type": "model",
-                "model": text_model_id,
-                "sampling_params": {
-                    "temperature": 0.0,
-                    "max_tokens": 512,
-                },
-            },
-        },
-    )
-    assert response.job_id == "0"
-    job_status = llama_stack_client.alpha.eval.jobs.status(job_id=response.job_id, benchmark_id=benchmark_id)
-    assert job_status and job_status.status == "completed"
-
-    eval_response = llama_stack_client.alpha.eval.jobs.retrieve(job_id=response.job_id, benchmark_id=benchmark_id)
-    assert eval_response is not None
-    assert len(eval_response.generations) == 5
-    assert scoring_fn_id in eval_response.scores
diff --git a/tests/integration/scoring/__init__.py b/tests/integration/scoring/__init__.py
deleted file mode 100644
index 756f351d88..0000000000
--- a/tests/integration/scoring/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
diff --git a/tests/integration/scoring/test_scoring.py b/tests/integration/scoring/test_scoring.py
deleted file mode 100644
index 8ca11fc897..0000000000
--- a/tests/integration/scoring/test_scoring.py
+++ /dev/null
@@ -1,251 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-
-from pathlib import Path
-
-import pandas as pd
-import pytest
-import requests
-
-
-@pytest.fixture
-def sample_judge_prompt_template():
-    return "Output a number response in the following format: Score: <number>, where <number> is the number between 0 and 9."
-
-
-@pytest.fixture
-def sample_scoring_fn_id():
-    return "llm-as-judge-test-prompt"
-
-
-def register_scoring_function(
-    llama_stack_client,
-    provider_id,
-    scoring_fn_id,
-    judge_model_id,
-    judge_prompt_template,
-):
-    llama_stack_client.scoring_functions.register(
-        scoring_fn_id=scoring_fn_id,
-        provider_id=provider_id,
-        description="LLM as judge scoring function with test prompt",
-        return_type={
-            "type": "string",
-        },
-        params={
-            "type": "llm_as_judge",
-            "judge_model": judge_model_id,
-            "prompt_template": judge_prompt_template,
-        },
-    )
-
-
-def unregister_scoring_function(llama_stack_client, scoring_fn_id):
-    try:
-        base_url = llama_stack_client.base_url
-    except AttributeError:
-        pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")
-
-    resp = requests.delete(f"{base_url}/v1/scoring-functions/{scoring_fn_id}", timeout=30)
-    assert resp.status_code in (200, 204)
-
-
-def test_scoring_functions_list(llama_stack_client):
-    response = llama_stack_client.scoring_functions.list()
-    assert isinstance(response, list)
-    assert len(response) > 0
-
-
-def test_scoring_functions_register(
-    llama_stack_client,
-    sample_scoring_fn_id,
-    judge_model_id,
-    sample_judge_prompt_template,
-):
-    llm_as_judge_provider = [
-        x
-        for x in llama_stack_client.providers.list()
-        if x.api == "scoring" and x.provider_type == "inline::llm-as-judge"
-    ]
-    if len(llm_as_judge_provider) == 0:
-        pytest.skip("No llm-as-judge provider found, cannot test registeration")
-
-    llm_as_judge_provider_id = llm_as_judge_provider[0].provider_id
-    register_scoring_function(
-        llama_stack_client,
-        llm_as_judge_provider_id,
-        sample_scoring_fn_id,
-        judge_model_id,
-        sample_judge_prompt_template,
-    )
-
-    list_response = llama_stack_client.scoring_functions.list()
-    assert isinstance(list_response, list)
-    assert len(list_response) > 0
-    assert any(x.identifier == sample_scoring_fn_id for x in list_response)
-
-
-def test_scoring_functions_unregister(
-    llama_stack_client,
-    sample_scoring_fn_id,
-    judge_model_id,
-    sample_judge_prompt_template,
-):
-    llm_as_judge_provider = [
-        x
-        for x in llama_stack_client.providers.list()
-        if x.api == "scoring" and x.provider_type == "inline::llm-as-judge"
-    ]
-    if len(llm_as_judge_provider) == 0:
-        pytest.skip("No llm-as-judge provider found, cannot test unregister")
-
-    llm_as_judge_provider_id = llm_as_judge_provider[0].provider_id
-
-    # Ensure a clean state: shared server runs can keep a prior registration, and
-    # re-registering the same identifier would fail with a 400.
-    unregister_scoring_function(llama_stack_client, sample_scoring_fn_id)
-
-    # Register first
-    register_scoring_function(
-        llama_stack_client,
-        llm_as_judge_provider_id,
-        sample_scoring_fn_id,
-        judge_model_id,
-        sample_judge_prompt_template,
-    )
-
-    # Ensure it is present
-    list_response = llama_stack_client.scoring_functions.list()
-    assert any(x.identifier == sample_scoring_fn_id for x in list_response)
-
-    # Unregister scoring fn
-    try:
-        base_url = llama_stack_client.base_url
-    except AttributeError:
-        pytest.skip("No server base_url available; cannot test HTTP unregister in library mode")
-
-    resp = requests.delete(f"{base_url}/v1/scoring-functions/{sample_scoring_fn_id}", timeout=30)
-    assert resp.status_code in (200, 204)
-    list_after = llama_stack_client.scoring_functions.list()
-    assert all(x.identifier != sample_scoring_fn_id for x in list_after)
-
-
-@pytest.mark.parametrize("scoring_fn_id", ["basic::equality"])
-def test_scoring_score(llama_stack_client, scoring_fn_id):
-    # scoring individual rows
-    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
-    rows = df.to_dict(orient="records")
-
-    scoring_functions = {
-        scoring_fn_id: None,
-    }
-
-    response = llama_stack_client.scoring.score(
-        input_rows=rows,
-        scoring_functions=scoring_functions,
-    )
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows)
-
-
-def test_scoring_score_with_params_llm_as_judge(
-    llama_stack_client,
-    sample_judge_prompt_template,
-    judge_model_id,
-):
-    # scoring individual rows
-    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
-    rows = df.to_dict(orient="records")
-
-    scoring_functions = {
-        "llm-as-judge::base": dict(
-            type="llm_as_judge",
-            judge_model=judge_model_id,
-            prompt_template=sample_judge_prompt_template,
-            judge_score_regexes=[r"Score: (\d+)"],
-            aggregation_functions=[
-                "categorical_count",
-            ],
-        )
-    }
-
-    response = llama_stack_client.scoring.score(
-        input_rows=rows,
-        scoring_functions=scoring_functions,
-    )
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows)
-
-
-@pytest.mark.parametrize(
-    "provider_id",
-    [
-        "basic",
-        "llm-as-judge",
-        "braintrust",
-    ],
-)
-def test_scoring_score_with_aggregation_functions(
-    llama_stack_client,
-    sample_judge_prompt_template,
-    judge_model_id,
-    provider_id,
-):
-    df = pd.read_csv(Path(__file__).parent.parent / "datasets" / "test_dataset.csv")
-    rows = df.to_dict(orient="records")
-
-    scoring_fns_list = [x for x in llama_stack_client.scoring_functions.list() if x.provider_id == provider_id]
-    if len(scoring_fns_list) == 0:
-        pytest.skip(f"No scoring functions found for provider {provider_id}, skipping")
-
-    scoring_functions = {}
-    aggr_fns = [
-        "accuracy",
-        "median",
-        "categorical_count",
-        "average",
-    ]
-
-    scoring_fn = scoring_fns_list[0]
-    if scoring_fn.provider_id == "llm-as-judge":
-        aggr_fns = ["categorical_count"]
-        scoring_functions[scoring_fn.identifier] = dict(
-            type="llm_as_judge",
-            judge_model=judge_model_id,
-            prompt_template=sample_judge_prompt_template,
-            judge_score_regexes=[r"Score: (\d+)"],
-            aggregation_functions=aggr_fns,
-        )
-    elif scoring_fn.provider_id == "basic" or scoring_fn.provider_id == "braintrust":
-        if "regex_parser" in scoring_fn.identifier:
-            scoring_functions[scoring_fn.identifier] = dict(
-                type="regex_parser",
-                parsing_regexes=[r"Score: (\d+)"],
-                aggregation_functions=aggr_fns,
-            )
-        else:
-            scoring_functions[scoring_fn.identifier] = dict(
-                type="basic",
-                aggregation_functions=aggr_fns,
-            )
-    else:
-        scoring_functions[scoring_fn.identifier] = None
-
-    response = llama_stack_client.scoring.score(
-        input_rows=rows,
-        scoring_functions=scoring_functions,
-    )
-
-    assert len(response.results) == len(scoring_functions)
-    for x in scoring_functions:
-        assert x in response.results
-        assert len(response.results[x].score_rows) == len(rows)
-        assert len(response.results[x].aggregated_results) == len(aggr_fns)
diff --git a/tests/unit/distribution/routers/test_routing_tables.py b/tests/unit/distribution/routers/test_routing_tables.py
index 6f2603965d..bbf5a9aecc 100644
--- a/tests/unit/distribution/routers/test_routing_tables.py
+++ b/tests/unit/distribution/routers/test_routing_tables.py
@@ -11,38 +11,23 @@
 import pytest
 
 from llama_stack.core.datatypes import RegistryEntrySource
-from llama_stack.core.routing_tables.benchmarks import BenchmarksRoutingTable
-from llama_stack.core.routing_tables.datasets import DatasetsRoutingTable
 from llama_stack.core.routing_tables.models import ModelsRoutingTable
-from llama_stack.core.routing_tables.scoring_functions import ScoringFunctionsRoutingTable
 from llama_stack.core.routing_tables.shields import ShieldsRoutingTable
 from llama_stack.core.routing_tables.toolgroups import ToolGroupsRoutingTable
 from llama_stack_api import (
     URL,
     Api,
-    Dataset,
-    DatasetPurpose,
-    GetBenchmarkRequest,
     GetShieldRequest,
-    ListBenchmarksRequest,
     ListToolDefsResponse,
     ListToolsRequest,
     Model,
     ModelNotFoundError,
     ModelType,
-    NumberType,
-    RegisterBenchmarkRequest,
     RegisterShieldRequest,
     Shield,
     ToolDef,
     ToolGroup,
-    UnregisterBenchmarkRequest,
     UnregisterShieldRequest,
-    URIDataSource,
-)
-from llama_stack_api.datasets import (
-    RegisterDatasetRequest,
-    UnregisterDatasetRequest,
 )
 
 
@@ -103,42 +88,6 @@ async def unregister_shield(self, shield_id: str):
         return shield_id
 
 
-class DatasetsImpl(Impl):
-    def __init__(self):
-        super().__init__(Api.datasetio)
-
-    async def register_dataset(self, dataset: Dataset):
-        return dataset
-
-    async def unregister_dataset(self, dataset_id: str):
-        return dataset_id
-
-
-class ScoringFunctionsImpl(Impl):
-    def __init__(self):
-        super().__init__(Api.scoring)
-
-    async def list_scoring_functions(self):
-        return []
-
-    async def register_scoring_function(self, scoring_fn):
-        return scoring_fn
-
-    async def unregister_scoring_function(self, scoring_fn_id: str):
-        return scoring_fn_id
-
-
-class BenchmarksImpl(Impl):
-    def __init__(self):
-        super().__init__(Api.eval)
-
-    async def register_benchmark(self, benchmark):
-        return benchmark
-
-    async def unregister_benchmark(self, benchmark_id: str):
-        return benchmark_id
-
-
 class ToolGroupsImpl(Impl):
     def __init__(self):
         super().__init__(Api.tool_runtime)
@@ -264,83 +213,6 @@ async def test_shields_routing_table(cached_disk_dist_registry):
         await table.unregister_shield(UnregisterShieldRequest(identifier="non-existent"))
 
 
-async def test_datasets_routing_table(cached_disk_dist_registry):
-    table = DatasetsRoutingTable({"localfs": DatasetsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register multiple datasets and verify listing
-    await table.register_dataset(
-        RegisterDatasetRequest(
-            dataset_id="test-dataset",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(uri="test-uri"),
-        )
-    )
-    await table.register_dataset(
-        RegisterDatasetRequest(
-            dataset_id="test-dataset-2",
-            purpose=DatasetPurpose.eval_messages_answer,
-            source=URIDataSource(uri="test-uri-2"),
-        )
-    )
-    datasets = await table.list_datasets()
-
-    assert len(datasets.data) == 2
-    dataset_ids = {d.identifier for d in datasets.data}
-    assert "test-dataset" in dataset_ids
-    assert "test-dataset-2" in dataset_ids
-
-    await table.unregister_dataset(UnregisterDatasetRequest(dataset_id="test-dataset"))
-    await table.unregister_dataset(UnregisterDatasetRequest(dataset_id="test-dataset-2"))
-
-    datasets = await table.list_datasets()
-    assert len(datasets.data) == 0
-
-
-async def test_scoring_functions_routing_table(cached_disk_dist_registry):
-    table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register multiple scoring functions and verify listing
-    from llama_stack_api import (
-        ListScoringFunctionsRequest,
-        RegisterScoringFunctionRequest,
-        UnregisterScoringFunctionRequest,
-    )
-
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn-2",
-            provider_id="test_provider",
-            description="Another test scoring function",
-            return_type=NumberType(),
-        )
-    )
-    scoring_functions = await table.list_scoring_functions(ListScoringFunctionsRequest())
-
-    assert len(scoring_functions.data) == 2
-    scoring_fn_ids = {fn.identifier for fn in scoring_functions.data}
-    assert "test-scoring-fn" in scoring_fn_ids
-    assert "test-scoring-fn-2" in scoring_fn_ids
-
-    # Unregister scoring functions and verify listing
-    for i in range(len(scoring_functions.data)):
-        await table.unregister_scoring_function(
-            UnregisterScoringFunctionRequest(scoring_fn_id=scoring_functions.data[i].scoring_fn_id)
-        )
-
-    scoring_functions_list_after_deletion = await table.list_scoring_functions(ListScoringFunctionsRequest())
-    assert len(scoring_functions_list_after_deletion.data) == 0
-
-
 async def test_double_registration_models_positive(cached_disk_dist_registry):
     """Test that registering the same model twice with identical data succeeds."""
     table = ModelsRoutingTable({"test_provider": InferenceImpl()}, cached_disk_dist_registry, {})
@@ -373,68 +245,6 @@ async def test_double_registration_models_negative(cached_disk_dist_registry):
         )
 
 
-async def test_double_registration_scoring_functions_positive(cached_disk_dist_registry):
-    """Test that registering the same scoring function twice with identical data succeeds."""
-    from llama_stack_api import ListScoringFunctionsRequest, RegisterScoringFunctionRequest
-
-    table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register a scoring function
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-
-    # Register the exact same scoring function again - should succeed (idempotent)
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-
-    # Verify only one scoring function exists
-    scoring_functions = await table.list_scoring_functions(ListScoringFunctionsRequest())
-    assert len(scoring_functions.data) == 1
-    assert scoring_functions.data[0].identifier == "test-scoring-fn"
-
-
-async def test_double_registration_scoring_functions_negative(cached_disk_dist_registry):
-    """Test that registering the same scoring function with conflicting data fails."""
-    from llama_stack_api import RegisterScoringFunctionRequest
-
-    table = ScoringFunctionsRoutingTable({"test_provider": ScoringFunctionsImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register a scoring function
-    await table.register_scoring_function(
-        RegisterScoringFunctionRequest(
-            scoring_fn_id="test-scoring-fn",
-            provider_id="test_provider",
-            description="Test scoring function",
-            return_type=NumberType(),
-        )
-    )
-
-    # Try to register the same scoring function with conflicting description - should fail
-    with pytest.raises(ValueError, match="conflicting field values"):
-        await table.register_scoring_function(
-            RegisterScoringFunctionRequest(
-                scoring_fn_id="test-scoring-fn",
-                provider_id="test_provider",
-                description="Different description",
-                return_type=NumberType(),
-            )
-        )
-
-
 async def test_double_registration_different_providers(cached_disk_dist_registry):
     """Test that registering objects with same ID but different providers succeeds."""
     impl1 = InferenceImpl()
@@ -454,60 +264,6 @@ async def test_double_registration_different_providers(cached_disk_dist_registry
     assert "provider2/shared-model" in model_ids
 
 
-async def test_benchmarks_routing_table(cached_disk_dist_registry):
-    table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    # Register multiple benchmarks and verify listing
-    await table.register_benchmark(
-        RegisterBenchmarkRequest(
-            benchmark_id="test-benchmark",
-            dataset_id="test-dataset",
-            scoring_functions=["test-scoring-fn", "test-scoring-fn-2"],
-        )
-    )
-    benchmarks = await table.list_benchmarks(ListBenchmarksRequest())
-
-    assert len(benchmarks.data) == 1
-    benchmark_ids = {b.identifier for b in benchmarks.data}
-    assert "test-benchmark" in benchmark_ids
-
-    # Unregister the benchmark and verify removal
-    await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="test-benchmark"))
-    benchmarks_after = await table.list_benchmarks(ListBenchmarksRequest())
-    assert len(benchmarks_after.data) == 0
-
-    # Unregistering a non-existent benchmark should raise a clear error
-    with pytest.raises(ValueError, match="Benchmark 'dummy_benchmark' not found"):
-        await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="dummy_benchmark"))
-
-
-async def test_benchmarks_routing_table_stores_dataset_id(cached_disk_dist_registry):
-    """Test that register_benchmark correctly stores dataset_id on the benchmark."""
-    table = BenchmarksRoutingTable({"test_provider": BenchmarksImpl()}, cached_disk_dist_registry, {})
-    await table.initialize()
-
-    test_dataset_id = "my-evaluation-dataset"
-    test_scoring_functions = ["accuracy", "f1-score"]
-
-    await table.register_benchmark(
-        RegisterBenchmarkRequest(
-            benchmark_id="test-benchmark-with-dataset",
-            dataset_id=test_dataset_id,
-            scoring_functions=test_scoring_functions,
-        )
-    )
-
-    benchmark = await table.get_benchmark(GetBenchmarkRequest(benchmark_id="test-benchmark-with-dataset"))
-
-    assert benchmark is not None
-    assert benchmark.identifier == "test-benchmark-with-dataset"
-    assert benchmark.dataset_id == test_dataset_id
-    assert benchmark.scoring_functions == test_scoring_functions
-
-    await table.unregister_benchmark(UnregisterBenchmarkRequest(benchmark_id="test-benchmark-with-dataset"))
-
-
 async def test_tool_groups_routing_table(cached_disk_dist_registry):
     table = ToolGroupsRoutingTable({"test_provider": ToolGroupsImpl()}, cached_disk_dist_registry, {})
     await table.initialize()
diff --git a/tests/unit/providers/nvidia/test_eval.py b/tests/unit/providers/nvidia/test_eval.py
deleted file mode 100644
index 265bfc20ad..0000000000
--- a/tests/unit/providers/nvidia/test_eval.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import os
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from llama_stack.providers.remote.eval.nvidia.config import NVIDIAEvalConfig
-from llama_stack.providers.remote.eval.nvidia.eval import NVIDIAEvalImpl
-from llama_stack_api import (
-    Benchmark,
-    BenchmarkConfig,
-    EvaluateResponse,
-    Job,
-    JobStatus,
-    ModelCandidate,
-    ResourceType,
-    SamplingParams,
-    TopPSamplingStrategy,
-)
-from llama_stack_api.eval.models import (
-    JobCancelRequest,
-    JobResultRequest,
-    JobStatusRequest,
-    RunEvalRequest,
-)
-
-MOCK_DATASET_ID = "default/test-dataset"
-MOCK_BENCHMARK_ID = "test-benchmark"
-
-
-@pytest.fixture
-def nvidia_eval_setup():
-    """Set up the NVIDIA eval implementation with mocked dependencies."""
-    os.environ["NVIDIA_EVALUATOR_URL"] = "http://nemo.test"
-
-    # Create mock APIs
-    datasetio_api = MagicMock()
-    datasets_api = MagicMock()
-    scoring_api = MagicMock()
-    inference_api = MagicMock()
-    agents_api = MagicMock()
-
-    config = NVIDIAEvalConfig(
-        evaluator_url=os.environ["NVIDIA_EVALUATOR_URL"],
-    )
-
-    eval_impl = NVIDIAEvalImpl(
-        config=config,
-        datasetio_api=datasetio_api,
-        datasets_api=datasets_api,
-        scoring_api=scoring_api,
-        inference_api=inference_api,
-        agents_api=agents_api,
-    )
-
-    # Mock the HTTP request methods
-    with (
-        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_get") as mock_evaluator_get,
-        patch("llama_stack.providers.remote.eval.nvidia.eval.NVIDIAEvalImpl._evaluator_post") as mock_evaluator_post,
-    ):
-        yield {
-            "eval_impl": eval_impl,
-            "mock_evaluator_get": mock_evaluator_get,
-            "mock_evaluator_post": mock_evaluator_post,
-            "datasetio_api": datasetio_api,
-            "datasets_api": datasets_api,
-            "scoring_api": scoring_api,
-            "inference_api": inference_api,
-            "agents_api": agents_api,
-        }
-
-
-def _assert_request_body(mock_evaluator_post, expected_json):
-    """Helper method to verify request body in Evaluator POST request is correct"""
-    call_args = mock_evaluator_post.call_args
-    actual_json = call_args[0][1]
-
-    # Check that all expected keys contain the expected values in the actual JSON
-    for key, value in expected_json.items():
-        assert key in actual_json, f"Key '{key}' missing in actual JSON"
-
-        if isinstance(value, dict):
-            for nested_key, nested_value in value.items():
-                assert nested_key in actual_json[key], f"Nested key '{nested_key}' missing in actual JSON['{key}']"
-                assert actual_json[key][nested_key] == nested_value, f"Value mismatch for '{key}.{nested_key}'"
-        else:
-            assert actual_json[key] == value, f"Value mismatch for '{key}'"
-
-
-async def test_register_benchmark(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
-
-    eval_config = {
-        "type": "custom",
-        "params": {"parallelism": 8},
-        "tasks": {
-            "qa": {
-                "type": "completion",
-                "params": {"template": {"prompt": "{{prompt}}", "max_tokens": 200}},
-                "dataset": {"files_url": f"hf://datasets/{MOCK_DATASET_ID}/testing/testing.jsonl"},
-                "metrics": {"bleu": {"type": "bleu", "params": {"references": ["{{ideal_response}}"]}}},
-            }
-        },
-    }
-
-    benchmark = Benchmark(
-        provider_id="nvidia",
-        type=ResourceType.benchmark,
-        identifier=MOCK_BENCHMARK_ID,
-        dataset_id=MOCK_DATASET_ID,
-        scoring_functions=["basic::equality"],
-        metadata=eval_config,
-    )
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": MOCK_BENCHMARK_ID, "status": "created"}
-    mock_evaluator_post.return_value = mock_evaluator_response
-
-    # Register the benchmark
-    await eval_impl.register_benchmark(benchmark)
-
-    # Verify the Evaluator API was called correctly
-    mock_evaluator_post.assert_called_once()
-    _assert_request_body(
-        mock_evaluator_post, {"namespace": benchmark.provider_id, "name": benchmark.identifier, **eval_config}
-    )
-
-
-async def test_run_eval(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
-
-    benchmark_config = BenchmarkConfig(
-        eval_candidate=ModelCandidate(
-            type="model",
-            model="Llama3.1-8B-Instruct",
-            sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-        )
-    )
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": "job-123", "status": "created"}
-    mock_evaluator_post.return_value = mock_evaluator_response
-
-    # Run the Evaluation job
-    result = await eval_impl.run_eval(
-        request=RunEvalRequest(benchmark_id=MOCK_BENCHMARK_ID, benchmark_config=benchmark_config)
-    )
-
-    # Verify the Evaluator API was called correctly
-    mock_evaluator_post.assert_called_once()
-    _assert_request_body(
-        mock_evaluator_post,
-        {
-            "config": f"nvidia/{MOCK_BENCHMARK_ID}",
-            "target": {"type": "model", "model": "Llama3.1-8B-Instruct"},
-        },
-    )
-
-    # Verify the result
-    assert isinstance(result, Job)
-    assert result.job_id == "job-123"
-    assert result.status == JobStatus.in_progress
-
-
-async def test_job_status(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": "job-123", "status": "completed"}
-    mock_evaluator_get.return_value = mock_evaluator_response
-
-    # Get the Evaluation job
-    result = await eval_impl.job_status(request=JobStatusRequest(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-    # Verify the result
-    assert isinstance(result, Job)
-    assert result.job_id == "job-123"
-    assert result.status == JobStatus.completed
-
-    # Verify the API was called correctly
-    mock_evaluator_get.assert_called_once_with(f"/v1/evaluation/jobs/{result.job_id}")
-
-
-async def test_job_cancel(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_post = nvidia_eval_setup["mock_evaluator_post"]
-
-    # Mock Evaluator API response
-    mock_evaluator_response = {"id": "job-123", "status": "cancelled"}
-    mock_evaluator_post.return_value = mock_evaluator_response
-
-    # Cancel the Evaluation job
-    await eval_impl.job_cancel(request=JobCancelRequest(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-    # Verify the API was called correctly
-    mock_evaluator_post.assert_called_once_with("/v1/evaluation/jobs/job-123/cancel", {})
-
-
-async def test_job_result(nvidia_eval_setup):
-    eval_impl = nvidia_eval_setup["eval_impl"]
-    mock_evaluator_get = nvidia_eval_setup["mock_evaluator_get"]
-
-    # Mock Evaluator API responses
-    mock_job_status_response = {"id": "job-123", "status": "completed"}
-    mock_job_results_response = {
-        "id": "job-123",
-        "status": "completed",
-        "results": {MOCK_BENCHMARK_ID: {"score": 0.85, "details": {"accuracy": 0.85, "f1": 0.84}}},
-    }
-    mock_evaluator_get.side_effect = [
-        mock_job_status_response,  # First call to retrieve job
-        mock_job_results_response,  # Second call to retrieve job results
-    ]
-
-    # Get the Evaluation job results
-    result = await eval_impl.job_result(request=JobResultRequest(benchmark_id=MOCK_BENCHMARK_ID, job_id="job-123"))
-
-    # Verify the result
-    assert isinstance(result, EvaluateResponse)
-    assert MOCK_BENCHMARK_ID in result.scores
-    assert result.scores[MOCK_BENCHMARK_ID].aggregated_results["results"][MOCK_BENCHMARK_ID]["score"] == 0.85
-
-    # Verify the API was called correctly
-    assert mock_evaluator_get.call_count == 2
-    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123")
-    mock_evaluator_get.assert_any_call("/v1/evaluation/jobs/job-123/results")
diff --git a/tests/unit/test_eval_models.py b/tests/unit/test_eval_models.py
deleted file mode 100644
index 2b6bcd9419..0000000000
--- a/tests/unit/test_eval_models.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the terms described in the LICENSE file in
-# the root directory of this source tree.
-
-import pytest
-from pydantic import ValidationError
-
-from llama_stack_api.eval.models import (
-    BenchmarkConfig,
-    EvaluateResponse,
-    EvaluateRowsRequest,
-    ModelCandidate,
-    RunEvalRequest,
-)
-from llama_stack_api.inference import SamplingParams, TopPSamplingStrategy
-from llama_stack_api.scoring import ScoringResult
-
-
-def test_model_candidate_valid():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    assert mc.model == "test-model"
-    assert mc.type == "model"
-
-
-def test_benchmark_config_valid():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    bc = BenchmarkConfig(eval_candidate=mc, num_examples=5)
-    assert bc.num_examples == 5
-    assert bc.scoring_params == {}
-
-
-def test_evaluate_response_valid():
-    er = EvaluateResponse(
-        generations=[{"input": "test", "output": "result"}],
-        scores={
-            "accuracy": ScoringResult(
-                score_rows=[{"score": 0.9}],
-                aggregated_results={"average": 0.9},
-            )
-        },
-    )
-    assert len(er.generations) == 1
-    assert "accuracy" in er.scores
-
-
-def test_run_eval_request_valid():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    bc = BenchmarkConfig(eval_candidate=mc)
-    req = RunEvalRequest(benchmark_id="bench-123", benchmark_config=bc)
-    assert req.benchmark_id == "bench-123"
-
-
-def test_evaluate_rows_request_empty_arrays_fail():
-    mc = ModelCandidate(
-        model="test-model",
-        sampling_params=SamplingParams(max_tokens=100, strategy=TopPSamplingStrategy(temperature=0.7)),
-    )
-    bc = BenchmarkConfig(eval_candidate=mc)
-
-    with pytest.raises(ValidationError):
-        EvaluateRowsRequest(
-            benchmark_id="bench-123",
-            input_rows=[],
-            scoring_functions=["func1"],
-            benchmark_config=bc,
-        )
-
-    with pytest.raises(ValidationError):
-        EvaluateRowsRequest(
-            benchmark_id="bench-123",
-            input_rows=[{"test": "data"}],
-            scoring_functions=[],
-            benchmark_config=bc,
-        )

From 3e5c7d1d6a0a70a81a96e876f559dd50d63fd05b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?S=C3=A9bastien=20Han?= <seb@redhat.com>
Date: Wed, 25 Mar 2026 18:07:26 +0100
Subject: [PATCH 2/2] =?UTF-8?q?fix:=20resolve=20agents=E2=86=92responses?=
 =?UTF-8?q?=20rename=20conflict?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sébastien Han <seb@redhat.com>
---
 src/llama_stack_api/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama_stack_api/__init__.py b/src/llama_stack_api/__init__.py
index 0be5fffe78..6376c19044 100644
--- a/src/llama_stack_api/__init__.py
+++ b/src/llama_stack_api/__init__.py
@@ -51,7 +51,7 @@
 )
 
 # Import all public API symbols
-from .agents import (
+from .responses import (
     Agents,
     CreateResponseRequest,
     DeleteResponseRequest,