From 2c0dbe1b1b71c8a246be132cb40287bf8319e8c5 Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Wed, 25 Feb 2026 00:50:45 +0100 Subject: [PATCH] Add Lab Bench eval --- src/tamperbench/whitebox/evals/__init__.py | 6 + .../whitebox/evals/lab_bench/__init__.py | 21 ++ .../whitebox/evals/lab_bench/lab_bench.py | 289 +++++++++++++++++ src/tamperbench/whitebox/utils/names.py | 10 + tests/evals/test_lab_bench.py | 298 ++++++++++++++++++ 5 files changed, 624 insertions(+) create mode 100644 src/tamperbench/whitebox/evals/lab_bench/__init__.py create mode 100644 src/tamperbench/whitebox/evals/lab_bench/lab_bench.py create mode 100644 tests/evals/test_lab_bench.py diff --git a/src/tamperbench/whitebox/evals/__init__.py b/src/tamperbench/whitebox/evals/__init__.py index 45f867ce..e259d905 100644 --- a/src/tamperbench/whitebox/evals/__init__.py +++ b/src/tamperbench/whitebox/evals/__init__.py @@ -12,6 +12,10 @@ IFEvalEvaluation, IFEvalEvaluationConfig, ) +from tamperbench.whitebox.evals.lab_bench.lab_bench import ( + LabBenchEvaluation, + LabBenchEvaluationConfig, +) from tamperbench.whitebox.evals.mbpp.mbpp import ( MBPPEvaluation, MBPPEvaluationConfig, @@ -59,6 +63,8 @@ "IFEvalEvaluation", "IFEvalEvaluationConfig", "JailbreakBenchEvaluation", + "LabBenchEvaluation", + "LabBenchEvaluationConfig", "MBPPEvaluation", "MBPPEvaluationConfig", "MMLUProEvaluationConfig", diff --git a/src/tamperbench/whitebox/evals/lab_bench/__init__.py b/src/tamperbench/whitebox/evals/lab_bench/__init__.py new file mode 100644 index 00000000..5b218cf8 --- /dev/null +++ b/src/tamperbench/whitebox/evals/lab_bench/__init__.py @@ -0,0 +1,21 @@ +"""LAB-Bench (Language Agent Biology Benchmark) Evaluation. + +LAB-Bench measures biology research capabilities across text-based subtasks: +LitQA2, DbQA, SuppQA, ProtocolQA, SeqQA, and CloningScenarios. 
+ +Reference: + - Paper: https://arxiv.org/abs/2407.10362 + - Dataset: https://huggingface.co/datasets/futurehouse/lab-bench +""" + +from tamperbench.whitebox.evals.lab_bench.lab_bench import ( + LabBenchEvaluation, + LabBenchEvaluationConfig, + LabBenchSubtask, +) + +__all__ = [ + "LabBenchEvaluation", + "LabBenchEvaluationConfig", + "LabBenchSubtask", +] diff --git a/src/tamperbench/whitebox/evals/lab_bench/lab_bench.py b/src/tamperbench/whitebox/evals/lab_bench/lab_bench.py new file mode 100644 index 00000000..8b7e2307 --- /dev/null +++ b/src/tamperbench/whitebox/evals/lab_bench/lab_bench.py @@ -0,0 +1,289 @@ +"""LAB-Bench (Language Agent Biology Benchmark) Evaluation. + +Measures biology research capabilities via multiple-choice questions across +text-based subtasks: literature reasoning, database navigation, protocol +troubleshooting, sequence manipulation, and molecular cloning. + +Reference: + - Paper: "LAB-Bench: Measuring Capabilities of Language Models for Biology Research" + https://arxiv.org/abs/2407.10362 + - Dataset: https://huggingface.co/datasets/futurehouse/lab-bench +""" + +import random +from collections import defaultdict +from dataclasses import dataclass, field +from enum import Enum +from functools import cached_property +from itertools import chain +from typing import Any, cast + +import polars as pl +import torch +from datasets import load_dataset +from pandera.typing.polars import DataFrame +from typing_extensions import override + +from tamperbench.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig, load_vllm_model_and_tokenizer +from tamperbench.whitebox.evals.output_schema import ( + EvaluationSchema, + InferenceSchema, + ScoreSchema, +) +from tamperbench.whitebox.evals.reference import ReferenceScore, ReferenceScores +from tamperbench.whitebox.evals.registry import register_evaluation +from tamperbench.whitebox.evals.utils import get_multiple_choice_answers +from tamperbench.whitebox.utils import EvalName, 
MetricName, OptimizationDirection

DATASET_ID = "futurehouse/lab-bench"

# Reference: Measured empirically with this implementation.
# The LAB-Bench paper (Laurent et al., 2024, Table 2) reports results for
# GPT-4, Claude 3 Opus, Gemini 1.5 Pro, and Llama-3-70B-Instruct, but not
# for 8B-class models. These scores were measured on compute.safe.ai (vLLM v0.12,
# 1x GPU, logprob-based MCQ with per-choice-count grouping).
LAB_BENCH_REFERENCE_SCORES = ReferenceScores(
    scores=[
        ReferenceScore(
            model_id="meta-llama/Llama-3.1-8B-Instruct",
            metrics={
                str(MetricName.LAB_BENCH_ACCURACY): 0.3541,
                str(MetricName.LAB_BENCH_LITQA2_ACCURACY): 0.3367,
                str(MetricName.LAB_BENCH_DBQA_ACCURACY): 0.3154,
                str(MetricName.LAB_BENCH_SUPPQA_ACCURACY): 0.2927,
                str(MetricName.LAB_BENCH_PROTOCOLQA_ACCURACY): 0.3796,
                str(MetricName.LAB_BENCH_SEQQA_ACCURACY): 0.3967,
                str(MetricName.LAB_BENCH_CLONING_ACCURACY): 0.3636,
            },
            source="Empirical measurement (SafeTuneBed, Feb 2026)",
            tolerance=0.05,  # 5% for MCQ accuracy (wider than WMDP due to empirical-only reference)
            notes="vLLM v0.12, logprob-based MCQ, per-choice-count grouping",
        ),
    ]
)


class LabBenchSubtask(str, Enum):
    """LAB-Bench subtask configs (text-only)."""

    LITQA2 = "LitQA2"
    DBQA = "DbQA"
    SUPPQA = "SuppQA"
    PROTOCOLQA = "ProtocolQA"
    SEQQA = "SeqQA"
    CLONING = "CloningScenarios"


SUBTASK_METRIC: dict[LabBenchSubtask, MetricName] = {
    LabBenchSubtask.LITQA2: MetricName.LAB_BENCH_LITQA2_ACCURACY,
    LabBenchSubtask.DBQA: MetricName.LAB_BENCH_DBQA_ACCURACY,
    LabBenchSubtask.SUPPQA: MetricName.LAB_BENCH_SUPPQA_ACCURACY,
    LabBenchSubtask.PROTOCOLQA: MetricName.LAB_BENCH_PROTOCOLQA_ACCURACY,
    LabBenchSubtask.SEQQA: MetricName.LAB_BENCH_SEQQA_ACCURACY,
    LabBenchSubtask.CLONING: MetricName.LAB_BENCH_CLONING_ACCURACY,
}


@dataclass(frozen=True)
class LabBenchQuestion:
    """Single LAB-Bench question with metadata."""

    prompt: str
    correct_answer: str
    subtask: 
LabBenchSubtask + num_choices: int + + +@dataclass +class LabBenchEvaluationConfig(WhiteBoxEvaluationConfig): + """Configuration for LAB-Bench evaluation.""" + + subtasks: list[LabBenchSubtask] = field( + default_factory=lambda: list(LabBenchSubtask), + ) + + +def build_choices(ideal: str, distractors: list[str], seed: str) -> tuple[list[str], int]: + """Combine ideal answer with distractors and shuffle deterministically. + + Returns (shuffled_choices, index_of_correct_answer). + """ + indexed = list(enumerate([ideal, *distractors])) + rng = random.Random(seed) + rng.shuffle(indexed) + choices = [text for _, text in indexed] + correct_idx = next(i for i, (orig_i, _) in enumerate(indexed) if orig_i == 0) + return choices, correct_idx + + +def format_question(question_text: str, choices: list[str]) -> str: + """Format a LAB-Bench question as a multiple-choice prompt.""" + letters = [chr(ord("A") + i) for i in range(len(choices))] + choices_text = "\n".join(f"{letter}. {choice}" for letter, choice in zip(letters, choices, strict=True)) + return ( + f"The following are multiple choice questions (with answers) about biology research.\n\n" + f"{question_text}\n" + f"{choices_text}\n" + f"Answer:" + ) + + +@register_evaluation(EvalName.LAB_BENCH) +class LabBenchEvaluation(WhiteBoxEvaluation[LabBenchEvaluationConfig]): + """LAB-Bench Evaluation using multiple-choice questions. + + Measures model capability on biology research tasks. + Higher accuracy = better capability. + + For defenders: Higher accuracy is better (MAXIMIZE). + For attackers: Lower accuracy is better (MINIMIZE). + + Direction rationale: LAB-Bench is a **benign capability** eval, not a safety eval. + Although it covers biology, its subtasks measure general research skills (literature + comprehension, database querying, protocol understanding, sequence manipulation) rather + than dual-use dangerous knowledge. 
WMDP-Bio specifically targets hazardous biosecurity + knowledge (e.g., synthesizing pathogens) and is treated as a safety eval. Here, the + defender wants the model to *retain* biology research competence after safety training, + so defender=MAXIMIZE. The attacker tries to *degrade* this competence, so + attacker=MINIMIZE. + """ + + name: EvalName = EvalName.LAB_BENCH + objective: MetricName = MetricName.LAB_BENCH_ACCURACY + attacker_direction: OptimizationDirection = OptimizationDirection.MINIMIZE + defender_direction: OptimizationDirection = OptimizationDirection.MAXIMIZE + reference_scores: ReferenceScores = LAB_BENCH_REFERENCE_SCORES + + def __init__(self, eval_config: LabBenchEvaluationConfig) -> None: + """Initialize LAB-Bench evaluation.""" + super().__init__(eval_config) + + @cached_property + def questions(self) -> list[LabBenchQuestion]: + """Load all LAB-Bench questions (cached on first access).""" + return list(chain.from_iterable(self._load_subtask_questions(subtask) for subtask in self.eval_config.subtasks)) + + def _load_subtask_questions(self, subtask: LabBenchSubtask) -> list[LabBenchQuestion]: + """Load questions for a single subtask.""" + # LAB-Bench has only a "train" split (no test/val split in the HF dataset) + dataset = load_dataset(DATASET_ID, subtask.value, split="train") + + questions = [] + for raw_item in dataset: + item = cast(dict[str, Any], raw_item) + assert "id" in item, f"LAB-Bench item missing 'id' field in {subtask.value}" + seed = str(item["id"]) + choices, correct_idx = build_choices(item["ideal"], item["distractors"], seed) + answer_letter = chr(ord("A") + correct_idx) + + questions.append( + LabBenchQuestion( + prompt=format_question(item["question"], choices), + correct_answer=answer_letter, + subtask=subtask, + num_choices=len(choices), + ) + ) + return questions + + @override + def compute_inferences(self) -> DataFrame[InferenceSchema]: + """Run model on LAB-Bench multiple-choice questions.""" + model, _ = 
load_vllm_model_and_tokenizer(self.eval_config) + prompts = [q.prompt for q in self.questions] + + # Group questions by choice count to avoid phantom-letter bias. + # If all questions are passed with the max superset of letters (e.g., A-G), + # questions with fewer choices (e.g., 4) could have the model "pick" a + # non-existent option (E, F, G) that happens to have higher logprob. + groups: dict[int, list[int]] = defaultdict(list) + for i, q in enumerate(self.questions): + groups[q.num_choices].append(i) + + answers: list[str] = [""] * len(self.questions) + for num_choices, indices in groups.items(): + choice_letters = [chr(ord("A") + i) for i in range(num_choices)] + group_prompts = [prompts[i] for i in indices] + group_answers = get_multiple_choice_answers(group_prompts, model, choices=choice_letters) + for idx, answer in zip(indices, group_answers, strict=True): + answers[idx] = answer + + assert all(a != "" for a in answers), "Some questions were not assigned answers" + + del model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: prompts, + InferenceSchema.response: answers, + } + ) + ) + + @override + def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[ScoreSchema]: + """Compute accuracy scores (0 or 1) for each question.""" + # NOTE: Uses full prompt string as join key between questions and inferences. + # This is fragile if any transformation (serialization, whitespace normalization) + # modifies prompts between compute_inferences and compute_scores. Currently safe + # because both methods run in the same process without serialization round-trips. 
+ prompt_to_answer = {q.prompt: q.correct_answer for q in self.questions} + prompts = inferences[InferenceSchema.prompt].to_list() + responses = inferences[InferenceSchema.response].to_list() + + scores = [ + float(response == prompt_to_answer[prompt]) for prompt, response in zip(prompts, responses, strict=True) + ] + + return ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: prompts, + ScoreSchema.response: responses, + ScoreSchema.score: scores, + } + ) + ) + + @override + def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[EvaluationSchema]: + """Compute accuracy metrics per subtask and overall.""" + prompt_to_subtask = {q.prompt: q.subtask for q in self.questions} + scores_list = scores["score"].to_list() + prompts = scores[ScoreSchema.prompt].to_list() + + subtask_scores: dict[LabBenchSubtask, list[float]] = {s: [] for s in self.eval_config.subtasks} + for prompt, score in zip(prompts, scores_list, strict=True): + subtask = prompt_to_subtask.get(prompt) + if subtask is not None: + subtask_scores[subtask].append(score) + + metrics = [ + ( + str(SUBTASK_METRIC[subtask]), + sum(subtask_scores[subtask]) / len(subtask_scores[subtask]), + ) + for subtask in self.eval_config.subtasks + if subtask_scores[subtask] + ] + # Overall accuracy is micro-averaged (weighted by subtask size). + # SeqQA has ~600 questions vs CloningScenarios ~33, so subtask sizes + # vary by ~18x. Per-subtask metrics above allow macro-style analysis. 
+ metrics.append( + ( + str(MetricName.LAB_BENCH_ACCURACY), + sum(scores_list) / len(scores_list) if scores_list else 0.0, + ) + ) + + return EvaluationSchema.validate( + pl.DataFrame( + { + EvaluationSchema.metric_name: [m[0] for m in metrics], + EvaluationSchema.metric_value: [m[1] for m in metrics], + } + ) + ) diff --git a/src/tamperbench/whitebox/utils/names.py b/src/tamperbench/whitebox/utils/names.py index d07aa6ec..dfdd5c62 100644 --- a/src/tamperbench/whitebox/utils/names.py +++ b/src/tamperbench/whitebox/utils/names.py @@ -48,6 +48,7 @@ class EvalName(StrEnum): SAFETY_GAP = "safety_gap" XSTEST = "xstest" WMDP = "wmdp" + LAB_BENCH = "lab_bench" class MetricName(StrEnum): @@ -82,6 +83,15 @@ class MetricName(StrEnum): WMDP_CYBER_ACCURACY = "wmdp_cyber_accuracy" WMDP_CHEM_ACCURACY = "wmdp_chem_accuracy" + # Lab-Bench metrics + LAB_BENCH_ACCURACY = "lab_bench_accuracy" + LAB_BENCH_LITQA2_ACCURACY = "lab_bench_litqa2_accuracy" + LAB_BENCH_DBQA_ACCURACY = "lab_bench_dbqa_accuracy" + LAB_BENCH_SUPPQA_ACCURACY = "lab_bench_suppqa_accuracy" + LAB_BENCH_PROTOCOLQA_ACCURACY = "lab_bench_protocolqa_accuracy" + LAB_BENCH_SEQQA_ACCURACY = "lab_bench_seqqa_accuracy" + LAB_BENCH_CLONING_ACCURACY = "lab_bench_cloning_accuracy" + # XSTest metrics XSTEST_SAFE_REFUSAL_RATE = "xstest_safe_refusal_rate" XSTEST_SAFE_FULL_REFUSAL_RATE = "xstest_safe_full_refusal_rate" diff --git a/tests/evals/test_lab_bench.py b/tests/evals/test_lab_bench.py new file mode 100644 index 00000000..e1aa0f7a --- /dev/null +++ b/tests/evals/test_lab_bench.py @@ -0,0 +1,298 @@ +"""Tests for LAB-Bench evaluation.""" + +import polars as pl +import pytest + +from tamperbench.whitebox.evals.lab_bench.lab_bench import ( + LabBenchEvaluation, + LabBenchEvaluationConfig, + LabBenchQuestion, + LabBenchSubtask, + build_choices, + format_question, +) +from tamperbench.whitebox.evals.output_schema import ( + EvaluationSchema, + InferenceSchema, + ScoreSchema, +) +from tamperbench.whitebox.utils import MetricName 
+from tamperbench.whitebox.utils.models.config import ModelConfig + + +def test_build_choices_includes_all_options() -> None: + """All distractors and the ideal answer appear in the output.""" + ideal = "correct answer" + distractors = ["wrong1", "wrong2", "wrong3"] + choices, correct_idx = build_choices(ideal, distractors, seed="test") + assert set(choices) == {ideal, *distractors} + assert choices[correct_idx] == ideal + + +def test_build_choices_deterministic() -> None: + """Same seed produces the same shuffle.""" + ideal = "correct" + distractors = ["a", "b", "c"] + result1 = build_choices(ideal, distractors, seed="fixed") + result2 = build_choices(ideal, distractors, seed="fixed") + assert result1 == result2 + + +def test_build_choices_duplicate_ideal_and_distractor() -> None: + """Correctly tracks correct answer even when ideal text matches a distractor.""" + ideal = "same" + distractors = ["same", "other"] + choices, correct_idx = build_choices(ideal, distractors, seed="test") + assert len(choices) == 3 + assert choices[correct_idx] == ideal + + +def test_build_choices_different_seeds_can_differ() -> None: + """Different seeds can produce different orderings.""" + ideal = "correct" + distractors = ["a", "b", "c", "d", "e", "f"] + results = {tuple(build_choices(ideal, distractors, seed=str(i))[0]) for i in range(20)} + # With 7 items and 20 seeds, extremely unlikely all are the same ordering + assert len(results) > 1 + + +def test_format_question_structure() -> None: + """Formatted prompt has expected structure.""" + prompt = format_question("What is DNA?", ["Protein", "Nucleic acid", "Sugar"]) + assert prompt.startswith("The following are multiple choice questions (with answers) about biology research.") + assert "What is DNA?" in prompt + assert "A. Protein" in prompt + assert "B. Nucleic acid" in prompt + assert "C. 
Sugar" in prompt + assert prompt.endswith("Answer:") + + +@pytest.mark.parametrize( + "num_choices", + [2, 4, 7], +) +def test_format_question_variable_choice_count(num_choices: int) -> None: + """Works with different numbers of choices.""" + choices = [f"option_{i}" for i in range(num_choices)] + prompt = format_question("Q?", choices) + for i in range(num_choices): + letter = chr(ord("A") + i) + assert f"{letter}. option_{i}" in prompt + + +def test_subtask_enum_values_match_dataset_configs() -> None: + """Enum values match the HuggingFace dataset config names.""" + expected = {"LitQA2", "DbQA", "SuppQA", "ProtocolQA", "SeqQA", "CloningScenarios"} + actual = {s.value for s in LabBenchSubtask} + assert actual == expected + + +# --- Helper to construct a LabBenchEvaluation with mock questions --- + + +def make_evaluation( + questions: list[LabBenchQuestion], subtasks: list[LabBenchSubtask] | None = None +) -> LabBenchEvaluation: + """Create a LabBenchEvaluation with pre-loaded questions (no HF download).""" + if subtasks is None: + subtasks = list({q.subtask for q in questions}) + + config = LabBenchEvaluationConfig( + model_checkpoint="test-model", + out_dir="/tmp/lab-bench-test", + model_config=ModelConfig( + user_prefix="", + assistant_prefix="", + end_turn="", + max_generation_length=1, + inference_batch_size=32, + ), + subtasks=subtasks, + ) + evaluation = LabBenchEvaluation(config) + # Inject questions directly into instance dict, bypassing HF dataset download. + # This populates the cached_property cache so .questions returns our mock data. 
+ vars(evaluation)["questions"] = questions + return evaluation + + +def make_question(prompt: str, correct_answer: str, subtask: LabBenchSubtask, num_choices: int = 4) -> LabBenchQuestion: + """Create a LabBenchQuestion for testing.""" + return LabBenchQuestion( + prompt=prompt, + correct_answer=correct_answer, + subtask=subtask, + num_choices=num_choices, + ) + + +# --- Tests for compute_scores --- + + +def test_compute_scores_all_correct() -> None: + """All correct answers produce scores of 1.0.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.DBQA), + make_question("Q3?", "C", LabBenchSubtask.SEQQA), + ] + evaluation = make_evaluation(questions) + + inferences = InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: ["Q1?", "Q2?", "Q3?"], + InferenceSchema.response: ["A", "B", "C"], + } + ) + ) + + scores_df = evaluation.compute_scores(inferences) + scores = scores_df["score"].to_list() + assert scores == [1.0, 1.0, 1.0] + + +def test_compute_scores_all_wrong() -> None: + """All wrong answers produce scores of 0.0.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.DBQA), + ] + evaluation = make_evaluation(questions) + + inferences = InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: ["Q1?", "Q2?"], + InferenceSchema.response: ["C", "D"], + } + ) + ) + + scores_df = evaluation.compute_scores(inferences) + scores = scores_df["score"].to_list() + assert scores == [0.0, 0.0] + + +def test_compute_scores_mixed() -> None: + """Mix of correct and wrong answers.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.LITQA2), + make_question("Q3?", "C", LabBenchSubtask.LITQA2), + ] + evaluation = make_evaluation(questions) + + inferences = InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: ["Q1?", "Q2?", "Q3?"], + 
InferenceSchema.response: ["A", "D", "C"], + } + ) + ) + + scores_df = evaluation.compute_scores(inferences) + scores = scores_df["score"].to_list() + assert scores == [1.0, 0.0, 1.0] + + +# --- Tests for compute_results --- + + +def test_compute_results_single_subtask() -> None: + """Accuracy computed for a single subtask and overall.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.LITQA2), + ] + evaluation = make_evaluation(questions, subtasks=[LabBenchSubtask.LITQA2]) + + scores_df = ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: ["Q1?", "Q2?"], + ScoreSchema.response: ["A", "D"], + ScoreSchema.score: [1.0, 0.0], + } + ) + ) + + results_df = evaluation.compute_results(scores_df) + metrics = dict( + zip( + results_df[EvaluationSchema.metric_name].to_list(), + results_df["metric_value"].to_list(), + strict=True, + ) + ) + + assert metrics[str(MetricName.LAB_BENCH_LITQA2_ACCURACY)] == pytest.approx(0.5) + assert metrics[str(MetricName.LAB_BENCH_ACCURACY)] == pytest.approx(0.5) + + +def test_compute_results_multiple_subtasks() -> None: + """Per-subtask and overall accuracy computed correctly across subtasks.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.LITQA2), + make_question("Q3?", "C", LabBenchSubtask.DBQA), + make_question("Q4?", "D", LabBenchSubtask.DBQA), + make_question("Q5?", "A", LabBenchSubtask.DBQA), + ] + evaluation = make_evaluation(questions, subtasks=[LabBenchSubtask.LITQA2, LabBenchSubtask.DBQA]) + + # LitQA2: 1/2 correct, DbQA: 2/3 correct, overall: 3/5 + scores_df = ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: ["Q1?", "Q2?", "Q3?", "Q4?", "Q5?"], + ScoreSchema.response: ["A", "X", "C", "D", "B"], + ScoreSchema.score: [1.0, 0.0, 1.0, 1.0, 0.0], + } + ) + ) + + results_df = evaluation.compute_results(scores_df) + metrics = dict( + zip( + 
results_df[EvaluationSchema.metric_name].to_list(), + results_df["metric_value"].to_list(), + strict=True, + ) + ) + + assert metrics[str(MetricName.LAB_BENCH_LITQA2_ACCURACY)] == pytest.approx(0.5) + assert metrics[str(MetricName.LAB_BENCH_DBQA_ACCURACY)] == pytest.approx(2 / 3) + assert metrics[str(MetricName.LAB_BENCH_ACCURACY)] == pytest.approx(3 / 5) + + +def test_compute_results_perfect_score() -> None: + """All correct produces 1.0 across the board.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.SEQQA), + make_question("Q2?", "B", LabBenchSubtask.SEQQA), + ] + evaluation = make_evaluation(questions, subtasks=[LabBenchSubtask.SEQQA]) + + scores_df = ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: ["Q1?", "Q2?"], + ScoreSchema.response: ["A", "B"], + ScoreSchema.score: [1.0, 1.0], + } + ) + ) + + results_df = evaluation.compute_results(scores_df) + metrics = dict( + zip( + results_df[EvaluationSchema.metric_name].to_list(), + results_df["metric_value"].to_list(), + strict=True, + ) + ) + + assert metrics[str(MetricName.LAB_BENCH_SEQQA_ACCURACY)] == pytest.approx(1.0) + assert metrics[str(MetricName.LAB_BENCH_ACCURACY)] == pytest.approx(1.0)