From 2c0dbe1b1b71c8a246be132cb40287bf8319e8c5 Mon Sep 17 00:00:00 2001 From: Daniel O'Connell Date: Wed, 25 Feb 2026 00:50:45 +0100 Subject: [PATCH] Add Lab Bench eval --- src/tamperbench/whitebox/evals/__init__.py | 6 + .../whitebox/evals/lab_bench/__init__.py | 21 ++ .../whitebox/evals/lab_bench/lab_bench.py | 289 +++++++++++++++++ src/tamperbench/whitebox/utils/names.py | 10 + tests/evals/test_lab_bench.py | 298 ++++++++++++++++++ 5 files changed, 624 insertions(+) create mode 100644 src/tamperbench/whitebox/evals/lab_bench/__init__.py create mode 100644 src/tamperbench/whitebox/evals/lab_bench/lab_bench.py create mode 100644 tests/evals/test_lab_bench.py diff --git a/src/tamperbench/whitebox/evals/__init__.py b/src/tamperbench/whitebox/evals/__init__.py index 45f867ce..e259d905 100644 --- a/src/tamperbench/whitebox/evals/__init__.py +++ b/src/tamperbench/whitebox/evals/__init__.py @@ -12,6 +12,10 @@ IFEvalEvaluation, IFEvalEvaluationConfig, ) +from tamperbench.whitebox.evals.lab_bench.lab_bench import ( + LabBenchEvaluation, + LabBenchEvaluationConfig, +) from tamperbench.whitebox.evals.mbpp.mbpp import ( MBPPEvaluation, MBPPEvaluationConfig, @@ -59,6 +63,8 @@ "IFEvalEvaluation", "IFEvalEvaluationConfig", "JailbreakBenchEvaluation", + "LabBenchEvaluation", + "LabBenchEvaluationConfig", "MBPPEvaluation", "MBPPEvaluationConfig", "MMLUProEvaluationConfig", diff --git a/src/tamperbench/whitebox/evals/lab_bench/__init__.py b/src/tamperbench/whitebox/evals/lab_bench/__init__.py new file mode 100644 index 00000000..5b218cf8 --- /dev/null +++ b/src/tamperbench/whitebox/evals/lab_bench/__init__.py @@ -0,0 +1,21 @@ +"""LAB-Bench (Language Agent Biology Benchmark) Evaluation. + +LAB-Bench measures biology research capabilities across text-based subtasks: +LitQA2, DbQA, SuppQA, ProtocolQA, SeqQA, and CloningScenarios. 
+ +Reference: + - Paper: https://arxiv.org/abs/2407.10362 + - Dataset: https://huggingface.co/datasets/futurehouse/lab-bench +""" + +from tamperbench.whitebox.evals.lab_bench.lab_bench import ( + LabBenchEvaluation, + LabBenchEvaluationConfig, + LabBenchSubtask, +) + +__all__ = [ + "LabBenchEvaluation", + "LabBenchEvaluationConfig", + "LabBenchSubtask", +] diff --git a/src/tamperbench/whitebox/evals/lab_bench/lab_bench.py b/src/tamperbench/whitebox/evals/lab_bench/lab_bench.py new file mode 100644 index 00000000..8b7e2307 --- /dev/null +++ b/src/tamperbench/whitebox/evals/lab_bench/lab_bench.py @@ -0,0 +1,289 @@ +"""LAB-Bench (Language Agent Biology Benchmark) Evaluation. + +Measures biology research capabilities via multiple-choice questions across +text-based subtasks: literature reasoning, database navigation, protocol +troubleshooting, sequence manipulation, and molecular cloning. + +Reference: + - Paper: "LAB-Bench: Measuring Capabilities of Language Models for Biology Research" + https://arxiv.org/abs/2407.10362 + - Dataset: https://huggingface.co/datasets/futurehouse/lab-bench +""" + +import random +from collections import defaultdict +from dataclasses import dataclass, field +from enum import Enum +from functools import cached_property +from itertools import chain +from typing import Any, cast + +import polars as pl +import torch +from datasets import load_dataset +from pandera.typing.polars import DataFrame +from typing_extensions import override + +from tamperbench.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig, load_vllm_model_and_tokenizer +from tamperbench.whitebox.evals.output_schema import ( + EvaluationSchema, + InferenceSchema, + ScoreSchema, +) +from tamperbench.whitebox.evals.reference import ReferenceScore, ReferenceScores +from tamperbench.whitebox.evals.registry import register_evaluation +from tamperbench.whitebox.evals.utils import get_multiple_choice_answers +from tamperbench.whitebox.utils import EvalName, 
MetricName, OptimizationDirection

DATASET_ID = "futurehouse/lab-bench"

# Reference: Measured empirically with this implementation.
# The LAB-Bench paper (Laurent et al., 2024, Table 2) reports results for
# GPT-4, Claude 3 Opus, Gemini 1.5 Pro, and Llama-3-70B-Instruct, but not
# for 8B-class models. These scores were measured on compute.safe.ai (vLLM v0.12,
# 1x GPU, logprob-based MCQ with per-choice-count grouping).
LAB_BENCH_REFERENCE_SCORES = ReferenceScores(
    scores=[
        ReferenceScore(
            model_id="meta-llama/Llama-3.1-8B-Instruct",
            metrics={
                str(MetricName.LAB_BENCH_ACCURACY): 0.3541,
                str(MetricName.LAB_BENCH_LITQA2_ACCURACY): 0.3367,
                str(MetricName.LAB_BENCH_DBQA_ACCURACY): 0.3154,
                str(MetricName.LAB_BENCH_SUPPQA_ACCURACY): 0.2927,
                str(MetricName.LAB_BENCH_PROTOCOLQA_ACCURACY): 0.3796,
                str(MetricName.LAB_BENCH_SEQQA_ACCURACY): 0.3967,
                str(MetricName.LAB_BENCH_CLONING_ACCURACY): 0.3636,
            },
            source="Empirical measurement (SafeTuneBed, Feb 2026)",
            tolerance=0.05,  # 5% for MCQ accuracy (wider than WMDP due to empirical-only reference)
            notes="vLLM v0.12, logprob-based MCQ, per-choice-count grouping",
        ),
    ]
)


class LabBenchSubtask(str, Enum):
    """LAB-Bench subtask configs (text-only)."""

    LITQA2 = "LitQA2"
    DBQA = "DbQA"
    SUPPQA = "SuppQA"
    PROTOCOLQA = "ProtocolQA"
    SEQQA = "SeqQA"
    CLONING = "CloningScenarios"


SUBTASK_METRIC: dict[LabBenchSubtask, MetricName] = {
    LabBenchSubtask.LITQA2: MetricName.LAB_BENCH_LITQA2_ACCURACY,
    LabBenchSubtask.DBQA: MetricName.LAB_BENCH_DBQA_ACCURACY,
    LabBenchSubtask.SUPPQA: MetricName.LAB_BENCH_SUPPQA_ACCURACY,
    LabBenchSubtask.PROTOCOLQA: MetricName.LAB_BENCH_PROTOCOLQA_ACCURACY,
    LabBenchSubtask.SEQQA: MetricName.LAB_BENCH_SEQQA_ACCURACY,
    LabBenchSubtask.CLONING: MetricName.LAB_BENCH_CLONING_ACCURACY,
}


@dataclass(frozen=True)
class LabBenchQuestion:
    """Single LAB-Bench question with metadata."""

    prompt: str
    correct_answer: str
    subtask: 
LabBenchSubtask + num_choices: int + + +@dataclass +class LabBenchEvaluationConfig(WhiteBoxEvaluationConfig): + """Configuration for LAB-Bench evaluation.""" + + subtasks: list[LabBenchSubtask] = field( + default_factory=lambda: list(LabBenchSubtask), + ) + + +def build_choices(ideal: str, distractors: list[str], seed: str) -> tuple[list[str], int]: + """Combine ideal answer with distractors and shuffle deterministically. + + Returns (shuffled_choices, index_of_correct_answer). + """ + indexed = list(enumerate([ideal, *distractors])) + rng = random.Random(seed) + rng.shuffle(indexed) + choices = [text for _, text in indexed] + correct_idx = next(i for i, (orig_i, _) in enumerate(indexed) if orig_i == 0) + return choices, correct_idx + + +def format_question(question_text: str, choices: list[str]) -> str: + """Format a LAB-Bench question as a multiple-choice prompt.""" + letters = [chr(ord("A") + i) for i in range(len(choices))] + choices_text = "\n".join(f"{letter}. {choice}" for letter, choice in zip(letters, choices, strict=True)) + return ( + f"The following are multiple choice questions (with answers) about biology research.\n\n" + f"{question_text}\n" + f"{choices_text}\n" + f"Answer:" + ) + + +@register_evaluation(EvalName.LAB_BENCH) +class LabBenchEvaluation(WhiteBoxEvaluation[LabBenchEvaluationConfig]): + """LAB-Bench Evaluation using multiple-choice questions. + + Measures model capability on biology research tasks. + Higher accuracy = better capability. + + For defenders: Higher accuracy is better (MAXIMIZE). + For attackers: Lower accuracy is better (MINIMIZE). + + Direction rationale: LAB-Bench is a **benign capability** eval, not a safety eval. + Although it covers biology, its subtasks measure general research skills (literature + comprehension, database querying, protocol understanding, sequence manipulation) rather + than dual-use dangerous knowledge. 
WMDP-Bio specifically targets hazardous biosecurity + knowledge (e.g., synthesizing pathogens) and is treated as a safety eval. Here, the + defender wants the model to *retain* biology research competence after safety training, + so defender=MAXIMIZE. The attacker tries to *degrade* this competence, so + attacker=MINIMIZE. + """ + + name: EvalName = EvalName.LAB_BENCH + objective: MetricName = MetricName.LAB_BENCH_ACCURACY + attacker_direction: OptimizationDirection = OptimizationDirection.MINIMIZE + defender_direction: OptimizationDirection = OptimizationDirection.MAXIMIZE + reference_scores: ReferenceScores = LAB_BENCH_REFERENCE_SCORES + + def __init__(self, eval_config: LabBenchEvaluationConfig) -> None: + """Initialize LAB-Bench evaluation.""" + super().__init__(eval_config) + + @cached_property + def questions(self) -> list[LabBenchQuestion]: + """Load all LAB-Bench questions (cached on first access).""" + return list(chain.from_iterable(self._load_subtask_questions(subtask) for subtask in self.eval_config.subtasks)) + + def _load_subtask_questions(self, subtask: LabBenchSubtask) -> list[LabBenchQuestion]: + """Load questions for a single subtask.""" + # LAB-Bench has only a "train" split (no test/val split in the HF dataset) + dataset = load_dataset(DATASET_ID, subtask.value, split="train") + + questions = [] + for raw_item in dataset: + item = cast(dict[str, Any], raw_item) + assert "id" in item, f"LAB-Bench item missing 'id' field in {subtask.value}" + seed = str(item["id"]) + choices, correct_idx = build_choices(item["ideal"], item["distractors"], seed) + answer_letter = chr(ord("A") + correct_idx) + + questions.append( + LabBenchQuestion( + prompt=format_question(item["question"], choices), + correct_answer=answer_letter, + subtask=subtask, + num_choices=len(choices), + ) + ) + return questions + + @override + def compute_inferences(self) -> DataFrame[InferenceSchema]: + """Run model on LAB-Bench multiple-choice questions.""" + model, _ = 
load_vllm_model_and_tokenizer(self.eval_config) + prompts = [q.prompt for q in self.questions] + + # Group questions by choice count to avoid phantom-letter bias. + # If all questions are passed with the max superset of letters (e.g., A-G), + # questions with fewer choices (e.g., 4) could have the model "pick" a + # non-existent option (E, F, G) that happens to have higher logprob. + groups: dict[int, list[int]] = defaultdict(list) + for i, q in enumerate(self.questions): + groups[q.num_choices].append(i) + + answers: list[str] = [""] * len(self.questions) + for num_choices, indices in groups.items(): + choice_letters = [chr(ord("A") + i) for i in range(num_choices)] + group_prompts = [prompts[i] for i in indices] + group_answers = get_multiple_choice_answers(group_prompts, model, choices=choice_letters) + for idx, answer in zip(indices, group_answers, strict=True): + answers[idx] = answer + + assert all(a != "" for a in answers), "Some questions were not assigned answers" + + del model + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + return InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: prompts, + InferenceSchema.response: answers, + } + ) + ) + + @override + def compute_scores(self, inferences: DataFrame[InferenceSchema]) -> DataFrame[ScoreSchema]: + """Compute accuracy scores (0 or 1) for each question.""" + # NOTE: Uses full prompt string as join key between questions and inferences. + # This is fragile if any transformation (serialization, whitespace normalization) + # modifies prompts between compute_inferences and compute_scores. Currently safe + # because both methods run in the same process without serialization round-trips. 
+ prompt_to_answer = {q.prompt: q.correct_answer for q in self.questions} + prompts = inferences[InferenceSchema.prompt].to_list() + responses = inferences[InferenceSchema.response].to_list() + + scores = [ + float(response == prompt_to_answer[prompt]) for prompt, response in zip(prompts, responses, strict=True) + ] + + return ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: prompts, + ScoreSchema.response: responses, + ScoreSchema.score: scores, + } + ) + ) + + @override + def compute_results(self, scores: DataFrame[ScoreSchema]) -> DataFrame[EvaluationSchema]: + """Compute accuracy metrics per subtask and overall.""" + prompt_to_subtask = {q.prompt: q.subtask for q in self.questions} + scores_list = scores["score"].to_list() + prompts = scores[ScoreSchema.prompt].to_list() + + subtask_scores: dict[LabBenchSubtask, list[float]] = {s: [] for s in self.eval_config.subtasks} + for prompt, score in zip(prompts, scores_list, strict=True): + subtask = prompt_to_subtask.get(prompt) + if subtask is not None: + subtask_scores[subtask].append(score) + + metrics = [ + ( + str(SUBTASK_METRIC[subtask]), + sum(subtask_scores[subtask]) / len(subtask_scores[subtask]), + ) + for subtask in self.eval_config.subtasks + if subtask_scores[subtask] + ] + # Overall accuracy is micro-averaged (weighted by subtask size). + # SeqQA has ~600 questions vs CloningScenarios ~33, so subtask sizes + # vary by ~18x. Per-subtask metrics above allow macro-style analysis. 
+ metrics.append( + ( + str(MetricName.LAB_BENCH_ACCURACY), + sum(scores_list) / len(scores_list) if scores_list else 0.0, + ) + ) + + return EvaluationSchema.validate( + pl.DataFrame( + { + EvaluationSchema.metric_name: [m[0] for m in metrics], + EvaluationSchema.metric_value: [m[1] for m in metrics], + } + ) + ) diff --git a/src/tamperbench/whitebox/utils/names.py b/src/tamperbench/whitebox/utils/names.py index d07aa6ec..dfdd5c62 100644 --- a/src/tamperbench/whitebox/utils/names.py +++ b/src/tamperbench/whitebox/utils/names.py @@ -48,6 +48,7 @@ class EvalName(StrEnum): SAFETY_GAP = "safety_gap" XSTEST = "xstest" WMDP = "wmdp" + LAB_BENCH = "lab_bench" class MetricName(StrEnum): @@ -82,6 +83,15 @@ class MetricName(StrEnum): WMDP_CYBER_ACCURACY = "wmdp_cyber_accuracy" WMDP_CHEM_ACCURACY = "wmdp_chem_accuracy" + # Lab-Bench metrics + LAB_BENCH_ACCURACY = "lab_bench_accuracy" + LAB_BENCH_LITQA2_ACCURACY = "lab_bench_litqa2_accuracy" + LAB_BENCH_DBQA_ACCURACY = "lab_bench_dbqa_accuracy" + LAB_BENCH_SUPPQA_ACCURACY = "lab_bench_suppqa_accuracy" + LAB_BENCH_PROTOCOLQA_ACCURACY = "lab_bench_protocolqa_accuracy" + LAB_BENCH_SEQQA_ACCURACY = "lab_bench_seqqa_accuracy" + LAB_BENCH_CLONING_ACCURACY = "lab_bench_cloning_accuracy" + # XSTest metrics XSTEST_SAFE_REFUSAL_RATE = "xstest_safe_refusal_rate" XSTEST_SAFE_FULL_REFUSAL_RATE = "xstest_safe_full_refusal_rate" diff --git a/tests/evals/test_lab_bench.py b/tests/evals/test_lab_bench.py new file mode 100644 index 00000000..e1aa0f7a --- /dev/null +++ b/tests/evals/test_lab_bench.py @@ -0,0 +1,298 @@ +"""Tests for LAB-Bench evaluation.""" + +import polars as pl +import pytest + +from tamperbench.whitebox.evals.lab_bench.lab_bench import ( + LabBenchEvaluation, + LabBenchEvaluationConfig, + LabBenchQuestion, + LabBenchSubtask, + build_choices, + format_question, +) +from tamperbench.whitebox.evals.output_schema import ( + EvaluationSchema, + InferenceSchema, + ScoreSchema, +) +from tamperbench.whitebox.utils import MetricName 
+from tamperbench.whitebox.utils.models.config import ModelConfig + + +def test_build_choices_includes_all_options() -> None: + """All distractors and the ideal answer appear in the output.""" + ideal = "correct answer" + distractors = ["wrong1", "wrong2", "wrong3"] + choices, correct_idx = build_choices(ideal, distractors, seed="test") + assert set(choices) == {ideal, *distractors} + assert choices[correct_idx] == ideal + + +def test_build_choices_deterministic() -> None: + """Same seed produces the same shuffle.""" + ideal = "correct" + distractors = ["a", "b", "c"] + result1 = build_choices(ideal, distractors, seed="fixed") + result2 = build_choices(ideal, distractors, seed="fixed") + assert result1 == result2 + + +def test_build_choices_duplicate_ideal_and_distractor() -> None: + """Correctly tracks correct answer even when ideal text matches a distractor.""" + ideal = "same" + distractors = ["same", "other"] + choices, correct_idx = build_choices(ideal, distractors, seed="test") + assert len(choices) == 3 + assert choices[correct_idx] == ideal + + +def test_build_choices_different_seeds_can_differ() -> None: + """Different seeds can produce different orderings.""" + ideal = "correct" + distractors = ["a", "b", "c", "d", "e", "f"] + results = {tuple(build_choices(ideal, distractors, seed=str(i))[0]) for i in range(20)} + # With 7 items and 20 seeds, extremely unlikely all are the same ordering + assert len(results) > 1 + + +def test_format_question_structure() -> None: + """Formatted prompt has expected structure.""" + prompt = format_question("What is DNA?", ["Protein", "Nucleic acid", "Sugar"]) + assert prompt.startswith("The following are multiple choice questions (with answers) about biology research.") + assert "What is DNA?" in prompt + assert "A. Protein" in prompt + assert "B. Nucleic acid" in prompt + assert "C. 
Sugar" in prompt + assert prompt.endswith("Answer:") + + +@pytest.mark.parametrize( + "num_choices", + [2, 4, 7], +) +def test_format_question_variable_choice_count(num_choices: int) -> None: + """Works with different numbers of choices.""" + choices = [f"option_{i}" for i in range(num_choices)] + prompt = format_question("Q?", choices) + for i in range(num_choices): + letter = chr(ord("A") + i) + assert f"{letter}. option_{i}" in prompt + + +def test_subtask_enum_values_match_dataset_configs() -> None: + """Enum values match the HuggingFace dataset config names.""" + expected = {"LitQA2", "DbQA", "SuppQA", "ProtocolQA", "SeqQA", "CloningScenarios"} + actual = {s.value for s in LabBenchSubtask} + assert actual == expected + + +# --- Helper to construct a LabBenchEvaluation with mock questions --- + + +def make_evaluation( + questions: list[LabBenchQuestion], subtasks: list[LabBenchSubtask] | None = None +) -> LabBenchEvaluation: + """Create a LabBenchEvaluation with pre-loaded questions (no HF download).""" + if subtasks is None: + subtasks = list({q.subtask for q in questions}) + + config = LabBenchEvaluationConfig( + model_checkpoint="test-model", + out_dir="/tmp/lab-bench-test", + model_config=ModelConfig( + user_prefix="", + assistant_prefix="", + end_turn="", + max_generation_length=1, + inference_batch_size=32, + ), + subtasks=subtasks, + ) + evaluation = LabBenchEvaluation(config) + # Inject questions directly into instance dict, bypassing HF dataset download. + # This populates the cached_property cache so .questions returns our mock data. 
+ vars(evaluation)["questions"] = questions + return evaluation + + +def make_question(prompt: str, correct_answer: str, subtask: LabBenchSubtask, num_choices: int = 4) -> LabBenchQuestion: + """Create a LabBenchQuestion for testing.""" + return LabBenchQuestion( + prompt=prompt, + correct_answer=correct_answer, + subtask=subtask, + num_choices=num_choices, + ) + + +# --- Tests for compute_scores --- + + +def test_compute_scores_all_correct() -> None: + """All correct answers produce scores of 1.0.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.DBQA), + make_question("Q3?", "C", LabBenchSubtask.SEQQA), + ] + evaluation = make_evaluation(questions) + + inferences = InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: ["Q1?", "Q2?", "Q3?"], + InferenceSchema.response: ["A", "B", "C"], + } + ) + ) + + scores_df = evaluation.compute_scores(inferences) + scores = scores_df["score"].to_list() + assert scores == [1.0, 1.0, 1.0] + + +def test_compute_scores_all_wrong() -> None: + """All wrong answers produce scores of 0.0.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.DBQA), + ] + evaluation = make_evaluation(questions) + + inferences = InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: ["Q1?", "Q2?"], + InferenceSchema.response: ["C", "D"], + } + ) + ) + + scores_df = evaluation.compute_scores(inferences) + scores = scores_df["score"].to_list() + assert scores == [0.0, 0.0] + + +def test_compute_scores_mixed() -> None: + """Mix of correct and wrong answers.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.LITQA2), + make_question("Q3?", "C", LabBenchSubtask.LITQA2), + ] + evaluation = make_evaluation(questions) + + inferences = InferenceSchema.validate( + pl.DataFrame( + { + InferenceSchema.prompt: ["Q1?", "Q2?", "Q3?"], + 
InferenceSchema.response: ["A", "D", "C"], + } + ) + ) + + scores_df = evaluation.compute_scores(inferences) + scores = scores_df["score"].to_list() + assert scores == [1.0, 0.0, 1.0] + + +# --- Tests for compute_results --- + + +def test_compute_results_single_subtask() -> None: + """Accuracy computed for a single subtask and overall.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.LITQA2), + ] + evaluation = make_evaluation(questions, subtasks=[LabBenchSubtask.LITQA2]) + + scores_df = ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: ["Q1?", "Q2?"], + ScoreSchema.response: ["A", "D"], + ScoreSchema.score: [1.0, 0.0], + } + ) + ) + + results_df = evaluation.compute_results(scores_df) + metrics = dict( + zip( + results_df[EvaluationSchema.metric_name].to_list(), + results_df["metric_value"].to_list(), + strict=True, + ) + ) + + assert metrics[str(MetricName.LAB_BENCH_LITQA2_ACCURACY)] == pytest.approx(0.5) + assert metrics[str(MetricName.LAB_BENCH_ACCURACY)] == pytest.approx(0.5) + + +def test_compute_results_multiple_subtasks() -> None: + """Per-subtask and overall accuracy computed correctly across subtasks.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.LITQA2), + make_question("Q2?", "B", LabBenchSubtask.LITQA2), + make_question("Q3?", "C", LabBenchSubtask.DBQA), + make_question("Q4?", "D", LabBenchSubtask.DBQA), + make_question("Q5?", "A", LabBenchSubtask.DBQA), + ] + evaluation = make_evaluation(questions, subtasks=[LabBenchSubtask.LITQA2, LabBenchSubtask.DBQA]) + + # LitQA2: 1/2 correct, DbQA: 2/3 correct, overall: 3/5 + scores_df = ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: ["Q1?", "Q2?", "Q3?", "Q4?", "Q5?"], + ScoreSchema.response: ["A", "X", "C", "D", "B"], + ScoreSchema.score: [1.0, 0.0, 1.0, 1.0, 0.0], + } + ) + ) + + results_df = evaluation.compute_results(scores_df) + metrics = dict( + zip( + 
results_df[EvaluationSchema.metric_name].to_list(), + results_df["metric_value"].to_list(), + strict=True, + ) + ) + + assert metrics[str(MetricName.LAB_BENCH_LITQA2_ACCURACY)] == pytest.approx(0.5) + assert metrics[str(MetricName.LAB_BENCH_DBQA_ACCURACY)] == pytest.approx(2 / 3) + assert metrics[str(MetricName.LAB_BENCH_ACCURACY)] == pytest.approx(3 / 5) + + +def test_compute_results_perfect_score() -> None: + """All correct produces 1.0 across the board.""" + questions = [ + make_question("Q1?", "A", LabBenchSubtask.SEQQA), + make_question("Q2?", "B", LabBenchSubtask.SEQQA), + ] + evaluation = make_evaluation(questions, subtasks=[LabBenchSubtask.SEQQA]) + + scores_df = ScoreSchema.validate( + pl.DataFrame( + { + ScoreSchema.prompt: ["Q1?", "Q2?"], + ScoreSchema.response: ["A", "B"], + ScoreSchema.score: [1.0, 1.0], + } + ) + ) + + results_df = evaluation.compute_results(scores_df) + metrics = dict( + zip( + results_df[EvaluationSchema.metric_name].to_list(), + results_df["metric_value"].to_list(), + strict=True, + ) + ) + + assert metrics[str(MetricName.LAB_BENCH_SEQQA_ACCURACY)] == pytest.approx(1.0) + assert metrics[str(MetricName.LAB_BENCH_ACCURACY)] == pytest.approx(1.0)