Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions src/safetunebed/whitebox/evals/lm_eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""LM-Eval Integration for SafeTuneBed.

This work is based on EleutherAI's LM-Eval Harness, and can be cited as follows:
@misc{eval-harness,
author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
title = {The Language Model Evaluation Harness},
month = 07,
year = 2024,
publisher = {Zenodo},
version = {v0.4.3},
doi = {10.5281/zenodo.12608602},
url = {https://zenodo.org/records/12608602}
}
"""

# Re-export the evaluation entry points so callers can import them directly
# from `safetunebed.whitebox.evals.lm_eval` instead of the inner module.
from safetunebed.whitebox.evals.lm_eval.lm_eval import (
    LMRunEvaluationConfig,
    LMRunEvaluation,
)

# Explicit public API of this package.
__all__ = [
    "LMRunEvaluationConfig",
    "LMRunEvaluation",
]
133 changes: 133 additions & 0 deletions src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""
This work is inspired by LM-Eval's implementation by EleutherAI, their respective citation is below:
@misc{eval-harness,
author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
title = {The Language Model Evaluation Harness},
month = 07,
year = 2024,
publisher = {Zenodo},
version = {v0.4.3},
doi = {10.5281/zenodo.12608602},
url = {https://zenodo.org/records/12608602}
}
"""
# Original "driver" code, kept below as a working reference. The active
# (uncommented) code in this file is the same logic reformatted for
# integration within SafeTuneBed.
"""
import subprocess
from typing import List, Dict, Union

# no support for GGUF yet since I don't think we'll need it, however can very easily add
def run_lm_eval(
tasks: Union[str, List[str]],
pretrained_path: str,
tokenizer_path: str,
device: str = "cuda:0",
batch_size: int = 8,
) -> Dict[str, Dict[str, str]]:
# plural tasks
if isinstance(tasks, str):
tasks = [tasks]

results = {}

for task in tasks:
print(f"\n=== Running task: {task} ===")
cmd = [
"lm_eval",
"--model", "hf",
"--model_args", f"pretrained={pretrained_path},tokenizer={tokenizer_path}",
"--tasks", task,
"--device", device,
"--batch_size", str(batch_size),
]

result = subprocess.run(cmd, capture_output=True, text=True)

results[task] = {
"stdout": result.stdout,
"stderr": result.stderr,
"returncode": result.returncode,
}

return results

if __name__ == "__main__":
#def test_runner_real_execution():
results = run_lm_eval(
tasks=["hellaswag"],
pretrained_path="/model-weights/Qwen2.5-3B",
tokenizer_path="/model-weights/Qwen2.5-3B",
)

# "echo" prints all arguments back as a single string
output = results["hellaswag"]["stdout"].strip()
print(output)
"""

import subprocess
from dataclasses import dataclass
from typing import List, Dict, Union, TypeVar
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: as of Python 3.9, can use dict and list in type hints rather than importing Dict and List


import polars as pl
from safetunebed.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig

S = TypeVar(name="S", bound="LMRunEvaluationConfig")


@dataclass
class LMRunEvaluationConfig(WhiteBoxEvaluationConfig):
    """Configuration for running EleutherAI's `lm_eval` CLI as a SafeTuneBed evaluation."""

    # Single task name or list of lm_eval task names (e.g. "hellaswag").
    tasks: Union[str, List[str]]
    # Passed to lm_eval as `pretrained=` inside --model_args.
    pretrained_path: str
    # Passed to lm_eval as `tokenizer=` inside --model_args.
    tokenizer_path: str
    # Forwarded to lm_eval's --device flag.
    device: str = "cuda:0"
    # Forwarded to lm_eval's --batch_size flag.
    batch_size: int = 8
    model_checkpoint: str = "dummy"  # required by base; presumably unused here — TODO confirm
    out_dir: str = "./tmp_eval"  # required by base
    max_generation_length: int = 0  # required by base; presumably unused here — TODO confirm

class LMRunEvaluation(WhiteBoxEvaluation[S]):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you don't need to define S = TypeVar(...) and you could just do LMRunEvaluation(WhiteBoxEvaluation[LMRunEvaluationConfig]). Although it looks like we do this TypeVar thing throughout the code so maybe cleaning this up should be a separate PR

"""
Inference-only evaluation wrapper around 'lm_eval' CLI using subprocess.
Returns DataFrame response=stdout.
"""

name = "LM_EVAL_RUNNER"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the other evals (e.g., strong reject) we set the name, score, and optimization direction like:

from safetunebed.whitebox.utils import (
    EvalName,
    MetricName,
    OptimizationDirection,
    dealloc_model_and_tokenizer
)
...
    name: EvalName = EvalName.StrongReject
    objective: MetricName = MetricName.STRONGREJECT_SCORE
    attacker_direction: OptimizationDirection = OptimizationDirection.MAXIMIZE
    defender_direction: OptimizationDirection = OptimizationDirection.MINIMIZE

Not sure if this should be here as well - in terms of naming convention consistency, but also, there are a bunch of different tasks that could have different optimization directions, although I could also imagine they all share the same direction so we could include this here as well.


def _run_single_task(self, task: str) -> Dict[str, str]:
cmd = [
"lm_eval",
"--model", "hf",
"--model_args", f"pretrained={self.eval_config.pretrained_path},tokenizer={self.eval_config.tokenizer_path}",
"--output_path", "results",
"--tasks", task,
"--device", self.eval_config.device,
"--batch_size", str(self.eval_config.batch_size),
]

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there's probably some way in this function we could get the output json and parse the outputs to a json

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Screen Shot 2025-10-21 at 11 51 50 PM

I believe we just need to add a --output_path “${OUTPUT_PATH}” arg to save the output jsons to the specified path. They already handle saving and caching results.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pubmedqa_llama3_qlora_2025-10-10T15-53-15.177293.json

Example of an output json^ generated by lm_eval. We can parse this further, if required - @sdhossain

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Collaborator

@sdhossain sdhossain Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In our format we'd need a bit more information - i.e. what each datapoint was - the metric score that it has, etc.

some for datapoint 1:

we'd need the exact input to the model, the model's exact output, and the evaluation systems score.

I've attached an example csv we have for another evaluation (we store it as a .parquet - see StrongREJECT as an example)

evaluator_scores.csv

Right now - (from what I saw in the json, I mainly saw configs, and an aggregate score). This would perhaps make it difficult to do further analysis, i.e. if we wanted to check responses using an LLM-Judge, etc.

result = subprocess.run(cmd, capture_output=True, text=True)

return {
"task": task,
"stdout": result.stdout.strip(),
"stderr": result.stderr.strip(),
"returncode": str(result.returncode),
}

def compute_inferences(self) -> pl.DataFrame:
tasks = self.eval_config.tasks
if isinstance(tasks, str):
tasks = [tasks]

logs = []
for task in tasks:
print(f"\n=== Running task: {task} ===")
logs.append(self._run_single_task(task))

return pl.DataFrame(logs) #Inference schema doesn't really make sense until now for me.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this comment, or elaborate on what is confusing?


    def compute_scores(self, inferences) -> None:
        """No-op: aggregate scoring is delegated to the lm_eval CLI run itself.

        NOTE(review): per-datapoint input/output/score records are not yet
        parsed out of lm_eval's output JSON into the project's usual
        per-sample format — TODO confirm whether that parsing belongs here.
        """
        return None

    def compute_results(self, scores) -> None:
        """No-op: result aggregation is likewise delegated to the lm_eval CLI."""
        return None
28 changes: 28 additions & 0 deletions tests/evals/test_lm_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Sanity check for LMRunEvaluation (print-only)."""

import tempfile
from dotenv import load_dotenv

from safetunebed.whitebox.evals.lm_eval.lm_eval import (
LMRunEvaluation,
LMRunEvaluationConfig,
)

if __name__ == "__main__":
load_dotenv()

with tempfile.TemporaryDirectory() as tmpdirname:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Do we need the tempfile directory if we are testing print-only? Probably makes sense to keep if we incorporate the logging and test that too though, which seems like a good idea.

eval_config = LMRunEvaluationConfig(
tasks=["hellaswag"],
pretrained_path="/model-weights/Qwen2.5-3B",
tokenizer_path="/model-weights/Qwen2.5-3B",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering because I am curious: when would the pretrained_path and tokenizer_path be different?

batch_size=1,
)

evaluator = LMRunEvaluation(eval_config=eval_config)

print("=== Running LMRunEvaluation ===")
inferences = evaluator.compute_inferences()

print("\n=== RAW INFERENCES OUTPUT ===")
print(inferences)