-
Notifications
You must be signed in to change notification settings - Fork 1
eval: added lm-eval #36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,24 @@ | ||
| """LM-Eval Integration for SafeTuneBed. | ||
|
|
||
| This work is based on EleutherAI's LM-Eval Harness, and can be cited as follows: | ||
| @misc{eval-harness, | ||
| author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, | ||
| title = {The Language Model Evaluation Harness}, | ||
| month = 07, | ||
| year = 2024, | ||
| publisher = {Zenodo}, | ||
| version = {v0.4.3}, | ||
| doi = {10.5281/zenodo.12608602}, | ||
| url = {https://zenodo.org/records/12608602} | ||
| } | ||
| """ | ||
|
|
||
| from safetunebed.whitebox.evals.lm_eval.lm_eval import ( | ||
| LMRunEvaluationConfig, | ||
| LMRunEvaluation, | ||
| ) | ||
|
|
||
| __all__ = [ | ||
| "LMRunEvaluationConfig", | ||
| "LMRunEvaluation", | ||
| ] |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,133 @@ | ||
| """ | ||
| This work is inspired by LM-Eval's implementation by EleutherAI, their respective citation is below: | ||
| @misc{eval-harness, | ||
| author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy}, | ||
| title = {The Language Model Evaluation Harness}, | ||
| month = 07, | ||
| year = 2024, | ||
| publisher = {Zenodo}, | ||
| version = {v0.4.3}, | ||
| doi = {10.5281/zenodo.12608602}, | ||
| url = {https://zenodo.org/records/12608602} | ||
| } | ||
| """ | ||
| #Original "driver" code, which works. The uncommented code in this file is solely formatted for integration within SafeTuneBed. | ||
| """ | ||
| import subprocess | ||
| from typing import List, Dict, Union | ||
|
|
||
| # no support for GGUF yet since I don't think we'll need it, however can very easily add | ||
| def run_lm_eval( | ||
| tasks: Union[str, List[str]], | ||
| pretrained_path: str, | ||
| tokenizer_path: str, | ||
| device: str = "cuda:0", | ||
| batch_size: int = 8, | ||
| ) -> Dict[str, Dict[str, str]]: | ||
| # plural tasks | ||
| if isinstance(tasks, str): | ||
| tasks = [tasks] | ||
|
|
||
| results = {} | ||
|
|
||
| for task in tasks: | ||
| print(f"\n=== Running task: {task} ===") | ||
| cmd = [ | ||
| "lm_eval", | ||
| "--model", "hf", | ||
| "--model_args", f"pretrained={pretrained_path},tokenizer={tokenizer_path}", | ||
| "--tasks", task, | ||
| "--device", device, | ||
| "--batch_size", str(batch_size), | ||
| ] | ||
|
|
||
| result = subprocess.run(cmd, capture_output=True, text=True) | ||
|
|
||
| results[task] = { | ||
| "stdout": result.stdout, | ||
| "stderr": result.stderr, | ||
| "returncode": result.returncode, | ||
| } | ||
|
|
||
| return results | ||
|
|
||
| if __name__ == "__main__": | ||
| #def test_runner_real_execution(): | ||
| results = run_lm_eval( | ||
| tasks=["hellaswag"], | ||
| pretrained_path="/model-weights/Qwen2.5-3B", | ||
| tokenizer_path="/model-weights/Qwen2.5-3B", | ||
| ) | ||
|
|
||
| # "echo" prints all arguments back as a single string | ||
| output = results["hellaswag"]["stdout"].strip() | ||
| print(output) | ||
| """ | ||
|
|
||
| import subprocess | ||
| from dataclasses import dataclass | ||
| from typing import List, Dict, Union, TypeVar | ||
|
|
||
| import polars as pl | ||
| from safetunebed.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig | ||
|
|
||
| S = TypeVar(name="S", bound="LMRunEvaluationConfig") | ||
|
|
||
|
|
||
@dataclass
class LMRunEvaluationConfig(WhiteBoxEvaluationConfig):
    """Configuration for running EleutherAI's ``lm_eval`` CLI via subprocess.

    Attributes:
        tasks: A single lm_eval task name or a list of task names
            (e.g. ``"hellaswag"`` or ``["hellaswag", "pubmedqa"]``).
        pretrained_path: Filesystem path passed as ``pretrained=`` in
            ``--model_args``.
        tokenizer_path: Filesystem path passed as ``tokenizer=`` in
            ``--model_args``.
        device: Device string forwarded to ``lm_eval --device``.
        batch_size: Per-task batch size forwarded to ``lm_eval --batch_size``.
        model_checkpoint: Placeholder required by the base config; unused by
            this evaluation (lm_eval loads from ``pretrained_path`` instead).
        out_dir: Placeholder required by the base config; unused here.
        max_generation_length: Placeholder required by the base config;
            unused here.
    """

    tasks: Union[str, List[str]]
    pretrained_path: str
    tokenizer_path: str
    device: str = "cuda:0"
    batch_size: int = 8
    model_checkpoint: str = "dummy"  # required by base; not consumed by lm_eval runner
    out_dir: str = "./tmp_eval"  # required by base; not consumed by lm_eval runner
    max_generation_length: int = 0  # required by base; not consumed by lm_eval runner
|
|
||
|
|
||
| class LMRunEvaluation(WhiteBoxEvaluation[S]): | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think you don't need to define |
||
| """ | ||
| Inference-only evaluation wrapper around 'lm_eval' CLI using subprocess. | ||
| Returns DataFrame response=stdout. | ||
| """ | ||
|
|
||
| name = "LM_EVAL_RUNNER" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For the other evals (e.g., strong reject) we set the name, score, and optimization direction like: Not sure if this should be here as well - in terms of naming convention consistency, but also, there are a bunch of different tasks that could have different optimization directions, although I could also imagine they all share the same direction so we could include this here as well. |
||
|
|
||
| def _run_single_task(self, task: str) -> Dict[str, str]: | ||
| cmd = [ | ||
| "lm_eval", | ||
| "--model", "hf", | ||
| "--model_args", f"pretrained={self.eval_config.pretrained_path},tokenizer={self.eval_config.tokenizer_path}", | ||
| "--output_path", "results", | ||
| "--tasks", task, | ||
| "--device", self.eval_config.device, | ||
| "--batch_size", str(self.eval_config.batch_size), | ||
| ] | ||
|
|
||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there's probably some way in this function to capture lm_eval's output JSON and parse the results into a structured format
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pubmedqa_llama3_qlora_2025-10-10T15-53-15.177293.json Example of an output json^ generated by lm_eval. We can parse this further, if required - @sdhossain
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In our format we'd need a bit more information - i.e. what each datapoint was - the metric score that it has, etc. some for datapoint 1: we'd need the exact input to the model, the model's exact output, and the evaluation systems score. I've attached an example csv we have for another evaluation (we store it as a .parquet - see StrongREJECT as an example) Right now - (from what I saw in the json, I mainly saw configs, and an aggregate score). This would perhaps make it difficult to do further analysis, i.e. if we wanted to check responses using an LLM-Judge, etc. |
||
| result = subprocess.run(cmd, capture_output=True, text=True) | ||
|
|
||
| return { | ||
| "task": task, | ||
| "stdout": result.stdout.strip(), | ||
| "stderr": result.stderr.strip(), | ||
| "returncode": str(result.returncode), | ||
| } | ||
|
|
||
| def compute_inferences(self) -> pl.DataFrame: | ||
| tasks = self.eval_config.tasks | ||
| if isinstance(tasks, str): | ||
| tasks = [tasks] | ||
|
|
||
| logs = [] | ||
| for task in tasks: | ||
| print(f"\n=== Running task: {task} ===") | ||
| logs.append(self._run_single_task(task)) | ||
|
|
||
| return pl.DataFrame(logs) #Inference schema doesn't really make sense until now for me. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove comment? or elaborate on comment about what's confuising? |
||
|
|
||
| def compute_scores(self, inferences): | ||
| return None | ||
|
|
||
| def compute_results(self, scores): | ||
| return None | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| """Sanity check for LMRunEvaluation (print-only).""" | ||
|
|
||
| import tempfile | ||
| from dotenv import load_dotenv | ||
|
|
||
| from safetunebed.whitebox.evals.lm_eval.lm_eval import ( | ||
| LMRunEvaluation, | ||
| LMRunEvaluationConfig, | ||
| ) | ||
|
|
||
| if __name__ == "__main__": | ||
| load_dotenv() | ||
|
|
||
| with tempfile.TemporaryDirectory() as tmpdirname: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: Do we need the tempfile directory if we are testing print-only? Probably makes sense to keep if we incorporate the logging and test that too though, which seems like a good idea. |
||
| eval_config = LMRunEvaluationConfig( | ||
| tasks=["hellaswag"], | ||
| pretrained_path="/model-weights/Qwen2.5-3B", | ||
| tokenizer_path="/model-weights/Qwen2.5-3B", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just wondering because I am curious: when would the |
||
| batch_size=1, | ||
| ) | ||
|
|
||
| evaluator = LMRunEvaluation(eval_config=eval_config) | ||
|
|
||
| print("=== Running LMRunEvaluation ===") | ||
| inferences = evaluator.compute_inferences() | ||
|
|
||
| print("\n=== RAW INFERENCES OUTPUT ===") | ||
| print(inferences) | ||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: as of Python 3.9, can use dict and list in type hints rather than importing Dict and List