Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions src/safetunebed/whitebox/evals/lm_eval/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""LM-Eval Integration for SafeTuneBed.

This work is based on EleutherAI's LM-Eval Harness, and can be cited as follows:
@misc{eval-harness,
author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
title = {The Language Model Evaluation Harness},
month = 07,
year = 2024,
publisher = {Zenodo},
version = {v0.4.3},
doi = {10.5281/zenodo.12608602},
url = {https://zenodo.org/records/12608602}
}
"""

# Re-export the evaluation entry points so callers can import them directly
# from `safetunebed.whitebox.evals.lm_eval` instead of the inner module.
from safetunebed.whitebox.evals.lm_eval.lm_eval import (
    LMRunEvaluationConfig,
    LMRunEvaluation,
)

# Explicit public API of this package.
__all__ = [
    "LMRunEvaluationConfig",
    "LMRunEvaluation",
]
133 changes: 133 additions & 0 deletions src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""
This work is inspired by LM-Eval's implementation by EleutherAI, their respective citation is below:
@misc{eval-harness,
author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
title = {The Language Model Evaluation Harness},
month = 07,
year = 2024,
publisher = {Zenodo},
version = {v0.4.3},
doi = {10.5281/zenodo.12608602},
url = {https://zenodo.org/records/12608602}
}
"""
# Original "driver" code, kept below as a working reference. The active
# (uncommented) code in this file is the same logic reformatted for
# integration within SafeTuneBed.
"""
import subprocess
from typing import List, Dict, Union

# no support for GGUF yet since I don't think we'll need it, however can very easily add
def run_lm_eval(
tasks: Union[str, List[str]],
pretrained_path: str,
tokenizer_path: str,
device: str = "cuda:0",
batch_size: int = 8,
) -> Dict[str, Dict[str, str]]:
# plural tasks
if isinstance(tasks, str):
tasks = [tasks]

results = {}

for task in tasks:
print(f"\n=== Running task: {task} ===")
cmd = [
"lm_eval",
"--model", "hf",
"--model_args", f"pretrained={pretrained_path},tokenizer={tokenizer_path}",
"--tasks", task,
"--device", device,
"--batch_size", str(batch_size),
]

result = subprocess.run(cmd, capture_output=True, text=True)

results[task] = {
"stdout": result.stdout,
"stderr": result.stderr,
"returncode": result.returncode,
}

return results

if __name__ == "__main__":
#def test_runner_real_execution():
results = run_lm_eval(
tasks=["hellaswag"],
pretrained_path="/model-weights/Qwen2.5-3B",
tokenizer_path="/model-weights/Qwen2.5-3B",
)

# "echo" prints all arguments back as a single string
output = results["hellaswag"]["stdout"].strip()
print(output)
"""

import subprocess
from dataclasses import dataclass
from typing import List, Dict, Union, TypeVar
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: as of Python 3.9, can use dict and list in type hints rather than importing Dict and List


import polars as pl
from safetunebed.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig

S = TypeVar(name="S", bound="LMRunEvaluationConfig")


@dataclass
class LMRunEvaluationConfig(WhiteBoxEvaluationConfig):
    """Configuration for running EleutherAI's `lm_eval` CLI as a SafeTuneBed evaluation."""

    # Single task name or list of lm_eval task names (e.g. "hellaswag").
    tasks: Union[str, List[str]]
    # Passed to lm_eval as `pretrained=` inside --model_args.
    pretrained_path: str
    # Passed to lm_eval as `tokenizer=` inside --model_args.
    tokenizer_path: str
    # Forwarded to lm_eval's --device flag.
    device: str = "cuda:0"
    # Forwarded to lm_eval's --batch_size flag.
    batch_size: int = 8
    model_checkpoint: str = "dummy"  # required by base; presumably unused here — TODO confirm
    out_dir: str = "./tmp_eval"  # required by base
    max_generation_length: int = 0  # required by base; presumably unused here — TODO confirm

class LMRunEvaluation(WhiteBoxEvaluation[S]):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you don't need to define S = TypeVar(...) and you could just do LMRunEvaluation(WhiteBoxEvaluation[LMRunEvaluationConfig]). Although it looks like we do this TypeVar thing throughout the code so maybe cleaning this up should be a separate PR

"""
Inference-only evaluation wrapper around 'lm_eval' CLI using subprocess.
Returns DataFrame response=stdout.
"""

name = "LM_EVAL_RUNNER"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the other evals (e.g., strong reject) we set the name, score, and optimization direction like:

from safetunebed.whitebox.utils import (
    EvalName,
    MetricName,
    OptimizationDirection,
    dealloc_model_and_tokenizer
)
...
    name: EvalName = EvalName.StrongReject
    objective: MetricName = MetricName.STRONGREJECT_SCORE
    attacker_direction: OptimizationDirection = OptimizationDirection.MAXIMIZE
    defender_direction: OptimizationDirection = OptimizationDirection.MINIMIZE

Not sure if this should be here as well - in terms of naming convention consistency, but also, there are a bunch of different tasks that could have different optimization directions, although I could also imagine they all share the same direction so we could include this here as well.


def _run_single_task(self, task: str) -> Dict[str, str]:
cmd = [
"lm_eval",
"--model", "hf",
"--model_args", f"pretrained={self.eval_config.pretrained_path},tokenizer={self.eval_config.tokenizer_path}",
"--output_path", "results",
"--tasks", task,
"--device", self.eval_config.device,
"--batch_size", str(self.eval_config.batch_size),
]

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there's probably some way in this function we could get the output json and parse the outputs to a json

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Screen Shot 2025-10-21 at 11 51 50 PM

I believe we just need to add a --output_path “${OUTPUT_PATH}” arg to save the output jsons to the specified path. They already handle saving and caching results.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pubmedqa_llama3_qlora_2025-10-10T15-53-15.177293.json

Example of an output json^ generated by lm_eval. We can parse this further, if required - @sdhossain

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Collaborator

@sdhossain sdhossain Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In our format we'd need a bit more information - i.e. what each datapoint was - the metric score that it has, etc.

some for datapoint 1:

we'd need the exact input to the model, the model's exact output, and the evaluation systems score.

I've attached an example csv we have for another evaluation (we store it as a .parquet - see StrongREJECT as an example)

evaluator_scores.csv

Right now - (from what I saw in the json, I mainly saw configs, and an aggregate score). This would perhaps make it difficult to do further analysis, i.e. if we wanted to check responses using an LLM-Judge, etc.

result = subprocess.run(cmd, capture_output=True, text=True)

return {
"task": task,
"stdout": result.stdout.strip(),
"stderr": result.stderr.strip(),
"returncode": str(result.returncode),
}

def compute_inferences(self) -> pl.DataFrame:
tasks = self.eval_config.tasks
if isinstance(tasks, str):
tasks = [tasks]

logs = []
for task in tasks:
print(f"\n=== Running task: {task} ===")
logs.append(self._run_single_task(task))

return pl.DataFrame(logs) #Inference schema doesn't really make sense until now for me.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove this comment, or elaborate on what is confusing?


    def compute_scores(self, inferences) -> None:
        """No-op: aggregate scoring is delegated to the lm_eval CLI run itself.

        NOTE(review): per-datapoint input/output/score records are not yet
        parsed out of lm_eval's output JSON into the project's usual
        per-sample format — TODO confirm whether that parsing belongs here.
        """
        return None

    def compute_results(self, scores) -> None:
        """No-op: result aggregation is likewise delegated to the lm_eval CLI."""
        return None
28 changes: 28 additions & 0 deletions tests/evals/test_lm_eval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""Sanity check for LMRunEvaluation (print-only)."""

import tempfile
from dotenv import load_dotenv

from safetunebed.whitebox.evals.lm_eval.lm_eval import (
LMRunEvaluation,
LMRunEvaluationConfig,
)

if __name__ == "__main__":
load_dotenv()

with tempfile.TemporaryDirectory() as tmpdirname:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: Do we need the tempfile directory if we are testing print-only? Probably makes sense to keep if we incorporate the logging and test that too though, which seems like a good idea.

eval_config = LMRunEvaluationConfig(
tasks=["hellaswag"],
pretrained_path="/model-weights/Qwen2.5-3B",
tokenizer_path="/model-weights/Qwen2.5-3B",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just wondering because I am curious: when would the pretrained_path and tokenizer_path be different?

batch_size=1,
)

evaluator = LMRunEvaluation(eval_config=eval_config)

print("=== Running LMRunEvaluation ===")
inferences = evaluator.compute_inferences()

print("\n=== RAW INFERENCES OUTPUT ===")
print(inferences)