From fcb926805bc231e510ada01301468e7c021442ea Mon Sep 17 00:00:00 2001
From: Punya Syon Pandey <101074452+psyonp@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:03:22 -0400
Subject: [PATCH 1/6] Create lm_eval.py

---
 .../whitebox/evals/lm_eval/lm_eval.py         | 133 ++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 src/safetunebed/whitebox/evals/lm_eval/lm_eval.py

diff --git a/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py b/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
new file mode 100644
index 00000000..29def966
--- /dev/null
+++ b/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
@@ -0,0 +1,133 @@
+"""
+This work is inspired by EleutherAI's LM Evaluation Harness (lm-eval); its citation is given below:
+@misc{eval-harness,
+  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
+  title        = {The Language Model Evaluation Harness},
+  month        = 07,
+  year         = 2024,
+  publisher    = {Zenodo},
+  version      = {v0.4.3},
+  doi          = {10.5281/zenodo.12608602},
+  url          = {https://zenodo.org/records/12608602}
+}
+"""
+# Original standalone "driver" code (known to work), kept below as a string for reference only. The live code after it is the same logic reformatted for integration with SafeTuneBed.
+"""
+import subprocess
+from typing import List, Dict, Union
+
+# no support for GGUF yet since I don't think we'll need it, however can very easily add
+def run_lm_eval(
+    tasks: Union[str, List[str]],
+    pretrained_path: str,
+    tokenizer_path: str,
+    device: str = "cuda:0",
+    batch_size: int = 8,
+) -> Dict[str, Dict[str, str]]:
+    # plural tasks
+    if isinstance(tasks, str):
+        tasks = [tasks]
+
+    results = {}
+
+    for task in tasks:
+        print(f"\n=== Running task: {task} ===")
+        cmd = [
+            "lm_eval",
+            "--model", "hf",
+            "--model_args", f"pretrained={pretrained_path},tokenizer={tokenizer_path}",
+            "--tasks", task,
+            "--device", device,
+            "--batch_size", str(batch_size),
+        ]
+
+        result = subprocess.run(cmd, capture_output=True, text=True)
+
+        results[task] = {
+            "stdout": result.stdout,
+            "stderr": result.stderr,
+            "returncode": result.returncode,
+        }
+
+    return results
+
+if __name__ == "__main__":
+#def test_runner_real_execution():
+    results = run_lm_eval(
+        tasks=["hellaswag"],
+        pretrained_path="/model-weights/Qwen2.5-3B",
+        tokenizer_path="/model-weights/Qwen2.5-3B",
+    )
+
+    # "echo" prints all arguments back as a single string
+    output = results["hellaswag"]["stdout"].strip()
+    print(output)
+"""
+
+import subprocess
+from dataclasses import dataclass
+from typing import List, Dict, Union, TypeVar
+
+import polars as pl
+from safetunebed.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig
+from safetunebed.whitebox.evals.output_schema import InferenceSchema
+
+S = TypeVar(name="S", bound="LMRunEvaluationConfig")
+
+
+@dataclass
+class LMRunEvaluationConfig(WhiteBoxEvaluationConfig):  # settings for running tasks through the lm_eval CLI
+    tasks: Union[str, List[str]]  # one lm_eval task name, or a list of names (each is run as its own CLI call)
+    pretrained_path: str  # sent to lm_eval as `pretrained=` inside --model_args
+    tokenizer_path: str  # sent to lm_eval as `tokenizer=` inside --model_args
+    device: str = "cuda:0"  # forwarded to lm_eval --device
+    batch_size: int = 8  # forwarded to lm_eval --batch_size (converted to str)
+    model_checkpoint: str = "dummy"   # placeholder value; not read in this file, but the field is required by the base config
+    out_dir: str = "./tmp_eval"       # default output directory; not read in this file, but the field is required by the base config
+    max_generation_length: int = 0    # placeholder value; not read in this file, but the field is required by the base config
+
+
+class LMRunEvaluation(WhiteBoxEvaluation[S]):
+    """
+    Inference-only evaluation that shells out to the 'lm_eval' CLI, launching one subprocess per configured task.
+    `compute_inferences` returns a DataFrame with one row per task: task, stdout, stderr, returncode (all strings).
+    """
+
+    name = "LM_EVAL_RUNNER"
+
+    def _run_single_task(self, task: str) -> Dict[str, str]:
+        cmd = [
+            "lm_eval",
+            "--model", "hf",
+            "--model_args", f"pretrained={self.eval_config.pretrained_path},tokenizer={self.eval_config.tokenizer_path}",
+            "--tasks", task,
+            "--device", self.eval_config.device,
+            "--batch_size", str(self.eval_config.batch_size),
+        ]
+
+        result = subprocess.run(cmd, capture_output=True, text=True)  # `check` is not set, so a non-zero exit does not raise; it surfaces in "returncode"
+
+        return {
+            "task": task,
+            "stdout": result.stdout.strip(),
+            "stderr": result.stderr.strip(),
+            "returncode": str(result.returncode),  # stringified so every value in the row is a str
+        }
+
+    def compute_inferences(self) -> pl.DataFrame:
+        tasks = self.eval_config.tasks
+        if isinstance(tasks, str):  # accept a bare task name by wrapping it in a one-element list
+            tasks = [tasks]
+
+        logs = []
+        for task in tasks:
+            print(f"\n=== Running task: {task} ===")
+            logs.append(self._run_single_task(task))
+
+        return pl.DataFrame(logs)  # NOTE(review): rows are raw per-task CLI logs; presumably not InferenceSchema-shaped — confirm against the base class contract
+
+    def compute_scores(self, inferences):
+        return None  # scoring is not implemented for this evaluation
+
+    def compute_results(self, scores):
+        return None  # results aggregation is not implemented for this evaluation

From 12d3b231523b41254e3e0ab69fa5697604a6aa7c Mon Sep 17 00:00:00 2001
From: Punya Syon Pandey <101074452+psyonp@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:05:11 -0400
Subject: [PATCH 2/6] Create __init__.py

---
 .../whitebox/evals/lm_eval/__init__.py        | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 src/safetunebed/whitebox/evals/lm_eval/__init__.py

diff --git a/src/safetunebed/whitebox/evals/lm_eval/__init__.py b/src/safetunebed/whitebox/evals/lm_eval/__init__.py
new file mode 100644
index 00000000..c39eb5e4
--- /dev/null
+++ b/src/safetunebed/whitebox/evals/lm_eval/__init__.py
@@ -0,0 +1,24 @@
+"""LM-Eval Integration for SafeTuneBed.
+
+This work is based on EleutherAI's LM-Eval Harness, and can be cited as follows:
+@misc{eval-harness,
+  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
+  title        = {The Language Model Evaluation Harness},
+  month        = 07,
+  year         = 2024,
+  publisher    = {Zenodo},
+  version      = {v0.4.3},
+  doi          = {10.5281/zenodo.12608602},
+  url          = {https://zenodo.org/records/12608602}
+}
+"""
+
+from safetunebed.whitebox.evals.lm_eval_runner.runner import (
+    LMRunEvaluationConfig,
+    LMRunEvaluation,
+)
+
+__all__ = [
+    "LMRunEvaluationConfig",
+    "LMRunEvaluation",
+]

From 4f005d6873ca7801c442404816655dd032603b57 Mon Sep 17 00:00:00 2001
From: Punya Syon Pandey <101074452+psyonp@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:06:55 -0400
Subject: [PATCH 3/6] Create test_lm_eval.py

---
 tests/evals/test_lm_eval.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100644 tests/evals/test_lm_eval.py

diff --git a/tests/evals/test_lm_eval.py b/tests/evals/test_lm_eval.py
new file mode 100644
index 00000000..856fe190
--- /dev/null
+++ b/tests/evals/test_lm_eval.py
@@ -0,0 +1,28 @@
+"""Sanity check for LMRunEvaluation (print-only)."""
+
+import tempfile
+from dotenv import load_dotenv
+
+from safetunebed.whitebox.evals.lm_eval.lm_eval import (
+    LMRunEvaluation,
+    LMRunEvaluationConfig,
+)
+
+if __name__ == "__main__":  # manual sanity check: runs only when executed as a script, prints output without asserting
+    load_dotenv()
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        eval_config = LMRunEvaluationConfig(
+            tasks=["hellaswag"],
+            pretrained_path="/model-weights/Qwen2.5-3B",
+            tokenizer_path="/model-weights/Qwen2.5-3B",
+            batch_size=1,
+            out_dir=tmpdirname,  # use the temp dir created for this run instead of the "./tmp_eval" default
+        )
+        evaluator = LMRunEvaluation(eval_config=eval_config)
+
+        print("=== Running LMRunEvaluation ===")
+        inferences = evaluator.compute_inferences()
+
+        print("\n=== RAW INFERENCES OUTPUT ===")
+        print(inferences)

From 7b6310710ed38b4a90da37df465d5f108bebe700 Mon Sep 17 00:00:00 2001
From: Punya Syon Pandey <101074452+psyonp@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:07:26 -0400
Subject: [PATCH 4/6] Update __init__.py

---
 src/safetunebed/whitebox/evals/lm_eval/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/safetunebed/whitebox/evals/lm_eval/__init__.py b/src/safetunebed/whitebox/evals/lm_eval/__init__.py
index c39eb5e4..e4cc3259 100644
--- a/src/safetunebed/whitebox/evals/lm_eval/__init__.py
+++ b/src/safetunebed/whitebox/evals/lm_eval/__init__.py
@@ -13,7 +13,7 @@
 }
 """
 
-from safetunebed.whitebox.evals.lm_eval_runner.runner import (
+from safetunebed.whitebox.evals.lm_eval.lm_eval import (
     LMRunEvaluationConfig,
     LMRunEvaluation,
 )

From a4e0ffb98149d2cc0cd74af6ad421b7608b1c56b Mon Sep 17 00:00:00 2001
From: Punya Syon Pandey <101074452+psyonp@users.noreply.github.com>
Date: Tue, 14 Oct 2025 13:07:41 -0400
Subject: [PATCH 5/6] Update lm_eval.py

---
 src/safetunebed/whitebox/evals/lm_eval/lm_eval.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py b/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
index 29def966..91fba4de 100644
--- a/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
+++ b/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
@@ -70,7 +70,6 @@ def run_lm_eval(
 
 import polars as pl
 from safetunebed.whitebox.evals.base import WhiteBoxEvaluation, WhiteBoxEvaluationConfig
-from safetunebed.whitebox.evals.output_schema import InferenceSchema
 
 S = TypeVar(name="S", bound="LMRunEvaluationConfig")
 

From e911971e7f339940edb43ab96394205c17cecd21 Mon Sep 17 00:00:00 2001
From: Punya Syon Pandey <101074452+psyonp@users.noreply.github.com>
Date: Tue, 21 Oct 2025 16:59:01 -0400
Subject: [PATCH 6/6] Add logging

---
 src/safetunebed/whitebox/evals/lm_eval/lm_eval.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py b/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
index 91fba4de..d6f4655f 100644
--- a/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
+++ b/src/safetunebed/whitebox/evals/lm_eval/lm_eval.py
@@ -99,6 +99,7 @@ def _run_single_task(self, task: str) -> Dict[str, str]:
             "lm_eval",
             "--model", "hf",
             "--model_args", f"pretrained={self.eval_config.pretrained_path},tokenizer={self.eval_config.tokenizer_path}",
+            "--output_path", "results",
             "--tasks", task,
             "--device", self.eval_config.device,
             "--batch_size", str(self.eval_config.batch_size),