From 01285bfdf6495c15ead0b9c61ffa7b564dae0d06 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 7 Jan 2026 10:47:10 -0800 Subject: [PATCH 1/6] CI step to run model evaluation example --- .github/lm_eval_results.json | 13 ++ .github/workflows/model-evaluation.yml | 223 +++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 .github/lm_eval_results.json create mode 100644 .github/workflows/model-evaluation.yml diff --git a/.github/lm_eval_results.json b/.github/lm_eval_results.json new file mode 100644 index 000000000..fa2a37c6c --- /dev/null +++ b/.github/lm_eval_results.json @@ -0,0 +1,13 @@ +{ + "version": "0.4.9.2", + "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "limit": 5, + "seed": 42, + "evals": [ + { + "name": "arc_easy", + "acc": 0.4, + "acc_norm": 0.6 + } + ] +} diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml new file mode 100644 index 000000000..53d02b7fb --- /dev/null +++ b/.github/workflows/model-evaluation.yml @@ -0,0 +1,223 @@ +name: Model Evaluation Example + +on: + push: + branches: + - main + - dy/ci-run-evals-example + paths: + - 'shared/eval/**' + - 'shared/modeling/**' + - 'shared/core/**' + - 'shared/data-provider/**' + - 'Cargo.toml' + - 'Cargo.lock' + - '.github/workflows/model-evaluation.yml' + workflow_dispatch: # Allow manual triggering for testing + +jobs: + evaluate-model: + runs-on: ubuntu-latest + timeout-minutes: 30 + continue-on-error: true + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + tool-cache: true + + - name: Check initial disk space + run: df -h + + - name: Install Nix + uses: nixbuild/nix-quick-install-action@v31 + with: + nix_conf: | + download-buffer-size = 524288000 + accept-flake-config = true + substituters = https://cache.nixos.org/ https://cache.garnix.io/ https://nix-community.cachix.org + trusted-public-keys = cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g= cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs= + + - name: Cache Cargo dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-eval-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo-eval- + ${{ runner.os }}-cargo- + + - name: Cache HuggingFace models + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface/ + key: ${{ runner.os }}-huggingface-tinyllama-v1 + restore-keys: | + ${{ runner.os }}-huggingface- + + - name: Build evaluation binary + run: | + echo "Building psyche-eval" + nix develop --command cargo build --release --example evaluate -p psyche-eval + + - name: Check disk space after build + run: df -h + + - name: Run model evaluation + id: eval + run: | + echo "Running evaluation on TinyLlama with ARC-Easy (limit=5)" + nix develop --command cargo run --release --example evaluate -p psyche-eval -- \ + --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --tasks arc_easy \ + --limit 5 \ + --seed 42 | tee evaluation-results.txt + + echo "Evaluation complete!" 
+ + - name: Parse and display results + if: always() + run: | + echo "=== Model Evaluation Results ===" + if [ -f evaluation-results.txt ]; thent + cat evaluation-results.txt + echo "" + + # Extract the JSON results line (look for line with JSON braces and acc field) + RESULTS_LINE=$(grep -E 'ARC-?Easy.*\{.*"acc"' evaluation-results.txt || echo "") + + if [ -n "$RESULTS_LINE" ]; then + # Extract just the JSON part (everything after the colon) + JSON_RESULTS=$(echo "$RESULTS_LINE" | sed 's/^[^:]*: *//') + + if echo "$JSON_RESULTS" | jq . > /dev/null 2>&1; then + # Extract each individual metric + ACC=$(echo "$JSON_RESULTS" | jq -r '.acc // "N/A"') + ACC_NORM=$(echo "$JSON_RESULTS" | jq -r '.acc_norm // "N/A"') + ACC_UNCOND=$(echo "$JSON_RESULTS" | jq -r '.acc_uncond // "N/A"') + + echo "" + echo "📊 Metrics Summary:" + echo " • Accuracy: $ACC" + echo " • Accuracy (normalized): $ACC_NORM" + echo " • Accuracy (unconditional): $ACC_UNCOND" + + # Save for later comparison + cat > results.json < lm_eval_results.json < Date: Fri, 9 Jan 2026 07:36:04 -0800 Subject: [PATCH 2/6] evals ci: check in python script and allow 10% margin --- .github/scripts/compare_metrics.py | 74 ++++++++++++++++++++++++++ .github/workflows/model-evaluation.yml | 28 ++-------- 2 files changed, 77 insertions(+), 25 deletions(-) create mode 100755 .github/scripts/compare_metrics.py diff --git a/.github/scripts/compare_metrics.py b/.github/scripts/compare_metrics.py new file mode 100755 index 000000000..38184a806 --- /dev/null +++ b/.github/scripts/compare_metrics.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# This script parses two JSON files containing evaluation metrics from Psyche and lm_eval, +# and compares the metrics to check if they are within a 10% margin of each other. + +import json +import sys + + +# Returns (is_in_margin, margin_difference) +def compare_within_margin(psyche, lm_eval, margin=10): + try: + p = float(psyche) + l = float(lm_eval) + + if l == 0: + return (p == 0, 0.0) + + diff_percent = abs((p - l) / l * 100) + within_margin = diff_percent <= margin + return (within_margin, diff_percent) + except (ValueError, ZeroDivisionError): + return (False, 0.0) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: compare_metrics.py ") + sys.exit(1) + + psyche_file = sys.argv[1] + lm_eval_file = sys.argv[2] + + try: + with open(psyche_file, "r") as f: + psyche_data = json.load(f) + with open(lm_eval_file, "r") as f: + lm_eval_data = json.load(f) + + psyche_acc = psyche_data["metrics"]["acc"] + lm_eval_acc = lm_eval_data["metrics"]["acc"] + psyche_acc_norm = psyche_data["metrics"]["acc_norm"] + lm_eval_acc_norm = lm_eval_data["metrics"]["acc_norm"] + + print("| Metric | Psyche | lm_eval | Match |") + print("|--------|--------|---------|-------|") + + # Compare acc + acc_match, acc_diff = compare_within_margin(psyche_acc, lm_eval_acc) + if acc_match: + print(f"| acc | {psyche_acc} | {lm_eval_acc} | ✅ |") + else: + print( + f"| acc | {psyche_acc} | {lm_eval_acc} | ❌ ({acc_diff:.1f}% diff) |" + ) + + # Compare acc_norm + norm_match, norm_diff = compare_within_margin(psyche_acc_norm, lm_eval_acc_norm) + if norm_match: + print(f"| acc_norm | {psyche_acc_norm} | {lm_eval_acc_norm} | ✅ |") + else: + print( + f"| acc_norm | {psyche_acc_norm} | {lm_eval_acc_norm} | ❌ ({norm_diff:.1f}% diff) |" + ) + + except FileNotFoundError as e: + print(f"Error: Could not find file {e}") + sys.exit(1) + except KeyError as e: + print(f"Error: Missing metric in JSON {e}") + sys.exit(1) + except json.JSONDecodeError 
as e: + print(f"Error: Invalid JSON format {e}") + sys.exit(1) diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index 53d02b7fb..f48155c65 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -90,7 +90,7 @@ jobs: if: always() run: | echo "=== Model Evaluation Results ===" - if [ -f evaluation-results.txt ]; thent + if [ -f evaluation-results.txt ]; then cat evaluation-results.txt echo "" @@ -174,30 +174,8 @@ jobs: echo "" if [ -f results.json ] && [ -f lm_eval_results.json ]; then - PSYCHE_ACC=$(jq -r '.metrics.acc' results.json) - PSYCHE_ACC_NORM=$(jq -r '.metrics.acc_norm' results.json) - - LM_EVAL_ACC=$(jq -r '.metrics.acc' lm_eval_results.json) - LM_EVAL_ACC_NORM=$(jq -r '.metrics.acc_norm' lm_eval_results.json) - - echo "| Metric | Psyche | lm_eval | Match |" - echo "|--------|--------|---------|-------|" - - # Compare acc - if [ "$PSYCHE_ACC" = "$LM_EVAL_ACC" ]; then - MATCH_ACC="✅" - else - MATCH_ACC="❌" - fi - echo "| acc | $PSYCHE_ACC | $LM_EVAL_ACC | $MATCH_ACC |" - - # Compare acc_norm - if [ "$PSYCHE_ACC_NORM" = "$LM_EVAL_ACC_NORM" ]; then - MATCH_NORM="✅" - else - MATCH_NORM="❌" - fi - echo "| acc_norm | $PSYCHE_ACC_NORM | $LM_EVAL_ACC_NORM | $MATCH_NORM |" + # This script compares the two evals and reports if they're within 10% of each other + python3 .github/scripts/compare_metrics.py results.json lm_eval_results.json else echo "⚠️ Missing results files for comparison" From 56f3dded5d68509f5bcbe1c14e244d08315d764d Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Fri, 9 Jan 2026 09:46:17 -0800 Subject: [PATCH 3/6] ci evals: run without limit --- .github/lm_eval_results.json | 6 ++++-- .github/scripts/compare_metrics.py | 23 ++++++++--------------- .github/workflows/model-evaluation.yml | 6 +++--- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/.github/lm_eval_results.json b/.github/lm_eval_results.json index fa2a37c6c..b259b55c9 100644 --- a/.github/lm_eval_results.json +++ b/.github/lm_eval_results.json @@ -1,13 +1,15 @@ { "version": "0.4.9.2", + "command": "lm_eval --model hf --model_args pretrained=TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tasks arc_easy --num_fewshot 3 --seed 42", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "limit": 5, "seed": 42, + "fewshot": 3, "evals": [ { "name": "arc_easy", - "acc": 0.4, - "acc_norm": 0.6 + "acc": 0.6561, + "acc_norm": 0.6423 } ] } diff --git a/.github/scripts/compare_metrics.py b/.github/scripts/compare_metrics.py index 38184a806..70a98e755 100755 --- a/.github/scripts/compare_metrics.py +++ b/.github/scripts/compare_metrics.py @@ -42,26 +42,19 @@ def compare_within_margin(psyche, lm_eval, margin=10): psyche_acc_norm = psyche_data["metrics"]["acc_norm"] lm_eval_acc_norm = lm_eval_data["metrics"]["acc_norm"] - print("| Metric | Psyche | lm_eval | Match |") - print("|--------|--------|---------|-------|") - # Compare acc acc_match, acc_diff = compare_within_margin(psyche_acc, lm_eval_acc) - if acc_match: - print(f"| acc | {psyche_acc} | {lm_eval_acc} | ✅ |") - else: - print( - f"| acc | {psyche_acc} | {lm_eval_acc} | ❌ ({acc_diff:.1f}% diff) |" - ) + status_acc = "✅ PASS" if acc_match else f"❌ FAIL ({acc_diff:.1f}% diff)" + print( + f"acc: Psyche={psyche_acc:.4f} lm_eval={lm_eval_acc:.4f} - {status_acc}" + ) # Compare acc_norm norm_match, norm_diff = compare_within_margin(psyche_acc_norm, lm_eval_acc_norm) - if norm_match: - print(f"| acc_norm | {psyche_acc_norm} | {lm_eval_acc_norm} | ✅ |") - else: - print( - f"| acc_norm | 
{psyche_acc_norm} | {lm_eval_acc_norm} | ❌ ({norm_diff:.1f}% diff) |" - ) + status_norm = "✅ PASS" if norm_match else f"❌ FAIL ({norm_diff:.1f}% diff)" + print( + f"acc_norm: Psyche={psyche_acc_norm:.4f} lm_eval={lm_eval_acc_norm:.4f} - {status_norm}" + ) except FileNotFoundError as e: print(f"Error: Could not find file {e}") diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index f48155c65..d34afa0f8 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -77,11 +77,11 @@ jobs: - name: Run model evaluation id: eval run: | - echo "Running evaluation on TinyLlama with ARC-Easy (limit=5)" + echo "Running evaluation on TinyLlama with ARC-Easy" nix develop --command cargo run --release --example evaluate -p psyche-eval -- \ --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tasks arc_easy \ - --limit 5 \ + --num-fewshot 3 \ --seed 42 | tee evaluation-results.txt echo "Evaluation complete!" @@ -108,7 +108,7 @@ jobs: ACC_UNCOND=$(echo "$JSON_RESULTS" | jq -r '.acc_uncond // "N/A"') echo "" - echo "📊 Metrics Summary:" + echo "Metrics Summary:" echo " • Accuracy: $ACC" echo " • Accuracy (normalized): $ACC_NORM" echo " • Accuracy (unconditional): $ACC_UNCOND" From 34415a762b8805a0b187faea85447ca68e82d9ae Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Fri, 9 Jan 2026 16:53:36 -0300 Subject: [PATCH 4/6] ci evals: limit 50 --- .github/workflows/model-evaluation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index d34afa0f8..250541f42 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -82,6 +82,7 @@ jobs: --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tasks arc_easy \ --num-fewshot 3 \ + --limit 50 \ --seed 42 | tee evaluation-results.txt echo "Evaluation complete!" From ff8f034434a80b41693b51d6a7a0c328b06dfc27 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Fri, 9 Jan 2026 17:25:22 -0300 Subject: [PATCH 5/6] ci evals: Change evaluation limit from 50 to 10 --- .github/workflows/model-evaluation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index 250541f42..215c7bd8b 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -82,7 +82,7 @@ jobs: --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tasks arc_easy \ --num-fewshot 3 \ - --limit 50 \ + --limit 10 \ --seed 42 | tee evaluation-results.txt echo "Evaluation complete!" 
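A quick way to exercise the comparison step from patches 2 and 3 locally, before a CI run: write two synthetic results files in the top-level "metrics" layout that compare_metrics.py reads, then invoke the script. This is a minimal sketch, with made-up placeholder accuracy values; note that the script as checked in only reports a mismatch on stdout, it does not exit nonzero when a metric falls outside the 10% margin.

    #!/usr/bin/env python3
    # Sketch: drive .github/scripts/compare_metrics.py with synthetic inputs.
    # The {"metrics": {...}} layout mirrors what the script reads; the numbers
    # below are placeholders, not real evaluation output.
    import json
    import subprocess

    fixtures = {
        "results.json": {"metrics": {"acc": 0.62, "acc_norm": 0.64}},  # Psyche side
        "lm_eval_results.json": {"metrics": {"acc": 0.6561, "acc_norm": 0.6423}},  # lm_eval side
    }
    for path, payload in fixtures.items():
        with open(path, "w") as f:
            json.dump(payload, f)

    # Prints one PASS/FAIL line per metric; a mismatch does not fail the process.
    subprocess.run(
        ["python3", ".github/scripts/compare_metrics.py",
         "results.json", "lm_eval_results.json"],
        check=True,
    )

With the lm_eval reference at 0.6561 and a 10% relative margin, the Psyche accuracy passes anywhere in roughly [0.5905, 0.7217]; the 0.62 above is about a 5.5% difference, so both rows print PASS.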
From 43c69a20615ea02aad448d80a283d42c18ea0c32 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 4 Feb 2026 12:00:50 -0300 Subject: [PATCH 6/6] ci: try running with deepseek lite --- .github/lm_eval_results.json | 10 +++++----- .github/workflows/model-evaluation.yml | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/lm_eval_results.json b/.github/lm_eval_results.json index b259b55c9..aff043904 100644 --- a/.github/lm_eval_results.json +++ b/.github/lm_eval_results.json @@ -1,15 +1,15 @@ { "version": "0.4.9.2", - "command": "lm_eval --model hf --model_args pretrained=TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tasks arc_easy --num_fewshot 3 --seed 42", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "limit": 5, + "command": "lm_eval --model hf --model_args pretrained=deepseek-ai/DeepSeek-V2-Lite,trust_remote_code=True --tasks arc_easy --num_fewshot 3 --seed 42 --limit 50", + "model": "deepseek-ai/DeepSeek-V2-Lite", + "limit": 50, "seed": 42, "fewshot": 3, "evals": [ { "name": "arc_easy", - "acc": 0.6561, - "acc_norm": 0.6423 + "acc": 0.0, + "acc_norm": 0.0 } ] } diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index 215c7bd8b..d1c685057 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -62,7 +62,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/huggingface/ - key: ${{ runner.os }}-huggingface-tinyllama-v1 + key: ${{ runner.os }}-huggingface-deepseek-v2-lite-v1 restore-keys: | ${{ runner.os }}-huggingface- @@ -77,9 +77,9 @@ jobs: - name: Run model evaluation id: eval run: | - echo "Running evaluation on TinyLlama with ARC-Easy" + echo "Running evaluation on DeepSeek-V2-Lite with ARC-Easy" nix develop --command cargo run --release --example evaluate -p psyche-eval -- \ - --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --model deepseek-ai/DeepSeek-V2-Lite \ --tasks arc_easy \ --num-fewshot 3 \ --limit 10 \ @@ -118,7 +118,7 @@ jobs: cat > results.json < lm_eval_results.json <
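The reference file now carries acc and acc_norm of 0.0 for DeepSeek-V2-Lite, so the comparison step will report a mismatch until a real lm_eval run replaces the placeholders (compare_within_margin treats a zero reference as matching only an exact zero). A sketch of regenerating the file from a harness run follows; the output path and the "acc,none" key convention are assumptions about lm_eval 0.4.x output and should be checked against the installed version.

    #!/usr/bin/env python3
    # Sketch: rebuild .github/lm_eval_results.json from lm_eval output.
    # Assumes the harness wrote its results JSON to out/results.json and that
    # it uses the 0.4.x "results" -> task -> "acc,none" layout; verify both
    # against the installed lm_eval version before relying on this.
    import json

    with open("out/results.json") as f:
        raw = json.load(f)

    arc = raw["results"]["arc_easy"]
    reference = {
        "version": "0.4.9.2",
        "command": ("lm_eval --model hf "
                    "--model_args pretrained=deepseek-ai/DeepSeek-V2-Lite,trust_remote_code=True "
                    "--tasks arc_easy --num_fewshot 3 --seed 42 --limit 50"),
        "model": "deepseek-ai/DeepSeek-V2-Lite",
        "limit": 50,
        "seed": 42,
        "fewshot": 3,
        "evals": [{
            "name": "arc_easy",
            "acc": arc["acc,none"],
            "acc_norm": arc["acc_norm,none"],
        }],
    }

    with open(".github/lm_eval_results.json", "w") as f:
        json.dump(reference, f, indent=2)
        f.write("\n")

Note that the checked-in reference records --limit 50 while patch 5 pins the CI run to --limit 10; with samples that small, a 10% relative margin can trip on ordinary sampling noise even when both evaluators agree.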