From 01285bfdf6495c15ead0b9c61ffa7b564dae0d06 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 7 Jan 2026 10:47:10 -0800 Subject: [PATCH 1/6] CI step to run model evaluation example --- .github/lm_eval_results.json | 13 ++ .github/workflows/model-evaluation.yml | 223 +++++++++++++++++++++++++ 2 files changed, 236 insertions(+) create mode 100644 .github/lm_eval_results.json create mode 100644 .github/workflows/model-evaluation.yml diff --git a/.github/lm_eval_results.json b/.github/lm_eval_results.json new file mode 100644 index 000000000..fa2a37c6c --- /dev/null +++ b/.github/lm_eval_results.json @@ -0,0 +1,13 @@ +{ + "version": "0.4.9.2", + "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "limit": 5, + "seed": 42, + "evals": [ + { + "name": "arc_easy", + "acc": 0.4, + "acc_norm": 0.6 + } + ] +} diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml new file mode 100644 index 000000000..53d02b7fb --- /dev/null +++ b/.github/workflows/model-evaluation.yml @@ -0,0 +1,223 @@ +name: Model Evaluation Example + +on: + push: + branches: + - main + - dy/ci-run-evals-example + paths: + - 'shared/eval/**' + - 'shared/modeling/**' + - 'shared/core/**' + - 'shared/data-provider/**' + - 'Cargo.toml' + - 'Cargo.lock' + - '.github/workflows/model-evaluation.yml' + workflow_dispatch: # Allow manual triggering for testing + +jobs: + evaluate-model: + runs-on: ubuntu-latest + timeout-minutes: 30 + continue-on-error: true + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 1 + + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + tool-cache: true + + - name: Check initial disk space + run: df -h + + - name: Install Nix + uses: nixbuild/nix-quick-install-action@v31 + with: + nix_conf: | + download-buffer-size = 524288000 + accept-flake-config = true + substituters = https://cache.nixos.org/ https://cache.garnix.io/ https://nix-community.cachix.org + trusted-public-keys = cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g= cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs= + + - name: Cache Cargo dependencies + uses: actions/cache@v4 + with: + path: | + ~/.cargo/bin/ + ~/.cargo/registry/index/ + ~/.cargo/registry/cache/ + ~/.cargo/git/db/ + target/ + key: ${{ runner.os }}-cargo-eval-${{ hashFiles('**/Cargo.lock') }} + restore-keys: | + ${{ runner.os }}-cargo-eval- + ${{ runner.os }}-cargo- + + - name: Cache HuggingFace models + uses: actions/cache@v4 + with: + path: ~/.cache/huggingface/ + key: ${{ runner.os }}-huggingface-tinyllama-v1 + restore-keys: | + ${{ runner.os }}-huggingface- + + - name: Build evaluation binary + run: | + echo "Building psyche-eval" + nix develop --command cargo build --release --example evaluate -p psyche-eval + + - name: Check disk space after build + run: df -h + + - name: Run model evaluation + id: eval + run: | + echo "Running evaluation on TinyLlama with ARC-Easy (limit=5)" + nix develop --command cargo run --release --example evaluate -p psyche-eval -- \ + --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --tasks arc_easy \ + --limit 5 \ + --seed 42 | tee evaluation-results.txt + + echo "Evaluation complete!" 
+ + - name: Parse and display results + if: always() + run: | + echo "=== Model Evaluation Results ===" + if [ -f evaluation-results.txt ]; thent + cat evaluation-results.txt + echo "" + + # Extract the JSON results line (look for line with JSON braces and acc field) + RESULTS_LINE=$(grep -E 'ARC-?Easy.*\{.*"acc"' evaluation-results.txt || echo "") + + if [ -n "$RESULTS_LINE" ]; then + # Extract just the JSON part (everything after the colon) + JSON_RESULTS=$(echo "$RESULTS_LINE" | sed 's/^[^:]*: *//') + + if echo "$JSON_RESULTS" | jq . > /dev/null 2>&1; then + # Extract each individual metric + ACC=$(echo "$JSON_RESULTS" | jq -r '.acc // "N/A"') + ACC_NORM=$(echo "$JSON_RESULTS" | jq -r '.acc_norm // "N/A"') + ACC_UNCOND=$(echo "$JSON_RESULTS" | jq -r '.acc_uncond // "N/A"') + + echo "" + echo "📊 Metrics Summary:" + echo " • Accuracy: $ACC" + echo " • Accuracy (normalized): $ACC_NORM" + echo " • Accuracy (unconditional): $ACC_UNCOND" + + # Save for later comparison + cat > results.json < lm_eval_results.json < Date: Fri, 9 Jan 2026 07:36:04 -0800 Subject: [PATCH 2/6] evals ci: check in python script and allow 10% margin --- .github/scripts/compare_metrics.py | 74 ++++++++++++++++++++++++++ .github/workflows/model-evaluation.yml | 28 ++-------- 2 files changed, 77 insertions(+), 25 deletions(-) create mode 100755 .github/scripts/compare_metrics.py diff --git a/.github/scripts/compare_metrics.py b/.github/scripts/compare_metrics.py new file mode 100755 index 000000000..38184a806 --- /dev/null +++ b/.github/scripts/compare_metrics.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python3 + +# This script parses two JSON files containing evaluation metrics from Psyche and lm_eval, +# and compares the metrics to check if they are within a 10% margin of each other. + +import json +import sys + + +# Returns (is_in_margin, margin_difference) +def compare_within_margin(psyche, lm_eval, margin=10): + try: + p = float(psyche) + l = float(lm_eval) + + if l == 0: + return (p == 0, 0.0) + + diff_percent = abs((p - l) / l * 100) + within_margin = diff_percent <= margin + return (within_margin, diff_percent) + except (ValueError, ZeroDivisionError): + return (False, 0.0) + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("Usage: compare_metrics.py ") + sys.exit(1) + + psyche_file = sys.argv[1] + lm_eval_file = sys.argv[2] + + try: + with open(psyche_file, "r") as f: + psyche_data = json.load(f) + with open(lm_eval_file, "r") as f: + lm_eval_data = json.load(f) + + psyche_acc = psyche_data["metrics"]["acc"] + lm_eval_acc = lm_eval_data["metrics"]["acc"] + psyche_acc_norm = psyche_data["metrics"]["acc_norm"] + lm_eval_acc_norm = lm_eval_data["metrics"]["acc_norm"] + + print("| Metric | Psyche | lm_eval | Match |") + print("|--------|--------|---------|-------|") + + # Compare acc + acc_match, acc_diff = compare_within_margin(psyche_acc, lm_eval_acc) + if acc_match: + print(f"| acc | {psyche_acc} | {lm_eval_acc} | ✅ |") + else: + print( + f"| acc | {psyche_acc} | {lm_eval_acc} | ❌ ({acc_diff:.1f}% diff) |" + ) + + # Compare acc_norm + norm_match, norm_diff = compare_within_margin(psyche_acc_norm, lm_eval_acc_norm) + if norm_match: + print(f"| acc_norm | {psyche_acc_norm} | {lm_eval_acc_norm} | ✅ |") + else: + print( + f"| acc_norm | {psyche_acc_norm} | {lm_eval_acc_norm} | ❌ ({norm_diff:.1f}% diff) |" + ) + + except FileNotFoundError as e: + print(f"Error: Could not find file {e}") + sys.exit(1) + except KeyError as e: + print(f"Error: Missing metric in JSON {e}") + sys.exit(1) + except json.JSONDecodeError 
as e: + print(f"Error: Invalid JSON format {e}") + sys.exit(1) diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index 53d02b7fb..f48155c65 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -90,7 +90,7 @@ jobs: if: always() run: | echo "=== Model Evaluation Results ===" - if [ -f evaluation-results.txt ]; thent + if [ -f evaluation-results.txt ]; then cat evaluation-results.txt echo "" @@ -174,30 +174,8 @@ jobs: echo "" if [ -f results.json ] && [ -f lm_eval_results.json ]; then - PSYCHE_ACC=$(jq -r '.metrics.acc' results.json) - PSYCHE_ACC_NORM=$(jq -r '.metrics.acc_norm' results.json) - - LM_EVAL_ACC=$(jq -r '.metrics.acc' lm_eval_results.json) - LM_EVAL_ACC_NORM=$(jq -r '.metrics.acc_norm' lm_eval_results.json) - - echo "| Metric | Psyche | lm_eval | Match |" - echo "|--------|--------|---------|-------|" - - # Compare acc - if [ "$PSYCHE_ACC" = "$LM_EVAL_ACC" ]; then - MATCH_ACC="✅" - else - MATCH_ACC="❌" - fi - echo "| acc | $PSYCHE_ACC | $LM_EVAL_ACC | $MATCH_ACC |" - - # Compare acc_norm - if [ "$PSYCHE_ACC_NORM" = "$LM_EVAL_ACC_NORM" ]; then - MATCH_NORM="✅" - else - MATCH_NORM="❌" - fi - echo "| acc_norm | $PSYCHE_ACC_NORM | $LM_EVAL_ACC_NORM | $MATCH_NORM |" + # This script compares the two evals and reports if they're within 10% of each other + python3 .github/scripts/compare_metrics.py results.json lm_eval_results.json else echo "⚠️ Missing results files for comparison" From 56f3dded5d68509f5bcbe1c14e244d08315d764d Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Fri, 9 Jan 2026 09:46:17 -0800 Subject: [PATCH 3/6] ci evals: run without limit --- .github/lm_eval_results.json | 6 ++++-- .github/scripts/compare_metrics.py | 23 ++++++++--------------- .github/workflows/model-evaluation.yml | 6 +++--- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/.github/lm_eval_results.json b/.github/lm_eval_results.json index fa2a37c6c..b259b55c9 100644 --- a/.github/lm_eval_results.json +++ b/.github/lm_eval_results.json @@ -1,13 +1,15 @@ { "version": "0.4.9.2", + "command": "lm_eval --model hf --model_args pretrained=TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tasks arc_easy --num_fewshot 3 --seed 42", "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", "limit": 5, "seed": 42, + "fewshot": 3, "evals": [ { "name": "arc_easy", - "acc": 0.4, - "acc_norm": 0.6 + "acc": 0.6561, + "acc_norm": 0.6423 } ] } diff --git a/.github/scripts/compare_metrics.py b/.github/scripts/compare_metrics.py index 38184a806..70a98e755 100755 --- a/.github/scripts/compare_metrics.py +++ b/.github/scripts/compare_metrics.py @@ -42,26 +42,19 @@ def compare_within_margin(psyche, lm_eval, margin=10): psyche_acc_norm = psyche_data["metrics"]["acc_norm"] lm_eval_acc_norm = lm_eval_data["metrics"]["acc_norm"] - print("| Metric | Psyche | lm_eval | Match |") - print("|--------|--------|---------|-------|") - # Compare acc acc_match, acc_diff = compare_within_margin(psyche_acc, lm_eval_acc) - if acc_match: - print(f"| acc | {psyche_acc} | {lm_eval_acc} | ✅ |") - else: - print( - f"| acc | {psyche_acc} | {lm_eval_acc} | ❌ ({acc_diff:.1f}% diff) |" - ) + status_acc = "✅ PASS" if acc_match else f"❌ FAIL ({acc_diff:.1f}% diff)" + print( + f"acc: Psyche={psyche_acc:.4f} lm_eval={lm_eval_acc:.4f} - {status_acc}" + ) # Compare acc_norm norm_match, norm_diff = compare_within_margin(psyche_acc_norm, lm_eval_acc_norm) - if norm_match: - print(f"| acc_norm | {psyche_acc_norm} | {lm_eval_acc_norm} | ✅ |") - else: - print( - f"| acc_norm | 
{psyche_acc_norm} | {lm_eval_acc_norm} | ❌ ({norm_diff:.1f}% diff) |" - ) + status_norm = "✅ PASS" if norm_match else f"❌ FAIL ({norm_diff:.1f}% diff)" + print( + f"acc_norm: Psyche={psyche_acc_norm:.4f} lm_eval={lm_eval_acc_norm:.4f} - {status_norm}" + ) except FileNotFoundError as e: print(f"Error: Could not find file {e}") diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index f48155c65..d34afa0f8 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -77,11 +77,11 @@ jobs: - name: Run model evaluation id: eval run: | - echo "Running evaluation on TinyLlama with ARC-Easy (limit=5)" + echo "Running evaluation on TinyLlama with ARC-Easy" nix develop --command cargo run --release --example evaluate -p psyche-eval -- \ --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tasks arc_easy \ - --limit 5 \ + --num-fewshot 3 \ --seed 42 | tee evaluation-results.txt echo "Evaluation complete!" @@ -108,7 +108,7 @@ jobs: ACC_UNCOND=$(echo "$JSON_RESULTS" | jq -r '.acc_uncond // "N/A"') echo "" - echo "📊 Metrics Summary:" + echo "Metrics Summary:" echo " • Accuracy: $ACC" echo " • Accuracy (normalized): $ACC_NORM" echo " • Accuracy (unconditional): $ACC_UNCOND" From 34415a762b8805a0b187faea85447ca68e82d9ae Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Fri, 9 Jan 2026 16:53:36 -0300 Subject: [PATCH 4/6] ci evals: limit 50 --- .github/workflows/model-evaluation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index d34afa0f8..250541f42 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -82,6 +82,7 @@ jobs: --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tasks arc_easy \ --num-fewshot 3 \ + --limit 50 \ --seed 42 | tee evaluation-results.txt echo "Evaluation complete!" From ff8f034434a80b41693b51d6a7a0c328b06dfc27 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Fri, 9 Jan 2026 17:25:22 -0300 Subject: [PATCH 5/6] ci evals: Change evaluation limit from 50 to 10 --- .github/workflows/model-evaluation.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index 250541f42..215c7bd8b 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -82,7 +82,7 @@ jobs: --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ --tasks arc_easy \ --num-fewshot 3 \ - --limit 50 \ + --limit 10 \ --seed 42 | tee evaluation-results.txt echo "Evaluation complete!" 
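A quick way to exercise the comparison step from patches 2 and 3 locally, before a CI run: write two synthetic results files in the top-level "metrics" layout that compare_metrics.py reads, then invoke the script. This is a minimal sketch, with made-up placeholder accuracy values; note that the script as checked in only reports a mismatch on stdout, it does not exit nonzero when a metric falls outside the 10% margin.

    #!/usr/bin/env python3
    # Sketch: drive .github/scripts/compare_metrics.py with synthetic inputs.
    # The {"metrics": {...}} layout mirrors what the script reads; the numbers
    # below are placeholders, not real evaluation output.
    import json
    import subprocess

    fixtures = {
        "results.json": {"metrics": {"acc": 0.62, "acc_norm": 0.64}},  # Psyche side
        "lm_eval_results.json": {"metrics": {"acc": 0.6561, "acc_norm": 0.6423}},  # lm_eval side
    }
    for path, payload in fixtures.items():
        with open(path, "w") as f:
            json.dump(payload, f)

    # Prints one PASS/FAIL line per metric; a mismatch does not fail the process.
    subprocess.run(
        ["python3", ".github/scripts/compare_metrics.py",
         "results.json", "lm_eval_results.json"],
        check=True,
    )

With the lm_eval reference at 0.6561 and a 10% relative margin, the Psyche accuracy passes anywhere in roughly [0.5905, 0.7217]; the 0.62 above is about a 5.5% difference, so both rows print PASS.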
From 43c69a20615ea02aad448d80a283d42c18ea0c32 Mon Sep 17 00:00:00 2001 From: Dylan Socolobsky Date: Wed, 4 Feb 2026 12:00:50 -0300 Subject: [PATCH 6/6] ci: try running with deepseek lite --- .github/lm_eval_results.json | 10 +++++----- .github/workflows/model-evaluation.yml | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/lm_eval_results.json b/.github/lm_eval_results.json index b259b55c9..aff043904 100644 --- a/.github/lm_eval_results.json +++ b/.github/lm_eval_results.json @@ -1,15 +1,15 @@ { "version": "0.4.9.2", - "command": "lm_eval --model hf --model_args pretrained=TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tasks arc_easy --num_fewshot 3 --seed 42", - "model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "limit": 5, + "command": "lm_eval --model hf --model_args pretrained=deepseek-ai/DeepSeek-V2-Lite,trust_remote_code=True --tasks arc_easy --num_fewshot 3 --seed 42 --limit 50", + "model": "deepseek-ai/DeepSeek-V2-Lite", + "limit": 50, "seed": 42, "fewshot": 3, "evals": [ { "name": "arc_easy", - "acc": 0.6561, - "acc_norm": 0.6423 + "acc": 0.0, + "acc_norm": 0.0 } ] } diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml index 215c7bd8b..d1c685057 100644 --- a/.github/workflows/model-evaluation.yml +++ b/.github/workflows/model-evaluation.yml @@ -62,7 +62,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/huggingface/ - key: ${{ runner.os }}-huggingface-tinyllama-v1 + key: ${{ runner.os }}-huggingface-deepseek-v2-lite-v1 restore-keys: | ${{ runner.os }}-huggingface- @@ -77,9 +77,9 @@ jobs: - name: Run model evaluation id: eval run: | - echo "Running evaluation on TinyLlama with ARC-Easy" + echo "Running evaluation on DeepSeek-V2-Lite with ARC-Easy" nix develop --command cargo run --release --example evaluate -p psyche-eval -- \ - --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --model deepseek-ai/DeepSeek-V2-Lite \ --tasks arc_easy \ --num-fewshot 3 \ --limit 10 \ @@ -118,7 +118,7 @@ jobs: cat > results.json < lm_eval_results.json <
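The reference file now carries acc and acc_norm of 0.0 for DeepSeek-V2-Lite, so the comparison step will report a mismatch until a real lm_eval run replaces the placeholders (compare_within_margin treats a zero reference as matching only an exact zero). A sketch of regenerating the file from a harness run follows; the output path and the "acc,none" key convention are assumptions about lm_eval 0.4.x output and should be checked against the installed version.

    #!/usr/bin/env python3
    # Sketch: rebuild .github/lm_eval_results.json from lm_eval output.
    # Assumes the harness wrote its results JSON to out/results.json and that
    # it uses the 0.4.x "results" -> task -> "acc,none" layout; verify both
    # against the installed lm_eval version before relying on this.
    import json

    with open("out/results.json") as f:
        raw = json.load(f)

    arc = raw["results"]["arc_easy"]
    reference = {
        "version": "0.4.9.2",
        "command": ("lm_eval --model hf "
                    "--model_args pretrained=deepseek-ai/DeepSeek-V2-Lite,trust_remote_code=True "
                    "--tasks arc_easy --num_fewshot 3 --seed 42 --limit 50"),
        "model": "deepseek-ai/DeepSeek-V2-Lite",
        "limit": 50,
        "seed": 42,
        "fewshot": 3,
        "evals": [{
            "name": "arc_easy",
            "acc": arc["acc,none"],
            "acc_norm": arc["acc_norm,none"],
        }],
    }

    with open(".github/lm_eval_results.json", "w") as f:
        json.dump(reference, f, indent=2)
        f.write("\n")

Note that the checked-in reference records --limit 50 while patch 5 pins the CI run to --limit 10; with samples that small, a 10% relative margin can trip on ordinary sampling noise even when both evaluators agree.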