diff --git a/.github/lm_eval_results.json b/.github/lm_eval_results.json
new file mode 100644
index 000000000..aff043904
--- /dev/null
+++ b/.github/lm_eval_results.json
@@ -0,0 +1,15 @@
+{
+  "version": "0.4.9.2",
+  "command": "lm_eval --model hf --model_args pretrained=deepseek-ai/DeepSeek-V2-Lite,trust_remote_code=True --tasks arc_easy --num_fewshot 3 --seed 42 --limit 50",
+  "model": "deepseek-ai/DeepSeek-V2-Lite",
+  "limit": 50,
+  "seed": 42,
+  "fewshot": 3,
+  "evals": [
+    {
+      "name": "arc_easy",
+      "acc": 0.0,
+      "acc_norm": 0.0
+    }
+  ]
+}
diff --git a/.github/scripts/compare_metrics.py b/.github/scripts/compare_metrics.py
new file mode 100755
index 000000000..70a98e755
--- /dev/null
+++ b/.github/scripts/compare_metrics.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python3
+
+# This script parses two JSON files containing evaluation metrics from Psyche and lm_eval,
+# and compares the metrics to check if they are within a 10% margin of each other.
+
+import json
+import sys
+
+
+# Returns (is_in_margin, margin_difference)
+def compare_within_margin(psyche, lm_eval, margin=10):
+    try:
+        p = float(psyche)
+        l = float(lm_eval)
+
+        if l == 0:
+            return (p == 0, 0.0)
+
+        diff_percent = abs((p - l) / l * 100)
+        within_margin = diff_percent <= margin
+        return (within_margin, diff_percent)
+    except (ValueError, ZeroDivisionError):
+        return (False, 0.0)
+
+
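+# Example with hypothetical numbers: Psyche acc = 0.72 vs lm_eval acc = 0.80 gives
+# diff_percent = |(0.72 - 0.80) / 0.80| * 100 = 10.0, exactly at the default 10%
+# margin, which still counts as within margin; 0.71 (an 11.25% diff) would fail.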
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print("Usage: compare_metrics.py <psyche_results.json> <lm_eval_results.json>")
+        sys.exit(1)
+
+    psyche_file = sys.argv[1]
+    lm_eval_file = sys.argv[2]
+
+    try:
+        with open(psyche_file, "r") as f:
+            psyche_data = json.load(f)
+        with open(lm_eval_file, "r") as f:
+            lm_eval_data = json.load(f)
+
+        psyche_acc = psyche_data["metrics"]["acc"]
+        lm_eval_acc = lm_eval_data["metrics"]["acc"]
+        psyche_acc_norm = psyche_data["metrics"]["acc_norm"]
+        lm_eval_acc_norm = lm_eval_data["metrics"]["acc_norm"]
+
+        # Compare acc
+        acc_match, acc_diff = compare_within_margin(psyche_acc, lm_eval_acc)
+        status_acc = "✅ PASS" if acc_match else f"❌ FAIL ({acc_diff:.1f}% diff)"
+        print(
+            f"acc: Psyche={psyche_acc:.4f} lm_eval={lm_eval_acc:.4f} - {status_acc}"
+        )
+
+        # Compare acc_norm
+        norm_match, norm_diff = compare_within_margin(psyche_acc_norm, lm_eval_acc_norm)
+        status_norm = "✅ PASS" if norm_match else f"❌ FAIL ({norm_diff:.1f}% diff)"
+        print(
+            f"acc_norm: Psyche={psyche_acc_norm:.4f} lm_eval={lm_eval_acc_norm:.4f} - {status_norm}"
+        )
+
+    except FileNotFoundError as e:
+        print(f"Error: Could not find file {e}")
+        sys.exit(1)
+    except KeyError as e:
+        print(f"Error: Missing metric in JSON {e}")
+        sys.exit(1)
+    except json.JSONDecodeError as e:
+        print(f"Error: Invalid JSON format {e}")
+        sys.exit(1)
diff --git a/.github/workflows/model-evaluation.yml b/.github/workflows/model-evaluation.yml
new file mode 100644
index 000000000..d1c685057
--- /dev/null
+++ b/.github/workflows/model-evaluation.yml
@@ -0,0 +1,202 @@
+name: Model Evaluation Example
+
+on:
+  push:
+    branches:
+      - main
+      - dy/ci-run-evals-example
+    paths:
+      - 'shared/eval/**'
+      - 'shared/modeling/**'
+      - 'shared/core/**'
+      - 'shared/data-provider/**'
+      - 'Cargo.toml'
+      - 'Cargo.lock'
+      - '.github/workflows/model-evaluation.yml'
+  workflow_dispatch: # Allow manual triggering for testing
+
+jobs:
+  evaluate-model:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    continue-on-error: true
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Free Disk Space (Ubuntu)
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+
+      - name: Check initial disk space
+        run: df -h
+
+      - name: Install Nix
+        uses: nixbuild/nix-quick-install-action@v31
+        with:
+          nix_conf: |
+            download-buffer-size = 524288000
+            accept-flake-config = true
+            substituters = https://cache.nixos.org/ https://cache.garnix.io/ https://nix-community.cachix.org
+            trusted-public-keys = cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g= cache.nixos.org-1:6NCHdD59X431o0gWypbMrAURkbJ16ZPMQFGspcDShjY= nix-community.cachix.org-1:mB9FSh9qf2dCimDSUo8Zy7bkq5CX+/rkCWyvRCYg3Fs=
+
+      - name: Cache Cargo dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/bin/
+            ~/.cargo/registry/index/
+            ~/.cargo/registry/cache/
+            ~/.cargo/git/db/
+            target/
+          key: ${{ runner.os }}-cargo-eval-${{ hashFiles('**/Cargo.lock') }}
+          restore-keys: |
+            ${{ runner.os }}-cargo-eval-
+            ${{ runner.os }}-cargo-
+
+      - name: Cache HuggingFace models
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface/
+          key: ${{ runner.os }}-huggingface-deepseek-v2-lite-v1
+          restore-keys: |
+            ${{ runner.os }}-huggingface-
+
+      - name: Build evaluation binary
+        run: |
+          echo "Building psyche-eval"
+          nix develop --command cargo build --release --example evaluate -p psyche-eval
+
+      - name: Check disk space after build
+        run: df -h
+
+      - name: Run model evaluation
+        id: eval
+        run: |
+          echo "Running evaluation on DeepSeek-V2-Lite with ARC-Easy"
+          nix develop --command cargo run --release --example evaluate -p psyche-eval -- \
+            --model deepseek-ai/DeepSeek-V2-Lite \
+            --tasks arc_easy \
+            --num-fewshot 3 \
+            --limit 10 \
+            --seed 42 | tee evaluation-results.txt
+
+          echo "Evaluation complete!"
+
+      - name: Parse and display results
+        if: always()
+        run: |
+          echo "=== Model Evaluation Results ==="
+          if [ -f evaluation-results.txt ]; then
+            cat evaluation-results.txt
+            echo ""
+
+            # Extract the JSON results line (look for line with JSON braces and acc field)
+            RESULTS_LINE=$(grep -E 'ARC-?Easy.*\{.*"acc"' evaluation-results.txt || echo "")
+
+            if [ -n "$RESULTS_LINE" ]; then
+              # Extract just the JSON part (everything after the colon)
+              JSON_RESULTS=$(echo "$RESULTS_LINE" | sed 's/^[^:]*: *//')
+
+              if echo "$JSON_RESULTS" | jq . > /dev/null 2>&1; then
+                # Extract each individual metric
+                ACC=$(echo "$JSON_RESULTS" | jq -r '.acc // "N/A"')
+                ACC_NORM=$(echo "$JSON_RESULTS" | jq -r '.acc_norm // "N/A"')
+                ACC_UNCOND=$(echo "$JSON_RESULTS" | jq -r '.acc_uncond // "N/A"')
+
+                echo ""
+                echo "Metrics Summary:"
+                echo " • Accuracy: $ACC"
+                echo " • Accuracy (normalized): $ACC_NORM"
+                echo " • Accuracy (unconditional): $ACC_UNCOND"
+
+                # Save for later comparison
+                cat > results.json <<EOF
+          {"metrics": {"acc": $ACC, "acc_norm": $ACC_NORM}}
+          EOF
+                cat > lm_eval_results.json <<EOF