ref(tools): Merge search/list tool pairs into unified tools #788
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Eval

on:
  workflow_dispatch:
  push:
    branches: [main]
    paths:
      - "packages/mcp-core/src/tools*"
      - "packages/mcp-server-evals/**"
      - "packages/mcp-server-mocks/**"
      - ".github/workflows/eval.yml"
  pull_request:
    paths:
      - "packages/mcp-core/src/tools*"
      - "packages/mcp-server-evals/**"
      - "packages/mcp-server-mocks/**"
      - ".github/workflows/eval.yml"

jobs:
  eval:
    environment: Actions
    runs-on: ubuntu-latest
    permissions:
      checks: write # needed by the github-script step to create a check run
      contents: read
    steps:
      - uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "20"

      # Pinned to a commit SHA for supply-chain safety (pnpm/action-setup@v4).
      - uses: pnpm/action-setup@a7487c7e89a18df4991f7f222e4898a00d66ddda
        name: Install pnpm
        with:
          run_install: false

      - name: Get pnpm store directory
        shell: bash
        run: |
          echo "STORE_PATH=$(pnpm store path --silent)" >> "$GITHUB_ENV"

      - uses: actions/cache@v4
        name: Setup pnpm cache
        with:
          path: ${{ env.STORE_PATH }}
          key: ${{ runner.os }}-pnpm-store-${{ hashFiles('**/pnpm-lock.yaml') }}
          restore-keys: |
            ${{ runner.os }}-pnpm-store-

      - name: Install dependencies
        run: pnpm install --frozen-lockfile

      - name: Run build
        run: pnpm build

      - name: Run evals
        run: pnpm eval:ci evals
        # Don't fail the job here; the status-check step below reports results.
        continue-on-error: true
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

      - name: Create eval status check
        uses: actions/github-script@v7
        # Skip for fork PRs (no write permissions) but still run for pushes, workflow_dispatch, and same-repo PRs
        if: ${{ !cancelled() && (github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository) }}
        continue-on-error: true # Don't fail workflow if check creation fails
        with:
          script: |
| const fs = require('fs'); | |
| const path = require('path'); | |
| // Read eval results | |
| const resultsPath = path.resolve(process.cwd(), 'packages/mcp-server-evals/eval-results.json'); | |
| console.log(`Reading eval results from: ${resultsPath}`); | |
| let vitestResults; | |
| try { | |
| vitestResults = JSON.parse(fs.readFileSync(resultsPath, 'utf-8')); | |
| } catch (error) { | |
| if (error.code === 'ENOENT') { | |
| throw new Error( | |
| `Eval results file not found at ${resultsPath}. The eval run likely failed before producing results. Check the "Run evals" step logs for errors.` | |
| ); | |
| } | |
| throw new Error(`Failed to read/parse eval results: ${error.message}`); | |
| } | |
| // Extract eval results from vitest format | |
| const evalResults = []; | |
| for (const testFile of vitestResults.testResults || []) { | |
| for (const test of testFile.assertionResults || []) { | |
| if (test.meta?.eval) { | |
| evalResults.push({ | |
| name: test.fullName || test.title, | |
| file: testFile.name, | |
| avgScore: test.meta.eval.avgScore ?? null, | |
| scores: test.meta.eval.scores || [], | |
| passed: test.status === 'passed', | |
| duration: test.duration, | |
| }); | |
| } | |
| } | |
| } | |
| // Calculate statistics | |
| const totalTests = evalResults.length; | |
| // Treat null scores as 0.0 for consistent categorization | |
| const scores = evalResults.map(r => r.avgScore !== null ? r.avgScore : 0); | |
| const avgScore = scores.length > 0 | |
| ? scores.reduce((sum, score) => sum + score, 0) / scores.length | |
| : 0; | |
| const green = scores.filter(s => s >= 0.75).length; | |
| const yellow = scores.filter(s => s >= 0.5 && s < 0.75).length; | |
| const red = scores.filter(s => s < 0.5).length; | |
| // Determine conclusion | |
| const conclusion = avgScore >= 0.5 ? 'success' : 'failure'; | |
| // Format score helper | |
| function formatScore(score) { | |
| if (score >= 0.75) return `🟢 ${score.toFixed(2)}`; | |
| if (score >= 0.5) return `🟡 ${score.toFixed(2)}`; | |
| return `🔴 ${score.toFixed(2)}`; | |
| } | |
| // Build title | |
| const title = `Eval Score: ${avgScore.toFixed(2)} (${green} green, ${yellow} yellow, ${red} red)`; | |
| // Build summary | |
| const summary = [ | |
| `## Overall Statistics`, | |
| ``, | |
| `- **Total Evaluations**: ${totalTests}`, | |
| `- **Average Score**: ${formatScore(avgScore)}`, | |
| `- **Pass Threshold**: 0.50 (catastrophic failure)`, | |
| ``, | |
| `### Score Distribution`, | |
| `- 🟢 Green (≥0.75): ${green} evals`, | |
| `- 🟡 Yellow (0.50-0.74): ${yellow} evals`, | |
| `- 🔴 Red (<0.50): ${red} evals`, | |
| ].join('\n'); | |
| // Build detailed results | |
| const detailsByScore = [...evalResults].sort((a, b) => (b.avgScore || 0) - (a.avgScore || 0)); | |
| const details = [ | |
| `## Individual Eval Scores`, | |
| ``, | |
| ...detailsByScore.map(result => { | |
| const score = result.avgScore !== null ? result.avgScore : 0; | |
| const statusIcon = result.passed ? '✅' : '❌'; | |
| const scoreDisplay = formatScore(score); | |
| let line = `${statusIcon} **${result.name}**: ${scoreDisplay}`; | |
| // Add rationale for failed or low-scoring tests | |
| if (!result.passed || score < 0.75) { | |
| const firstScore = result.scores[0]; | |
| if (firstScore?.metadata?.rationale) { | |
| line += `\n - ${firstScore.metadata.rationale}`; | |
| } | |
| } | |
| return line; | |
| }), | |
| ``, | |
| `---`, | |
| ``, | |
| `### Conclusion`, | |
| ``, | |
| conclusion === 'success' | |
| ? `✅ **Passed**: Average score (${avgScore.toFixed(2)}) is above the catastrophic failure threshold (0.50)` | |
| : `❌ **Failed**: Average score (${avgScore.toFixed(2)}) is below the catastrophic failure threshold (0.50)`, | |
| ].join('\n'); | |
| // Create check run | |
| await github.rest.checks.create({ | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| name: 'Evaluation Results', | |
| head_sha: context.sha, | |
| status: 'completed', | |
| conclusion: conclusion, | |
| output: { | |
| title: title, | |
| summary: summary, | |
| text: details, | |
| }, | |
| }); | |
| console.log(`✅ Check run created with conclusion: ${conclusion}`); | |
| console.log(` Average Score: ${avgScore.toFixed(2)}`); |