From b66ffa5e0b78a41cb5752e339b9a11cba5e07f21 Mon Sep 17 00:00:00 2001 From: riceharvest Date: Sun, 22 Mar 2026 11:10:04 +0100 Subject: [PATCH 1/3] Add AI image sorting toggle and fix leaderboard parsing issues --- app/page.tsx | 93 ++++++++++- app/submission/[id]/page.tsx | 34 ++-- components/leaderboard-header.tsx | 22 +++ components/leaderboard-view.tsx | 20 ++- components/task-breakdown.tsx | 6 +- components/task-heatmap.tsx | 250 +++++++++++++++--------------- lib/task-metadata.ts | 7 + lib/transforms.ts | 10 +- 8 files changed, 296 insertions(+), 146 deletions(-) diff --git a/app/page.tsx b/app/page.tsx index fc968c7..cfe5bb2 100644 --- a/app/page.tsx +++ b/app/page.tsx @@ -1,10 +1,11 @@ import type { Metadata } from 'next' -import { fetchLeaderboard, fetchBenchmarkVersions } from '@/lib/api' -import { calculateRanks, transformLeaderboardEntry } from '@/lib/transforms' +import { fetchLeaderboard, fetchBenchmarkVersions, fetchSubmission } from '@/lib/api' +import { calculateRanks, transformLeaderboardEntry, EPSILON, estimateSuccessfulTasks } from '@/lib/transforms' +import { isExcludedLeaderboardTask } from '@/lib/task-metadata' import { LeaderboardView } from '@/components/leaderboard-view' interface HomeProps { - searchParams: Promise<{ version?: string; view?: string; official?: string }> + searchParams: Promise<{ version?: string; view?: string; official?: string; excludeImageGen?: string }> } export async function generateMetadata({ searchParams }: HomeProps): Promise { @@ -42,13 +43,94 @@ export async function generateMetadata({ searchParams }: HomeProps): Promise + fetchSubmission(entry.submission_id) + .then(res => res.submission) + .catch(err => { + console.error('Failed to fetch submission for', entry.submission_id, err) + return null + }) + ) + ) + bestSubmissions.push(...results) + } + + const adjustedEntries = [] + + for (let i = 0; i < transformedEntries.length; i++) { + const entry = transformedEntries[i] + const sub = bestSubmissions[i] + + // Preserve the original entry if submission details are temporarily unavailable. + if (!sub) { + adjustedEntries.push(entry) + continue + } + + const excludedTasks = sub.tasks.filter(task => isExcludedLeaderboardTask(task.task_id)) + if (excludedTasks.length === 0) { + adjustedEntries.push(entry) + continue + } + + const excludedScore = excludedTasks.reduce((sum, task) => sum + task.score, 0) + const excludedMax = excludedTasks.reduce((sum, task) => sum + task.max_score, 0) + const remainingTaskCount = sub.tasks.filter(task => !isExcludedLeaderboardTask(task.task_id)).length + + const adjustedScore = sub.total_score - excludedScore + const adjustedMax = sub.max_score - excludedMax + if (adjustedMax <= 0 || remainingTaskCount <= 0) { + adjustedEntries.push(entry) + continue + } + + const adjustedPercentage = adjustedScore / adjustedMax + if (!Number.isFinite(adjustedPercentage)) { + adjustedEntries.push(entry) + continue + } + + const bestCost = entry.best_cost_usd + let value_score: number | null = null + if (bestCost != null && bestCost > EPSILON) { + value_score = (adjustedPercentage * 100) / bestCost + } + + let cpst: number | null = null + const successfulTasks = estimateSuccessfulTasks(adjustedPercentage, remainingTaskCount) + if (bestCost != null && bestCost > EPSILON && successfulTasks != null && successfulTasks > 0) { + cpst = bestCost / successfulTasks + } + + adjustedEntries.push({ + ...entry, + percentage: adjustedPercentage * 100, + value_score, + cpst, + }) + } + + transformedEntries = adjustedEntries + } + + const entries = calculateRanks(transformedEntries) const latestTimestamp = entries.reduce((latest, entry) => { const current = new Date(entry.timestamp).getTime() return Number.isNaN(current) ? latest : Math.max(latest, current) @@ -71,6 +153,7 @@ export default async function Home({ searchParams }: HomeProps) { versions={versionsResponse.versions} currentVersion={version ?? null} officialOnly={officialOnly} + excludeImageGen={excludeImageGenBool} /> ) } diff --git a/app/submission/[id]/page.tsx b/app/submission/[id]/page.tsx index 0276191..8faa590 100644 --- a/app/submission/[id]/page.tsx +++ b/app/submission/[id]/page.tsx @@ -13,16 +13,18 @@ import { PROVIDER_COLORS } from '@/lib/types' import { formatDistanceToNow } from 'date-fns' import { fetchSubmission } from '@/lib/api' import { transformSubmission } from '@/lib/transforms' +import { isExcludedLeaderboardTask } from '@/lib/task-metadata' interface SubmissionPageProps { params: Promise<{ id: string }> - searchParams: Promise<{ official?: string }> + searchParams: Promise<{ official?: string; excludeImageGen?: string }> } export default async function SubmissionPage({ params, searchParams }: SubmissionPageProps) { const { id } = await params - const { official } = await searchParams + const { official, excludeImageGen } = await searchParams const officialOnly = official !== 'false' + const excludeImageGenBool = excludeImageGen === 'true' let submission try { @@ -68,7 +70,23 @@ export default async function SubmissionPage({ params, searchParams }: Submissio notFound() } - const categoryStats = submission.task_results.reduce( + // Apply image generation exclusion if requested + let displayTasks = submission.task_results + let displayTotalScore = submission.total_score + let displayMaxScore = submission.max_score + + if (excludeImageGenBool) { + const excludedTasks = submission.task_results.filter(task => isExcludedLeaderboardTask(task.task_id)) + if (excludedTasks.length > 0) { + const excludedScore = excludedTasks.reduce((sum, task) => sum + task.score, 0) + const excludedMax = excludedTasks.reduce((sum, task) => sum + task.max_score, 0) + displayTotalScore = submission.total_score - excludedScore + displayMaxScore = submission.max_score - excludedMax + displayTasks = submission.task_results.filter(task => !isExcludedLeaderboardTask(task.task_id)) + } + } + + const categoryStats = displayTasks.reduce( (acc, task) => { if (!acc[task.category]) { acc[task.category] = { total: 0, max: 0, count: 0 } @@ -191,8 +209,8 @@ export default async function SubmissionPage({ params, searchParams }: Submissio
@@ -237,10 +255,10 @@ export default async function SubmissionPage({ params, searchParams }: Submissio Task Breakdown

- {submission.task_results.length} tasks completed + {displayTasks.length} tasks completed

- + {/* Hardware Info */} {submission.metadata.system && ( @@ -276,8 +294,6 @@ export default async function SubmissionPage({ params, searchParams }: Submissio - - ) } diff --git a/components/leaderboard-header.tsx b/components/leaderboard-header.tsx index 14113da..4c595d1 100644 --- a/components/leaderboard-header.tsx +++ b/components/leaderboard-header.tsx @@ -19,10 +19,12 @@ interface LeaderboardHeaderProps { scoreMode: ScoreMode officialOnly: boolean openWeightsOnly: boolean + excludeImageGen?: boolean onViewChange: (view: ViewMode) => void onScoreModeChange: (mode: ScoreMode) => void onOfficialOnlyChange: (officialOnly: boolean) => void onOpenWeightsOnlyChange: (openWeightsOnly: boolean) => void + onExcludeImageGenChange?: (value: boolean) => void onClearProviderFilter: () => void } @@ -38,10 +40,12 @@ export function LeaderboardHeader({ scoreMode, officialOnly, openWeightsOnly, + excludeImageGen, onViewChange, onScoreModeChange, onOfficialOnlyChange, onOpenWeightsOnlyChange, + onExcludeImageGenChange, onClearProviderFilter, }: LeaderboardHeaderProps) { return ( @@ -111,6 +115,15 @@ export function LeaderboardHeader({ /> Open-weight only + Updated {lastUpdated} @@ -227,6 +240,15 @@ export function LeaderboardHeader({ /> Open-weight only + Updated {lastUpdated} diff --git a/components/leaderboard-view.tsx b/components/leaderboard-view.tsx index 640bb62..1896792 100644 --- a/components/leaderboard-view.tsx +++ b/components/leaderboard-view.tsx @@ -1,6 +1,6 @@ 'use client' -import { useCallback, useMemo, useState } from 'react' +import { useCallback, useMemo, useState, useEffect } from 'react' import { useSearchParams, useRouter, usePathname } from 'next/navigation' import type { LeaderboardEntry, BenchmarkVersion } from '@/lib/types' import { PROVIDER_COLORS } from '@/lib/types' @@ -25,9 +25,10 @@ interface LeaderboardViewProps { versions: BenchmarkVersion[] currentVersion: string | null officialOnly: boolean + excludeImageGen?: boolean } -export function LeaderboardView({ entries, lastUpdated, versions, currentVersion, officialOnly }: LeaderboardViewProps) { +export function LeaderboardView({ entries, lastUpdated, versions, currentVersion, officialOnly, excludeImageGen = false }: LeaderboardViewProps) { const searchParams = useSearchParams() const router = useRouter() const pathname = usePathname() @@ -51,6 +52,7 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion const [providerFilter, setProviderFilterState] = useState(initialProvider) const [openWeightsOnly, setOpenWeightsOnlyState] = useState(initialOpenWeights) const [graphSubTab, setGraphSubTabState] = useState(initialGraphTab) + const [excludeImageGenLocal, setExcludeImageGenState] = useState(excludeImageGen) // Helper to update URL params without full page reload const updateUrl = useCallback((updates: Record) => { @@ -94,6 +96,16 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion updateUrl({ graph: t === 'scatter' ? null : t }) }, [updateUrl]) + const setExcludeImageGen = useCallback((v: boolean) => { + setExcludeImageGenState(v) + updateUrl({ excludeImageGen: v ? 'true' : null }) + }, [updateUrl]) + + // Sync prop to local state when URL changes (e.g., browser back/forward) + useEffect(() => { + setExcludeImageGenState(excludeImageGen) + }, [excludeImageGen]) + const setOfficialOnly = useCallback((v: boolean) => { setOfficialOnlyState(v) updateUrl({ official: v ? null : 'false' }) @@ -134,10 +146,12 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion scoreMode={scoreMode} officialOnly={officialOnlyState} openWeightsOnly={openWeightsOnly} + excludeImageGen={excludeImageGenLocal} onViewChange={setView} onScoreModeChange={setScoreMode} onOfficialOnlyChange={setOfficialOnly} onOpenWeightsOnlyChange={setOpenWeightsOnly} + onExcludeImageGenChange={setExcludeImageGen} onClearProviderFilter={() => setProviderFilter(null)} /> @@ -169,7 +183,7 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion )} {graphSubTab === 'heatmap' && ( - + )} {graphSubTab === 'distribution' && ( diff --git a/components/task-breakdown.tsx b/components/task-breakdown.tsx index 13c54cc..f0649cd 100644 --- a/components/task-breakdown.tsx +++ b/components/task-breakdown.tsx @@ -8,6 +8,7 @@ import { ColoredProgress } from '@/components/ui/colored-progress' import { ChevronDown, ChevronRight } from 'lucide-react' import type { TaskResult } from '@/lib/types' import { CATEGORY_ICONS } from '@/lib/types' +import { isExcludedLeaderboardTask } from '@/lib/task-metadata' import { Collapsible, CollapsibleContent, @@ -16,6 +17,7 @@ import { interface TaskBreakdownProps { tasks: TaskResult[] + excludeImageGen?: boolean } const getScoreColor = (score: number, maxScore: number) => { @@ -46,7 +48,7 @@ const getGradingBadgeVariant = ( } } -export function TaskBreakdown({ tasks }: TaskBreakdownProps) { +export function TaskBreakdown({ tasks, excludeImageGen = false }: TaskBreakdownProps) { const [openTasks, setOpenTasks] = useState>(new Set()) const toggleTask = (taskId: string) => { @@ -61,7 +63,7 @@ export function TaskBreakdown({ tasks }: TaskBreakdownProps) { return (
- {tasks.map((task) => { + {tasks.filter(task => !(excludeImageGen && isExcludedLeaderboardTask(task.task_id))).map((task) => { const percentage = (task.score / task.max_score) * 100 const isOpen = openTasks.has(task.task_id) diff --git a/components/task-heatmap.tsx b/components/task-heatmap.tsx index 4873aa6..c32787f 100644 --- a/components/task-heatmap.tsx +++ b/components/task-heatmap.tsx @@ -5,12 +5,14 @@ import type { LeaderboardEntry, TaskResult } from '@/lib/types' import { PROVIDER_COLORS, CATEGORY_ICONS } from '@/lib/types' import { fetchSubmissionClient } from '@/lib/api' import { transformSubmission } from '@/lib/transforms' +import { isExcludedLeaderboardTask } from '@/lib/task-metadata' import { ShareableWrapper } from '@/components/shareable-wrapper' import { Tooltip, TooltipContent, TooltipProvider, TooltipTrigger } from '@/components/ui/tooltip' interface TaskHeatmapProps { entries: LeaderboardEntry[] scoreMode: 'best' | 'average' + excludeImageGen?: boolean } interface ModelTaskData { @@ -37,7 +39,7 @@ function getScoreTextColor(ratio: number): string { return 'hsl(0, 70%, 75%)' } -export function TaskHeatmap({ entries, scoreMode }: TaskHeatmapProps) { +export function TaskHeatmap({ entries, scoreMode, excludeImageGen = false }: TaskHeatmapProps) { const [modelData, setModelData] = useState([]) const [loading, setLoading] = useState(true) const [error, setError] = useState(null) @@ -69,6 +71,10 @@ export function TaskHeatmap({ entries, scoreMode }: TaskHeatmapProps) { const taskMap = new Map() for (const task of submission.task_results) { + if (excludeImageGen && isExcludedLeaderboardTask(task.task_id)) { + continue + } + taskMap.set(task.task_id, { score: task.score, maxScore: task.max_score, @@ -106,7 +112,7 @@ export function TaskHeatmap({ entries, scoreMode }: TaskHeatmapProps) { loadData() return () => { cancelled = true } - }, [entries]) + }, [entries, excludeImageGen]) // Collect all unique tasks and sort by category const allTasks = useMemo(() => { @@ -248,133 +254,133 @@ export function TaskHeatmap({ entries, scoreMode }: TaskHeatmapProps) { subtitle={`${sortedModels.length} models x ${allTasks.length} tasks`} >
- - {/* Category header row */} - - - + ) + })} + +
- {categoryGroups.map((group) => ( - - - {CATEGORY_ICONS[group.category] || ''} {group.category} - + + {/* Category header row */} + + + + ))} + + {/* Task name header row */} + + - ))} - - {/* Task name header row */} - - - {allTasks.map((task) => ( - - ))} - - - - - {sortedModels.map((model) => { - const providerColor = PROVIDER_COLORS[model.provider.toLowerCase()] || '#888' - return ( - - - {allTasks.map((task) => { - const taskData = model.tasks.get(task.taskId) - const ratio = taskData ? taskData.score / taskData.maxScore : 0 - const hasData = !!taskData - const isHovered = hoveredCell?.model === model.model && hoveredCell?.taskId === task.taskId - - return ( - + + + + {sortedModels.map((model) => { + const providerColor = PROVIDER_COLORS[model.provider.toLowerCase()] || '#888' + return ( + + + {allTasks.map((task) => { + const taskData = model.tasks.get(task.taskId) + const ratio = taskData ? taskData.score / taskData.maxScore : 0 + const hasData = !!taskData + const isHovered = hoveredCell?.model === model.model && hoveredCell?.taskId === task.taskId + + return ( + - ) - })} - - ) - })} - -
+ {categoryGroups.map((group) => ( + + + {CATEGORY_ICONS[group.category] || ''} {group.category} + +
+ Model
- Model - - - -
- ( +
+ + +
- {task.taskName} - -
-
- -

{task.taskName}

-

{task.category}

-
-
-
-
- - - {model.model} - -
-
setHoveredCell({ model: model.model, taskId: task.taskId })} - onMouseLeave={() => setHoveredCell(null)} - > - {hasData && ( - {Math.round(ratio * 100)} + {task.taskName} - )} - {!hasData && ( - - - )} + + + +

{task.taskName}

+

{task.category}

+
+ + + ))} +
+
+ + + {model.model} + +
+
setHoveredCell({ model: model.model, taskId: task.taskId })} + onMouseLeave={() => setHoveredCell(null)} + > + {hasData && ( + + {Math.round(ratio * 100)} + + )} + {!hasData && ( + - + )} - {/* Tooltip */} - {isHovered && taskData && ( -
-
-

{model.model}

-

{task.taskName}

-

- Score: - {taskData.score}/{taskData.maxScore} - ({Math.round(ratio * 100)}%) -

+ {/* Tooltip */} + {isHovered && taskData && ( +
+
+

{model.model}

+

{task.taskName}

+

+ Score: + {taskData.score}/{taskData.maxScore} + ({Math.round(ratio * 100)}%) +

+
-
- )} -
+ )} + + ) + })} +
diff --git a/lib/task-metadata.ts b/lib/task-metadata.ts index f94a685..0d20d82 100644 --- a/lib/task-metadata.ts +++ b/lib/task-metadata.ts @@ -1,3 +1,10 @@ +export const IMAGE_GEN_TASK_ID = "task_13_image_gen"; +export const EXCLUDED_LEADERBOARD_TASK_IDS = new Set([IMAGE_GEN_TASK_ID]); + +export function isExcludedLeaderboardTask(taskId: string): boolean { + return EXCLUDED_LEADERBOARD_TASK_IDS.has(taskId); +} + export const TASK_FALLBACK: Record = { task_00_sanity: { name: "Sanity Check", category: "validation" }, diff --git a/lib/transforms.ts b/lib/transforms.ts index 0b0c4d8..285e167 100644 --- a/lib/transforms.ts +++ b/lib/transforms.ts @@ -8,7 +8,7 @@ import type { } from "@/lib/types"; import { TASK_FALLBACK } from "@/lib/task-metadata"; -const EPSILON = 1e-6; +export const EPSILON = 1e-6; /** * Normalize provider name. When the provider is "openrouter", the real @@ -28,11 +28,11 @@ export const normalizeProvider = (provider: string, model?: string): string => { /** * Estimate the number of successful tasks from a score percentage. - * Uses best_score_percentage * max_score (from API) — but since max_score - * is not in ApiLeaderboardEntry, we approximate using a standard task count of 40 - * (the current PinchBench task count). Falls back to null if score is unavailable. + * Since max_score is not present on ApiLeaderboardEntry, this uses an approximate + * task count. Callers with submission-level data should pass an exact remaining + * task count instead of relying on the default. */ -function estimateSuccessfulTasks( +export function estimateSuccessfulTasks( scorePercentage: number | null | undefined, taskCount = 40, ): number | null { From 186ac7f66cc8548438fa6df13bfd5e6d16bc220f Mon Sep 17 00:00:00 2001 From: riceharvest Date: Sun, 22 Mar 2026 13:05:46 +0100 Subject: [PATCH 2/3] Remove redundant task filtering on submission page --- app/submission/[id]/page.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/submission/[id]/page.tsx b/app/submission/[id]/page.tsx index 8faa590..da332ad 100644 --- a/app/submission/[id]/page.tsx +++ b/app/submission/[id]/page.tsx @@ -258,7 +258,7 @@ export default async function SubmissionPage({ params, searchParams }: Submissio {displayTasks.length} tasks completed

- + {/* Hardware Info */} {submission.metadata.system && ( From a8588c6cce71f576d5e577ad48c66ec0b7df5a8f Mon Sep 17 00:00:00 2001 From: riceharvest Date: Sun, 22 Mar 2026 13:29:20 +0100 Subject: [PATCH 3/3] Avoid refetching heatmap data when toggling image exclusion --- components/task-heatmap.tsx | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/components/task-heatmap.tsx b/components/task-heatmap.tsx index c32787f..e6b80a8 100644 --- a/components/task-heatmap.tsx +++ b/components/task-heatmap.tsx @@ -71,10 +71,6 @@ export function TaskHeatmap({ entries, scoreMode, excludeImageGen = false }: Tas const taskMap = new Map() for (const task of submission.task_results) { - if (excludeImageGen && isExcludedLeaderboardTask(task.task_id)) { - continue - } - taskMap.set(task.task_id, { score: task.score, maxScore: task.max_score, @@ -112,7 +108,7 @@ export function TaskHeatmap({ entries, scoreMode, excludeImageGen = false }: Tas loadData() return () => { cancelled = true } - }, [entries, excludeImageGen]) + }, [entries]) // Collect all unique tasks and sort by category const allTasks = useMemo(() => { @@ -127,12 +123,13 @@ export function TaskHeatmap({ entries, scoreMode, excludeImageGen = false }: Tas return Array.from(taskMap.entries()) .map(([taskId, info]) => ({ taskId, ...info })) + .filter(task => !(excludeImageGen && isExcludedLeaderboardTask(task.taskId))) .sort((a, b) => { const catCmp = a.category.localeCompare(b.category) if (catCmp !== 0) return catCmp return a.taskName.localeCompare(b.taskName) }) - }, [modelData]) + }, [modelData, excludeImageGen]) // Sort models const sortedModels = useMemo(() => {