Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 88 additions & 5 deletions app/page.tsx
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import type { Metadata } from 'next'
import { fetchLeaderboard, fetchBenchmarkVersions } from '@/lib/api'
import { calculateRanks, transformLeaderboardEntry } from '@/lib/transforms'
import { fetchLeaderboard, fetchBenchmarkVersions, fetchSubmission } from '@/lib/api'
import { calculateRanks, transformLeaderboardEntry, EPSILON, estimateSuccessfulTasks } from '@/lib/transforms'
import { isExcludedLeaderboardTask } from '@/lib/task-metadata'
import { LeaderboardView } from '@/components/leaderboard-view'

interface HomeProps {
searchParams: Promise<{ version?: string; view?: string; official?: string }>
searchParams: Promise<{ version?: string; view?: string; official?: string; excludeImageGen?: string }>
}

export async function generateMetadata({ searchParams }: HomeProps): Promise<Metadata> {
Expand Down Expand Up @@ -42,13 +43,94 @@ export async function generateMetadata({ searchParams }: HomeProps): Promise<Met
}

export default async function Home({ searchParams }: HomeProps) {
const { version, official } = await searchParams
const { version, official, excludeImageGen } = await searchParams
const officialOnly = official !== 'false'
const excludeImageGenBool = excludeImageGen === 'true'
const [response, versionsResponse] = await Promise.all([
fetchLeaderboard(version, { officialOnly }),
fetchBenchmarkVersions(),
])
const entries = calculateRanks(response.leaderboard.map(transformLeaderboardEntry))
let transformedEntries = response.leaderboard.map(transformLeaderboardEntry)

if (excludeImageGenBool) {
// Fetch best submission details and adjust scores to exclude image-generation tasks.
const bestSubmissions = []
const batchSize = 10

for (let i = 0; i < transformedEntries.length; i += batchSize) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Performance — N individual API calls on every cold-cache page load

When excludeImageGen=true, this server component fetches submission details for every leaderboard entry — one API call per entry, issued in batches of 10 that run one after another. With 50+ models on the leaderboard, that's 5+ serial rounds of API calls blocking the page response. Even with ISR (revalidate: 60), whenever the 60-second cache window has expired, the next request hits a cold cache and experiences significant latency.

Consider:

  1. Adding a dedicated API endpoint (e.g. /leaderboard?excludeTasks=task_13_image_gen) that computes adjusted scores server-side, avoiding the N+1 fetch pattern entirely.
  2. If a new endpoint isn't feasible, moving this computation to the client side (similar to how TaskHeatmap already fetches submission data client-side) so the page renders immediately and scores update asynchronously.
  3. At minimum, using Promise.all for all entries at once instead of sequential batches — the batching here throttles throughput but doesn't reduce total call count.

const batch = transformedEntries.slice(i, i + batchSize)
const results = await Promise.all(
batch.map(entry =>
fetchSubmission(entry.submission_id)
.then(res => res.submission)
.catch(err => {
console.error('Failed to fetch submission for', entry.submission_id, err)
return null
})
)
)
bestSubmissions.push(...results)
}

const adjustedEntries = []

for (let i = 0; i < transformedEntries.length; i++) {
const entry = transformedEntries[i]
const sub = bestSubmissions[i]

// Preserve the original entry if submission details are temporarily unavailable.
if (!sub) {
adjustedEntries.push(entry)
continue
}

const excludedTasks = sub.tasks.filter(task => isExcludedLeaderboardTask(task.task_id))
if (excludedTasks.length === 0) {
adjustedEntries.push(entry)
continue
}

const excludedScore = excludedTasks.reduce((sum, task) => sum + task.score, 0)
const excludedMax = excludedTasks.reduce((sum, task) => sum + task.max_score, 0)
const remainingTaskCount = sub.tasks.filter(task => !isExcludedLeaderboardTask(task.task_id)).length

const adjustedScore = sub.total_score - excludedScore
const adjustedMax = sub.max_score - excludedMax
if (adjustedMax <= 0 || remainingTaskCount <= 0) {
adjustedEntries.push(entry)
continue
}

const adjustedPercentage = adjustedScore / adjustedMax
if (!Number.isFinite(adjustedPercentage)) {
adjustedEntries.push(entry)
continue
}

const bestCost = entry.best_cost_usd
let value_score: number | null = null
if (bestCost != null && bestCost > EPSILON) {
value_score = (adjustedPercentage * 100) / bestCost
}

let cpst: number | null = null
const successfulTasks = estimateSuccessfulTasks(adjustedPercentage, remainingTaskCount)
if (bestCost != null && bestCost > EPSILON && successfulTasks != null && successfulTasks > 0) {
cpst = bestCost / successfulTasks
}

adjustedEntries.push({
...entry,
percentage: adjustedPercentage * 100,
value_score,
cpst,
})
}

transformedEntries = adjustedEntries
}

const entries = calculateRanks(transformedEntries)
const latestTimestamp = entries.reduce((latest, entry) => {
const current = new Date(entry.timestamp).getTime()
return Number.isNaN(current) ? latest : Math.max(latest, current)
Expand All @@ -71,6 +153,7 @@ export default async function Home({ searchParams }: HomeProps) {
versions={versionsResponse.versions}
currentVersion={version ?? null}
officialOnly={officialOnly}
excludeImageGen={excludeImageGenBool}
/>
)
}
34 changes: 25 additions & 9 deletions app/submission/[id]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,18 @@ import { PROVIDER_COLORS } from '@/lib/types'
import { formatDistanceToNow } from 'date-fns'
import { fetchSubmission } from '@/lib/api'
import { transformSubmission } from '@/lib/transforms'
import { isExcludedLeaderboardTask } from '@/lib/task-metadata'

interface SubmissionPageProps {
params: Promise<{ id: string }>
searchParams: Promise<{ official?: string }>
searchParams: Promise<{ official?: string; excludeImageGen?: string }>
}

export default async function SubmissionPage({ params, searchParams }: SubmissionPageProps) {
const { id } = await params
const { official } = await searchParams
const { official, excludeImageGen } = await searchParams
const officialOnly = official !== 'false'
const excludeImageGenBool = excludeImageGen === 'true'
let submission

try {
Expand Down Expand Up @@ -68,7 +70,23 @@ export default async function SubmissionPage({ params, searchParams }: Submissio
notFound()
}

const categoryStats = submission.task_results.reduce(
// Apply image generation exclusion if requested
let displayTasks = submission.task_results
let displayTotalScore = submission.total_score
let displayMaxScore = submission.max_score

if (excludeImageGenBool) {
const excludedTasks = submission.task_results.filter(task => isExcludedLeaderboardTask(task.task_id))
if (excludedTasks.length > 0) {
const excludedScore = excludedTasks.reduce((sum, task) => sum + task.score, 0)
const excludedMax = excludedTasks.reduce((sum, task) => sum + task.max_score, 0)
displayTotalScore = submission.total_score - excludedScore
displayMaxScore = submission.max_score - excludedMax
displayTasks = submission.task_results.filter(task => !isExcludedLeaderboardTask(task.task_id))
}
}

const categoryStats = displayTasks.reduce(
(acc, task) => {
if (!acc[task.category]) {
acc[task.category] = { total: 0, max: 0, count: 0 }
Expand Down Expand Up @@ -191,8 +209,8 @@ export default async function SubmissionPage({ params, searchParams }: Submissio
<div className="grid grid-cols-1 lg:grid-cols-3 gap-6 mb-8">
<div className="lg:col-span-1">
<ScoreGauge
score={submission.total_score}
maxScore={submission.max_score}
score={displayTotalScore}
maxScore={displayMaxScore}
/>
</div>

Expand Down Expand Up @@ -237,10 +255,10 @@ export default async function SubmissionPage({ params, searchParams }: Submissio
Task Breakdown
</h2>
<p className="text-sm text-muted-foreground">
{submission.task_results.length} tasks completed
{displayTasks.length} tasks completed
</p>
</div>
<TaskBreakdown tasks={submission.task_results} />
<TaskBreakdown tasks={displayTasks} />

{/* Hardware Info */}
{submission.metadata.system && (
Expand Down Expand Up @@ -276,8 +294,6 @@ export default async function SubmissionPage({ params, searchParams }: Submissio
</div>
</Card>
</div>


</div>
)
}
22 changes: 22 additions & 0 deletions components/leaderboard-header.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,12 @@ interface LeaderboardHeaderProps {
scoreMode: ScoreMode
officialOnly: boolean
openWeightsOnly: boolean
excludeImageGen?: boolean
onViewChange: (view: ViewMode) => void
onScoreModeChange: (mode: ScoreMode) => void
onOfficialOnlyChange: (officialOnly: boolean) => void
onOpenWeightsOnlyChange: (openWeightsOnly: boolean) => void
onExcludeImageGenChange?: (value: boolean) => void
onClearProviderFilter: () => void
}

Expand All @@ -38,10 +40,12 @@ export function LeaderboardHeader({
scoreMode,
officialOnly,
openWeightsOnly,
excludeImageGen,
onViewChange,
onScoreModeChange,
onOfficialOnlyChange,
onOpenWeightsOnlyChange,
onExcludeImageGenChange,
onClearProviderFilter,
}: LeaderboardHeaderProps) {
return (
Expand Down Expand Up @@ -111,6 +115,15 @@ export function LeaderboardHeader({
/>
Open-weight only
</label>
<label className="flex items-center gap-2 text-xs text-muted-foreground/90 cursor-pointer hover:text-foreground transition-colors">
<input
type="checkbox"
checked={excludeImageGen}
onChange={(e) => onExcludeImageGenChange?.(e.target.checked)}
className="h-3.5 w-3.5 rounded border border-border/70 bg-muted/30 text-muted-foreground checked:border-muted-foreground checked:bg-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring focus-visible:ring-offset-0"
/>
Exclude image generation tasks
</label>
<span className="text-xs text-muted-foreground/60">Updated {lastUpdated}</span>
</div>
</div>
Expand Down Expand Up @@ -227,6 +240,15 @@ export function LeaderboardHeader({
/>
Open-weight only
</label>
<label className="flex items-center gap-2 cursor-pointer hover:text-foreground transition-colors">
<input
type="checkbox"
checked={excludeImageGen}
onChange={(e) => onExcludeImageGenChange?.(e.target.checked)}
className="h-3.5 w-3.5 rounded border border-border/70 bg-muted/30 text-muted-foreground checked:border-muted-foreground checked:bg-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring focus-visible:ring-offset-0"
/>
Exclude image generation tasks
</label>
<span className="text-muted-foreground/60">Updated {lastUpdated}</span>
</div>
</div>
Expand Down
20 changes: 17 additions & 3 deletions components/leaderboard-view.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'use client'

import { useCallback, useMemo, useState } from 'react'
import { useCallback, useMemo, useState, useEffect } from 'react'
import { useSearchParams, useRouter, usePathname } from 'next/navigation'
import type { LeaderboardEntry, BenchmarkVersion } from '@/lib/types'
import { PROVIDER_COLORS } from '@/lib/types'
Expand All @@ -25,9 +25,10 @@ interface LeaderboardViewProps {
versions: BenchmarkVersion[]
currentVersion: string | null
officialOnly: boolean
excludeImageGen?: boolean
}

export function LeaderboardView({ entries, lastUpdated, versions, currentVersion, officialOnly }: LeaderboardViewProps) {
export function LeaderboardView({ entries, lastUpdated, versions, currentVersion, officialOnly, excludeImageGen = false }: LeaderboardViewProps) {
const searchParams = useSearchParams()
const router = useRouter()
const pathname = usePathname()
Expand All @@ -51,6 +52,7 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion
const [providerFilter, setProviderFilterState] = useState<string | null>(initialProvider)
const [openWeightsOnly, setOpenWeightsOnlyState] = useState<boolean>(initialOpenWeights)
const [graphSubTab, setGraphSubTabState] = useState<GraphSubTab>(initialGraphTab)
const [excludeImageGenLocal, setExcludeImageGenState] = useState<boolean>(excludeImageGen)

// Helper to update URL params without full page reload
const updateUrl = useCallback((updates: Record<string, string | null>) => {
Expand Down Expand Up @@ -94,6 +96,16 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion
updateUrl({ graph: t === 'scatter' ? null : t })
}, [updateUrl])

const setExcludeImageGen = useCallback((v: boolean) => {
setExcludeImageGenState(v)
updateUrl({ excludeImageGen: v ? 'true' : null })
}, [updateUrl])

// Sync prop to local state when URL changes (e.g., browser back/forward)
useEffect(() => {
setExcludeImageGenState(excludeImageGen)
}, [excludeImageGen])

const setOfficialOnly = useCallback((v: boolean) => {
setOfficialOnlyState(v)
updateUrl({ official: v ? null : 'false' })
Expand Down Expand Up @@ -134,10 +146,12 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion
scoreMode={scoreMode}
officialOnly={officialOnlyState}
openWeightsOnly={openWeightsOnly}
excludeImageGen={excludeImageGenLocal}
onViewChange={setView}
onScoreModeChange={setScoreMode}
onOfficialOnlyChange={setOfficialOnly}
onOpenWeightsOnlyChange={setOpenWeightsOnly}
onExcludeImageGenChange={setExcludeImageGen}
onClearProviderFilter={() => setProviderFilter(null)}
/>

Expand Down Expand Up @@ -169,7 +183,7 @@ export function LeaderboardView({ entries, lastUpdated, versions, currentVersion
<ScatterGraphs entries={filteredEntries} scoreMode={scoreMode} />
)}
{graphSubTab === 'heatmap' && (
<TaskHeatmap entries={filteredEntries} scoreMode={scoreMode} />
<TaskHeatmap entries={filteredEntries} scoreMode={scoreMode} excludeImageGen={excludeImageGenLocal} />
)}
{graphSubTab === 'distribution' && (
<ScoreDistribution entries={filteredEntries} scoreMode={scoreMode} currentVersion={currentVersion} officialOnly={officialOnlyState} />
Expand Down
6 changes: 4 additions & 2 deletions components/task-breakdown.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { ColoredProgress } from '@/components/ui/colored-progress'
import { ChevronDown, ChevronRight } from 'lucide-react'
import type { TaskResult } from '@/lib/types'
import { CATEGORY_ICONS } from '@/lib/types'
import { isExcludedLeaderboardTask } from '@/lib/task-metadata'
import {
Collapsible,
CollapsibleContent,
Expand All @@ -16,6 +17,7 @@ import {

interface TaskBreakdownProps {
tasks: TaskResult[]
excludeImageGen?: boolean
}

const getScoreColor = (score: number, maxScore: number) => {
Expand Down Expand Up @@ -46,7 +48,7 @@ const getGradingBadgeVariant = (
}
}

export function TaskBreakdown({ tasks }: TaskBreakdownProps) {
export function TaskBreakdown({ tasks, excludeImageGen = false }: TaskBreakdownProps) {
const [openTasks, setOpenTasks] = useState<Set<string>>(new Set())

const toggleTask = (taskId: string) => {
Expand All @@ -61,7 +63,7 @@ export function TaskBreakdown({ tasks }: TaskBreakdownProps) {

return (
<div className="space-y-2">
{tasks.map((task) => {
{tasks.filter(task => !(excludeImageGen && isExcludedLeaderboardTask(task.task_id))).map((task) => {
const percentage = (task.score / task.max_score) * 100
const isOpen = openTasks.has(task.task_id)

Expand Down
Loading