+        {subtitle}
+      )}
+      {children}
+    >
+  );
+};
+
+/**
+ * Forecasting Performance Over Time header (left-aligned)
+ */
+export const FutureEvalForecastingPerformanceHeader: React.FC = () => {
+  return (
+      Updated every day based on our standardized forecasting performance
+      measurement methodology.
+      View full leaderboard
+      FutureEval measures AI’s ability to predict future outcomes, which is
+      essential in many real-world tasks. Models that score high in our
+      benchmark will be better at planning, risk assessment, and
+      decision-making.
+      Learn more
+      We run all major models with a simple prompt on most open Metaculus
+      forecasting questions, and collect their forecasts. As questions
+      resolve, we score the models’ forecasts and continuously update our
+      leaderboard to rank them against each other.
+      Since we measure against real world events, it takes time for new
+      models to populate the leaderboard.
+      Updated every day based on our standardized forecasting performance
+      measurement methodology.
+            {/* Leaderboard header row */}
+            Model
+            Forecasts
+            Avg Score
+            {hasCI && (
+              <>
+                95% CI lower
+                95% CI higher
+              </>
+            )}
+            {/* One body row per ranked model */}
+            {i + 1}
+            {(r.icons.light || r.icons.dark) && (
+              /* light/dark model icon images */
+            )}
+            {r.label}
+            {r.username}
+            {r.forecasts}
+            {fmt(r.score, 2)}
+            {hasCI && (
+              <>
+                {fmt(r.ciLower, 2)}
+                {fmt(r.ciUpper, 2)}
+              </>
+            )}
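Reader's note on the cells above: they touch only a few fields of `r`, a `fmt` helper, and an aggregate score with an optional 95% CI. The sketch below spells out the shapes this implies; the type name, the optional CI fields, `fmt`'s fallback, and the normal-approximation CI are assumptions, not taken from this diff.

```ts
// Sketch of the row shape the leaderboard cells appear to assume. Only the
// accessed field names are grounded in the diff; everything else is a guess.
type LeaderboardRow = {
  label: string;                            // display name of the model
  username: string;                         // bot account behind the forecasts
  icons: { light?: string; dark?: string }; // theme-specific icon URLs
  forecasts: number;                        // number of scored forecasts
  score: number;                            // value shown in "Avg Score"
  ciLower?: number;                         // shown only when hasCI is true
  ciUpper?: number;
};

// Hypothetical fmt(value, 2): fixed-precision formatting with a dash fallback.
const fmt = (value: number | null | undefined, digits: number): string =>
  value == null ? "—" : value.toFixed(digits);

// Sketch only: how per-question scores could roll up into the Avg Score and
// 95% CI columns. The actual scoring rule and CI method are not specified in
// this diff; a normal approximation is assumed here.
function summarizeScores(scores: number[]) {
  const n = scores.length;
  const avg = scores.reduce((sum, s) => sum + s, 0) / Math.max(n, 1);
  const variance =
    scores.reduce((sum, s) => sum + (s - avg) ** 2, 0) / Math.max(n - 1, 1);
  const halfWidth = 1.96 * Math.sqrt(variance / Math.max(n, 1));
  return { avg, ciLower: avg - halfWidth, ciUpper: avg + halfWidth };
}
```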
+              We run all major models with a simple prompt on most open
+              Metaculus forecasting questions, and collect their forecasts. As
+              questions resolve, we score the models’ forecasts and
+              continuously update our leaderboard to rank them against each
+              other. We also plot trends in model release date and score over
+              time.
+            </>
+          ),
+        },
+        {
+          icon: faBullseye,
+          title: "Bots vs Humans",
+          linkHref:
+            "/notebooks/38928/futureeval-resources-page/#what-do-the-tournaments-look-like",
+          content: (
+            <>
+              We also run seasonal and biweekly Benchmarking Tournaments with
+              $175k in combined prizes. They are open to all, and the best
+              scaffold builders compete to share the prize pool in proportion
+              to their bot’s accuracy. Some of the forecasting questions are
+              also submitted to our top human forecasters, allowing a direct
+              comparison.
+            </>
+          ),
+        },
+        {
+          icon: faBrain,
+          title: "Reasoning Beyond Memorization",
+          linkHref:
+            "/notebooks/38928/futureeval-resources-page/#what-is-unique-about-futureeval",
+          content: (
+            <>
+              Our diverse question topics span economics, politics, tech,
+              sports, war, elections, society, and more. This forces models to
+              generalize beyond memorization in actively evolving,
+              interdisciplinary domains relevant to the real world, which
+              correlates with skill in long-term planning and decision-making.
+            </>
+          ),
+        },
+      ] as const;
+
+  return (
+      FutureEval measures AI’s ability to predict future outcomes, which is
+      essential in many real-world tasks. Models that score high in our
+      benchmark will be better at planning, risk assessment, and
+      decision-making. FutureEval is guaranteed leak-proof, since answers are
+      not yet known at test time.
+      FutureEval has two arms: a fixed-prompt benchmark to compare model
+      performance directly, and a bots vs. humans tournament to probe the
+      frontier of scaffolding.
+      Learn how to submit your forecasting bot in 30 minutes
+
+      {content}
+      {description}
+      Make sure to check out{" "}
+      MiniBench
+      , our shorter-term experimental Bot Tournament!
+      {item.description}
+
+      {/* Auto-scrolling carousel */}
+      {carouselChips.length > 0 && (
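The `{/* Auto-scrolling carousel */}` fragment above only shows that a chip strip renders when `carouselChips` is non-empty; its markup did not survive extraction. As a rough illustration only (component name, props, speed, and styling are all assumed, not from this diff), such a strip can be as small as:

```tsx
import React, { useEffect, useRef } from "react";

// Hypothetical stand-in for the auto-scrolling chip strip: duplicates the
// chips and advances the scroll offset each animation frame so the loop
// wraps seamlessly.
const AutoScrollingChips: React.FC<{ chips: string[] }> = ({ chips }) => {
  const trackRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    const el = trackRef.current;
    if (!el) return;
    let offset = 0;
    let raf = 0;
    const step = () => {
      // Half of scrollWidth is one copy of the duplicated chip list.
      offset = (offset + 0.5) % Math.max(el.scrollWidth / 2, 1);
      el.scrollLeft = offset;
      raf = requestAnimationFrame(step);
    };
    raf = requestAnimationFrame(step);
    return () => cancelAnimationFrame(raf);
  }, []);

  return (
    <div ref={trackRef} style={{ display: "flex", gap: 8, overflow: "hidden" }}>
      {[...chips, ...chips].map((chip, i) => (
        <span key={i}>{chip}</span>
      ))}
    </div>
  );
};

export default AutoScrollingChips;
```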
       {subtitle}
       {children}
@@ -51,9 +51,9 @@ export const AIBBenchmarkModelsSubsectionHeader: React.FC = () => {
       infoHref="/notebooks/38928/futureeval-resources-page/#what-is-the-model-leaderboard"
     >
       {t.rich("aibBenchModelsBlurb", {
         br: () => <br />,
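For context on the `t.rich` call in this hunk, assuming `t` comes from a next-intl-style `useTranslations` hook: the `br` entry maps a `<br></br>` marker in the translated message onto a real line break. The message below is purely illustrative; the actual blurb copy is not part of this diff.

```ts
// Illustrative only: the shape of the message that
// t.rich("aibBenchModelsBlurb", { br: () => <br /> }) would consume.
const messages = {
  aibBenchModelsBlurb:
    "First line of the blurb.<br></br>Second line of the blurb.",
};
```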
diff --git a/front_end/src/app/(main)/aib/components/aib/tabs/benchmark/performance-over-time/aib-benchmark-forecasting-performance.tsx b/front_end/src/app/(main)/aib/components/aib/tabs/benchmark/performance-over-time/aib-benchmark-forecasting-performance.tsx
index 586d94f621..4495adbe14 100644
--- a/front_end/src/app/(main)/aib/components/aib/tabs/benchmark/performance-over-time/aib-benchmark-forecasting-performance.tsx
+++ b/front_end/src/app/(main)/aib/components/aib/tabs/benchmark/performance-over-time/aib-benchmark-forecasting-performance.tsx
@@ -21,20 +21,17 @@ const AIBBenchmarkForecastingPerformance: React.FC = () => {
if (!firstIdxByGroup.has(group)) firstIdxByGroup.set(group, i);
});
+ // Show all companies in the legend (no filtering)
const legend = [
...Array.from(firstIdxByGroup.entries()).map(([label, pointIndex]) => ({
label,
pointIndex,
})),
{ label: t("aibSOTALinearTrend"), trend: true as const },
- {
- label: t("aibSotaModels"),
- sota: true as const,
- },
];
return (
-
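After this hunk, `legend` mixes per-company entries with a single linear-trend entry. A discriminated union like the sketch below (the type and helper names are assumptions, not from the diff) is one way downstream rendering code can tell them apart:

```ts
// Shape implied by the legend construction above; names are assumptions.
type LegendEntry =
  | { label: string; pointIndex: number } // first data point of each company group
  | { label: string; trend: true };       // the SOTA linear-trend line

// Narrowing helper a legend renderer could use.
const isTrendEntry = (
  entry: LegendEntry
): entry is Extract<LegendEntry, { trend: true }> => "trend" in entry;
```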