metaculus-bot/backtest.py at main · No-Stream/metaculus-bot · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
"""Ground truth backtesting: score bot predictions against actual resolved outcomes."""

import argparse
import asyncio
import logging
import os
import sys
import time
from typing import Any

import typeguard
from forecasting_tools import Benchmarker, ForecastBot, MonetaryCostManager
from tqdm import tqdm

from metaculus_bot.aiohttp_cleanup import enable_aiohttp_session_autoclose
from metaculus_bot.backtest.analysis import (
    BacktestResult,
    aggregate_scores,
    generate_backtest_report,
    save_backtest_data,
)
from metaculus_bot.backtest.leakage import screen_research_for_leakage
from metaculus_bot.backtest.question_prep import BacktestQuestionSet, fetch_resolved_questions
from metaculus_bot.backtest.scoring import QuestionScore, score_report
from metaculus_bot.benchmark.bot_factory import (
    BENCHMARK_BOT_CONFIG,
    DEFAULT_HELPER_LLMS,
    INDIVIDUAL_MODEL_SPECS,
    create_individual_bots,
)
from metaculus_bot.benchmark.heartbeat import install_benchmarker_heartbeat
from metaculus_bot.benchmark.logging_setup import configure_benchmark_logging
from metaculus_bot.config import load_environment
from metaculus_bot.constants import (
    BACKTEST_DEFAULT_RESOLVED_AFTER,
    BACKTEST_DEFAULT_TOURNAMENT,
    BENCHMARK_BATCH_SIZE,
    HEARTBEAT_INTERVAL,
)
from metaculus_bot.scoring_patches import apply_scoring_patches

logger: logging.Logger = logging.getLogger(__name__)

load_environment()


enable_aiohttp_session_autoclose()


_progress_state: dict[str, Any] = {
    "total_predictions": 0,
    "start_time": 0,
    "completed_batches": 0,
    "total_batches": 0,
    "pbar": None,
}


install_benchmarker_heartbeat(HEARTBEAT_INTERVAL, _progress_state)


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(description="Ground truth backtesting against resolved questions")
    parser.add_argument(
        "--num-questions",
        type=int,
        default=20,
        help="Number of resolved questions to backtest (default: 20)",
    )
    parser.add_argument(
        "--resolved-after",
        type=str,
        default=BACKTEST_DEFAULT_RESOLVED_AFTER,
        help=f"Only use questions resolved after this date (default: {BACKTEST_DEFAULT_RESOLVED_AFTER})",
    )
    parser.add_argument(
        "--tournament",
        type=str,
        default=BACKTEST_DEFAULT_TOURNAMENT,
        help=f"Tournament slug to fetch resolved questions from (default: {BACKTEST_DEFAULT_TOURNAMENT})",
    )
    parser.add_argument(
        "--include-models",
        nargs="*",
        default=None,
        help="Only include models matching these substrings (case-insensitive)",
    )
    parser.add_argument(
        "--exclude-models",
        nargs="*",
        default=None,
        help="Exclude models by substring match (case-insensitive)",
    )
    return parser


def _filter_bots(
    bots: list[ForecastBot],
    include_models: list[str] | None,
    exclude_models: list[str] | None,
) -> list[ForecastBot]:
    """Filter bots by include/exclude substring matching on bot name."""
    filtered = list(bots)

    if include_models:
        filtered = [b for b in filtered if any(token.lower() in b.name.lower() for token in include_models)]

    if exclude_models:
        filtered = [b for b in filtered if not any(token.lower() in b.name.lower() for token in exclude_models)]

    if not filtered:
        available_names = [b.name for b in bots]
        raise ValueError(
            f"No bots remaining after model filtering. "
            f"Available: {available_names}, include={include_models}, exclude={exclude_models}"
        )

    logger.info(f"Model filtering: {len(bots)} -> {len(filtered)} bots: {[b.name for b in filtered]}")
    return filtered


async def run_backtest(args: argparse.Namespace) -> None:
    """Run the full backtest pipeline."""
    # 1. Fetch resolved questions and extract ground truths
    logger.info(
        f"Fetching {args.num_questions} resolved questions "
        f"(tournament={args.tournament}, resolved_after={args.resolved_after})"
    )
    sys.stdout.flush()

    question_set: BacktestQuestionSet = await fetch_resolved_questions(
        total_questions=args.num_questions,
        resolved_after=args.resolved_after,
        tournament=args.tournament,
    )

    logger.info(
        f"Prepared {len(question_set.questions)} questions with {len(question_set.ground_truths)} ground truths"
    )

    # 2. Leakage pre-screening
    clean_questions, clean_ground_truths, research_cache = await screen_research_for_leakage(
        question_set.questions,
        question_set.ground_truths,
    )
    question_set.questions = clean_questions
    question_set.ground_truths = clean_ground_truths
    question_set.research_cache = research_cache

    logger.info(f"After leakage screening: {len(clean_questions)} clean questions")

    # 3. Create bots
    bots: list[ForecastBot] = create_individual_bots(
        INDIVIDUAL_MODEL_SPECS,
        DEFAULT_HELPER_LLMS,
        BENCHMARK_BOT_CONFIG,
        batch_size=BENCHMARK_BATCH_SIZE,
        research_cache=research_cache,
    )
    bots = typeguard.check_type(bots, list[ForecastBot])

    bots = _filter_bots(bots, args.include_models, args.exclude_models)

    # 4. Apply scoring patches for mixed question types
    apply_scoring_patches()

    with MonetaryCostManager() as cost_manager:
        # 5. Run Benchmarker
        total_predictions = len(bots) * len(clean_questions)
        logger.info(
            f"Starting backtest: {len(bots)} bots x {len(clean_questions)} questions "
            f"= {total_predictions} total predictions"
        )
        sys.stdout.flush()

        _progress_state.update(
            {
                "total_predictions": total_predictions,
                "start_time": time.time(),
                "completed_batches": 0,
                "total_batches": len(bots),
                "pbar": tqdm(total=total_predictions, desc="Backtesting", unit="predictions"),
            }
        )

        benchmarks = await Benchmarker(
            questions_to_use=clean_questions,
            forecast_bots=bots,
            file_path_to_save_reports="backtests/",
            concurrent_question_batch_size=BENCHMARK_BATCH_SIZE,
        ).run_benchmark()

        if _progress_state["pbar"] is not None:
            _progress_state["pbar"].close()
            _progress_state["pbar"] = None

        logger.info("Benchmarker completed, scoring against ground truth...")
        sys.stdout.flush()

        # 6. Score each bot's reports against ground truth
        results: list[BacktestResult] = []
        for benchmark in benchmarks:
            bot_scores: list[QuestionScore] = []
            num_scored = 0
            num_failed = 0

            for report in benchmark.forecast_reports:
                qid = report.question.id_of_question
                if qid not in clean_ground_truths:
                    num_failed += 1
                    logger.warning(f"No ground truth for question {qid}, skipping")
                    continue

                report_scores = score_report(report, clean_ground_truths[qid])
                if report_scores:
                    bot_scores.extend(report_scores)
                    num_scored += 1
                else:
                    num_failed += 1

            result = BacktestResult(
                bot_name=benchmark.name,
                scores=bot_scores,
                num_questions=len(benchmark.forecast_reports),
                num_scored=num_scored,
                num_failed=num_failed,
            )
            results.append(result)

            aggregated = aggregate_scores(bot_scores)
            logger.info(f"Bot '{benchmark.name}': scored={num_scored}, failed={num_failed}")
            for metric_name, agg in aggregated.items():
                community_str = ""
                if agg.get("community_mean") is not None:
                    community_str = f" | Community: {agg['community_mean']:.4f}"
                logger.info(f"  {metric_name}: Bot mean = {agg['bot_mean']:.4f} (n={agg['n']}){community_str}")

        # 7. Generate report and save data
        report_text = generate_backtest_report(results, question_set, output_path="backtests/backtest_report.md")
        save_backtest_data(question_set, results, output_dir="backtests")

        logger.info(f"\nTotal Cost: {cost_manager.current_usage}")
        logger.info("\n" + "=" * 60)
        logger.info("BACKTEST REPORT")
        logger.info("=" * 60)
        logger.info(report_text)


if __name__ == "__main__":
    os.environ["PYTHONUNBUFFERED"] = "1"

    configure_benchmark_logging(log_dir="backtests")

    parser = _build_parser()
    args = parser.parse_args()

    asyncio.run(run_backtest(args))