diff --git a/records/track_non_record_16mb/2026-03-26_EvalTime_NGram_ModelGrowth_Study/README.md b/records/track_non_record_16mb/2026-03-26_EvalTime_NGram_ModelGrowth_Study/README.md new file mode 100644 index 000000000..b5869db16 --- /dev/null +++ b/records/track_non_record_16mb/2026-03-26_EvalTime_NGram_ModelGrowth_Study/README.md @@ -0,0 +1,17 @@ +# Non-Record: The N-gram BPB Scores Are Not Real + +**Author:** abaybektursun | **Date:** 2026-03-26 | **Track:** Non-record study + +N-gram caching in Parameter Golf claims sub-0.5 BPB. The scores come from an invalid probability distribution that sums to ~410, not 1. This study presents the proof, experimental evidence, and proposed fixes. + +Full analysis: [abay.tech/posts/eval-time-model-growth](https://abay.tech/posts/eval-time-model-growth) + +PR discussion: [#886](https://github.com/openai/parameter-golf/pull/886) + +## Credits + +- [@Eppie](https://github.com/openai/parameter-golf/issues/677#issuecomment-4139902162) for identifying the probability validity issue +- Mirco (Discord) for the `P(cache_bin)` formulation +- N-gram cache concept: [PR #727](https://github.com/openai/parameter-golf/pull/727), [PR #779](https://github.com/openai/parameter-golf/pull/779), [PR #788](https://github.com/openai/parameter-golf/pull/788) +- Base model: [PR #728](https://github.com/openai/parameter-golf/pull/728) +- Code: `experiments/eval_time_mixing/` diff --git a/records/track_non_record_16mb/2026-03-26_EvalTime_NGram_ModelGrowth_Study/submission.json b/records/track_non_record_16mb/2026-03-26_EvalTime_NGram_ModelGrowth_Study/submission.json new file mode 100644 index 000000000..eebcd8d50 --- /dev/null +++ b/records/track_non_record_16mb/2026-03-26_EvalTime_NGram_ModelGrowth_Study/submission.json @@ -0,0 +1,31 @@ +{ + "author": "abaybektursun", + "github_id": "abaybektursun", + "name": "The N-gram BPB Scores Are Not Real", + "blurb": "N-gram caching claims sub-0.5 BPB but the scores come from an invalid probability distribution (sums to ~410, not 1). The hash ratio P(cache_bin) is not a conditional probability. Bucket sweep confirms: collision-free tables give baseline-level BPB. Proposes distribution verification and causality enforcement.", + "date": "2026-03-26", + "track": "non_record_study", + "val_bpb_neural_only": 1.1109, + "val_bpb_best_ngram": 0.3779, + "val_bpb_ngram_only_no_neural": 1.0615, + "artifact_bytes": 15866156, + "eval_time_state_bytes_best": 268435456, + "effective_model_bytes_best": 284301612, + "base_model_pr": 728, + "base_model_record": "records/track_10min_16mb/2026-03-25_ValCalib_GPTQ_XSA_BigramHash3072", + "hardware": "1xH100 80GB SXM (single-GPU experiments)", + "experiments_run": [ + "baseline", + "ngram_only_7", + "fixed_7gram", + "backoff_7", + "backoff_7_ent", + "backoff_9_ent_oadapt" + ], + "scripts": [ + "experiments/eval_time_mixing/scripts/eval_ngram.py", + "experiments/eval_time_mixing/scripts/eval_ngram_distributed.py", + "experiments/eval_time_mixing/scripts/analyze_ngram_matches.py" + ], + "technique_summary": "Eval-time n-gram hash tables with multi-order backoff, linear probability mixing, strict score-first causality" +}