From be77e070e76a69d3670d5906fa68d46150727b7f Mon Sep 17 00:00:00 2001 From: ShihChunHao Date: Thu, 26 Mar 2026 08:59:43 +0800 Subject: [PATCH] Add WSD cosine decay schedule submission Replace the linear warmdown LR schedule with Warmup-Stable-Decay (WSD): 5% warmup, 75% stable at peak LR, 20% cosine decay. Built on SOTA base (10L, MLP3x, SmearGate, BigramHash, int5/int6, SWA, zstd-22). --- .../README.md | 46 + .../submission.json | 9 + .../train_gpt.py | 1246 +++++++++++++++++ .../train_seed42.log | 668 +++++++++ 4 files changed, 1969 insertions(+) create mode 100644 records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/README.md create mode 100644 records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/submission.json create mode 100644 records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_gpt.py create mode 100644 records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_seed42.log diff --git a/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/README.md b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/README.md new file mode 100644 index 000000000..f1ac19909 --- /dev/null +++ b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/README.md @@ -0,0 +1,46 @@ +# WSD Cosine Decay Schedule + +**val_bpb: TBD (8xH100)** — Preliminary 1-GPU result: 1.2824 BPB + +## Key Change + +Replace the default linear warmdown LR schedule with a **Warmup-Stable-Decay (WSD)** cosine schedule: + +| Phase | Fraction | LR behavior | +|-------|----------|-------------| +| Warmup | 0-5% of steps | Linear 0 → peak | +| Stable | 5-80% of steps | Constant at peak LR | +| Decay | 80-100% of steps | Cosine decay → 0 | + +The original schedule computes the warmdown from `warmdown_iters` and the remaining wallclock time, which can cause the LR to start decaying very early in training (especially on runs with few steps). WSD ensures the model trains at peak LR for the majority of the run. + +## Base Techniques (inherited from SOTA) + +- 10 layers, 512-dim, MLP 3x expansion +- SmearGate + BigramHash(10240) +- Mixed int5 (MLP) / int6 (attention) quantization +- SWA (start_frac=0.4, every=50 steps) +- Orthogonal init + Muon optimizer (WD=0.04) +- zstd-22 compression +- Sliding window eval (stride=64) + +## Preliminary Results (1 GPU, seed=42) + +| Config | val_bpb | artifact_bytes | +|--------|---------|---------------| +| 1 GPU, 600s, ~877 steps | 1.2824 | 15,767,236 | + +8xH100 3-seed results pending. + +## Run Command + +```bash +# Single GPU +python train_gpt.py + +# 8xH100 (competition setting) +torchrun --standalone --nproc_per_node=8 train_gpt.py + +# With specific seed +SEED=42 torchrun --standalone --nproc_per_node=8 train_gpt.py +``` diff --git a/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/submission.json b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/submission.json new file mode 100644 index 000000000..518155dbb --- /dev/null +++ b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/submission.json @@ -0,0 +1,9 @@ +{ + "name": "WSD Cosine Decay Schedule + 10L Int5-MLP BigramHash SmearGate SWA", + "val_loss": 1.28242, + "bytes_total": 15767236, + "blurb": "Replace linear warmdown with Warmup-Stable-Decay (WSD) cosine schedule: 5% warmup, 75% stable at peak LR, 20% cosine decay. Built on SOTA base (10L, MLP3x, SmearGate, BigramHash 10240, SWA 0.4, int5/int6 mixed quant, zstd-22). 
Preliminary 1-GPU result; 8xH100 3-seed results pending.", + "author": "ShihChunHao", + "github_id": "ShihChunHao", + "date": "2026-03-25" +} diff --git a/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_gpt.py b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_gpt.py new file mode 100644 index 000000000..406cd5edc --- /dev/null +++ b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_gpt.py @@ -0,0 +1,1246 @@ +""" +The `train_gpt.py` and `train_gpt_mlx.py` scripts are intended as good launching-off points for new participants, not SOTA configs. We'll accept PRs that tune, improve, or simplify these scripts without significantly increasing complexity, but competitive submissions should stay in the `/records` folder. + +Hard stop: To keep readable for newcomers, let's make sure `train_gpt.py` and `train_gpt_mlx.py` never are longer than 1500 lines. +""" + +from __future__ import annotations + +import copy +import glob +import io +import math +import os +import random +import subprocess +import sys +import time +import uuid +import zlib +from pathlib import Path + +try: + import zstandard + _COMPRESSOR = "zstd" +except ImportError: + _COMPRESSOR = "zlib" + +import numpy as np +import sentencepiece as spm +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch import Tensor, nn +from torch.nn.parallel import DistributedDataParallel as DDP + +# ----------------------------- +# HYPERPARAMETERS +# ----------------------------- + +class Hyperparameters: + data_path = os.environ.get("DATA_PATH", "./data/datasets/fineweb10B_sp1024") + train_files = os.path.join(data_path, "fineweb_train_*.bin") + val_files = os.path.join(data_path, "fineweb_val_*.bin") + tokenizer_path = os.environ.get("TOKENIZER_PATH", "./data/tokenizers/fineweb_1024_bpe.model") + run_id = os.environ.get("RUN_ID", str(uuid.uuid4())) + seed = int(os.environ.get("SEED", 42)) + + val_batch_size = int(os.environ.get("VAL_BATCH_SIZE", 524_288)) + val_loss_every = int(os.environ.get("VAL_LOSS_EVERY", 500)) + train_log_every = int(os.environ.get("TRAIN_LOG_EVERY", 100)) + + iterations = int(os.environ.get("ITERATIONS", 20000)) + warmdown_iters = int(os.environ.get("WARMDOWN_ITERS", 3000)) + warmup_steps = int(os.environ.get("WARMUP_STEPS", 20)) + train_batch_tokens = int(os.environ.get("TRAIN_BATCH_TOKENS", 786_432)) + train_seq_len = int(os.environ.get("TRAIN_SEQ_LEN", 2048)) + max_wallclock_seconds = float(os.environ.get("MAX_WALLCLOCK_SECONDS", 600.0)) + qk_gain_init = float(os.environ.get("QK_GAIN_INIT", 1.5)) + + vocab_size = int(os.environ.get("VOCAB_SIZE", 1024)) + num_layers = int(os.environ.get("NUM_LAYERS", 10)) + num_kv_heads = int(os.environ.get("NUM_KV_HEADS", 4)) + model_dim = int(os.environ.get("MODEL_DIM", 512)) + num_heads = int(os.environ.get("NUM_HEADS", 8)) + mlp_mult = float(os.environ.get("MLP_MULT", 3.0)) + tie_embeddings = bool(int(os.environ.get("TIE_EMBEDDINGS", "1"))) + rope_base = float(os.environ.get("ROPE_BASE", 10000.0)) + logit_softcap = float(os.environ.get("LOGIT_SOFTCAP", 30.0)) + + embed_lr = float(os.environ.get("EMBED_LR", 0.6)) + head_lr = float(os.environ.get("HEAD_LR", 0.008)) + tied_embed_lr = float(os.environ.get("TIED_EMBED_LR", 0.03)) + tied_embed_init_std = float(os.environ.get("TIED_EMBED_INIT_STD", 0.005)) + matrix_lr = float(os.environ.get("MATRIX_LR", 0.02)) + scalar_lr = float(os.environ.get("SCALAR_LR", 0.02)) + muon_momentum = float(os.environ.get("MUON_MOMENTUM", 0.99)) + muon_backend_steps = 
int(os.environ.get("MUON_BACKEND_STEPS", 5)) + muon_momentum_warmup_start = float(os.environ.get("MUON_MOMENTUM_WARMUP_START", 0.92)) + muon_momentum_warmup_steps = int(os.environ.get("MUON_MOMENTUM_WARMUP_STEPS", 1500)) + beta1 = float(os.environ.get("BETA1", 0.9)) + beta2 = float(os.environ.get("BETA2", 0.95)) + adam_eps = float(os.environ.get("ADAM_EPS", 1e-8)) + grad_clip_norm = float(os.environ.get("GRAD_CLIP_NORM", 0.3)) + weight_decay = float(os.environ.get("WEIGHT_DECAY", 0.04)) + + eval_stride = int(os.environ.get("EVAL_STRIDE", 64)) + eval_batch_seqs = int(os.environ.get("EVAL_BATCH_SEQS", 32)) + + bigram_vocab_size = int(os.environ.get("BIGRAM_VOCAB_SIZE", 10240)) + bigram_dim = int(os.environ.get("BIGRAM_DIM", 128)) + + swa_enabled = bool(int(os.environ.get("SWA_ENABLED", "1"))) + swa_start_frac = float(os.environ.get("SWA_START_FRAC", 0.4)) + swa_every = int(os.environ.get("SWA_EVERY", 50)) + +# ----------------------------- +# MUON OPTIMIZER +# ----------------------------- + +def zeropower_via_newtonschulz5(G: Tensor, steps: int = 10, eps: float = 1e-7) -> Tensor: + a, b, c = (3.4445, -4.7750, 2.0315) + X = G.bfloat16() + X /= X.norm() + eps + transposed = G.size(0) > G.size(1) + if transposed: + X = X.T + for _ in range(steps): + A = X @ X.T + B = b * A + c * A @ A + X = a * X + B @ X + return X.T if transposed else X + + +class Muon(torch.optim.Optimizer): + def __init__(self, params, lr: float, momentum: float, backend_steps: int, nesterov: bool = True, weight_decay: float = 0.0): + super().__init__( + params, + dict(lr=lr, momentum=momentum, backend_steps=backend_steps, nesterov=nesterov, weight_decay=weight_decay), + ) + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + distributed = dist.is_available() and dist.is_initialized() + world_size = dist.get_world_size() if distributed else 1 + rank = dist.get_rank() if distributed else 0 + + for group in self.param_groups: + params = group["params"] + if not params: + continue + lr = group["lr"] + momentum = group["momentum"] + backend_steps = group["backend_steps"] + nesterov = group["nesterov"] + + total_params = sum(int(p.numel()) for p in params) + updates_flat = torch.zeros(total_params, device=params[0].device, dtype=torch.bfloat16) + + curr = 0 + for i, p in enumerate(params): + if i % world_size == rank and p.grad is not None: + g = p.grad + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros_like(g) + buf = state["momentum_buffer"] + buf.mul_(momentum).add_(g) + if nesterov: + g = g.add(buf, alpha=momentum) + g = zeropower_via_newtonschulz5(g, steps=backend_steps) + g *= max(1, g.size(0) / g.size(1)) ** 0.5 + updates_flat[curr : curr + p.numel()] = g.reshape(-1) + curr += p.numel() + + if distributed: + dist.all_reduce(updates_flat, op=dist.ReduceOp.SUM) + + wd = group.get("weight_decay", 0.0) + curr = 0 + for p in params: + g = updates_flat[curr : curr + p.numel()].view_as(p).to(dtype=p.dtype) + if wd > 0: + p.data.mul_(1.0 - lr * wd) + p.add_(g, alpha=-lr) + curr += p.numel() + return loss + + +# ----------------------------- +# TOKENIZER-AGNOSTIC EVALUATION +# ----------------------------- + +def build_sentencepiece_luts( + sp: spm.SentencePieceProcessor, vocab_size: int, device: torch.device +) -> tuple[Tensor, Tensor, Tensor]: + sp_vocab_size = int(sp.vocab_size()) + table_size = max(sp_vocab_size, vocab_size) + base_bytes_np = np.zeros((table_size,), dtype=np.int16) + 
has_leading_space_np = np.zeros((table_size,), dtype=np.bool_) + is_boundary_token_np = np.ones((table_size,), dtype=np.bool_) + for token_id in range(sp_vocab_size): + if sp.is_control(token_id) or sp.is_unknown(token_id) or sp.is_unused(token_id): + continue + is_boundary_token_np[token_id] = False + if sp.is_byte(token_id): + base_bytes_np[token_id] = 1 + continue + piece = sp.id_to_piece(token_id) + if piece.startswith("\u2581"): + has_leading_space_np[token_id] = True + piece = piece[1:] + base_bytes_np[token_id] = len(piece.encode("utf-8")) + return ( + torch.tensor(base_bytes_np, dtype=torch.int16, device=device), + torch.tensor(has_leading_space_np, dtype=torch.bool, device=device), + torch.tensor(is_boundary_token_np, dtype=torch.bool, device=device), + ) + + +def load_validation_tokens(pattern: str, seq_len: int) -> Tensor: + files = [Path(p) for p in sorted(glob.glob(pattern))] + if not files: + raise FileNotFoundError(f"No files found for pattern: {pattern}") + tokens = torch.cat([load_data_shard(file) for file in files]).contiguous() + usable = ((tokens.numel() - 1) // seq_len) * seq_len + if usable <= 0: + raise ValueError(f"Validation split is too short for TRAIN_SEQ_LEN={seq_len}") + return tokens[: usable + 1] + + +def eval_val( + args: Hyperparameters, + model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + grad_accum_steps: int, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, +) -> tuple[float, float]: + local_batch_tokens = args.val_batch_size // (world_size * grad_accum_steps) + if local_batch_tokens < args.train_seq_len: + raise ValueError( + "VAL_BATCH_SIZE must provide at least one sequence per rank; " + f"got VAL_BATCH_SIZE={args.val_batch_size}, WORLD_SIZE={world_size}, " + f"GRAD_ACCUM_STEPS={grad_accum_steps}, TRAIN_SEQ_LEN={args.train_seq_len}" + ) + local_batch_seqs = local_batch_tokens // args.train_seq_len + total_seqs = (val_tokens.numel() - 1) // args.train_seq_len + seq_start = (total_seqs * rank) // world_size + seq_end = (total_seqs * (rank + 1)) // world_size + val_loss_sum = torch.zeros((), device=device, dtype=torch.float64) + val_token_count = torch.zeros((), device=device, dtype=torch.float64) + val_byte_count = torch.zeros((), device=device, dtype=torch.float64) + model.eval() + with torch.inference_mode(): + for batch_seq_start in range(seq_start, seq_end, local_batch_seqs): + batch_seq_end = min(batch_seq_start + local_batch_seqs, seq_end) + raw_start = batch_seq_start * args.train_seq_len + raw_end = batch_seq_end * args.train_seq_len + 1 + local = val_tokens[raw_start:raw_end].to(device=device, dtype=torch.int64, non_blocking=True) + x = local[:-1].reshape(-1, args.train_seq_len) + y = local[1:].reshape(-1, args.train_seq_len) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + batch_loss = model(x, y).detach() + batch_token_count = float(y.numel()) + val_loss_sum += batch_loss.to(torch.float64) * batch_token_count + val_token_count += batch_token_count + prev_ids = x.reshape(-1) + tgt_ids = y.reshape(-1) + token_bytes = base_bytes_lut[tgt_ids].to(dtype=torch.int16) + token_bytes += (has_leading_space_lut[tgt_ids] & ~is_boundary_token_lut[prev_ids]).to(dtype=torch.int16) + val_byte_count += token_bytes.to(torch.float64).sum() + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(val_loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(val_token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(val_byte_count, 
op=dist.ReduceOp.SUM) + val_loss = val_loss_sum / val_token_count + bits_per_token = val_loss.item() / math.log(2.0) + tokens_per_byte = val_token_count.item() / val_byte_count.item() + model.train() + return float(val_loss.item()), float(bits_per_token * tokens_per_byte) + + +# ----------------------------- +# POST-TRAINING QUANTIZATION (INT8 legacy + INT6 mixed) +# ----------------------------- + +CONTROL_TENSOR_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "CONTROL_TENSOR_NAME_PATTERNS", + "attn_scale,attn_scales,mlp_scale,mlp_scales,resid_mix,resid_mixes,q_gain,skip_weight,skip_weights,smear,bigram.scale", + ).split(",") + if pattern +) +FP16_KEEP_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get("FP16_KEEP_NAME_PATTERNS", "tok_emb,blocks.8.attn.c_k").split(",") + if pattern +) +INT8_KEEP_FLOAT_FP32_NAME_PATTERNS = tuple( + pattern + for pattern in os.environ.get( + "INT8_KEEP_FLOAT_FP32_NAME_PATTERNS", + ",".join(CONTROL_TENSOR_NAME_PATTERNS), + ).split(",") + if pattern +) +INT8_KEEP_FLOAT_MAX_NUMEL = 65_536 +INT8_KEEP_FLOAT_STORE_DTYPE = torch.float16 +INT8_PER_ROW_SCALE_DTYPE = torch.float16 +INT8_CLIP_PERCENTILE = 99.99984 +INT8_CLIP_Q = INT8_CLIP_PERCENTILE / 100.0 + +def tensor_nbytes(t: Tensor) -> int: + return int(t.numel()) * int(t.element_size()) + +def quantize_float_tensor(t: Tensor) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + clip_abs = ( + torch.quantile(t32.abs(), INT8_CLIP_Q, dim=1) + if t32.numel() + else torch.empty((t32.shape[0],), dtype=torch.float32) + ) + clipped = torch.maximum(torch.minimum(t32, clip_abs[:, None]), -clip_abs[:, None]) + scale = (clip_abs / 127.0).clamp_min(1.0 / 127.0) + q = torch.clamp(torch.round(clipped / scale[:, None]), -127, 127).to(torch.int8).contiguous() + return q, scale.to(dtype=INT8_PER_ROW_SCALE_DTYPE).contiguous() + clip_abs = float(torch.quantile(t32.abs().flatten(), INT8_CLIP_Q).item()) if t32.numel() else 0.0 + scale = torch.tensor(clip_abs / 127.0 if clip_abs > 0 else 1.0, dtype=torch.float32) + q = torch.clamp(torch.round(torch.clamp(t32, -clip_abs, clip_abs) / scale), -127, 127).to(torch.int8).contiguous() + return q, scale + + +def _classify_param(name: str) -> str: + if "tok_emb" in name or "lm_head" in name: + return "embed" + if ".mlp." in name: + return "mlp" + if "bigram" in name: + return "bigram" + if ".attn." in name or (".proj." in name and ".mlp." 
not in name): + return "attn" + return "other" + +def quantize_intN_per_row(t: Tensor, clip_range: int = 31) -> tuple[Tensor, Tensor]: + t32 = t.float() + if t32.ndim == 2: + row_max = t32.abs().amax(dim=1) + scale = (row_max / clip_range).clamp_min(1e-12).to(torch.float16) + scale = scale.clamp_min(torch.finfo(torch.float16).tiny) + q = torch.clamp(torch.round(t32 / scale.float()[:, None]), -(clip_range+1), clip_range).to(torch.int8) + return q, scale + amax = t32.abs().max().item() + scale = torch.tensor(max(amax / clip_range, 1e-12), dtype=torch.float16) + q = torch.clamp(torch.round(t32 / scale.float()), -(clip_range+1), clip_range).to(torch.int8) + return q, scale + +def mixed_quantize_int6(state_dict: dict[str, Tensor], int6_cats: set[str]): + result: dict[str, Tensor] = {} + meta: dict[str, object] = {} + for name, tensor in state_dict.items(): + t = tensor.detach().cpu().contiguous() + cat = _classify_param(name) + if not t.is_floating_point() or t.numel() <= 8192: + result[name] = t.to(torch.float16) if t.is_floating_point() else t + meta[name] = "passthrough" + continue + if any(p in name for p in CONTROL_TENSOR_NAME_PATTERNS): + result[name] = t.float() + meta[name] = "passthrough_ctrl" + continue + if any(pattern in name for pattern in FP16_KEEP_NAME_PATTERNS): + result[name] = t.to(dtype=torch.float16).contiguous() + meta[name] = "passthrough_fp16" + continue + if cat in int6_cats and t.ndim >= 1: + clip = 15 if cat == "mlp" else 31 # int5 for MLP, int6 for attention + q, s = quantize_intN_per_row(t, clip_range=clip) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": f"int{5 if cat == 'mlp' else 6}"} + else: + q, s = quantize_float_tensor(t) + result[name + ".q"] = q + result[name + ".scale"] = s + meta[name] = {"type": "int8"} + return result, meta + +def dequantize_mixed_int6(result: dict[str, Tensor], meta: dict[str, object], + template_sd: dict[str, Tensor]) -> dict[str, Tensor]: + out: dict[str, Tensor] = {} + for name, orig in template_sd.items(): + info = meta[name] + orig_dtype = orig.dtype + if info in ("passthrough", "passthrough_ctrl", "passthrough_fp16"): + t = result[name] + if t.dtype == torch.float16 and orig_dtype in (torch.float32, torch.bfloat16): + t = t.to(orig_dtype) + out[name] = t + continue + q, s = result[name + ".q"], result[name + ".scale"] + if s.ndim > 0: + out[name] = (q.float() * s.float().view(q.shape[0], *([1] * (q.ndim - 1)))).to(orig_dtype) + else: + out[name] = (q.float() * float(s.item())).to(orig_dtype) + return out + + +# ----------------------------- +# DATA LOADING +# ----------------------------- + +def load_data_shard(file: Path) -> Tensor: + header_bytes = 256 * np.dtype(" None: + self.file_idx = (self.file_idx + 1) % len(self.files) + self.tokens = load_data_shard(self.files[self.file_idx]) + self.pos = 0 + + def take(self, n: int) -> Tensor: + chunks: list[Tensor] = [] + remaining = n + while remaining > 0: + avail = self.tokens.numel() - self.pos + if avail <= 0: + self._advance_file() + continue + k = min(remaining, avail) + chunks.append(self.tokens[self.pos : self.pos + k]) + self.pos += k + remaining -= k + return chunks[0] if len(chunks) == 1 else torch.cat(chunks) + + +class DistributedTokenLoader: + def __init__(self, pattern: str, rank: int, world_size: int, device: torch.device): + self.rank = rank + self.world_size = world_size + self.device = device + self.stream = TokenStream(pattern) + + def next_batch(self, global_tokens: int, seq_len: int, grad_accum_steps: int) -> tuple[Tensor, Tensor]: 
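+ # Every rank holds an identical TokenStream and slices out its own contiguous span, so ranks stay in lockstep without any communication. + # Each call consumes global_tokens // (world_size * grad_accum_steps) tokens per rank, plus one extra token so targets y are simply inputs x shifted by one position. + # E.g. with the defaults (786,432 batch tokens, 8 ranks, grad_accum_steps=1), each rank gets 98,304 tokens, i.e. 48 sequences of length 2,048 per micro-step.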
+ local_tokens = global_tokens // (self.world_size * grad_accum_steps) + per_rank_span = local_tokens + 1 + chunk = self.stream.take(per_rank_span * self.world_size) + start = self.rank * per_rank_span + local = chunk[start : start + per_rank_span].to(dtype=torch.int64) + x = local[:-1].reshape(-1, seq_len) + y = local[1:].reshape(-1, seq_len) + return x.to(self.device, non_blocking=True), y.to(self.device, non_blocking=True) + + +# ----------------------------- +# TRANSFORMER MODULES +# ----------------------------- + +class RMSNorm(nn.Module): + def __init__(self, eps: float | None = None): + super().__init__() + self.eps = eps + + def forward(self, x: Tensor) -> Tensor: + return F.rms_norm(x, (x.size(-1),), eps=self.eps) + + +class CastedLinear(nn.Linear): + def forward(self, x: Tensor) -> Tensor: + w = self.weight.to(x.dtype) + bias = self.bias.to(x.dtype) if self.bias is not None else None + return F.linear(x, w, bias) + + +def restore_low_dim_params_to_fp32(module: nn.Module) -> None: + with torch.no_grad(): + for name, param in module.named_parameters(): + if (param.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS)) and param.dtype != torch.float32: + param.data = param.data.float() + + +class Rotary(nn.Module): + def __init__(self, dim: int, base: float = 10000.0): + super().__init__() + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._seq_len_cached = 0 + self._cos_cached: Tensor | None = None + self._sin_cached: Tensor | None = None + + def forward(self, seq_len: int, device: torch.device, dtype: torch.dtype) -> tuple[Tensor, Tensor]: + if ( + self._cos_cached is None + or self._sin_cached is None + or self._seq_len_cached != seq_len + or self._cos_cached.device != device + ): + t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype) + freqs = torch.outer(t, self.inv_freq.to(device)) + self._cos_cached = freqs.cos()[None, None, :, :] + self._sin_cached = freqs.sin()[None, None, :, :] + self._seq_len_cached = seq_len + return self._cos_cached.to(dtype=dtype), self._sin_cached.to(dtype=dtype) + + +def apply_rotary_emb(x: Tensor, cos: Tensor, sin: Tensor) -> Tensor: + half = x.size(-1) // 2 + x1, x2 = x[..., :half], x[..., half:] + return torch.cat((x1 * cos + x2 * sin, x1 * (-sin) + x2 * cos), dim=-1) + + +class CausalSelfAttention(nn.Module): + def __init__(self, dim: int, num_heads: int, num_kv_heads: int, rope_base: float, qk_gain_init: float): + super().__init__() + if dim % num_heads != 0: + raise ValueError("model_dim must be divisible by num_heads") + if num_heads % num_kv_heads != 0: + raise ValueError("num_heads must be divisible by num_kv_heads") + self.num_heads = num_heads + self.num_kv_heads = num_kv_heads + self.head_dim = dim // num_heads + if self.head_dim % 2 != 0: + raise ValueError("head_dim must be even for RoPE") + kv_dim = self.num_kv_heads * self.head_dim + self.c_q = CastedLinear(dim, dim, bias=False) + self.c_k = CastedLinear(dim, kv_dim, bias=False) + self.c_v = CastedLinear(dim, kv_dim, bias=False) + self.proj = CastedLinear(dim, dim, bias=False) + self.proj._zero_init = True + self.q_gain = nn.Parameter(torch.full((num_heads,), qk_gain_init, dtype=torch.float32)) + self.rotary = Rotary(self.head_dim, base=rope_base) + + def forward(self, x: Tensor) -> Tensor: + bsz, seqlen, dim = x.shape + q = self.c_q(x).reshape(bsz, seqlen, self.num_heads, self.head_dim).transpose(1, 2) + k = self.c_k(x).reshape(bsz, seqlen, 
self.num_kv_heads, self.head_dim).transpose(1, 2) + v = self.c_v(x).reshape(bsz, seqlen, self.num_kv_heads, self.head_dim).transpose(1, 2) + q = F.rms_norm(q, (q.size(-1),)) + k = F.rms_norm(k, (k.size(-1),)) + cos, sin = self.rotary(seqlen, x.device, q.dtype) + q = apply_rotary_emb(q, cos, sin) + k = apply_rotary_emb(k, cos, sin) + q = q * self.q_gain.to(dtype=q.dtype)[None, :, None, None] + y = F.scaled_dot_product_attention( + q, k, v, attn_mask=None, is_causal=True, + enable_gqa=(self.num_kv_heads != self.num_heads), + ) + y = y.transpose(1, 2).contiguous().reshape(bsz, seqlen, dim) + return self.proj(y) + + +class MLP(nn.Module): + def __init__(self, dim: int, mlp_mult: float): + super().__init__() + hidden = int(mlp_mult * dim) + self.fc = CastedLinear(dim, hidden, bias=False) + self.proj = CastedLinear(hidden, dim, bias=False) + self.proj._zero_init = True + + def forward(self, x: Tensor) -> Tensor: + x = torch.relu(self.fc(x)) + return self.proj(x.square()) + + +class SmearGate(nn.Module): + """Blend each token's embedding with the previous token's embedding.""" + def __init__(self, dim: int): + super().__init__() + self.gate = nn.Parameter(torch.zeros(dim, dtype=torch.float32)) + + def forward(self, x: Tensor) -> Tensor: + g = torch.sigmoid(self.gate.to(dtype=x.dtype))[None, None, :] + x_prev = torch.cat([torch.zeros_like(x[:, :1]), x[:, :-1]], dim=1) + return (1 - g) * x + g * x_prev + + +class BigramHashEmbedding(nn.Module): + """Hash consecutive token pairs into a learned embedding table.""" + def __init__(self, bigram_vocab_size: int, bigram_dim: int, model_dim: int): + super().__init__() + self.bigram_vocab_size = bigram_vocab_size + self.embed = nn.Embedding(bigram_vocab_size, bigram_dim) + nn.init.zeros_(self.embed.weight) + self.proj = CastedLinear(bigram_dim, model_dim, bias=False) if bigram_dim != model_dim else None + if self.proj is not None: + nn.init.zeros_(self.proj.weight) + self.scale = nn.Parameter(torch.tensor(0.05, dtype=torch.float32)) + + def bigram_hash(self, tokens: Tensor) -> Tensor: + t = tokens.to(torch.int32) + mod = self.bigram_vocab_size - 1 + out = torch.empty_like(t) + out[..., 0] = mod + out[..., 1:] = torch.bitwise_xor(36313 * t[..., 1:], 27191 * t[..., :-1]) % mod + return out.long() + + def forward(self, token_ids: Tensor) -> Tensor: + h = self.embed(self.bigram_hash(token_ids)) + if self.proj is not None: + h = self.proj(h) + return h * self.scale.to(dtype=h.dtype) + + +class Block(nn.Module): + def __init__(self, dim: int, num_heads: int, num_kv_heads: int, mlp_mult: float, rope_base: float, qk_gain_init: float): + super().__init__() + self.attn_norm = RMSNorm() + self.mlp_norm = RMSNorm() + self.attn = CausalSelfAttention(dim, num_heads, num_kv_heads, rope_base, qk_gain_init) + self.mlp = MLP(dim, mlp_mult) + self.attn_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.mlp_scale = nn.Parameter(torch.ones(dim, dtype=torch.float32)) + self.resid_mix = nn.Parameter(torch.stack((torch.ones(dim), torch.zeros(dim))).float()) + + def forward(self, x: Tensor, x0: Tensor) -> Tensor: + mix = self.resid_mix.to(dtype=x.dtype) + x = mix[0][None, None, :] * x + mix[1][None, None, :] * x0 + attn_out = self.attn(self.attn_norm(x)) + x = x + self.attn_scale.to(dtype=x.dtype)[None, None, :] * attn_out + x = x + self.mlp_scale.to(dtype=x.dtype)[None, None, :] * self.mlp(self.mlp_norm(x)) + return x + + +class GPT(nn.Module): + def __init__( + self, + vocab_size: int, + num_layers: int, + model_dim: int, + num_heads: int, + num_kv_heads: int, + 
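# mlp_mult: hidden expansion factor for the per-block MLP (3.0 gives a 1,536-wide hidden layer at model_dim=512) +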
mlp_mult: float, + tie_embeddings: bool, + tied_embed_init_std: float, + logit_softcap: float, + rope_base: float, + qk_gain_init: float, + bigram_vocab_size: int = 0, + bigram_dim: int = 128, + ): + super().__init__() + if logit_softcap <= 0.0: + raise ValueError(f"logit_softcap must be positive, got {logit_softcap}") + self.tie_embeddings = tie_embeddings + self.tied_embed_init_std = tied_embed_init_std + self.logit_softcap = logit_softcap + self.tok_emb = nn.Embedding(vocab_size, model_dim) + self.bigram = BigramHashEmbedding(bigram_vocab_size, bigram_dim, model_dim) if bigram_vocab_size > 0 else None + self.num_encoder_layers = num_layers // 2 + self.num_decoder_layers = num_layers - self.num_encoder_layers + self.num_skip_weights = min(self.num_encoder_layers, self.num_decoder_layers) + self.skip_weights = nn.Parameter(torch.ones(self.num_skip_weights, model_dim, dtype=torch.float32)) + self.smear = SmearGate(model_dim) + self.blocks = nn.ModuleList( + [ + Block(model_dim, num_heads, num_kv_heads, mlp_mult, rope_base, qk_gain_init) + for _ in range(num_layers) + ] + ) + self.final_norm = RMSNorm() + self.lm_head = None if tie_embeddings else CastedLinear(model_dim, vocab_size, bias=False) + if self.lm_head is not None: + self.lm_head._zero_init = True + self._init_weights() + + def _init_weights(self) -> None: + if self.tie_embeddings: + nn.init.normal_(self.tok_emb.weight, mean=0.0, std=self.tied_embed_init_std) + num_layers = len(self.blocks) + for name, module in self.named_modules(): + if isinstance(module, nn.Linear): + if getattr(module, "_zero_init", False): + nn.init.zeros_(module.weight) + elif module.weight.ndim == 2 and module.weight.shape[0] >= 64 and module.weight.shape[1] >= 64: + nn.init.orthogonal_(module.weight, gain=1.0) + if ".proj." 
in name or name.endswith(".proj"): + with torch.no_grad(): + module.weight.mul_(1.0 / math.sqrt(2 * num_layers)) + + def forward(self, input_ids: Tensor, target_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + for i in range(self.num_encoder_layers): + x = self.blocks[i](x, x0) + skips.append(x) + for i in range(self.num_decoder_layers): + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + x = self.blocks[self.num_encoder_layers + i](x, x0) + x = self.final_norm(x).reshape(-1, x.size(-1)) + targets = target_ids.reshape(-1) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + if self.lm_head is None: + raise RuntimeError("lm_head is required when tie_embeddings=False") + logits_proj = self.lm_head(x) + logits = self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + return F.cross_entropy(logits.float(), targets, reduction="mean") + + def forward_logits(self, input_ids: Tensor) -> Tensor: + x = self.tok_emb(input_ids) + if self.bigram is not None: + x = x + self.bigram(input_ids) + x = F.rms_norm(x, (x.size(-1),)) + x = self.smear(x) + x0 = x + skips: list[Tensor] = [] + for i in range(self.num_encoder_layers): + x = self.blocks[i](x, x0) + skips.append(x) + for i in range(self.num_decoder_layers): + if skips: + x = x + self.skip_weights[i].to(dtype=x.dtype)[None, None, :] * skips.pop() + x = self.blocks[self.num_encoder_layers + i](x, x0) + x = self.final_norm(x) + if self.tie_embeddings: + logits_proj = F.linear(x, self.tok_emb.weight) + else: + logits_proj = self.lm_head(x) + return self.logit_softcap * torch.tanh(logits_proj / self.logit_softcap) + + +def eval_val_sliding( + args: Hyperparameters, + base_model: nn.Module, + rank: int, + world_size: int, + device: torch.device, + val_tokens: Tensor, + base_bytes_lut: Tensor, + has_leading_space_lut: Tensor, + is_boundary_token_lut: Tensor, + stride: int, + batch_seqs: int = 32, +) -> tuple[float, float]: + seq_len = args.train_seq_len + total_tokens = val_tokens.numel() - 1 + window_starts = [ws for ws in range(0, total_tokens, stride) + if min(ws + seq_len, total_tokens) - ws >= stride or ws == 0] + total_windows = len(window_starts) + my_s = (total_windows * rank) // world_size + my_e = (total_windows * (rank + 1)) // world_size + my_windows = window_starts[my_s:my_e] + + loss_sum = torch.zeros((), device=device, dtype=torch.float64) + token_count = torch.zeros((), device=device, dtype=torch.float64) + byte_count = torch.zeros((), device=device, dtype=torch.float64) + + base_model.eval() + with torch.inference_mode(): + for bi in range(0, len(my_windows), batch_seqs): + batch_ws = my_windows[bi:bi + batch_seqs] + bsz = len(batch_ws) + x_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + y_batch = torch.zeros(bsz, seq_len, dtype=torch.int64, device=device) + wlens: list[int] = [] + for i, ws in enumerate(batch_ws): + end = min(ws + seq_len, total_tokens) + wlen = end - ws + wlens.append(wlen) + chunk = val_tokens[ws:end + 1].to(dtype=torch.int64, device=device) + x_batch[i, :wlen] = chunk[:-1] + y_batch[i, :wlen] = chunk[1:] + with torch.autocast(device_type="cuda", dtype=torch.bfloat16): + logits = base_model.forward_logits(x_batch) + nll = F.cross_entropy( + logits.reshape(-1, logits.size(-1)).float(), + y_batch.reshape(-1), + reduction="none", + ).reshape(bsz, seq_len) + for 
i, ws in enumerate(batch_ws): + wlen = wlens[i] + s = 0 if ws == 0 else max(wlen - stride, 0) + scored_nll = nll[i, s:wlen].to(torch.float64) + loss_sum += scored_nll.sum() + token_count += float(wlen - s) + tgt = y_batch[i, s:wlen] + prev = x_batch[i, s:wlen] + tb = base_bytes_lut[tgt].to(torch.float64) + tb += (has_leading_space_lut[tgt] & ~is_boundary_token_lut[prev]).to(torch.float64) + byte_count += tb.sum() + if rank == 0 and (bi // batch_seqs) % 50 == 0: + done = min(bi + batch_seqs, len(my_windows)) + pct = done / len(my_windows) * 100 + running_bpb = 0.0 + if token_count.item() > 0: + rl = (loss_sum / token_count).item() + running_bpb = rl / math.log(2.0) * (token_count.item() / byte_count.item()) + print(f" sliding_eval [{pct:5.1f}%] {done}/{len(my_windows)} windows running_bpb={running_bpb:.6f}", flush=True) + + if dist.is_available() and dist.is_initialized(): + dist.all_reduce(loss_sum, op=dist.ReduceOp.SUM) + dist.all_reduce(token_count, op=dist.ReduceOp.SUM) + dist.all_reduce(byte_count, op=dist.ReduceOp.SUM) + + val_loss = (loss_sum / token_count).item() + bits_per_token = val_loss / math.log(2.0) + tokens_per_byte = token_count.item() / byte_count.item() + base_model.train() + return val_loss, bits_per_token * tokens_per_byte + + +# ----------------------------- +# TRAINING +# ----------------------------- + +def main() -> None: + global zeropower_via_newtonschulz5 + + code = Path(__file__).read_text(encoding="utf-8") + args = Hyperparameters() + if os.environ.get("TORCH_COMPILE", "1") != "0": + zeropower_via_newtonschulz5 = torch.compile(zeropower_via_newtonschulz5) + + distributed = "RANK" in os.environ and "WORLD_SIZE" in os.environ + rank = int(os.environ.get("RANK", "0")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) + if world_size <= 0: + raise ValueError(f"WORLD_SIZE must be positive, got {world_size}") + if 8 % world_size != 0: + raise ValueError(f"WORLD_SIZE={world_size} must divide 8 so grad_accum_steps stays integral") + grad_accum_steps = 8 // world_size + grad_scale = 1.0 / grad_accum_steps + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required") + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + if distributed: + dist.init_process_group(backend="nccl", device_id=device) + dist.barrier() + master_process = rank == 0 + + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + from torch.backends.cuda import enable_cudnn_sdp, enable_flash_sdp, enable_math_sdp, enable_mem_efficient_sdp + enable_cudnn_sdp(False) + enable_flash_sdp(True) + enable_mem_efficient_sdp(False) + enable_math_sdp(False) + + logfile = None + if master_process: + os.makedirs("logs", exist_ok=True) + logfile = f"logs/{args.run_id}.txt" + print(logfile) + + def log0(msg: str, console: bool = True) -> None: + if not master_process: + return + if console: + print(msg) + if logfile is not None: + with open(logfile, "a", encoding="utf-8") as f: + print(msg, file=f) + + log0(code, console=False) + log0("=" * 100, console=False) + log0(f"Running Python {sys.version}", console=False) + log0(f"Running PyTorch {torch.__version__}", console=False) + log0( + subprocess.run(["nvidia-smi"], stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, check=False).stdout, + console=False, + ) + log0("=" * 100, console=False) + + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + torch.cuda.manual_seed_all(args.seed) + + if not 
args.tokenizer_path.endswith(".model"): + raise ValueError(f"Script only setup for SentencePiece .model file: {args.tokenizer_path}") + sp = spm.SentencePieceProcessor(model_file=args.tokenizer_path) + if int(sp.vocab_size()) != args.vocab_size: + raise ValueError( + f"VOCAB_SIZE={args.vocab_size} does not match tokenizer vocab_size={int(sp.vocab_size())}" + ) + dataset_dir = Path(args.data_path).resolve() + actual_train_files = len(list(dataset_dir.glob("fineweb_train_*.bin"))) + val_tokens = load_validation_tokens(args.val_files, args.train_seq_len) + base_bytes_lut, has_leading_space_lut, is_boundary_token_lut = build_sentencepiece_luts( + sp, args.vocab_size, device + ) + log0(f"val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path={args.tokenizer_path}") + log0(f"train_loader:dataset:{dataset_dir.name} train_shards:{actual_train_files}") + log0(f"val_loader:shards pattern={args.val_files} tokens:{val_tokens.numel() - 1}") + + # MODEL + OPTIMIZER SETUP + base_model = GPT( + vocab_size=args.vocab_size, + num_layers=args.num_layers, + model_dim=args.model_dim, + num_heads=args.num_heads, + num_kv_heads=args.num_kv_heads, + mlp_mult=args.mlp_mult, + tie_embeddings=args.tie_embeddings, + tied_embed_init_std=args.tied_embed_init_std, + logit_softcap=args.logit_softcap, + rope_base=args.rope_base, + qk_gain_init=args.qk_gain_init, + bigram_vocab_size=args.bigram_vocab_size, + bigram_dim=args.bigram_dim, + ).to(device).bfloat16() + for module in base_model.modules(): + if isinstance(module, CastedLinear): + module.float() + restore_low_dim_params_to_fp32(base_model) + if os.environ.get("TORCH_COMPILE", "1") != "0": + compiled_model = torch.compile(base_model, dynamic=False, fullgraph=True) + else: + compiled_model = base_model + model: nn.Module = DDP(compiled_model, device_ids=[local_rank], broadcast_buffers=False) if distributed else compiled_model + + block_named_params = list(base_model.blocks.named_parameters()) + matrix_params = [ + p for name, p in block_named_params + if p.ndim == 2 and not any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + scalar_params = [ + p for name, p in block_named_params + if p.ndim < 2 or any(pattern in name for pattern in CONTROL_TENSOR_NAME_PATTERNS) + ] + if base_model.skip_weights.numel() > 0: + scalar_params.append(base_model.skip_weights) + scalar_params.append(base_model.smear.gate) + if base_model.bigram is not None: + scalar_params.append(base_model.bigram.scale) + + token_lr = args.tied_embed_lr if args.tie_embeddings else args.embed_lr + tok_params = [{"params": [base_model.tok_emb.weight], "lr": token_lr, "base_lr": token_lr}] + if base_model.bigram is not None: + tok_params.append({"params": [base_model.bigram.embed.weight], "lr": token_lr, "base_lr": token_lr}) + if base_model.bigram.proj is not None: + matrix_params.append(base_model.bigram.proj.weight) + + optimizer_tok = torch.optim.AdamW( + tok_params, + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.weight_decay, + fused=True, + ) + optimizer_muon = Muon( + matrix_params, + lr=args.matrix_lr, + momentum=args.muon_momentum, + backend_steps=args.muon_backend_steps, + weight_decay=0.04, + ) + for group in optimizer_muon.param_groups: + group["base_lr"] = args.matrix_lr + optimizer_scalar = torch.optim.AdamW( + [{"params": scalar_params, "lr": args.scalar_lr, "base_lr": args.scalar_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + weight_decay=args.weight_decay, + fused=True, + ) + optimizers: list[torch.optim.Optimizer] = 
[optimizer_tok, optimizer_muon, optimizer_scalar] + if base_model.lm_head is not None: + optimizer_head = torch.optim.Adam( + [{"params": [base_model.lm_head.weight], "lr": args.head_lr, "base_lr": args.head_lr}], + betas=(args.beta1, args.beta2), + eps=args.adam_eps, + fused=True, + ) + optimizers.insert(1, optimizer_head) + + n_params = sum(p.numel() for p in base_model.parameters()) + log0(f"model_params:{n_params}") + log0(f"world_size:{world_size} grad_accum_steps:{grad_accum_steps}") + log0(f"attention_mode:gqa num_heads:{args.num_heads} num_kv_heads:{args.num_kv_heads}") + log0( + f"tie_embeddings:{args.tie_embeddings} embed_lr:{token_lr} " + f"matrix_lr:{args.matrix_lr} scalar_lr:{args.scalar_lr}" + ) + log0( + f"train_batch_tokens:{args.train_batch_tokens} train_seq_len:{args.train_seq_len} " + f"iterations:{args.iterations} warmup_steps:{args.warmup_steps} " + f"max_wallclock_seconds:{args.max_wallclock_seconds:.3f}" + ) + log0(f"seed:{args.seed}") + + # DATA LOADER & MODEL WARMUP + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + + def zero_grad_all() -> None: + for opt in optimizers: + opt.zero_grad(set_to_none=True) + + max_wallclock_ms = 1000.0 * args.max_wallclock_seconds if args.max_wallclock_seconds > 0 else None + + def lr_mul(step: int, elapsed_ms: float) -> float: + """WSD schedule: warmup (5%) → stable at peak LR (75%) → cosine decay (20%)""" + if args.warmdown_iters <= 0: + return 1.0 + if max_wallclock_ms is None: + total = args.iterations + else: + step_ms = elapsed_ms / max(step, 1) + total = int(max_wallclock_ms / max(step_ms, 1e-9)) + + warmup_end = int(total * 0.05) + decay_start = int(total * 0.80) + + if step < warmup_end: + return max(step / max(warmup_end, 1), 0.01) # Linear warmup + elif step < decay_start: + return 1.0 # Stable phase at peak LR + else: + # Cosine decay from 1.0 to 0.0 + progress = (step - decay_start) / max(total - decay_start, 1) + return max(0.5 * (1.0 + math.cos(math.pi * progress)), 0.0) + + if args.warmup_steps > 0: + initial_model_state = {name: tensor.detach().cpu().clone() for name, tensor in base_model.state_dict().items()} + initial_optimizer_states = [copy.deepcopy(opt.state_dict()) for opt in optimizers] + model.train() + for warmup_step in range(args.warmup_steps): + zero_grad_all() + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + warmup_loss = model(x, y) + (warmup_loss * grad_scale).backward() + for opt in optimizers: + opt.step() + zero_grad_all() + if args.warmup_steps <= 20 or (warmup_step + 1) % 10 == 0 or warmup_step + 1 == args.warmup_steps: + log0(f"warmup_step:{warmup_step + 1}/{args.warmup_steps}") + base_model.load_state_dict(initial_model_state, strict=True) + for opt, state in zip(optimizers, initial_optimizer_states, strict=True): + opt.load_state_dict(state) + zero_grad_all() + if distributed: + model.require_backward_grad_sync = True + train_loader = DistributedTokenLoader(args.train_files, rank, world_size, device) + + # MAIN TRAINING LOOP + training_time_ms = 0.0 + stop_after_step: int | None = None + swa_state: dict[str, Tensor] | None = None + swa_count = 0 + torch.cuda.synchronize() + t0 = time.perf_counter() + + step = 0 + while True: + last_step = step == args.iterations or (stop_after_step is not None and step 
>= stop_after_step) + + should_validate = last_step or (args.val_loss_every > 0 and step % args.val_loss_every == 0) + if should_validate: + torch.cuda.synchronize() + training_time_ms += 1000.0 * (time.perf_counter() - t0) + val_loss, val_bpb = eval_val( + args, model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + log0( + f"step:{step}/{args.iterations} val_loss:{val_loss:.4f} val_bpb:{val_bpb:.4f} " + f"train_time:{training_time_ms:.0f}ms step_avg:{training_time_ms / max(step, 1):.2f}ms" + ) + torch.cuda.synchronize() + t0 = time.perf_counter() + + if last_step: + if stop_after_step is not None and step < args.iterations: + log0( + f"stopping_early: wallclock_cap train_time:{training_time_ms:.0f}ms " + f"step:{step}/{args.iterations}" + ) + break + + elapsed_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + scale = lr_mul(step, elapsed_ms) + zero_grad_all() + train_loss = torch.zeros((), device=device) + for micro_step in range(grad_accum_steps): + if distributed: + model.require_backward_grad_sync = micro_step == grad_accum_steps - 1 + x, y = train_loader.next_batch(args.train_batch_tokens, args.train_seq_len, grad_accum_steps) + with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=True): + loss = model(x, y) + train_loss += loss.detach() + (loss * grad_scale).backward() + train_loss /= grad_accum_steps + + frac = min(step / args.muon_momentum_warmup_steps, 1.0) if args.muon_momentum_warmup_steps > 0 else 1.0 + muon_momentum = (1 - frac) * args.muon_momentum_warmup_start + frac * args.muon_momentum + for group in optimizer_muon.param_groups: + group["momentum"] = muon_momentum + + for opt in optimizers: + for group in opt.param_groups: + group["lr"] = group["base_lr"] * scale + + if args.grad_clip_norm > 0: + torch.nn.utils.clip_grad_norm_(base_model.parameters(), args.grad_clip_norm) + for opt in optimizers: + opt.step() + zero_grad_all() + + step += 1 + approx_training_time_ms = training_time_ms + 1000.0 * (time.perf_counter() - t0) + + # SWA: collect checkpoints during warmdown + if args.swa_enabled and scale < args.swa_start_frac and step % args.swa_every == 0: + if swa_state is None: + swa_state = {name: t.detach().cpu().clone() for name, t in base_model.state_dict().items()} + swa_count = 1 + log0(f"swa:start step:{step}") + else: + for name, t in base_model.state_dict().items(): + swa_state[name] += t.detach().cpu() + swa_count += 1 + + should_log_train = ( + args.train_log_every > 0 + and (step <= 10 or step % args.train_log_every == 0 or stop_after_step is not None) + ) + if should_log_train: + log0( + f"step:{step}/{args.iterations} train_loss:{train_loss.item():.4f} " + f"train_time:{approx_training_time_ms:.0f}ms step_avg:{approx_training_time_ms / step:.2f}ms" + ) + + reached_cap = max_wallclock_ms is not None and approx_training_time_ms >= max_wallclock_ms + if distributed and max_wallclock_ms is not None: + reached_cap_tensor = torch.tensor(int(reached_cap), device=device) + dist.all_reduce(reached_cap_tensor, op=dist.ReduceOp.MAX) + reached_cap = bool(reached_cap_tensor.item()) + if stop_after_step is None and reached_cap: + stop_after_step = step + + log0( + f"peak memory allocated: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB " + f"reserved: {torch.cuda.max_memory_reserved() // 1024 // 1024} MiB" + ) + + # Apply SWA if collected + if args.swa_enabled and swa_state is not None and swa_count > 1: + log0(f"swa:applying averaged {swa_count} checkpoints") 
+ current_state = base_model.state_dict() + avg_state = { + name: (tensor / swa_count).to(dtype=current_state[name].dtype) + for name, tensor in swa_state.items() + } + base_model.load_state_dict(avg_state, strict=True) + + # SERIALIZATION + ROUNDTRIP VALIDATION + if master_process: + torch.save(base_model.state_dict(), "final_model.pt") + model_bytes = os.path.getsize("final_model.pt") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model: {model_bytes} bytes") + log0(f"Code size: {code_bytes} bytes") + log0(f"Total submission size: {model_bytes + code_bytes} bytes") + + # Magnitude pruning: zero out smallest weights to improve compression + with torch.no_grad(): + for name, param in base_model.named_parameters(): + if param.ndim == 2 and param.numel() > 65536: + threshold = torch.quantile(param.abs().float().flatten(), 0.03) + mask = param.abs() < threshold + param.masked_fill_(mask, 0.0) + + # INT6 mixed quantization + zstd/zlib export + sd_cpu = {k: v.detach().cpu() for k, v in base_model.state_dict().items()} + quant_result, quant_meta = mixed_quantize_int6(sd_cpu, {"mlp", "attn", "bigram"}) + quant_buf = io.BytesIO() + torch.save({"w": quant_result, "m": quant_meta}, quant_buf) + quant_raw = quant_buf.getvalue() + if _COMPRESSOR == "zstd": + quant_blob = zstandard.ZstdCompressor(level=22).compress(quant_raw) + else: + quant_blob = zlib.compress(quant_raw, 9) + if master_process: + with open("final_model.int8.ptz", "wb") as f: + f.write(quant_blob) + quant_file_bytes = os.path.getsize("final_model.int8.ptz") + code_bytes = len(code.encode("utf-8")) + log0(f"Serialized model int6+{_COMPRESSOR}: {quant_file_bytes} bytes") + log0(f"Total submission size int8+zlib: {quant_file_bytes + code_bytes} bytes") + + if distributed: + dist.barrier() + with open("final_model.int8.ptz", "rb") as f: + quant_blob_disk = f.read() + if _COMPRESSOR == "zstd": + decompressed = zstandard.ZstdDecompressor().decompress(quant_blob_disk) + else: + decompressed = zlib.decompress(quant_blob_disk) + quant_state = torch.load(io.BytesIO(decompressed), map_location="cpu") + deq_state = dequantize_mixed_int6(quant_state["w"], quant_state["m"], sd_cpu) + base_model.load_state_dict(deq_state, strict=True) + + # Sliding window eval on int6-roundtripped weights + torch.cuda.synchronize() + t_qeval = time.perf_counter() + if args.eval_stride > 0 and args.eval_stride < args.train_seq_len: + log0(f"final_eval_mode:sliding_window stride:{args.eval_stride} batch_seqs:{args.eval_batch_seqs}") + q_val_loss, q_val_bpb = eval_val_sliding( + args, base_model, rank, world_size, device, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + stride=args.eval_stride, batch_seqs=args.eval_batch_seqs, + ) + else: + log0("final_eval_mode:standard") + q_val_loss, q_val_bpb = eval_val( + args, model, rank, world_size, device, grad_accum_steps, + val_tokens, base_bytes_lut, has_leading_space_lut, is_boundary_token_lut, + ) + torch.cuda.synchronize() + log0( + f"final_int8_zlib_roundtrip val_loss:{q_val_loss:.4f} val_bpb:{q_val_bpb:.4f} " + f"eval_time:{1000.0 * (time.perf_counter() - t_qeval):.0f}ms" + ) + log0(f"final_int8_zlib_roundtrip_exact val_loss:{q_val_loss:.8f} val_bpb:{q_val_bpb:.8f}") + + if distributed: + dist.destroy_process_group() + + +if __name__ == "__main__": + main() +# fixes applied +# tuned diff --git a/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_seed42.log b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_seed42.log new file mode 100644 index 
000000000..f17285d40 --- /dev/null +++ b/records/track_10min_16mb/2026-03-25_WSD_CosineDecay_Schedule/train_seed42.log @@ -0,0 +1,668 @@ +logs/idea22_wsd_schedule_seed42_20260325_215828.txt +val_bpb:enabled tokenizer_kind=sentencepiece tokenizer_path=/raid/R13K47007/parameter-golf/data/tokenizers/fineweb_1024_bpe.model +train_loader:dataset:fineweb10B_sp1024 train_shards:80 +val_loader:shards pattern=/raid/R13K47007/parameter-golf/data/datasets/fineweb10B_sp1024/fineweb_val_*.bin tokens:62021632 +model_params:25517137 +world_size:1 grad_accum_steps:8 +attention_mode:gqa num_heads:8 num_kv_heads:4 +tie_embeddings:True embed_lr:0.03 matrix_lr:0.02 scalar_lr:0.02 +train_batch_tokens:786432 train_seq_len:2048 iterations:20000 warmup_steps:20 max_wallclock_seconds:600.000 +seed:42 +warmup_step:1/20 +warmup_step:2/20 +warmup_step:3/20 +warmup_step:4/20 +warmup_step:5/20 +warmup_step:6/20 +warmup_step:7/20 +warmup_step:8/20 +warmup_step:9/20 +warmup_step:10/20 +warmup_step:11/20 +warmup_step:12/20 +warmup_step:13/20 +warmup_step:14/20 +warmup_step:15/20 +warmup_step:16/20 +warmup_step:17/20 +warmup_step:18/20 +warmup_step:19/20 +warmup_step:20/20 +step:0/20000 val_loss:6.9323 val_bpb:4.1057 train_time:0ms step_avg:0.01ms +step:1/20000 train_loss:6.9334 train_time:716ms step_avg:716.43ms +step:2/20000 train_loss:6.8990 train_time:1394ms step_avg:697.12ms +step:3/20000 train_loss:6.7915 train_time:2082ms step_avg:694.11ms +step:4/20000 train_loss:6.5339 train_time:2769ms step_avg:692.28ms +step:5/20000 train_loss:6.1718 train_time:3450ms step_avg:690.06ms +step:6/20000 train_loss:5.9314 train_time:4134ms step_avg:689.05ms +step:7/20000 train_loss:5.8188 train_time:4817ms step_avg:688.15ms +step:8/20000 train_loss:5.7808 train_time:5495ms step_avg:686.93ms +step:9/20000 train_loss:5.7468 train_time:6176ms step_avg:686.28ms +step:10/20000 train_loss:5.6736 train_time:6856ms step_avg:685.63ms +step:100/20000 train_loss:3.2519 train_time:68437ms step_avg:684.37ms +step:200/20000 train_loss:2.6652 train_time:137126ms step_avg:685.63ms +step:300/20000 train_loss:2.5143 train_time:205599ms step_avg:685.33ms +step:400/20000 train_loss:2.4703 train_time:274034ms step_avg:685.09ms +step:500/20000 train_loss:2.3505 train_time:342479ms step_avg:684.96ms +step:500/20000 val_loss:2.3709 val_bpb:1.4042 train_time:342480ms step_avg:684.96ms +step:600/20000 train_loss:2.3097 train_time:410830ms step_avg:684.72ms +step:700/20000 train_loss:2.3507 train_time:479171ms step_avg:684.53ms +step:800/20000 train_loss:2.2584 train_time:547534ms step_avg:684.42ms +swa:start step:850 +step:877/20000 val_loss:2.1912 val_bpb:1.2977 train_time:600166ms step_avg:684.34ms +stopping_early: wallclock_cap train_time:600166ms step:877/20000 +peak memory allocated: 20295 MiB reserved: 20478 MiB +Serialized model: 98437419 bytes +Code size: 53346 bytes +Total submission size: 98490765 bytes +Serialized model int6+zstd: 15767236 bytes +Total submission size int8+zlib: 15820582 bytes +final_eval_mode:sliding_window stride:64 batch_seqs:32 + sliding_eval [ 0.0%] 32/969088 windows running_bpb=1.346495 + sliding_eval [ 0.2%] 1632/969088 windows running_bpb=1.277198 + sliding_eval [ 0.3%] 3232/969088 windows running_bpb=1.273062 + sliding_eval [ 0.5%] 4832/969088 windows running_bpb=1.269931 + sliding_eval [ 0.7%] 6432/969088 windows running_bpb=1.283534 + sliding_eval [ 0.8%] 8032/969088 windows running_bpb=1.285878 + sliding_eval [ 1.0%] 9632/969088 windows running_bpb=1.288270 + sliding_eval [ 1.2%] 11232/969088 windows 
+ sliding_eval [ 99.9%] 968032/969088 windows running_bpb=1.282384
+final_int8_zlib_roundtrip val_loss:2.1653 val_bpb:1.2824 eval_time:1341430ms
+final_int8_zlib_roundtrip_exact val_loss:2.16530120 val_bpb:1.28241673
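
For anyone auditing the numbers in this log, two consistency checks fall out of the logged values themselves: the `val_bpb` figures track `val_loss` by a constant factor, consistent with bpb = (loss in nats / ln 2) x (tokens / bytes), and with stride 64 over the 62,021,632-token val set, 62021632 / 64 = 969088, matching the `x/969088 windows` progress lines. Below is a minimal sketch of both checks; the tokens-per-byte constant is back-solved from the logged pairs rather than read out of `train_gpt.py`, so treat it as an inferred assumption, not the script's actual code.

```python
import math

# Tokens-per-byte ratio of the val set, back-solved from the exact final pair
# (val_loss 2.16530120 nats -> val_bpb 1.28241673). Inferred constant, not a
# value taken from the training script.
TOKENS_PER_BYTE = 1.28241673 / 2.16530120 * math.log(2)  # ~0.4105

def nats_to_bpb(loss_nats: float) -> float:
    """Convert a nats-per-token loss to bits-per-byte."""
    return loss_nats / math.log(2) * TOKENS_PER_BYTE

# The other (val_loss, val_bpb) pairs in the log agree with the same constant:
assert abs(nats_to_bpb(6.9323) - 4.1057) < 1e-3   # step 0
assert abs(nats_to_bpb(2.3709) - 1.4042) < 1e-3   # step 500
assert abs(nats_to_bpb(2.1912) - 1.2977) < 1e-3   # step 877

# Sliding-window eval bookkeeping: window count and progress-print cadence.
VAL_TOKENS = 62_021_632
STRIDE = 64
BATCH_SEQS = 32
assert VAL_TOKENS // STRIDE == 969_088            # "x/969088 windows"
# Progress lines land at 32, 1632, 3232, ...: one print per 50 batches of 32.
assert 1632 - 32 == 50 * BATCH_SEQS
```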