From e70253f76862a410af8e038ce8d8981ca53973c4 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Wed, 4 Mar 2026 17:25:17 -0800
Subject: [PATCH 1/3] [bridge] Fix off-by-one in sliding window size for
 Gemma2, Gemma3, and GPT-OSS

HuggingFace sliding_window is inclusive (tokens within the window are
attended to), while Megatron/FlashAttention window_size is exclusive.
Subtract 1 to align the semantics.

Also make GPT-OSS read sliding_window from the HF config instead of
hardcoding 128.

Made-with: Cursor
---
 src/megatron/bridge/models/gemma/gemma2_bridge.py    | 2 +-
 src/megatron/bridge/models/gemma/gemma3_provider.py  | 2 +-
 src/megatron/bridge/models/gpt_oss/gpt_oss_bridge.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/megatron/bridge/models/gemma/gemma2_bridge.py b/src/megatron/bridge/models/gemma/gemma2_bridge.py
index 72c29c8165..f013c06b2e 100644
--- a/src/megatron/bridge/models/gemma/gemma2_bridge.py
+++ b/src/megatron/bridge/models/gemma/gemma2_bridge.py
@@ -51,7 +51,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> Gemma2ModelProvi
         provider.query_pre_attn_scalar = hf_config.query_pre_attn_scalar
         provider.attn_logit_softcapping = hf_config.attn_logit_softcapping
         provider.final_logit_softcapping = hf_config.final_logit_softcapping
-        provider.window_size = (hf_config.sliding_window, 0)
+        provider.window_size = (hf_config.sliding_window - 1, 0)
 
         provider.normalization = "RMSNorm"
         provider.activation_func = fast_gelu
diff --git a/src/megatron/bridge/models/gemma/gemma3_provider.py b/src/megatron/bridge/models/gemma/gemma3_provider.py
index 1f305ce7a5..afb9ae63b8 100644
--- a/src/megatron/bridge/models/gemma/gemma3_provider.py
+++ b/src/megatron/bridge/models/gemma/gemma3_provider.py
@@ -302,7 +302,7 @@ def __init__(
         config = copy.deepcopy(config)
         if _is_local_attn_layer(layer_number, config.interleaved_attn_pattern):
             # local attention, (q, k)
-            config.window_size = (config.window_size, 0)
+            config.window_size = (config.window_size - 1, 0)
         else:
             # global attention
             config.window_size = None
diff --git a/src/megatron/bridge/models/gpt_oss/gpt_oss_bridge.py b/src/megatron/bridge/models/gpt_oss/gpt_oss_bridge.py
index a08d8a3e30..dbdba5cb80 100644
--- a/src/megatron/bridge/models/gpt_oss/gpt_oss_bridge.py
+++ b/src/megatron/bridge/models/gpt_oss/gpt_oss_bridge.py
@@ -93,7 +93,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> GPTModelProvider
         provider.glu_linear_offset = 1.0
         provider.softmax_type = "learnable"
-        provider.window_size = (128, 0)
+        provider.window_size = (hf_pretrained.config.sliding_window - 1, 0)
         provider.window_attn_skip_freq = 2
 
         # GPT-OSS uses intermediate_size for MoE FFN hidden size
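To make the off-by-one concrete: under the semantics described in the commit
message above, an HF sliding_window of N means a token attends to itself plus
the N - 1 preceding tokens, while a Megatron/FlashAttention window_size of
(left, 0) means left preceding tokens in addition to the current one. Below is
a minimal sketch of the conversion assuming exactly those semantics; the helper
name hf_window_to_megatron is illustrative only and is not part of this patch
series.

    from typing import Optional, Tuple

    def hf_window_to_megatron(sliding_window: Optional[int]) -> Optional[Tuple[int, int]]:
        """Map an HF-style inclusive sliding_window to an exclusive (left, right) pair."""
        if sliding_window is None:
            return None  # no sliding window -> global attention
        # HF: sliding_window counts all attended tokens, including the current one.
        # Megatron/FlashAttention: window_size[0] counts only the preceding tokens.
        return (sliding_window - 1, 0)

    # Gemma2's sliding_window of 4096 maps to 4095 preceding tokens plus the
    # current token, i.e. the same 4096 attended positions.
    assert hf_window_to_megatron(4096) == (4095, 0)
    assert hf_window_to_megatron(None) is None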
From 816d423a7b2b959f1d0428246277119bb5d78875 Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Thu, 5 Mar 2026 10:58:21 -0800
Subject: [PATCH 2/3] fix test

Signed-off-by: Chen Cui
---
 tests/unit_tests/models/gemma/test_gemma2_bridge.py     | 6 +++---
 tests/unit_tests/models/gpt_oss/test_gpt_oss_bridges.py | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/tests/unit_tests/models/gemma/test_gemma2_bridge.py b/tests/unit_tests/models/gemma/test_gemma2_bridge.py
index f9b4c8527a..6da3f8b446 100644
--- a/tests/unit_tests/models/gemma/test_gemma2_bridge.py
+++ b/tests/unit_tests/models/gemma/test_gemma2_bridge.py
@@ -320,7 +320,7 @@ def test_provider_bridge_gemma2_specific_features(self, mock_pretrained_gemma2_2
         assert result.query_pre_attn_scalar == gemma2_2b_config.query_pre_attn_scalar
         assert result.attn_logit_softcapping == gemma2_2b_config.attn_logit_softcapping
         assert result.final_logit_softcapping == gemma2_2b_config.final_logit_softcapping
-        assert result.window_size == (gemma2_2b_config.sliding_window, 0)
+        assert result.window_size == (gemma2_2b_config.sliding_window - 1, 0)
         assert result.add_bias_linear == False  # Gemma2 doesn't use bias in linear layers
         assert result.layernorm_zero_centered_gamma == True  # Gemma2-specific RMSNorm behavior
 
@@ -406,8 +406,8 @@ def test_provider_bridge_sliding_window_config(self, mock_pretrained_gemma2_2b,
         result = bridge.provider_bridge(mock_pretrained_gemma2_2b)
 
         # Check sliding window configuration specific to Gemma2
-        assert result.window_size == (gemma2_2b_config.sliding_window, 0)
-        assert result.window_size == (4096, 0)
+        assert result.window_size == (gemma2_2b_config.sliding_window - 1, 0)
+        assert result.window_size == (4095, 0)
 
     def test_provider_bridge_query_pre_attn_scalar_variants(self, mock_pretrained_gemma2_27b, gemma2_27b_config):
         """Test query_pre_attn_scalar for 27B model which has different value."""
diff --git a/tests/unit_tests/models/gpt_oss/test_gpt_oss_bridges.py b/tests/unit_tests/models/gpt_oss/test_gpt_oss_bridges.py
index 5d336cc485..ff7c35a0af 100644
--- a/tests/unit_tests/models/gpt_oss/test_gpt_oss_bridges.py
+++ b/tests/unit_tests/models/gpt_oss/test_gpt_oss_bridges.py
@@ -39,6 +39,7 @@ def gpt_oss_cfg(self):
             "torch_dtype": "bfloat16",
             "vocab_size": 201088,
             "hidden_act": "silu",
+            "sliding_window": 4096,
         }
 
     @pytest.fixture

From 48b6bfae3e965882584bd96832004dd69e4c8ade Mon Sep 17 00:00:00 2001
From: Chen Cui
Date: Thu, 5 Mar 2026 14:07:08 -0800
Subject: [PATCH 3/3] fix

Signed-off-by: Chen Cui
---
 src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py | 2 +-
 src/megatron/bridge/models/mistral/mistral_bridge.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py b/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py
index 0574ce7b57..dd7f180395 100644
--- a/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py
+++ b/src/megatron/bridge/models/gpt_oss/gpt_oss_provider.py
@@ -74,7 +74,7 @@ class GPTOSSProvider(GPTModelProvider):
     moe_ffn_hidden_size: int = 2880
     moe_router_load_balancing_type: str = "none"
     seq_length: int = 131072
-    window_size: Optional[Tuple[int, int]] = (128, 0)
+    window_size: Optional[Tuple[int, int]] = (127, 0)
     softmax_type: Literal["vanilla", "off-by-one", "learnable"] = "learnable"
     activation_func: Callable = quick_gelu
     glu_linear_offset: float = 1.0
diff --git a/src/megatron/bridge/models/mistral/mistral_bridge.py b/src/megatron/bridge/models/mistral/mistral_bridge.py
index 1d766b8b52..6c6201488e 100644
--- a/src/megatron/bridge/models/mistral/mistral_bridge.py
+++ b/src/megatron/bridge/models/mistral/mistral_bridge.py
@@ -54,7 +54,7 @@ def provider_bridge(self, hf_pretrained: PreTrainedCausalLM) -> MistralModelProv
         window_size, cp_comm_type = (None, None)
         if getattr(hf_config, "sliding_window", None) is not None:
-            window_size = [hf_config.sliding_window, 0]
+            window_size = [hf_config.sliding_window - 1, 0]
             cp_comm_type = "a2a"
 
         provider = cls(