diff --git a/python/sglang/srt/layers/attention/hip_attention.py b/python/sglang/srt/layers/attention/hip_attention.py
index 57b18ce2fba..665e9c6196c 100644
--- a/python/sglang/srt/layers/attention/hip_attention.py
+++ b/python/sglang/srt/layers/attention/hip_attention.py
@@ -290,6 +290,7 @@ def forward_extend(
                 # For multi-head latent attention
                 q_rope=q_rope,
                 k_rope=k_rope,
+                sinks=sinks,
             )
         else:
             if not self.is_kv_cache_offload_enabled:
diff --git a/python/sglang/srt/models/glm4_moe.py b/python/sglang/srt/models/glm4_moe.py
index 7738b490289..f00bbdfc9fd 100644
--- a/python/sglang/srt/models/glm4_moe.py
+++ b/python/sglang/srt/models/glm4_moe.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Inference-only GLM-4.5 model compatible with HuggingFace weights"""
+"""Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights"""
 
 import logging
 from typing import Any, Dict, Iterable, Optional, Tuple
@@ -812,9 +812,9 @@ def determine_num_fused_shared_experts(
             or self.config.architectures[0] != architecture
             or self.config.n_shared_experts != 1
         ):
-            disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
+            disable_reason = "Only GLM-4.5 or GLM-4.6 on NV-platform with capability >= 80 can use shared experts fusion optimization."
         elif get_moe_expert_parallel_world_size() > 1:
-            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
+            disable_reason = "Deepseek and GLM-4.5 or GLM-4.6 can not use shared experts fusion optimization under expert parallelism."
 
         if disable_reason is not None:
             global_server_args_dict["disable_shared_experts_fusion"] = True
@@ -1107,4 +1107,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=Fal
                 weight_loader(param, loaded_weight)
 
 
-EntryClass = [Glm4MoeForCausalLM]
+EntryClass = [Glm4MoeForCausalLM]
\ No newline at end of file