DeepAuto-AI · gmlwns2000 · Oct 12, 2025
diff --git a/python/sglang/srt/layers/attention/hip_attention.py b/python/sglang/srt/layers/attention/hip_attention.py
@@ -290,6 +290,7 @@ def forward_extend(
                 # For multi-head latent attention
                 q_rope=q_rope,
                 k_rope=k_rope,
+                sinks=sinks,
             )
         else:
             if not self.is_kv_cache_offload_enabled:

diff --git a/python/sglang/srt/models/glm4_moe.py b/python/sglang/srt/models/glm4_moe.py
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 
-"""Inference-only GLM-4.5 model compatible with HuggingFace weights"""
+"""Inference-only GLM-4.5 and GLM-4.6 models compatible with HuggingFace weights"""
 
 import logging
 from typing import Any, Dict, Iterable, Optional, Tuple
@@ -812,9 +812,9 @@ def determine_num_fused_shared_experts(
             or self.config.architectures[0] != architecture
             or self.config.n_shared_experts != 1
         ):
-            disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
+            disable_reason = "Only GLM-4.5 or GLM-4.6 on NV-platform with capability >= 80 can use shared experts fusion optimization."
         elif get_moe_expert_parallel_world_size() > 1:
-            disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
+            disable_reason = "Deepseek and GLM-4.5 or GLM-4.6 can not use shared experts fusion optimization under expert parallelism."
 
         if disable_reason is not None:
             global_server_args_dict["disable_shared_experts_fusion"] = True
@@ -1107,4 +1107,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=Fal
                         weight_loader(param, loaded_weight)
 
 
-EntryClass = [Glm4MoeForCausalLM]
+EntryClass = [Glm4MoeForCausalLM]