Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/sglang/srt/layers/attention/hip_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ def forward_extend(
# For multi-head latent attention
q_rope=q_rope,
k_rope=k_rope,
sinks=sinks,
)
else:
if not self.is_kv_cache_offload_enabled:
Expand Down
8 changes: 4 additions & 4 deletions python/sglang/srt/models/glm4_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# limitations under the License.
# ==============================================================================

"""Inference-only GLM-4.5 model compatible with HuggingFace weights"""
"""Inference-only GLM-4.5, GLM-4.6 model compatible with HuggingFace weights"""

import logging
from typing import Any, Dict, Iterable, Optional, Tuple
Expand Down Expand Up @@ -812,9 +812,9 @@ def determine_num_fused_shared_experts(
or self.config.architectures[0] != architecture
or self.config.n_shared_experts != 1
):
disable_reason = "Only GLM-4.5 on NV-platform with capability >= 80 can use shared experts fusion optimization."
disable_reason = "Only GLM-4.5 or GLM-4.6 on NV-platform with capability >= 80 can use shared experts fusion optimization."
elif get_moe_expert_parallel_world_size() > 1:
disable_reason = "Deepseek and GLM-4.5 can not use shared experts fusion optimization under expert parallelism."
disable_reason = "Deepseek and GLM-4.5 or GLM-4.6 can not use shared experts fusion optimization under expert parallelism."

if disable_reason is not None:
global_server_args_dict["disable_shared_experts_fusion"] = True
Expand Down Expand Up @@ -1107,4 +1107,4 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]], is_nextn=Fal
weight_loader(param, loaded_weight)


EntryClass = [Glm4MoeForCausalLM]
EntryClass = [Glm4MoeForCausalLM]
Loading