Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11641,6 +11641,230 @@ glm5-fp8-gb300-dynamo-sglang:
ep: 1
dp-attn: false

# GLM-5 FP8 on the gb300-nv runner: multinode, disaggregated dynamo-sglang
# benchmark entry. Every search-space item listed under scenarios sets
# spec-decoding: "mtp".
# NOTE(review): model is zai-org/GLM-5.1-FP8 while model-prefix is glm5 —
# confirm this pairing is intended.
glm5-fp8-gb300-dynamo-sglang-mtp:
image: lmsysorg/sglang:v0.5.11-cu130
model: zai-org/GLM-5.1-FP8
model-prefix: glm5
runner: gb300-nv
precision: fp8
framework: dynamo-sglang
multinode: true
disagg: true
scenarios:
fixed-seq-len:
# ---------- 8k1k high-throughput (wide-EP decode, EAGLE MTP) ----------
# isl 8192 / osl 1024 (presumably input/output sequence lengths in tokens —
# confirm). Each entry pairs TP4 prefill workers with a single decode worker
# whose tp and ep are equal; as the prefill worker count drops, decode width
# grows and the swept concurrency shrinks.
- isl: 8192
osl: 1024
search-space:
# 14 prefill workers -> one TP16/EP16 decode worker, concurrency 2800.
- spec-decoding: "mtp"
conc-list: [2800]
prefill:
num-worker: 14
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml"
decode:
num-worker: 1
tp: 16
ep: 16
dp-attn: true
# 12 prefill workers -> one TP24/EP24 decode worker, concurrency 1700.
- spec-decoding: "mtp"
conc-list: [1700]
prefill:
num-worker: 12
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml"
decode:
num-worker: 1
tp: 24
ep: 24
dp-attn: true
# 10 prefill workers -> one TP32/EP32 decode worker, concurrency 1300.
- spec-decoding: "mtp"
conc-list: [1300]
prefill:
num-worker: 10
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
# 8 prefill workers -> one TP40/EP40 decode worker, concurrency 900.
- spec-decoding: "mtp"
conc-list: [900]
prefill:
num-worker: 8
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml"
decode:
num-worker: 1
tp: 40
ep: 40
dp-attn: true
# ---------- 8k1k low-latency (per-node TP=4 decode workers, EAGLE MTP) ----------
# isl 8192 / osl 1024. Each entry uses one TP4 prefill worker (dp-attn on)
# and many TP4/EP1 decode workers with dp-attn off.
- isl: 8192
osl: 1024
search-space:
# 1 prefill worker -> 9 decode workers, concurrency 150.
- spec-decoding: "mtp"
conc-list: [150]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml"
decode:
num-worker: 9
tp: 4
ep: 1
dp-attn: false
# 1 prefill worker -> 17 decode workers, concurrencies 128, 64 and 32.
- spec-decoding: "mtp"
conc-list: [128, 64, 32]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml"
decode:
num-worker: 17
tp: 4
ep: 1
dp-attn: false
# Same 1-prefill / 17-decode topology as the previous entry, but a separate
# recipe file (lowlat_2) run at concurrency 24.
- spec-decoding: "mtp"
conc-list: [24]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml"
decode:
num-worker: 17
tp: 4
ep: 1
dp-attn: false
# ---------- 1k1k high-throughput (wide-EP decode, EAGLE MTP) ----------
# isl 1024 / osl 1024. Each entry pairs TP4 prefill workers with a single
# decode worker whose tp and ep are equal; prefill worker count falls from 12
# to 4 while decode width rises from 24 to 56.
- isl: 1024
osl: 1024
search-space:
# 12 prefill workers -> one TP24/EP24 decode worker, concurrency 8192.
- spec-decoding: "mtp"
conc-list: [8192]
prefill:
num-worker: 12
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml"
decode:
num-worker: 1
tp: 24
ep: 24
dp-attn: true
# 10 prefill workers -> one TP32/EP32 decode worker, concurrency 7500.
- spec-decoding: "mtp"
conc-list: [7500]
prefill:
num-worker: 10
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml"
decode:
num-worker: 1
tp: 32
ep: 32
dp-attn: true
# 8 prefill workers -> one TP40/EP40 decode worker, concurrency 7300.
- spec-decoding: "mtp"
conc-list: [7300]
prefill:
num-worker: 8
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml"
decode:
num-worker: 1
tp: 40
ep: 40
dp-attn: true
# 6 prefill workers -> one TP48/EP48 decode worker, concurrency 6500.
- spec-decoding: "mtp"
conc-list: [6500]
prefill:
num-worker: 6
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml"
decode:
num-worker: 1
tp: 48
ep: 48
dp-attn: true
# 4 prefill workers -> one TP56/EP56 decode worker, concurrency 5700.
- spec-decoding: "mtp"
conc-list: [5700]
prefill:
num-worker: 4
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml"
decode:
num-worker: 1
tp: 56
ep: 56
dp-attn: true
# ---------- 1k1k low-latency (per-node TP=4 decode workers, EAGLE MTP) ----------
# isl 1024 / osl 1024. Both entries use one TP4 prefill worker (dp-attn on)
# and 17 TP4/EP1 decode workers with dp-attn off; they differ in recipe file
# and swept concurrencies.
- isl: 1024
osl: 1024
search-space:
# Concurrencies 512, 256, 128 and 64 against recipe lowlat_0.
- spec-decoding: "mtp"
conc-list: [512, 256, 128, 64]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml"
decode:
num-worker: 17
tp: 4
ep: 1
dp-attn: false
# Concurrency 32 against recipe lowlat_1.
- spec-decoding: "mtp"
conc-list: [32]
prefill:
num-worker: 1
tp: 4
ep: 1
dp-attn: true
additional-settings:
- "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml"
decode:
num-worker: 17
tp: 4
ep: 1
dp-attn: false

# ============================================================================
# Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries).
# Recipes that ALREADY existed on main were intentionally left at main's version
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
# Recipe for the 1k1k high-throughput MTP configuration #0 of GLM-5 FP8 on
# GB300 (disaggregated prefill/decode behind a dynamo frontend).
name: gb300-fp8-glm5-mtp_1k1k_hightpt_0

model:
path: glm-5-fp8
container: "lmsysorg/sglang:v0.5.11-cu130"
precision: fp8

# 12 prefill nodes host 12 prefill workers (one per 4-GPU node); the single
# decode worker spans 6 nodes x 4 GPUs = 24 GPUs.
resources:
gpu_type: gb300
gpus_per_node: 4
prefill_nodes: 12
prefill_workers: 12
decode_nodes: 6
decode_workers: 1
# Dynamo frontend with 9 additional frontends enabled alongside the primary.
frontend:
type: dynamo
enable_multiple_frontends: true
num_additional_frontends: 9
dynamo:
version: 1.2.1

backend:
# Environment variables for the prefill workers. All values are quoted so
# they stay strings.
prefill_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
PYTHONUNBUFFERED: '1'
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
# Disaggregation heartbeat/bootstrap/waiting limits are set very high,
# presumably so slow multinode start-up is not treated as a failure — confirm.
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
MC_TE_METRIC: 'true'
MC_FORCE_MNNVL: '1'
NCCL_MNNVL_ENABLE: '1'
NCCL_CUMEM_ENABLE: '1'
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
DYN_REQUEST_PLANE: nats
# Opt in to SGLang's v2 speculative-decoding path, which this EAGLE/MTP
# recipe is meant to exercise. Nothing else on this launch path is known to
# inject it — TODO confirm against the launcher.
SGLANG_ENABLE_SPEC_V2: '1'
Comment on lines +22 to +37

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 All 14 new GB300 MTP recipe YAMLs omit SGLANG_ENABLE_SPEC_V2: '1' from both prefill_environment and decode_environment blocks, even though every other GLM-5 / SGLang MTP path in this repo (every existing dsr1/b200-fp4/{1k1k,8k1k}/disagg/mtp/*.yaml recipe, every single-node *_mtp.sh launcher including benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh:39, and benchmarks/multi_node/amd_utils/env.sh:156) sets it explicitly. runners/launch_gb300-nv.sh does not inject it either, so the recipe YAML is the only entry point — without it, EAGLE on lmsysorg/sglang:v0.5.11-cu130 will run via the legacy spec-decoding path (or silently no-op with the NSA + DeepEP + DPA decode topology), producing decode behavior inconsistent with every other validated MTP benchmark in the repo and invalidating the new measurements. Fix: add SGLANG_ENABLE_SPEC_V2: '1' to both env blocks in every new MTP recipe (matching the dsr1 MTP precedent).

Extended reasoning...

What the bug is

All 14 new MTP recipe YAMLs added by this PR (benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/{1k1k,8k1k}/disagg/mtp/*.yaml) are missing the SGLANG_ENABLE_SPEC_V2: '1' environment variable in both prefill_environment and decode_environment blocks. This env var is the documented enablement gate for EAGLE/MTP speculative decoding in SGLang across this repo.

Why this is a bug — overwhelming precedent

Every other MTP launch path in this repo sets this variable explicitly:

  1. Every existing multinode SGLang MTP recipe under benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/{1k1k,8k1k}/disagg/mtp/*.yaml sets SGLANG_ENABLE_SPEC_V2: '1' in both prefill and decode env blocks (e.g. dsr1/b200-fp4/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml:48 and :66 — 20 hits across 10 files).
  2. Every single-node GLM-5 MTP launcher: benchmarks/single_node/fixed_seq_len/glm5_fp8_b300_mtp.sh:39 runs export SGLANG_ENABLE_SPEC_V2=1 immediately before sglang.launch_server --speculative-algorithm EAGLE. Same in glm5_fp8_b200_mtp.sh, glm5_fp8_mi355x_mtp.sh, glm5_fp4_b300_mtp.sh, glm5_fp4_b200_mtp.sh.
  3. AMD multi-node env: benchmarks/multi_node/amd_utils/env.sh:156 exports it.
  4. perf-changelog.yaml describes every prior GLM-5 MTP entry (b300/b200/mi355x FP8 and FP4 variants) verbatim as "adds EAGLE speculative decoding ... behind SGLANG_ENABLE_SPEC_V2=1" — this is the maintainer-documented contract.

Why existing code doesn't catch it

runners/launch_gb300-nv.sh contains zero references to SPEC_V2, speculative, or MTP — it only runs srtctl apply on the recipe YAML. The recipe YAML's prefill_environment/decode_environment is the only place SGLang env vars reach the worker containers on this launch path. A missing entry is not silently filled in.

Root cause (confirmed by PR description)

The PR description states the new recipes are "byte-identical to the existing stp/ siblings except for the EAGLE speculative-decoding flags on the decode block." The STP siblings don't need this env var (no spec decoding), so the copy carried the STP environment forward and the new EAGLE-specific env var was never added. Spot-check: diff stp/1k1k_stp_hightpt_0.yaml mtp/1k1k_mtp_hightpt_0.yaml shows the new MTP file is byte-identical to STP except for the name change and the four speculative-* keys appended to the decode block.

Step-by-step proof of impact

  1. CI invokes launch_gb300-nv.sh for glm5-fp8-gb300-dynamo-sglang-mtp.
  2. The launcher copies recipes/sglang/glm5/gb300-fp8/.../mtp/*.yaml into srt-slurm and runs srtctl apply. It does not inject SGLANG_ENABLE_SPEC_V2.
  3. srtctl reads prefill_environment/decode_environment from the YAML and exports them into the worker containers. Neither block contains SGLANG_ENABLE_SPEC_V2.
  4. SGLang v0.5.11-cu130 starts with --speculative-algorithm EAGLE but without SGLANG_ENABLE_SPEC_V2=1 — it routes EAGLE through the legacy v1 spec-decoding code path (or silently disables spec for the NSA + DeepEP + DPA decode topology, since v2 is the implementation that supports this combination in v0.5.11).
  5. The benchmark completes and publishes throughput/latency numbers — but they are measuring a different decode code path than every other GLM-5 MTP entry in perf-changelog.yaml, and different from the GB300 single-node sibling glm5_fp8_b300_mtp.sh.

The whole point of -mtp is to measure EAGLE MTP performance; without SPEC_V2=1 the published numbers do not represent the intended config, defeating the purpose of the entry and breaking the apples-to-apples comparison with the existing MTP benchmarks.

Fix

Add SGLANG_ENABLE_SPEC_V2: '1' to both prefill_environment and decode_environment in every new MTP recipe (28 env blocks across 14 files), matching the dsr1 MTP recipes exactly.


# Environment variables for the decode worker. All values are quoted so they
# stay strings.
decode_environment:
TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
PYTHONUNBUFFERED: '1'
DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
# Disaggregation heartbeat/bootstrap/waiting limits are set very high,
# presumably so slow multinode start-up is not treated as a failure — confirm.
SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
MC_TE_METRIC: 'true'
MC_FORCE_MNNVL: '1'
NCCL_MNNVL_ENABLE: '1'
NCCL_CUMEM_ENABLE: '1'
SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
DYN_REQUEST_PLANE: nats
# Opt in to SGLang's v2 speculative-decoding path, which the EAGLE settings
# of this MTP recipe are meant to exercise. Nothing else on this launch path
# is known to inject it — TODO confirm against the launcher.
SGLANG_ENABLE_SPEC_V2: '1'
# DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size).
# Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024.
SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024'

# SGLang server arguments, given separately for the prefill and decode roles.
sglang_config:
# Prefill role: TP4 with DP attention across 4 data-parallel ranks, no
# expert parallelism.
# NOTE(review): no speculative-* keys are set for prefill in this recipe;
# confirm the prefill server does not need them for EAGLE/MTP in
# disaggregated mode.
prefill:
# Model configuration
served-model-name: GLM-5-FP8
trust-remote-code: true
quantization: fp8
kv-cache-dtype: fp8_e4m3

# Disaggregation mode
disaggregation-mode: prefill
disaggregation-transfer-backend: nixl

# Size limits
max-running-requests: 256
cuda-graph-max-bs: 256
mem-fraction-static: 0.7
context-length: 9600
chunked-prefill-size: 32768
max-prefill-tokens: 8192

# Parallelism
tensor-parallel-size: 4
data-parallel-size: 4
expert-parallel-size: 1
enable-dp-attention: true
enable-dp-lm-head: true
load-balance-method: total_tokens

# Backend
nsa-decode-backend: trtllm
nsa-prefill-backend: trtllm
moe-runner-backend: flashinfer_trtllm

# Other flags
enable-flashinfer-allreduce-fusion: true
disable-radix-cache: true
weight-loader-prefetch-checkpoints: true
model-loader-extra-config: '{"enable_multithread_load": true}'

# Decode role: one wide worker with tensor, expert and data parallelism all
# set to 24, DeepEP all-to-all in low-latency mode, and EAGLE speculative
# decoding enabled.
decode:
# Model configuration
served-model-name: GLM-5-FP8
trust-remote-code: true

quantization: fp8
kv-cache-dtype: fp8_e4m3

# Disaggregation mode
disaggregation-mode: decode
disaggregation-transfer-backend: nixl

# Memory and token limits
mem-fraction-static: 0.8
context-length: 9600

# Backend
nsa-decode-backend: trtllm
nsa-prefill-backend: trtllm
# moe-runner-backend: "cutedsl"

# Detokenizer
skip-tokenizer-init: true
stream-interval: 30

# Other flags
disable-radix-cache: true
weight-loader-prefetch-checkpoints: true
model-loader-extra-config: '{"enable_multithread_load": true}'
# Parallelism: 24-way TP/EP/DP with DP attention and DP LM head.
tensor-parallel-size: 24
expert-parallel-size: 24
data-parallel-size: 24
enable-dp-lm-head: true
enable-dp-attention: true
moe-dense-tp-size: 1
# Expert placement and all-to-all backend.
ep-num-redundant-experts: 32
ep-dispatch-algorithm: static
moe-a2a-backend: deepep
deepep-mode: low_latency
deepep-config: /configs/deepep_config.json
# Request and CUDA-graph batch limits.
max-running-requests: 8192
cuda-graph-max-bs: 512
# EAGLE speculative decoding (the MTP part of this recipe): 2 draft steps,
# top-k 1, 3 draft tokens.
speculative-algorithm: "EAGLE"
speculative-num-steps: 2
speculative-eagle-topk: 1
speculative-num-draft-tokens: 3
# Health-check polling: up to 360 attempts spaced 10 seconds apart
# (presumably about one hour in total before giving up — confirm).
health_check:
max_attempts: 360
interval_seconds: 10

# sa-bench client settings: unbounded request rate, 1024-token inputs and
# outputs, at concurrency 8192.
benchmark:
type: sa-bench
req_rate: inf
isl: 1024
osl: 1024
concurrencies: '8192'
# MTP benchmarks should send chat-formatted prompts: EAGLE-style draft heads
# are presumably trained on chat-templated input, so raw prompts would
# understate the acceptance rate — TODO confirm sa-bench honors this key.
use_chat_template: true
Comment on lines +147 to +152

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 All 14 new MTP recipe YAMLs under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/{1k1k,8k1k}/disagg/mtp/ omit use_chat_template: true in the benchmark: block, which AGENTS.md and .github/workflows/claude-pr-review.yml explicitly mandate for MTP benchmarks. Without it the benchmark measures EAGLE acceptance against raw prompts instead of chat-formatted inputs, silently regressing the reported acceptance rate and making these numbers not comparable to other MTP benchmarks in the repo. Fix: add use_chat_template: true under the benchmark: block in each of the 14 new files (matching every existing sglang multi-node MTP recipe under dsr1/b200-fp4/8k1k/disagg/mtp/ and deepseek-v4/8k1k/disagg-*-mtp.yaml).

Extended reasoning...

What the bug is

Every MTP YAML in this PR (14 files under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/{1k1k,8k1k}/disagg/mtp/) ends with a benchmark: block of the shape:

benchmark:
  type: sa-bench
  req_rate: inf
  isl: <isl>
  osl: 1024
  concurrencies: '<N>'

There is no use_chat_template: true field. The PR description states these files are 'byte-identical to the existing stp/ siblings except for the EAGLE speculative-decoding flags on the decode block' — and the STP siblings correctly omit chat-template (raw-prompt input is fine for non-spec-decoding STP). The copy carried that omission into MTP, where it is incorrect.

Why this is mandatory for MTP

AGENTS.md:56 says verbatim: 'MTP scripts MUST pass --use-chat-template to run_benchmark_serving — EAGLE-style spec decoding is trained against chat-formatted inputs; benchmarking against raw prompts silently regresses acceptance rate.' The repository's own automated review at .github/workflows/claude-pr-review.yml:280-296 enforces the same rule: 'MTP benchmarks MUST include the --use-chat-template flag in the benchmark client configuration.'

For multi-node recipes consumed by sa-bench, the YAML key use_chat_template: true under the benchmark: block is the equivalent of the shell --use-chat-template flag — sa-bench plumbs the field through benchmark_lib.sh into benchmark_serving.py, where it gates tokenizer.apply_chat_template formatting of prompts.

Why existing code doesn't prevent it

Nothing in the loader or runtime cross-checks use_chat_template against the presence of speculative-algorithm: EAGLE in the decode block. The omission is silent — the benchmark runs, produces numbers, and the only visible signal is a lower acceptance rate than the model is actually capable of.

Precedent

Every existing sglang multi-node MTP recipe in the repo sets this field:

  • benchmarks/multi_node/srt-slurm-recipes/sglang/dsr1/b200-fp4/8k1k/disagg/mtp/*.yaml — all 6 files
  • benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-*-mtp.yaml — all -mtp variants
  • All single-node GLM-5 MTP scripts (glm5_fp8_b300_mtp.sh, glm5_fp8_b200_mtp.sh, glm5_fp8_h200_mtp.sh, etc.) pass --use-chat-template

GLM-5 specifically requires chat-template formatting for EAGLE to perform as intended — the per-platform MTP scripts in this repo already encode that rule.

Step-by-step proof

Take 1k1k_mtp_hightpt_0.yaml in this PR (lines 147-152):

  1. The decode: block sets speculative-algorithm: "EAGLE", speculative-num-steps: 2, speculative-eagle-topk: 1, speculative-num-draft-tokens: 3 → MTP is on.
  2. The benchmark: block is type: sa-bench / req_rate: inf / isl: 1024 / osl: 1024 / concurrencies: '8192' — no use_chat_template key.
  3. When sa-bench launches, benchmark_serving.py reads use_chat_template (defaults to false) and skips tokenizer.apply_chat_template(...). Prompts are sent to the GLM-5 server in raw form.
  4. The EAGLE draft head was trained on chat-formatted token sequences; raw-prompt distribution shift drops draft-token acceptance.
  5. The reported acceptance rate is silently lower than the model's true capability — and not comparable to other MTP benchmarks in the repo (dsr1, deepseek-v4) which all measure against chat-formatted prompts.

Repeat verbatim for the other 13 files; same structure, same omission.

Fix

Add one line under the benchmark: block of each of the 14 new YAMLs:

benchmark:
  type: sa-bench
  req_rate: inf
  isl: <isl>
  osl: 1024
  concurrencies: '<N>'
  use_chat_template: true

Files to update:

  • 1k1k/disagg/mtp/1k1k_mtp_hightpt_{0,1,2,3,4}.yaml
  • 1k1k/disagg/mtp/1k1k_mtp_lowlat_{0,1}.yaml
  • 8k1k/disagg/mtp/8k1k_mtp_hightpt_{0,1,2,3}.yaml
  • 8k1k/disagg/mtp/8k1k_mtp_lowlat_{0,1,2}.yaml

Loading