From 85f0419ad2041b625cda92a1d1477bdad9611c06 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 23 Jun 2026 14:23:53 -0700 Subject: [PATCH 1/7] Add GLM-5-FP8 GB300 multinode dynamo-sglang MTP benchmark - nvidia-master.yaml: add glm5-fp8-gb300-dynamo-sglang-mtp (14 topologies across 1k/1k and 8k/1k; prefill TP4 + decode wide-EP DEP16/24/32/40/48/56 high-throughput and per-node TP4 low-latency, all with spec-decoding: mtp). - 14 split recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/ glm5/gb300-fp8/{8k1k,1k1k}/disagg/mtp/, mirroring the existing stp/ siblings with EAGLE speculative decoding (num-steps 2, eagle-topk 1, num-draft-tokens 3). - perf-changelog: entry for the new config. --- .github/configs/nvidia-master.yaml | 224 ++++++++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml | 152 ++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml | 152 ++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml | 152 ++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml | 152 ++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml | 152 ++++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml | 144 +++++++++++ .../1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml | 144 +++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml | 152 ++++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml | 152 ++++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml | 152 ++++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml | 152 ++++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml | 144 +++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml | 144 +++++++++++ .../8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml | 144 +++++++++++ perf-changelog.yaml | 8 + 16 files changed, 2320 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index d59ef5841..16117a7c7 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11396,6 +11396,230 @@ glm5-fp8-gb300-dynamo-sglang: ep: 1 dp-attn: false +glm5-fp8-gb300-dynamo-sglang-mtp: + image: lmsysorg/sglang:v0.5.11-cu130 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: gb300-nv + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + # ---------- 8k1k high-throughput (wide-EP decode, EAGLE MTP) ---------- + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [2800] + prefill: + num-worker: 14 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1700] + prefill: + num-worker: 12 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [1300] + prefill: + num-worker: 10 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [900] + prefill: + num-worker: 8 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml" + decode: + num-worker: 1 + tp: 40 + ep: 40 + dp-attn: true + # ---------- 8k1k low-latency (per-node TP=4 decode workers, EAGLE MTP) ---------- + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [150] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml" + decode: + num-worker: 9 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [128, 64, 32] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [24] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + # ---------- 1k1k high-throughput (wide-EP decode, EAGLE MTP) ---------- + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [8192] + prefill: + num-worker: 12 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [7500] + prefill: + num-worker: 10 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [7300] + prefill: + num-worker: 8 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml" + decode: + num-worker: 1 + tp: 40 + ep: 40 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [6500] + prefill: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [5700] + prefill: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml" + decode: + num-worker: 1 + tp: 56 + ep: 56 + dp-attn: true + # ---------- 1k1k low-latency (per-node TP=4 decode workers, EAGLE MTP) ---------- + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "mtp" + conc-list: [512, 256, 128, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [32] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + # ============================================================================ # Net-new agentic recipes from chore/agentx-v0.3 (no overlap with main entries). # Recipes that ALREADY existed on main were intentionally left at main's version diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml new file mode 100644 index 000000000..9fae87e3a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_1k1k_hightpt_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 12 + decode_nodes: 6 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 24 + expert-parallel-size: 24 + data-parallel-size: 24 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 8192 + cuda-graph-max-bs: 512 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '8192' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml new file mode 100644 index 000000000..e509bd610 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_1k1k_hightpt_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 10 + decode_nodes: 8 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 32 + expert-parallel-size: 32 + data-parallel-size: 32 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 8192 + cuda-graph-max-bs: 256 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '7500' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml new file mode 100644 index 000000000..ee0de0bb8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_1k1k_hightpt_2 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 8 + decode_nodes: 10 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 40 + expert-parallel-size: 40 + data-parallel-size: 40 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 24 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 7200 + cuda-graph-max-bs: 180 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '7300' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml new file mode 100644 index 000000000..2e465c496 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_1k1k_hightpt_3 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 6 + prefill_workers: 6 + decode_nodes: 12 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 48 + expert-parallel-size: 48 + data-parallel-size: 48 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 6144 + cuda-graph-max-bs: 128 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '6500' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml new file mode 100644 index 000000000..ba6496895 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_1k1k_hightpt_4 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 4 + decode_nodes: 14 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 56 + expert-parallel-size: 56 + data-parallel-size: 56 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 24 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 5600 + cuda-graph-max-bs: 100 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '5700' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml new file mode 100644 index 000000000..920898604 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml @@ -0,0 +1,144 @@ +name: gb300-fp8-glm5-mtp_1k1k_lowlat_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 32 + cuda-graph-max-bs: 32 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: 512x256x128x64 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml new file mode 100644 index 000000000..0ff6bd446 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml @@ -0,0 +1,144 @@ +name: gb300-fp8-glm5-mtp_1k1k_lowlat_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 1 + cuda-graph-max-bs: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '32' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml new file mode 100644 index 000000000..b75eb7664 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_8k1k_hightpt_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 14 + prefill_workers: 14 + decode_nodes: 4 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 16 + expert-parallel-size: 16 + data-parallel-size: 16 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 2800 + cuda-graph-max-bs: 175 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '2800' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml new file mode 100644 index 000000000..4de3d3056 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_8k1k_hightpt_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 12 + decode_nodes: 6 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 24 + expert-parallel-size: 24 + data-parallel-size: 24 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 1680 + cuda-graph-max-bs: 70 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '1700' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml new file mode 100644 index 000000000..aac1fa7ff --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_8k1k_hightpt_2 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 10 + decode_nodes: 8 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 32 + expert-parallel-size: 32 + data-parallel-size: 32 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 1280 + cuda-graph-max-bs: 40 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '1300' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml new file mode 100644 index 000000000..e76006e13 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml @@ -0,0 +1,152 @@ +name: gb300-fp8-glm5-mtp_8k1k_hightpt_3 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 8 + decode_nodes: 10 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 40 + expert-parallel-size: 40 + data-parallel-size: 40 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 24 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 880 + cuda-graph-max-bs: 22 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '900' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml new file mode 100644 index 000000000..eb4104158 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml @@ -0,0 +1,144 @@ +name: gb300-fp8-glm5-mtp_8k1k_lowlat_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 15 + cuda-graph-max-bs: 15 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '150' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml new file mode 100644 index 000000000..793b8386f --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml @@ -0,0 +1,144 @@ +name: gb300-fp8-glm5-mtp_8k1k_lowlat_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 8 + cuda-graph-max-bs: 8 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: 128x64x32 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml new file mode 100644 index 000000000..45e67c072 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml @@ -0,0 +1,144 @@ +name: gb300-fp8-glm5-mtp_8k1k_lowlat_2 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 1 + cuda-graph-max-bs: 1 + speculative-algorithm: "EAGLE" + speculative-num-steps: 2 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 3 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '24' diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 39184d7e2..9b7c0b851 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4115,3 +4115,11 @@ - "Use the dedicated ARM64 MiniMax-M3 performance image; benchmark settings unchanged" - "Allocate FlashInfer MNNVL workspace for one-shot TP8 all-reduce during CUDA graph capture" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1897 + +- config-keys: + - glm5-fp8-gb300-dynamo-sglang-mtp + description: + - "Add GLM-5-FP8 GB300 multinode dynamo-sglang benchmark with EAGLE MTP speculative decoding" + - "Image: lmsysorg/sglang:v0.5.11-cu130" + - "14 topologies across 1k/1k and 8k/1k (prefill TP4 + decode wide-EP DEP16/24/32/40/48/56 high-throughput and per-node TP4 low-latency); MTP flags: speculative-algorithm EAGLE, num-steps 2, eagle-topk 1, num-draft-tokens 3" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX From 7f7d7655e5591045863703ca5066f808484f8e4a Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 23 Jun 2026 14:25:05 -0700 Subject: [PATCH 2/7] Update perf-changelog pr-link for #1907 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9b7c0b851..458ce509d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4122,4 +4122,4 @@ - "Add GLM-5-FP8 GB300 multinode dynamo-sglang benchmark with EAGLE MTP speculative decoding" - "Image: lmsysorg/sglang:v0.5.11-cu130" - "14 topologies across 1k/1k and 8k/1k (prefill TP4 + decode wide-EP DEP16/24/32/40/48/56 high-throughput and per-node TP4 low-latency); MTP flags: speculative-algorithm EAGLE, num-steps 2, eagle-topk 1, num-draft-tokens 3" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1907 From 927159c48e950c26ac250d1e6d1e200247a90aae Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 25 Jun 2026 10:04:42 -0700 Subject: [PATCH 3/7] Raise SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK to 1024 for GLM-5 MTP decode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sweep on 7f7d7655 hit `Assertion error /build/DeepEP/csrc/deep_ep.cpp:1233 'x.size(0) <= num_max_dispatch_tokens_per_rank'` during CUDA-graph capture on the wide-EP decode configs (TP16/EP16, TP32/EP32, TP40/EP40). The old comment sized the buffer for ceil(cuda_graph_max_bs / dp_size) and ignored MTP's speculative_num_draft_tokens=3 multiplier — capture-time per-rank tokens (cuda_graph_max_bs * num_draft_tokens under DP-attention) overflowed the 512 buffer. Co-Authored-By: Claude Opus 4.7 --- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml index 9fae87e3a..2f935f9ca 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml index e509bd610..57b373378 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml index ee0de0bb8..f40d0abc4 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml index 2e465c496..3b892a717 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml index ba6496895..0b0852bd9 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml index 920898604..5a999216e 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml @@ -51,7 +51,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml index 0ff6bd446..e55e554d0 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml @@ -51,7 +51,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml index b75eb7664..165387f75 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml index 4de3d3056..a28eae172 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml index aac1fa7ff..79fd859dc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml index e76006e13..64c5e8696 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml @@ -53,7 +53,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml index eb4104158..7572dc68b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml @@ -51,7 +51,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml index 793b8386f..82dd34f03 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml @@ -51,7 +51,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml index 45e67c072..b38d120ad 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml @@ -51,7 +51,7 @@ backend: DYN_REQUEST_PLANE: nats # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. - SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '1024' sglang_config: prefill: From 6ec9dce0fe1891a0024754174fa024933f4d9f10 Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Tue, 30 Jun 2026 16:12:19 -0700 Subject: [PATCH 4/7] update model paths --- .github/configs/nvidia-master.yaml | 2 +- perf-changelog.yaml | 2 +- runners/launch_gb300-nv.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index dba6961bb..9aad798a4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -11536,7 +11536,7 @@ glm5-fp8-gb300-dynamo-sglang: glm5-fp8-gb300-dynamo-sglang-mtp: image: lmsysorg/sglang:v0.5.11-cu130 - model: zai-org/GLM-5-FP8 + model: zai-org/GLM-5.1-FP8 model-prefix: glm5 runner: gb300-nv precision: fp8 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 4def097a3..42cf760d9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4194,7 +4194,7 @@ - config-keys: - glm5-fp8-gb300-dynamo-sglang-mtp description: - - "Add GLM-5-FP8 GB300 multinode dynamo-sglang benchmark with EAGLE MTP speculative decoding" + - "Add GLM-5.1-FP8 GB300 multinode dynamo-sglang benchmark with EAGLE MTP speculative decoding" - "Image: lmsysorg/sglang:v0.5.11-cu130" - "14 topologies across 1k/1k and 8k/1k (prefill TP4 + decode wide-EP DEP16/24/32/40/48/56 high-throughput and per-node TP4 low-latency); MTP flags: speculative-algorithm EAGLE, num-steps 2, eagle-topk 1, num-draft-tokens 3" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1907 diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 93d9eb252..33be7acc8 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -47,7 +47,7 @@ elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/scratch/models/GLM-5-NVFP4 export SRT_SLURM_MODEL_PREFIX="glm-5-fp4" elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then - export MODEL_PATH=/scratch/models/GLM-5-FP8 + export MODEL_PATH=/scratch/models/GLM-5.1-FP8 export SRT_SLURM_MODEL_PREFIX="glm-5-fp8" elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/data/models/MiniMax-M2.5-NVFP4 From ba342325693725af1d8e73308cb5e88d50fc2d1c Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 1 Jul 2026 13:44:29 -0700 Subject: [PATCH 5/7] Revert glm5-fp8 GB300 MODEL_PATH to /scratch/models/GLM-5-FP8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per SemiAnalysis working model paths sheet, the GB300 (Local NVMe) column for row 14 (zai-org/GLM-5.1-FP8) is /scratch/models/GLM-5-FP8 — the model directory on GB300 still uses the old-style name. This matches the pattern used for the FP4 sibling one line up (/scratch/models/GLM-5-NVFP4). Co-Authored-By: Claude Opus 4.7 --- runners/launch_gb300-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 34a4eae6a..676615500 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -47,7 +47,7 @@ elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/scratch/models/GLM-5-NVFP4 export SRT_SLURM_MODEL_PREFIX="glm-5-fp4" elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then - export MODEL_PATH=/scratch/models/GLM-5.1-FP8 + export MODEL_PATH=/scratch/models/GLM-5-FP8 export SRT_SLURM_MODEL_PREFIX="glm-5-fp8" elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/data/models/MiniMax-M2.5-NVFP4 From 16f9e33729e627c05ceba6d759c8462411508bcc Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Wed, 1 Jul 2026 16:30:39 -0700 Subject: [PATCH 6/7] Point glm5-fp8 GB300 MODEL_PATH at /scratch/models/GLM-5.1-FP8 --- runners/launch_gb300-nv.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 676615500..34a4eae6a 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -47,7 +47,7 @@ elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/scratch/models/GLM-5-NVFP4 export SRT_SLURM_MODEL_PREFIX="glm-5-fp4" elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then - export MODEL_PATH=/scratch/models/GLM-5-FP8 + export MODEL_PATH=/scratch/models/GLM-5.1-FP8 export SRT_SLURM_MODEL_PREFIX="glm-5-fp8" elif [[ $MODEL_PREFIX == "minimaxm2.5" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/data/models/MiniMax-M2.5-NVFP4 From f3ede16efbdcff80dd32da2f666e0c6c3ea38b5e Mon Sep 17 00:00:00 2001 From: hshrivastava-droid Date: Thu, 2 Jul 2026 10:15:12 -0700 Subject: [PATCH 7/7] update dynamo version --- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml | 2 +- .../glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml | 2 +- .../glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml | 2 +- 14 files changed, 14 insertions(+), 14 deletions(-) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml index 2f935f9ca..e98343689 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_0.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml index 57b373378..bfdc19115 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_1.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml index f40d0abc4..37c6dd759 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_2.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml index 3b892a717..e951e4871 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_3.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml index 0b0852bd9..df6334f14 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_hightpt_4.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml index 5a999216e..d5a7abb0f 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_0.yaml @@ -15,7 +15,7 @@ resources: frontend: type: dynamo dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml index e55e554d0..8c798b39b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/mtp/1k1k_mtp_lowlat_1.yaml @@ -15,7 +15,7 @@ resources: frontend: type: dynamo dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml index 165387f75..650ec30c1 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_0.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml index a28eae172..f8d753204 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_1.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml index 79fd859dc..e3b5c74cd 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_2.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml index 64c5e8696..5d19a961a 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_hightpt_3.yaml @@ -17,7 +17,7 @@ frontend: enable_multiple_frontends: true num_additional_frontends: 9 dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml index 7572dc68b..812168165 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_0.yaml @@ -15,7 +15,7 @@ resources: frontend: type: dynamo dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml index 82dd34f03..5977fb91b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_1.yaml @@ -15,7 +15,7 @@ resources: frontend: type: dynamo dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml index b38d120ad..12589dd82 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/mtp/8k1k_mtp_lowlat_2.yaml @@ -15,7 +15,7 @@ resources: frontend: type: dynamo dynamo: - version: 1.1.0 + version: 1.2.1 backend: prefill_environment: