From ecb56e1d430ad37fbda51b7a81eb9bc49f910a52 Mon Sep 17 00:00:00 2001 From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com> Date: Thu, 2 Jul 2026 07:49:26 -0500 Subject: [PATCH 1/5] [AMD] DeepSeek-V4 FP4 MI355X vLLM STP: bump image to latest nightly Update dsv4-fp4-mi355x-vllm from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly (nightly-09663abde0f50944a8d5ea30120666024b503faa). --- .github/configs/amd-master.yaml | 2 +- perf-changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 9386d9c27..704f8075f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1952,7 +1952,7 @@ dsv4-fp4-mi355x-sglang-mtp: # gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64 # probe to validate the ROCm DP+EP path. dsv4-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.22.0 + image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: mi355x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 450226250..ec5525ee0 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4400,3 +4400,9 @@ description: - "Bump SGLang image from lmsysorg/sglang:deepseek-v4-blackwell (digest sha256:df18bfc4...) to mainline nightly lmsysorg/sglang:nightly-dev-cu13-20260628-da802ddc." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1923 + +- config-keys: + - dsv4-fp4-mi355x-vllm + description: + - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM STP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa." 
+ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER From dd5653f7bffcac98a8b7252e118034bb5de296ef Mon Sep 17 00:00:00 2001 From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com> Date: Thu, 2 Jul 2026 07:50:24 -0500 Subject: [PATCH 2/5] chore(changelog): set pr-link for dsv4-fp4-mi355x-vllm image bump (#1980) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ec5525ee0..0c148aaa9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4405,4 +4405,4 @@ - dsv4-fp4-mi355x-vllm description: - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM STP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PLACEHOLDER + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1980 From 50961ac4454d47d4dae60c7f0540568e80d97d46 Mon Sep 17 00:00:00 2001 From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com> Date: Thu, 2 Jul 2026 07:52:06 -0500 Subject: [PATCH 3/5] docs(changelog): note two-stage attention kernels and AITER MLA for dsv4 vllm STP --- perf-changelog.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0c148aaa9..21b5e8cfa 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4405,4 +4405,6 @@ - dsv4-fp4-mi355x-vllm description: - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM STP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa." + - "The nightly enables two-stage attention kernels (split-KV decode), reducing decode attention latency at high concurrency." + - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1980 From 9260991f5493f110047cb2b9f0d6041a75eb2c63 Mon Sep 17 00:00:00 2001 From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com> Date: Thu, 2 Jul 2026 07:55:50 -0500 Subject: [PATCH 4/5] docs(changelog): two-stage attention reduces latency across all concurrency levels (STP) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 21b5e8cfa..7b07d653c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4405,6 +4405,6 @@ - dsv4-fp4-mi355x-vllm description: - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM STP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa." - - "The nightly enables two-stage attention kernels (split-KV decode), reducing decode attention latency at high concurrency." + - "The nightly enables two-stage attention kernels (split-KV decode), which reduce decode attention latency across all concurrency levels." - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path." 
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1980 From fe4b460cb01b55fd6be5e469b77040e6fa1a59c5 Mon Sep 17 00:00:00 2001 From: Fangzhou Ai <31551580+Fangzhou-Ai@users.noreply.github.com> Date: Thu, 2 Jul 2026 08:12:40 -0500 Subject: [PATCH 5/5] [AMD] dsv4 fp4 mi355x vllm STP: use AITER MoE backend --- .../fixed_seq_len/dsv4_fp4_mi355x_vllm.sh | 13 +++++++------ perf-changelog.yaml | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh index dc8989b3e..e6f02db07 100755 --- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh +++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh @@ -12,11 +12,11 @@ set -eo pipefail # same ROCm recipe while switching parallelism to vLLM's DP+EP form. # Image-pin details live in amd-master.yaml. # -# --moe-backend triton_unfused is required for the FP4 MoE expert -# weight format used by deepseek-ai/DeepSeek-V4-Pro. Letting --moe-backend -# default to auto picks a backend that doesn't register the FP4 scale -# parameters (w13_weight_scale / w2_weight_scale), so safetensors -# loading raises KeyError. +# Use the AITER MoE backend (VLLM_ROCM_USE_AITER_MOE=1 + --moe-backend aiter) +# for the FP4 MoE expert weights of deepseek-ai/DeepSeek-V4-Pro. The AITER +# MXFP4 path registers the FP4 scale parameters (w13_weight_scale / +# w2_weight_scale), so safetensors loads correctly and decode runs on the +# fused AITER experts instead of triton_unfused. # # --compilation-config mode=3 with FULL_AND_PIECEWISE cudagraph mode # enables full CUDA graph capture for improved throughput on MI355X. 
@@ -45,6 +45,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then fi export VLLM_ROCM_USE_AITER=1 +export VLLM_ROCM_USE_AITER_MOE=1 SERVER_LOG=/workspace/server.log @@ -75,7 +76,7 @@ vllm serve $MODEL --port $PORT \ --gpu-memory-utilization 0.8 \ --kv-cache-dtype fp8 \ --trust-remote-code \ - --moe-backend triton_unfused \ + --moe-backend aiter \ --tokenizer-mode deepseek_v4 \ --reasoning-parser deepseek_v4 \ --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7b07d653c..db42f59bd 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4407,4 +4407,5 @@ - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM STP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa." - "The nightly enables two-stage attention kernels (split-KV decode), which reduce decode attention latency across all concurrency levels." - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path." + - "Switch the MoE backend from triton_unfused to AITER MoE (VLLM_ROCM_USE_AITER_MOE=1 + --moe-backend aiter) for the FP4 experts." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1980