From f59dca0424b7627613e93d1bb0f42ed4d99e80fc Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Tue, 30 Jun 2026 01:17:12 -0400 Subject: [PATCH 1/3] [AMD] Enable AITER MoE for MiniMax-M3 MI355X FP4 vLLM MTP benchmark Split the FP4 MTP half out of #1955, rebased on current main. - Gate AITER MoE on non-EP configs only: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 and pass --moe-backend aiter when EP is off; set VLLM_ROCM_USE_AITER=0 when EP is enabled (DP attention or EP > 1), since AITER MoE is incompatible with expert parallelism. Addresses review #1955 (discussion_r3495386866). - Bump minimaxm3-fp4-mi355x-vllm-mtp to the AITER MoE nightly (nightly-4559c43a9526597c00cbcc4f59979496500268d1). Co-Authored-By: Claude Opus 4.8 (1M context) --- .github/configs/amd-master.yaml | 2 +- .../minimaxm3_fp4_mi355x_vllm_mtp.sh | 16 ++++++++++++++++ perf-changelog.yaml | 8 ++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index cbfc09f81..dd027b463 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2672,7 +2672,7 @@ minimaxm3-fp4-mi355x-vllm: # tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base # FP4 sweep at extreme concurrency where speculative decoding loses value. 
minimaxm3-fp4-mi355x-vllm-mtp: - image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e + image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: amd/MiniMax-M3-MXFP4 model-prefix: minimaxm3 runner: mi355x diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh index 96a560493..374ed3b30 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh @@ -5,6 +5,8 @@ # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed. +# MoE serving mirrors minimaxm3_fp4_mi355x_vllm.sh (AITER MoE, vllm#46419), +# except AITER MoE is gated off when expert parallelism is enabled (see below). source "$(dirname "$0")/../../benchmark_lib.sh" @@ -37,6 +39,19 @@ SERVER_LOG=/workspace/server.log export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 +# AITER MoE accelerates the dense (non-EP) MoE path but is incompatible with +# expert parallelism, so disable it when EP is enabled (DP attention or EP > 1). 
+# https://github.com/SemiAnalysisAI/InferenceX/pull/1955#discussion_r3495386866 +MOE_ARGS=() +if [ "${DP_ATTENTION}" = "true" ] || [ "$EP_SIZE" -gt 1 ]; then + export VLLM_ROCM_USE_AITER=0 +else + export VLLM_ROCM_USE_AITER=1 + export VLLM_ROCM_USE_AITER_MOE=1 + export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 + MOE_ARGS=(--moe-backend aiter) +fi + if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context fi @@ -65,6 +80,7 @@ vllm serve "$MODEL" --port "$PORT" \ --language-model-only \ --max-model-len "$MAX_MODEL_LEN" \ --attention-backend TRITON_ATTN \ + "${MOE_ARGS[@]}" \ --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \ --tool-call-parser minimax_m3 \ --enable-auto-tool-choice \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3ef8c37db..55f25d516 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4323,3 +4323,11 @@ - "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM STP: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter." - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954 + +- config-keys: + - minimaxm3-fp4-mi355x-vllm-mtp + description: + - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter." + - "EP and DP-attention configs keep VLLM_ROCM_USE_AITER=0 since AITER MoE is incompatible with expert parallelism (vLLM #46419)." 
+ - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)." + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PENDING From 0f19a2faf64c56baf6dd19c7255638055fdf1294 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Tue, 30 Jun 2026 01:17:33 -0400 Subject: [PATCH 2/3] fix: set perf-changelog pr-link for #1958 Co-Authored-By: Claude Opus 4.8 (1M context) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 55f25d516..341b9a2a8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4330,4 +4330,4 @@ - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter." - "EP and DP-attention configs keep VLLM_ROCM_USE_AITER=0 since AITER MoE is incompatible with expert parallelism (vLLM #46419)." - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)." - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PENDING + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1958 From 319188a15a173690e079db22a8f6366b0d3dc38c Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Tue, 30 Jun 2026 12:05:23 +0000 Subject: [PATCH 3/3] [AMD] Fix MiniMax-M3 FP4 MI355X vLLM MTP EP configs on AITER nightly The AITER-MoE nightly ships a torch build without torch.ao.quantization.pt2e, breaking Quark's MXFP4 dequant (mxfp4_utils._dequant_mxfp4) that EP/DP-attn configs fall back to when AITER is disabled entirely (VLLM_ROCM_USE_AITER=0), crashing engine-core startup. - EP/DP-attn: keep VLLM_ROCM_USE_AITER=1 (AITER dequant, avoids Quark) and set VLLM_ROCM_USE_AITER_MOE=0, instead of disabling AITER entirely. 
- Drop EP/DP-attn search-space entries for 8k1k; keep them for 1k1k. - Update perf-changelog accordingly. --- .github/configs/amd-master.yaml | 11 +++++++---- .../fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh | 11 +++++++++-- perf-changelog.yaml | 3 ++- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index dd027b463..995e1717f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -2669,8 +2669,13 @@ minimaxm3-fp4-mi355x-vllm: # EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the # amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft -# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base -# FP4 sweep at extreme concurrency where speculative decoding loses value. +# tokens. +# +# EP / dp-attn configs disable AITER fused MoE (incompatible with expert +# parallelism) but keep the general AITER backend on so MXFP4 weight dequant +# uses AITER instead of the Quark path (quark.torch.kernel.mx), which is broken +# in the current nightly (torch.ao.quantization.pt2e removed). See the EP branch +# in minimaxm3_fp4_mi355x_vllm_mtp.sh and run 28422097175 (PR #1958). 
minimaxm3-fp4-mi355x-vllm-mtp: image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 model: amd/MiniMax-M3-MXFP4 @@ -2693,9 +2698,7 @@ minimaxm3-fp4-mi355x-vllm-mtp: osl: 1024 search-space: - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp } - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp } # MiniMax-M3 MXFP4 MI355X atom recipe: # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh index 374ed3b30..591eefba5 100755 --- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh +++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh @@ -40,11 +40,18 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600 export VLLM_USE_BREAKABLE_CUDAGRAPH=0 # AITER MoE accelerates the dense (non-EP) MoE path but is incompatible with -# expert parallelism, so disable it when EP is enabled (DP attention or EP > 1). +# expert parallelism, so disable AITER *fused MoE* when EP is enabled (DP +# attention or EP > 1). We still keep the general AITER backend enabled in that +# case: it routes the MXFP4 weight dequant through AITER instead of the Quark +# path (mxfp4_utils._dequant_mxfp4 -> `from quark.torch.kernel import mx`), +# which is broken in the current nightly (ModuleNotFoundError: +# torch.ao.quantization.pt2e). Fully disabling AITER here would fall back to +# that broken Quark dequant and crash engine-core startup on every EP config. 
# https://github.com/SemiAnalysisAI/InferenceX/pull/1955#discussion_r3495386866 MOE_ARGS=() if [ "${DP_ATTENTION}" = "true" ] || [ "$EP_SIZE" -gt 1 ]; then - export VLLM_ROCM_USE_AITER=0 + export VLLM_ROCM_USE_AITER=1 + export VLLM_ROCM_USE_AITER_MOE=0 else export VLLM_ROCM_USE_AITER=1 export VLLM_ROCM_USE_AITER_MOE=1 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 341b9a2a8..f98722fee 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -4328,6 +4328,7 @@ - minimaxm3-fp4-mi355x-vllm-mtp description: - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter." - - "EP and DP-attention configs keep VLLM_ROCM_USE_AITER=0 since AITER MoE is incompatible with expert parallelism (vLLM #46419)." + - "EP and DP-attention configs disable AITER fused MoE (VLLM_ROCM_USE_AITER_MOE=0) since AITER MoE is incompatible with expert parallelism (vLLM #46419), but keep the general AITER backend on (VLLM_ROCM_USE_AITER=1) so MXFP4 weight dequant uses AITER instead of the Quark path (mxfp4_utils._dequant_mxfp4), which is broken in this nightly (ModuleNotFoundError: torch.ao.quantization.pt2e)." + - "Drop EP and DP-attention search-space entries for 8k1k (those EP>1 points are off the Pareto curve); 1k1k keeps its EP and DP-attention coverage." - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)." pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1958