From f59dca0424b7627613e93d1bb0f42ed4d99e80fc Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Tue, 30 Jun 2026 01:17:12 -0400
Subject: [PATCH 1/3] [AMD] Enable AITER MoE for MiniMax-M3 MI355X FP4 vLLM MTP
 benchmark

Split the FP4 MTP half out of #1955, rebased on current main.

- Gate AITER MoE on non-EP configs only: export VLLM_ROCM_USE_AITER=1,
  VLLM_ROCM_USE_AITER_MOE=1, VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 and
  pass --moe-backend aiter when EP is off; set VLLM_ROCM_USE_AITER=0 when EP
  is enabled (DP attention or EP > 1), since AITER MoE is incompatible with
  expert parallelism. Addresses review #1955 (discussion_r3495386866).
- Bump minimaxm3-fp4-mi355x-vllm-mtp to the AITER MoE nightly
  (nightly-4559c43a9526597c00cbcc4f59979496500268d1).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/configs/amd-master.yaml                  |  2 +-
 .../minimaxm3_fp4_mi355x_vllm_mtp.sh             | 16 ++++++++++++++++
 perf-changelog.yaml                              |  8 ++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index cbfc09f81..dd027b463 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2672,7 +2672,7 @@ minimaxm3-fp4-mi355x-vllm:
 # tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
 # FP4 sweep at extreme concurrency where speculative decoding loses value.
 minimaxm3-fp4-mi355x-vllm-mtp:
-  image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
+  image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: amd/MiniMax-M3-MXFP4
   model-prefix: minimaxm3
   runner: mi355x
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
index 96a560493..374ed3b30 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
@@ -5,6 +5,8 @@
 # minimaxm3_fp4_mi355x_vllm.sh and uses three speculative tokens from
 # Inferact/MiniMax-M3-EAGLE3. The pinned nightly includes upstream AMD
 # MiniMax-M3 SupportsEagle3 support, so no runtime model patch is needed.
+# MoE serving mirrors minimaxm3_fp4_mi355x_vllm.sh (AITER MoE, vllm#46419),
+# except AITER MoE is gated off when expert parallelism is enabled (see below).
 
 source "$(dirname "$0")/../../benchmark_lib.sh"
 
@@ -37,6 +39,19 @@ SERVER_LOG=/workspace/server.log
 export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
+# AITER MoE accelerates the dense (non-EP) MoE path but is incompatible with
+# expert parallelism, so disable it when EP is enabled (DP attention or EP > 1).
+# https://github.com/SemiAnalysisAI/InferenceX/pull/1955#discussion_r3495386866
+MOE_ARGS=()
+if [ "${DP_ATTENTION}" = "true" ] || [ "$EP_SIZE" -gt 1 ]; then
+    export VLLM_ROCM_USE_AITER=0
+else
+    export VLLM_ROCM_USE_AITER=1
+    export VLLM_ROCM_USE_AITER_MOE=1
+    export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
+    MOE_ARGS=(--moe-backend aiter)
+fi
+
 if [ "${EVAL_ONLY}" = "true" ]; then
     setup_eval_context
 fi
@@ -65,6 +80,7 @@ vllm serve "$MODEL" --port "$PORT" \
     --language-model-only \
     --max-model-len "$MAX_MODEL_LEN" \
     --attention-backend TRITON_ATTN \
+    "${MOE_ARGS[@]}" \
     --speculative-config "{\"method\": \"eagle3\", \"model\": \"$DRAFT_MODEL\", \"num_speculative_tokens\": $NUM_SPEC_TOKENS}" \
     --tool-call-parser minimax_m3 \
     --enable-auto-tool-choice \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3ef8c37db..55f25d516 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4323,3 +4323,11 @@
     - "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM STP: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter."
     - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954
+
+- config-keys:
+    - minimaxm3-fp4-mi355x-vllm-mtp
+  description:
+    - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter."
+    - "EP and DP-attention configs keep VLLM_ROCM_USE_AITER=0 since AITER MoE is incompatible with expert parallelism (vLLM #46419)."
+    - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PENDING

From 0f19a2faf64c56baf6dd19c7255638055fdf1294 Mon Sep 17 00:00:00 2001
From: functionstackx <47992694+functionstackx@users.noreply.github.com>
Date: Tue, 30 Jun 2026 01:17:33 -0400
Subject: [PATCH 2/3] fix: set perf-changelog pr-link for #1958

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 55f25d516..341b9a2a8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4330,4 +4330,4 @@
     - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter."
     - "EP and DP-attention configs keep VLLM_ROCM_USE_AITER=0 since AITER MoE is incompatible with expert parallelism (vLLM #46419)."
     - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)."
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/PENDING
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1958

From 319188a15a173690e079db22a8f6366b0d3dc38c Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Tue, 30 Jun 2026 12:05:23 +0000
Subject: [PATCH 3/3] [AMD] Fix MiniMax-M3 FP4 MI355X vLLM MTP EP configs on
 AITER nightly

The AITER-MoE nightly ships a torch build without
torch.ao.quantization.pt2e, breaking Quark's MXFP4 dequant
(mxfp4_utils._dequant_mxfp4) that EP/DP-attn configs fall back to when
AITER is disabled entirely (VLLM_ROCM_USE_AITER=0), crashing engine-core
startup.

- EP/DP-attn: keep VLLM_ROCM_USE_AITER=1 (AITER dequant, avoids Quark)
  and set VLLM_ROCM_USE_AITER_MOE=0, instead of disabling AITER entirely.
- Drop EP/DP-attn search-space entries for 8k1k; keep them for 1k1k.
- Update perf-changelog accordingly.
---
 .github/configs/amd-master.yaml                       | 11 +++++++----
 .../fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh    | 11 +++++++++--
 perf-changelog.yaml                                   |  3 ++-
 3 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index dd027b463..995e1717f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -2669,8 +2669,13 @@ minimaxm3-fp4-mi355x-vllm:
 
 # EAGLE3 speculative-decoding variant of minimaxm3-fp4-mi355x-vllm. Pair the
 # amd/MiniMax-M3-MXFP4 target with Inferact/MiniMax-M3-EAGLE3 and three draft
-# tokens. Search space mirrors the MI355X MXFP8 MTP entry, trimming the base
-# FP4 sweep at extreme concurrency where speculative decoding loses value.
+# tokens.
+#
+# EP / dp-attn configs disable AITER fused MoE (incompatible with expert
+# parallelism) but keep the general AITER backend on so MXFP4 weight dequant
+# uses AITER instead of the Quark path (quark.torch.kernel.mx), which is broken
+# in the current nightly (torch.ao.quantization.pt2e removed). See the EP branch
+# in minimaxm3_fp4_mi355x_vllm_mtp.sh and run 28422097175 (PR #1958).
 minimaxm3-fp4-mi355x-vllm-mtp:
   image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
   model: amd/MiniMax-M3-MXFP4
@@ -2693,9 +2698,7 @@ minimaxm3-fp4-mi355x-vllm-mtp:
       osl: 1024
       search-space:
       - { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
       - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
 
 # MiniMax-M3 MXFP4 MI355X atom recipe:
 # https://github.com/ROCm/ATOM/blob/5d42d49f9e4292e5b61475917e92e7ec1b1dacb7/recipes/MiniMax-M3.md
diff --git a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
index 374ed3b30..591eefba5 100755
--- a/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
+++ b/benchmarks/single_node/fixed_seq_len/minimaxm3_fp4_mi355x_vllm_mtp.sh
@@ -40,11 +40,18 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
 export VLLM_USE_BREAKABLE_CUDAGRAPH=0
 
 # AITER MoE accelerates the dense (non-EP) MoE path but is incompatible with
-# expert parallelism, so disable it when EP is enabled (DP attention or EP > 1).
+# expert parallelism, so disable AITER *fused MoE* when EP is enabled (DP
+# attention or EP > 1). We still keep the general AITER backend enabled in that
+# case: it routes the MXFP4 weight dequant through AITER instead of the Quark
+# path (mxfp4_utils._dequant_mxfp4 -> `from quark.torch.kernel import mx`),
+# which is broken in the current nightly (ModuleNotFoundError:
+# torch.ao.quantization.pt2e). Fully disabling AITER here would fall back to
+# that broken Quark dequant and crash engine-core startup on every EP config.
 # https://github.com/SemiAnalysisAI/InferenceX/pull/1955#discussion_r3495386866
 MOE_ARGS=()
 if [ "${DP_ATTENTION}" = "true" ] || [ "$EP_SIZE" -gt 1 ]; then
-    export VLLM_ROCM_USE_AITER=0
+    export VLLM_ROCM_USE_AITER=1
+    export VLLM_ROCM_USE_AITER_MOE=0
 else
     export VLLM_ROCM_USE_AITER=1
     export VLLM_ROCM_USE_AITER_MOE=1
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 341b9a2a8..f98722fee 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4328,6 +4328,7 @@
     - minimaxm3-fp4-mi355x-vllm-mtp
   description:
     - "Enable AITER MoE on the MiniMax-M3 MI355X single-node vLLM EAGLE3 MTP MXFP4 benchmark for non-EP configs: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1, and pass --moe-backend aiter."
-    - "EP and DP-attention configs keep VLLM_ROCM_USE_AITER=0 since AITER MoE is incompatible with expert parallelism (vLLM #46419)."
+    - "EP and DP-attention configs disable AITER fused MoE (VLLM_ROCM_USE_AITER_MOE=0) since AITER MoE is incompatible with expert parallelism (vLLM #46419), but keep the general AITER backend on (VLLM_ROCM_USE_AITER=1) so MXFP4 weight dequant uses AITER instead of the Quark path (mxfp4_utils._dequant_mxfp4), which is broken in this nightly (ModuleNotFoundError: torch.ao.quantization.pt2e)."
+    - "Drop EP and DP-attention search-space entries for 8k1k (those EP>1 points are off the Pareto curve); 1k1k keeps its EP and DP-attention coverage."
     - "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e)."
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1958