Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 13 additions & 19 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2525,7 +2525,7 @@ dsv4-fp4-mi355x-atom-disagg:
# https://github.com/vllm-project/recipes/commit/2a3728ed9892debfd767a72a58ebc90b33f186e5
# MXFP8 runs from TP=4 on gfx950; block size 128 is mandatory for MSA.
minimaxm3-fp8-mi355x-vllm:
image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x
Expand All @@ -2537,19 +2537,14 @@ minimaxm3-fp8-mi355x-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
- { tp: 4, conc-start: 1, conc-end: 64 }
- { tp: 8, conc-start: 1, conc-end: 32 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 512 }
- { tp: 2, ep: 2, conc-start: 16, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64 }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 512 }
- { tp: 4, conc-start: 1, conc-end: 128 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 512 }
- { tp: 8, conc-start: 1, conc-end: 2 }
- { tp: 4, conc-start: 2, conc-end: 128 }

# EAGLE3 speculative-decoding (spec-decoding: mtp) variant of
# minimaxm3-fp8-mi355x-vllm, pairing MiniMaxAI/MiniMax-M3-MXFP8 with the
Expand All @@ -2562,7 +2557,7 @@ minimaxm3-fp8-mi355x-vllm:
# acceptance dilutes in big batches, and the draft weights + draft KV shave
# headroom — tp2-ep2 is dropped since its KV headroom was already thin.
minimaxm3-fp8-mi355x-vllm-mtp:
image: vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e
image: vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1
model: MiniMaxAI/MiniMax-M3-MXFP8
model-prefix: minimaxm3
runner: mi355x
Expand All @@ -2574,18 +2569,17 @@ minimaxm3-fp8-mi355x-vllm-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 2, spec-decoding: mtp }
- { tp: 4, conc-start: 32, conc-end: 64, spec-decoding: mtp }
- { tp: 4, ep: 4, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 1, conc-end: 256, spec-decoding: mtp }
- { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256, spec-decoding: mtp }
- { tp: 8, conc-start: 4, conc-end: 16, spec-decoding: mtp }
- { tp: 4, conc-start: 2, conc-end: 128, spec-decoding: mtp }
- { tp: 8, conc-start: 1, conc-end: 1, spec-decoding: mtp }

# MiniMax-M3 MXFP4 MI355X vLLM disaggregated (prefill/decode) config.
minimaxm3-fp4-mi355x-vllm-disagg:
Expand Down
19 changes: 19 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ fi
SERVER_LOG=/workspace/server.log
export VLLM_ENGINE_READY_TIMEOUT_S=3600
export VLLM_USE_BREAKABLE_CUDAGRAPH=0
# MI355X mxfp8 recipe (vllm-project/recipes#581): the router-append
# shared-experts MoE fusion (vllm-project/vllm#46545) plus INT6 quick
# all-reduce.
#
# VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is the env the fusion checks
# directly. The fusion runs on both the aiter and native MXFP8 MoE paths: it is
# independent of the AITER master switch (VLLM_ROCM_USE_AITER) and self-disables
# under expert parallelism inside the model, so it is enabled unconditionally.
# (VLLM_ROCM_USE_AITER itself is set further down, gated on expert parallelism.)
export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
# INT6 quick all-reduce, from the same recipe.
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
Expand All @@ -47,6 +55,17 @@ elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS+=(--enable-expert-parallel)
fi

# AITER master switch (VLLM_ROCM_USE_AITER), decided by expert parallelism:
#   - EP on (--enable-expert-parallel present in PARALLEL_ARGS): 1. The aiter
#     fused MoE path is then the auto-selected backend, so no --moe-backend
#     override is passed.
#   - EP off (TP-only): 0. The master switch produces degenerate MiniMax-M3
#     output in that layout, so the run falls back to the native MXFP8 path.
# VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is master-independent and applies
# in both cases.
export VLLM_ROCM_USE_AITER=0
if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
  export VLLM_ROCM_USE_AITER=1
fi

start_gpu_monitor

set -x
Expand Down
19 changes: 19 additions & 0 deletions benchmarks/single_node/fixed_seq_len/minimaxm3_fp8_mi355x_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,14 @@ export VLLM_ENGINE_READY_TIMEOUT_S=3600
# Run with CUDA graphs (no --enforce-eager): VLLM_USE_BREAKABLE_CUDAGRAPH=0
# avoids the M3-decode breakable-cudagraph path that previously forced eager.
export VLLM_USE_BREAKABLE_CUDAGRAPH=0
# MI355X mxfp8 recipe (vllm-project/recipes#581): the router-append
# shared-experts MoE fusion (vllm-project/vllm#46545) plus INT6 quick
# all-reduce.
#
# VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is the env the fusion checks
# directly. The fusion runs on both the aiter and native MXFP8 MoE paths: it is
# independent of the AITER master switch (VLLM_ROCM_USE_AITER) and self-disables
# under expert parallelism inside the model, so it is enabled unconditionally.
# (VLLM_ROCM_USE_AITER itself is set further down, gated on expert parallelism.)
export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1
# INT6 quick all-reduce, from the same recipe.
export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6

if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
Expand All @@ -77,6 +85,17 @@ elif [ "$EP_SIZE" -gt 1 ]; then
PARALLEL_ARGS+=(--enable-expert-parallel)
fi

# AITER master switch (VLLM_ROCM_USE_AITER), decided by expert parallelism:
#   - EP on (--enable-expert-parallel present in PARALLEL_ARGS): 1. The aiter
#     fused MoE path is then the auto-selected backend, so no --moe-backend
#     override is passed.
#   - EP off (TP-only): 0. The master switch produces degenerate MiniMax-M3
#     output in that layout, so the run falls back to the native MXFP8 path.
# VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS is master-independent and applies
# in both cases.
export VLLM_ROCM_USE_AITER=0
if printf '%s\n' "${PARALLEL_ARGS[@]}" | grep -qxF -- '--enable-expert-parallel'; then
  export VLLM_ROCM_USE_AITER=1
fi

# use 3 speculative tokens for all configs for now
NUM_SPEC_TOKENS=3

Expand Down
12 changes: 12 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4323,3 +4323,15 @@
- "Enable AITER MoE on MiniMax-M3 MXFP4 MI355X single-node vLLM STP: export VLLM_ROCM_USE_AITER=1, VLLM_ROCM_USE_AITER_MOE=1, and VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1; pass --moe-backend aiter."
- "Pin vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1 (from nightly-3f5a1e1733200760169ff31ebe60a271072b199e) for AITER MoE and shared-expert fusion support (vllm-project/vllm#46419, vllm-project/vllm#46545)."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1954

- config-keys:
- minimaxm3-fp8-mi355x-vllm
- minimaxm3-fp8-mi355x-vllm-mtp
description:
- "Update the MiniMax-M3 MXFP8 MI355X vLLM benchmark image from vllm/vllm-openai-rocm:nightly-3f5a1e1733200760169ff31ebe60a271072b199e to vllm/vllm-openai-rocm:nightly-4559c43a9526597c00cbcc4f59979496500268d1, which includes the gfx950 mxfp8 MoE/linear tuning (vllm-project/vllm#45725), fused shared-experts MoE for the mxfp8 model (vllm-project/vllm#46545), and the AITER flydsl MoE backend (vllm-project/vllm#46184)."
- "Align the standard and EAGLE3 (MTP) bench scripts with vllm-project/recipes#581: gate VLLM_ROCM_USE_AITER on expert parallelism (on for EP/DP-attention runs, where the AITER fused MoE is the auto-selected backend; off for TP-only runs, which fall back to the native MXFP8 path since the master switch otherwise yields degenerate MiniMax-M3 output), export VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS=1 unconditionally (the router-append shared-experts fusion is independent of the master switch and self-disables under EP), and export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT6 (INT6 quick all-reduce)."
- "Retune the TP/EP search space to the best layout per concurrency band and drop redundant points (TP2/EP2, DP-attention, and full TP8/EP8 everywhere except the MTP 1k1k sweep, which keeps it)."
- "minimaxm3-fp8-mi355x-vllm: 1k1k sweeps TP8 (conc 1-32), TP4 (conc 4-64), TP4/EP4 (conc 64-512); 8k1k sweeps TP8 (conc 1-2), TP4 (conc 2-128)."
- "minimaxm3-fp8-mi355x-vllm-mtp: 1k1k sweeps TP8 (conc 4-32), TP8/EP8 (conc 1-256), TP4 (conc 1-2 and 32-64), TP4/EP4 (conc 128-256); 8k1k sweeps TP8 (conc 1 and 4-16), TP4 (conc 2-128)."
- "Serving flags are otherwise unchanged."
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1946