diff --git a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
index dc8989b3e..e6f02db07 100755
--- a/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
+++ b/benchmarks/single_node/fixed_seq_len/dsv4_fp4_mi355x_vllm.sh
@@ -12,11 +12,11 @@ set -eo pipefail
 # same ROCm recipe while switching parallelism to vLLM's DP+EP form.
 # Image-pin details live in amd-master.yaml.
 #
-# --moe-backend triton_unfused is required for the FP4 MoE expert
-# weight format used by deepseek-ai/DeepSeek-V4-Pro. Letting --moe-backend
-# default to auto picks a backend that doesn't register the FP4 scale
-# parameters (w13_weight_scale / w2_weight_scale), so safetensors
-# loading raises KeyError.
+# Use the AITER MoE backend (VLLM_ROCM_USE_AITER_MOE=1 + --moe-backend aiter)
+# for the FP4 MoE expert weights of deepseek-ai/DeepSeek-V4-Pro. The AITER
+# MXFP4 path registers the FP4 scale parameters (w13_weight_scale /
+# w2_weight_scale), so safetensors loading succeeds and decode runs on the
+# fused AITER experts instead of the triton_unfused backend.
 #
 # --compilation-config mode=3 with FULL_AND_PIECEWISE cudagraph mode
 # enables full CUDA graph capture for improved throughput on MI355X.
@@ -45,6 +45,7 @@ if [ -n "$ROCR_VISIBLE_DEVICES" ]; then
 fi
 
 export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_MOE=1
 
 SERVER_LOG=/workspace/server.log
 
@@ -75,7 +76,7 @@ vllm serve $MODEL --port $PORT \
     --gpu-memory-utilization 0.8 \
     --kv-cache-dtype fp8 \
     --trust-remote-code \
-    --moe-backend triton_unfused \
+    --moe-backend aiter \
     --tokenizer-mode deepseek_v4 \
     --reasoning-parser deepseek_v4 \
     --compilation-config '{"mode":3,"cudagraph_mode":"FULL_AND_PIECEWISE"}' > $SERVER_LOG 2>&1 &
diff --git a/configs/amd-master.yaml b/configs/amd-master.yaml
index 41f60afda..d3f1c70e6 100644
--- a/configs/amd-master.yaml
+++ b/configs/amd-master.yaml
@@ -1912,7 +1912,7 @@ dsv4-fp4-mi355x-sglang-mtp:
 # gpu-mem-util=0.6. TP8 sweeps conc 4-64; DEP8 has a single conc=64
 # probe to validate the ROCm DP+EP path.
 dsv4-fp4-mi355x-vllm:
-  image: vllm/vllm-openai-rocm:v0.22.0
+  image: vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: mi355x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 0128b0f27..f13d283d6 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -4439,3 +4439,12 @@
   description:
     - "Remove --hf-overrides use_index_cache/index_topk_freq indexer-skipping override from the ATOM serve command (not allowed: reduces model architecture FLOPs per PR_REVIEW_CHECKLIST.md)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/2037
+
+- config-keys:
+    - dsv4-fp4-mi355x-vllm
+  description:
+    - "Bump DeepSeek-V4-Pro FP4 MI355X single-node vLLM STP image from vllm/vllm-openai-rocm:v0.22.0 to the latest nightly vllm/vllm-openai-rocm:nightly-09663abde0f50944a8d5ea30120666024b503faa."
+    - "The nightly enables two-stage attention kernels (split-KV decode), which reduce decode attention latency across all concurrency levels."
+    - "Employ the AITER MLA attention backend for the DeepSeek-V4 MLA path."
+    - "Switch the MoE backend from triton_unfused to AITER MoE (VLLM_ROCM_USE_AITER_MOE=1 + --moe-backend aiter) for the FP4 experts."
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1980