From f23f216b36320757bb3425ab3d026cd33a468447 Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 01:24:20 -0800
Subject: [PATCH 1/4] set correct lm_head_tp_size_upper_bound

Signed-off-by: Lanyu Liao
---
 tensorrt_llm/_torch/utils.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
index 5beff19f710..9c744208f90 100644
--- a/tensorrt_llm/_torch/utils.py
+++ b/tensorrt_llm/_torch/utils.py
@@ -1,4 +1,5 @@
 import contextlib
+import os
 import threading
 from dataclasses import dataclass
 from enum import Enum, IntEnum
@@ -317,9 +318,14 @@ def create_lm_head_tp_mapping(mapping: Mapping, token_count: int) -> Mapping:
     # Since token_count=256 will hit the boundary of math-bound problem
     # We use 256 // token_count to determine the lm_head_tp_size
     lm_head_tp_size_raw = 256 // token_count
-    lm_head_tp_size = nearest_in_buckets(lm_head_tp_size_raw,
-                                         [1, mapping.gpus_per_node])
-    assert mapping.tp_size % lm_head_tp_size == 0
+    # TODO: On platforms like GB200, setting lm_head_tp_size_upper_bound to world_size could be more efficient when world_size > gpus_per_node; this needs further investigation.
+    lm_head_tp_size_upper_bound = min(mapping.world_size, mapping.gpus_per_node)
+    lm_head_tp_size = int(
+        os.getenv(
+            'LM_HEAD_TP_SIZE',
+            nearest_in_buckets(lm_head_tp_size_raw,
+                               [1, lm_head_tp_size_upper_bound])))
+    assert mapping.tp_size % lm_head_tp_size == 0, f"mapping.tp_size: {mapping.tp_size}, lm_head_tp_size: {lm_head_tp_size}"
     lm_head_pp_size = mapping.pp_size * mapping.tp_size // lm_head_tp_size
 
     return Mapping(

From d694d5724bdbc4c90a0cb9944e262ad8144fdab3 Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 02:11:27 -0800
Subject: [PATCH 2/4] add lm_head TP integration tests to l0_dgx_b200

Signed-off-by: Lanyu Liao
---
 tests/integration/test_lists/test-db/l0_dgx_b200.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 1c2428bba60..6389514cfe2 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -128,6 +128,8 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:

From 73f94e9f5af60d5fe4bbc3e253049974ddc0a3c7 Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 05:55:23 -0800
Subject: [PATCH 3/4] add test case where world_size < gpus_per_node

Signed-off-by: Lanyu Liao
---
 .../defs/accuracy/test_llm_api_pytorch.py        | 18 +++++++++++++++---
 .../test_lists/test-db/l0_dgx_b200.yml           |  3 +--
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 33556590044..201e880a036 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2049,6 +2049,18 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
                          32,
                          "TRTLLM",
                         marks=pytest.mark.skip_less_mpi_world_size(8)),
+            pytest.param(4,
+                         1,
+                         4,
+                         3,
+                         False,
+                         True,
+                         True,
+                         True,
+                         True,
+                         16,
+                         "CUTLASS",
+                         marks=pytest.mark.skip_less_mpi_world_size(4)),
             pytest.param(8,
                          1,
                          8,
@@ -2124,9 +2136,9 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
         ],
         ids=[
             "latency", "latency_trtllmgen", "latency_adp_lmtp",
-            "latency_trtllmgen_adp_lmtp", "throughput", "throughput_tp8",
-            "throughput_tp4", "throughput_mtp", "throughput_bs8_mtp",
-            "throughput_pp4_mtp"
+            "latency_trtllmgen_adp_lmtp", "latency_adp_lmtp_tp4", "throughput",
+            "throughput_tp8", "throughput_tp4", "throughput_mtp",
+            "throughput_bs8_mtp", "throughput_pp4_mtp"
         ])
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                               fp8kv, attention_dp, enable_lm_head_tp_in_adp,
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 6389514cfe2..d10ad2260bf 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -59,6 +59,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
   - accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
 - condition:
     ranges:
       system_gpu_count:
@@ -128,8 +129,6 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp] TIMEOUT (180)
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen_adp_lmtp] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:

From ce076d64dfcfa682112961d808e8d2a28025e16b Mon Sep 17 00:00:00 2001
From: Lanyu Liao
Date: Wed, 19 Nov 2025 22:03:13 -0800
Subject: [PATCH 4/4] add comment for lm_head_tp_size_raw

Signed-off-by: Lanyu Liao
---
 tensorrt_llm/_torch/utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
index 9c744208f90..1cc0d313cc6 100644
--- a/tensorrt_llm/_torch/utils.py
+++ b/tensorrt_llm/_torch/utils.py
@@ -317,6 +317,7 @@ def create_lm_head_tp_mapping(mapping: Mapping, token_count: int) -> Mapping:
     # We use heuristic to determine the lm_head_tp_size
     # Since token_count=256 will hit the boundary of math-bound problem
     # We use 256 // token_count to determine the lm_head_tp_size
+    # For more details, refer to the blog: https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog14_Scaling_Expert_Parallelism_in_TensorRT-LLM_part3.md#mtp-lm-head-tensor-parallelism
     lm_head_tp_size_raw = 256 // token_count
     # TODO: On platforms like GB200, setting lm_head_tp_size_upper_bound to world_size could be more efficient when world_size > gpus_per_node; this needs further investigation.
     lm_head_tp_size_upper_bound = min(mapping.world_size, mapping.gpus_per_node)
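
A minimal standalone sketch of the sizing heuristic introduced in patch 1, so
the bucketing behavior can be sanity-checked without applying the series.
nearest_in_buckets is approximated here (assumed to snap to the nearest bucket
boundary; the real helper in tensorrt_llm/_torch/utils.py may round or clamp
differently), and pick_lm_head_tp_size is a hypothetical name used only for
this illustration.

    import os

    # Assumption: return the bucket boundary closest to `value`; the real
    # implementation may break ties or clamp differently.
    def nearest_in_buckets(value, buckets):
        return min(buckets, key=lambda b: abs(b - value))

    def pick_lm_head_tp_size(token_count, tp_size, world_size, gpus_per_node):
        # token_count=256 is treated as the math-bound boundary, so fewer
        # tokens map to a proportionally larger lm_head TP group.
        lm_head_tp_size_raw = 256 // token_count
        # Cap the TP group within a single node (see the GB200 TODO in patch 1).
        upper_bound = min(world_size, gpus_per_node)
        # LM_HEAD_TP_SIZE, when set in the environment, overrides the heuristic.
        lm_head_tp_size = int(
            os.getenv('LM_HEAD_TP_SIZE',
                      nearest_in_buckets(lm_head_tp_size_raw,
                                         [1, upper_bound])))
        assert tp_size % lm_head_tp_size == 0
        return lm_head_tp_size

    # The case exercised by patch 3: world_size=4 < gpus_per_node=8,
    # so the upper bound is 4 rather than 8.
    print(pick_lm_head_tp_size(token_count=64, tp_size=4, world_size=4,
                               gpus_per_node=8))    # -> 4
    print(pick_lm_head_tp_size(token_count=256, tp_size=4, world_size=4,
                               gpus_per_node=8))    # -> 1

Because the upper bound is min(world_size, gpus_per_node) rather than
gpus_per_node alone, a 4-rank job on an 8-GPU node caps the LM head TP group
at 4, which is exactly what the new latency_adp_lmtp_tp4 test covers.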