Commit b4e6a16

[https://nvbugs/5451280][fix] Reduce memory fraction problem by warmu… (#7999)
Signed-off-by: Jin Li <[email protected]>
Parent: ef8e217

2 files changed: 82 additions, 23 deletions

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 13 additions & 0 deletions
@@ -207,6 +207,19 @@ def _get_token_num_for_estimation(self) -> int:
             # Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
             num_cache_blocks += (num_req_tokens + self._tokens_per_block -
                                  1) // self._tokens_per_block
+
+        # Max cuda graph warmup required tokens
+        max_cuda_graph_bs = min(self._model_engine.batch_size,
+                                self._model_engine._max_cuda_graph_batch_size)
+        cuda_graph_warmup_block = (
+            self._model_engine.max_seq_len +
+            1) // self._tokens_per_block + max_cuda_graph_bs - 1
+        num_cache_blocks = max(cuda_graph_warmup_block, num_cache_blocks)
+
+        # This is the minimal blocks required to run with max bs
+        # If not able to allocate self._model_engine.batch_size blocks, the max batch size should be adjusted.
+        num_cache_blocks = max(num_cache_blocks, self._model_engine.batch_size)
+
         # Multiply by beam width, to prevent rescaling of the max_seq_len caused by the influence of beam width during the preparation for kv_cache_estimation
         return num_cache_blocks * self._tokens_per_block * self._dummy_reqs[
             0].sampling_config.beam_width
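
For a concrete sense of the new estimate, here is a minimal standalone sketch of the same arithmetic with made-up numbers; the plain variables stand in for the self._model_engine and self._tokens_per_block attributes used above, and none of the values come from the commit:

# Assumed example values, for illustration only.
max_seq_len = 4096               # stands in for self._model_engine.max_seq_len
tokens_per_block = 32            # stands in for self._tokens_per_block
batch_size = 256                 # stands in for self._model_engine.batch_size
max_cuda_graph_batch_size = 128  # stands in for self._model_engine._max_cuda_graph_batch_size
num_cache_blocks = 100           # assumed estimate accumulated from the dummy requests

# CUDA graph warmup requirement, mirroring the expression added in the diff.
max_cuda_graph_bs = min(batch_size, max_cuda_graph_batch_size)
cuda_graph_warmup_block = (max_seq_len + 1) // tokens_per_block + max_cuda_graph_bs - 1
num_cache_blocks = max(cuda_graph_warmup_block, num_cache_blocks)

# At least one block per request at the maximum batch size, so the max batch
# size does not have to be adjusted downward.
num_cache_blocks = max(num_cache_blocks, batch_size)

print(num_cache_blocks)  # 4097 // 32 + 128 - 1 = 255, then max(255, 100) = 255, then max(255, 256) = 256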

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 69 additions & 23 deletions
@@ -527,12 +527,16 @@ def get_cuda_graph_warmup_request(batch_size, draft_len):
                 result = None
             return result
 
-        def get_warmup_request(num_tokens: int, num_gen_tokens: int):
+        def get_warmup_request(num_tokens: int,
+                               num_gen_tokens: int,
+                               least_requests: bool = True):
             available_tokens = kv_cache_manager.get_num_available_tokens(
                 self.runtime_draft_len)
             available_blocks = kv_cache_manager.get_num_free_blocks()
             if num_tokens > self.max_num_tokens or num_tokens > available_tokens:
                 return None
+            if num_gen_tokens > self.batch_size:
+                return None
 
             num_extra_decoding_steps = get_num_extra_decoding_steps()
             if num_extra_decoding_steps > 0:
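
Because least_requests defaults to True, existing callers of get_warmup_request keep the old behavior; only call sites that pass least_requests=False (the piecewise warmup added further below) opt into the new packing. A hypothetical stand-in just to show the two call shapes:

# Hypothetical stand-in for the nested helper; it only echoes its arguments.
def get_warmup_request(num_tokens: int,
                       num_gen_tokens: int,
                       least_requests: bool = True):
    return num_tokens, num_gen_tokens, least_requests

print(get_warmup_request(64, 1))                        # existing call sites: least_requests stays True
print(get_warmup_request(64, 0, least_requests=False))  # opt-in used by the piecewise warmup below
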
@@ -550,14 +554,28 @@ def get_warmup_request(num_tokens: int, num_gen_tokens: int):
             num_full_seqs = 0
             num_left_over_tokens = 0
 
+            max_context_requests = self.batch_size - num_gen_tokens
+            if max_context_requests * max_seq_len < num_ctx_tokens:
+                return None
+
             if num_ctx_tokens > 0:
-                # We will try to assign as less context requests as possible to
-                # fill the num_ctx_tokens.
+                if least_requests:
+                    # We will try to assign as few context requests as possible to
+                    # fill the num_ctx_tokens.
 
-                # Num full sequences:
-                num_full_seqs = num_ctx_tokens // max_seq_len
-                num_left_over_tokens = num_ctx_tokens - num_full_seqs * max_seq_len
+                    # Num full sequences:
+                    num_full_seqs = num_ctx_tokens // max_seq_len
+                    num_left_over_tokens = num_ctx_tokens - num_full_seqs * max_seq_len
 
+                else:
+                    max_bs = min(num_ctx_tokens,
+                                 self.batch_size - num_gen_tokens)
+                    if num_ctx_tokens % max_bs == 0:
+                        num_full_seqs = max_bs
+                    else:
+                        num_full_seqs = max_bs - 1
+                    max_seq_len = num_ctx_tokens // num_full_seqs
+                    num_left_over_tokens = num_ctx_tokens - max_seq_len * num_full_seqs
             num_ctx_requests = num_full_seqs + (1 if num_left_over_tokens
                                                 > 0 else 0)
 
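
To see how the two packing strategies differ, here is a hedged, self-contained sketch; the helper name, signature, and numbers below are invented for illustration, while the real code operates on the enclosing method's state:

def split_ctx_tokens(num_ctx_tokens: int, max_seq_len: int, max_requests: int,
                     least_requests: bool = True):
    """Return (num_full_seqs, seq_len, num_left_over_tokens) for a warmup batch."""
    if least_requests:
        # Fewest requests: fill whole max_seq_len-long sequences first.
        num_full_seqs = num_ctx_tokens // max_seq_len
        left_over = num_ctx_tokens - num_full_seqs * max_seq_len
        return num_full_seqs, max_seq_len, left_over
    # Most requests: spread the context tokens over as many requests as allowed.
    max_bs = min(num_ctx_tokens, max_requests)
    num_full_seqs = max_bs if num_ctx_tokens % max_bs == 0 else max_bs - 1
    seq_len = num_ctx_tokens // num_full_seqs
    left_over = num_ctx_tokens - seq_len * num_full_seqs
    return num_full_seqs, seq_len, left_over

print(split_ctx_tokens(1000, 256, 7))                        # (3, 256, 232): 3 full + 1 partial = 4 requests
print(split_ctx_tokens(1000, 256, 7, least_requests=False))  # (6, 166, 4): 6 full + 1 partial = 7 requests
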
@@ -633,33 +651,38 @@ def release_batch(result: ScheduledRequests | None):
         if cp_type == CpType.STAR:
             return
 
-        if self._torch_compile_enabled:
-
+        def general_warmup(reverse: bool = False):
             warmup_requests = set([
                 (1, 1),  # Specialize for 1 token.
                 (self.batch_size,
                  self.batch_size),  # max_batch_size, pure generation
                 (2, 0),  # Non-one, pure context
                 (curr_max_num_tokens, 0),  # max_num_tokens, pure context
             ])
+            if reverse:
+                warmup_requests = sorted(list(warmup_requests), reverse=reverse)
+
+            for warmup_num_tokens, warmup_num_gen_tokens in warmup_requests:
+                with release_batch(
+                        get_warmup_request(warmup_num_tokens,
+                                           warmup_num_gen_tokens)) as batch:
+                    if batch is None:
+                        # No KV cache space!
+                        continue
+                    logger.info(
+                        f"Run warmup with {warmup_num_tokens} tokens, include {warmup_num_gen_tokens} generation tokens"
+                    )
+                    self.forward(batch,
+                                 new_tensors_device=None,
+                                 resource_manager=resource_manager)
+                torch.cuda.synchronize()
 
+        if self._torch_compile_enabled:
             # Disable cuda graph capture here so that we can properly capture it later
             with self.no_cuda_graph():
-                for warmup_num_tokens, warmup_num_gen_tokens in warmup_requests:
-
-                    with release_batch(
-                            get_warmup_request(warmup_num_tokens,
-                                               warmup_num_gen_tokens)) as batch:
-                        if batch is None:
-                            # No KV cache space!
-                            continue
-                        logger.info(
-                            f"Run warmup with {warmup_num_tokens} tokens, include {warmup_num_gen_tokens} generation tokens"
-                        )
-                        self.forward(batch,
-                                     new_tensors_device=None,
-                                     resource_manager=resource_manager)
-                    torch.cuda.synchronize()
+                # From small case to large to make sure the 1 token case is run first.
+                # If the first graph is not the 1 token case, dynamo will specialize the non-1 token case.
+                general_warmup()
 
         if self.pytorch_backend_config.enable_autotuner:
             with self.no_cuda_graph(), autotune():
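
In the reverse=True pass, the (num_tokens, num_gen_tokens) tuples are sorted from largest to smallest, so the biggest warmup case runs first. A quick standalone illustration with placeholder values:

# Placeholder (num_tokens, num_gen_tokens) warmup cases, not the real ones.
warmup_requests = {(1, 1), (256, 256), (2, 0), (8192, 0)}

# As in general_warmup(reverse=True): tuples sort lexicographically, so the
# largest token counts come first.
print(sorted(warmup_requests, reverse=True))
# [(8192, 0), (256, 256), (2, 0), (1, 1)]
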
@@ -763,6 +786,29 @@ def _update_draft_inference_state(is_first_draft: bool,
             gc.collect()
             torch.cuda.empty_cache()
 
+        # When using piecewise cuda graph, the logits may suffer from a severe memory fragmentation problem.
+        # When the number of requests grows, the blocks allocated by torch cannot be reused.
+        # So after piecewise cuda graph capture, a warmup batch with the most requests is triggered to make
+        # sure that large enough blocks are allocated and can be correctly reused.
+        for num_tokens in piecewise_cuda_graph_num_tokens:
+            batch = get_warmup_request(num_tokens, 0, least_requests=False)
+            if batch is None:
+                continue
+            with release_batch(batch) as batch:
+                logger.info(
+                    f"Run piecewise CUDA graph warmup for num tokens={num_tokens} with most requests"
+                )
+                self.forward(batch,
+                             new_tensors_device=None,
+                             resource_manager=resource_manager)
+
+            torch.cuda.synchronize()
+
+        # Also, we run a general warmup from large to small to make sure that blocks are allocated well.
+        # The cuda graph and piecewise cuda graph capture call torch.cuda.empty_cache(), and blocks may already
+        # have been freed even though we call general_warmup for torch compile.
+        general_warmup(reverse=True)
+
         # Set the value back to the original value
         self.enable_spec_decode = self.is_spec_decode
 
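
The comments above rely on a property of PyTorch's caching allocator: a large cached block can be split and reused for later, smaller allocations, but many small cached blocks cannot be merged back into one large block. A rough standalone illustration of that reuse, not part of the commit (exact reserved sizes depend on the allocator configuration and the GPU):

import torch

assert torch.cuda.is_available()
torch.cuda.empty_cache()

# Allocate the "largest batch" first so one big block ends up in the allocator cache.
big = torch.empty(64 * 1024 * 1024, device="cuda")  # ~256 MiB of float32
del big
reserved_after_big = torch.cuda.memory_reserved()

# Smaller allocations can typically be carved out of that cached block,
# so the reserved pool barely grows.
small = [torch.empty(4 * 1024 * 1024, device="cuda") for _ in range(8)]  # 8 x ~16 MiB
print(torch.cuda.memory_reserved() - reserved_after_big)  # usually 0, or close to it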