From 22327c8238538edbbf91ad647ea9483f65f13eaf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 07:44:18 +0000 Subject: [PATCH 01/12] Initial plan From 18802b53a0961fb856163b7bb2b841e8d60957b6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 07:50:27 +0000 Subject: [PATCH 02/12] Add all_routed_experts support in ascend cudagraph Co-authored-by: jinminxi104 <18713681+jinminxi104@users.noreply.github.com> --- .../cudagraph/ascend_cudagraph.py | 43 ++++++++++++++++--- 1 file changed, 38 insertions(+), 5 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index 37c975e2..3c129b70 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -141,9 +141,35 @@ def AscendCudaGraphMixin_update_context_cudagraph(self, graph_meta, context): context.kv_start_indices = input_buffers["kv_start_indices"] +def AscendCudaGraphMixin_make_output_buffers(self, output): + """Make output buffers.""" + if isinstance(output, torch.Tensor): + output_buffers = dict(hidden_states=output) + else: + assert isinstance(output, Dict) + output_buffers = output + return output_buffers + + +def AscendCudaGraphMixin_get_outputs_cudagraph( + self, output_buffers: Dict[str, Tensor], input_ids: Tensor, **kwargs +): + """Get outputs from buffers.""" + num_tokens = input_ids.size(-1) + outputs = dict() + outputs["hidden_states"] = output_buffers["hidden_states"][:, :num_tokens] + if output_buffers.get("all_routed_experts", None) is not None: + outputs["all_routed_experts"] = output_buffers["all_routed_experts"][ + :num_tokens, ... + ].clone() + return outputs + + CudaGraphMixin.make_buffers_cudagraph = AscendCudaGraphMixin_make_buffers_cudagraph CudaGraphMixin.fill_buffers_cudagraph = AscendCudaGraphMixin_fill_buffers_cudagraph CudaGraphMixin.update_context_cudagraph = AscendCudaGraphMixin_update_context_cudagraph +CudaGraphMixin.make_output_buffers = AscendCudaGraphMixin_make_output_buffers +CudaGraphMixin.get_outputs_cudagraph = AscendCudaGraphMixin_get_outputs_cudagraph def next_power_of_2(n: int): @@ -248,6 +274,10 @@ def capture(self, **kwargs): self.model.update_context_cudagraph(self.meta, context) current_stream = torch.cuda.current_stream() + # warmup + warmup_output = self.model(**padded_kwargs) + warmup_buffers = self.model.make_output_buffers(warmup_output) + aclgraph = torch.npu.NPUGraph() with ExitStack() as stack: with torch.npu.graph( @@ -258,15 +288,15 @@ def capture(self, **kwargs): ): output = self.model(**padded_kwargs) - output_buffers = dict(logits=output) + output_buffers = self.model.make_output_buffers(output) self.meta.output_buffers = output_buffers self._graph = aclgraph + output = self.model.get_outputs_cudagraph(warmup_buffers, **kwargs) return output @record_function("forward_cudagraph") def forward(self, **kwargs): """forward.""" - num_tokens = kwargs["input_ids"].size(-1) assert self._graph is not None self.model.fill_buffers_cudagraph(self.meta, **kwargs) context = self.ctx_mgr.current_context() @@ -281,7 +311,8 @@ def forward(self, **kwargs): else: update_attn_params(self.update_stream, self.meta, self.max_tokens) self._graph.replay() - output = self.meta.output_buffers["logits"][:, :num_tokens] + output_buffers = self.meta.output_buffers + output = self.model.get_outputs_cudagraph(output_buffers, **kwargs) return output def reset(self): @@ -368,7 +399,7 @@ def __call__(self, **kwargs): if not enable_graph: with record_function("forward_eager"): ret = self.model(**kwargs) - return ret + return self.model.make_output_buffers(ret) graph_key = self.get_graph_key(**kwargs) max_tokens = graph_key[0] @@ -387,9 +418,11 @@ def __call__(self, **kwargs): update_stream=self.update_stream, ) AscendGraphRunner.capturing = True - runner.capture(**kwargs) + output = runner.capture(**kwargs) AscendGraphRunner.capturing = False self._runner_map[graph_key] = runner + # SSM would update the state in capture(warmup), replay the graph will leads unexpected state update. + return output else: runner = self._runner_map[graph_key] output = runner.forward(**kwargs) From 00ddf4b32043fb105350c11becfd0ae865240515 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 07:55:12 +0000 Subject: [PATCH 03/12] Add clarifying comment for ellipsis in all_routed_experts slicing Co-authored-by: jinminxi104 <18713681+jinminxi104@users.noreply.github.com> --- dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index 3c129b70..7bce7e1b 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -159,6 +159,8 @@ def AscendCudaGraphMixin_get_outputs_cudagraph( outputs = dict() outputs["hidden_states"] = output_buffers["hidden_states"][:, :num_tokens] if output_buffers.get("all_routed_experts", None) is not None: + # Use ellipsis to preserve all dimensions after token dimension + # Shape can vary: [num_tokens, num_experts] or [num_tokens, top_k, ...] outputs["all_routed_experts"] = output_buffers["all_routed_experts"][ :num_tokens, ... ].clone() From d88c993f0d71e4c04f4aeeb4f68beab859bdba0d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 07:56:51 +0000 Subject: [PATCH 04/12] Address code review feedback: simplify condition and fix grammar Co-authored-by: jinminxi104 <18713681+jinminxi104@users.noreply.github.com> --- dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index 7bce7e1b..b380bff0 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -158,7 +158,7 @@ def AscendCudaGraphMixin_get_outputs_cudagraph( num_tokens = input_ids.size(-1) outputs = dict() outputs["hidden_states"] = output_buffers["hidden_states"][:, :num_tokens] - if output_buffers.get("all_routed_experts", None) is not None: + if "all_routed_experts" in output_buffers: # Use ellipsis to preserve all dimensions after token dimension # Shape can vary: [num_tokens, num_experts] or [num_tokens, top_k, ...] outputs["all_routed_experts"] = output_buffers["all_routed_experts"][ @@ -423,7 +423,7 @@ def __call__(self, **kwargs): output = runner.capture(**kwargs) AscendGraphRunner.capturing = False self._runner_map[graph_key] = runner - # SSM would update the state in capture(warmup), replay the graph will leads unexpected state update. + # SSM would update the state in capture(warmup), replay the graph will lead to unexpected state update. return output else: runner = self._runner_map[graph_key] From d5c5ad78c28e448b0697dc648bcc17e3eb196ccb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 07:58:00 +0000 Subject: [PATCH 05/12] Fix grammar in comment for better clarity Co-authored-by: jinminxi104 <18713681+jinminxi104@users.noreply.github.com> --- dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index b380bff0..f46f2366 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -423,7 +423,7 @@ def __call__(self, **kwargs): output = runner.capture(**kwargs) AscendGraphRunner.capturing = False self._runner_map[graph_key] = runner - # SSM would update the state in capture(warmup), replay the graph will lead to unexpected state update. + # SSM would update the state in capture(warmup), replaying the graph would lead to unexpected state update. return output else: runner = self._runner_map[graph_key] From 2cbe4ffa4f189b52f7389baa252ee3abaec86a47 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 07:59:58 +0000 Subject: [PATCH 06/12] Improve code clarity: add assertion message and use descriptive variable names Co-authored-by: jinminxi104 <18713681+jinminxi104@users.noreply.github.com> --- .../lmdeploy_ext/cudagraph/ascend_cudagraph.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index f46f2366..cd59ab4a 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -146,7 +146,7 @@ def AscendCudaGraphMixin_make_output_buffers(self, output): if isinstance(output, torch.Tensor): output_buffers = dict(hidden_states=output) else: - assert isinstance(output, Dict) + assert isinstance(output, Dict), f"Expected output to be Tensor or Dict, got {type(output)}" output_buffers = output return output_buffers @@ -288,13 +288,13 @@ def capture(self, **kwargs): pool=self.pool, stream=current_stream, ): - output = self.model(**padded_kwargs) + graph_output = self.model(**padded_kwargs) - output_buffers = self.model.make_output_buffers(output) + output_buffers = self.model.make_output_buffers(graph_output) self.meta.output_buffers = output_buffers self._graph = aclgraph - output = self.model.get_outputs_cudagraph(warmup_buffers, **kwargs) - return output + final_output = self.model.get_outputs_cudagraph(warmup_buffers, **kwargs) + return final_output @record_function("forward_cudagraph") def forward(self, **kwargs): From f3b1ebceddd76358f2d768515854d47cb079dcc6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 08:01:57 +0000 Subject: [PATCH 07/12] Add type hints for better API clarity Co-authored-by: jinminxi104 <18713681+jinminxi104@users.noreply.github.com> --- .../framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index cd59ab4a..6e7a1a25 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -1,7 +1,7 @@ # Copyright (c) 2024, OpenMMLab and DeepLink. All rights reserved. # this file implements the cudagraph for ascend backend. import functools -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from dataclasses import dataclass from contextlib import ExitStack from packaging.version import InvalidVersion, Version @@ -141,7 +141,9 @@ def AscendCudaGraphMixin_update_context_cudagraph(self, graph_meta, context): context.kv_start_indices = input_buffers["kv_start_indices"] -def AscendCudaGraphMixin_make_output_buffers(self, output): +def AscendCudaGraphMixin_make_output_buffers( + self, output: Union[torch.Tensor, Dict[str, torch.Tensor]] +) -> Dict[str, torch.Tensor]: """Make output buffers.""" if isinstance(output, torch.Tensor): output_buffers = dict(hidden_states=output) @@ -153,7 +155,7 @@ def AscendCudaGraphMixin_make_output_buffers(self, output): def AscendCudaGraphMixin_get_outputs_cudagraph( self, output_buffers: Dict[str, Tensor], input_ids: Tensor, **kwargs -): +) -> Dict[str, Tensor]: """Get outputs from buffers.""" num_tokens = input_ids.size(-1) outputs = dict() From 4af6d08f9f18c48d2b2ead3681c9b7b64f416de0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 27 Dec 2025 08:03:24 +0000 Subject: [PATCH 08/12] Final polish: improve error message and comment clarity Co-authored-by: jinminxi104 <18713681+jinminxi104@users.noreply.github.com> --- .../framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index 6e7a1a25..c72a8f83 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -148,7 +148,9 @@ def AscendCudaGraphMixin_make_output_buffers( if isinstance(output, torch.Tensor): output_buffers = dict(hidden_states=output) else: - assert isinstance(output, Dict), f"Expected output to be Tensor or Dict, got {type(output)}" + assert isinstance( + output, Dict + ), f"Expected output to be torch.Tensor or Dict, got {type(output)}" output_buffers = output return output_buffers @@ -425,7 +427,7 @@ def __call__(self, **kwargs): output = runner.capture(**kwargs) AscendGraphRunner.capturing = False self._runner_map[graph_key] = runner - # SSM would update the state in capture(warmup), replaying the graph would lead to unexpected state update. + # SSM updates the state in capture(warmup), so replaying the graph would lead to unexpected state update. return output else: runner = self._runner_map[graph_key] From 7ddde69411a8451de26cfe3ffdffcf5ce93635f1 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Sun, 28 Dec 2025 01:21:16 +0800 Subject: [PATCH 09/12] Refactor capturing logic in AscendGraphRunner Removed output assignment from runner.capture call. --- dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index c72a8f83..7f542ddc 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -424,11 +424,9 @@ def __call__(self, **kwargs): update_stream=self.update_stream, ) AscendGraphRunner.capturing = True - output = runner.capture(**kwargs) + runner.capture(**kwargs) AscendGraphRunner.capturing = False self._runner_map[graph_key] = runner - # SSM updates the state in capture(warmup), so replaying the graph would lead to unexpected state update. - return output else: runner = self._runner_map[graph_key] output = runner.forward(**kwargs) From af47cbef41f76338694252880829d24cdb560c20 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Sun, 28 Dec 2025 01:31:17 +0800 Subject: [PATCH 10/12] Remove unused output buffer methods from CudaGraphMixin Removed make_output_buffers and get_outputs_cudagraph methods from CudaGraphMixin. --- .../cudagraph/ascend_cudagraph.py | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index 7f542ddc..c610b0af 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -141,41 +141,9 @@ def AscendCudaGraphMixin_update_context_cudagraph(self, graph_meta, context): context.kv_start_indices = input_buffers["kv_start_indices"] -def AscendCudaGraphMixin_make_output_buffers( - self, output: Union[torch.Tensor, Dict[str, torch.Tensor]] -) -> Dict[str, torch.Tensor]: - """Make output buffers.""" - if isinstance(output, torch.Tensor): - output_buffers = dict(hidden_states=output) - else: - assert isinstance( - output, Dict - ), f"Expected output to be torch.Tensor or Dict, got {type(output)}" - output_buffers = output - return output_buffers - - -def AscendCudaGraphMixin_get_outputs_cudagraph( - self, output_buffers: Dict[str, Tensor], input_ids: Tensor, **kwargs -) -> Dict[str, Tensor]: - """Get outputs from buffers.""" - num_tokens = input_ids.size(-1) - outputs = dict() - outputs["hidden_states"] = output_buffers["hidden_states"][:, :num_tokens] - if "all_routed_experts" in output_buffers: - # Use ellipsis to preserve all dimensions after token dimension - # Shape can vary: [num_tokens, num_experts] or [num_tokens, top_k, ...] - outputs["all_routed_experts"] = output_buffers["all_routed_experts"][ - :num_tokens, ... - ].clone() - return outputs - - CudaGraphMixin.make_buffers_cudagraph = AscendCudaGraphMixin_make_buffers_cudagraph CudaGraphMixin.fill_buffers_cudagraph = AscendCudaGraphMixin_fill_buffers_cudagraph CudaGraphMixin.update_context_cudagraph = AscendCudaGraphMixin_update_context_cudagraph -CudaGraphMixin.make_output_buffers = AscendCudaGraphMixin_make_output_buffers -CudaGraphMixin.get_outputs_cudagraph = AscendCudaGraphMixin_get_outputs_cudagraph def next_power_of_2(n: int): From 0278be6f232bcfa3e20cb74c0626914f0b6ac0bd Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Sun, 28 Dec 2025 16:15:05 +0000 Subject: [PATCH 11/12] fix --- dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index c610b0af..bb9923a3 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -254,6 +254,7 @@ def capture(self, **kwargs): aclgraph = torch.npu.NPUGraph() with ExitStack() as stack: + AscendGraphRunner.capturing = True with torch.npu.graph( aclgraph, auto_dispatch_capture=True, @@ -261,6 +262,7 @@ def capture(self, **kwargs): stream=current_stream, ): graph_output = self.model(**padded_kwargs) + AscendGraphRunner.capturing = False output_buffers = self.model.make_output_buffers(graph_output) self.meta.output_buffers = output_buffers @@ -391,9 +393,7 @@ def __call__(self, **kwargs): device=self.device, update_stream=self.update_stream, ) - AscendGraphRunner.capturing = True runner.capture(**kwargs) - AscendGraphRunner.capturing = False self._runner_map[graph_key] = runner else: runner = self._runner_map[graph_key] From 4054c25ebbdd2b32c9c7d13a477a5f78013435a3 Mon Sep 17 00:00:00 2001 From: jinminxi104 Date: Mon, 29 Dec 2025 00:44:30 +0800 Subject: [PATCH 12/12] Update dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py index bb9923a3..f9f95976 100644 --- a/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py +++ b/dlinfer/framework/lmdeploy_ext/cudagraph/ascend_cudagraph.py @@ -1,7 +1,7 @@ # Copyright (c) 2024, OpenMMLab and DeepLink. All rights reserved. # this file implements the cudagraph for ascend backend. import functools -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from dataclasses import dataclass from contextlib import ExitStack from packaging.version import InvalidVersion, Version