Commit 10fe499

Merge branch 'develop' into add_clear_run_batch_ci
2 parents: 7b9fc83 + b467e9d

File tree

11 files changed: +406, -6 lines


fastdeploy/config.py

Lines changed: 5 additions & 0 deletions
@@ -540,6 +540,11 @@ def __init__(
         self.expert_parallel_size = 1  # EP degree
         self.data_parallel_size = 1  # DP degree
         self.enable_expert_parallel = False
+        self.enable_chunked_moe = False
+        self.chunked_moe_size = 256
+        self.max_moe_num_chunk = 1
+        self.moe_num_chunk = 1
         self.local_data_parallel_id = 0
         # Engine worker queue port
         self.engine_worker_queue_port: str = "9923"
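For orientation: chunked_moe_size is the user-facing knob, while moe_num_chunk and max_moe_num_chunk are derived each step, the former from the local token count and the latter as the maximum across expert-parallel ranks. A minimal sketch of the per-rank derivation, mirroring the ceil-division added to gpu_model_runner.py further down (derive_moe_chunks is an illustrative helper, not part of this commit):

    def derive_moe_chunks(token_num: int, chunk_size: int = 256) -> int:
        # Ceil-divide the local token count by the chunk size; a batch that
        # fits in one chunk (or is empty) still counts as a single chunk.
        if token_num > chunk_size:
            return (token_num + chunk_size - 1) // chunk_size
        return 1

    # e.g. derive_moe_chunks(1000) == 4, derive_moe_chunks(200) == 1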

fastdeploy/engine/args_utils.py

Lines changed: 22 additions & 0 deletions
@@ -286,6 +286,16 @@ class EngineArgs:
     Enable expert parallelism.
     """

+    enable_chunked_moe: bool = False
+    """
+    Whether use chunked moe.
+    """
+
+    chunked_moe_size: int = 256
+    """
+    Chunk size of moe input.
+    """
+
     cache_transfer_protocol: str = "ipc"
     """
     Protocol to use for cache transfer.
@@ -870,6 +880,18 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             default=EngineArgs.eplb_config,
             help="Config of eplb.",
         )
+        parallel_group.add_argument(
+            "--enable-chunked-moe",
+            action="store_true",
+            default=EngineArgs.enable_chunked_moe,
+            help="Use chunked moe.",
+        )
+        parallel_group.add_argument(
+            "--chunked-moe-size",
+            type=int,
+            default=EngineArgs.chunked_moe_size,
+            help="Chunked size of moe input.",
+        )

         # Load group
         load_group = parser.add_argument_group("Load Configuration")
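Usage note: the two new fields follow the existing EngineArgs dataclass pattern, so they can be set from the CLI flags above or when constructing EngineArgs programmatically. A hedged sketch, assuming a model path field exists on EngineArgs (only the chunked-MoE and expert-parallel fields are taken from this diff):

    from fastdeploy.engine.args_utils import EngineArgs

    # Chunked MoE is only consulted together with expert parallelism
    # (see the use_ep check added in moe.py below).
    args = EngineArgs(
        model="./my-moe-model",       # placeholder path, assumed field
        enable_expert_parallel=True,
        enable_chunked_moe=True,      # new flag from this commit
        chunked_moe_size=512,         # override the 256 default
    )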

fastdeploy/engine/async_llm.py

Lines changed: 1 addition & 0 deletions
@@ -812,6 +812,7 @@ def _start_worker_service(self):
             f" --splitwise_role {self.cfg.scheduler_config.splitwise_role}"
             f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
+            f" --chunked_moe_size {self.cfg.parallel_config.chunked_moe_size}"
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
             f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"

fastdeploy/engine/engine.py

Lines changed: 2 additions & 0 deletions
@@ -544,6 +544,7 @@ def _start_worker_service(self):
             f" --splitwise_role {self.cfg.scheduler_config.splitwise_role}"
             f" --kv_cache_ratio {self.cfg.cache_config.kv_cache_ratio}"
             f" --expert_parallel_size {self.cfg.parallel_config.expert_parallel_size}"
+            f" --chunked_moe_size {self.cfg.parallel_config.chunked_moe_size}"
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
             f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"
@@ -573,6 +574,7 @@ def _start_worker_service(self):

         worker_store_true_flag = {
             "enable_expert_parallel": self.cfg.parallel_config.enable_expert_parallel,
+            "enable_chunked_moe": self.cfg.parallel_config.enable_chunked_moe,
             "enable_prefix_caching": self.cfg.cache_config.enable_prefix_caching,
             "enable_chunked_prefill": self.cfg.cache_config.enable_chunked_prefill,
             "do_profile": self.do_profile,

fastdeploy/model_executor/layers/moe/moe.py

Lines changed: 56 additions & 1 deletion
@@ -612,6 +612,7 @@ def forward_split_allgather(self, x: paddle.Tensor, gate: nn.Layer):
         multi_outs = paddle.zeros([token_num_per_rank * self.tp_size, x.shape[1]], dtype=x.dtype)
         paddle.distributed.all_gather(multi_outs, out, self.tp_group)
         out = multi_outs[:token_num, :]
+
         return out

     def forward(self, x: paddle.Tensor, gate: nn.Layer):
@@ -633,9 +634,63 @@ def forward(self, x: paddle.Tensor, gate: nn.Layer):
             and token_num >= self.tp_size
         ):
             out = self.forward_split_allgather(x, gate)
+        elif self.fd_config.parallel_config.use_ep and self.fd_config.parallel_config.enable_chunked_moe:
+            out = self.forward_chunked_moe(x, gate)
         else:
-            out = self.quant_method.apply(self, x, gate)
+            out = self.forward_normal(x, gate)

         if self.reduce_results and self.tp_size > 1:
             out = tensor_model_parallel_all_reduce(out, self.tp_group)
         return out
+
+    def forward_chunked_moe(self, x: paddle.Tensor, gate: nn.Layer):
+        """
+        Split input to multi chunk to reduce the memory usage of moe.
+
+        Args:
+            x (Tensor): Input tensor to the moe layer.
+
+        Returns:
+            Tensor: Output tensor.s
+        """
+        chunk_size = self.fd_config.parallel_config.chunked_moe_size
+        token_num = x.shape[0]
+        fake_x = paddle.empty(
+            shape=[0, self.fd_config.model_config.hidden_size],
+            dtype=paddle.get_default_dtype(),
+        )
+        # input size that are less than a chunk, less than the max size data or empty input
+        # need to be repeated until the max chunk data infer MOE finished.
+        if token_num > chunk_size:  # chunked moe
+            x_split_list = paddle.tensor_split(x, self.fd_config.parallel_config.moe_num_chunk, axis=0)
+            out_split_list = [None] * self.fd_config.parallel_config.moe_num_chunk
+
+            for i in range(self.fd_config.parallel_config.max_moe_num_chunk):
+                if i < self.fd_config.parallel_config.moe_num_chunk:
+                    out_split_list[i] = self.quant_method.apply(self, x_split_list[i], gate)
+                else:
+                    # just need to use real data to infer max_moe_num_chunk times.
+                    self.quant_method.apply(self, fake_x, gate)
+
+            out = paddle.concat(out_split_list, axis=0)
+        else:
+            # when only one chunk, just need to use real data to infer once.
+            out = self.quant_method.apply(self, x, gate)
+            for i in range(self.fd_config.parallel_config.max_moe_num_chunk - 1):
+                self.quant_method.apply(self, fake_x, gate)
+
+        return out
+
+    def forward_normal(self, x: paddle.Tensor, gate: nn.Layer):
+        """
+        Normal mode of forward.
+
+        Args:
+            x (Tensor): Input tensor to the moe layer.
+
+        Returns:
+            Tensor: Output tensor.s
+
+        """
+        out = self.quant_method.apply(self, x, gate)
+        return out
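The loop above encodes the key constraint of chunked MoE under expert parallelism: every rank must call into quant_method.apply the same number of times per layer, because the dispatch/combine inside it is collective, so ranks with fewer real chunks pad the remaining iterations with an empty fake_x. A stripped-down sketch of that pattern, assuming a standalone expert_fn in place of quant_method.apply (names are illustrative, not part of the commit):

    import paddle

    def chunked_expert_forward(x, expert_fn, local_chunks, global_max_chunks, hidden_size):
        # Empty tensor used for the padding calls so the collectives stay in lock-step.
        fake_x = paddle.empty(shape=[0, hidden_size], dtype=x.dtype)
        if local_chunks > 1:
            pieces = paddle.tensor_split(x, local_chunks, axis=0)
            outs = [expert_fn(p) for p in pieces]
            # Ranks that have run out of real chunks keep issuing empty calls.
            for _ in range(global_max_chunks - local_chunks):
                expert_fn(fake_x)
            return paddle.concat(outs, axis=0)
        out = expert_fn(x)
        for _ in range(global_max_chunks - 1):
            expert_fn(fake_x)
        return out

In the committed code the real and padding calls share a single loop over max_moe_num_chunk iterations, which has the same effect.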

fastdeploy/worker/gpu_model_runner.py

Lines changed: 62 additions & 4 deletions
@@ -95,7 +95,11 @@
 from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import ScatterOp
 from fastdeploy.model_executor.models.interfaces_base import FdModelForPooling
 from fastdeploy.output.pooler import PoolerOutput
-from fastdeploy.worker.model_runner_base import ModelRunnerBase
+from fastdeploy.worker.model_runner_base import (
+    DistributedOut,
+    DistributedStatus,
+    ModelRunnerBase,
+)
 from fastdeploy.worker.output import LogprobsTensors, ModelOutputData, ModelRunnerOutput


@@ -250,6 +254,56 @@ def only_prefill(self):

         return if_only_prefill

+    def collect_distributed_status(self):
+        """
+        Collect distributed status
+        """
+        dist_status_list = []
+        dist_status_obj = DistributedStatus()
+        dist_out = DistributedOut()
+
+        prefill_exists = None
+        if_only_decode = True
+        # mix ep in single node
+        if self.fd_config.parallel_config.use_ep and self.fd_config.scheduler_config.splitwise_role == "mixed":
+            prefill_exists = self.exist_prefill()
+            dist_status_obj.only_decode = not prefill_exists
+
+        # whether chunked moe
+        if self.fd_config.parallel_config.enable_chunked_moe:
+            chunk_size = self.fd_config.parallel_config.chunked_moe_size
+            token_num = self.share_inputs["ids_remove_padding"].shape[0]
+
+            if token_num > chunk_size:
+                self.fd_config.parallel_config.moe_num_chunk = (token_num + chunk_size - 1) // chunk_size
+            else:
+                self.fd_config.parallel_config.moe_num_chunk = 1
+
+            dist_status_obj.moe_num_chunk = self.fd_config.parallel_config.moe_num_chunk
+
+        # only ep need to collect and sync distributed status
+        if self.fd_config.parallel_config.use_ep and self.fd_config.scheduler_config.splitwise_role == "mixed":
+            # call once to gather all status
+            paddle.distributed.all_gather_object(dist_status_list, dist_status_obj)
+
+            # Update Batch type for cuda graph for if_only_decode
+            if_only_decode = all(dist_status.only_decode for dist_status in dist_status_list)
+
+            if_only_decode = if_only_decode and not (
+                prefill_exists if prefill_exists is not None else self.exist_prefill()
+            )
+
+            max_moe_num_chunk = None
+            if self.fd_config.parallel_config.enable_chunked_moe:
+                max_moe_num_chunk = max(dist_status.moe_num_chunk for dist_status in dist_status_list)
+
+            dist_out = DistributedOut(
+                if_only_decode=if_only_decode,
+                max_moe_num_chunk=max_moe_num_chunk,
+            )
+
+        return dist_out
+
     def only_decode(self):
         """
         check whether decode only
@@ -1355,7 +1409,7 @@ def get_model(self) -> nn.Layer:

     def initialize_forward_meta(self, is_dummy_or_profile_run=False):
         """
-        Initialize forward meta and attention meta data
+        Initialize forward meta, attention meta data and update some config.
         """
         # Initialize forward meta
         self.forward_meta = ForwardMeta(
@@ -1386,8 +1440,12 @@ def initialize_forward_meta(self, is_dummy_or_profile_run=False):
             kv_num_blocks_x_cpu=self.share_inputs["kv_num_blocks_x_cpu"],
         )

-        # Update Batch type for cuda graph for only_decode_batch
-        if_only_decode = self.only_decode()
+        dist_status = self.collect_distributed_status()
+
+        if_only_decode = dist_status.if_only_decode
+        if self.fd_config.parallel_config.enable_chunked_moe:
+            self.fd_config.parallel_config.max_moe_num_chunk = dist_status.max_moe_num_chunk
+
         only_decode_use_cudagraph = self.use_cudagraph and if_only_decode

         # Update config about moe for better performance
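To make the agreement concrete, here is a local, non-distributed rendering of what the all_gather_object round above would produce for two expert-parallel ranks with chunked_moe_size=256; the token counts are made up for illustration:

    from fastdeploy.worker.model_runner_base import DistributedStatus

    # rank 0 holds 600 prefill tokens -> 3 chunks; rank 1 holds 100 decode tokens -> 1 chunk
    statuses = [
        DistributedStatus(only_decode=False, moe_num_chunk=3),
        DistributedStatus(only_decode=True, moe_num_chunk=1),
    ]
    if_only_decode = all(s.only_decode for s in statuses)       # False
    max_moe_num_chunk = max(s.moe_num_chunk for s in statuses)  # 3

    # Both ranks then run 3 MoE passes per layer; rank 1 pads its last two
    # passes with the empty fake_x input from forward_chunked_moe.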

fastdeploy/worker/model_runner_base.py

Lines changed: 14 additions & 0 deletions
@@ -15,6 +15,8 @@
 """

 from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Optional

 from paddle import nn

@@ -25,6 +27,18 @@
 logger = get_logger("model_runner_base", "model_runner_base.log")


+@dataclass
+class DistributedStatus:
+    only_decode: bool = True
+    moe_num_chunk: int = 1
+
+
+@dataclass
+class DistributedOut:
+    if_only_decode: bool = True
+    max_moe_num_chunk: Optional[int] = None
+
+
 class ModelRunnerBase(ABC):
     """
     Engine -> (WIP)Executor -> Worker -> ModelRunner -> Model

fastdeploy/worker/worker_process.py

Lines changed: 11 additions & 0 deletions
@@ -720,6 +720,17 @@ def parse_args():
         action="store_true",
         help="enable expert parallel",
     )
+    parser.add_argument(
+        "--enable_chunked_moe",
+        action="store_true",
+        help="enable chunked moe",
+    )
+    parser.add_argument(
+        "--chunked_moe_size",
+        type=int,
+        default=256,
+        help="chunk size of moe input",
+    )
     parser.add_argument("--ori_vocab_size", type=int, default=None)
     parser.add_argument("--think_end_id", type=int, default=-1)
     parser.add_argument("--image_patch_id", type=int, default=-1)

tests/ci_use/XPU_45T/run_w4a8.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def test_w4a8():
     )
     print(response.choices[0].message.content)
     # print(base_response)
-    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言"])
+    assert any(keyword in response.choices[0].message.content for keyword in ["人工智能", "文心一言", "小度"])


 if __name__ == "__main__":
