From 73384a6bae02db9fac76efc604b4f93c42e68ece Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 11:14:57 +0800 Subject: [PATCH 01/24] delete impl --- .../model_executor/pre_and_post_process.py | 42 -------------- fastdeploy/worker/gcu_model_runner.py | 8 --- fastdeploy/worker/gpu_model_runner.py | 49 ---------------- fastdeploy/worker/metax_model_runner.py | 21 ------- fastdeploy/worker/output.py | 20 ------- fastdeploy/worker/xpu_model_runner.py | 56 ------------------- 6 files changed, 196 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 65948ea7d2a..34685b52019 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -194,48 +194,6 @@ def post_process_normal( zmq_client: ZmqIpcClient = None, ) -> ModelRunnerOutput: """Post-processing steps after completing a single token generation.""" - # handle vl: - if model_output.think_end_id != -1: - thinking_mask = model_output.enable_thinking[: sampler_output.sampled_token_ids.shape[0]] - exists_think_end = (sampler_output.sampled_token_ids == model_output.think_end_id) & thinking_mask - paddle.assign( - paddle.where( - exists_think_end, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - reasoning_index_update_cond = model_output.need_think_end.cast("bool") & thinking_mask - paddle.assign( - paddle.where( - reasoning_index_update_cond, - model_output.reasoning_index - 1, - model_output.reasoning_index, - ), - model_output.reasoning_index, - ) - - stop_wo_think = ( - (sampler_output.sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) - | (model_output.reasoning_index == 0) - ) & (model_output.need_think_end > 0) - - stop_wo_think = stop_wo_think & thinking_mask - sampler_output.sampled_token_ids = paddle.where( - stop_wo_think, - model_output.think_end_id, - sampler_output.sampled_token_ids, - ) - paddle.assign( - paddle.where( - stop_wo_think, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) # 1. Set stop value paddle.assign( paddle.where( diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 5c0580ea898..36c87c03e85 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -837,10 +837,6 @@ def _dummy_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), ) post_process( @@ -1065,10 +1061,6 @@ class at the server level, which is too granular for ModelRunner. 
), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), ) if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill": diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 1c55b467945..c6034a725a5 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -259,13 +259,9 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - return ( self.guided_backend.get_logits_processor( schemata_key=schemata_key, - enable_thinking=enable_thinking, ), schemata_key, ) @@ -327,23 +323,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = position_ids, request.get("max_tokens", 2048) ) - if request.get("enable_thinking", False): - # Enable thinking - req_reasoning_max_tokens = request.get("reasoning_max_tokens") - req_max_tokens = request.get("max_tokens") - final_reasoning_tokens = ( - req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens - ) - - self.share_inputs["enable_thinking"][idx : idx + 1] = True - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens - else: - # Disable thinking - self.share_inputs["enable_thinking"][idx : idx + 1] = False - self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 - if isinstance(request.prompt_token_ids, np.ndarray): prompt_token_ids = request.prompt_token_ids.tolist() else: @@ -567,23 +546,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 - if request.get("enable_thinking", False): - # Enable thinking - req_reasoning_max_tokens = request.get("reasoning_max_tokens") - req_max_tokens = request.get("max_tokens") - final_reasoning_tokens = ( - req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens - ) - - self.share_inputs["enable_thinking"][idx : idx + 1] = True - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens - else: - # Disable thinking - self.share_inputs["enable_thinking"][idx : idx + 1] = False - self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 - def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) if res is not None: @@ -879,9 +841,6 @@ def _init_share_inputs(self, max_num_seqs: int): tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) # Initialize thinking related buffers - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - 
self.share_inputs["enable_thinking"] = paddle.full(shape=[max_num_seqs, 1], fill_value=False, dtype="bool") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") # TODO(gongshaotian): move to models if not self.enable_mm: @@ -1457,10 +1416,6 @@ def _dummy_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=self.share_inputs["enable_thinking"], - think_end_id=self.model_config.think_end_id, - need_think_end=self.share_inputs["need_think_end"], - reasoning_index=self.share_inputs["reasoning_index"], stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) @@ -1811,10 +1766,6 @@ class at the server level, which is too granular for ModelRunner. ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=self.share_inputs["enable_thinking"], - think_end_id=self.model_config.think_end_id, - need_think_end=self.share_inputs["need_think_end"][:num_running_requests], - reasoning_index=self.share_inputs["reasoning_index"][:num_running_requests], stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index dcce154ea51..3ea98654ee5 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -242,11 +242,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = else: position_ids = None - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -459,11 +454,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -779,9 +769,6 @@ def _init_share_inputs(self, max_num_seqs: int): dtype="float32", ) self.share_inputs["image_features"] = None - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") def _prepare_inputs(self) -> None: """Prepare the model inputs""" @@ -1133,10 +1120,6 @@ def _dummy_run( ), 
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) @@ -1401,10 +1384,6 @@ class at the server level, which is too granular for ModelRunner. ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) diff --git a/fastdeploy/worker/output.py b/fastdeploy/worker/output.py index 6d820a873a4..9b110628595 100644 --- a/fastdeploy/worker/output.py +++ b/fastdeploy/worker/output.py @@ -220,26 +220,6 @@ class ModelOutputData: """ accept_num: paddle.Tensor - """ - vl model enable to think - """ - enable_thinking: paddle.Tensor = None - - """ - vl model think end id - """ - think_end_id: int = -1 - - """ - vl model need to think - """ - need_think_end: paddle.Tensor = None - - """ - vl model reasoning index - """ - reasoning_index: paddle.Tensor = None - """ the token ids of stop sequence """ diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index f04680498c0..1ea3266f681 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -190,45 +190,6 @@ def xpu_post_process( update_inputs, ) - # handle vl: - if model_output.enable_thinking: - exists_think_end = sampled_token_ids == model_output.think_end_id - paddle.assign( - paddle.where( - exists_think_end, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - paddle.assign( - paddle.where( - model_output.need_think_end.cast("bool"), - model_output.reasoning_index - 1, - model_output.reasoning_index, - ), - model_output.reasoning_index, - ) - - stop_wo_think = ( - (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) - | (model_output.reasoning_index == 0) - ) & (model_output.need_think_end > 0) - sampled_token_ids = paddle.where( - stop_wo_think, - model_output.think_end_id, - sampled_token_ids, - ) - paddle.assign( - paddle.where( - stop_wo_think, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - # 1. 
Set stop value paddle.assign( paddle.where( @@ -455,11 +416,6 @@ def insert_tasks_v1(self, req_dicts: List[Request]): else: position_ids = None - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -594,11 +550,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -813,9 +764,6 @@ def _init_share_inputs(self, max_num_seqs: int): dtype="float32", ) self.share_inputs["image_features"] = None - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") def _prepare_inputs(self, is_dummy_run=False) -> None: """Prepare the model inputs""" @@ -1112,10 +1060,6 @@ class at the server level, which is too granular for ModelRunner. 
actual_draft_token_num=None, accept_tokens=None, accept_num=None, - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) From fe924357c7a03bfea87a0844f049bdd298f87e78 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 11:28:23 +0800 Subject: [PATCH 02/24] delete min_length&max_length --- fastdeploy/worker/gcu_model_runner.py | 4 ---- fastdeploy/worker/gpu_model_runner.py | 4 ---- fastdeploy/worker/hpu_model_runner.py | 4 ---- fastdeploy/worker/metax_model_runner.py | 4 ---- fastdeploy/worker/xpu_model_runner.py | 4 ---- 5 files changed, 20 deletions(-) diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 36c87c03e85..75c1ad40797 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -383,10 +383,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.seq_lens_this_time_buffer = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index c6034a725a5..9b2e531b880 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -776,10 +776,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.seq_lens_this_time_buffer = paddle.full([max_num_seqs, 1], 0, dtype="int32") if self.fd_config.parallel_config.enable_expert_parallel: self.share_inputs["seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py index 56f84fd86da..317eb0a7398 100644 --- a/fastdeploy/worker/hpu_model_runner.py +++ b/fastdeploy/worker/hpu_model_runner.py @@ -591,10 +591,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] 
= paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index 3ea98654ee5..b88a16579c3 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -628,10 +628,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.seq_lens_this_time_buffer = paddle.full([max_num_seqs, 1], 0, dtype="int32") if self.fd_config.parallel_config.enable_expert_parallel: self.share_inputs["seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 1ea3266f681..87bf5dd4331 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -670,10 +670,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") From 1b289b688142784fcd293da50eb09be9401ac343 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 16:06:24 +0800 Subject: [PATCH 03/24] support limit thinking content strategy --- .../limit_thinking_content_length_v1.cu | 100 +++++++++ .../limit_thinking_content_length_v2.cu | 130 ++++++++++++ custom_ops/setup_ops.py | 2 + fastdeploy/config.py | 79 ++++--- fastdeploy/engine/common_engine.py | 197 ++++++++++-------- fastdeploy/engine/engine.py | 22 +- fastdeploy/entrypoints/cli/tokenizer.py | 3 +- fastdeploy/entrypoints/engine_client.py | 17 +- fastdeploy/entrypoints/openai/api_server.py | 1 - fastdeploy/input/preprocess.py | 33 +-- .../model_executor/models/interfaces_base.py | 54 ----- .../model_executor/models/model_base.py | 53 +++-- .../model_executor/pre_and_post_process.py | 33 ++- fastdeploy/multimodal/registry.py | 35 ---- fastdeploy/worker/gpu_model_runner.py | 45 +++- fastdeploy/worker/utils.py | 50 ----- fastdeploy/worker/worker_process.py | 21 +- tests/ce/server/test_completions.py | 6 - 18 files changed, 515 insertions(+), 366 deletions(-) create mode 100644 custom_ops/gpu_ops/limit_thinking_content_length_v1.cu create mode 100644 custom_ops/gpu_ops/limit_thinking_content_length_v2.cu delete mode 100644 fastdeploy/model_executor/models/interfaces_base.py delete mode 100644 fastdeploy/multimodal/registry.py delete mode 100644 fastdeploy/worker/utils.py diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu new file mode 100644 index 00000000000..e16e6dc66eb --- /dev/null +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu @@ -0,0 +1,100 @@ +// Copyright (c) 2025 PaddlePaddle Authors. 
All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "helper.h"
+#include "paddle/extension.h"
+
+__global__ void limit_thinking_content_length_kernel_v1(
+    int64_t *next_tokens,
+    const int *max_think_lens,
+    const int64_t *step_idx, // step_idx no longer needs to be modified, so it is const
+    int *limit_think_status,
+    const int64_t think_end_id,
+    const int bs)
+{
+    int bid = threadIdx.x;
+    if (bid >= bs)
+        return;
+
+    // If thinking is not enabled for this sequence, return directly; the default value -1 means the thinking length is unlimited
+    const int max_think_len = max_think_lens[bid];
+    if (max_think_len < 0)
+        return;
+    int current_limit_content_status = limit_think_status[bid];
+    // If we are already in the response phase and the stop flag has been triggered, return directly; nothing more to do.
+    if (current_limit_content_status == 2)
+    {
+        return;
+    }
+
+    int64_t next_token = next_tokens[bid];
+    const int64_t step = step_idx[bid];
+
+    // ======================= thinking-phase control =======================
+    // Stage 1: still thinking (status == 0), check whether thinking must be forced to end
+    if (current_limit_content_status < 1)
+    {
+        // When thinking-length control is enabled, check whether the budget is exceeded
+        if (step >= max_think_len)
+        {
+            // Force-replace the current token with the end-of-thinking token
+            next_token = think_end_id;
+            // Advance the status to 1, meaning "ending thinking"
+            current_limit_content_status = 1;
+        }
+    }
+    // ======================= end-of-thinking handling =======================
+    // Stage 2: check whether the end-of-thinking condition is met (status < 2)
+    // This covers two scenarios:
+    // 1. status == 0: the model generated think_end_id by itself
+    // 2. status == 1: think_end_id was force-injected in the previous stage
+    if (current_limit_content_status < 2)
+    {
+        if (next_token == think_end_id)
+        {
+            // Thinking is confirmed to be over; advance the status to 2 (response phase)
+            current_limit_content_status = 2;
+        }
+    }
+    // Write back the updated token
+    next_tokens[bid] = next_token;
+    // Update the global status
+    limit_think_status[bid] = current_limit_content_status;
+}
+
+void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens,
+                                  const paddle::Tensor &max_think_lens,
+                                  const paddle::Tensor &step_idx,
+                                  const paddle::Tensor &limit_think_status,
+                                  const int64_t think_end_id)
+{
+    const int batch_size = next_tokens.shape()[0];
+    limit_thinking_content_length_kernel_v1<<<1, 1024>>>(
+        const_cast<int64_t *>(next_tokens.data<int64_t>()),
+        max_think_lens.data<int>(),
+        step_idx.data<int64_t>(),
+        const_cast<int *>(limit_think_status.data<int>()),
+        think_end_id,
+        batch_size);
+}
+
+PD_BUILD_OP(limit_thinking_content_length_v1)
+    .Inputs({"next_tokens",
+             "max_think_lens",
+             "step_idx",
+             "limit_think_status"})
+    .Attrs({"think_end_id: int64_t"})
+    .Outputs({"next_tokens_out"})
+    .SetInplaceMap({{"next_tokens", "next_tokens_out"}})
+    .SetKernelFn(PD_KERNEL(LimitThinkingContentLengthV1));
diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
new file mode 100644
index 00000000000..fdac1cff5d0
--- /dev/null
+++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
@@ -0,0 +1,130 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "helper.h"
+#include "paddle/extension.h"
+
+// status == 0: normal generation phase
+// status == 1: replacement phase
+// status == 2: replacement-finished phase
+// status == 3: thinking-finished phase
+__global__ void limit_thinking_content_length_kernel_v2(
+    int64_t *next_tokens,
+    const int *max_think_lens,
+    const int64_t *step_idx, // step_idx no longer needs to be modified, so it is const
+    int *limit_think_status,
+    const int64_t think_end_id,
+    const int64_t line_break_id,
+    const int bs)
+{
+    int bid = threadIdx.x;
+    if (bid >= bs)
+        return;
+    // If thinking is not enabled for this sequence, return directly; the default value -1 means the thinking length is unlimited
+    const int max_think_len = max_think_lens[bid];
+    if (max_think_len < 0)
+        return;
+    int current_limit_content_status = limit_think_status[bid];
+    // If we are already in the response phase and the stop flag has been triggered, return directly; nothing more to do.
+    if (current_limit_content_status == 3)
+    {
+        return;
+    }
+
+    int64_t next_token = next_tokens[bid];
+    const int64_t step = step_idx[bid];
+
+    // ======================= thinking-phase control =======================
+    // Stage 1: still thinking (status == 0), check whether thinking must be forced to end
+    // Stage 2: replacing (status == 1), check whether the replacement is finished
+    if (current_limit_content_status <= 1)
+    {
+        // When thinking-length control is enabled, check whether the budget is exceeded
+        if (step == max_think_len)
+        {
+            // Force-replace the current token with a line-break token
+            next_token = line_break_id;
+            current_limit_content_status = 1;
+        }
+        else if (step == max_think_len + 1)
+        {
+            // Force-replace the current token with the end-of-thinking token
+            next_token = think_end_id;
+            current_limit_content_status = 1;
+        }
+        else if (step == max_think_len + 2)
+        {
+            // Force-replace the current token with a line-break token
+            next_token = line_break_id;
+            current_limit_content_status = 1;
+        }
+        else if (step == max_think_len + 3)
+        {
+            // Force-replace the current token with a line-break token
+            next_token = line_break_id;
+            // Advance the status to 2, meaning "the forced replacement is finished"
+            current_limit_content_status = 2;
+        }
+    }
+    // ======================= end-of-thinking handling =======================
+    // Stage 3: check whether the end-of-thinking condition is met (status == 0 || status == 2)
+    // This covers two scenarios:
+    // 1. status == 0: the model may have generated </think> by itself
+    // 2. status == 2: \n</think>\n\n was force-injected in the previous stage
+    if (current_limit_content_status == 0)
+    {
+        if (next_token == think_end_id)
+        {
+            // Thinking is confirmed to be over; advance the status to 3 (response phase)
+            current_limit_content_status = 3;
+        }
+    }
+    if (current_limit_content_status == 2)
+    {
+        // Thinking is confirmed to be over; advance the status to 3 (response phase)
+        current_limit_content_status = 3;
+    }
+    // Write back the updated token
+    next_tokens[bid] = next_token;
+    // Update the global status
+    limit_think_status[bid] = current_limit_content_status;
+}
+
+void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens,
+                                  const paddle::Tensor &max_think_lens,
+                                  const paddle::Tensor &step_idx,
+                                  const paddle::Tensor &limit_think_status,
+                                  const int64_t think_end_id,
+                                  const int64_t line_break_id)
+{
+    const int batch_size = next_tokens.shape()[0];
+    limit_thinking_content_length_kernel_v2<<<1, 1024>>>(
+        const_cast<int64_t *>(next_tokens.data<int64_t>()),
+        max_think_lens.data<int>(),
+        step_idx.data<int64_t>(),
+        const_cast<int *>(limit_think_status.data<int>()),
+        think_end_id,
+        line_break_id,
+        batch_size);
+}
+
+PD_BUILD_OP(limit_thinking_content_length_v2)
+    .Inputs({"next_tokens",
+             "max_think_lens",
+             "step_idx",
+             "limit_think_status"})
+    .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"})
+    .Outputs({"next_tokens_out"})
+    .SetInplaceMap({{"next_tokens", "next_tokens_out"}})
+    .SetKernelFn(PD_KERNEL(LimitThinkingContentLengthV2));
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 8636c3de440..8d8d42f2d7e 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -302,6 +302,8 @@ def find_end_files(directory, end_str):
         "gpu_ops/noaux_tc.cu",
         "gpu_ops/custom_all_reduce/all_reduce.cu",
         "gpu_ops/merge_prefill_decode_output.cu",
+        "gpu_ops/limit_thinking_content_length_v1.cu",
+        "gpu_ops/limit_thinking_content_length_v2.cu",
     ]

     # pd_disaggregation
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 20de8534451..400807339a2 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -30,7 +30,6 @@
 import fastdeploy
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
-from fastdeploy.multimodal.registry import MultimodalRegistry
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.transformer_utils.config import get_pooling_config
@@ -225,50 +224,12 @@ def __init__(
         self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
         self.think_end_id = args.get("think_end_id", -1)
-
-        architectures = self.architectures[0]
-
-        if MultimodalRegistry.contains_model(architectures):
-            self.enable_mm = True
-        else:
-            self.enable_mm = False
-
-        self.is_unified_ckpt = check_unified_ckpt(self.model)
+        self.im_patch_id = args.get("image_patch_id", -1)
+        self.line_break_id = args.get("line_break_id", -1)

         self.override_name_from_config()
         self.read_from_env()
         self.read_model_config()
-        self.runner_type = self._get_runner_type(self.architectures, self.runner)
-        self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert)
-
-        registry = self.registry
-        is_generative_model = registry.is_text_generation_model(self.architectures, self)
-        is_pooling_model = registry.is_pooling_model(self.architectures, self)
-        is_multimodal_model = registry.is_multimodal_model(self.architectures, self)
-
-        if self.runner_type == "generate" and not is_generative_model:
-            if is_multimodal_model:
-                pass
-            else:
-                generate_converts = _RUNNER_CONVERTS["generate"]
-                if self.convert_type not in generate_converts:
-                    raise ValueError("This model does not support '--runner generate.")
-        if self.runner_type 
== "pooling" and not is_pooling_model: - pooling_converts = _RUNNER_CONVERTS["pooling"] - if self.convert_type not in pooling_converts: - convert_option = "<" + "|".join(pooling_converts) + ">" - raise ValueError( - "This model does not support `--runner pooling`. " - f"You can pass `--convert {convert_option} to adapt " - "it into a pooling model." - ) - - self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) - model_info, arch = registry.inspect_model_cls(self.architectures, self) - self._model_info = model_info - self._architecture = arch - - self.pooler_config = self._init_pooler_config() @property def registry(self): @@ -510,6 +471,42 @@ def print(self): logger.info("{:<20}:{:<6}{}".format(k, "", v)) logger.info("=============================================================") + def __post_init__(self): + self.is_unified_ckpt = check_unified_ckpt(self.model) + self.runner_type = self._get_runner_type(self.architectures, self.runner) + self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert) + registry = self.registry + is_generative_model = registry.is_text_generation_model(self.architectures, self) + is_pooling_model = registry.is_pooling_model(self.architectures, self) + is_multimodal_model = registry.is_multimodal_model(self.architectures, self) + self.is_reasoning_model = registry.is_reasoning_model(self.architectures, self) + + self.enable_mm = is_multimodal_model + + if self.runner_type == "generate" and not is_generative_model: + if is_multimodal_model: + pass + else: + generate_converts = _RUNNER_CONVERTS["generate"] + if self.convert_type not in generate_converts: + raise ValueError("This model does not support '--runner generate.") + if self.runner_type == "pooling" and not is_pooling_model: + pooling_converts = _RUNNER_CONVERTS["pooling"] + if self.convert_type not in pooling_converts: + convert_option = "<" + "|".join(pooling_converts) + ">" + raise ValueError( + "This model does not support `--runner pooling`. " + f"You can pass `--convert {convert_option} to adapt " + "it into a pooling model." + ) + + self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) + model_info, arch = registry.inspect_model_cls(self.architectures, self) + self._model_info = model_info + self._architecture = arch + + self.pooler_config = self._init_pooler_config() + class ParallelConfig: """Configuration for the distributed execution.""" diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 93dc7258a8c..4c650477fee 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -60,77 +60,77 @@ class EngineService: Base class containing common engine functionality """ - def __init__(self, cfg, start_queue=True): + def __init__(self, fd_config, start_queue=True): """ Initializes the LLMEngine with the provided configuration. Args: - cfg (Config): Config object containing all the configuration parameters. + fd_config (FDConfig): Config object containing all the configuration parameters. 
""" - self.cfg = cfg - if isinstance(self.cfg.cache_config.cache_queue_port, str): - self.cfg.cache_config.cache_queue_port = self.cfg.cache_config.cache_queue_port.split(",") - if isinstance(self.cfg.cache_config.cache_queue_port, list): - self.cfg.cache_config.cache_queue_port = int( - self.cfg.cache_config.cache_queue_port[self.cfg.parallel_config.local_data_parallel_id] + self.fd_config = fd_config + if isinstance(self.fd_config.cache_config.cache_queue_port, str): + self.fd_config.cache_config.cache_queue_port = self.fd_config.cache_config.cache_queue_port.split(",") + if isinstance(self.fd_config.cache_config.cache_queue_port, list): + self.fd_config.cache_config.cache_queue_port = int( + self.fd_config.cache_config.cache_queue_port[self.fd_config.parallel_config.local_data_parallel_id] ) - if self.cfg.parallel_config.enable_expert_parallel: + if self.fd_config.parallel_config.enable_expert_parallel: self.llm_logger = get_logger( - "fastdeploy", f"fastdeploy_rank{self.cfg.parallel_config.local_data_parallel_id}.log" + "fastdeploy", f"fastdeploy_rank{self.fd_config.parallel_config.local_data_parallel_id}.log" ) else: self.llm_logger = llm_logger - self.scheduler = cfg.scheduler_config.scheduler() + self.scheduler = fd_config.scheduler_config.scheduler() self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1" if envs.ENABLE_V1_KVCACHE_SCHEDULER: self.resource_manager = ResourceManagerV1( - cfg.scheduler_config.max_num_seqs, - cfg, - cfg.parallel_config.tensor_parallel_size, - cfg.scheduler_config.splitwise_role, - cfg.parallel_config.local_data_parallel_id, + fd_config.scheduler_config.max_num_seqs, + fd_config, + fd_config.parallel_config.tensor_parallel_size, + fd_config.scheduler_config.splitwise_role, + fd_config.parallel_config.local_data_parallel_id, ) else: self.resource_manager = ResourceManager( - cfg.scheduler_config.max_num_seqs, - cfg, - cfg.parallel_config.tensor_parallel_size, - cfg.scheduler_config.splitwise_role, - cfg.parallel_config.local_data_parallel_id, + fd_config.scheduler_config.max_num_seqs, + fd_config, + fd_config.parallel_config.tensor_parallel_size, + fd_config.scheduler_config.splitwise_role, + fd_config.parallel_config.local_data_parallel_id, ) self.start_worker_queue_service(start_queue) - os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.parallel_config.engine_worker_queue_port[ - self.cfg.parallel_config.local_data_parallel_id + os.environ["INFERENCE_MSG_QUEUE_ID"] = self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id ] - self.split_connector = SplitwiseConnector(cfg, self.engine_worker_queue, self.resource_manager) + self.split_connector = SplitwiseConnector(fd_config, self.engine_worker_queue, self.resource_manager) self.waiting_requests = [] self.token_processor = TokenProcessor( - cfg=cfg, + cfg=fd_config, cached_generated_tokens=self.scheduler, engine_worker_queue=self.engine_worker_queue, split_connector=self.split_connector, ) self.token_processor.set_resource_manager(self.resource_manager) - self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1) - for idx in range(1, self.cfg.max_num_partial_prefills + 1): + self.partial_chunked_tokens = [0] * (self.fd_config.max_num_partial_prefills + 1) + for idx in range(1, self.fd_config.max_num_partial_prefills + 1): self.partial_chunked_tokens[idx] = ( - (self.cfg.scheduler_config.max_num_batched_tokens // idx) - // self.cfg.cache_config.block_size - * self.cfg.cache_config.block_size + 
(self.fd_config.scheduler_config.max_num_batched_tokens // idx) + // self.fd_config.cache_config.block_size + * self.fd_config.cache_config.block_size ) self.guided_decoding_checker = None - if self.cfg.guided_decoding_backend != "off": + if self.fd_config.guided_decoding_backend != "off": self.guided_decoding_checker = schema_checker( - self.cfg.guided_decoding_backend, - disable_any_whitespace=self.cfg.disable_any_whitespace, + self.fd_config.guided_decoding_backend, + disable_any_whitespace=self.fd_config.disable_any_whitespace, ) self._init_worker_monitor_signals() @@ -145,12 +145,14 @@ def start(self): self.insert_task_to_worker_thread.start() self.token_processor.tasks_queue = self.engine_worker_queue self.token_processor.run() - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": self.split_mode_get_tasks() def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进程感知是否有新Task需要处理 current_suffix = int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] + self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id + ] ) self.llm_logger.info(f"current_suffix: {current_suffix}") exist_task_signal_data = np.zeros([1], dtype=np.int32) @@ -184,7 +186,8 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 # worker_live_signal 用于engine感知各worker进程是否存活,记录每个step 时间 worker_healthy_live_recorded_time_array = np.zeros( - shape=[min(self.cfg.worker_num_per_node, self.cfg.parallel_config.tensor_parallel_size)], dtype=np.int32 + shape=[min(self.fd_config.worker_num_per_node, self.fd_config.parallel_config.tensor_parallel_size)], + dtype=np.int32, ) self.worker_healthy_live_signal = IPCSignal( name="worker_healthy_live_signal", @@ -194,7 +197,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - cache_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) + cache_ready_signal_data = np.zeros(shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32) self.cache_ready_signal = IPCSignal( name="cache_ready_signal", array=cache_ready_signal_data, @@ -203,7 +206,9 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - swap_space_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) + swap_space_ready_signal_data = np.zeros( + shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32 + ) self.swap_space_ready_signal = IPCSignal( name="swap_space_ready_signal", array=swap_space_ready_signal_data, @@ -244,43 +249,53 @@ def start_worker_queue_service(self, start_queue): start queue service for engine worker communication """ address = ( - self.cfg.master_ip, - int(self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]), + self.fd_config.master_ip, + int( + self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id + ] + ), ) - if start_queue and (self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"): + if start_queue and ( + self.fd_config.host_ip == self.fd_config.master_ip or self.fd_config.master_ip == "0.0.0.0" + ): self.llm_logger.info(f"Starting engine worker queue server service at {address}") self.engine_worker_queue_server = EngineWorkerQueue( address=address, is_server=True, - 
num_client=self.cfg.parallel_config.tensor_parallel_size, - local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + num_client=self.fd_config.parallel_config.tensor_parallel_size, + local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, ) - if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed": + if ( + self.fd_config.cache_config.enable_prefix_caching + or self.fd_config.scheduler_config.splitwise_role != "mixed" + ): self.cache_task_queue = EngineCacheQueue( address=( - self.cfg.master_ip, - self.cfg.cache_config.cache_queue_port, + self.fd_config.master_ip, + self.fd_config.cache_config.cache_queue_port, ), authkey=b"cache_queue_service", is_server=True, - num_client=self.cfg.parallel_config.tensor_parallel_size, + num_client=self.fd_config.parallel_config.tensor_parallel_size, client_id=-1, - local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, ) self.llm_logger.info( - f"local {min(self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id,self.cfg.parallel_config.data_parallel_size - 1)}" + f"local {min(self.fd_config.worker_num_per_node * self.fd_config.node_rank + self.fd_config.parallel_config.local_data_parallel_id,self.fd_config.parallel_config.data_parallel_size - 1)}" ) self.engine_worker_queue = EngineWorkerQueue( address=address, is_server=False, - num_client=self.cfg.parallel_config.tensor_parallel_size, + num_client=self.fd_config.parallel_config.tensor_parallel_size, client_id=0, - local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, local_data_parallel_id=min( - self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id, - self.cfg.parallel_config.data_parallel_size - 1, + self.fd_config.worker_num_per_node * self.fd_config.node_rank + + self.fd_config.parallel_config.local_data_parallel_id, + self.fd_config.parallel_config.data_parallel_size - 1, ), ) @@ -310,8 +325,8 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): cur_task.prompt_token_ids[0] = task.outputs.token_ids[0] cur_task.num_cached_tokens = task.num_cached_tokens if ( - self.cfg.speculative_config.method in ["mtp"] - and self.cfg.scheduler_config.splitwise_role == "decode" + self.fd_config.speculative_config.method in ["mtp"] + and self.fd_config.scheduler_config.splitwise_role == "decode" ): cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids) if task.error_code != 200: @@ -338,7 +353,7 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): need_delete_tasks = [] for task in tasks: - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": status, msg = self.split_connector.check_decode_allocated(task) if not status: self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.") @@ -394,12 +409,12 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): for task in tasks: task.inference_start_time = time.time() if not is_prefill: - if not self.cfg.model_config.enable_mm: + if not self.fd_config.model_config.enable_mm: self.update_requests_chunk_size(tasks) else: self.update_mm_requests_chunk_size(tasks) self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz)) - if is_prefill and self.cfg.scheduler_config.name != 
"splitwise": + if is_prefill and self.fd_config.scheduler_config.name != "splitwise": self.engine_worker_queue.available_prefill_instances.put(1) return True @@ -432,14 +447,14 @@ def update_tokens(idx, chunk_size, update_chunk=False): if current_request_size[idx] <= 0: chunk_request_num -= 1 - if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: return current_request_size = [request.prompt_token_ids_len for request in requests] requests_chunk = [[] for _ in range(len(requests))] chunk_request_num = len(current_request_size) while chunk_request_num >= 1: - remain_batched_tokens = self.cfg.scheduler_config.max_num_batched_tokens + remain_batched_tokens = self.fd_config.scheduler_config.max_num_batched_tokens for idx in range(len(current_request_size)): if current_request_size[idx] <= 0: continue @@ -449,14 +464,16 @@ def update_tokens(idx, chunk_size, update_chunk=False): ) update_tokens(idx, chunk_size) - while remain_batched_tokens >= self.cfg.cache_config.block_size: + while remain_batched_tokens >= self.fd_config.cache_config.block_size: # 当前 max_num_batched_tokens 还有剩余时,优先分配给较短的请求 waiting_requests = [input_lens for input_lens in current_request_size if input_lens > 0] if len(waiting_requests) == 0: break available_tokens = ( - remain_batched_tokens // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size + remain_batched_tokens + // self.fd_config.cache_config.block_size + * self.fd_config.cache_config.block_size ) append_idx = current_request_size.index(min(waiting_requests)) chunk_size = min( @@ -473,7 +490,7 @@ def update_mm_requests_chunk_size(self, requests): """ update each multimodal request's chunk size info """ - if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: return for request in requests: @@ -560,7 +577,7 @@ def _insert_task_to_worker(self): continue if hasattr(self, "exist_prefill_task_signal") and self.exist_prefill_task_signal.value[0] > 0: if ( - self.cfg.scheduler_config.splitwise_role == "mixed" + self.fd_config.scheduler_config.splitwise_role == "mixed" or self.split_connector.has_splitwise_tasks() ): time.sleep(0.005) @@ -574,15 +591,15 @@ def _insert_task_to_worker(self): num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.cfg.max_prefill_batch, + self.fd_config.max_prefill_batch, ) self.resource_manager.check_and_free_block_tables() tasks = self.scheduler.get_requests( available_blocks=self.resource_manager.available_block_num(), - block_size=self.cfg.cache_config.block_size, - reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.cfg.scheduler_config.max_num_batched_tokens, + block_size=self.fd_config.cache_config.block_size, + reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.fd_config.scheduler_config.max_num_batched_tokens, batch=num_prefill_batch, ) @@ -590,7 +607,7 @@ def _insert_task_to_worker(self): time.sleep(0.001) continue - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": self.llm_logger.info("Inserting splitwise tasks") self.split_connector.send_splitwise_tasks(tasks, current_id) @@ -619,21 +636,21 @@ def _fetch_request(): is_fetching = True num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.cfg.max_prefill_batch, + 
self.fd_config.max_prefill_batch, ) - if self.cfg.model_config.enable_mm: + if self.fd_config.model_config.enable_mm: available_blocks = self.resource_manager.available_block_num() else: - available_blocks = self.cfg.cache_config.max_block_num_per_seq + available_blocks = self.fd_config.cache_config.max_block_num_per_seq tasks = self.scheduler.get_requests( available_blocks=available_blocks, - block_size=self.cfg.cache_config.block_size, - reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.cfg.model_config.max_model_len, + block_size=self.fd_config.cache_config.block_size, + reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.fd_config.model_config.max_model_len, batch=num_prefill_batch, ) - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": for task in tasks: # assure can allocate block ids in P while not self.resource_manager.preallocate_resource_in_p(task): @@ -642,7 +659,7 @@ def _fetch_request(): self.split_connector.send_splitwise_tasks([task], task.idx) need_delete_tasks = [] for task in tasks: - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": # assure fetch block ids from D status, msg = self.split_connector.check_decode_allocated(task) if not status: @@ -663,7 +680,7 @@ def _fetch_request(): tasks.remove(tmp_task) # release resource in P self.resource_manager.prerelease_resource(tmp_task) - if self.cfg.scheduler_config.splitwise_role == "prefill": + if self.fd_config.scheduler_config.splitwise_role == "prefill": # to send cache info to cache messager if tasks: self.split_connector.send_cache_infos(tasks, 0) @@ -680,7 +697,7 @@ def _fetch_request(): time.sleep(0.001) # Fetch requests and add them to the scheduling queue if tasks: - if self.cfg.scheduler_config.splitwise_role == "prefill": + if self.fd_config.scheduler_config.splitwise_role == "prefill": self.resource_manager.add_request_in_p(tasks) else: for task in tasks: @@ -695,7 +712,7 @@ def _fetch_request(): if self.engine_worker_queue.num_tasks() > 0: time.sleep(0.001) continue - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": if self.scheduler.get_unhandled_request_num() <= envs.FD_EP_MAX_PREFETCH_TASK_NUM and ( not is_fetching ): @@ -712,7 +729,7 @@ def _fetch_request(): tasks = self.resource_manager.schedule() # 3. Send to engine if tasks: - if self.cfg.scheduler_config.splitwise_role == "decode": + if self.fd_config.scheduler_config.splitwise_role == "decode": for task in tasks: if task.task_type == RequestType.PREEMPTED: msg = f"{task.request_id} decode not enough blocks, need to be rescheduled." 
@@ -744,7 +761,7 @@ def start_zmq_service(self, api_server_pid=None): self.recv_request_server = ZmqTcpServer(port=envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT, mode=zmq.PULL) self.send_response_server = ZmqTcpServer(port=envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT, mode=zmq.ROUTER) self.internal_adapter = InternalAdapter( - cfg=self.cfg, engine=self, dp_rank=self.cfg.node_rank * self.cfg.worker_num_per_node + cfg=self.fd_config, engine=self, dp_rank=self.fd_config.node_rank * self.fd_config.worker_num_per_node ) else: self.recv_request_server = ZmqIpcServer(name=api_server_pid, mode=zmq.PULL) @@ -763,12 +780,12 @@ def start_zmq_service(self, api_server_pid=None): def _insert_zmq_task_to_scheduler(self): added_requests: Dict[str, int] = dict() if envs.FD_ENABLE_INTERNAL_ADAPTER: - if self.cfg.scheduler_config.splitwise_role == "decode": + if self.fd_config.scheduler_config.splitwise_role == "decode": return while self.running: try: block = True if len(added_requests) == 0 else False - if not self.cfg.model_config.enable_mm: + if not self.fd_config.model_config.enable_mm: err, data = self.recv_request_server.receive_json_once(block) else: err, data = self.recv_request_server.receive_pyobj_once(block) @@ -930,7 +947,7 @@ def receiver_loop(): else: self.insert_tasks(tasks, allocated=True) - if self.cfg.innode_prefill_ports is not None: + if self.fd_config.innode_prefill_ports is not None: self.scheduler.put_results(tasks) else: if len(self.waiting_requests): @@ -975,12 +992,14 @@ def receiver_loop(): def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor): return self.resource_manager.cache_manager.launch_cache_manager( - cache_config=self.cfg.cache_config, - tensor_parallel_size=self.cfg.parallel_config.tensor_parallel_size, + cache_config=self.fd_config.cache_config, + tensor_parallel_size=self.fd_config.parallel_config.tensor_parallel_size, device_ids=device_ids, - pod_ip=self.cfg.master_ip, + pod_ip=self.fd_config.master_ip, engine_worker_queue_port=int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] + self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id + ] ), pid_suffix=ipc_signal_suffix, create_cache_tensor=create_cache_tensor, diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index ca754566702..207229a98be 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -34,7 +34,6 @@ import paddle from tqdm import tqdm -from fastdeploy.config import ErnieArchitectures from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.common_engine import EngineService from fastdeploy.engine.expert_service import start_data_parallel_service @@ -89,11 +88,10 @@ def __init__(self, cfg): self.is_started = False self.input_processor = InputPreprocessor( - cfg.tokenizer, + cfg.model_config, cfg.reasoning_parser, cfg.limit_mm_per_prompt, cfg.mm_processor_kwargs, - cfg.model_config.enable_mm, cfg.tool_parser, ) self.engine = EngineService(cfg) @@ -490,13 +488,13 @@ def _start_worker_service(self): else len(self.data_processor.tokenizer.vocab) ) - is_ernie = ErnieArchitectures.contains_ernie_arch(self.cfg.model_config.architectures) - if is_ernie: - self.cfg.model_config.think_end_id = self.data_processor.tokenizer.get_vocab().get("", -1) - if self.cfg.model_config.think_end_id != -1: - llm_logger.info(f"Get think_end_id {self.cfg.model_config.think_end_id} from vocab.") - else: - llm_logger.info("No token found in vocabulary, 
the model can not do reasoning.")
+        think_end_id = self.data_processor.tokenizer.get_vocab().get("</think>", -1)
+        if think_end_id > 0:
+            llm_logger.info(f"Get think_end_id {think_end_id} from vocab.")
+        else:
+            llm_logger.info("No </think> token found in vocabulary, the model can not do reasoning.")
+        image_patch_id = self.data_processor.tokenizer.get_vocab().get("<|IMAGE_PLACEHOLDER|>", -1)
+        line_break_id = self.data_processor.tokenizer.get_vocab().get("\n", -1)

         ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port)
         ips = None
@@ -524,7 +522,9 @@ def _start_worker_service(self):
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
             f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"
-            f" --think_end_id {self.cfg.model_config.think_end_id}"
+            f" --think_end_id {think_end_id}"
+            f" --image_patch_id {image_patch_id}"
+            f" --line_break_id {line_break_id}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
diff --git a/fastdeploy/entrypoints/cli/tokenizer.py b/fastdeploy/entrypoints/cli/tokenizer.py
index fe477a7e66d..3012fd1f6c6 100644
--- a/fastdeploy/entrypoints/cli/tokenizer.py
+++ b/fastdeploy/entrypoints/cli/tokenizer.py
@@ -21,6 +21,7 @@
 import typing
 from pathlib import Path

+from fastdeploy.config import ModelConfig
 from fastdeploy.entrypoints.cli.types import CLISubcommand
 from fastdeploy.input.preprocess import InputPreprocessor

@@ -199,7 +200,7 @@ def print_separator(title=""):
         return

     # 初始化tokenizer
-    preprocessor = InputPreprocessor(model_name_or_path=args.model_name_or_path, enable_mm=args.enable_mm)
+    preprocessor = InputPreprocessor(model_config=ModelConfig({"model": args.model_name_or_path}))
     tokenizer = preprocessor.create_processor().tokenizer

     # 执行操作
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 525498ed5ce..eb2b8f297c1 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -36,7 +36,6 @@
     ZmqIpcClient,
 )
 from fastdeploy.metrics.work_metrics import work_process_metrics
-from fastdeploy.multimodal.registry import MultimodalRegistry
 from fastdeploy.platforms import current_platform
 from fastdeploy.utils import (
     EngineError,
@@ -61,7 +60,6 @@ def __init__(
         port,
         limit_mm_per_prompt,
         mm_processor_kwargs,
-        # enable_mm=False,
         reasoning_parser=None,
         data_parallel_size=1,
         enable_logprob=False,
@@ -70,20 +68,15 @@ def __init__(
         enable_prefix_caching=None,
         splitwise_role=None,
     ):
-        architectures = ModelConfig({"model": model_name_or_path}).architectures[0]
-        if MultimodalRegistry.contains_model(architectures):
-            self.enable_mm = True
-        else:
-            self.enable_mm = False
-
+        model_config = ModelConfig({"model": model_name_or_path})
         input_processor = InputPreprocessor(
-            tokenizer,
+            model_config,
             reasoning_parser,
             limit_mm_per_prompt,
             mm_processor_kwargs,
-            self.enable_mm,
             tool_parser,
         )
+        self.enable_mm = model_config.enable_mm
         self.enable_logprob = enable_logprob
         self.reasoning_parser = reasoning_parser
         self.data_processor = input_processor.create_processor()
@@ -263,8 +256,8 @@ def valid_parameters(self, data):
                 raise ParameterError("max_tokens", f"max_tokens can be defined [1, {self.max_model_len}).")

         if data.get("reasoning_max_tokens") is not None:
-            if data["reasoning_max_tokens"] < 1:
-                raise ParameterError("reasoning_max_tokens", 
"reasoning_max_tokens must be greater than 1") + if data["reasoning_max_tokens"] < 0: + raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 0") if data["reasoning_max_tokens"] > data["max_tokens"]: data["reasoning_max_tokens"] = data["max_tokens"] api_server_logger.warning( diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 235f1cccd26..d246bbb15a9 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -156,7 +156,6 @@ async def lifespan(app: FastAPI): port=int(args.engine_worker_queue_port[args.local_data_parallel_id]), limit_mm_per_prompt=args.limit_mm_per_prompt, mm_processor_kwargs=args.mm_processor_kwargs, - # args.enable_mm, reasoning_parser=args.reasoning_parser, data_parallel_size=args.data_parallel_size, enable_logprob=args.enable_logprob, diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py index 5b8eb3ccd3c..b3af46c95fa 100644 --- a/fastdeploy/input/preprocess.py +++ b/fastdeploy/input/preprocess.py @@ -24,7 +24,7 @@ class InputPreprocessor: """ Args: - model_name_or_path (str): + model_config (ModelConfig): Model name or path to the pretrained model. If a model name is provided, it should be a key in the Hugging Face Transformers' model registry (https://huggingface.co/models). The model will be downloaded from the Hugging Face model hub if necessary. @@ -32,8 +32,6 @@ class InputPreprocessor: reasoning_parser (str, optional): Reasoning parser type. Defaults to None. Flag specifies the reasoning parser to use for extracting reasoning content from the model output - enable_mm (bool, optional): - Whether to use the multi-modal model processor. Defaults to False. Raises: ValueError: @@ -43,32 +41,20 @@ class InputPreprocessor: def __init__( self, - model_name_or_path: str, + model_config: ModelConfig, reasoning_parser: str = None, limit_mm_per_prompt: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - enable_mm: bool = False, tool_parser: str = None, ) -> None: - - self.model_name_or_path = model_name_or_path + self.model_config = model_config + self.model_name_or_path = self.model_config.model self.reasoning_parser = reasoning_parser - self.enable_mm = enable_mm self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs self.tool_parser = tool_parser def create_processor(self): - """ - 创建数据处理器。如果启用了多模态注册表,则使用该表中的模型;否则,使用传递给构造函数的模型名称或路径。 - 返回值:DataProcessor(如果不启用多模态注册表)或MultiModalRegistry.Processor(如果启用多模态注册表)。 - - Args: - 无参数。 - - Returns: - DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): 数据处理器。 - """ reasoning_parser_obj = None tool_parser_obj = None @@ -77,8 +63,7 @@ def create_processor(self): if self.tool_parser: tool_parser_obj = ToolParserManager.get_tool_parser(self.tool_parser) - config = ModelConfig({"model": self.model_name_or_path}) - architectures = config.architectures[0] + architecture = self.model_config.architectures[0] try: from fastdeploy.plugins.input_processor import load_input_processor_plugins @@ -90,8 +75,8 @@ def create_processor(self): tool_parser_obj=tool_parser_obj, ) except: - if not self.enable_mm: - if not ErnieArchitectures.contains_ernie_arch(architectures): + if not self.model_config.enable_mm: + if not ErnieArchitectures.contains_ernie_arch(architecture): from fastdeploy.input.text_processor import DataProcessor self.processor = DataProcessor( @@ -108,7 
+93,7 @@ def create_processor(self): tool_parser_obj=tool_parser_obj, ) else: - if ErnieArchitectures.contains_ernie_arch(architectures): + if ErnieArchitectures.contains_ernie_arch(architecture): from fastdeploy.input.ernie4_5_vl_processor import ( Ernie4_5_VLProcessor, ) @@ -124,7 +109,7 @@ def create_processor(self): from fastdeploy.input.qwen_vl_processor import QwenVLProcessor self.processor = QwenVLProcessor( - config=config, + config=self.model_config, model_name_or_path=self.model_name_or_path, limit_mm_per_prompt=self.limit_mm_per_prompt, mm_processor_kwargs=self.mm_processor_kwargs, diff --git a/fastdeploy/model_executor/models/interfaces_base.py b/fastdeploy/model_executor/models/interfaces_base.py deleted file mode 100644 index b7ece5fe69a..00000000000 --- a/fastdeploy/model_executor/models/interfaces_base.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Type - -from paddle import nn - - -def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool: - from .model_base import ModelForCasualLM - - return issubclass(model_cls, ModelForCasualLM) - - -def is_pooling_model(model_cls: Type[nn.Layer]) -> bool: - class_name = model_cls.__name__ - pooling_indicators = ["Embedding", "ForSequenceClassification"] - return ( - any(indicator in class_name for indicator in pooling_indicators) - or hasattr(model_cls, "is_embedding_model") - and model_cls.is_embedding_model - ) - - -def is_multimodal_model(class_name: str) -> bool: - multimodal_indicators = ["VL", "Vision", "ConditionalGeneration"] - return any(indicator in class_name for indicator in multimodal_indicators) - - -def determine_model_category(class_name: str): - from fastdeploy.model_executor.models.model_base import ModelCategory - - if any(pattern in class_name for pattern in ["VL", "Vision", "ConditionalGeneration"]): - return ModelCategory.MULTIMODAL - elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]): - return ModelCategory.EMBEDDING - return ModelCategory.TEXT_GENERATION - - -def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: - if model_cls is not None: - return getattr(model_cls, "default_pooling_type", "LAST") - return "LAST" diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index fddfb4de51a..1ddd39c9a32 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -12,7 +12,7 @@ import importlib from abc import ABC, abstractmethod from dataclasses import dataclass -from enum import Enum +from enum import IntFlag, auto from functools import lru_cache from typing import Dict, List, Optional, Tuple, Type, Union @@ -26,19 +26,19 @@ iter_architecture_defaults, try_match_architecture_defaults, ) -from fastdeploy.model_executor.models.interfaces_base import ( - determine_model_category, - get_default_pooling_type, - is_multimodal_model, - 
is_pooling_model, - is_text_generation_model, -) -class ModelCategory(Enum): - TEXT_GENERATION = "text_generation" - MULTIMODAL = "multimodal" - EMBEDDING = "embedding" +def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: + if model_cls is not None: + return getattr(model_cls, "default_pooling_type", "LAST") + return "LAST" + + +class ModelCategory(IntFlag): + TEXT_GENERATION = auto() + MULTIMODAL = auto() + EMBEDDING = auto() + REASONING = auto() @dataclass(frozen=True) @@ -47,18 +47,22 @@ class ModelInfo: category: ModelCategory is_text_generation: bool is_multimodal: bool + is_reasoning: bool is_pooling: bool module_path: str default_pooling_type: str @staticmethod - def from_model_cls(model_cls: Type[nn.Layer], module_path: str = "") -> "ModelInfo": + def from_model_cls( + model_cls: Type[nn.Layer], module_path: str = "", category: ModelCategory = None + ) -> "ModelInfo": return ModelInfo( architecture=model_cls.__name__, - category=determine_model_category(model_cls.__name__), - is_text_generation=is_text_generation_model(model_cls), - is_multimodal=is_multimodal_model(model_cls.__name__), - is_pooling=is_pooling_model(model_cls), + category=category, + is_text_generation=ModelCategory.TEXT_GENERATION in category, + is_multimodal=ModelCategory.MULTIMODAL in category, + is_reasoning=ModelCategory.REASONING in category, + is_pooling=ModelCategory.EMBEDDING in category, default_pooling_type=get_default_pooling_type(model_cls), module_path=module_path, ) @@ -83,6 +87,7 @@ class LazyRegisteredModel(BaseRegisteredModel): module_name: str module_path: str class_name: str + category: ModelCategory def load_model_cls(self) -> Type[nn.Layer]: try: @@ -94,7 +99,7 @@ def load_model_cls(self) -> Type[nn.Layer]: def inspect_model_cls(self) -> ModelInfo: model_cls = self.load_model_cls() - return ModelInfo.from_model_cls(model_cls, self.module_name) + return ModelInfo.from_model_cls(model_cls, self.module_name, self.category) @lru_cache(maxsize=128) @@ -126,6 +131,7 @@ def _register_enhanced_models(self): module_name=model_info["module_name"], module_path=model_info["module_path"], class_name=model_info["class_name"], + category=model_info["category"], ) self.models[arch] = model self._registered_models[arch] = model @@ -317,6 +323,17 @@ def is_multimodal_model(self, architectures: Union[str, List[str]], model_config return model_info.is_multimodal return False + def is_reasoning_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool: + """Check if it's a reasoning model""" + if isinstance(architectures, str): + architectures = [architectures] + + for arch in architectures: + model_info = self._try_inspect_model_cls(arch) + if model_info is not None: + return model_info.is_reasoning + return False + def is_text_generation_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool: """Check if it's a text generation model""" if isinstance(architectures, str): diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 34685b52019..384c893f3fc 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -42,6 +42,8 @@ elif current_platform.is_dcu(): from fastdeploy.model_executor.ops.gpu import ( get_padding_offset, + limit_thinking_content_length_v1, + limit_thinking_content_length_v2, save_output, set_stop_value_multi_ends, step_paddle, @@ -81,7 +83,6 @@ speculate_step_reschedule, ) - from 
fastdeploy.inter_communicator import ZmqIpcClient from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput, SamplerOutput @@ -641,3 +642,33 @@ def rebuild_padding( else: raise RuntimeError("Not supported platform") return hidden_states + + +def limit_thinking_content_length( + limit_strategy: str, + sampled_token_ids: paddle.Tensor, + max_think_lens: paddle.Tensor, + step_idx: paddle.Tensor, + limit_think_status: paddle.Tensor, + think_end_id: int, + line_break_id: int = None, +): + if limit_strategy == "": + # for ernie4_5_vl + limit_thinking_content_length_v1( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + ) + elif limit_strategy == "\n\n\n": + # for ernie_x1 + limit_thinking_content_length_v2( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + line_break_id, + ) diff --git a/fastdeploy/multimodal/registry.py b/fastdeploy/multimodal/registry.py deleted file mode 100644 index f014ba55532..00000000000 --- a/fastdeploy/multimodal/registry.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - - -class MultimodalRegistry: - """ - A registry for multimodal models - """ - - mm_models: set[str] = { - "Ernie4_5_VLMoeForConditionalGeneration", - "Ernie5MoeForCausalLM", - "Qwen2_5_VLForConditionalGeneration", - "Ernie5ForCausalLM", - } - - @classmethod - def contains_model(cls, name: str) -> bool: - """ - Check if the given name exists in registry. - """ - return name in cls.mm_models diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 9b2e531b880..d33116d45ed 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -63,6 +63,7 @@ ) from fastdeploy.model_executor.pre_and_post_process import ( + limit_thinking_content_length, post_process, pre_process, rebuild_padding, @@ -323,6 +324,16 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = position_ids, request.get("max_tokens", 2048) ) + if request.get("reasoning_max_tokens") is not None: + assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." 
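
For reference, the block being added here reduces to a simple per-request rule: a request that carries `reasoning_max_tokens` gets that value as its thinking budget, anything else gets the `-1` sentinel meaning "no limit", and the per-slot status always restarts at `0`. A minimal plain-Python sketch of that rule (the helper name and tuple return are illustrative only, not part of this patch):

```
def thinking_slot_values(request: dict) -> tuple[int, int]:
    """Return (max_think_len, limit_think_status) for one request slot.

    -1 disables the thinking-length limit for the slot; status starts at 0.
    """
    reasoning_max_tokens = request.get("reasoning_max_tokens")
    if reasoning_max_tokens is None:
        return -1, 0
    assert reasoning_max_tokens >= 0, "reasoning_max_tokens in requests need >= 0."
    return reasoning_max_tokens, 0


# thinking_slot_values({"reasoning_max_tokens": 512}) -> (512, 0)
# thinking_slot_values({})                            -> (-1, 0)
```
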
+ # Enable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + else: + # Disable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1 + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + if isinstance(request.prompt_token_ids, np.ndarray): prompt_token_ids = request.prompt_token_ids.tolist() else: @@ -546,6 +557,16 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + if request.get("reasoning_max_tokens") is not None: + assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." + # Enable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + else: + # Disable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1 + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) if res is not None: @@ -833,16 +854,15 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["kv_num_blocks_x_cpu"] = None # CPU self.share_inputs["max_len_kv_cpu"] = None # CPU - # Initialize rotary position embedding - tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) - # Initialize thinking related buffers + self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") + self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - # TODO(gongshaotian): move to models + # Initialize rotary position embedding if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, - position_ids=tmp_position_ids, + position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)), base=self.model_config.rope_theta, model_config=self.model_config, partial_rotary_factor=self.model_config.partial_rotary_factor, @@ -1737,6 +1757,21 @@ class at the server level, which is too granular for ModelRunner. group=self.parallel_config.tp_group, ) + if self.model_config.think_end_id > 0 and not self.speculative_decoding: + assert ( + sampler_output is not None + ), "Warning, limit thinking content length not support speculative decoding." + assert self.model_config.line_break_id > 0 + limit_thinking_content_length( + limit_strategy="", # Temporary writing death + sampled_token_ids=sampler_output.sampled_token_ids, + max_think_lens=self.share_inputs["max_think_lens"], + step_idx=self.share_inputs["step_idx"], + limit_think_status=self.share_inputs["limit_think_status"], + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, + ) + # 5. Post Process model_output_data = ModelOutputData( next_tokens=self.share_inputs["next_tokens"], diff --git a/fastdeploy/worker/utils.py b/fastdeploy/worker/utils.py deleted file mode 100644 index 7a2562f248b..00000000000 --- a/fastdeploy/worker/utils.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import os -import traceback - - -def check_safetensors_model(model_dir: str): - """ - model_dir : the directory of the model - Check whether the model is safetensors format - """ - model_files = list() - all_files = os.listdir(model_dir) - for x in all_files: - if x.startswith("model") and x.endswith(".safetensors"): - model_files.append(x) - - is_safetensors = len(model_files) > 0 - if not is_safetensors: - return False - - if len(model_files) == 1 and model_files[0] == "model.safetensors": - return True - try: - # check all the file exists - safetensors_num = int(model_files[0].strip(".safetensors").split("-")[-1]) - flags = [0] * safetensors_num - for x in model_files: - current_index = int(x.strip(".safetensors").split("-")[1]) - flags[current_index - 1] = 1 - assert ( - sum(flags) == safetensors_num - ), f"Number of safetensor files should be {len(model_files)}, but now it's {sum(flags)}" - except Exception as e: - raise Exception(f"Failed to check unified checkpoint, details: {e}, {str(traceback.format_exc())}.") - return is_safetensors diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index b2dcc792024..21a854d5f17 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -40,7 +40,6 @@ PlasAttentionConfig, SpeculativeConfig, ) -from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus from fastdeploy.model_executor.layers.quantization import parse_quant_config @@ -115,25 +114,9 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]: def update_fd_config_for_mm(fd_config: FDConfig) -> None: architectures = fd_config.model_config.architectures if fd_config.model_config.enable_mm and ErnieArchitectures.contains_ernie_arch(architectures): - tokenizer = Ernie4_5Tokenizer.from_pretrained( - fd_config.model_config.model, - model_max_length=fd_config.model_config.max_model_len, - padding_side="right", - use_fast=False, - ) - tokenizer.ignored_index = -100 - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.unk_token - fd_config.model_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size fd_config.model_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank - vision_config = fd_config.model_config.vision_config - vision_config.dtype = fd_config.model_config.dtype - # vision_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size - # vision_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank - fd_config.model_config.im_patch_id = tokenizer.get_vocab()["<|IMAGE_PLACEHOLDER|>"] - fd_config.model_config.think_end_id = tokenizer.get_vocab()[""] - fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel + fd_config.model_config.vision_config.dtype = fd_config.model_config.dtype class PaddleDisWorkerProc: @@ -573,6 +556,8 @@ def parse_args(): ) parser.add_argument("--ori_vocab_size", 
type=int, default=None) parser.add_argument("--think_end_id", type=int, default=-1) + parser.add_argument("--image_patch_id", type=int, default=-1) + parser.add_argument("--line_break_id", type=int, default=-1) parser.add_argument( "--quantization", diff --git a/tests/ce/server/test_completions.py b/tests/ce/server/test_completions.py index 5fff0ae1251..188586d7d43 100644 --- a/tests/ce/server/test_completions.py +++ b/tests/ce/server/test_completions.py @@ -52,7 +52,6 @@ def test_completion_echo_stream_one_prompt_rti(): payload = build_request_payload(TEMPLATE, data) resp = send_request(COMPLETIONS_URL, payload, stream=True) - last_data = None # 初始化计数器 counter = 0 second_data = None @@ -87,7 +86,6 @@ def test_completion_echo_stream_one_prompt(): payload = build_request_payload(TEMPLATE, data) resp = send_request(COMPLETIONS_URL, payload, stream=True) - last_data = None # 初始化计数器 counter = 0 second_data = None @@ -123,10 +121,6 @@ def test_completion_echo_stream_more_prompt(): payload = build_request_payload(TEMPLATE, data) resp = send_request(COMPLETIONS_URL, payload, stream=True) - last_data = None - # 初始化计数器 - counter = 0 - second_data = None # 初始化字典来存储每个index的第二包数据 second_data_by_index = {0: None, 1: None} # 初始化字典来记录每个index的包计数 From 1912e72171a8cd6f22285fdcc6973cb1a1d06a90 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 16:26:07 +0800 Subject: [PATCH 04/24] fix --- fastdeploy/entrypoints/engine_client.py | 2 +- fastdeploy/model_executor/pre_and_post_process.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index eb2b8f297c1..7a23525530f 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -68,7 +68,7 @@ def __init__( enable_prefix_caching=None, splitwise_role=None, ): - model_config = ModelConfig({"model": model_name_or_path}).architectures[0] + model_config = ModelConfig({"model": model_name_or_path}) input_processor = InputPreprocessor( model_config, reasoning_parser, diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 384c893f3fc..70ae261c525 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -672,3 +672,5 @@ def limit_thinking_content_length( think_end_id, line_break_id, ) + else: + raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") From fe0fee884b211ae8c60421c6c3a57b399dea9134 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 17:03:37 +0800 Subject: [PATCH 05/24] fix --- fastdeploy/config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 400807339a2..a4d055a5620 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -227,10 +227,6 @@ def __init__( self.im_patch_id = args.get("image_patch_id", -1) self.line_break_id = args.get("line_break_id", -1) - self.override_name_from_config() - self.read_from_env() - self.read_model_config() - @property def registry(self): from fastdeploy.model_executor.models.model_base import ModelRegistry @@ -506,6 +502,9 @@ def __post_init__(self): self._architecture = arch self.pooler_config = self._init_pooler_config() + self.override_name_from_config() + self.read_from_env() + self.read_model_config() class ParallelConfig: From 81674a5cdd07a59175c84e5f476195f48915661d Mon Sep 17 00:00:00 2001 From: yuanlehome 
Date: Mon, 13 Oct 2025 19:26:47 +0800 Subject: [PATCH 06/24] fix --- fastdeploy/config.py | 80 +++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index a4d055a5620..15c3957c7fb 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -227,6 +227,47 @@ def __init__( self.im_patch_id = args.get("image_patch_id", -1) self.line_break_id = args.get("line_break_id", -1) + self._post_init() + + def _post_init(self): + self.is_unified_ckpt = check_unified_ckpt(self.model) + self.runner_type = self._get_runner_type(self.architectures, self.runner) + self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert) + registry = self.registry + is_generative_model = registry.is_text_generation_model(self.architectures, self) + is_pooling_model = registry.is_pooling_model(self.architectures, self) + is_multimodal_model = registry.is_multimodal_model(self.architectures, self) + self.is_reasoning_model = registry.is_reasoning_model(self.architectures, self) + + self.enable_mm = is_multimodal_model + + if self.runner_type == "generate" and not is_generative_model: + if is_multimodal_model: + pass + else: + generate_converts = _RUNNER_CONVERTS["generate"] + if self.convert_type not in generate_converts: + raise ValueError("This model does not support '--runner generate.") + if self.runner_type == "pooling" and not is_pooling_model: + pooling_converts = _RUNNER_CONVERTS["pooling"] + if self.convert_type not in pooling_converts: + convert_option = "<" + "|".join(pooling_converts) + ">" + raise ValueError( + "This model does not support `--runner pooling`. " + f"You can pass `--convert {convert_option} to adapt " + "it into a pooling model." 
+ ) + + self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) + model_info, arch = registry.inspect_model_cls(self.architectures, self) + self._model_info = model_info + self._architecture = arch + + self.pooler_config = self._init_pooler_config() + self.override_name_from_config() + self.read_from_env() + self.read_model_config() + @property def registry(self): from fastdeploy.model_executor.models.model_base import ModelRegistry @@ -467,45 +508,6 @@ def print(self): logger.info("{:<20}:{:<6}{}".format(k, "", v)) logger.info("=============================================================") - def __post_init__(self): - self.is_unified_ckpt = check_unified_ckpt(self.model) - self.runner_type = self._get_runner_type(self.architectures, self.runner) - self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert) - registry = self.registry - is_generative_model = registry.is_text_generation_model(self.architectures, self) - is_pooling_model = registry.is_pooling_model(self.architectures, self) - is_multimodal_model = registry.is_multimodal_model(self.architectures, self) - self.is_reasoning_model = registry.is_reasoning_model(self.architectures, self) - - self.enable_mm = is_multimodal_model - - if self.runner_type == "generate" and not is_generative_model: - if is_multimodal_model: - pass - else: - generate_converts = _RUNNER_CONVERTS["generate"] - if self.convert_type not in generate_converts: - raise ValueError("This model does not support '--runner generate.") - if self.runner_type == "pooling" and not is_pooling_model: - pooling_converts = _RUNNER_CONVERTS["pooling"] - if self.convert_type not in pooling_converts: - convert_option = "<" + "|".join(pooling_converts) + ">" - raise ValueError( - "This model does not support `--runner pooling`. " - f"You can pass `--convert {convert_option} to adapt " - "it into a pooling model." 
- ) - - self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) - model_info, arch = registry.inspect_model_cls(self.architectures, self) - self._model_info = model_info - self._architecture = arch - - self.pooler_config = self._init_pooler_config() - self.override_name_from_config() - self.read_from_env() - self.read_model_config() - class ParallelConfig: """Configuration for the distributed execution.""" From 3282a2fdadc3e5c24330bb798c33f36c9db13e7a Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 14 Oct 2025 11:53:11 +0800 Subject: [PATCH 07/24] update --- fastdeploy/config.py | 17 ----------------- fastdeploy/envs.py | 2 ++ fastdeploy/worker/gpu_model_runner.py | 2 +- fastdeploy/worker/worker_process.py | 3 --- 4 files changed, 3 insertions(+), 21 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 15c3957c7fb..bcccd80e09b 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1305,21 +1305,6 @@ def print(self): logger.info("=============================================================") -class DecodingConfig: - """ - Configuration for decoding - """ - - def __init__( - self, - args, - ): - self.pad_token_id = None - for key, value in args.items(): - if hasattr(self, key): - setattr(self, key, value) - - class CommitConfig: """ Configuration for tracking version information from version.txt @@ -1392,7 +1377,6 @@ def __init__( commit_config: CommitConfig = CommitConfig(), scheduler_config: SchedulerConfig = None, device_config: DeviceConfig = None, - decoding_config: DecodingConfig = None, quant_config: QuantConfigBase = None, graph_opt_config: GraphOptimizationConfig = None, plas_attention_config: PlasAttentionConfig = None, @@ -1423,7 +1407,6 @@ def __init__( self.quant_config: Optional[QuantConfigBase] = quant_config self.graph_opt_config: Optional[GraphOptimizationConfig] = graph_opt_config self.early_stop_config: Optional[EarlyStopConfig] = early_stop_config - self.decoding_config: DecodingConfig = decoding_config # type: ignore self.cache_config: CacheConfig = cache_config # type: ignore self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config # Initialize cuda graph capture list diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index bb5bee9680c..68f860719e5 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -118,6 +118,8 @@ "FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))), # Whether to clear cpu cache when clearing model weights. "FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")), + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), } diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index d33116d45ed..bc3f98f87cb 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1763,7 +1763,7 @@ class at the server level, which is too granular for ModelRunner. ), "Warning, limit thinking content length not support speculative decoding." 
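
The hunk continuing below swaps the temporarily hardcoded `limit_strategy` for the new `FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR` environment variable. Roughly, that strategy string selects which truncation op runs, as sketched here; the two strategy constants are placeholders for the model-specific values (ernie4_5_vl vs ernie_x1), and the function is illustrative rather than code added by this series:

```
import os

# Placeholders only: the real strategy strings are model specific and are the
# accepted values of FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR.
ERNIE_VL_STRATEGY = "<ernie4_5_vl-truncate-str>"
ERNIE_X1_STRATEGY = "<ernie_x1-truncate-str>"


def pick_truncate_op(limit_strategy: str) -> str:
    """Mirror the dispatch in limit_thinking_content_length()."""
    if limit_strategy == ERNIE_VL_STRATEGY:
        return "limit_thinking_content_length_v1"  # single think-end token injection
    if limit_strategy == ERNIE_X1_STRATEGY:
        return "limit_thinking_content_length_v2"  # multi-step line-break/think-end injection
    raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.")


# Assumed here: the env var's default maps to the ernie4_5_vl (v1) strategy.
strategy = os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ERNIE_VL_STRATEGY)
```
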
assert self.model_config.line_break_id > 0 limit_thinking_content_length( - limit_strategy="", # Temporary writing death + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, sampled_token_ids=sampler_output.sampled_token_ids, max_think_lens=self.share_inputs["max_think_lens"], step_idx=self.share_inputs["step_idx"], diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 21a854d5f17..e63896ce220 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -28,7 +28,6 @@ from fastdeploy import envs from fastdeploy.config import ( CacheConfig, - DecodingConfig, DeviceConfig, EarlyStopConfig, ErnieArchitectures, @@ -688,7 +687,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: paddle.set_default_dtype(args.dtype) model_config = ModelConfig(vars(args)) device_config = DeviceConfig(vars(args)) - decoding_config = DecodingConfig(vars(args)) speculative_config = SpeculativeConfig(args.speculative_config) parallel_config = ParallelConfig(vars(args)) cache_config = CacheConfig(vars(args)) @@ -787,7 +785,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: speculative_config=speculative_config, device_config=device_config, load_config=load_config, - decoding_config=decoding_config, quant_config=quant_config, graph_opt_config=graph_opt_config, early_stop_config=early_stop_config, From 6f1f082c2156053d1caf0b59d962880044274f88 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 14 Oct 2025 18:24:19 +0800 Subject: [PATCH 08/24] fix set_value_by_flags_and_idx --- custom_ops/gpu_ops/set_value_by_flags_and_idx.cu | 2 +- .../speculate_decoding/speculate_set_value_by_flags_and_idx.cu | 2 +- docs/usage/environment_variables.md | 3 +++ docs/zh/usage/environment_variables.md | 3 +++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu index 9e7a0ce1134..39181683038 100644 --- a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu @@ -35,7 +35,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1]; } else { // decoedr, get first token diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu index 4b1c7747e9c..a53a4eda641 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu @@ -37,7 +37,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped // printf("step_idx[tid] %d\n", step_idx[tid]); - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { for (int i = 0; i < accept_num[tid]; i++) { pre_ids_all_now[step_idx[tid] - i] = accept_tokens_now[accept_num[tid] - 1 - i]; diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md index 103ff1e5ef2..aa2ac7ad2d8 100644 --- 
a/docs/usage/environment_variables.md +++ b/docs/usage/environment_variables.md @@ -78,5 +78,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to use Machete for wint4 dense GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"), + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), + } ``` diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md index c1289bf984d..e6b9900bd27 100644 --- a/docs/zh/usage/environment_variables.md +++ b/docs/zh/usage/environment_variables.md @@ -77,5 +77,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # 是否使用 Machete 后端的 wint4 GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"), + + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), } ``` From 31aa8eefdfe9f1b34c7c2da8ae21ade5352aed2e Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 11:03:51 +0800 Subject: [PATCH 09/24] fix --- fastdeploy/model_executor/pre_and_post_process.py | 1 + fastdeploy/worker/gpu_model_runner.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 70ae261c525..f0c4c7011e4 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -664,6 +664,7 @@ def limit_thinking_content_length( ) elif limit_strategy == "\n\n\n": # for ernie_x1 + assert line_break_id > 0 limit_thinking_content_length_v2( sampled_token_ids, max_think_lens, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index f1e4d3b803b..6fe1620ac3f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1758,7 +1758,6 @@ class at the server level, which is too granular for ModelRunner. assert ( sampler_output is not None ), "Warning, limit thinking content length not support speculative decoding." 
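
For readers following the kernel changes further down, the per-slot logic of `limit_thinking_content_length_v1` boils down to a three-state machine (0 = still thinking, 1 = think-end token force-injected, 2 = responding). A plain-Python rendering of one decoding step for one slot (scalar signature and helper name are for illustration; the real op updates the whole batch in place on the GPU):

```
def limit_thinking_v1_step(next_token: int, step: int, max_think_len: int,
                           status: int, think_end_id: int) -> tuple[int, int]:
    """One slot, one decoding step of the v1 thinking-length limiter."""
    if max_think_len < 0 or status == 2:           # limit disabled, or already in the answer phase
        return next_token, status
    if status < 1 and step >= max_think_len:       # budget used up: force the think-end token
        next_token = think_end_id
        status = 1
    if status < 2 and next_token == think_end_id:  # model-emitted or forced think-end closes thinking
        status = 2
    return next_token, status
```

The v2 variant used for ernie_x1 additionally takes `line_break_id` and spends a few consecutive steps splicing in line-break and think-end tokens before switching to the answer phase, which is why the `line_break_id > 0` assert moves out of this call site and into that branch of the dispatcher.
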
- assert self.model_config.line_break_id > 0 limit_thinking_content_length( limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, sampled_token_ids=sampler_output.sampled_token_ids, From bc60b268abf016cd52276d4beb0ab3f92483dccc Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 16:03:15 +0800 Subject: [PATCH 10/24] fix --- custom_ops/gpu_ops/cpp_extensions.cc | 17 ++ .../limit_thinking_content_length_v1.cu | 114 ++++++------- .../limit_thinking_content_length_v2.cu | 157 ++++++++---------- .../model_executor/pre_and_post_process.py | 4 +- 4 files changed, 139 insertions(+), 153 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 57d6201efd9..d7c6bb44e5e 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -898,6 +898,19 @@ void SaveOutMmsgStatic(const paddle::Tensor& x, int64_t rank_id, bool save_each_rank); +void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens, + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id); + +void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id, + const int64_t line_break_id); + PYBIND11_MODULE(fastdeploy_ops, m) { m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"), @@ -1286,4 +1299,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("min_p_sampling", &MinPSamplingFromProbs, "min_p_sampling function"); m.def("save_output", &SaveOutMmsgStatic, "save_output function"); + + m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "save_output function"); + + m.def("limit_thinking_content_length_v2", &LimitThinkingContentLengthV2, "save_output function"); } diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu index e16e6dc66eb..d4c494b53a4 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu @@ -18,82 +18,70 @@ __global__ void limit_thinking_content_length_kernel_v1( int64_t *next_tokens, const int *max_think_lens, - const int64_t *step_idx, // step_idx 不再需要被修改,改为 const + const int64_t *step_idx, int *limit_think_status, const int64_t think_end_id, - const int bs) -{ - int bid = threadIdx.x; - if (bid >= bs) - return; + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; - // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 - const int max_think_len = max_think_lens[bid]; - if (max_think_len < 0) - return; - int current_limit_content_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. - if (current_limit_content_status == 2) - { - return; - } + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. 
+ if (current_limit_think_status == 2) { + return; + } - int64_t next_token = next_tokens[bid]; - const int64_t step = step_idx[bid]; + int64_t next_token = next_tokens[bid]; + const int64_t step = step_idx[bid]; - // ======================= 思考阶段控制 ======================= - // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 - if (current_limit_content_status < 1) - { - // 当开启思考长度控制时,检查是否超时 - if (step >= max_think_len) - { - // 强制将当前token替换为结束思考的token - next_token = think_end_id; - // 将状态推进到 1, 表示 "正在结束思考" - current_limit_content_status = 1; + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + if (current_limit_think_status < 1) { + // 当开启思考长度控制时,检查是否超时 + if (step >= max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + // 将状态推进到 1, 表示 "正在结束思考" + current_limit_think_status = 1; + } } - } - // ======================= 思考结束处理 ======================= - // 阶段 2: 检查是否已满足结束思考的条件 (status < 2) - // 这种情况会处理两种场景: - // 1. status == 0: 模型自己生成了 think_end_id - // 2. status == 1: 上一阶段强制注入了 think_end_id - if (current_limit_content_status < 2) - { - if (next_token == think_end_id) - { - // 确认思考结束,将状态推进到 2 (响应阶段) - current_limit_content_status = 2; + // ======================= 思考结束处理 ======================= + // 阶段 2: 检查是否已满足结束思考的条件 (status < 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型自己生成了 think_end_id + // 2. status == 1: 上一阶段强制注入了 think_end_id + if (current_limit_think_status < 2) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 2 (响应阶段) + current_limit_think_status = 2; + } } - } - // 写回更新后的 token - next_tokens[bid] = next_token; - // 更新全局状态 - limit_think_status[bid] = current_limit_content_status; + // 写回更新后的 token + next_tokens[bid] = next_token; + // 更新全局状态 + limit_think_status[bid] = current_limit_think_status; } void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens, - const paddle::Tensor &max_think_lens, - const paddle::Tensor &step_idx, - const paddle::Tensor &limit_think_status, - const int64_t think_end_id) -{ - const int batch_size = next_tokens.shape()[0]; - limit_thinking_content_length_kernel_v1<<<1, 1024>>>( - const_cast(next_tokens.data()), - max_think_lens.data(), - step_idx.data(), - const_cast(limit_think_status.data()), - think_end_id, - batch_size); + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id) { + const int batch_size = next_tokens.shape()[0]; + limit_thinking_content_length_kernel_v1<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + step_idx.data(), + const_cast(limit_think_status.data()), + think_end_id, + batch_size); } PD_BUILD_OP(limit_thinking_content_length_v1) - .Inputs({"next_tokens", - "max_think_lens", - "step_idx", - "limit_think_status"}) + .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"}) .Attrs({"think_end_id: int64_t"}) .Outputs({"next_tokens_out"}) .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu index fdac1cff5d0..c28073ba74e 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu @@ -22,108 +22,89 @@ __global__ void limit_thinking_content_length_kernel_v2( int64_t *next_tokens, const int *max_think_lens, - const int64_t *step_idx, // step_idx 不再需要被修改,改为 const + const int64_t *step_idx, // step_idx 不再需要被修改,改为 const int 
*limit_think_status, const int64_t think_end_id, const int64_t line_break_id, - const int bs) -{ - int bid = threadIdx.x; - if (bid >= bs) - return; - // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 - const int max_think_len = max_think_lens[bid]; - if (max_think_len < 0) - return; - int current_limit_content_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. - if (current_limit_content_status == 3) - { - return; - } + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. + if (current_limit_think_status == 3) { + return; + } - int64_t next_token = next_tokens[bid]; - const int64_t step = step_idx[bid]; + int64_t next_token = next_tokens[bid]; + const int64_t step = step_idx[bid]; - // ======================= 思考阶段控制 ======================= - // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 - // 阶段 2: 在替换 (status == 1), 检查是否替换结束 - if (current_limit_content_status <= 1) - { - // 当开启思考长度控制时,检查是否超时 - if (step == max_think_len) - { - // 强制将当前token替换为结束思考的token - next_token = line_break_id; - current_limit_content_status = 1; - } - else if (step == max_think_len + 1) - { - // 强制将当前token替换为结束思考的token - next_token = think_end_id; - current_limit_content_status = 1; - } - else if (step == max_think_len + 2) - { - // 强制将当前token替换为结束思考的token - next_token = line_break_id; - current_limit_content_status = 1; + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + // 阶段 2: 在替换 (status == 1), 检查是否替换结束 + if (current_limit_think_status <= 1) { + // 当开启思考长度控制时,检查是否超时 + if (step == max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + } else if (step == max_think_len + 1) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + current_limit_think_status = 1; + } else if (step == max_think_len + 2) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + } else if (step == max_think_len + 3) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + // 将状态推进到 1, 表示 "正在结束思考" + current_limit_think_status = 2; + } } - else if (step == max_think_len + 3) - { - // 强制将当前token替换为结束思考的token - next_token = line_break_id; - // 将状态推进到 1, 表示 "正在结束思考" - current_limit_content_status = 2; + // ======================= 思考结束处理 ======================= + // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型可能自己生成了 + // 2. status == 2: 上一阶段强制注入了 \n\n\n + if (current_limit_think_status == 0) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; + } } - } - // ======================= 思考结束处理 ======================= - // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) - // 这种情况会处理两种场景: - // 1. status == 0: 模型可能自己生成了 - // 2. 
status == 2: 上一阶段强制注入了 \n\n\n - if (current_limit_content_status == 0) - { - if (next_token == think_end_id) - { - // 确认思考结束,将状态推进到 3 (响应阶段) - current_limit_content_status = 3; + if (current_limit_think_status == 2) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; } - } - if (current_limit_content_status == 2) - { - // 确认思考结束,将状态推进到 3 (响应阶段) - current_limit_content_status = 3; - } - // 写回更新后的 token - next_tokens[bid] = next_token; - // 更新全局状态 - limit_think_status[bid] = current_limit_content_status; + // 写回更新后的 token + next_tokens[bid] = next_token; + // 更新全局状态 + limit_think_status[bid] = current_limit_think_status; } void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, - const paddle::Tensor &max_think_lens, - const paddle::Tensor &step_idx, - const paddle::Tensor &limit_think_status, - const int64_t think_end_id, - const int64_t line_break_id) -{ - const int batch_size = next_tokens.shape()[0]; - limit_thinking_content_length_kernel_v2<<<1, 1024>>>( - const_cast(next_tokens.data()), - max_think_lens.data(), - step_idx.data(), - const_cast(limit_think_status.data()), - think_end_id, - line_break_id, - batch_size); + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id, + const int64_t line_break_id) { + const int batch_size = next_tokens.shape()[0]; + limit_thinking_content_length_kernel_v2<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + step_idx.data(), + const_cast(limit_think_status.data()), + think_end_id, + line_break_id, + batch_size); } PD_BUILD_OP(limit_thinking_content_length_v2) - .Inputs({"next_tokens", - "max_think_lens", - "step_idx", - "limit_think_status"}) + .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"}) .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"}) .Outputs({"next_tokens_out"}) .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index f0c4c7011e4..e0625fccd8a 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -42,8 +42,6 @@ elif current_platform.is_dcu(): from fastdeploy.model_executor.ops.gpu import ( get_padding_offset, - limit_thinking_content_length_v1, - limit_thinking_content_length_v2, save_output, set_stop_value_multi_ends, step_paddle, @@ -81,6 +79,8 @@ step_reschedule, update_inputs_v1, speculate_step_reschedule, + limit_thinking_content_length_v1, + limit_thinking_content_length_v2, ) from fastdeploy.inter_communicator import ZmqIpcClient From 61d9b72e213c2c81003bdd5f071da2c3827b9fdf Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 16:04:51 +0800 Subject: [PATCH 11/24] fix --- custom_ops/gpu_ops/cpp_extensions.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 898d3e2ccdc..bdd3c231e9d 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -1305,7 +1305,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("save_output", &SaveOutMmsgStatic, "save_output function"); - m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "save_output function"); + m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "limit_thinking_content_length_v1 function"); - m.def("limit_thinking_content_length_v2", 
&LimitThinkingContentLengthV2, "save_output function"); + m.def("limit_thinking_content_length_v2", &LimitThinkingContentLengthV2, "limit_thinking_content_length_v2 function"); } From 06b5441262b5698682cef42b14a46a87065be3fa Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 18:09:05 +0800 Subject: [PATCH 12/24] fix --- .../input/ernie4_5_vl_processor/ernie4_5_vl_processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index 9251dd9d955..61e02828db9 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -255,7 +255,11 @@ def process_request_dict(self, request, max_model_len=None): else: request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"]) if request.get("reasoning_max_tokens") is None: - request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) + if request.get("enable_thinking"): + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) + else: + if not request.get("enable_thinking"): + request["reasoning_max_tokens"] = None data_processor_logger.info(f"Processed request {request}") return request From 36ed90d521729d1394233e7defbd676d2688edfa Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 16:47:08 +0800 Subject: [PATCH 13/24] update --- fastdeploy/worker/gpu_model_runner.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 63c5e2423ee..cc16cef2014 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1744,6 +1744,16 @@ class at the server level, which is too granular for ModelRunner. group=self.parallel_config.tp_group, ) + if self.model_config.think_end_id > 0: + limit_thinking_content_length( + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, + sampled_token_ids=sampler_output.sampled_token_ids, + max_think_lens=self.share_inputs["max_think_lens"], + step_idx=self.share_inputs["step_idx"], + limit_think_status=self.share_inputs["limit_think_status"], + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, + ) else: self.sampler( logits, @@ -1774,20 +1784,6 @@ class at the server level, which is too granular for ModelRunner. group=self.parallel_config.tp_group, ) - if self.model_config.think_end_id > 0 and not self.speculative_decoding: - assert ( - sampler_output is not None - ), "Warning, limit thinking content length not support speculative decoding." - limit_thinking_content_length( - limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, - sampled_token_ids=sampler_output.sampled_token_ids, - max_think_lens=self.share_inputs["max_think_lens"], - step_idx=self.share_inputs["step_idx"], - limit_think_status=self.share_inputs["limit_think_status"], - think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, - ) - # 5. 
Post Process model_output_data = ModelOutputData( next_tokens=self.share_inputs["next_tokens"], From 2f8aa118f924682caa3f8e4d1ec6c5cd62c520ab Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 17:41:05 +0800 Subject: [PATCH 14/24] fix --- custom_ops/gpu_ops/set_value_by_flags_and_idx.cu | 2 +- .../speculate_decoding/speculate_set_value_by_flags_and_idx.cu | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu index 39181683038..9e7a0ce1134 100644 --- a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu @@ -35,7 +35,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped - if (step_idx[tid] > 0) { + if (step_idx[tid] >= 0) { if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1]; } else { // decoedr, get first token diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu index 316604c7363..d1ee733fecf 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu @@ -29,6 +29,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, int length, int max_draft_tokens) { int tid = threadIdx.x; + if (tid < bs) { if (!stop_flags[tid]) { int64_t *pre_ids_all_now = pre_ids_all + tid * length; @@ -37,7 +38,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped - if (step_idx[tid] > 0) { + if (step_idx[tid] >= 0) { for (int i = 0; i < accept_num[tid]; i++) { pre_ids_all_now[step_idx[tid] - i] = accept_tokens_now[accept_num[tid] - 1 - i]; From 324d17ed995deaca60259d47a644a4666f4a1d46 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 21:08:44 +0800 Subject: [PATCH 15/24] fix --- custom_ops/gpu_ops/set_value_by_flags_and_idx.cu | 2 +- .../speculate_decoding/speculate_set_value_by_flags_and_idx.cu | 2 +- fastdeploy/worker/gpu_model_runner.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu index 9e7a0ce1134..39181683038 100644 --- a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu @@ -35,7 +35,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1]; } else { // decoedr, get first token diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu index d1ee733fecf..f28e8369394 100644 --- 
a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu @@ -38,7 +38,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { for (int i = 0; i < accept_num[tid]; i++) { pre_ids_all_now[step_idx[tid] - i] = accept_tokens_now[accept_num[tid] - 1 - i]; diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index cc16cef2014..3ba7f4d1544 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -282,6 +282,7 @@ def _init_logits_processor(self, request): return ( self.guided_backend.get_logits_processor( schemata_key=schemata_key, + enable_thinking=True, ), schemata_key, ) From 0710f34a2b0591bff427f800a3dfaf634662afcd Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 21:14:09 +0800 Subject: [PATCH 16/24] fix typo --- custom_ops/gpu_ops/limit_thinking_content_length_v2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu index c28073ba74e..a61dec8960d 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu @@ -22,7 +22,7 @@ __global__ void limit_thinking_content_length_kernel_v2( int64_t *next_tokens, const int *max_think_lens, - const int64_t *step_idx, // step_idx 不再需要被修改,改为 const + const int64_t *step_idx, int *limit_think_status, const int64_t think_end_id, const int64_t line_break_id, From 141608fc4a353f9c2bdf996d6681c8d1f4b32e15 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 17 Oct 2025 11:32:19 +0800 Subject: [PATCH 17/24] fix ci --- fastdeploy/engine/common_engine.py | 200 ++++++++---------- fastdeploy/entrypoints/engine_client.py | 4 +- .../model_executor/models/interfaces_base.py | 54 +++++ fastdeploy/worker/gpu_model_runner.py | 2 - .../test_set_value_by_flags_and_idx.py | 2 +- 5 files changed, 148 insertions(+), 114 deletions(-) create mode 100644 fastdeploy/model_executor/models/interfaces_base.py diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 0c3545fc1f6..c3a936b0e3f 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -60,78 +60,78 @@ class EngineService: Base class containing common engine functionality """ - def __init__(self, fd_config, start_queue=True): + def __init__(self, cfg, start_queue=True): """ Initializes the LLMEngine with the provided configuration. Args: - fd_config (FDConfig): Config object containing all the configuration parameters. + cfg (Config): Config object containing all the configuration parameters. 
""" - self.fd_config = fd_config - if fd_config.scheduler_config.splitwise_role != "mixed" or fd_config.cache_config.enable_prefix_caching: - if isinstance(self.fd_config.cache_config.cache_queue_port, str): - self.fd_config.cache_config.cache_queue_port = self.fd_config.cache_config.cache_queue_port.split(",") - if isinstance(self.fd_config.cache_config.cache_queue_port, list): - self.fd_config.cache_config.cache_queue_port = int( - self.fd_config.cache_config.cache_queue_port[self.fd_config.parallel_config.local_data_parallel_id] + self.cfg = cfg + if cfg.scheduler_config.splitwise_role != "mixed" or cfg.cache_config.enable_prefix_caching: + if isinstance(self.cfg.cache_config.cache_queue_port, str): + self.cfg.cache_config.cache_queue_port = self.cfg.cache_config.cache_queue_port.split(",") + if isinstance(self.cfg.cache_config.cache_queue_port, list): + self.cfg.cache_config.cache_queue_port = int( + self.cfg.cache_config.cache_queue_port[self.cfg.parallel_config.local_data_parallel_id] ) - if self.fd_config.parallel_config.enable_expert_parallel: + if self.cfg.parallel_config.enable_expert_parallel: self.llm_logger = get_logger( - "fastdeploy", f"fastdeploy_rank{self.fd_config.parallel_config.local_data_parallel_id}.log" + "fastdeploy", f"fastdeploy_rank{self.cfg.parallel_config.local_data_parallel_id}.log" ) else: self.llm_logger = llm_logger - self.scheduler = fd_config.scheduler_config.scheduler() + self.scheduler = cfg.scheduler_config.scheduler() self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1" if envs.ENABLE_V1_KVCACHE_SCHEDULER: self.resource_manager = ResourceManagerV1( - fd_config.scheduler_config.max_num_seqs, - fd_config, - fd_config.parallel_config.tensor_parallel_size, - fd_config.scheduler_config.splitwise_role, - fd_config.parallel_config.local_data_parallel_id, + cfg.scheduler_config.max_num_seqs, + cfg, + cfg.parallel_config.tensor_parallel_size, + cfg.scheduler_config.splitwise_role, + cfg.parallel_config.local_data_parallel_id, ) else: self.resource_manager = ResourceManager( - fd_config.scheduler_config.max_num_seqs, - fd_config, - fd_config.parallel_config.tensor_parallel_size, - fd_config.scheduler_config.splitwise_role, - fd_config.parallel_config.local_data_parallel_id, + cfg.scheduler_config.max_num_seqs, + cfg, + cfg.parallel_config.tensor_parallel_size, + cfg.scheduler_config.splitwise_role, + cfg.parallel_config.local_data_parallel_id, ) self.start_worker_queue_service(start_queue) - os.environ["INFERENCE_MSG_QUEUE_ID"] = self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id + os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.parallel_config.engine_worker_queue_port[ + self.cfg.parallel_config.local_data_parallel_id ] - self.split_connector = SplitwiseConnector(fd_config, self.engine_worker_queue, self.resource_manager) + self.split_connector = SplitwiseConnector(cfg, self.engine_worker_queue, self.resource_manager) self.waiting_requests = [] self.token_processor = TokenProcessor( - cfg=fd_config, + cfg=cfg, cached_generated_tokens=self.scheduler, engine_worker_queue=self.engine_worker_queue, split_connector=self.split_connector, ) self.token_processor.set_resource_manager(self.resource_manager) - self.partial_chunked_tokens = [0] * (self.fd_config.max_num_partial_prefills + 1) - for idx in range(1, self.fd_config.max_num_partial_prefills + 1): + self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1) + for idx in range(1, self.cfg.max_num_partial_prefills + 1): 
self.partial_chunked_tokens[idx] = ( - (self.fd_config.scheduler_config.max_num_batched_tokens // idx) - // self.fd_config.cache_config.block_size - * self.fd_config.cache_config.block_size + (self.cfg.scheduler_config.max_num_batched_tokens // idx) + // self.cfg.cache_config.block_size + * self.cfg.cache_config.block_size ) self.guided_decoding_checker = None - if self.fd_config.guided_decoding_backend != "off": + if self.cfg.structured_outputs_config.guided_decoding_backend != "off": self.guided_decoding_checker = schema_checker( - self.fd_config.guided_decoding_backend, - disable_any_whitespace=self.fd_config.disable_any_whitespace, + self.cfg.structured_outputs_config.guided_decoding_backend, + disable_any_whitespace=self.cfg.structured_outputs_config.disable_any_whitespace, ) self._init_worker_monitor_signals() @@ -146,14 +146,12 @@ def start(self): self.insert_task_to_worker_thread.start() self.token_processor.tasks_queue = self.engine_worker_queue self.token_processor.run() - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": self.split_mode_get_tasks() def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进程感知是否有新Task需要处理 current_suffix = int( - self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id - ] + self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] ) self.llm_logger.info(f"current_suffix: {current_suffix}") exist_task_signal_data = np.zeros([1], dtype=np.int32) @@ -187,8 +185,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 # worker_live_signal 用于engine感知各worker进程是否存活,记录每个step 时间 worker_healthy_live_recorded_time_array = np.zeros( - shape=[min(self.fd_config.worker_num_per_node, self.fd_config.parallel_config.tensor_parallel_size)], - dtype=np.int32, + shape=[min(self.cfg.worker_num_per_node, self.cfg.parallel_config.tensor_parallel_size)], dtype=np.int32 ) self.worker_healthy_live_signal = IPCSignal( name="worker_healthy_live_signal", @@ -198,7 +195,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - cache_ready_signal_data = np.zeros(shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32) + cache_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) self.cache_ready_signal = IPCSignal( name="cache_ready_signal", array=cache_ready_signal_data, @@ -207,9 +204,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - swap_space_ready_signal_data = np.zeros( - shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32 - ) + swap_space_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) self.swap_space_ready_signal = IPCSignal( name="swap_space_ready_signal", array=swap_space_ready_signal_data, @@ -250,53 +245,43 @@ def start_worker_queue_service(self, start_queue): start queue service for engine worker communication """ address = ( - self.fd_config.master_ip, - int( - self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id - ] - ), + self.cfg.master_ip, + int(self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]), ) - if start_queue and ( - self.fd_config.host_ip == self.fd_config.master_ip or self.fd_config.master_ip == "0.0.0.0" - ): + if start_queue and 
(self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"): self.llm_logger.info(f"Starting engine worker queue server service at {address}") self.engine_worker_queue_server = EngineWorkerQueue( address=address, is_server=True, - num_client=self.fd_config.parallel_config.tensor_parallel_size, - local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, + num_client=self.cfg.parallel_config.tensor_parallel_size, + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, ) - if ( - self.fd_config.cache_config.enable_prefix_caching - or self.fd_config.scheduler_config.splitwise_role != "mixed" - ): + if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed": self.cache_task_queue = EngineCacheQueue( address=( - self.fd_config.master_ip, - self.fd_config.cache_config.cache_queue_port, + self.cfg.master_ip, + self.cfg.cache_config.cache_queue_port, ), authkey=b"cache_queue_service", is_server=True, - num_client=self.fd_config.parallel_config.tensor_parallel_size, + num_client=self.cfg.parallel_config.tensor_parallel_size, client_id=-1, - local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, ) self.llm_logger.info( - f"local {min(self.fd_config.worker_num_per_node * self.fd_config.node_rank + self.fd_config.parallel_config.local_data_parallel_id,self.fd_config.parallel_config.data_parallel_size - 1)}" + f"local {min(self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id,self.cfg.parallel_config.data_parallel_size - 1)}" ) self.engine_worker_queue = EngineWorkerQueue( address=address, is_server=False, - num_client=self.fd_config.parallel_config.tensor_parallel_size, + num_client=self.cfg.parallel_config.tensor_parallel_size, client_id=0, - local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, local_data_parallel_id=min( - self.fd_config.worker_num_per_node * self.fd_config.node_rank - + self.fd_config.parallel_config.local_data_parallel_id, - self.fd_config.parallel_config.data_parallel_size - 1, + self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id, + self.cfg.parallel_config.data_parallel_size - 1, ), ) @@ -326,8 +311,8 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): cur_task.prompt_token_ids[0] = task.outputs.token_ids[0] cur_task.num_cached_tokens = task.num_cached_tokens if ( - self.fd_config.speculative_config.method in ["mtp"] - and self.fd_config.scheduler_config.splitwise_role == "decode" + self.cfg.speculative_config.method in ["mtp"] + and self.cfg.scheduler_config.splitwise_role == "decode" ): cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids) if task.error_code != 200: @@ -354,7 +339,7 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): need_delete_tasks = [] for task in tasks: - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": status, msg = self.split_connector.check_decode_allocated(task) if not status: self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.") @@ -410,12 +395,12 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): for task in tasks: task.inference_start_time = time.time() if not is_prefill: - if not self.fd_config.model_config.enable_mm: + if not 
self.cfg.model_config.enable_mm: self.update_requests_chunk_size(tasks) else: self.update_mm_requests_chunk_size(tasks) self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz)) - if is_prefill and self.fd_config.scheduler_config.name != "splitwise": + if is_prefill and self.cfg.scheduler_config.name != "splitwise": self.engine_worker_queue.available_prefill_instances.put(1) return True @@ -448,14 +433,14 @@ def update_tokens(idx, chunk_size, update_chunk=False): if current_request_size[idx] <= 0: chunk_request_num -= 1 - if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: return current_request_size = [request.prompt_token_ids_len for request in requests] requests_chunk = [[] for _ in range(len(requests))] chunk_request_num = len(current_request_size) while chunk_request_num >= 1: - remain_batched_tokens = self.fd_config.scheduler_config.max_num_batched_tokens + remain_batched_tokens = self.cfg.scheduler_config.max_num_batched_tokens for idx in range(len(current_request_size)): if current_request_size[idx] <= 0: continue @@ -465,16 +450,14 @@ def update_tokens(idx, chunk_size, update_chunk=False): ) update_tokens(idx, chunk_size) - while remain_batched_tokens >= self.fd_config.cache_config.block_size: + while remain_batched_tokens >= self.cfg.cache_config.block_size: # 当前 max_num_batched_tokens 还有剩余时,优先分配给较短的请求 waiting_requests = [input_lens for input_lens in current_request_size if input_lens > 0] if len(waiting_requests) == 0: break available_tokens = ( - remain_batched_tokens - // self.fd_config.cache_config.block_size - * self.fd_config.cache_config.block_size + remain_batched_tokens // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size ) append_idx = current_request_size.index(min(waiting_requests)) chunk_size = min( @@ -491,7 +474,7 @@ def update_mm_requests_chunk_size(self, requests): """ update each multimodal request's chunk size info """ - if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: return for request in requests: @@ -578,7 +561,7 @@ def _insert_task_to_worker(self): continue if hasattr(self, "exist_prefill_task_signal") and self.exist_prefill_task_signal.value[0] > 0: if ( - self.fd_config.scheduler_config.splitwise_role == "mixed" + self.cfg.scheduler_config.splitwise_role == "mixed" or self.split_connector.has_splitwise_tasks() ): time.sleep(0.005) @@ -592,15 +575,15 @@ def _insert_task_to_worker(self): num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.fd_config.max_prefill_batch, + self.cfg.max_prefill_batch, ) self.resource_manager.check_and_free_block_tables() tasks = self.scheduler.get_requests( available_blocks=self.resource_manager.available_block_num(), - block_size=self.fd_config.cache_config.block_size, - reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.fd_config.scheduler_config.max_num_batched_tokens, + block_size=self.cfg.cache_config.block_size, + reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.cfg.scheduler_config.max_num_batched_tokens, batch=num_prefill_batch, ) @@ -608,7 +591,7 @@ def _insert_task_to_worker(self): time.sleep(0.001) continue - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": 
self.llm_logger.info("Inserting splitwise tasks") self.split_connector.send_splitwise_tasks(tasks, current_id) @@ -637,21 +620,21 @@ def _fetch_request(): is_fetching = True num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.fd_config.max_prefill_batch, + self.cfg.max_prefill_batch, ) - if self.fd_config.model_config.enable_mm: + if self.cfg.model_config.enable_mm: available_blocks = self.resource_manager.available_block_num() else: - available_blocks = self.fd_config.cache_config.max_block_num_per_seq + available_blocks = self.cfg.cache_config.max_block_num_per_seq tasks = self.scheduler.get_requests( available_blocks=available_blocks, - block_size=self.fd_config.cache_config.block_size, - reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.fd_config.model_config.max_model_len, + block_size=self.cfg.cache_config.block_size, + reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.cfg.model_config.max_model_len, batch=num_prefill_batch, ) - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": for task in tasks: # assure can allocate block ids in P while not self.resource_manager.preallocate_resource_in_p(task): @@ -660,7 +643,7 @@ def _fetch_request(): self.split_connector.send_splitwise_tasks([task], task.idx) need_delete_tasks = [] for task in tasks: - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": # assure fetch block ids from D status, msg = self.split_connector.check_decode_allocated(task) if not status: @@ -681,7 +664,7 @@ def _fetch_request(): tasks.remove(tmp_task) # release resource in P self.resource_manager.prerelease_resource(tmp_task) - if self.fd_config.scheduler_config.splitwise_role == "prefill": + if self.cfg.scheduler_config.splitwise_role == "prefill": # to send cache info to cache messager if tasks: self.split_connector.send_cache_infos(tasks, 0) @@ -698,7 +681,7 @@ def _fetch_request(): time.sleep(0.001) # Fetch requests and add them to the scheduling queue if tasks: - if self.fd_config.scheduler_config.splitwise_role == "prefill": + if self.cfg.scheduler_config.splitwise_role == "prefill": self.resource_manager.add_request_in_p(tasks) else: for task in tasks: @@ -713,9 +696,10 @@ def _fetch_request(): if self.engine_worker_queue.num_tasks() > 0: time.sleep(0.001) continue - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": if not is_fetching: get_request_pool.submit(_fetch_request) + else: if ( len(self.resource_manager.waiting) == 0 @@ -727,7 +711,7 @@ def _fetch_request(): tasks = self.resource_manager.schedule() # 3. Send to engine if tasks: - if self.fd_config.scheduler_config.splitwise_role == "decode": + if self.cfg.scheduler_config.splitwise_role == "decode": for task in tasks: if task.task_type == RequestType.PREEMPTED: msg = f"{task.request_id} decode not enough blocks, need to be rescheduled." 
@@ -759,7 +743,7 @@ def start_zmq_service(self, api_server_pid=None): self.recv_request_server = ZmqTcpServer(port=envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT, mode=zmq.PULL) self.send_response_server = ZmqTcpServer(port=envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT, mode=zmq.ROUTER) self.internal_adapter = InternalAdapter( - cfg=self.fd_config, engine=self, dp_rank=self.fd_config.node_rank * self.fd_config.worker_num_per_node + cfg=self.cfg, engine=self, dp_rank=self.cfg.node_rank * self.cfg.worker_num_per_node ) else: self.recv_request_server = ZmqIpcServer(name=api_server_pid, mode=zmq.PULL) @@ -778,12 +762,12 @@ def start_zmq_service(self, api_server_pid=None): def _insert_zmq_task_to_scheduler(self): added_requests: Dict[str, int] = dict() if envs.FD_ENABLE_INTERNAL_ADAPTER: - if self.fd_config.scheduler_config.splitwise_role == "decode": + if self.cfg.scheduler_config.splitwise_role == "decode": return while self.running: try: block = True if len(added_requests) == 0 else False - if not self.fd_config.model_config.enable_mm: + if not self.cfg.model_config.enable_mm: err, data = self.recv_request_server.receive_json_once(block) else: err, data = self.recv_request_server.receive_pyobj_once(block) @@ -945,7 +929,7 @@ def receiver_loop(): else: self.insert_tasks(tasks, allocated=True) - if self.fd_config.innode_prefill_ports is not None: + if self.cfg.innode_prefill_ports is not None: self.scheduler.put_results(tasks) else: if len(self.waiting_requests): @@ -990,14 +974,12 @@ def receiver_loop(): def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor): return self.resource_manager.cache_manager.launch_cache_manager( - cache_config=self.fd_config.cache_config, - tensor_parallel_size=self.fd_config.parallel_config.tensor_parallel_size, + cache_config=self.cfg.cache_config, + tensor_parallel_size=self.cfg.parallel_config.tensor_parallel_size, device_ids=device_ids, - pod_ip=self.fd_config.master_ip, + pod_ip=self.cfg.master_ip, engine_worker_queue_port=int( - self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id - ] + self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] ), pid_suffix=ipc_signal_suffix, create_cache_tensor=create_cache_tensor, diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index d92a2568cc2..51d49d15fb9 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -256,8 +256,8 @@ def valid_parameters(self, data): raise ParameterError("max_tokens", f"max_tokens can be defined [1, {self.max_model_len}).") if data.get("reasoning_max_tokens") is not None: - if data["reasoning_max_tokens"] < 0: - raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 0") + if data["reasoning_max_tokens"] < 1: + raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 1") if data["reasoning_max_tokens"] > data["max_tokens"]: data["reasoning_max_tokens"] = data["max_tokens"] api_server_logger.warning( diff --git a/fastdeploy/model_executor/models/interfaces_base.py b/fastdeploy/model_executor/models/interfaces_base.py new file mode 100644 index 00000000000..b7ece5fe69a --- /dev/null +++ b/fastdeploy/model_executor/models/interfaces_base.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +from paddle import nn + + +def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool: + from .model_base import ModelForCasualLM + + return issubclass(model_cls, ModelForCasualLM) + + +def is_pooling_model(model_cls: Type[nn.Layer]) -> bool: + class_name = model_cls.__name__ + pooling_indicators = ["Embedding", "ForSequenceClassification"] + return ( + any(indicator in class_name for indicator in pooling_indicators) + or hasattr(model_cls, "is_embedding_model") + and model_cls.is_embedding_model + ) + + +def is_multimodal_model(class_name: str) -> bool: + multimodal_indicators = ["VL", "Vision", "ConditionalGeneration"] + return any(indicator in class_name for indicator in multimodal_indicators) + + +def determine_model_category(class_name: str): + from fastdeploy.model_executor.models.model_base import ModelCategory + + if any(pattern in class_name for pattern in ["VL", "Vision", "ConditionalGeneration"]): + return ModelCategory.MULTIMODAL + elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]): + return ModelCategory.EMBEDDING + return ModelCategory.TEXT_GENERATION + + +def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: + if model_cls is not None: + return getattr(model_cls, "default_pooling_type", "LAST") + return "LAST" diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 241a42c7ab6..6d51997ba21 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -345,7 +345,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = ) if request.get("reasoning_max_tokens") is not None: - assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." # Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 @@ -578,7 +577,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 if request.get("reasoning_max_tokens") is not None: - assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." 
# Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 diff --git a/tests/operators/test_set_value_by_flags_and_idx.py b/tests/operators/test_set_value_by_flags_and_idx.py index 6861ca218d0..aada9e2606f 100644 --- a/tests/operators/test_set_value_by_flags_and_idx.py +++ b/tests/operators/test_set_value_by_flags_and_idx.py @@ -34,7 +34,7 @@ def set_value_by_flags_and_idx_numpy( current_step_idx = step_idx[i] if seq_len_enc == 0 and seq_len_dec == 0: continue - if current_step_idx >= 0: + if current_step_idx > 0: if seq_len_enc > 0: token_idx = seq_len_enc - 1 token_to_assign = input_ids[i, token_idx] From 9bb46294552f9064afbcde3c784d57ec0e6525a4 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 17 Oct 2025 15:42:23 +0800 Subject: [PATCH 18/24] fix --- fastdeploy/worker/gpu_model_runner.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 832f57c014f..894aed8794a 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1458,10 +1458,6 @@ def _dummy_sampler_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) From 41ef32c9d8abd19a0a4ce86a7b168bace80c2e2c Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 17 Oct 2025 15:46:40 +0800 Subject: [PATCH 19/24] fix --- .../model_executor/models/interfaces_base.py | 21 ------------------- .../model_executor/models/model_base.py | 7 +------ 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/fastdeploy/model_executor/models/interfaces_base.py b/fastdeploy/model_executor/models/interfaces_base.py index 77533209d9b..bc222634427 100644 --- a/fastdeploy/model_executor/models/interfaces_base.py +++ b/fastdeploy/model_executor/models/interfaces_base.py @@ -26,31 +26,10 @@ T_co = TypeVar("T_co", default=paddle.Tensor, covariant=True) -def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool: - from .model_base import ModelForCasualLM - - return issubclass(model_cls, ModelForCasualLM) - - def is_pooling_model(model_cls: Type[nn.Layer]) -> bool: return getattr(model_cls, "is_pooling_model", False) -def is_multimodal_model(class_name: str) -> bool: - multimodal_indicators = ["VL", "Vision", "ConditionalGeneration"] - return any(indicator in class_name for indicator in multimodal_indicators) - - -def determine_model_category(class_name: str): - from fastdeploy.model_executor.models.model_base import ModelCategory - - if any(pattern in class_name for pattern in ["VL", "Vision", "ConditionalGeneration"]): - return ModelCategory.MULTIMODAL - elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]): - return ModelCategory.EMBEDDING - return ModelCategory.TEXT_GENERATION - - def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: if model_cls is not None: return getattr(model_cls, 
"default_pooling_type", "LAST") diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index 1ddd39c9a32..0ca56f343c1 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -26,12 +26,7 @@ iter_architecture_defaults, try_match_architecture_defaults, ) - - -def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: - if model_cls is not None: - return getattr(model_cls, "default_pooling_type", "LAST") - return "LAST" +from fastdeploy.model_executor.models.interfaces_base import get_default_pooling_type class ModelCategory(IntFlag): From 849eaa6395bddf020ec31841739f78f9937a0cf4 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 12:01:56 +0800 Subject: [PATCH 20/24] support mtp --- custom_ops/gpu_ops/cpp_extensions.cc | 23 +++ ...culate_limit_thinking_content_length_v1.cu | 132 +++++++++++++++ ...culate_limit_thinking_content_length_v2.cu | 159 ++++++++++++++++++ .../model_executor/pre_and_post_process.py | 153 +++++++++++++---- fastdeploy/worker/gpu_model_runner.py | 16 +- 5 files changed, 435 insertions(+), 48 deletions(-) create mode 100644 custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu create mode 100644 custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 40417b1e09a..e3c8f33ece5 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -916,6 +916,25 @@ void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, const int64_t think_end_id, const int64_t line_break_id); +void SpeculateLimitThinkingContentLengthV1( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id); + +void SpeculateLimitThinkingContentLengthV2( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id, + const int64_t line_break_id); + PYBIND11_MODULE(fastdeploy_ops, m) { m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"), @@ -1301,4 +1320,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "limit_thinking_content_length_v1 function"); m.def("limit_thinking_content_length_v2", &LimitThinkingContentLengthV2, "limit_thinking_content_length_v2 function"); + + m.def("speculate_limit_thinking_content_length_v1", &SpeculateLimitThinkingContentLengthV1, "speculate limit thinking content length function"); + + m.def("speculate_limit_thinking_content_length_v2", &SpeculateLimitThinkingContentLengthV2, "speculate limit thinking content length function"); } diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu new file mode 100644 index 00000000000..90e3760af4b --- /dev/null +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu @@ -0,0 +1,132 @@ +/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "helper.h" +#include "paddle/extension.h" + +__global__ void speculate_limit_thinking_content_length_kernel_v1( + int64_t* next_tokens, + const int* max_think_lens, + int64_t* step_idx, + int* limit_think_status, + int* accept_num, + int* seq_lens_decoder, + const int64_t think_end_id, + const int tokens_per_step, + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; + + const int original_accept_num = accept_num[bid]; + if (original_accept_num <= 0) return; + + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. + if (current_limit_think_status == 3) { + return; + } + + int new_accept_num = original_accept_num; + + const int64_t current_base_step = step_idx[bid] - original_accept_num + 1; + + for (int token_offset = 0; token_offset < original_accept_num; + token_offset++) { + const int token_idx = bid * tokens_per_step + token_offset; + int64_t next_token = next_tokens[token_idx]; + const int64_t current_step = current_base_step + token_offset; + + bool condition_triggered = false; + + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + // 阶段 2: 在替换 (status == 1), 检查是否替换结束 + if (current_limit_think_status < 1) { + // 当开启思考长度控制时,检查是否超时 + if (current_step >= max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } + } + + // ======================= 思考结束处理 ======================= + // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型可能自己生成了 + // 2. 
status == 2: 上一阶段强制注入了 + if (current_limit_think_status < 2) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 2 (响应阶段) + current_limit_think_status = 2; + } + } + + next_tokens[token_idx] = next_token; + + if (condition_triggered) { + new_accept_num = token_offset + 1; + break; + } + } + + // 更新全局状态 + int discarded_tokens = original_accept_num - new_accept_num; + if (discarded_tokens > 0) { + step_idx[bid] -= discarded_tokens; + seq_lens_decoder[bid] -= discarded_tokens; + } + + accept_num[bid] = new_accept_num; + limit_think_status[bid] = current_limit_think_status; +} + +void SpeculateLimitThinkingContentLengthV1( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id) { + const int batch_size = next_tokens.shape()[0]; + const int tokens_per_step = next_tokens.shape()[1]; + + speculate_limit_thinking_content_length_kernel_v1<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + const_cast(step_idx.data()), + const_cast(limit_think_status.data()), + const_cast(accept_num.data()), + const_cast(seq_lens_decoder.data()), + think_end_id, + tokens_per_step, + batch_size); +} + +PD_BUILD_STATIC_OP(speculate_limit_thinking_content_length_v1) + .Inputs({"next_tokens", + "max_think_lens", + "step_idx", + "limit_think_status", + "accept_num", + "seq_lens_decoder"}) + .Attrs({"think_end_id: int64_t"}) + .Outputs({"next_tokens_out"}) + .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) + .SetKernelFn(PD_KERNEL(SpeculateLimitThinkingContentLengthV1)); diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu new file mode 100644 index 00000000000..9277dd4c137 --- /dev/null +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu @@ -0,0 +1,159 @@ +/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "helper.h" +#include "paddle/extension.h" + +// status == 0: 正常生成阶段 +// status == 1: 替换阶段 +// status == 2: 替换结束阶段 +// status == 3: 思考结束阶段 +__global__ void speculate_limit_thinking_content_length_kernel_v2( + int64_t* next_tokens, + const int* max_think_lens, + int64_t* step_idx, + int* limit_think_status, + int* accept_num, + int* seq_lens_decoder, + const int64_t think_end_id, + const int64_t line_break_id, + const int tokens_per_step, + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; + + const int original_accept_num = accept_num[bid]; + if (original_accept_num <= 0) return; + + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. 
+ if (current_limit_think_status == 3) { + return; + } + + int new_accept_num = original_accept_num; + + const int64_t current_base_step = step_idx[bid] - original_accept_num + 1; + + for (int token_offset = 0; token_offset < original_accept_num; + token_offset++) { + const int token_idx = bid * tokens_per_step + token_offset; + int64_t next_token = next_tokens[token_idx]; + const int64_t current_step = current_base_step + token_offset; + + bool condition_triggered = false; + + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + // 阶段 2: 在替换 (status == 1), 检查是否替换结束 + if (current_limit_think_status <= 1) { + // 当开启思考长度控制时,检查是否超时 + if (current_step == max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } else if (current_step == max_think_len + 1) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } else if (current_step == max_think_len + 2) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } else if (current_step == max_think_len + 3) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + // 将状态推进到 1, 表示 "正在结束思考" + current_limit_think_status = 2; + condition_triggered = true; // 因为修改了token,需要截断 + } + } + + // ======================= 思考结束处理 ======================= + // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型可能自己生成了 + // 2. status == 2: 上一阶段强制注入了 \n\n\n + if (current_limit_think_status == 0) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; + } + } + if (current_limit_think_status == 2) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; + } + + next_tokens[token_idx] = next_token; + + if (condition_triggered) { + new_accept_num = token_offset + 1; + break; + } + } + + // 更新全局状态 + int discarded_tokens = original_accept_num - new_accept_num; + if (discarded_tokens > 0) { + step_idx[bid] -= discarded_tokens; + seq_lens_decoder[bid] -= discarded_tokens; + } + + accept_num[bid] = new_accept_num; + limit_think_status[bid] = current_limit_think_status; +} + +void SpeculateLimitThinkingContentLengthV2( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id, + const int64_t line_break_id) { + const int batch_size = next_tokens.shape()[0]; + const int tokens_per_step = next_tokens.shape()[1]; + + speculate_limit_thinking_content_length_kernel_v2<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + const_cast(step_idx.data()), + const_cast(limit_think_status.data()), + const_cast(accept_num.data()), + const_cast(seq_lens_decoder.data()), + think_end_id, + line_break_id, + tokens_per_step, + batch_size); +} + +PD_BUILD_STATIC_OP(speculate_limit_thinking_content_length_v2) + .Inputs({"next_tokens", + "max_think_lens", + "step_idx", + "limit_think_status", + "accept_num", + "seq_lens_decoder"}) + .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"}) + .Outputs({"next_tokens_out"}) + .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) + .SetKernelFn(PD_KERNEL(SpeculateLimitThinkingContentLengthV2)); diff --git 
a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index e4b9aeccc63..9d0bf0dbc32 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -80,6 +80,8 @@ speculate_step_reschedule, limit_thinking_content_length_v1, limit_thinking_content_length_v2, + speculate_limit_thinking_content_length_v1, + speculate_limit_thinking_content_length_v2, ) from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData @@ -88,6 +90,78 @@ DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1" +def limit_thinking_content_length( + limit_strategy: str, + sampled_token_ids: paddle.Tensor, + max_think_lens: paddle.Tensor, + step_idx: paddle.Tensor, + limit_think_status: paddle.Tensor, + think_end_id: int, + line_break_id: int = None, +): + if limit_strategy == "": + # for ernie4_5_vl + limit_thinking_content_length_v1( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + ) + elif limit_strategy == "\n\n\n": + # for ernie_x1 + assert line_break_id > 0 + limit_thinking_content_length_v2( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + line_break_id, + ) + else: + raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") + + +def speculate_limit_thinking_content_length( + limit_strategy: str, + accept_tokens: paddle.Tensor, + max_think_lens: paddle.Tensor, + step_idx: paddle.Tensor, + limit_think_status: paddle.Tensor, + accept_num: paddle.Tensor, + seq_lens_decoder: paddle.Tensor, + think_end_id: int, + line_break_id: int = None, +): + if limit_strategy == "": + # for ernie4_5_vl + speculate_limit_thinking_content_length_v1( + accept_tokens, + max_think_lens, + step_idx, + limit_think_status, + accept_num, + seq_lens_decoder, + think_end_id, + ) + elif limit_strategy == "\n\n\n": + # for ernie_x1 + assert line_break_id > 0 + speculate_limit_thinking_content_length_v2( + accept_tokens, + max_think_lens, + step_idx, + limit_think_status, + accept_num, + seq_lens_decoder, + think_end_id, + line_break_id, + ) + else: + raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") + + def pre_process( input_ids: paddle.Tensor, seq_lens_this_time: int, @@ -185,8 +259,20 @@ def post_process_normal( save_each_rank: bool = False, skip_save_output: bool = False, async_output_queue: queue.Queue = None, + think_end_id: int = -1, + line_break_id: int = -1, ) -> ModelRunnerOutput: """Post-processing steps after completing a single token generation.""" + if think_end_id > 0: + limit_thinking_content_length( + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, + sampled_token_ids=sampler_output.sampled_token_ids, + max_think_lens=share_inputs["max_think_lens"], + step_idx=share_inputs["step_idx"], + limit_think_status=share_inputs["limit_think_status"], + think_end_id=think_end_id, + line_break_id=line_break_id, + ) # 1. 
Set stop value paddle.assign( paddle.where( @@ -296,9 +382,26 @@ def post_process_normal( def post_process_specualate( - model_output: ModelOutputData, save_each_rank: bool = False, skip_save_output: bool = False + model_output: ModelOutputData, + share_inputs: Dict[str, paddle.Tensor], + save_each_rank: bool = False, + skip_save_output: bool = False, + think_end_id: int = -1, + line_break_id: int = -1, ): - """""" + if think_end_id > 0: + speculate_limit_thinking_content_length( + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, + accept_tokens=share_inputs["accept_tokens"], + max_think_lens=share_inputs["max_think_lens"], + step_idx=share_inputs["step_idx"], + limit_think_status=share_inputs["limit_think_status"], + accept_num=share_inputs["accept_num"], + seq_lens_decoder=share_inputs["seq_lens_decoder"], + think_end_id=think_end_id, + line_break_id=line_break_id, + ) + speculate_update( model_output.seq_lens_encoder, model_output.seq_lens_decoder, @@ -348,10 +451,19 @@ def post_process( speculative_decoding: bool = False, skip_save_output: bool = False, async_output_queue: queue.Queue = None, + think_end_id: int = -1, + line_break_id: int = -1, ) -> None: """Post-processing steps after completing a single token generation.""" if speculative_decoding: - post_process_specualate(model_output, save_each_rank, skip_save_output) + post_process_specualate( + model_output, + share_inputs, + save_each_rank, + skip_save_output, + think_end_id, + line_break_id, + ) else: post_process_normal( sampler_output, @@ -361,6 +473,8 @@ def post_process( save_each_rank, skip_save_output, async_output_queue, + think_end_id, + line_break_id, ) @@ -639,36 +753,3 @@ def rebuild_padding( else: raise RuntimeError("Not supported platform") return hidden_states - - -def limit_thinking_content_length( - limit_strategy: str, - sampled_token_ids: paddle.Tensor, - max_think_lens: paddle.Tensor, - step_idx: paddle.Tensor, - limit_think_status: paddle.Tensor, - think_end_id: int, - line_break_id: int = None, -): - if limit_strategy == "": - # for ernie4_5_vl - limit_thinking_content_length_v1( - sampled_token_ids, - max_think_lens, - step_idx, - limit_think_status, - think_end_id, - ) - elif limit_strategy == "\n\n\n": - # for ernie_x1 - assert line_break_id > 0 - limit_thinking_content_length_v2( - sampled_token_ids, - max_think_lens, - step_idx, - limit_think_status, - think_end_id, - line_break_id, - ) - else: - raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 0fbfd2af31d..ddb27af718e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -68,7 +68,6 @@ ) from fastdeploy.model_executor.pre_and_post_process import ( - limit_thinking_content_length, post_process, pre_process, rebuild_padding, @@ -1473,6 +1472,8 @@ def _dummy_sampler_run( speculative_decoding=self.speculative_decoding, skip_save_output=True, async_output_queue=self.async_output_queue, + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, ) if self.speculative_decoding: if self.speculative_method == "mtp": @@ -1837,17 +1838,6 @@ class at the server level, which is too granular for ModelRunner. 
self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, group=self.parallel_config.tp_group, ) - - if self.model_config.think_end_id > 0: - limit_thinking_content_length( - limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, - sampled_token_ids=sampler_output.sampled_token_ids, - max_think_lens=self.share_inputs["max_think_lens"], - step_idx=self.share_inputs["step_idx"], - limit_think_status=self.share_inputs["limit_think_status"], - think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, - ) else: self.sampler( logits, @@ -1921,6 +1911,8 @@ class at the server level, which is too granular for ModelRunner. speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, async_output_queue=self.async_output_queue, + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, ) if self.guided_backend is not None and sampler_output is not None: self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) From 393d830a1c99a28013ad36467a12d2b8ea478389 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 13:09:07 +0800 Subject: [PATCH 21/24] fix --- .../speculate_limit_thinking_content_length_v1.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu index 90e3760af4b..96e6a700487 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu @@ -1,4 +1,4 @@ -/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From 2e0f6072fcad36dcbf3d1593ce8a13d4a7efa0cc Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 13:09:32 +0800 Subject: [PATCH 22/24] fix --- .../speculate_limit_thinking_content_length_v2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu index 9277dd4c137..e885cfb2a0f 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu @@ -1,4 +1,4 @@ -/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
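Note on the kernels introduced in patches 20-22: the speculative variants walk the accepted draft tokens of each sequence, force the end-of-thinking token once the per-request budget in max_think_lens is spent, and roll back step_idx and seq_lens_decoder for any draft tokens they discard. As a reading aid only, the pure-Python sketch below restates the per-token decision of the simpler v1 strategy for a single sequence; the helper name limit_thinking_v1_sketch and its scalar in/out signature are invented for illustration and are not part of FastDeploy, whose real logic is the CUDA code above.

    # Hedged illustration only: per-sequence, per-token restatement of the
    # v1 "inject the think-end token once the budget is spent" strategy.
    # Status codes follow the kernels above: 0 = still thinking,
    # 1 = end token forcibly injected, 2 = thinking finished / answering.
    def limit_thinking_v1_sketch(next_token: int, step: int, max_think_len: int,
                                 status: int, think_end_id: int) -> tuple[int, int]:
        if max_think_len < 0 or status >= 2:      # limiting disabled, or already answering
            return next_token, status
        if status < 1 and step >= max_think_len:  # thinking budget exhausted
            next_token = think_end_id             # force the end-of-thinking token
            status = 1
        if next_token == think_end_id:            # model (or the line above) ended thinking
            status = 2
        return next_token, status

The speculative kernels additionally truncate the accepted-token window at the first replaced token, which is why they also adjust accept_num; the sketch leaves that bookkeeping out.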
From 4fd1dde31a15eef06d7875124fdc459569325156 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 14:58:37 +0800 Subject: [PATCH 23/24] update --- .../input/ernie4_5_vl_processor/ernie4_5_vl_processor.py | 6 +----- fastdeploy/worker/gpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index 0d0920b71f9..439b752c2f8 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -255,11 +255,7 @@ def process_request_dict(self, request, max_model_len=None): else: request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"]) if request.get("reasoning_max_tokens") is None: - if request.get("enable_thinking"): - request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) - else: - if not request.get("enable_thinking"): - request["reasoning_max_tokens"] = None + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) data_processor_logger.info(f"Processed request {request}") return request diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ddb27af718e..891ae07a49e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -352,7 +352,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = position_ids, request.get("max_tokens", 2048) ) - if request.get("reasoning_max_tokens") is not None: + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None: # Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 From db4e279623396b4569e0a8e80a23ffee7f82da36 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 16:28:57 +0800 Subject: [PATCH 24/24] update --- fastdeploy/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8083a37618b..7fbbb0ab270 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -584,7 +584,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 - if request.get("reasoning_max_tokens") is not None: + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None: # Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
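Taken together, patches 23 and 24 make the limiter opt-in on the request side: the VL processor falls back to 80% of max_tokens when reasoning_max_tokens is unset, and the runner arms max_think_lens only when the request both enables thinking and carries a reasoning budget, presumably leaving the buffer at the -1 sentinel that the kernels treat as "no limit" otherwise. A minimal sketch of that gating follows; the helper name resolve_think_budget is invented for illustration and is not a FastDeploy API.

    # Hedged sketch of the request-side gating after patches 23/24; -1 mirrors
    # the "limiting disabled" sentinel the kernels check via max_think_lens[bid] < 0.
    def resolve_think_budget(request: dict) -> int:
        if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
            return int(request["reasoning_max_tokens"])  # value written into max_think_lens
        return -1                                        # limiter stays off for this sequence

For example, a request carrying {"enable_thinking": True, "reasoning_max_tokens": 512} would get a 512-token thinking budget under this gating, while omitting either field keeps the previous unbounded behaviour.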