From 73384a6bae02db9fac76efc604b4f93c42e68ece Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 11:14:57 +0800 Subject: [PATCH 01/24] delete impl --- .../model_executor/pre_and_post_process.py | 42 -------------- fastdeploy/worker/gcu_model_runner.py | 8 --- fastdeploy/worker/gpu_model_runner.py | 49 ---------------- fastdeploy/worker/metax_model_runner.py | 21 ------- fastdeploy/worker/output.py | 20 ------- fastdeploy/worker/xpu_model_runner.py | 56 ------------------- 6 files changed, 196 deletions(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 65948ea7d2a..34685b52019 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -194,48 +194,6 @@ def post_process_normal( zmq_client: ZmqIpcClient = None, ) -> ModelRunnerOutput: """Post-processing steps after completing a single token generation.""" - # handle vl: - if model_output.think_end_id != -1: - thinking_mask = model_output.enable_thinking[: sampler_output.sampled_token_ids.shape[0]] - exists_think_end = (sampler_output.sampled_token_ids == model_output.think_end_id) & thinking_mask - paddle.assign( - paddle.where( - exists_think_end, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - reasoning_index_update_cond = model_output.need_think_end.cast("bool") & thinking_mask - paddle.assign( - paddle.where( - reasoning_index_update_cond, - model_output.reasoning_index - 1, - model_output.reasoning_index, - ), - model_output.reasoning_index, - ) - - stop_wo_think = ( - (sampler_output.sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) - | (model_output.reasoning_index == 0) - ) & (model_output.need_think_end > 0) - - stop_wo_think = stop_wo_think & thinking_mask - sampler_output.sampled_token_ids = paddle.where( - stop_wo_think, - model_output.think_end_id, - sampler_output.sampled_token_ids, - ) - paddle.assign( - paddle.where( - stop_wo_think, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) # 1. Set stop value paddle.assign( paddle.where( diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 5c0580ea898..36c87c03e85 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -837,10 +837,6 @@ def _dummy_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), ) post_process( @@ -1065,10 +1061,6 @@ class at the server level, which is too granular for ModelRunner. 
), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), ) if self.speculative_config.method in ["mtp"] and self.scheduler_config.splitwise_role == "prefill": diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 1c55b467945..c6034a725a5 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -259,13 +259,9 @@ def _init_logits_processor(self, request): elif request.structural_tag is not None: schemata_key = ("structural_tag", request.structural_tag) - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - return ( self.guided_backend.get_logits_processor( schemata_key=schemata_key, - enable_thinking=enable_thinking, ), schemata_key, ) @@ -327,23 +323,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = position_ids, request.get("max_tokens", 2048) ) - if request.get("enable_thinking", False): - # Enable thinking - req_reasoning_max_tokens = request.get("reasoning_max_tokens") - req_max_tokens = request.get("max_tokens") - final_reasoning_tokens = ( - req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens - ) - - self.share_inputs["enable_thinking"][idx : idx + 1] = True - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens - else: - # Disable thinking - self.share_inputs["enable_thinking"][idx : idx + 1] = False - self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 - if isinstance(request.prompt_token_ids, np.ndarray): prompt_token_ids = request.prompt_token_ids.tolist() else: @@ -567,23 +546,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 - if request.get("enable_thinking", False): - # Enable thinking - req_reasoning_max_tokens = request.get("reasoning_max_tokens") - req_max_tokens = request.get("max_tokens") - final_reasoning_tokens = ( - req_reasoning_max_tokens if req_reasoning_max_tokens is not None else req_max_tokens - ) - - self.share_inputs["enable_thinking"][idx : idx + 1] = True - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = final_reasoning_tokens - else: - # Disable thinking - self.share_inputs["enable_thinking"][idx : idx + 1] = False - self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 - def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) if res is not None: @@ -879,9 +841,6 @@ def _init_share_inputs(self, max_num_seqs: int): tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) # Initialize thinking related buffers - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - 
self.share_inputs["enable_thinking"] = paddle.full(shape=[max_num_seqs, 1], fill_value=False, dtype="bool") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") # TODO(gongshaotian): move to models if not self.enable_mm: @@ -1457,10 +1416,6 @@ def _dummy_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=self.share_inputs["enable_thinking"], - think_end_id=self.model_config.think_end_id, - need_think_end=self.share_inputs["need_think_end"], - reasoning_index=self.share_inputs["reasoning_index"], stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) @@ -1811,10 +1766,6 @@ class at the server level, which is too granular for ModelRunner. ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=self.share_inputs["enable_thinking"], - think_end_id=self.model_config.think_end_id, - need_think_end=self.share_inputs["need_think_end"][:num_running_requests], - reasoning_index=self.share_inputs["reasoning_index"][:num_running_requests], stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index dcce154ea51..3ea98654ee5 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -242,11 +242,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = else: position_ids = None - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -459,11 +454,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -779,9 +769,6 @@ def _init_share_inputs(self, max_num_seqs: int): dtype="float32", ) self.share_inputs["image_features"] = None - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") def _prepare_inputs(self) -> None: """Prepare the model inputs""" @@ -1133,10 +1120,6 @@ def _dummy_run( ), 
accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) @@ -1401,10 +1384,6 @@ class at the server level, which is too granular for ModelRunner. ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) diff --git a/fastdeploy/worker/output.py b/fastdeploy/worker/output.py index 6d820a873a4..9b110628595 100644 --- a/fastdeploy/worker/output.py +++ b/fastdeploy/worker/output.py @@ -220,26 +220,6 @@ class ModelOutputData: """ accept_num: paddle.Tensor - """ - vl model enable to think - """ - enable_thinking: paddle.Tensor = None - - """ - vl model think end id - """ - think_end_id: int = -1 - - """ - vl model need to think - """ - need_think_end: paddle.Tensor = None - - """ - vl model reasoning index - """ - reasoning_index: paddle.Tensor = None - """ the token ids of stop sequence """ diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index f04680498c0..1ea3266f681 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -190,45 +190,6 @@ def xpu_post_process( update_inputs, ) - # handle vl: - if model_output.enable_thinking: - exists_think_end = sampled_token_ids == model_output.think_end_id - paddle.assign( - paddle.where( - exists_think_end, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - - paddle.assign( - paddle.where( - model_output.need_think_end.cast("bool"), - model_output.reasoning_index - 1, - model_output.reasoning_index, - ), - model_output.reasoning_index, - ) - - stop_wo_think = ( - (sampled_token_ids == model_output.eos_token_id.T).any(axis=1, keepdim=True) - | (model_output.reasoning_index == 0) - ) & (model_output.need_think_end > 0) - sampled_token_ids = paddle.where( - stop_wo_think, - model_output.think_end_id, - sampled_token_ids, - ) - paddle.assign( - paddle.where( - stop_wo_think, - model_output.need_think_end - 1, - model_output.need_think_end, - ), - model_output.need_think_end, - ) - # 1. 
Set stop value paddle.assign( paddle.where( @@ -455,11 +416,6 @@ def insert_tasks_v1(self, req_dicts: List[Request]): else: position_ids = None - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -594,11 +550,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request]): self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) @@ -813,9 +764,6 @@ def _init_share_inputs(self, max_num_seqs: int): dtype="float32", ) self.share_inputs["image_features"] = None - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=True, dtype="bool") - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") def _prepare_inputs(self, is_dummy_run=False) -> None: """Prepare the model inputs""" @@ -1112,10 +1060,6 @@ class at the server level, which is too granular for ModelRunner. 
actual_draft_token_num=None, accept_tokens=None, accept_num=None, - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(self.model_config.think_end_id if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) From fe924357c7a03bfea87a0844f049bdd298f87e78 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 11:28:23 +0800 Subject: [PATCH 02/24] delete min_length&max_length --- fastdeploy/worker/gcu_model_runner.py | 4 ---- fastdeploy/worker/gpu_model_runner.py | 4 ---- fastdeploy/worker/hpu_model_runner.py | 4 ---- fastdeploy/worker/metax_model_runner.py | 4 ---- fastdeploy/worker/xpu_model_runner.py | 4 ---- 5 files changed, 20 deletions(-) diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py index 36c87c03e85..75c1ad40797 100644 --- a/fastdeploy/worker/gcu_model_runner.py +++ b/fastdeploy/worker/gcu_model_runner.py @@ -383,10 +383,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.seq_lens_this_time_buffer = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index c6034a725a5..9b2e531b880 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -776,10 +776,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.seq_lens_this_time_buffer = paddle.full([max_num_seqs, 1], 0, dtype="int32") if self.fd_config.parallel_config.enable_expert_parallel: self.share_inputs["seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/hpu_model_runner.py b/fastdeploy/worker/hpu_model_runner.py index 56f84fd86da..317eb0a7398 100644 --- a/fastdeploy/worker/hpu_model_runner.py +++ b/fastdeploy/worker/hpu_model_runner.py @@ -591,10 +591,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] 
= paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/metax_model_runner.py b/fastdeploy/worker/metax_model_runner.py index 3ea98654ee5..b88a16579c3 100644 --- a/fastdeploy/worker/metax_model_runner.py +++ b/fastdeploy/worker/metax_model_runner.py @@ -628,10 +628,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.seq_lens_this_time_buffer = paddle.full([max_num_seqs, 1], 0, dtype="int32") if self.fd_config.parallel_config.enable_expert_parallel: self.share_inputs["seq_lens_this_time"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py index 1ea3266f681..87bf5dd4331 100644 --- a/fastdeploy/worker/xpu_model_runner.py +++ b/fastdeploy/worker/xpu_model_runner.py @@ -670,10 +670,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_dec_len"] = paddle.full( [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" ) - self.share_inputs["min_length"] = paddle.full([max_num_seqs, 1], self.model_config.min_length, dtype="int64") - self.share_inputs["max_length"] = paddle.full( - [max_num_seqs, 1], self.model_config.max_model_len, dtype="int64" - ) self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, 0, dtype="int32") self.share_inputs["seq_lens_encoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") self.share_inputs["seq_lens_decoder"] = paddle.full([max_num_seqs, 1], 0, dtype="int32") From 1b289b688142784fcd293da50eb09be9401ac343 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 16:06:24 +0800 Subject: [PATCH 03/24] support limit thinking content strategy --- .../limit_thinking_content_length_v1.cu | 100 +++++++++ .../limit_thinking_content_length_v2.cu | 130 ++++++++++++ custom_ops/setup_ops.py | 2 + fastdeploy/config.py | 79 ++++--- fastdeploy/engine/common_engine.py | 197 ++++++++++-------- fastdeploy/engine/engine.py | 22 +- fastdeploy/entrypoints/cli/tokenizer.py | 3 +- fastdeploy/entrypoints/engine_client.py | 17 +- fastdeploy/entrypoints/openai/api_server.py | 1 - fastdeploy/input/preprocess.py | 33 +-- .../model_executor/models/interfaces_base.py | 54 ----- .../model_executor/models/model_base.py | 53 +++-- .../model_executor/pre_and_post_process.py | 33 ++- fastdeploy/multimodal/registry.py | 35 ---- fastdeploy/worker/gpu_model_runner.py | 45 +++- fastdeploy/worker/utils.py | 50 ----- fastdeploy/worker/worker_process.py | 21 +- tests/ce/server/test_completions.py | 6 - 18 files changed, 515 insertions(+), 366 deletions(-) create mode 100644 custom_ops/gpu_ops/limit_thinking_content_length_v1.cu create mode 100644 custom_ops/gpu_ops/limit_thinking_content_length_v2.cu delete mode 100644 fastdeploy/model_executor/models/interfaces_base.py delete mode 100644 fastdeploy/multimodal/registry.py delete mode 100644 fastdeploy/worker/utils.py diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu new file mode 100644 index 00000000000..e16e6dc66eb --- /dev/null +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu @@ -0,0 +1,100 @@ +// Copyright (c) 2025 PaddlePaddle Authors. 
All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "helper.h"
+#include "paddle/extension.h"
+
+__global__ void limit_thinking_content_length_kernel_v1(
+    int64_t *next_tokens,
+    const int *max_think_lens,
+    const int64_t *step_idx, // step_idx no longer needs to be modified, so it is const
+    int *limit_think_status,
+    const int64_t think_end_id,
+    const int bs)
+{
+    int bid = threadIdx.x;
+    if (bid >= bs)
+        return;
+
+    // If thinking is not enabled for this sequence, return directly; the default value -1 means the thinking length is unlimited
+    const int max_think_len = max_think_lens[bid];
+    if (max_think_len < 0)
+        return;
+    int current_limit_content_status = limit_think_status[bid];
+    // If we are already in the response phase and the stop flag has been triggered, return directly; nothing more to do.
+    if (current_limit_content_status == 2)
+    {
+        return;
+    }
+
+    int64_t next_token = next_tokens[bid];
+    const int64_t step = step_idx[bid];
+
+    // ======================= thinking-phase control =======================
+    // Stage 1: still thinking (status == 0), check whether thinking must be forced to end
+    if (current_limit_content_status < 1)
+    {
+        // When thinking-length control is enabled, check whether the budget is exceeded
+        if (step >= max_think_len)
+        {
+            // Force-replace the current token with the end-of-thinking token
+            next_token = think_end_id;
+            // Advance the status to 1, meaning "ending thinking"
+            current_limit_content_status = 1;
+        }
+    }
+    // ======================= end-of-thinking handling =======================
+    // Stage 2: check whether the end-of-thinking condition is met (status < 2)
+    // This covers two scenarios:
+    // 1. status == 0: the model generated think_end_id by itself
+    // 2. status == 1: think_end_id was force-injected in the previous stage
+    if (current_limit_content_status < 2)
+    {
+        if (next_token == think_end_id)
+        {
+            // Thinking is confirmed to be over; advance the status to 2 (response phase)
+            current_limit_content_status = 2;
+        }
+    }
+    // Write back the updated token
+    next_tokens[bid] = next_token;
+    // Update the global status
+    limit_think_status[bid] = current_limit_content_status;
+}
+
+void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens,
+                                  const paddle::Tensor &max_think_lens,
+                                  const paddle::Tensor &step_idx,
+                                  const paddle::Tensor &limit_think_status,
+                                  const int64_t think_end_id)
+{
+    const int batch_size = next_tokens.shape()[0];
+    limit_thinking_content_length_kernel_v1<<<1, 1024>>>(
+        const_cast<int64_t *>(next_tokens.data<int64_t>()),
+        max_think_lens.data<int>(),
+        step_idx.data<int64_t>(),
+        const_cast<int *>(limit_think_status.data<int>()),
+        think_end_id,
+        batch_size);
+}
+
+PD_BUILD_OP(limit_thinking_content_length_v1)
+    .Inputs({"next_tokens",
+             "max_think_lens",
+             "step_idx",
+             "limit_think_status"})
+    .Attrs({"think_end_id: int64_t"})
+    .Outputs({"next_tokens_out"})
+    .SetInplaceMap({{"next_tokens", "next_tokens_out"}})
+    .SetKernelFn(PD_KERNEL(LimitThinkingContentLengthV1));
diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
new file mode 100644
index 00000000000..fdac1cff5d0
--- /dev/null
+++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu
@@ -0,0 +1,130 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "helper.h"
+#include "paddle/extension.h"
+
+// status == 0: normal generation phase
+// status == 1: replacement phase
+// status == 2: replacement-finished phase
+// status == 3: thinking-finished phase
+__global__ void limit_thinking_content_length_kernel_v2(
+    int64_t *next_tokens,
+    const int *max_think_lens,
+    const int64_t *step_idx, // step_idx no longer needs to be modified, so it is const
+    int *limit_think_status,
+    const int64_t think_end_id,
+    const int64_t line_break_id,
+    const int bs)
+{
+    int bid = threadIdx.x;
+    if (bid >= bs)
+        return;
+    // If thinking is not enabled for this sequence, return directly; the default value -1 means the thinking length is unlimited
+    const int max_think_len = max_think_lens[bid];
+    if (max_think_len < 0)
+        return;
+    int current_limit_content_status = limit_think_status[bid];
+    // If we are already in the response phase and the stop flag has been triggered, return directly; nothing more to do.
+    if (current_limit_content_status == 3)
+    {
+        return;
+    }
+
+    int64_t next_token = next_tokens[bid];
+    const int64_t step = step_idx[bid];
+
+    // ======================= thinking-phase control =======================
+    // Stage 1: still thinking (status == 0), check whether thinking must be forced to end
+    // Stage 2: replacing (status == 1), check whether the replacement is finished
+    if (current_limit_content_status <= 1)
+    {
+        // When thinking-length control is enabled, check whether the budget is exceeded
+        if (step == max_think_len)
+        {
+            // Force-replace the current token with a line-break token
+            next_token = line_break_id;
+            current_limit_content_status = 1;
+        }
+        else if (step == max_think_len + 1)
+        {
+            // Force-replace the current token with the end-of-thinking token
+            next_token = think_end_id;
+            current_limit_content_status = 1;
+        }
+        else if (step == max_think_len + 2)
+        {
+            // Force-replace the current token with a line-break token
+            next_token = line_break_id;
+            current_limit_content_status = 1;
+        }
+        else if (step == max_think_len + 3)
+        {
+            // Force-replace the current token with a line-break token
+            next_token = line_break_id;
+            // Advance the status to 2, meaning "the forced replacement is finished"
+            current_limit_content_status = 2;
+        }
+    }
+    // ======================= end-of-thinking handling =======================
+    // Stage 3: check whether the end-of-thinking condition is met (status == 0 || status == 2)
+    // This covers two scenarios:
+    // 1. status == 0: the model may have generated </think> by itself
+    // 2. status == 2: \n</think>\n\n was force-injected in the previous stage
+    if (current_limit_content_status == 0)
+    {
+        if (next_token == think_end_id)
+        {
+            // Thinking is confirmed to be over; advance the status to 3 (response phase)
+            current_limit_content_status = 3;
+        }
+    }
+    if (current_limit_content_status == 2)
+    {
+        // Thinking is confirmed to be over; advance the status to 3 (response phase)
+        current_limit_content_status = 3;
+    }
+    // Write back the updated token
+    next_tokens[bid] = next_token;
+    // Update the global status
+    limit_think_status[bid] = current_limit_content_status;
+}
+
+void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens,
+                                  const paddle::Tensor &max_think_lens,
+                                  const paddle::Tensor &step_idx,
+                                  const paddle::Tensor &limit_think_status,
+                                  const int64_t think_end_id,
+                                  const int64_t line_break_id)
+{
+    const int batch_size = next_tokens.shape()[0];
+    limit_thinking_content_length_kernel_v2<<<1, 1024>>>(
+        const_cast<int64_t *>(next_tokens.data<int64_t>()),
+        max_think_lens.data<int>(),
+        step_idx.data<int64_t>(),
+        const_cast<int *>(limit_think_status.data<int>()),
+        think_end_id,
+        line_break_id,
+        batch_size);
+}
+
+PD_BUILD_OP(limit_thinking_content_length_v2)
+    .Inputs({"next_tokens",
+             "max_think_lens",
+             "step_idx",
+             "limit_think_status"})
+    .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"})
+    .Outputs({"next_tokens_out"})
+    .SetInplaceMap({{"next_tokens", "next_tokens_out"}})
+    .SetKernelFn(PD_KERNEL(LimitThinkingContentLengthV2));
diff --git a/custom_ops/setup_ops.py b/custom_ops/setup_ops.py
index 8636c3de440..8d8d42f2d7e 100644
--- a/custom_ops/setup_ops.py
+++ b/custom_ops/setup_ops.py
@@ -302,6 +302,8 @@ def find_end_files(directory, end_str):
         "gpu_ops/noaux_tc.cu",
         "gpu_ops/custom_all_reduce/all_reduce.cu",
         "gpu_ops/merge_prefill_decode_output.cu",
+        "gpu_ops/limit_thinking_content_length_v1.cu",
+        "gpu_ops/limit_thinking_content_length_v2.cu",
     ]

     # pd_disaggregation
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 20de8534451..400807339a2 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -30,7 +30,6 @@
 import fastdeploy
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.quantization.quant_base import QuantConfigBase
-from fastdeploy.multimodal.registry import MultimodalRegistry
 from fastdeploy.platforms import current_platform
 from fastdeploy.scheduler import SchedulerConfig
 from fastdeploy.transformer_utils.config import get_pooling_config
@@ -225,50 +224,12 @@ def __init__(
         self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
         self.think_end_id = args.get("think_end_id", -1)
-
-        architectures = self.architectures[0]
-
-        if MultimodalRegistry.contains_model(architectures):
-            self.enable_mm = True
-        else:
-            self.enable_mm = False
-
-        self.is_unified_ckpt = check_unified_ckpt(self.model)
+        self.im_patch_id = args.get("image_patch_id", -1)
+        self.line_break_id = args.get("line_break_id", -1)

         self.override_name_from_config()
         self.read_from_env()
         self.read_model_config()
-        self.runner_type = self._get_runner_type(self.architectures, self.runner)
-        self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert)
-
-        registry = self.registry
-        is_generative_model = registry.is_text_generation_model(self.architectures, self)
-        is_pooling_model = registry.is_pooling_model(self.architectures, self)
-        is_multimodal_model = registry.is_multimodal_model(self.architectures, self)
-
-        if self.runner_type == "generate" and not is_generative_model:
-            if is_multimodal_model:
-                pass
-            else:
-                generate_converts = _RUNNER_CONVERTS["generate"]
-                if self.convert_type not in generate_converts:
-                    raise ValueError("This model does not support '--runner generate.")
-        if self.runner_type 
== "pooling" and not is_pooling_model: - pooling_converts = _RUNNER_CONVERTS["pooling"] - if self.convert_type not in pooling_converts: - convert_option = "<" + "|".join(pooling_converts) + ">" - raise ValueError( - "This model does not support `--runner pooling`. " - f"You can pass `--convert {convert_option} to adapt " - "it into a pooling model." - ) - - self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) - model_info, arch = registry.inspect_model_cls(self.architectures, self) - self._model_info = model_info - self._architecture = arch - - self.pooler_config = self._init_pooler_config() @property def registry(self): @@ -510,6 +471,42 @@ def print(self): logger.info("{:<20}:{:<6}{}".format(k, "", v)) logger.info("=============================================================") + def __post_init__(self): + self.is_unified_ckpt = check_unified_ckpt(self.model) + self.runner_type = self._get_runner_type(self.architectures, self.runner) + self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert) + registry = self.registry + is_generative_model = registry.is_text_generation_model(self.architectures, self) + is_pooling_model = registry.is_pooling_model(self.architectures, self) + is_multimodal_model = registry.is_multimodal_model(self.architectures, self) + self.is_reasoning_model = registry.is_reasoning_model(self.architectures, self) + + self.enable_mm = is_multimodal_model + + if self.runner_type == "generate" and not is_generative_model: + if is_multimodal_model: + pass + else: + generate_converts = _RUNNER_CONVERTS["generate"] + if self.convert_type not in generate_converts: + raise ValueError("This model does not support '--runner generate.") + if self.runner_type == "pooling" and not is_pooling_model: + pooling_converts = _RUNNER_CONVERTS["pooling"] + if self.convert_type not in pooling_converts: + convert_option = "<" + "|".join(pooling_converts) + ">" + raise ValueError( + "This model does not support `--runner pooling`. " + f"You can pass `--convert {convert_option} to adapt " + "it into a pooling model." + ) + + self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) + model_info, arch = registry.inspect_model_cls(self.architectures, self) + self._model_info = model_info + self._architecture = arch + + self.pooler_config = self._init_pooler_config() + class ParallelConfig: """Configuration for the distributed execution.""" diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 93dc7258a8c..4c650477fee 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -60,77 +60,77 @@ class EngineService: Base class containing common engine functionality """ - def __init__(self, cfg, start_queue=True): + def __init__(self, fd_config, start_queue=True): """ Initializes the LLMEngine with the provided configuration. Args: - cfg (Config): Config object containing all the configuration parameters. + fd_config (FDConfig): Config object containing all the configuration parameters. 
""" - self.cfg = cfg - if isinstance(self.cfg.cache_config.cache_queue_port, str): - self.cfg.cache_config.cache_queue_port = self.cfg.cache_config.cache_queue_port.split(",") - if isinstance(self.cfg.cache_config.cache_queue_port, list): - self.cfg.cache_config.cache_queue_port = int( - self.cfg.cache_config.cache_queue_port[self.cfg.parallel_config.local_data_parallel_id] + self.fd_config = fd_config + if isinstance(self.fd_config.cache_config.cache_queue_port, str): + self.fd_config.cache_config.cache_queue_port = self.fd_config.cache_config.cache_queue_port.split(",") + if isinstance(self.fd_config.cache_config.cache_queue_port, list): + self.fd_config.cache_config.cache_queue_port = int( + self.fd_config.cache_config.cache_queue_port[self.fd_config.parallel_config.local_data_parallel_id] ) - if self.cfg.parallel_config.enable_expert_parallel: + if self.fd_config.parallel_config.enable_expert_parallel: self.llm_logger = get_logger( - "fastdeploy", f"fastdeploy_rank{self.cfg.parallel_config.local_data_parallel_id}.log" + "fastdeploy", f"fastdeploy_rank{self.fd_config.parallel_config.local_data_parallel_id}.log" ) else: self.llm_logger = llm_logger - self.scheduler = cfg.scheduler_config.scheduler() + self.scheduler = fd_config.scheduler_config.scheduler() self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1" if envs.ENABLE_V1_KVCACHE_SCHEDULER: self.resource_manager = ResourceManagerV1( - cfg.scheduler_config.max_num_seqs, - cfg, - cfg.parallel_config.tensor_parallel_size, - cfg.scheduler_config.splitwise_role, - cfg.parallel_config.local_data_parallel_id, + fd_config.scheduler_config.max_num_seqs, + fd_config, + fd_config.parallel_config.tensor_parallel_size, + fd_config.scheduler_config.splitwise_role, + fd_config.parallel_config.local_data_parallel_id, ) else: self.resource_manager = ResourceManager( - cfg.scheduler_config.max_num_seqs, - cfg, - cfg.parallel_config.tensor_parallel_size, - cfg.scheduler_config.splitwise_role, - cfg.parallel_config.local_data_parallel_id, + fd_config.scheduler_config.max_num_seqs, + fd_config, + fd_config.parallel_config.tensor_parallel_size, + fd_config.scheduler_config.splitwise_role, + fd_config.parallel_config.local_data_parallel_id, ) self.start_worker_queue_service(start_queue) - os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.parallel_config.engine_worker_queue_port[ - self.cfg.parallel_config.local_data_parallel_id + os.environ["INFERENCE_MSG_QUEUE_ID"] = self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id ] - self.split_connector = SplitwiseConnector(cfg, self.engine_worker_queue, self.resource_manager) + self.split_connector = SplitwiseConnector(fd_config, self.engine_worker_queue, self.resource_manager) self.waiting_requests = [] self.token_processor = TokenProcessor( - cfg=cfg, + cfg=fd_config, cached_generated_tokens=self.scheduler, engine_worker_queue=self.engine_worker_queue, split_connector=self.split_connector, ) self.token_processor.set_resource_manager(self.resource_manager) - self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1) - for idx in range(1, self.cfg.max_num_partial_prefills + 1): + self.partial_chunked_tokens = [0] * (self.fd_config.max_num_partial_prefills + 1) + for idx in range(1, self.fd_config.max_num_partial_prefills + 1): self.partial_chunked_tokens[idx] = ( - (self.cfg.scheduler_config.max_num_batched_tokens // idx) - // self.cfg.cache_config.block_size - * self.cfg.cache_config.block_size + 
(self.fd_config.scheduler_config.max_num_batched_tokens // idx) + // self.fd_config.cache_config.block_size + * self.fd_config.cache_config.block_size ) self.guided_decoding_checker = None - if self.cfg.guided_decoding_backend != "off": + if self.fd_config.guided_decoding_backend != "off": self.guided_decoding_checker = schema_checker( - self.cfg.guided_decoding_backend, - disable_any_whitespace=self.cfg.disable_any_whitespace, + self.fd_config.guided_decoding_backend, + disable_any_whitespace=self.fd_config.disable_any_whitespace, ) self._init_worker_monitor_signals() @@ -145,12 +145,14 @@ def start(self): self.insert_task_to_worker_thread.start() self.token_processor.tasks_queue = self.engine_worker_queue self.token_processor.run() - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": self.split_mode_get_tasks() def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进程感知是否有新Task需要处理 current_suffix = int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] + self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id + ] ) self.llm_logger.info(f"current_suffix: {current_suffix}") exist_task_signal_data = np.zeros([1], dtype=np.int32) @@ -184,7 +186,8 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 # worker_live_signal 用于engine感知各worker进程是否存活,记录每个step 时间 worker_healthy_live_recorded_time_array = np.zeros( - shape=[min(self.cfg.worker_num_per_node, self.cfg.parallel_config.tensor_parallel_size)], dtype=np.int32 + shape=[min(self.fd_config.worker_num_per_node, self.fd_config.parallel_config.tensor_parallel_size)], + dtype=np.int32, ) self.worker_healthy_live_signal = IPCSignal( name="worker_healthy_live_signal", @@ -194,7 +197,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - cache_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) + cache_ready_signal_data = np.zeros(shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32) self.cache_ready_signal = IPCSignal( name="cache_ready_signal", array=cache_ready_signal_data, @@ -203,7 +206,9 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - swap_space_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) + swap_space_ready_signal_data = np.zeros( + shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32 + ) self.swap_space_ready_signal = IPCSignal( name="swap_space_ready_signal", array=swap_space_ready_signal_data, @@ -244,43 +249,53 @@ def start_worker_queue_service(self, start_queue): start queue service for engine worker communication """ address = ( - self.cfg.master_ip, - int(self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]), + self.fd_config.master_ip, + int( + self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id + ] + ), ) - if start_queue and (self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"): + if start_queue and ( + self.fd_config.host_ip == self.fd_config.master_ip or self.fd_config.master_ip == "0.0.0.0" + ): self.llm_logger.info(f"Starting engine worker queue server service at {address}") self.engine_worker_queue_server = EngineWorkerQueue( address=address, is_server=True, - 
num_client=self.cfg.parallel_config.tensor_parallel_size, - local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + num_client=self.fd_config.parallel_config.tensor_parallel_size, + local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, ) - if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed": + if ( + self.fd_config.cache_config.enable_prefix_caching + or self.fd_config.scheduler_config.splitwise_role != "mixed" + ): self.cache_task_queue = EngineCacheQueue( address=( - self.cfg.master_ip, - self.cfg.cache_config.cache_queue_port, + self.fd_config.master_ip, + self.fd_config.cache_config.cache_queue_port, ), authkey=b"cache_queue_service", is_server=True, - num_client=self.cfg.parallel_config.tensor_parallel_size, + num_client=self.fd_config.parallel_config.tensor_parallel_size, client_id=-1, - local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, ) self.llm_logger.info( - f"local {min(self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id,self.cfg.parallel_config.data_parallel_size - 1)}" + f"local {min(self.fd_config.worker_num_per_node * self.fd_config.node_rank + self.fd_config.parallel_config.local_data_parallel_id,self.fd_config.parallel_config.data_parallel_size - 1)}" ) self.engine_worker_queue = EngineWorkerQueue( address=address, is_server=False, - num_client=self.cfg.parallel_config.tensor_parallel_size, + num_client=self.fd_config.parallel_config.tensor_parallel_size, client_id=0, - local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, + local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, local_data_parallel_id=min( - self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id, - self.cfg.parallel_config.data_parallel_size - 1, + self.fd_config.worker_num_per_node * self.fd_config.node_rank + + self.fd_config.parallel_config.local_data_parallel_id, + self.fd_config.parallel_config.data_parallel_size - 1, ), ) @@ -310,8 +325,8 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): cur_task.prompt_token_ids[0] = task.outputs.token_ids[0] cur_task.num_cached_tokens = task.num_cached_tokens if ( - self.cfg.speculative_config.method in ["mtp"] - and self.cfg.scheduler_config.splitwise_role == "decode" + self.fd_config.speculative_config.method in ["mtp"] + and self.fd_config.scheduler_config.splitwise_role == "decode" ): cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids) if task.error_code != 200: @@ -338,7 +353,7 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): need_delete_tasks = [] for task in tasks: - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": status, msg = self.split_connector.check_decode_allocated(task) if not status: self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.") @@ -394,12 +409,12 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): for task in tasks: task.inference_start_time = time.time() if not is_prefill: - if not self.cfg.model_config.enable_mm: + if not self.fd_config.model_config.enable_mm: self.update_requests_chunk_size(tasks) else: self.update_mm_requests_chunk_size(tasks) self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz)) - if is_prefill and self.cfg.scheduler_config.name != 
"splitwise": + if is_prefill and self.fd_config.scheduler_config.name != "splitwise": self.engine_worker_queue.available_prefill_instances.put(1) return True @@ -432,14 +447,14 @@ def update_tokens(idx, chunk_size, update_chunk=False): if current_request_size[idx] <= 0: chunk_request_num -= 1 - if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: return current_request_size = [request.prompt_token_ids_len for request in requests] requests_chunk = [[] for _ in range(len(requests))] chunk_request_num = len(current_request_size) while chunk_request_num >= 1: - remain_batched_tokens = self.cfg.scheduler_config.max_num_batched_tokens + remain_batched_tokens = self.fd_config.scheduler_config.max_num_batched_tokens for idx in range(len(current_request_size)): if current_request_size[idx] <= 0: continue @@ -449,14 +464,16 @@ def update_tokens(idx, chunk_size, update_chunk=False): ) update_tokens(idx, chunk_size) - while remain_batched_tokens >= self.cfg.cache_config.block_size: + while remain_batched_tokens >= self.fd_config.cache_config.block_size: # 当前 max_num_batched_tokens 还有剩余时,优先分配给较短的请求 waiting_requests = [input_lens for input_lens in current_request_size if input_lens > 0] if len(waiting_requests) == 0: break available_tokens = ( - remain_batched_tokens // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size + remain_batched_tokens + // self.fd_config.cache_config.block_size + * self.fd_config.cache_config.block_size ) append_idx = current_request_size.index(min(waiting_requests)) chunk_size = min( @@ -473,7 +490,7 @@ def update_mm_requests_chunk_size(self, requests): """ update each multimodal request's chunk size info """ - if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: return for request in requests: @@ -560,7 +577,7 @@ def _insert_task_to_worker(self): continue if hasattr(self, "exist_prefill_task_signal") and self.exist_prefill_task_signal.value[0] > 0: if ( - self.cfg.scheduler_config.splitwise_role == "mixed" + self.fd_config.scheduler_config.splitwise_role == "mixed" or self.split_connector.has_splitwise_tasks() ): time.sleep(0.005) @@ -574,15 +591,15 @@ def _insert_task_to_worker(self): num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.cfg.max_prefill_batch, + self.fd_config.max_prefill_batch, ) self.resource_manager.check_and_free_block_tables() tasks = self.scheduler.get_requests( available_blocks=self.resource_manager.available_block_num(), - block_size=self.cfg.cache_config.block_size, - reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.cfg.scheduler_config.max_num_batched_tokens, + block_size=self.fd_config.cache_config.block_size, + reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.fd_config.scheduler_config.max_num_batched_tokens, batch=num_prefill_batch, ) @@ -590,7 +607,7 @@ def _insert_task_to_worker(self): time.sleep(0.001) continue - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": self.llm_logger.info("Inserting splitwise tasks") self.split_connector.send_splitwise_tasks(tasks, current_id) @@ -619,21 +636,21 @@ def _fetch_request(): is_fetching = True num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.cfg.max_prefill_batch, + 
self.fd_config.max_prefill_batch, ) - if self.cfg.model_config.enable_mm: + if self.fd_config.model_config.enable_mm: available_blocks = self.resource_manager.available_block_num() else: - available_blocks = self.cfg.cache_config.max_block_num_per_seq + available_blocks = self.fd_config.cache_config.max_block_num_per_seq tasks = self.scheduler.get_requests( available_blocks=available_blocks, - block_size=self.cfg.cache_config.block_size, - reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.cfg.model_config.max_model_len, + block_size=self.fd_config.cache_config.block_size, + reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.fd_config.model_config.max_model_len, batch=num_prefill_batch, ) - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": for task in tasks: # assure can allocate block ids in P while not self.resource_manager.preallocate_resource_in_p(task): @@ -642,7 +659,7 @@ def _fetch_request(): self.split_connector.send_splitwise_tasks([task], task.idx) need_delete_tasks = [] for task in tasks: - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": # assure fetch block ids from D status, msg = self.split_connector.check_decode_allocated(task) if not status: @@ -663,7 +680,7 @@ def _fetch_request(): tasks.remove(tmp_task) # release resource in P self.resource_manager.prerelease_resource(tmp_task) - if self.cfg.scheduler_config.splitwise_role == "prefill": + if self.fd_config.scheduler_config.splitwise_role == "prefill": # to send cache info to cache messager if tasks: self.split_connector.send_cache_infos(tasks, 0) @@ -680,7 +697,7 @@ def _fetch_request(): time.sleep(0.001) # Fetch requests and add them to the scheduling queue if tasks: - if self.cfg.scheduler_config.splitwise_role == "prefill": + if self.fd_config.scheduler_config.splitwise_role == "prefill": self.resource_manager.add_request_in_p(tasks) else: for task in tasks: @@ -695,7 +712,7 @@ def _fetch_request(): if self.engine_worker_queue.num_tasks() > 0: time.sleep(0.001) continue - if self.cfg.scheduler_config.splitwise_role != "mixed": + if self.fd_config.scheduler_config.splitwise_role != "mixed": if self.scheduler.get_unhandled_request_num() <= envs.FD_EP_MAX_PREFETCH_TASK_NUM and ( not is_fetching ): @@ -712,7 +729,7 @@ def _fetch_request(): tasks = self.resource_manager.schedule() # 3. Send to engine if tasks: - if self.cfg.scheduler_config.splitwise_role == "decode": + if self.fd_config.scheduler_config.splitwise_role == "decode": for task in tasks: if task.task_type == RequestType.PREEMPTED: msg = f"{task.request_id} decode not enough blocks, need to be rescheduled." 
@@ -744,7 +761,7 @@ def start_zmq_service(self, api_server_pid=None): self.recv_request_server = ZmqTcpServer(port=envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT, mode=zmq.PULL) self.send_response_server = ZmqTcpServer(port=envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT, mode=zmq.ROUTER) self.internal_adapter = InternalAdapter( - cfg=self.cfg, engine=self, dp_rank=self.cfg.node_rank * self.cfg.worker_num_per_node + cfg=self.fd_config, engine=self, dp_rank=self.fd_config.node_rank * self.fd_config.worker_num_per_node ) else: self.recv_request_server = ZmqIpcServer(name=api_server_pid, mode=zmq.PULL) @@ -763,12 +780,12 @@ def start_zmq_service(self, api_server_pid=None): def _insert_zmq_task_to_scheduler(self): added_requests: Dict[str, int] = dict() if envs.FD_ENABLE_INTERNAL_ADAPTER: - if self.cfg.scheduler_config.splitwise_role == "decode": + if self.fd_config.scheduler_config.splitwise_role == "decode": return while self.running: try: block = True if len(added_requests) == 0 else False - if not self.cfg.model_config.enable_mm: + if not self.fd_config.model_config.enable_mm: err, data = self.recv_request_server.receive_json_once(block) else: err, data = self.recv_request_server.receive_pyobj_once(block) @@ -930,7 +947,7 @@ def receiver_loop(): else: self.insert_tasks(tasks, allocated=True) - if self.cfg.innode_prefill_ports is not None: + if self.fd_config.innode_prefill_ports is not None: self.scheduler.put_results(tasks) else: if len(self.waiting_requests): @@ -975,12 +992,14 @@ def receiver_loop(): def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor): return self.resource_manager.cache_manager.launch_cache_manager( - cache_config=self.cfg.cache_config, - tensor_parallel_size=self.cfg.parallel_config.tensor_parallel_size, + cache_config=self.fd_config.cache_config, + tensor_parallel_size=self.fd_config.parallel_config.tensor_parallel_size, device_ids=device_ids, - pod_ip=self.cfg.master_ip, + pod_ip=self.fd_config.master_ip, engine_worker_queue_port=int( - self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] + self.fd_config.parallel_config.engine_worker_queue_port[ + self.fd_config.parallel_config.local_data_parallel_id + ] ), pid_suffix=ipc_signal_suffix, create_cache_tensor=create_cache_tensor, diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py index ca754566702..207229a98be 100644 --- a/fastdeploy/engine/engine.py +++ b/fastdeploy/engine/engine.py @@ -34,7 +34,6 @@ import paddle from tqdm import tqdm -from fastdeploy.config import ErnieArchitectures from fastdeploy.engine.args_utils import EngineArgs from fastdeploy.engine.common_engine import EngineService from fastdeploy.engine.expert_service import start_data_parallel_service @@ -89,11 +88,10 @@ def __init__(self, cfg): self.is_started = False self.input_processor = InputPreprocessor( - cfg.tokenizer, + cfg.model_config, cfg.reasoning_parser, cfg.limit_mm_per_prompt, cfg.mm_processor_kwargs, - cfg.model_config.enable_mm, cfg.tool_parser, ) self.engine = EngineService(cfg) @@ -490,13 +488,13 @@ def _start_worker_service(self): else len(self.data_processor.tokenizer.vocab) ) - is_ernie = ErnieArchitectures.contains_ernie_arch(self.cfg.model_config.architectures) - if is_ernie: - self.cfg.model_config.think_end_id = self.data_processor.tokenizer.get_vocab().get("", -1) - if self.cfg.model_config.think_end_id != -1: - llm_logger.info(f"Get think_end_id {self.cfg.model_config.think_end_id} from vocab.") - else: - llm_logger.info("No token found in vocabulary, 
the model can not do reasoning.")
+        think_end_id = self.data_processor.tokenizer.get_vocab().get("</think>", -1)
+        if think_end_id > 0:
+            llm_logger.info(f"Get think_end_id {think_end_id} from vocab.")
+        else:
+            llm_logger.info("No </think> token found in vocabulary, the model can not do reasoning.")
+        image_patch_id = self.data_processor.tokenizer.get_vocab().get("<|IMAGE_PLACEHOLDER|>", -1)
+        line_break_id = self.data_processor.tokenizer.get_vocab().get("\n", -1)

         ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port)
         ips = None
@@ -524,7 +522,9 @@ def _start_worker_service(self):
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
             f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"
-            f" --think_end_id {self.cfg.model_config.think_end_id}"
+            f" --think_end_id {think_end_id}"
+            f" --image_patch_id {image_patch_id}"
+            f" --line_break_id {line_break_id}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
diff --git a/fastdeploy/entrypoints/cli/tokenizer.py b/fastdeploy/entrypoints/cli/tokenizer.py
index fe477a7e66d..3012fd1f6c6 100644
--- a/fastdeploy/entrypoints/cli/tokenizer.py
+++ b/fastdeploy/entrypoints/cli/tokenizer.py
@@ -21,6 +21,7 @@
 import typing
 from pathlib import Path

+from fastdeploy.config import ModelConfig
 from fastdeploy.entrypoints.cli.types import CLISubcommand
 from fastdeploy.input.preprocess import InputPreprocessor

@@ -199,7 +200,7 @@ def print_separator(title=""):
         return

     # 初始化tokenizer
-    preprocessor = InputPreprocessor(model_name_or_path=args.model_name_or_path, enable_mm=args.enable_mm)
+    preprocessor = InputPreprocessor(model_config=ModelConfig({"model": args.model_name_or_path}))
     tokenizer = preprocessor.create_processor().tokenizer

     # 执行操作
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 525498ed5ce..eb2b8f297c1 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -36,7 +36,6 @@
     ZmqIpcClient,
 )
 from fastdeploy.metrics.work_metrics import work_process_metrics
-from fastdeploy.multimodal.registry import MultimodalRegistry
 from fastdeploy.platforms import current_platform
 from fastdeploy.utils import (
     EngineError,
@@ -61,7 +60,6 @@ def __init__(
         port,
         limit_mm_per_prompt,
         mm_processor_kwargs,
-        # enable_mm=False,
         reasoning_parser=None,
         data_parallel_size=1,
         enable_logprob=False,
@@ -70,20 +68,15 @@ def __init__(
         enable_prefix_caching=None,
         splitwise_role=None,
     ):
-        architectures = ModelConfig({"model": model_name_or_path}).architectures[0]
-        if MultimodalRegistry.contains_model(architectures):
-            self.enable_mm = True
-        else:
-            self.enable_mm = False
-
+        model_config = ModelConfig({"model": model_name_or_path})
         input_processor = InputPreprocessor(
-            tokenizer,
+            model_config,
             reasoning_parser,
             limit_mm_per_prompt,
             mm_processor_kwargs,
-            self.enable_mm,
             tool_parser,
         )
+        self.enable_mm = model_config.enable_mm
         self.enable_logprob = enable_logprob
         self.reasoning_parser = reasoning_parser
         self.data_processor = input_processor.create_processor()
@@ -263,8 +256,8 @@ def valid_parameters(self, data):
                 raise ParameterError("max_tokens", f"max_tokens can be defined [1, {self.max_model_len}).")

         if data.get("reasoning_max_tokens") is not None:
-            if data["reasoning_max_tokens"] < 1:
-                raise ParameterError("reasoning_max_tokens", 
"reasoning_max_tokens must be greater than 1") + if data["reasoning_max_tokens"] < 0: + raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 0") if data["reasoning_max_tokens"] > data["max_tokens"]: data["reasoning_max_tokens"] = data["max_tokens"] api_server_logger.warning( diff --git a/fastdeploy/entrypoints/openai/api_server.py b/fastdeploy/entrypoints/openai/api_server.py index 235f1cccd26..d246bbb15a9 100644 --- a/fastdeploy/entrypoints/openai/api_server.py +++ b/fastdeploy/entrypoints/openai/api_server.py @@ -156,7 +156,6 @@ async def lifespan(app: FastAPI): port=int(args.engine_worker_queue_port[args.local_data_parallel_id]), limit_mm_per_prompt=args.limit_mm_per_prompt, mm_processor_kwargs=args.mm_processor_kwargs, - # args.enable_mm, reasoning_parser=args.reasoning_parser, data_parallel_size=args.data_parallel_size, enable_logprob=args.enable_logprob, diff --git a/fastdeploy/input/preprocess.py b/fastdeploy/input/preprocess.py index 5b8eb3ccd3c..b3af46c95fa 100644 --- a/fastdeploy/input/preprocess.py +++ b/fastdeploy/input/preprocess.py @@ -24,7 +24,7 @@ class InputPreprocessor: """ Args: - model_name_or_path (str): + model_config (ModelConfig): Model name or path to the pretrained model. If a model name is provided, it should be a key in the Hugging Face Transformers' model registry (https://huggingface.co/models). The model will be downloaded from the Hugging Face model hub if necessary. @@ -32,8 +32,6 @@ class InputPreprocessor: reasoning_parser (str, optional): Reasoning parser type. Defaults to None. Flag specifies the reasoning parser to use for extracting reasoning content from the model output - enable_mm (bool, optional): - Whether to use the multi-modal model processor. Defaults to False. Raises: ValueError: @@ -43,32 +41,20 @@ class InputPreprocessor: def __init__( self, - model_name_or_path: str, + model_config: ModelConfig, reasoning_parser: str = None, limit_mm_per_prompt: Optional[Dict[str, Any]] = None, mm_processor_kwargs: Optional[Dict[str, Any]] = None, - enable_mm: bool = False, tool_parser: str = None, ) -> None: - - self.model_name_or_path = model_name_or_path + self.model_config = model_config + self.model_name_or_path = self.model_config.model self.reasoning_parser = reasoning_parser - self.enable_mm = enable_mm self.limit_mm_per_prompt = limit_mm_per_prompt self.mm_processor_kwargs = mm_processor_kwargs self.tool_parser = tool_parser def create_processor(self): - """ - 创建数据处理器。如果启用了多模态注册表,则使用该表中的模型;否则,使用传递给构造函数的模型名称或路径。 - 返回值:DataProcessor(如果不启用多模态注册表)或MultiModalRegistry.Processor(如果启用多模态注册表)。 - - Args: - 无参数。 - - Returns: - DataProcessor or MultiModalRegistry.Processor (Union[DataProcessor, MultiModalRegistry.Processor]): 数据处理器。 - """ reasoning_parser_obj = None tool_parser_obj = None @@ -77,8 +63,7 @@ def create_processor(self): if self.tool_parser: tool_parser_obj = ToolParserManager.get_tool_parser(self.tool_parser) - config = ModelConfig({"model": self.model_name_or_path}) - architectures = config.architectures[0] + architecture = self.model_config.architectures[0] try: from fastdeploy.plugins.input_processor import load_input_processor_plugins @@ -90,8 +75,8 @@ def create_processor(self): tool_parser_obj=tool_parser_obj, ) except: - if not self.enable_mm: - if not ErnieArchitectures.contains_ernie_arch(architectures): + if not self.model_config.enable_mm: + if not ErnieArchitectures.contains_ernie_arch(architecture): from fastdeploy.input.text_processor import DataProcessor self.processor = DataProcessor( @@ -108,7 
+93,7 @@ def create_processor(self): tool_parser_obj=tool_parser_obj, ) else: - if ErnieArchitectures.contains_ernie_arch(architectures): + if ErnieArchitectures.contains_ernie_arch(architecture): from fastdeploy.input.ernie4_5_vl_processor import ( Ernie4_5_VLProcessor, ) @@ -124,7 +109,7 @@ def create_processor(self): from fastdeploy.input.qwen_vl_processor import QwenVLProcessor self.processor = QwenVLProcessor( - config=config, + config=self.model_config, model_name_or_path=self.model_name_or_path, limit_mm_per_prompt=self.limit_mm_per_prompt, mm_processor_kwargs=self.mm_processor_kwargs, diff --git a/fastdeploy/model_executor/models/interfaces_base.py b/fastdeploy/model_executor/models/interfaces_base.py deleted file mode 100644 index b7ece5fe69a..00000000000 --- a/fastdeploy/model_executor/models/interfaces_base.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Type - -from paddle import nn - - -def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool: - from .model_base import ModelForCasualLM - - return issubclass(model_cls, ModelForCasualLM) - - -def is_pooling_model(model_cls: Type[nn.Layer]) -> bool: - class_name = model_cls.__name__ - pooling_indicators = ["Embedding", "ForSequenceClassification"] - return ( - any(indicator in class_name for indicator in pooling_indicators) - or hasattr(model_cls, "is_embedding_model") - and model_cls.is_embedding_model - ) - - -def is_multimodal_model(class_name: str) -> bool: - multimodal_indicators = ["VL", "Vision", "ConditionalGeneration"] - return any(indicator in class_name for indicator in multimodal_indicators) - - -def determine_model_category(class_name: str): - from fastdeploy.model_executor.models.model_base import ModelCategory - - if any(pattern in class_name for pattern in ["VL", "Vision", "ConditionalGeneration"]): - return ModelCategory.MULTIMODAL - elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]): - return ModelCategory.EMBEDDING - return ModelCategory.TEXT_GENERATION - - -def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: - if model_cls is not None: - return getattr(model_cls, "default_pooling_type", "LAST") - return "LAST" diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index fddfb4de51a..1ddd39c9a32 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -12,7 +12,7 @@ import importlib from abc import ABC, abstractmethod from dataclasses import dataclass -from enum import Enum +from enum import IntFlag, auto from functools import lru_cache from typing import Dict, List, Optional, Tuple, Type, Union @@ -26,19 +26,19 @@ iter_architecture_defaults, try_match_architecture_defaults, ) -from fastdeploy.model_executor.models.interfaces_base import ( - determine_model_category, - get_default_pooling_type, - is_multimodal_model, - 
is_pooling_model, - is_text_generation_model, -) -class ModelCategory(Enum): - TEXT_GENERATION = "text_generation" - MULTIMODAL = "multimodal" - EMBEDDING = "embedding" +def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: + if model_cls is not None: + return getattr(model_cls, "default_pooling_type", "LAST") + return "LAST" + + +class ModelCategory(IntFlag): + TEXT_GENERATION = auto() + MULTIMODAL = auto() + EMBEDDING = auto() + REASONING = auto() @dataclass(frozen=True) @@ -47,18 +47,22 @@ class ModelInfo: category: ModelCategory is_text_generation: bool is_multimodal: bool + is_reasoning: bool is_pooling: bool module_path: str default_pooling_type: str @staticmethod - def from_model_cls(model_cls: Type[nn.Layer], module_path: str = "") -> "ModelInfo": + def from_model_cls( + model_cls: Type[nn.Layer], module_path: str = "", category: ModelCategory = None + ) -> "ModelInfo": return ModelInfo( architecture=model_cls.__name__, - category=determine_model_category(model_cls.__name__), - is_text_generation=is_text_generation_model(model_cls), - is_multimodal=is_multimodal_model(model_cls.__name__), - is_pooling=is_pooling_model(model_cls), + category=category, + is_text_generation=ModelCategory.TEXT_GENERATION in category, + is_multimodal=ModelCategory.MULTIMODAL in category, + is_reasoning=ModelCategory.REASONING in category, + is_pooling=ModelCategory.EMBEDDING in category, default_pooling_type=get_default_pooling_type(model_cls), module_path=module_path, ) @@ -83,6 +87,7 @@ class LazyRegisteredModel(BaseRegisteredModel): module_name: str module_path: str class_name: str + category: ModelCategory def load_model_cls(self) -> Type[nn.Layer]: try: @@ -94,7 +99,7 @@ def load_model_cls(self) -> Type[nn.Layer]: def inspect_model_cls(self) -> ModelInfo: model_cls = self.load_model_cls() - return ModelInfo.from_model_cls(model_cls, self.module_name) + return ModelInfo.from_model_cls(model_cls, self.module_name, self.category) @lru_cache(maxsize=128) @@ -126,6 +131,7 @@ def _register_enhanced_models(self): module_name=model_info["module_name"], module_path=model_info["module_path"], class_name=model_info["class_name"], + category=model_info["category"], ) self.models[arch] = model self._registered_models[arch] = model @@ -317,6 +323,17 @@ def is_multimodal_model(self, architectures: Union[str, List[str]], model_config return model_info.is_multimodal return False + def is_reasoning_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool: + """Check if it's a reasoning model""" + if isinstance(architectures, str): + architectures = [architectures] + + for arch in architectures: + model_info = self._try_inspect_model_cls(arch) + if model_info is not None: + return model_info.is_reasoning + return False + def is_text_generation_model(self, architectures: Union[str, List[str]], model_config: ModelConfig = None) -> bool: """Check if it's a text generation model""" if isinstance(architectures, str): diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 34685b52019..384c893f3fc 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -42,6 +42,8 @@ elif current_platform.is_dcu(): from fastdeploy.model_executor.ops.gpu import ( get_padding_offset, + limit_thinking_content_length_v1, + limit_thinking_content_length_v2, save_output, set_stop_value_multi_ends, step_paddle, @@ -81,7 +83,6 @@ speculate_step_reschedule, ) - from 
fastdeploy.inter_communicator import ZmqIpcClient from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData from fastdeploy.worker.output import ModelOutputData, ModelRunnerOutput, SamplerOutput @@ -641,3 +642,33 @@ def rebuild_padding( else: raise RuntimeError("Not supported platform") return hidden_states + + +def limit_thinking_content_length( + limit_strategy: str, + sampled_token_ids: paddle.Tensor, + max_think_lens: paddle.Tensor, + step_idx: paddle.Tensor, + limit_think_status: paddle.Tensor, + think_end_id: int, + line_break_id: int = None, +): + if limit_strategy == "": + # for ernie4_5_vl + limit_thinking_content_length_v1( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + ) + elif limit_strategy == "\n\n\n": + # for ernie_x1 + limit_thinking_content_length_v2( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + line_break_id, + ) diff --git a/fastdeploy/multimodal/registry.py b/fastdeploy/multimodal/registry.py deleted file mode 100644 index f014ba55532..00000000000 --- a/fastdeploy/multimodal/registry.py +++ /dev/null @@ -1,35 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - - -class MultimodalRegistry: - """ - A registry for multimodal models - """ - - mm_models: set[str] = { - "Ernie4_5_VLMoeForConditionalGeneration", - "Ernie5MoeForCausalLM", - "Qwen2_5_VLForConditionalGeneration", - "Ernie5ForCausalLM", - } - - @classmethod - def contains_model(cls, name: str) -> bool: - """ - Check if the given name exists in registry. - """ - return name in cls.mm_models diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 9b2e531b880..d33116d45ed 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -63,6 +63,7 @@ ) from fastdeploy.model_executor.pre_and_post_process import ( + limit_thinking_content_length, post_process, pre_process, rebuild_padding, @@ -323,6 +324,16 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = position_ids, request.get("max_tokens", 2048) ) + if request.get("reasoning_max_tokens") is not None: + assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." 
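
For reference, the block being added here reduces to a simple per-request rule: a request that carries `reasoning_max_tokens` gets that value as its thinking budget, anything else gets the `-1` sentinel meaning "no limit", and the per-slot status always restarts at `0`. A minimal plain-Python sketch of that rule (the helper name and tuple return are illustrative only, not part of this patch):

```
def thinking_slot_values(request: dict) -> tuple[int, int]:
    """Return (max_think_len, limit_think_status) for one request slot.

    -1 disables the thinking-length limit for the slot; status starts at 0.
    """
    reasoning_max_tokens = request.get("reasoning_max_tokens")
    if reasoning_max_tokens is None:
        return -1, 0
    assert reasoning_max_tokens >= 0, "reasoning_max_tokens in requests need >= 0."
    return reasoning_max_tokens, 0


# thinking_slot_values({"reasoning_max_tokens": 512}) -> (512, 0)
# thinking_slot_values({})                            -> (-1, 0)
```
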
+ # Enable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + else: + # Disable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1 + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + if isinstance(request.prompt_token_ids, np.ndarray): prompt_token_ids = request.prompt_token_ids.tolist() else: @@ -546,6 +557,16 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + if request.get("reasoning_max_tokens") is not None: + assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." + # Enable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + else: + # Disable thinking + self.share_inputs["max_think_lens"][idx : idx + 1, :] = -1 + self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 + def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) if res is not None: @@ -833,16 +854,15 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["kv_num_blocks_x_cpu"] = None # CPU self.share_inputs["max_len_kv_cpu"] = None # CPU - # Initialize rotary position embedding - tmp_position_ids = paddle.arange(self.model_config.max_model_len).reshape((1, -1)) - # Initialize thinking related buffers + self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") + self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - # TODO(gongshaotian): move to models + # Initialize rotary position embedding if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( rotary_dim=self.model_config.head_dim, - position_ids=tmp_position_ids, + position_ids=paddle.arange(self.model_config.max_model_len).reshape((1, -1)), base=self.model_config.rope_theta, model_config=self.model_config, partial_rotary_factor=self.model_config.partial_rotary_factor, @@ -1737,6 +1757,21 @@ class at the server level, which is too granular for ModelRunner. group=self.parallel_config.tp_group, ) + if self.model_config.think_end_id > 0 and not self.speculative_decoding: + assert ( + sampler_output is not None + ), "Warning, limit thinking content length not support speculative decoding." + assert self.model_config.line_break_id > 0 + limit_thinking_content_length( + limit_strategy="", # Temporary writing death + sampled_token_ids=sampler_output.sampled_token_ids, + max_think_lens=self.share_inputs["max_think_lens"], + step_idx=self.share_inputs["step_idx"], + limit_think_status=self.share_inputs["limit_think_status"], + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, + ) + # 5. Post Process model_output_data = ModelOutputData( next_tokens=self.share_inputs["next_tokens"], diff --git a/fastdeploy/worker/utils.py b/fastdeploy/worker/utils.py deleted file mode 100644 index 7a2562f248b..00000000000 --- a/fastdeploy/worker/utils.py +++ /dev/null @@ -1,50 +0,0 @@ -""" -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License" -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" - -import os -import traceback - - -def check_safetensors_model(model_dir: str): - """ - model_dir : the directory of the model - Check whether the model is safetensors format - """ - model_files = list() - all_files = os.listdir(model_dir) - for x in all_files: - if x.startswith("model") and x.endswith(".safetensors"): - model_files.append(x) - - is_safetensors = len(model_files) > 0 - if not is_safetensors: - return False - - if len(model_files) == 1 and model_files[0] == "model.safetensors": - return True - try: - # check all the file exists - safetensors_num = int(model_files[0].strip(".safetensors").split("-")[-1]) - flags = [0] * safetensors_num - for x in model_files: - current_index = int(x.strip(".safetensors").split("-")[1]) - flags[current_index - 1] = 1 - assert ( - sum(flags) == safetensors_num - ), f"Number of safetensor files should be {len(model_files)}, but now it's {sum(flags)}" - except Exception as e: - raise Exception(f"Failed to check unified checkpoint, details: {e}, {str(traceback.format_exc())}.") - return is_safetensors diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index b2dcc792024..21a854d5f17 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -40,7 +40,6 @@ PlasAttentionConfig, SpeculativeConfig, ) -from fastdeploy.input.ernie4_5_tokenizer import Ernie4_5Tokenizer from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import ExistTaskStatus, IPCSignal, ModelWeightsStatus from fastdeploy.model_executor.layers.quantization import parse_quant_config @@ -115,25 +114,9 @@ def init_distributed_environment(seed: int = 20) -> Tuple[int, int]: def update_fd_config_for_mm(fd_config: FDConfig) -> None: architectures = fd_config.model_config.architectures if fd_config.model_config.enable_mm and ErnieArchitectures.contains_ernie_arch(architectures): - tokenizer = Ernie4_5Tokenizer.from_pretrained( - fd_config.model_config.model, - model_max_length=fd_config.model_config.max_model_len, - padding_side="right", - use_fast=False, - ) - tokenizer.ignored_index = -100 - if tokenizer.pad_token is None: - tokenizer.pad_token = tokenizer.unk_token - fd_config.model_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size fd_config.model_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank - vision_config = fd_config.model_config.vision_config - vision_config.dtype = fd_config.model_config.dtype - # vision_config.tensor_parallel_degree = fd_config.parallel_config.tensor_parallel_size - # vision_config.tensor_parallel_rank = fd_config.parallel_config.tensor_parallel_rank - fd_config.model_config.im_patch_id = tokenizer.get_vocab()["<|IMAGE_PLACEHOLDER|>"] - fd_config.model_config.think_end_id = tokenizer.get_vocab()[""] - fd_config.model_config.sequence_parallel = fd_config.parallel_config.sequence_parallel + fd_config.model_config.vision_config.dtype = fd_config.model_config.dtype class PaddleDisWorkerProc: @@ -573,6 +556,8 @@ def parse_args(): ) parser.add_argument("--ori_vocab_size", 
type=int, default=None) parser.add_argument("--think_end_id", type=int, default=-1) + parser.add_argument("--image_patch_id", type=int, default=-1) + parser.add_argument("--line_break_id", type=int, default=-1) parser.add_argument( "--quantization", diff --git a/tests/ce/server/test_completions.py b/tests/ce/server/test_completions.py index 5fff0ae1251..188586d7d43 100644 --- a/tests/ce/server/test_completions.py +++ b/tests/ce/server/test_completions.py @@ -52,7 +52,6 @@ def test_completion_echo_stream_one_prompt_rti(): payload = build_request_payload(TEMPLATE, data) resp = send_request(COMPLETIONS_URL, payload, stream=True) - last_data = None # 初始化计数器 counter = 0 second_data = None @@ -87,7 +86,6 @@ def test_completion_echo_stream_one_prompt(): payload = build_request_payload(TEMPLATE, data) resp = send_request(COMPLETIONS_URL, payload, stream=True) - last_data = None # 初始化计数器 counter = 0 second_data = None @@ -123,10 +121,6 @@ def test_completion_echo_stream_more_prompt(): payload = build_request_payload(TEMPLATE, data) resp = send_request(COMPLETIONS_URL, payload, stream=True) - last_data = None - # 初始化计数器 - counter = 0 - second_data = None # 初始化字典来存储每个index的第二包数据 second_data_by_index = {0: None, 1: None} # 初始化字典来记录每个index的包计数 From 1912e72171a8cd6f22285fdcc6973cb1a1d06a90 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 16:26:07 +0800 Subject: [PATCH 04/24] fix --- fastdeploy/entrypoints/engine_client.py | 2 +- fastdeploy/model_executor/pre_and_post_process.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index eb2b8f297c1..7a23525530f 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -68,7 +68,7 @@ def __init__( enable_prefix_caching=None, splitwise_role=None, ): - model_config = ModelConfig({"model": model_name_or_path}).architectures[0] + model_config = ModelConfig({"model": model_name_or_path}) input_processor = InputPreprocessor( model_config, reasoning_parser, diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 384c893f3fc..70ae261c525 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -672,3 +672,5 @@ def limit_thinking_content_length( think_end_id, line_break_id, ) + else: + raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") From fe0fee884b211ae8c60421c6c3a57b399dea9134 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 13 Oct 2025 17:03:37 +0800 Subject: [PATCH 05/24] fix --- fastdeploy/config.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 400807339a2..a4d055a5620 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -227,10 +227,6 @@ def __init__( self.im_patch_id = args.get("image_patch_id", -1) self.line_break_id = args.get("line_break_id", -1) - self.override_name_from_config() - self.read_from_env() - self.read_model_config() - @property def registry(self): from fastdeploy.model_executor.models.model_base import ModelRegistry @@ -506,6 +502,9 @@ def __post_init__(self): self._architecture = arch self.pooler_config = self._init_pooler_config() + self.override_name_from_config() + self.read_from_env() + self.read_model_config() class ParallelConfig: From 81674a5cdd07a59175c84e5f476195f48915661d Mon Sep 17 00:00:00 2001 From: yuanlehome 
Date: Mon, 13 Oct 2025 19:26:47 +0800 Subject: [PATCH 06/24] fix --- fastdeploy/config.py | 80 +++++++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index a4d055a5620..15c3957c7fb 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -227,6 +227,47 @@ def __init__( self.im_patch_id = args.get("image_patch_id", -1) self.line_break_id = args.get("line_break_id", -1) + self._post_init() + + def _post_init(self): + self.is_unified_ckpt = check_unified_ckpt(self.model) + self.runner_type = self._get_runner_type(self.architectures, self.runner) + self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert) + registry = self.registry + is_generative_model = registry.is_text_generation_model(self.architectures, self) + is_pooling_model = registry.is_pooling_model(self.architectures, self) + is_multimodal_model = registry.is_multimodal_model(self.architectures, self) + self.is_reasoning_model = registry.is_reasoning_model(self.architectures, self) + + self.enable_mm = is_multimodal_model + + if self.runner_type == "generate" and not is_generative_model: + if is_multimodal_model: + pass + else: + generate_converts = _RUNNER_CONVERTS["generate"] + if self.convert_type not in generate_converts: + raise ValueError("This model does not support '--runner generate.") + if self.runner_type == "pooling" and not is_pooling_model: + pooling_converts = _RUNNER_CONVERTS["pooling"] + if self.convert_type not in pooling_converts: + convert_option = "<" + "|".join(pooling_converts) + ">" + raise ValueError( + "This model does not support `--runner pooling`. " + f"You can pass `--convert {convert_option} to adapt " + "it into a pooling model." 
+ ) + + self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) + model_info, arch = registry.inspect_model_cls(self.architectures, self) + self._model_info = model_info + self._architecture = arch + + self.pooler_config = self._init_pooler_config() + self.override_name_from_config() + self.read_from_env() + self.read_model_config() + @property def registry(self): from fastdeploy.model_executor.models.model_base import ModelRegistry @@ -467,45 +508,6 @@ def print(self): logger.info("{:<20}:{:<6}{}".format(k, "", v)) logger.info("=============================================================") - def __post_init__(self): - self.is_unified_ckpt = check_unified_ckpt(self.model) - self.runner_type = self._get_runner_type(self.architectures, self.runner) - self.convert_type = self._get_convert_type(self.architectures, self.runner_type, self.convert) - registry = self.registry - is_generative_model = registry.is_text_generation_model(self.architectures, self) - is_pooling_model = registry.is_pooling_model(self.architectures, self) - is_multimodal_model = registry.is_multimodal_model(self.architectures, self) - self.is_reasoning_model = registry.is_reasoning_model(self.architectures, self) - - self.enable_mm = is_multimodal_model - - if self.runner_type == "generate" and not is_generative_model: - if is_multimodal_model: - pass - else: - generate_converts = _RUNNER_CONVERTS["generate"] - if self.convert_type not in generate_converts: - raise ValueError("This model does not support '--runner generate.") - if self.runner_type == "pooling" and not is_pooling_model: - pooling_converts = _RUNNER_CONVERTS["pooling"] - if self.convert_type not in pooling_converts: - convert_option = "<" + "|".join(pooling_converts) + ">" - raise ValueError( - "This model does not support `--runner pooling`. " - f"You can pass `--convert {convert_option} to adapt " - "it into a pooling model." 
- ) - - self.supported_tasks = self._get_supported_tasks(self.architectures, self.runner_type, self.convert_type) - model_info, arch = registry.inspect_model_cls(self.architectures, self) - self._model_info = model_info - self._architecture = arch - - self.pooler_config = self._init_pooler_config() - self.override_name_from_config() - self.read_from_env() - self.read_model_config() - class ParallelConfig: """Configuration for the distributed execution.""" From 3282a2fdadc3e5c24330bb798c33f36c9db13e7a Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 14 Oct 2025 11:53:11 +0800 Subject: [PATCH 07/24] update --- fastdeploy/config.py | 17 ----------------- fastdeploy/envs.py | 2 ++ fastdeploy/worker/gpu_model_runner.py | 2 +- fastdeploy/worker/worker_process.py | 3 --- 4 files changed, 3 insertions(+), 21 deletions(-) diff --git a/fastdeploy/config.py b/fastdeploy/config.py index 15c3957c7fb..bcccd80e09b 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -1305,21 +1305,6 @@ def print(self): logger.info("=============================================================") -class DecodingConfig: - """ - Configuration for decoding - """ - - def __init__( - self, - args, - ): - self.pad_token_id = None - for key, value in args.items(): - if hasattr(self, key): - setattr(self, key, value) - - class CommitConfig: """ Configuration for tracking version information from version.txt @@ -1392,7 +1377,6 @@ def __init__( commit_config: CommitConfig = CommitConfig(), scheduler_config: SchedulerConfig = None, device_config: DeviceConfig = None, - decoding_config: DecodingConfig = None, quant_config: QuantConfigBase = None, graph_opt_config: GraphOptimizationConfig = None, plas_attention_config: PlasAttentionConfig = None, @@ -1423,7 +1407,6 @@ def __init__( self.quant_config: Optional[QuantConfigBase] = quant_config self.graph_opt_config: Optional[GraphOptimizationConfig] = graph_opt_config self.early_stop_config: Optional[EarlyStopConfig] = early_stop_config - self.decoding_config: DecodingConfig = decoding_config # type: ignore self.cache_config: CacheConfig = cache_config # type: ignore self.plas_attention_config: Optional[PlasAttentionConfig] = plas_attention_config # Initialize cuda graph capture list diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index bb5bee9680c..68f860719e5 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -118,6 +118,8 @@ "FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))), # Whether to clear cpu cache when clearing model weights. "FD_ENABLE_SWAP_SPACE_CLEARING": lambda: int(os.getenv("FD_ENABLE_SWAP_SPACE_CLEARING", "0")), + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), } diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index d33116d45ed..bc3f98f87cb 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1763,7 +1763,7 @@ class at the server level, which is too granular for ModelRunner. ), "Warning, limit thinking content length not support speculative decoding." 
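
The hunk continuing below swaps the temporarily hardcoded `limit_strategy` for the new `FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR` environment variable. Roughly, that strategy string selects which truncation op runs, as sketched here; the two strategy constants are placeholders for the model-specific values (ernie4_5_vl vs ernie_x1), and the function is illustrative rather than code added by this series:

```
import os

# Placeholders only: the real strategy strings are model specific and are the
# accepted values of FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR.
ERNIE_VL_STRATEGY = "<ernie4_5_vl-truncate-str>"
ERNIE_X1_STRATEGY = "<ernie_x1-truncate-str>"


def pick_truncate_op(limit_strategy: str) -> str:
    """Mirror the dispatch in limit_thinking_content_length()."""
    if limit_strategy == ERNIE_VL_STRATEGY:
        return "limit_thinking_content_length_v1"  # single think-end token injection
    if limit_strategy == ERNIE_X1_STRATEGY:
        return "limit_thinking_content_length_v2"  # multi-step line-break/think-end injection
    raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.")


# Assumed here: the env var's default maps to the ernie4_5_vl (v1) strategy.
strategy = os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ERNIE_VL_STRATEGY)
```
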
assert self.model_config.line_break_id > 0 limit_thinking_content_length( - limit_strategy="", # Temporary writing death + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, sampled_token_ids=sampler_output.sampled_token_ids, max_think_lens=self.share_inputs["max_think_lens"], step_idx=self.share_inputs["step_idx"], diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 21a854d5f17..e63896ce220 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -28,7 +28,6 @@ from fastdeploy import envs from fastdeploy.config import ( CacheConfig, - DecodingConfig, DeviceConfig, EarlyStopConfig, ErnieArchitectures, @@ -688,7 +687,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: paddle.set_default_dtype(args.dtype) model_config = ModelConfig(vars(args)) device_config = DeviceConfig(vars(args)) - decoding_config = DecodingConfig(vars(args)) speculative_config = SpeculativeConfig(args.speculative_config) parallel_config = ParallelConfig(vars(args)) cache_config = CacheConfig(vars(args)) @@ -787,7 +785,6 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig: speculative_config=speculative_config, device_config=device_config, load_config=load_config, - decoding_config=decoding_config, quant_config=quant_config, graph_opt_config=graph_opt_config, early_stop_config=early_stop_config, From 6f1f082c2156053d1caf0b59d962880044274f88 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Tue, 14 Oct 2025 18:24:19 +0800 Subject: [PATCH 08/24] fix set_value_by_flags_and_idx --- custom_ops/gpu_ops/set_value_by_flags_and_idx.cu | 2 +- .../speculate_decoding/speculate_set_value_by_flags_and_idx.cu | 2 +- docs/usage/environment_variables.md | 3 +++ docs/zh/usage/environment_variables.md | 3 +++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu index 9e7a0ce1134..39181683038 100644 --- a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu @@ -35,7 +35,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1]; } else { // decoedr, get first token diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu index 4b1c7747e9c..a53a4eda641 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu @@ -37,7 +37,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped // printf("step_idx[tid] %d\n", step_idx[tid]); - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { for (int i = 0; i < accept_num[tid]; i++) { pre_ids_all_now[step_idx[tid] - i] = accept_tokens_now[accept_num[tid] - 1 - i]; diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md index 103ff1e5ef2..aa2ac7ad2d8 100644 --- 
a/docs/usage/environment_variables.md +++ b/docs/usage/environment_variables.md @@ -78,5 +78,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # Whether to use Machete for wint4 dense GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"), + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), + } ``` diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md index c1289bf984d..e6b9900bd27 100644 --- a/docs/zh/usage/environment_variables.md +++ b/docs/zh/usage/environment_variables.md @@ -77,5 +77,8 @@ environment_variables: dict[str, Callable[[], Any]] = { # 是否使用 Machete 后端的 wint4 GEMM. "FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"), + + # Used to truncate the string inserted during thinking when reasoning in a model. ( for ernie4_5_vl, \n\n\n for ernie_x1) + "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), } ``` From 31aa8eefdfe9f1b34c7c2da8ae21ade5352aed2e Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 11:03:51 +0800 Subject: [PATCH 09/24] fix --- fastdeploy/model_executor/pre_and_post_process.py | 1 + fastdeploy/worker/gpu_model_runner.py | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 70ae261c525..f0c4c7011e4 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -664,6 +664,7 @@ def limit_thinking_content_length( ) elif limit_strategy == "\n\n\n": # for ernie_x1 + assert line_break_id > 0 limit_thinking_content_length_v2( sampled_token_ids, max_think_lens, diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index f1e4d3b803b..6fe1620ac3f 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1758,7 +1758,6 @@ class at the server level, which is too granular for ModelRunner. assert ( sampler_output is not None ), "Warning, limit thinking content length not support speculative decoding." 
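
For readers following the kernel changes further down, the per-slot logic of `limit_thinking_content_length_v1` boils down to a three-state machine (0 = still thinking, 1 = think-end token force-injected, 2 = responding). A plain-Python rendering of one decoding step for one slot (scalar signature and helper name are for illustration; the real op updates the whole batch in place on the GPU):

```
def limit_thinking_v1_step(next_token: int, step: int, max_think_len: int,
                           status: int, think_end_id: int) -> tuple[int, int]:
    """One slot, one decoding step of the v1 thinking-length limiter."""
    if max_think_len < 0 or status == 2:           # limit disabled, or already in the answer phase
        return next_token, status
    if status < 1 and step >= max_think_len:       # budget used up: force the think-end token
        next_token = think_end_id
        status = 1
    if status < 2 and next_token == think_end_id:  # model-emitted or forced think-end closes thinking
        status = 2
    return next_token, status
```

The v2 variant used for ernie_x1 additionally takes `line_break_id` and spends a few consecutive steps splicing in line-break and think-end tokens before switching to the answer phase, which is why the `line_break_id > 0` assert moves out of this call site and into that branch of the dispatcher.
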
- assert self.model_config.line_break_id > 0 limit_thinking_content_length( limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, sampled_token_ids=sampler_output.sampled_token_ids, From bc60b268abf016cd52276d4beb0ab3f92483dccc Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 16:03:15 +0800 Subject: [PATCH 10/24] fix --- custom_ops/gpu_ops/cpp_extensions.cc | 17 ++ .../limit_thinking_content_length_v1.cu | 114 ++++++------- .../limit_thinking_content_length_v2.cu | 157 ++++++++---------- .../model_executor/pre_and_post_process.py | 4 +- 4 files changed, 139 insertions(+), 153 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 57d6201efd9..d7c6bb44e5e 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -898,6 +898,19 @@ void SaveOutMmsgStatic(const paddle::Tensor& x, int64_t rank_id, bool save_each_rank); +void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens, + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id); + +void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id, + const int64_t line_break_id); + PYBIND11_MODULE(fastdeploy_ops, m) { m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"), @@ -1286,4 +1299,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("min_p_sampling", &MinPSamplingFromProbs, "min_p_sampling function"); m.def("save_output", &SaveOutMmsgStatic, "save_output function"); + + m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "save_output function"); + + m.def("limit_thinking_content_length_v2", &LimitThinkingContentLengthV2, "save_output function"); } diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu index e16e6dc66eb..d4c494b53a4 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v1.cu @@ -18,82 +18,70 @@ __global__ void limit_thinking_content_length_kernel_v1( int64_t *next_tokens, const int *max_think_lens, - const int64_t *step_idx, // step_idx 不再需要被修改,改为 const + const int64_t *step_idx, int *limit_think_status, const int64_t think_end_id, - const int bs) -{ - int bid = threadIdx.x; - if (bid >= bs) - return; + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; - // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 - const int max_think_len = max_think_lens[bid]; - if (max_think_len < 0) - return; - int current_limit_content_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. - if (current_limit_content_status == 2) - { - return; - } + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. 
+ if (current_limit_think_status == 2) { + return; + } - int64_t next_token = next_tokens[bid]; - const int64_t step = step_idx[bid]; + int64_t next_token = next_tokens[bid]; + const int64_t step = step_idx[bid]; - // ======================= 思考阶段控制 ======================= - // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 - if (current_limit_content_status < 1) - { - // 当开启思考长度控制时,检查是否超时 - if (step >= max_think_len) - { - // 强制将当前token替换为结束思考的token - next_token = think_end_id; - // 将状态推进到 1, 表示 "正在结束思考" - current_limit_content_status = 1; + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + if (current_limit_think_status < 1) { + // 当开启思考长度控制时,检查是否超时 + if (step >= max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + // 将状态推进到 1, 表示 "正在结束思考" + current_limit_think_status = 1; + } } - } - // ======================= 思考结束处理 ======================= - // 阶段 2: 检查是否已满足结束思考的条件 (status < 2) - // 这种情况会处理两种场景: - // 1. status == 0: 模型自己生成了 think_end_id - // 2. status == 1: 上一阶段强制注入了 think_end_id - if (current_limit_content_status < 2) - { - if (next_token == think_end_id) - { - // 确认思考结束,将状态推进到 2 (响应阶段) - current_limit_content_status = 2; + // ======================= 思考结束处理 ======================= + // 阶段 2: 检查是否已满足结束思考的条件 (status < 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型自己生成了 think_end_id + // 2. status == 1: 上一阶段强制注入了 think_end_id + if (current_limit_think_status < 2) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 2 (响应阶段) + current_limit_think_status = 2; + } } - } - // 写回更新后的 token - next_tokens[bid] = next_token; - // 更新全局状态 - limit_think_status[bid] = current_limit_content_status; + // 写回更新后的 token + next_tokens[bid] = next_token; + // 更新全局状态 + limit_think_status[bid] = current_limit_think_status; } void LimitThinkingContentLengthV1(const paddle::Tensor &next_tokens, - const paddle::Tensor &max_think_lens, - const paddle::Tensor &step_idx, - const paddle::Tensor &limit_think_status, - const int64_t think_end_id) -{ - const int batch_size = next_tokens.shape()[0]; - limit_thinking_content_length_kernel_v1<<<1, 1024>>>( - const_cast(next_tokens.data()), - max_think_lens.data(), - step_idx.data(), - const_cast(limit_think_status.data()), - think_end_id, - batch_size); + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id) { + const int batch_size = next_tokens.shape()[0]; + limit_thinking_content_length_kernel_v1<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + step_idx.data(), + const_cast(limit_think_status.data()), + think_end_id, + batch_size); } PD_BUILD_OP(limit_thinking_content_length_v1) - .Inputs({"next_tokens", - "max_think_lens", - "step_idx", - "limit_think_status"}) + .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"}) .Attrs({"think_end_id: int64_t"}) .Outputs({"next_tokens_out"}) .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu index fdac1cff5d0..c28073ba74e 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu @@ -22,108 +22,89 @@ __global__ void limit_thinking_content_length_kernel_v2( int64_t *next_tokens, const int *max_think_lens, - const int64_t *step_idx, // step_idx 不再需要被修改,改为 const + const int64_t *step_idx, // step_idx 不再需要被修改,改为 const int 
*limit_think_status, const int64_t think_end_id, const int64_t line_break_id, - const int bs) -{ - int bid = threadIdx.x; - if (bid >= bs) - return; - // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 - const int max_think_len = max_think_lens[bid]; - if (max_think_len < 0) - return; - int current_limit_content_status = limit_think_status[bid]; - // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. - if (current_limit_content_status == 3) - { - return; - } + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. + if (current_limit_think_status == 3) { + return; + } - int64_t next_token = next_tokens[bid]; - const int64_t step = step_idx[bid]; + int64_t next_token = next_tokens[bid]; + const int64_t step = step_idx[bid]; - // ======================= 思考阶段控制 ======================= - // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 - // 阶段 2: 在替换 (status == 1), 检查是否替换结束 - if (current_limit_content_status <= 1) - { - // 当开启思考长度控制时,检查是否超时 - if (step == max_think_len) - { - // 强制将当前token替换为结束思考的token - next_token = line_break_id; - current_limit_content_status = 1; - } - else if (step == max_think_len + 1) - { - // 强制将当前token替换为结束思考的token - next_token = think_end_id; - current_limit_content_status = 1; - } - else if (step == max_think_len + 2) - { - // 强制将当前token替换为结束思考的token - next_token = line_break_id; - current_limit_content_status = 1; + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + // 阶段 2: 在替换 (status == 1), 检查是否替换结束 + if (current_limit_think_status <= 1) { + // 当开启思考长度控制时,检查是否超时 + if (step == max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + } else if (step == max_think_len + 1) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + current_limit_think_status = 1; + } else if (step == max_think_len + 2) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + } else if (step == max_think_len + 3) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + // 将状态推进到 1, 表示 "正在结束思考" + current_limit_think_status = 2; + } } - else if (step == max_think_len + 3) - { - // 强制将当前token替换为结束思考的token - next_token = line_break_id; - // 将状态推进到 1, 表示 "正在结束思考" - current_limit_content_status = 2; + // ======================= 思考结束处理 ======================= + // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型可能自己生成了 + // 2. status == 2: 上一阶段强制注入了 \n\n\n + if (current_limit_think_status == 0) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; + } } - } - // ======================= 思考结束处理 ======================= - // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) - // 这种情况会处理两种场景: - // 1. status == 0: 模型可能自己生成了 - // 2. 
status == 2: 上一阶段强制注入了 \n\n\n - if (current_limit_content_status == 0) - { - if (next_token == think_end_id) - { - // 确认思考结束,将状态推进到 3 (响应阶段) - current_limit_content_status = 3; + if (current_limit_think_status == 2) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; } - } - if (current_limit_content_status == 2) - { - // 确认思考结束,将状态推进到 3 (响应阶段) - current_limit_content_status = 3; - } - // 写回更新后的 token - next_tokens[bid] = next_token; - // 更新全局状态 - limit_think_status[bid] = current_limit_content_status; + // 写回更新后的 token + next_tokens[bid] = next_token; + // 更新全局状态 + limit_think_status[bid] = current_limit_think_status; } void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, - const paddle::Tensor &max_think_lens, - const paddle::Tensor &step_idx, - const paddle::Tensor &limit_think_status, - const int64_t think_end_id, - const int64_t line_break_id) -{ - const int batch_size = next_tokens.shape()[0]; - limit_thinking_content_length_kernel_v2<<<1, 1024>>>( - const_cast(next_tokens.data()), - max_think_lens.data(), - step_idx.data(), - const_cast(limit_think_status.data()), - think_end_id, - line_break_id, - batch_size); + const paddle::Tensor &max_think_lens, + const paddle::Tensor &step_idx, + const paddle::Tensor &limit_think_status, + const int64_t think_end_id, + const int64_t line_break_id) { + const int batch_size = next_tokens.shape()[0]; + limit_thinking_content_length_kernel_v2<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + step_idx.data(), + const_cast(limit_think_status.data()), + think_end_id, + line_break_id, + batch_size); } PD_BUILD_OP(limit_thinking_content_length_v2) - .Inputs({"next_tokens", - "max_think_lens", - "step_idx", - "limit_think_status"}) + .Inputs({"next_tokens", "max_think_lens", "step_idx", "limit_think_status"}) .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"}) .Outputs({"next_tokens_out"}) .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index f0c4c7011e4..e0625fccd8a 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -42,8 +42,6 @@ elif current_platform.is_dcu(): from fastdeploy.model_executor.ops.gpu import ( get_padding_offset, - limit_thinking_content_length_v1, - limit_thinking_content_length_v2, save_output, set_stop_value_multi_ends, step_paddle, @@ -81,6 +79,8 @@ step_reschedule, update_inputs_v1, speculate_step_reschedule, + limit_thinking_content_length_v1, + limit_thinking_content_length_v2, ) from fastdeploy.inter_communicator import ZmqIpcClient From 61d9b72e213c2c81003bdd5f071da2c3827b9fdf Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 16:04:51 +0800 Subject: [PATCH 11/24] fix --- custom_ops/gpu_ops/cpp_extensions.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 898d3e2ccdc..bdd3c231e9d 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -1305,7 +1305,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("save_output", &SaveOutMmsgStatic, "save_output function"); - m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "save_output function"); + m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "limit_thinking_content_length_v1 function"); - m.def("limit_thinking_content_length_v2", 
&LimitThinkingContentLengthV2, "save_output function"); + m.def("limit_thinking_content_length_v2", &LimitThinkingContentLengthV2, "limit_thinking_content_length_v2 function"); } From 06b5441262b5698682cef42b14a46a87065be3fa Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 15 Oct 2025 18:09:05 +0800 Subject: [PATCH 12/24] fix --- .../input/ernie4_5_vl_processor/ernie4_5_vl_processor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index 9251dd9d955..61e02828db9 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -255,7 +255,11 @@ def process_request_dict(self, request, max_model_len=None): else: request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"]) if request.get("reasoning_max_tokens") is None: - request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) + if request.get("enable_thinking"): + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) + else: + if not request.get("enable_thinking"): + request["reasoning_max_tokens"] = None data_processor_logger.info(f"Processed request {request}") return request From 36ed90d521729d1394233e7defbd676d2688edfa Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 16:47:08 +0800 Subject: [PATCH 13/24] update --- fastdeploy/worker/gpu_model_runner.py | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 63c5e2423ee..cc16cef2014 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1744,6 +1744,16 @@ class at the server level, which is too granular for ModelRunner. group=self.parallel_config.tp_group, ) + if self.model_config.think_end_id > 0: + limit_thinking_content_length( + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, + sampled_token_ids=sampler_output.sampled_token_ids, + max_think_lens=self.share_inputs["max_think_lens"], + step_idx=self.share_inputs["step_idx"], + limit_think_status=self.share_inputs["limit_think_status"], + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, + ) else: self.sampler( logits, @@ -1774,20 +1784,6 @@ class at the server level, which is too granular for ModelRunner. group=self.parallel_config.tp_group, ) - if self.model_config.think_end_id > 0 and not self.speculative_decoding: - assert ( - sampler_output is not None - ), "Warning, limit thinking content length not support speculative decoding." - limit_thinking_content_length( - limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, - sampled_token_ids=sampler_output.sampled_token_ids, - max_think_lens=self.share_inputs["max_think_lens"], - step_idx=self.share_inputs["step_idx"], - limit_think_status=self.share_inputs["limit_think_status"], - think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, - ) - # 5. 
Post Process model_output_data = ModelOutputData( next_tokens=self.share_inputs["next_tokens"], From 2f8aa118f924682caa3f8e4d1ec6c5cd62c520ab Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 17:41:05 +0800 Subject: [PATCH 14/24] fix --- custom_ops/gpu_ops/set_value_by_flags_and_idx.cu | 2 +- .../speculate_decoding/speculate_set_value_by_flags_and_idx.cu | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu index 39181683038..9e7a0ce1134 100644 --- a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu @@ -35,7 +35,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped - if (step_idx[tid] > 0) { + if (step_idx[tid] >= 0) { if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1]; } else { // decoedr, get first token diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu index 316604c7363..d1ee733fecf 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu @@ -29,6 +29,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, int length, int max_draft_tokens) { int tid = threadIdx.x; + if (tid < bs) { if (!stop_flags[tid]) { int64_t *pre_ids_all_now = pre_ids_all + tid * length; @@ -37,7 +38,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped - if (step_idx[tid] > 0) { + if (step_idx[tid] >= 0) { for (int i = 0; i < accept_num[tid]; i++) { pre_ids_all_now[step_idx[tid] - i] = accept_tokens_now[accept_num[tid] - 1 - i]; From 324d17ed995deaca60259d47a644a4666f4a1d46 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 21:08:44 +0800 Subject: [PATCH 15/24] fix --- custom_ops/gpu_ops/set_value_by_flags_and_idx.cu | 2 +- .../speculate_decoding/speculate_set_value_by_flags_and_idx.cu | 2 +- fastdeploy/worker/gpu_model_runner.py | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu index 9e7a0ce1134..39181683038 100644 --- a/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/set_value_by_flags_and_idx.cu @@ -35,7 +35,7 @@ __global__ void set_value_by_flag_and_id(const bool *stop_flags, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stopped - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { if (seq_len_enc > 0) { // encoder, get last token accord to seq_lens_encoder pre_ids_all_now[step_idx[tid]] = input_ids_now[seq_len_enc - 1]; } else { // decoedr, get first token diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu index d1ee733fecf..f28e8369394 100644 --- 
a/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_set_value_by_flags_and_idx.cu @@ -38,7 +38,7 @@ __global__ void speculate_set_value_by_flag_and_id(int64_t *pre_ids_all, const int seq_len_dec = seq_lens_decoder[tid]; const int seq_len_enc = seq_lens_encoder[tid]; if (seq_len_dec == 0 && seq_len_enc == 0) return; // stoped - if (step_idx[tid] >= 0) { + if (step_idx[tid] > 0) { for (int i = 0; i < accept_num[tid]; i++) { pre_ids_all_now[step_idx[tid] - i] = accept_tokens_now[accept_num[tid] - 1 - i]; diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index cc16cef2014..3ba7f4d1544 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -282,6 +282,7 @@ def _init_logits_processor(self, request): return ( self.guided_backend.get_logits_processor( schemata_key=schemata_key, + enable_thinking=True, ), schemata_key, ) From 0710f34a2b0591bff427f800a3dfaf634662afcd Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Thu, 16 Oct 2025 21:14:09 +0800 Subject: [PATCH 16/24] fix typo --- custom_ops/gpu_ops/limit_thinking_content_length_v2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu index c28073ba74e..a61dec8960d 100644 --- a/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/limit_thinking_content_length_v2.cu @@ -22,7 +22,7 @@ __global__ void limit_thinking_content_length_kernel_v2( int64_t *next_tokens, const int *max_think_lens, - const int64_t *step_idx, // step_idx 不再需要被修改,改为 const + const int64_t *step_idx, int *limit_think_status, const int64_t think_end_id, const int64_t line_break_id, From 141608fc4a353f9c2bdf996d6681c8d1f4b32e15 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 17 Oct 2025 11:32:19 +0800 Subject: [PATCH 17/24] fix ci --- fastdeploy/engine/common_engine.py | 200 ++++++++---------- fastdeploy/entrypoints/engine_client.py | 4 +- .../model_executor/models/interfaces_base.py | 54 +++++ fastdeploy/worker/gpu_model_runner.py | 2 - .../test_set_value_by_flags_and_idx.py | 2 +- 5 files changed, 148 insertions(+), 114 deletions(-) create mode 100644 fastdeploy/model_executor/models/interfaces_base.py diff --git a/fastdeploy/engine/common_engine.py b/fastdeploy/engine/common_engine.py index 0c3545fc1f6..c3a936b0e3f 100644 --- a/fastdeploy/engine/common_engine.py +++ b/fastdeploy/engine/common_engine.py @@ -60,78 +60,78 @@ class EngineService: Base class containing common engine functionality """ - def __init__(self, fd_config, start_queue=True): + def __init__(self, cfg, start_queue=True): """ Initializes the LLMEngine with the provided configuration. Args: - fd_config (FDConfig): Config object containing all the configuration parameters. + cfg (Config): Config object containing all the configuration parameters. 
""" - self.fd_config = fd_config - if fd_config.scheduler_config.splitwise_role != "mixed" or fd_config.cache_config.enable_prefix_caching: - if isinstance(self.fd_config.cache_config.cache_queue_port, str): - self.fd_config.cache_config.cache_queue_port = self.fd_config.cache_config.cache_queue_port.split(",") - if isinstance(self.fd_config.cache_config.cache_queue_port, list): - self.fd_config.cache_config.cache_queue_port = int( - self.fd_config.cache_config.cache_queue_port[self.fd_config.parallel_config.local_data_parallel_id] + self.cfg = cfg + if cfg.scheduler_config.splitwise_role != "mixed" or cfg.cache_config.enable_prefix_caching: + if isinstance(self.cfg.cache_config.cache_queue_port, str): + self.cfg.cache_config.cache_queue_port = self.cfg.cache_config.cache_queue_port.split(",") + if isinstance(self.cfg.cache_config.cache_queue_port, list): + self.cfg.cache_config.cache_queue_port = int( + self.cfg.cache_config.cache_queue_port[self.cfg.parallel_config.local_data_parallel_id] ) - if self.fd_config.parallel_config.enable_expert_parallel: + if self.cfg.parallel_config.enable_expert_parallel: self.llm_logger = get_logger( - "fastdeploy", f"fastdeploy_rank{self.fd_config.parallel_config.local_data_parallel_id}.log" + "fastdeploy", f"fastdeploy_rank{self.cfg.parallel_config.local_data_parallel_id}.log" ) else: self.llm_logger = llm_logger - self.scheduler = fd_config.scheduler_config.scheduler() + self.scheduler = cfg.scheduler_config.scheduler() self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK == "1" if envs.ENABLE_V1_KVCACHE_SCHEDULER: self.resource_manager = ResourceManagerV1( - fd_config.scheduler_config.max_num_seqs, - fd_config, - fd_config.parallel_config.tensor_parallel_size, - fd_config.scheduler_config.splitwise_role, - fd_config.parallel_config.local_data_parallel_id, + cfg.scheduler_config.max_num_seqs, + cfg, + cfg.parallel_config.tensor_parallel_size, + cfg.scheduler_config.splitwise_role, + cfg.parallel_config.local_data_parallel_id, ) else: self.resource_manager = ResourceManager( - fd_config.scheduler_config.max_num_seqs, - fd_config, - fd_config.parallel_config.tensor_parallel_size, - fd_config.scheduler_config.splitwise_role, - fd_config.parallel_config.local_data_parallel_id, + cfg.scheduler_config.max_num_seqs, + cfg, + cfg.parallel_config.tensor_parallel_size, + cfg.scheduler_config.splitwise_role, + cfg.parallel_config.local_data_parallel_id, ) self.start_worker_queue_service(start_queue) - os.environ["INFERENCE_MSG_QUEUE_ID"] = self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id + os.environ["INFERENCE_MSG_QUEUE_ID"] = self.cfg.parallel_config.engine_worker_queue_port[ + self.cfg.parallel_config.local_data_parallel_id ] - self.split_connector = SplitwiseConnector(fd_config, self.engine_worker_queue, self.resource_manager) + self.split_connector = SplitwiseConnector(cfg, self.engine_worker_queue, self.resource_manager) self.waiting_requests = [] self.token_processor = TokenProcessor( - cfg=fd_config, + cfg=cfg, cached_generated_tokens=self.scheduler, engine_worker_queue=self.engine_worker_queue, split_connector=self.split_connector, ) self.token_processor.set_resource_manager(self.resource_manager) - self.partial_chunked_tokens = [0] * (self.fd_config.max_num_partial_prefills + 1) - for idx in range(1, self.fd_config.max_num_partial_prefills + 1): + self.partial_chunked_tokens = [0] * (self.cfg.max_num_partial_prefills + 1) + for idx in range(1, self.cfg.max_num_partial_prefills + 1): 
self.partial_chunked_tokens[idx] = ( - (self.fd_config.scheduler_config.max_num_batched_tokens // idx) - // self.fd_config.cache_config.block_size - * self.fd_config.cache_config.block_size + (self.cfg.scheduler_config.max_num_batched_tokens // idx) + // self.cfg.cache_config.block_size + * self.cfg.cache_config.block_size ) self.guided_decoding_checker = None - if self.fd_config.guided_decoding_backend != "off": + if self.cfg.structured_outputs_config.guided_decoding_backend != "off": self.guided_decoding_checker = schema_checker( - self.fd_config.guided_decoding_backend, - disable_any_whitespace=self.fd_config.disable_any_whitespace, + self.cfg.structured_outputs_config.guided_decoding_backend, + disable_any_whitespace=self.cfg.structured_outputs_config.disable_any_whitespace, ) self._init_worker_monitor_signals() @@ -146,14 +146,12 @@ def start(self): self.insert_task_to_worker_thread.start() self.token_processor.tasks_queue = self.engine_worker_queue self.token_processor.run() - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": self.split_mode_get_tasks() def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进程感知是否有新Task需要处理 current_suffix = int( - self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id - ] + self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] ) self.llm_logger.info(f"current_suffix: {current_suffix}") exist_task_signal_data = np.zeros([1], dtype=np.int32) @@ -187,8 +185,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 # worker_live_signal 用于engine感知各worker进程是否存活,记录每个step 时间 worker_healthy_live_recorded_time_array = np.zeros( - shape=[min(self.fd_config.worker_num_per_node, self.fd_config.parallel_config.tensor_parallel_size)], - dtype=np.int32, + shape=[min(self.cfg.worker_num_per_node, self.cfg.parallel_config.tensor_parallel_size)], dtype=np.int32 ) self.worker_healthy_live_signal = IPCSignal( name="worker_healthy_live_signal", @@ -198,7 +195,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - cache_ready_signal_data = np.zeros(shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32) + cache_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) self.cache_ready_signal = IPCSignal( name="cache_ready_signal", array=cache_ready_signal_data, @@ -207,9 +204,7 @@ def _init_worker_monitor_signals(self): # exist_task_signal 用于各worker进 create=True, ) - swap_space_ready_signal_data = np.zeros( - shape=[self.fd_config.parallel_config.tensor_parallel_size], dtype=np.int32 - ) + swap_space_ready_signal_data = np.zeros(shape=[self.cfg.parallel_config.tensor_parallel_size], dtype=np.int32) self.swap_space_ready_signal = IPCSignal( name="swap_space_ready_signal", array=swap_space_ready_signal_data, @@ -250,53 +245,43 @@ def start_worker_queue_service(self, start_queue): start queue service for engine worker communication """ address = ( - self.fd_config.master_ip, - int( - self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id - ] - ), + self.cfg.master_ip, + int(self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id]), ) - if start_queue and ( - self.fd_config.host_ip == self.fd_config.master_ip or self.fd_config.master_ip == "0.0.0.0" - ): + if start_queue and 
(self.cfg.host_ip == self.cfg.master_ip or self.cfg.master_ip == "0.0.0.0"): self.llm_logger.info(f"Starting engine worker queue server service at {address}") self.engine_worker_queue_server = EngineWorkerQueue( address=address, is_server=True, - num_client=self.fd_config.parallel_config.tensor_parallel_size, - local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, + num_client=self.cfg.parallel_config.tensor_parallel_size, + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, ) - if ( - self.fd_config.cache_config.enable_prefix_caching - or self.fd_config.scheduler_config.splitwise_role != "mixed" - ): + if self.cfg.cache_config.enable_prefix_caching or self.cfg.scheduler_config.splitwise_role != "mixed": self.cache_task_queue = EngineCacheQueue( address=( - self.fd_config.master_ip, - self.fd_config.cache_config.cache_queue_port, + self.cfg.master_ip, + self.cfg.cache_config.cache_queue_port, ), authkey=b"cache_queue_service", is_server=True, - num_client=self.fd_config.parallel_config.tensor_parallel_size, + num_client=self.cfg.parallel_config.tensor_parallel_size, client_id=-1, - local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, ) self.llm_logger.info( - f"local {min(self.fd_config.worker_num_per_node * self.fd_config.node_rank + self.fd_config.parallel_config.local_data_parallel_id,self.fd_config.parallel_config.data_parallel_size - 1)}" + f"local {min(self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id,self.cfg.parallel_config.data_parallel_size - 1)}" ) self.engine_worker_queue = EngineWorkerQueue( address=address, is_server=False, - num_client=self.fd_config.parallel_config.tensor_parallel_size, + num_client=self.cfg.parallel_config.tensor_parallel_size, client_id=0, - local_data_parallel_size=self.fd_config.parallel_config.data_parallel_size, + local_data_parallel_size=self.cfg.parallel_config.data_parallel_size, local_data_parallel_id=min( - self.fd_config.worker_num_per_node * self.fd_config.node_rank - + self.fd_config.parallel_config.local_data_parallel_id, - self.fd_config.parallel_config.data_parallel_size - 1, + self.cfg.worker_num_per_node * self.cfg.node_rank + self.cfg.parallel_config.local_data_parallel_id, + self.cfg.parallel_config.data_parallel_size - 1, ), ) @@ -326,8 +311,8 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): cur_task.prompt_token_ids[0] = task.outputs.token_ids[0] cur_task.num_cached_tokens = task.num_cached_tokens if ( - self.fd_config.speculative_config.method in ["mtp"] - and self.fd_config.scheduler_config.splitwise_role == "decode" + self.cfg.speculative_config.method in ["mtp"] + and self.cfg.scheduler_config.splitwise_role == "decode" ): cur_task.draft_token_ids = copy.deepcopy(task.outputs.draft_token_ids) if task.error_code != 200: @@ -354,7 +339,7 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): need_delete_tasks = [] for task in tasks: - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": status, msg = self.split_connector.check_decode_allocated(task) if not status: self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.") @@ -410,12 +395,12 @@ def insert_tasks(self, tasks, current_id=-1, allocated=False): for task in tasks: task.inference_start_time = time.time() if not is_prefill: - if not self.fd_config.model_config.enable_mm: + if not 
self.cfg.model_config.enable_mm: self.update_requests_chunk_size(tasks) else: self.update_mm_requests_chunk_size(tasks) self.engine_worker_queue.put_tasks((tasks, self.resource_manager.real_bsz)) - if is_prefill and self.fd_config.scheduler_config.name != "splitwise": + if is_prefill and self.cfg.scheduler_config.name != "splitwise": self.engine_worker_queue.available_prefill_instances.put(1) return True @@ -448,14 +433,14 @@ def update_tokens(idx, chunk_size, update_chunk=False): if current_request_size[idx] <= 0: chunk_request_num -= 1 - if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: return current_request_size = [request.prompt_token_ids_len for request in requests] requests_chunk = [[] for _ in range(len(requests))] chunk_request_num = len(current_request_size) while chunk_request_num >= 1: - remain_batched_tokens = self.fd_config.scheduler_config.max_num_batched_tokens + remain_batched_tokens = self.cfg.scheduler_config.max_num_batched_tokens for idx in range(len(current_request_size)): if current_request_size[idx] <= 0: continue @@ -465,16 +450,14 @@ def update_tokens(idx, chunk_size, update_chunk=False): ) update_tokens(idx, chunk_size) - while remain_batched_tokens >= self.fd_config.cache_config.block_size: + while remain_batched_tokens >= self.cfg.cache_config.block_size: # 当前 max_num_batched_tokens 还有剩余时,优先分配给较短的请求 waiting_requests = [input_lens for input_lens in current_request_size if input_lens > 0] if len(waiting_requests) == 0: break available_tokens = ( - remain_batched_tokens - // self.fd_config.cache_config.block_size - * self.fd_config.cache_config.block_size + remain_batched_tokens // self.cfg.cache_config.block_size * self.cfg.cache_config.block_size ) append_idx = current_request_size.index(min(waiting_requests)) chunk_size = min( @@ -491,7 +474,7 @@ def update_mm_requests_chunk_size(self, requests): """ update each multimodal request's chunk size info """ - if not self.fd_config.cache_config.enable_chunked_prefill or len(requests) == 0: + if not self.cfg.cache_config.enable_chunked_prefill or len(requests) == 0: return for request in requests: @@ -578,7 +561,7 @@ def _insert_task_to_worker(self): continue if hasattr(self, "exist_prefill_task_signal") and self.exist_prefill_task_signal.value[0] > 0: if ( - self.fd_config.scheduler_config.splitwise_role == "mixed" + self.cfg.scheduler_config.splitwise_role == "mixed" or self.split_connector.has_splitwise_tasks() ): time.sleep(0.005) @@ -592,15 +575,15 @@ def _insert_task_to_worker(self): num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.fd_config.max_prefill_batch, + self.cfg.max_prefill_batch, ) self.resource_manager.check_and_free_block_tables() tasks = self.scheduler.get_requests( available_blocks=self.resource_manager.available_block_num(), - block_size=self.fd_config.cache_config.block_size, - reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.fd_config.scheduler_config.max_num_batched_tokens, + block_size=self.cfg.cache_config.block_size, + reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.cfg.scheduler_config.max_num_batched_tokens, batch=num_prefill_batch, ) @@ -608,7 +591,7 @@ def _insert_task_to_worker(self): time.sleep(0.001) continue - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": 
self.llm_logger.info("Inserting splitwise tasks") self.split_connector.send_splitwise_tasks(tasks, current_id) @@ -637,21 +620,21 @@ def _fetch_request(): is_fetching = True num_prefill_batch = min( int(self.resource_manager.available_batch()), - self.fd_config.max_prefill_batch, + self.cfg.max_prefill_batch, ) - if self.fd_config.model_config.enable_mm: + if self.cfg.model_config.enable_mm: available_blocks = self.resource_manager.available_block_num() else: - available_blocks = self.fd_config.cache_config.max_block_num_per_seq + available_blocks = self.cfg.cache_config.max_block_num_per_seq tasks = self.scheduler.get_requests( available_blocks=available_blocks, - block_size=self.fd_config.cache_config.block_size, - reserved_output_blocks=self.fd_config.cache_config.enc_dec_block_num, - max_num_batched_tokens=self.fd_config.model_config.max_model_len, + block_size=self.cfg.cache_config.block_size, + reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num, + max_num_batched_tokens=self.cfg.model_config.max_model_len, batch=num_prefill_batch, ) - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": for task in tasks: # assure can allocate block ids in P while not self.resource_manager.preallocate_resource_in_p(task): @@ -660,7 +643,7 @@ def _fetch_request(): self.split_connector.send_splitwise_tasks([task], task.idx) need_delete_tasks = [] for task in tasks: - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": # assure fetch block ids from D status, msg = self.split_connector.check_decode_allocated(task) if not status: @@ -681,7 +664,7 @@ def _fetch_request(): tasks.remove(tmp_task) # release resource in P self.resource_manager.prerelease_resource(tmp_task) - if self.fd_config.scheduler_config.splitwise_role == "prefill": + if self.cfg.scheduler_config.splitwise_role == "prefill": # to send cache info to cache messager if tasks: self.split_connector.send_cache_infos(tasks, 0) @@ -698,7 +681,7 @@ def _fetch_request(): time.sleep(0.001) # Fetch requests and add them to the scheduling queue if tasks: - if self.fd_config.scheduler_config.splitwise_role == "prefill": + if self.cfg.scheduler_config.splitwise_role == "prefill": self.resource_manager.add_request_in_p(tasks) else: for task in tasks: @@ -713,9 +696,10 @@ def _fetch_request(): if self.engine_worker_queue.num_tasks() > 0: time.sleep(0.001) continue - if self.fd_config.scheduler_config.splitwise_role != "mixed": + if self.cfg.scheduler_config.splitwise_role != "mixed": if not is_fetching: get_request_pool.submit(_fetch_request) + else: if ( len(self.resource_manager.waiting) == 0 @@ -727,7 +711,7 @@ def _fetch_request(): tasks = self.resource_manager.schedule() # 3. Send to engine if tasks: - if self.fd_config.scheduler_config.splitwise_role == "decode": + if self.cfg.scheduler_config.splitwise_role == "decode": for task in tasks: if task.task_type == RequestType.PREEMPTED: msg = f"{task.request_id} decode not enough blocks, need to be rescheduled." 
@@ -759,7 +743,7 @@ def start_zmq_service(self, api_server_pid=None): self.recv_request_server = ZmqTcpServer(port=envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT, mode=zmq.PULL) self.send_response_server = ZmqTcpServer(port=envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT, mode=zmq.ROUTER) self.internal_adapter = InternalAdapter( - cfg=self.fd_config, engine=self, dp_rank=self.fd_config.node_rank * self.fd_config.worker_num_per_node + cfg=self.cfg, engine=self, dp_rank=self.cfg.node_rank * self.cfg.worker_num_per_node ) else: self.recv_request_server = ZmqIpcServer(name=api_server_pid, mode=zmq.PULL) @@ -778,12 +762,12 @@ def start_zmq_service(self, api_server_pid=None): def _insert_zmq_task_to_scheduler(self): added_requests: Dict[str, int] = dict() if envs.FD_ENABLE_INTERNAL_ADAPTER: - if self.fd_config.scheduler_config.splitwise_role == "decode": + if self.cfg.scheduler_config.splitwise_role == "decode": return while self.running: try: block = True if len(added_requests) == 0 else False - if not self.fd_config.model_config.enable_mm: + if not self.cfg.model_config.enable_mm: err, data = self.recv_request_server.receive_json_once(block) else: err, data = self.recv_request_server.receive_pyobj_once(block) @@ -945,7 +929,7 @@ def receiver_loop(): else: self.insert_tasks(tasks, allocated=True) - if self.fd_config.innode_prefill_ports is not None: + if self.cfg.innode_prefill_ports is not None: self.scheduler.put_results(tasks) else: if len(self.waiting_requests): @@ -990,14 +974,12 @@ def receiver_loop(): def start_cache_service(self, device_ids, ipc_signal_suffix, create_cache_tensor): return self.resource_manager.cache_manager.launch_cache_manager( - cache_config=self.fd_config.cache_config, - tensor_parallel_size=self.fd_config.parallel_config.tensor_parallel_size, + cache_config=self.cfg.cache_config, + tensor_parallel_size=self.cfg.parallel_config.tensor_parallel_size, device_ids=device_ids, - pod_ip=self.fd_config.master_ip, + pod_ip=self.cfg.master_ip, engine_worker_queue_port=int( - self.fd_config.parallel_config.engine_worker_queue_port[ - self.fd_config.parallel_config.local_data_parallel_id - ] + self.cfg.parallel_config.engine_worker_queue_port[self.cfg.parallel_config.local_data_parallel_id] ), pid_suffix=ipc_signal_suffix, create_cache_tensor=create_cache_tensor, diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py index d92a2568cc2..51d49d15fb9 100644 --- a/fastdeploy/entrypoints/engine_client.py +++ b/fastdeploy/entrypoints/engine_client.py @@ -256,8 +256,8 @@ def valid_parameters(self, data): raise ParameterError("max_tokens", f"max_tokens can be defined [1, {self.max_model_len}).") if data.get("reasoning_max_tokens") is not None: - if data["reasoning_max_tokens"] < 0: - raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 0") + if data["reasoning_max_tokens"] < 1: + raise ParameterError("reasoning_max_tokens", "reasoning_max_tokens must be greater than 1") if data["reasoning_max_tokens"] > data["max_tokens"]: data["reasoning_max_tokens"] = data["max_tokens"] api_server_logger.warning( diff --git a/fastdeploy/model_executor/models/interfaces_base.py b/fastdeploy/model_executor/models/interfaces_base.py new file mode 100644 index 00000000000..b7ece5fe69a --- /dev/null +++ b/fastdeploy/model_executor/models/interfaces_base.py @@ -0,0 +1,54 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Type + +from paddle import nn + + +def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool: + from .model_base import ModelForCasualLM + + return issubclass(model_cls, ModelForCasualLM) + + +def is_pooling_model(model_cls: Type[nn.Layer]) -> bool: + class_name = model_cls.__name__ + pooling_indicators = ["Embedding", "ForSequenceClassification"] + return ( + any(indicator in class_name for indicator in pooling_indicators) + or hasattr(model_cls, "is_embedding_model") + and model_cls.is_embedding_model + ) + + +def is_multimodal_model(class_name: str) -> bool: + multimodal_indicators = ["VL", "Vision", "ConditionalGeneration"] + return any(indicator in class_name for indicator in multimodal_indicators) + + +def determine_model_category(class_name: str): + from fastdeploy.model_executor.models.model_base import ModelCategory + + if any(pattern in class_name for pattern in ["VL", "Vision", "ConditionalGeneration"]): + return ModelCategory.MULTIMODAL + elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]): + return ModelCategory.EMBEDDING + return ModelCategory.TEXT_GENERATION + + +def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: + if model_cls is not None: + return getattr(model_cls, "default_pooling_type", "LAST") + return "LAST" diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 241a42c7ab6..6d51997ba21 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -345,7 +345,6 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = ) if request.get("reasoning_max_tokens") is not None: - assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." # Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 @@ -578,7 +577,6 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 if request.get("reasoning_max_tokens") is not None: - assert request.get("reasoning_max_tokens") >= 0, "reasoning_max_tokens in requests need >= 0." 
# Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 diff --git a/tests/operators/test_set_value_by_flags_and_idx.py b/tests/operators/test_set_value_by_flags_and_idx.py index 6861ca218d0..aada9e2606f 100644 --- a/tests/operators/test_set_value_by_flags_and_idx.py +++ b/tests/operators/test_set_value_by_flags_and_idx.py @@ -34,7 +34,7 @@ def set_value_by_flags_and_idx_numpy( current_step_idx = step_idx[i] if seq_len_enc == 0 and seq_len_dec == 0: continue - if current_step_idx >= 0: + if current_step_idx > 0: if seq_len_enc > 0: token_idx = seq_len_enc - 1 token_to_assign = input_ids[i, token_idx] From 9bb46294552f9064afbcde3c784d57ec0e6525a4 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 17 Oct 2025 15:42:23 +0800 Subject: [PATCH 18/24] fix --- fastdeploy/worker/gpu_model_runner.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 832f57c014f..894aed8794a 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1458,10 +1458,6 @@ def _dummy_sampler_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) From 41ef32c9d8abd19a0a4ce86a7b168bace80c2e2c Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 17 Oct 2025 15:46:40 +0800 Subject: [PATCH 19/24] fix --- .../model_executor/models/interfaces_base.py | 21 ------------------- .../model_executor/models/model_base.py | 7 +------ 2 files changed, 1 insertion(+), 27 deletions(-) diff --git a/fastdeploy/model_executor/models/interfaces_base.py b/fastdeploy/model_executor/models/interfaces_base.py index 77533209d9b..bc222634427 100644 --- a/fastdeploy/model_executor/models/interfaces_base.py +++ b/fastdeploy/model_executor/models/interfaces_base.py @@ -26,31 +26,10 @@ T_co = TypeVar("T_co", default=paddle.Tensor, covariant=True) -def is_text_generation_model(model_cls: Type[nn.Layer]) -> bool: - from .model_base import ModelForCasualLM - - return issubclass(model_cls, ModelForCasualLM) - - def is_pooling_model(model_cls: Type[nn.Layer]) -> bool: return getattr(model_cls, "is_pooling_model", False) -def is_multimodal_model(class_name: str) -> bool: - multimodal_indicators = ["VL", "Vision", "ConditionalGeneration"] - return any(indicator in class_name for indicator in multimodal_indicators) - - -def determine_model_category(class_name: str): - from fastdeploy.model_executor.models.model_base import ModelCategory - - if any(pattern in class_name for pattern in ["VL", "Vision", "ConditionalGeneration"]): - return ModelCategory.MULTIMODAL - elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]): - return ModelCategory.EMBEDDING - return ModelCategory.TEXT_GENERATION - - def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: if model_cls is not None: return getattr(model_cls, 
"default_pooling_type", "LAST") diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index 1ddd39c9a32..0ca56f343c1 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -26,12 +26,7 @@ iter_architecture_defaults, try_match_architecture_defaults, ) - - -def get_default_pooling_type(model_cls: Type[nn.Layer] = None) -> str: - if model_cls is not None: - return getattr(model_cls, "default_pooling_type", "LAST") - return "LAST" +from fastdeploy.model_executor.models.interfaces_base import get_default_pooling_type class ModelCategory(IntFlag): From 849eaa6395bddf020ec31841739f78f9937a0cf4 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 12:01:56 +0800 Subject: [PATCH 20/24] support mtp --- custom_ops/gpu_ops/cpp_extensions.cc | 23 +++ ...culate_limit_thinking_content_length_v1.cu | 132 +++++++++++++++ ...culate_limit_thinking_content_length_v2.cu | 159 ++++++++++++++++++ .../model_executor/pre_and_post_process.py | 153 +++++++++++++---- fastdeploy/worker/gpu_model_runner.py | 16 +- 5 files changed, 435 insertions(+), 48 deletions(-) create mode 100644 custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu create mode 100644 custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu diff --git a/custom_ops/gpu_ops/cpp_extensions.cc b/custom_ops/gpu_ops/cpp_extensions.cc index 40417b1e09a..e3c8f33ece5 100644 --- a/custom_ops/gpu_ops/cpp_extensions.cc +++ b/custom_ops/gpu_ops/cpp_extensions.cc @@ -916,6 +916,25 @@ void LimitThinkingContentLengthV2(const paddle::Tensor &next_tokens, const int64_t think_end_id, const int64_t line_break_id); +void SpeculateLimitThinkingContentLengthV1( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id); + +void SpeculateLimitThinkingContentLengthV2( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id, + const int64_t line_break_id); + PYBIND11_MODULE(fastdeploy_ops, m) { m.def("get_expert_token_num", &GetExpertTokenNum, py::arg("topk_ids"), @@ -1301,4 +1320,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) { m.def("limit_thinking_content_length_v1", &LimitThinkingContentLengthV1, "limit_thinking_content_length_v1 function"); m.def("limit_thinking_content_length_v2", &LimitThinkingContentLengthV2, "limit_thinking_content_length_v2 function"); + + m.def("speculate_limit_thinking_content_length_v1", &SpeculateLimitThinkingContentLengthV1, "speculate limit thinking content length function"); + + m.def("speculate_limit_thinking_content_length_v2", &SpeculateLimitThinkingContentLengthV2, "speculate limit thinking content length function"); } diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu new file mode 100644 index 00000000000..90e3760af4b --- /dev/null +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu @@ -0,0 +1,132 @@ +/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "helper.h" +#include "paddle/extension.h" + +__global__ void speculate_limit_thinking_content_length_kernel_v1( + int64_t* next_tokens, + const int* max_think_lens, + int64_t* step_idx, + int* limit_think_status, + int* accept_num, + int* seq_lens_decoder, + const int64_t think_end_id, + const int tokens_per_step, + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; + + const int original_accept_num = accept_num[bid]; + if (original_accept_num <= 0) return; + + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. + if (current_limit_think_status == 3) { + return; + } + + int new_accept_num = original_accept_num; + + const int64_t current_base_step = step_idx[bid] - original_accept_num + 1; + + for (int token_offset = 0; token_offset < original_accept_num; + token_offset++) { + const int token_idx = bid * tokens_per_step + token_offset; + int64_t next_token = next_tokens[token_idx]; + const int64_t current_step = current_base_step + token_offset; + + bool condition_triggered = false; + + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + // 阶段 2: 在替换 (status == 1), 检查是否替换结束 + if (current_limit_think_status < 1) { + // 当开启思考长度控制时,检查是否超时 + if (current_step >= max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } + } + + // ======================= 思考结束处理 ======================= + // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型可能自己生成了 + // 2. 
status == 2: 上一阶段强制注入了 + if (current_limit_think_status < 2) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 2 (响应阶段) + current_limit_think_status = 2; + } + } + + next_tokens[token_idx] = next_token; + + if (condition_triggered) { + new_accept_num = token_offset + 1; + break; + } + } + + // 更新全局状态 + int discarded_tokens = original_accept_num - new_accept_num; + if (discarded_tokens > 0) { + step_idx[bid] -= discarded_tokens; + seq_lens_decoder[bid] -= discarded_tokens; + } + + accept_num[bid] = new_accept_num; + limit_think_status[bid] = current_limit_think_status; +} + +void SpeculateLimitThinkingContentLengthV1( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id) { + const int batch_size = next_tokens.shape()[0]; + const int tokens_per_step = next_tokens.shape()[1]; + + speculate_limit_thinking_content_length_kernel_v1<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + const_cast(step_idx.data()), + const_cast(limit_think_status.data()), + const_cast(accept_num.data()), + const_cast(seq_lens_decoder.data()), + think_end_id, + tokens_per_step, + batch_size); +} + +PD_BUILD_STATIC_OP(speculate_limit_thinking_content_length_v1) + .Inputs({"next_tokens", + "max_think_lens", + "step_idx", + "limit_think_status", + "accept_num", + "seq_lens_decoder"}) + .Attrs({"think_end_id: int64_t"}) + .Outputs({"next_tokens_out"}) + .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) + .SetKernelFn(PD_KERNEL(SpeculateLimitThinkingContentLengthV1)); diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu new file mode 100644 index 00000000000..9277dd4c137 --- /dev/null +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu @@ -0,0 +1,159 @@ +/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "helper.h" +#include "paddle/extension.h" + +// status == 0: 正常生成阶段 +// status == 1: 替换阶段 +// status == 2: 替换结束阶段 +// status == 3: 思考结束阶段 +__global__ void speculate_limit_thinking_content_length_kernel_v2( + int64_t* next_tokens, + const int* max_think_lens, + int64_t* step_idx, + int* limit_think_status, + int* accept_num, + int* seq_lens_decoder, + const int64_t think_end_id, + const int64_t line_break_id, + const int tokens_per_step, + const int bs) { + int bid = threadIdx.x; + if (bid >= bs) return; + + const int original_accept_num = accept_num[bid]; + if (original_accept_num <= 0) return; + + // 如果该序列未启用思考功能,则直接返回,默认值为 -1,表示不限制思考长度 + const int max_think_len = max_think_lens[bid]; + if (max_think_len < 0) return; + int current_limit_think_status = limit_think_status[bid]; + // 如果在回复阶段, 且已经触发停止标志, 则直接返回, 无需多余执行. 
+ if (current_limit_think_status == 3) { + return; + } + + int new_accept_num = original_accept_num; + + const int64_t current_base_step = step_idx[bid] - original_accept_num + 1; + + for (int token_offset = 0; token_offset < original_accept_num; + token_offset++) { + const int token_idx = bid * tokens_per_step + token_offset; + int64_t next_token = next_tokens[token_idx]; + const int64_t current_step = current_base_step + token_offset; + + bool condition_triggered = false; + + // ======================= 思考阶段控制 ======================= + // 阶段 1: 仍在思考 (status == 0), 检查是否需要强制结束 + // 阶段 2: 在替换 (status == 1), 检查是否替换结束 + if (current_limit_think_status <= 1) { + // 当开启思考长度控制时,检查是否超时 + if (current_step == max_think_len) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } else if (current_step == max_think_len + 1) { + // 强制将当前token替换为结束思考的token + next_token = think_end_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } else if (current_step == max_think_len + 2) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + current_limit_think_status = 1; + condition_triggered = true; // 因为修改了token,需要截断 + } else if (current_step == max_think_len + 3) { + // 强制将当前token替换为结束思考的token + next_token = line_break_id; + // 将状态推进到 1, 表示 "正在结束思考" + current_limit_think_status = 2; + condition_triggered = true; // 因为修改了token,需要截断 + } + } + + // ======================= 思考结束处理 ======================= + // 阶段 3: 检查是否已满足结束思考的条件 (status == 0 || status == 2) + // 这种情况会处理两种场景: + // 1. status == 0: 模型可能自己生成了 + // 2. status == 2: 上一阶段强制注入了 \n\n\n + if (current_limit_think_status == 0) { + if (next_token == think_end_id) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; + } + } + if (current_limit_think_status == 2) { + // 确认思考结束,将状态推进到 3 (响应阶段) + current_limit_think_status = 3; + } + + next_tokens[token_idx] = next_token; + + if (condition_triggered) { + new_accept_num = token_offset + 1; + break; + } + } + + // 更新全局状态 + int discarded_tokens = original_accept_num - new_accept_num; + if (discarded_tokens > 0) { + step_idx[bid] -= discarded_tokens; + seq_lens_decoder[bid] -= discarded_tokens; + } + + accept_num[bid] = new_accept_num; + limit_think_status[bid] = current_limit_think_status; +} + +void SpeculateLimitThinkingContentLengthV2( + const paddle::Tensor& next_tokens, + const paddle::Tensor& max_think_lens, + const paddle::Tensor& step_idx, + const paddle::Tensor& limit_think_status, + const paddle::Tensor& accept_num, + const paddle::Tensor& seq_lens_decoder, + const int64_t think_end_id, + const int64_t line_break_id) { + const int batch_size = next_tokens.shape()[0]; + const int tokens_per_step = next_tokens.shape()[1]; + + speculate_limit_thinking_content_length_kernel_v2<<<1, 1024>>>( + const_cast(next_tokens.data()), + max_think_lens.data(), + const_cast(step_idx.data()), + const_cast(limit_think_status.data()), + const_cast(accept_num.data()), + const_cast(seq_lens_decoder.data()), + think_end_id, + line_break_id, + tokens_per_step, + batch_size); +} + +PD_BUILD_STATIC_OP(speculate_limit_thinking_content_length_v2) + .Inputs({"next_tokens", + "max_think_lens", + "step_idx", + "limit_think_status", + "accept_num", + "seq_lens_decoder"}) + .Attrs({"think_end_id: int64_t", "line_break_id: int64_t"}) + .Outputs({"next_tokens_out"}) + .SetInplaceMap({{"next_tokens", "next_tokens_out"}}) + .SetKernelFn(PD_KERNEL(SpeculateLimitThinkingContentLengthV2)); diff --git 
a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index e4b9aeccc63..9d0bf0dbc32 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -80,6 +80,8 @@ speculate_step_reschedule, limit_thinking_content_length_v1, limit_thinking_content_length_v2, + speculate_limit_thinking_content_length_v1, + speculate_limit_thinking_content_length_v2, ) from fastdeploy.output.stream_transfer_data import DecoderState, StreamTransferData @@ -88,6 +90,78 @@ DISABLE_RECOVER = envs.FD_DISABLED_RECOVER == "1" +def limit_thinking_content_length( + limit_strategy: str, + sampled_token_ids: paddle.Tensor, + max_think_lens: paddle.Tensor, + step_idx: paddle.Tensor, + limit_think_status: paddle.Tensor, + think_end_id: int, + line_break_id: int = None, +): + if limit_strategy == "": + # for ernie4_5_vl + limit_thinking_content_length_v1( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + ) + elif limit_strategy == "\n\n\n": + # for ernie_x1 + assert line_break_id > 0 + limit_thinking_content_length_v2( + sampled_token_ids, + max_think_lens, + step_idx, + limit_think_status, + think_end_id, + line_break_id, + ) + else: + raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") + + +def speculate_limit_thinking_content_length( + limit_strategy: str, + accept_tokens: paddle.Tensor, + max_think_lens: paddle.Tensor, + step_idx: paddle.Tensor, + limit_think_status: paddle.Tensor, + accept_num: paddle.Tensor, + seq_lens_decoder: paddle.Tensor, + think_end_id: int, + line_break_id: int = None, +): + if limit_strategy == "": + # for ernie4_5_vl + speculate_limit_thinking_content_length_v1( + accept_tokens, + max_think_lens, + step_idx, + limit_think_status, + accept_num, + seq_lens_decoder, + think_end_id, + ) + elif limit_strategy == "\n\n\n": + # for ernie_x1 + assert line_break_id > 0 + speculate_limit_thinking_content_length_v2( + accept_tokens, + max_think_lens, + step_idx, + limit_think_status, + accept_num, + seq_lens_decoder, + think_end_id, + line_break_id, + ) + else: + raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") + + def pre_process( input_ids: paddle.Tensor, seq_lens_this_time: int, @@ -185,8 +259,20 @@ def post_process_normal( save_each_rank: bool = False, skip_save_output: bool = False, async_output_queue: queue.Queue = None, + think_end_id: int = -1, + line_break_id: int = -1, ) -> ModelRunnerOutput: """Post-processing steps after completing a single token generation.""" + if think_end_id > 0: + limit_thinking_content_length( + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, + sampled_token_ids=sampler_output.sampled_token_ids, + max_think_lens=share_inputs["max_think_lens"], + step_idx=share_inputs["step_idx"], + limit_think_status=share_inputs["limit_think_status"], + think_end_id=think_end_id, + line_break_id=line_break_id, + ) # 1. 
Set stop value paddle.assign( paddle.where( @@ -296,9 +382,26 @@ def post_process_normal( def post_process_specualate( - model_output: ModelOutputData, save_each_rank: bool = False, skip_save_output: bool = False + model_output: ModelOutputData, + share_inputs: Dict[str, paddle.Tensor], + save_each_rank: bool = False, + skip_save_output: bool = False, + think_end_id: int = -1, + line_break_id: int = -1, ): - """""" + if think_end_id > 0: + speculate_limit_thinking_content_length( + limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, + accept_tokens=share_inputs["accept_tokens"], + max_think_lens=share_inputs["max_think_lens"], + step_idx=share_inputs["step_idx"], + limit_think_status=share_inputs["limit_think_status"], + accept_num=share_inputs["accept_num"], + seq_lens_decoder=share_inputs["seq_lens_decoder"], + think_end_id=think_end_id, + line_break_id=line_break_id, + ) + speculate_update( model_output.seq_lens_encoder, model_output.seq_lens_decoder, @@ -348,10 +451,19 @@ def post_process( speculative_decoding: bool = False, skip_save_output: bool = False, async_output_queue: queue.Queue = None, + think_end_id: int = -1, + line_break_id: int = -1, ) -> None: """Post-processing steps after completing a single token generation.""" if speculative_decoding: - post_process_specualate(model_output, save_each_rank, skip_save_output) + post_process_specualate( + model_output, + share_inputs, + save_each_rank, + skip_save_output, + think_end_id, + line_break_id, + ) else: post_process_normal( sampler_output, @@ -361,6 +473,8 @@ def post_process( save_each_rank, skip_save_output, async_output_queue, + think_end_id, + line_break_id, ) @@ -639,36 +753,3 @@ def rebuild_padding( else: raise RuntimeError("Not supported platform") return hidden_states - - -def limit_thinking_content_length( - limit_strategy: str, - sampled_token_ids: paddle.Tensor, - max_think_lens: paddle.Tensor, - step_idx: paddle.Tensor, - limit_think_status: paddle.Tensor, - think_end_id: int, - line_break_id: int = None, -): - if limit_strategy == "": - # for ernie4_5_vl - limit_thinking_content_length_v1( - sampled_token_ids, - max_think_lens, - step_idx, - limit_think_status, - think_end_id, - ) - elif limit_strategy == "\n\n\n": - # for ernie_x1 - assert line_break_id > 0 - limit_thinking_content_length_v2( - sampled_token_ids, - max_think_lens, - step_idx, - limit_think_status, - think_end_id, - line_break_id, - ) - else: - raise NotImplementedError(f"Not support {limit_strategy=} for limit thinking content length.") diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 0fbfd2af31d..ddb27af718e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -68,7 +68,6 @@ ) from fastdeploy.model_executor.pre_and_post_process import ( - limit_thinking_content_length, post_process, pre_process, rebuild_padding, @@ -1473,6 +1472,8 @@ def _dummy_sampler_run( speculative_decoding=self.speculative_decoding, skip_save_output=True, async_output_queue=self.async_output_queue, + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, ) if self.speculative_decoding: if self.speculative_method == "mtp": @@ -1837,17 +1838,6 @@ class at the server level, which is too granular for ModelRunner. 
self.parallel_config.data_parallel_rank * self.parallel_config.tensor_parallel_size, group=self.parallel_config.tp_group, ) - - if self.model_config.think_end_id > 0: - limit_thinking_content_length( - limit_strategy=envs.FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR, - sampled_token_ids=sampler_output.sampled_token_ids, - max_think_lens=self.share_inputs["max_think_lens"], - step_idx=self.share_inputs["step_idx"], - limit_think_status=self.share_inputs["limit_think_status"], - think_end_id=self.model_config.think_end_id, - line_break_id=self.model_config.line_break_id, - ) else: self.sampler( logits, @@ -1921,6 +1911,8 @@ class at the server level, which is too granular for ModelRunner. speculative_decoding=self.speculative_decoding, skip_save_output=skip_save_output, async_output_queue=self.async_output_queue, + think_end_id=self.model_config.think_end_id, + line_break_id=self.model_config.line_break_id, ) if self.guided_backend is not None and sampler_output is not None: self.sampler.post_process(sampler_output.sampled_token_ids, skip_idx_list) From 393d830a1c99a28013ad36467a12d2b8ea478389 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 13:09:07 +0800 Subject: [PATCH 21/24] fix --- .../speculate_limit_thinking_content_length_v1.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu index 90e3760af4b..96e6a700487 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v1.cu @@ -1,4 +1,4 @@ -/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. From 2e0f6072fcad36dcbf3d1593ce8a13d4a7efa0cc Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 13:09:32 +0800 Subject: [PATCH 22/24] fix --- .../speculate_limit_thinking_content_length_v2.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu index 9277dd4c137..e885cfb2a0f 100644 --- a/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu +++ b/custom_ops/gpu_ops/speculate_decoding/speculate_limit_thinking_content_length_v2.cu @@ -1,4 +1,4 @@ -/ Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
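Note on the kernels introduced in patches 20-22: the speculative variants walk the accepted draft tokens of each sequence, force the end-of-thinking token once the per-request budget in max_think_lens is spent, and roll back step_idx and seq_lens_decoder for any draft tokens they discard. As a reading aid only, the pure-Python sketch below restates the per-token decision of the simpler v1 strategy for a single sequence; the helper name limit_thinking_v1_sketch and its scalar in/out signature are invented for illustration and are not part of FastDeploy, whose real logic is the CUDA code above.

    # Hedged illustration only: per-sequence, per-token restatement of the
    # v1 "inject the think-end token once the budget is spent" strategy.
    # Status codes follow the kernels above: 0 = still thinking,
    # 1 = end token forcibly injected, 2 = thinking finished / answering.
    def limit_thinking_v1_sketch(next_token: int, step: int, max_think_len: int,
                                 status: int, think_end_id: int) -> tuple[int, int]:
        if max_think_len < 0 or status >= 2:      # limiting disabled, or already answering
            return next_token, status
        if status < 1 and step >= max_think_len:  # thinking budget exhausted
            next_token = think_end_id             # force the end-of-thinking token
            status = 1
        if next_token == think_end_id:            # model (or the line above) ended thinking
            status = 2
        return next_token, status

The speculative kernels additionally truncate the accepted-token window at the first replaced token, which is why they also adjust accept_num; the sketch leaves that bookkeeping out.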
From 4fd1dde31a15eef06d7875124fdc459569325156 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 14:58:37 +0800 Subject: [PATCH 23/24] update --- .../input/ernie4_5_vl_processor/ernie4_5_vl_processor.py | 6 +----- fastdeploy/worker/gpu_model_runner.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py index 0d0920b71f9..439b752c2f8 100644 --- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py +++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py @@ -255,11 +255,7 @@ def process_request_dict(self, request, max_model_len=None): else: request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"]) if request.get("reasoning_max_tokens") is None: - if request.get("enable_thinking"): - request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) - else: - if not request.get("enable_thinking"): - request["reasoning_max_tokens"] = None + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) data_processor_logger.info(f"Processed request {request}") return request diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index ddb27af718e..891ae07a49e 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -352,7 +352,7 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = position_ids, request.get("max_tokens", 2048) ) - if request.get("reasoning_max_tokens") is not None: + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None: # Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0 From db4e279623396b4569e0a8e80a23ffee7f82da36 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 20 Oct 2025 16:28:57 +0800 Subject: [PATCH 24/24] update --- fastdeploy/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 8083a37618b..7fbbb0ab270 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -584,7 +584,7 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 - if request.get("reasoning_max_tokens") is not None: + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None: # Enable thinking self.share_inputs["max_think_lens"][idx : idx + 1, :] = request.get("reasoning_max_tokens") self.share_inputs["limit_think_status"][idx : idx + 1, :] = 0
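Taken together, patches 23 and 24 make the limiter opt-in on the request side: the VL processor falls back to 80% of max_tokens when reasoning_max_tokens is unset, and the runner arms max_think_lens only when the request both enables thinking and carries a reasoning budget, presumably leaving the buffer at the -1 sentinel that the kernels treat as "no limit" otherwise. A minimal sketch of that gating follows; the helper name resolve_think_budget is invented for illustration and is not a FastDeploy API.

    # Hedged sketch of the request-side gating after patches 23/24; -1 mirrors
    # the "limiting disabled" sentinel the kernels check via max_think_lens[bid] < 0.
    def resolve_think_budget(request: dict) -> int:
        if request.get("enable_thinking", False) and request.get("reasoning_max_tokens", None) is not None:
            return int(request["reasoning_max_tokens"])  # value written into max_think_lens
        return -1                                        # limiter stays off for this sequence

For example, a request carrying {"enable_thinking": True, "reasoning_max_tokens": 512} would get a 512-token thinking budget under this gating, while omitting either field keeps the previous unbounded behaviour.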