diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 21abac469c9..e2434008bbb 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -125,6 +125,7 @@ class ErnieArchitectures:
         "Ernie4_5_ForCausalLM",
         "Ernie4_5_MoeForCausalLM",
         "Ernie4_5_VLMoeForConditionalGeneration",
+        "Ernie4_5_VLMoeForProcessRewardModel",
     }
 
     @classmethod
diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py
index beee8f940aa..b329844daa9 100644
--- a/fastdeploy/model_executor/layers/linear.py
+++ b/fastdeploy/model_executor/layers/linear.py
@@ -393,6 +393,7 @@ def __init__(
         with_bias: bool = False,
         add_bias: bool = False,
         skip_quant: bool = False,
+        weight_dtype="",
     ):
         """
         Initializes a linear layer and provides additional parameters required for inference and quantization.
@@ -421,6 +422,7 @@ def __init__(
             with_bias=with_bias,
             add_bias=add_bias,
             skip_quant=skip_quant,
+            weight_dtype=weight_dtype,
         )
         assert self.quant_method is not None
@@ -796,6 +798,7 @@ def __init__(
         add_bias: bool = False,
         reduce_results: bool = True,
         skip_quant: bool = False,
+        weight_dtype="",
     ):
         """
         Initialize a linear layer with additional parameters for inference and quantization.
@@ -830,6 +833,7 @@ def __init__(
             with_bias=with_bias,
             add_bias=add_bias,
             skip_quant=skip_quant,
+            weight_dtype=weight_dtype,
         )
         if add_bias:
             assert with_bias, "with_bias must be True when add_bias is True."
@@ -847,12 +851,6 @@ def __init__(
 
         if self.with_bias:
             # col parallel
             _set_var_distributed(self.bias, split_axis=0)
-            set_weight_attrs(
-                self.bias,
-                {
-                    "output_dim": False,
-                },
-            )
 
         self.reduce_results = reduce_results
diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
index 3f81bc3a5a5..3f1a9f015c8 100644
--- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py
@@ -548,6 +548,12 @@ def forward(
         return out
 
 
+@ModelRegistry.register_model_class(
+    architecture="Ernie4_5_VLMoeForConditionalGeneration",
+    module_name="ernie4_5_vl.ernie4_5_vl_moe",
+    category=ModelCategory.MULTIMODAL,
+    primary_use=ModelCategory.MULTIMODAL,
+)
 class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
     """
     Ernie4_5_VLMoeForConditionalGeneration
@@ -678,6 +684,13 @@ def load_weights(self, weights_iterator) -> None:
         expert_id = None
         shard_id = None
         for loaded_weight_name, loaded_weight in weights_iterator:
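+            # New pre-loading hook: it may rewrite a checkpoint weight name before
+            # matching, or return None to drop the weight entirely (the reward
+            # models added below use it to skip "lm_head" weights).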
+            loaded_weight_name = (
+                self.process_weights_before_loading_fn(loaded_weight_name)
+                if getattr(self, "process_weights_before_loading_fn", None)
+                else loaded_weight_name
+            )
+            if loaded_weight_name is None:
+                continue
             for param_name, weight_name, exp_id, shard_id in all_param_mapping:
                 model_param_name = loaded_weight_name.replace(weight_name, param_name)
                 if model_param_name.startswith("model.") and self.fd_config.model_config.model_format == "torch":
@@ -792,12 +805,6 @@ def clear_grpah_opt_backend(self):
         self.ernie.clear_grpah_opt_backend(fd_config=self.fd_config)
 
 
-@ModelRegistry.register_model_class(
-    architecture="Ernie4_5_VLMoeForConditionalGeneration",
-    module_name="ernie4_5_vl.ernie4_5_vl_moe",
-    category=ModelCategory.MULTIMODAL,
-    primary_use=ModelCategory.MULTIMODAL,
-)
 class Ernie4_5_VLPretrainedModel(PretrainedModel):
     """
     Ernie4_5_MoePretrainedModel
diff --git a/fastdeploy/model_executor/models/ernie_vl_rm.py b/fastdeploy/model_executor/models/ernie_vl_rm.py
new file mode 100644
index 00000000000..86cddcb42c2
--- /dev/null
+++ b/fastdeploy/model_executor/models/ernie_vl_rm.py
@@ -0,0 +1,158 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+import paddle
+from paddle import nn
+
+from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.layers.activation import SiluAndMul
+from fastdeploy.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    RowParallelLinear,
+)
+from fastdeploy.model_executor.layers.pooler import DispatchPooler, Pooler
+from fastdeploy.model_executor.utils import process_weights_before_loading
+
+from .ernie4_5_vl.ernie4_5_vl_moe import (
+    Ernie4_5_VLModel,
+    Ernie4_5_VLMoeForConditionalGeneration,
+)
+from .interfaces_base import default_pooling_type
+from .model_base import ModelCategory, ModelRegistry
+
+
+class Ernie4_5_VLMoeRewardBaseModel(nn.Layer):
+    """
+    Ernie4_5_VLMoeRewardBaseModel
+    """
+
+    is_pooling_model = True
+    pooler: Pooler
+
+    def __init__(self, fd_config: FDConfig):
+        super().__init__()
+        # ----------- vision model ------------
+        self.vision_model = Ernie4_5_VLMoeForConditionalGeneration._init_vision_model(self, fd_config.model_config)
+        # ----------- resampler_model ------------
+        self.resampler_model = Ernie4_5_VLMoeForConditionalGeneration._init_resampler_model_model(
+            self, fd_config.model_config
+        )
+        self.ernie = Ernie4_5_VLModel(fd_config=fd_config)
+        self.head_dtype = paddle.bfloat16
+
+        # Persistent buffers for CUDA graphs.
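+        # CUDA graph replay requires stable device addresses, so forward() stages
+        # each step's embeddings into this preallocated buffer via copy_ instead
+        # of allocating a fresh tensor.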
+        self._input_embeddings = paddle.zeros(
+            [fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size],
+            dtype=fd_config.model_config.dtype,
+        )
+
+        self.rm_head = nn.Sequential(
+            (
+                "up_gate_proj",
+                MergedColumnParallelLinear(
+                    fd_config=fd_config,
+                    prefix="",
+                    input_size=fd_config.model_config.hidden_size,
+                    output_size=fd_config.model_config.hidden_size * 2,
+                    with_bias=False,
+                ),
+            ),
+            ("act_fn", SiluAndMul(fd_config=fd_config, bias=None, act_method=fd_config.model_config.hidden_act)),
+            (
+                "down_proj",
+                RowParallelLinear(
+                    fd_config=fd_config,
+                    input_size=fd_config.model_config.hidden_size,
+                    output_size=fd_config.model_config.num_labels,
+                    skip_quant=True,
+                    weight_dtype=self.head_dtype,
+                    with_bias=False,
+                ),
+            ),
+        )
+
+    def get_input_embeddings(
+        self,
+        ids_remove_padding: paddle.Tensor,
+        image_token_num: int,
+        image_features: Optional[paddle.Tensor] = None,
+    ) -> paddle.Tensor:
+        input_embeddings = self.ernie.get_input_embeddings(ids_remove_padding=ids_remove_padding)
+        if image_token_num > 0:
+            input_embeddings[ids_remove_padding == self.ernie.im_patch_id] = image_features.cast(self.ernie._dtype)
+        return input_embeddings
+
+    def forward(
+        self,
+        ids_remove_padding: paddle.Tensor,
+        image_features: Optional[paddle.Tensor],
+        forward_meta: ForwardMeta,
+    ):
+        vl_moe_meta = self.ernie.prepare_vl_moe_meta(ids_remove_padding=ids_remove_padding)
+        input_embeddings = self.get_input_embeddings(
+            ids_remove_padding=ids_remove_padding,
+            image_features=image_features,
+            image_token_num=vl_moe_meta.image_token_num.item(),
+        )
+        self._input_embeddings.copy_(input_embeddings, False)
+
+        hidden_states = self.ernie(
+            input_embeddings=self._input_embeddings,
+            ids_remove_padding=ids_remove_padding,
+            forward_meta=forward_meta,
+            vl_moe_meta=vl_moe_meta,
+        )
+        hidden_states = hidden_states.to(self.head_dtype)
+        logits = self.rm_head(hidden_states)
+        return logits
+
+
+@ModelRegistry.register_model_class(
+    architecture="Ernie4_5_VLMoeForProcessRewardModel",
+    module_name="ernie_vl_rm",
+    category=[ModelCategory.REWARD],
+    primary_use=ModelCategory.REWARD,
+)
+@default_pooling_type("ALL")
+class Ernie4_5_VLMoeForProcessRewardModel(Ernie4_5_VLMoeRewardBaseModel):
+
+    def __init__(self, fd_config: FDConfig):
+        self.fd_config = fd_config
+        fd_config.model_config.num_labels = 1
+        super().__init__(fd_config=fd_config)
+        self.tie_word_embeddings = False
+
+        pooler_config = fd_config.model_config.pooler_config
+        assert pooler_config is not None
+
+        self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config)})
+
+        self.process_weights_before_loading_fn = process_weights_before_loading(skip_prefixes=["lm_head"])
+
+    @classmethod
+    def name(self):
+        """ """
+        return "Ernie4_5_VLMoeForProcessRewardModel"
+
+    @paddle.no_grad()
+    def load_weights(self, weights_iterator):
+        # Filter out lm_head weights of Ernie4_5_VLMoeForConditionalGeneration
+        Ernie4_5_VLMoeForConditionalGeneration.load_weights(self, weights_iterator)
diff --git a/fastdeploy/model_executor/models/interfaces_base.py b/fastdeploy/model_executor/models/interfaces_base.py
index 77533209d9b..b5cea3d231d 100644
--- a/fastdeploy/model_executor/models/interfaces_base.py
+++ b/fastdeploy/model_executor/models/interfaces_base.py
@@ -48,6 +48,8 @@ def determine_model_category(class_name: str):
         return ModelCategory.MULTIMODAL
     elif any(pattern in class_name for pattern in ["Embedding", "ForSequenceClassification"]):
         return ModelCategory.EMBEDDING
+    elif any(pattern in class_name for pattern in ["Reward"]):
+        return ModelCategory.REWARD
     return ModelCategory.TEXT_GENERATION
@@ -100,3 +102,11 @@ class FdModelForPooling(FdModel[T_co], Protocol[T_co]):
     """
     pooler: Pooler
     """The pooler is only called on TP rank 0."""
+
+
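+# Class decorator used by the reward models in this change, e.g.
+# @default_pooling_type("ALL") or @default_pooling_type("STEP"); it records the
+# preferred pooling strategy on the class, presumably read when the pooler is
+# configured (the consumer is not shown in this diff).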
["Reward"]): + return ModelCategory.REWARD return ModelCategory.TEXT_GENERATION @@ -100,3 +102,11 @@ class FdModelForPooling(FdModel[T_co], Protocol[T_co]): """ pooler: Pooler """The pooler is only called on TP rank 0.""" + + +def default_pooling_type(pooling_type: str): + def func(model): + model.default_pooling_type = pooling_type # type: ignore + return model + + return func diff --git a/fastdeploy/model_executor/models/model_base.py b/fastdeploy/model_executor/models/model_base.py index fddfb4de51a..28eb6b7da0b 100644 --- a/fastdeploy/model_executor/models/model_base.py +++ b/fastdeploy/model_executor/models/model_base.py @@ -39,6 +39,7 @@ class ModelCategory(Enum): TEXT_GENERATION = "text_generation" MULTIMODAL = "multimodal" EMBEDDING = "embedding" + REWARD = "reward" @dataclass(frozen=True) @@ -228,8 +229,7 @@ def register_model_class( def _register(model_cls): # Traditional registration for ModelForCasualLM subclasses - if issubclass(model_cls, ModelForCasualLM) and model_cls is not ModelForCasualLM: - cls._arch_to_model_cls[model_cls.name()] = model_cls + cls._arch_to_model_cls[model_cls.name()] = model_cls # Enhanced decorator-style registration if architecture and module_name: diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index fd51358c5b5..ec4df06002e 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -44,6 +44,12 @@ ModelForCasualLM, ModelRegistry, ) +from fastdeploy.model_executor.utils import ( + WeightsMapper, + default_weight_loader, + process_weights_after_loading, + process_weights_before_loading, +) class Qwen2MLP(nn.Layer): @@ -316,6 +322,14 @@ def __init__(self, fd_config: FDConfig): prefix="lm_head", ) + self.process_weights_before_loading_fn = process_weights_before_loading( + mapper=( + WeightsMapper(orig_to_new_prefix={"model.": "qwen2."}) + if self.fd_config.model_config.model_format == "torch" + else None + ), + ) + @paddle.no_grad() def load_weights(self, weights_iterator) -> None: """ @@ -325,11 +339,6 @@ def load_weights(self, weights_iterator) -> None: weights_iterator (Iterator): An iterator yielding (name, weight) pairs. """ - from fastdeploy.model_executor.utils import ( - default_weight_loader, - process_weights_after_loading, - ) - stacked_params_mapping = [ # (param_name, shard_name, shard_id) ("qkv_proj", "q_proj", "q"), @@ -344,10 +353,13 @@ def load_weights(self, weights_iterator) -> None: params_dict = dict(self.named_parameters()) process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) for loaded_weight_name, loaded_weight in weights_iterator: - model_format = self.fd_config.model_config.model_format - # Because the prefix for Paddle is qwen2, and for Hugging Face it is model. - if model_format == "torch": - loaded_weight_name = loaded_weight_name.replace("model", "qwen2") + loaded_weight_name = ( + self.process_weights_before_loading_fn(loaded_weight_name) + if getattr(self, "process_weights_before_loading_fn", None) + else loaded_weight_name + ) + if loaded_weight_name is None: + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: continue diff --git a/fastdeploy/model_executor/models/qwen2_rm.py b/fastdeploy/model_executor/models/qwen2_rm.py new file mode 100644 index 00000000000..629f65f9248 --- /dev/null +++ b/fastdeploy/model_executor/models/qwen2_rm.py @@ -0,0 +1,109 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. 
+        self.process_weights_before_loading_fn = process_weights_before_loading(
+            mapper=(
+                WeightsMapper(orig_to_new_prefix={"model.": "qwen2."})
+                if self.fd_config.model_config.model_format == "torch"
+                else None
+            ),
+        )
+
     @paddle.no_grad()
     def load_weights(self, weights_iterator) -> None:
         """
@@ -325,11 +339,6 @@ def load_weights(self, weights_iterator) -> None:
 
             weights_iterator (Iterator): An iterator yielding (name, weight) pairs.
         """
-        from fastdeploy.model_executor.utils import (
-            default_weight_loader,
-            process_weights_after_loading,
-        )
-
         stacked_params_mapping = [
             # (param_name, shard_name, shard_id)
             ("qkv_proj", "q_proj", "q"),
@@ -344,10 +353,13 @@ def load_weights(self, weights_iterator) -> None:
         params_dict = dict(self.named_parameters())
         process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()))
         for loaded_weight_name, loaded_weight in weights_iterator:
-            model_format = self.fd_config.model_config.model_format
-            # Because the prefix for Paddle is qwen2, and for Hugging Face it is model.
-            if model_format == "torch":
-                loaded_weight_name = loaded_weight_name.replace("model", "qwen2")
+            loaded_weight_name = (
+                self.process_weights_before_loading_fn(loaded_weight_name)
+                if getattr(self, "process_weights_before_loading_fn", None)
+                else loaded_weight_name
+            )
+            if loaded_weight_name is None:
+                continue
             for param_name, weight_name, shard_id in stacked_params_mapping:
                 if weight_name not in loaded_weight_name:
                     continue
diff --git a/fastdeploy/model_executor/models/qwen2_rm.py b/fastdeploy/model_executor/models/qwen2_rm.py
new file mode 100644
index 00000000000..629f65f9248
--- /dev/null
+++ b/fastdeploy/model_executor/models/qwen2_rm.py
@@ -0,0 +1,109 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+from __future__ import annotations
+
+import paddle
+from paddle import nn
+
+from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.layers.linear import (
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
+from fastdeploy.model_executor.layers.pooler import DispatchPooler, Pooler
+from fastdeploy.model_executor.utils import process_weights_before_loading
+
+from .interfaces_base import default_pooling_type
+from .model_base import ModelCategory, ModelRegistry
+from .qwen2 import Qwen2ForCausalLM, Qwen2Model
+
+
+class Qwen2RewardBaseModel(nn.Layer):
+    """
+    Qwen2RewardBaseModel
+    """
+
+    is_pooling_model = True
+    pooler: Pooler
+
+    def __init__(self, fd_config: FDConfig):
+        super().__init__()
+        self.model = Qwen2Model(fd_config=fd_config)
+        self.head_dtype = paddle.float32
+
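+        # The scoring head stays unquantized in float32 (skip_quant=True plus an
+        # explicit weight_dtype); forward() casts hidden states up to match
+        # before scoring.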
+        self.score = nn.Sequential(
+            ColumnParallelLinear(
+                fd_config=fd_config,
+                input_size=fd_config.model_config.hidden_size,
+                output_size=fd_config.model_config.hidden_size,
+                skip_quant=True,
+                weight_dtype=self.head_dtype,
+                with_bias=True,
+            ),
+            nn.ReLU(),
+            RowParallelLinear(
+                fd_config=fd_config,
+                input_size=fd_config.model_config.hidden_size,
+                output_size=fd_config.model_config.num_labels,
+                skip_quant=True,
+                weight_dtype=self.head_dtype,
+                with_bias=True,
+            ),
+        )
+
+    def forward(
+        self,
+        ids_remove_padding: paddle.Tensor,
+        forward_meta: ForwardMeta,
+    ):
+        hidden_states = self.model(ids_remove_padding=ids_remove_padding, forward_meta=forward_meta)
+        hidden_states = hidden_states.to(self.head_dtype)
+        logits = self.score(hidden_states)
+        return logits
+
+
+@ModelRegistry.register_model_class(
+    architecture="Qwen2ForProcessRewardModel",
+    module_name="qwen2_rm",
+    category=[ModelCategory.REWARD],
+    primary_use=ModelCategory.REWARD,
+)
+@default_pooling_type("STEP")
+class Qwen2ForProcessRewardModel(Qwen2RewardBaseModel):
+
+    def __init__(self, fd_config: FDConfig):
+        self.fd_config = fd_config
+        fd_config.model_config.num_labels = 2
+        super().__init__(fd_config=fd_config)
+
+        pooler_config = fd_config.model_config.pooler_config
+        assert pooler_config is not None
+
+        self.pooler = DispatchPooler({"encode": Pooler.for_encode(pooler_config)})
+
+        self.process_weights_before_loading_fn = process_weights_before_loading(skip_prefixes=["lm_head"])
+
+    @classmethod
+    def name(self):
+        """ """
+        return "Qwen2ForProcessRewardModel"
+
+    @paddle.no_grad()
+    def load_weights(self, weights_iterator):
+        # Filter out lm_head weights of Qwen2ForCausalLM
+        Qwen2ForCausalLM.load_weights(self, weights_iterator)
diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py
index 154024ca39c..15d285212b0 100644
--- a/fastdeploy/model_executor/utils.py
+++ b/fastdeploy/model_executor/utils.py
@@ -16,8 +16,10 @@
 
 import os
 import re
+from collections.abc import Mapping
 from contextlib import contextmanager
-from typing import Any, Optional, Union
+from dataclasses import dataclass, field
+from typing import Any, List, Optional, Union
 
 import paddle
 from paddleformers.utils.log import logger
@@ -150,6 +152,36 @@ def fn(model_sublayer_name: str, param=None):
     return fn
 
 
+@dataclass
+class WeightsMapper:
+    orig_to_new_prefix: Mapping[str, Optional[str]] = field(default_factory=dict)
+
+    def _map_name(self, key: str) -> Optional[str]:
+        for prefix, new_key in self.orig_to_new_prefix.items():
+            if key.startswith(prefix):
+                key = key.replace(prefix, new_key, 1)
+        return key
+
+    def apply(self, weight_name):
+        return self._map_name(weight_name)
+
+
+def process_weights_before_loading(
+    *, skip_prefixes: Optional[List[str]] = None, mapper: Optional[WeightsMapper] = None
+):
+    def _can_skip(weight_name):
+        return any(weight_name.startswith(p) for p in (skip_prefixes or []))
+
+    def fn(weight_name):
+        if mapper is not None:
+            weight_name = mapper.apply(weight_name)
+        if _can_skip(weight_name):
+            weight_name = None
+        return weight_name
+
+    return fn
+
+
 def free_tensor(tensor):
     if hasattr(tensor, "tensor_track"):
         tensor.tensor_track = None
diff --git a/fastdeploy/multimodal/registry.py b/fastdeploy/multimodal/registry.py
index f014ba55532..d827c9b8068 100644
--- a/fastdeploy/multimodal/registry.py
+++ b/fastdeploy/multimodal/registry.py
@@ -25,6 +25,7 @@ class MultimodalRegistry:
         "Ernie5MoeForCausalLM",
         "Qwen2_5_VLForConditionalGeneration",
         "Ernie5ForCausalLM",
+        "Ernie4_5_VLMoeForProcessRewardModel",
     }
 
     @classmethod