diff --git a/vllm/model_executor/models/llama4_eagle.py b/vllm/model_executor/models/llama4_eagle.py
index dd6337244ca6..90273463d64e 100644
--- a/vllm/model_executor/models/llama4_eagle.py
+++ b/vllm/model_executor/models/llama4_eagle.py
@@ -60,16 +60,23 @@ def __init__(
             prefix=maybe_prefix(prefix, "embed_tokens"),
         )
 
-        self.layers = nn.ModuleList(
-            [
-                Llama4DecoderLayer(
-                    vllm_config=vllm_config,
-                    prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
-                    config=self.config,
-                )
-                for i in range(self.config.num_hidden_layers)
-            ]
-        )
+        # Temporarily modify vllm_config.quant_config for draft model layers
+        original_quant_config = vllm_config.quant_config
+        vllm_config.quant_config = quant_config
+        try:
+            self.layers = nn.ModuleList(
+                [
+                    Llama4DecoderLayer(
+                        vllm_config=vllm_config,
+                        prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
+                        config=self.config,
+                    )
+                    for i in range(self.config.num_hidden_layers)
+                ]
+            )
+        finally:
+            # Restore original quant_config
+            vllm_config.quant_config = original_quant_config
         self.fc = torch.nn.Linear(
             self.config.hidden_size * 2, self.config.hidden_size, bias=False
         )
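
The diff's approach, overriding vllm_config.quant_config before building the draft layers and restoring it in a finally block, can also be written as a context manager. The sketch below is illustrative only and is not part of the diff or the vLLM codebase; _draft_quant_config is a hypothetical helper name.

# Hypothetical sketch (not from the diff): the same temporary-override
# pattern, expressed as a context manager so restoration cannot be skipped.
from contextlib import contextmanager


@contextmanager
def _draft_quant_config(vllm_config, quant_config):
    """Temporarily replace vllm_config.quant_config, restoring it on exit."""
    original = vllm_config.quant_config
    vllm_config.quant_config = quant_config
    try:
        yield vllm_config
    finally:
        # Restore even if layer construction raises.
        vllm_config.quant_config = original


# Illustrative usage, mirroring the layer construction in the diff:
# with _draft_quant_config(vllm_config, quant_config):
#     self.layers = nn.ModuleList(
#         [
#             Llama4DecoderLayer(
#                 vllm_config=vllm_config,
#                 prefix=maybe_prefix(prefix, f"layers.{i + start_layer_id}"),
#                 config=self.config,
#             )
#             for i in range(self.config.num_hidden_layers)
#         ]
#     )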