@@ -2107,51 +2107,6 @@ def _pool(self, hidden_states: paddle.Tensor, num_running_requests: int) -> Opti
21072107
21082108 return pooler_output
21092109
    def _schedule_cache_and_update_buffer(
        self, model_forward_batch: Optional[List[Request]], num_running_request: int
    ) -> None:
        """Advance the sampling seed and run per-step KV-cache scheduling.

        Called once per decode step after the forward pass. Mutates tensors in
        ``self.share_inputs`` in place and dispatches cache bookkeeping to the
        path matching the active scheduler.

        Args:
            model_forward_batch: Requests in the current batch; forwarded to the
                chunked-prefill update and cache-add helpers on the legacy
                (non-V1) scheduler path. Unused on the V1 path.
            num_running_request: Number of active requests; bounds the slice of
                ``seq_lens_this_time`` copied into the step buffer.
                NOTE(review): name is singular here but the sibling ``_pool``
                uses ``num_running_requests`` — presumably the same quantity;
                confirm and unify naming.
        """

        # Advance the per-request inference seed in place, wrapping modulo
        # MAX_INFER_SEED so the seed tensor stays within its valid range.
        self.share_inputs["infer_seed"].add_(self.infer_seed_increment)
        self.share_inputs["infer_seed"][:] %= self.MAX_INFER_SEED

        if not envs.ENABLE_V1_KVCACHE_SCHEDULER:
            # Legacy scheduler path: the CUDA-side step kernel updates the
            # shared input tensors (block allocation / sequence-length state —
            # exact effects are defined by step_cuda, not visible here).
            step_cuda(
                self.share_inputs,
                self.cache_config.block_size,
                self.cache_config.enc_dec_block_num,
                self.speculative_config,
                self.cache_config.enable_prefix_caching,
            )

            self._update_chunked_prefill(model_forward_batch)
            self._add_cache(model_forward_batch)
        elif self.speculative_decoding:
            # V1 scheduler + speculative decoding: schedule cache updates that
            # account for draft tokens and per-request accepted-token counts.
            # Argument order must match the speculate_schedule_cache op
            # signature exactly — all arguments are positional.
            speculate_schedule_cache(
                self.share_inputs["draft_tokens"],
                self.share_inputs["block_tables"],
                self.share_inputs["stop_flags"],
                self.share_inputs["prompt_lens"],
                self.share_inputs["seq_lens_this_time"],
                self.share_inputs["seq_lens_encoder"],
                self.share_inputs["seq_lens_decoder"],
                self.share_inputs["step_seq_lens_decoder"],
                self.share_inputs["step_draft_tokens"],
                self.share_inputs["step_seq_lens_this_time"],
                self.share_inputs["accept_num"],
                self.share_inputs["accept_tokens"],
                self.share_inputs["is_block_step"],
                self.share_inputs["not_need_stop"],
                self.share_inputs["stop_nums"],
                self.cache_config.block_size,
                self.speculative_config.num_speculative_tokens,
            )
        # NOTE(review): on the V1 scheduler without speculative decoding,
        # neither branch runs — only the seed update and the buffer copy below
        # happen. Presumably intentional (V1 scheduler handles cache stepping
        # elsewhere); confirm.

        # Snapshot this step's sequence lengths for the running requests into
        # the host-side buffer. The trailing False is the second positional
        # argument of paddle's Tensor.copy_ — presumably the blocking flag
        # (non-blocking copy); confirm against the paddle API.
        self.seq_lens_this_time_buffer[:num_running_request].copy_(
            self.share_inputs["seq_lens_this_time"][:num_running_request], False
        )
2154-
21552110 def _add_cache (self , model_forward_batch ) -> None :
21562111 """
21572112 Add cache for guided decoding.
0 commit comments