Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Feature] Enable prefix caching as default (#3816)
* [Feature] Enable prefix caching as default
* [Feature] Enable prefix caching as default
* Set prefix caching as default
* skip dynamic load
* fix kill bug
* fix kill bug
* fix kill bug
* fix ci
* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
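For users, the headline change is just a default flip: prefix caching becomes opt-out instead of opt-in. A minimal, hypothetical sketch of what that means at the configuration level; the `EngineArgs` class and `enable_prefix_caching` field name here are assumed from the PR title, not copied from FastDeploy's actual argument definitions:

```python
from dataclasses import dataclass

# Hypothetical sketch only: class and field name are assumed from the PR
# title, not taken from FastDeploy's real engine-argument definitions.
@dataclass
class EngineArgs:
    # Before this change the default was False; prefix caching is now opt-out.
    enable_prefix_caching: bool = True

args = EngineArgs()
assert args.enable_prefix_caching  # on by default; pass False to opt out
```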
```diff
@@ -221,6 +221,7 @@ class GPUModelRunner(ModelRunnerBase):
         req_len = len(req_dicts)
         has_prefill_task = False
         has_decode_task = False
+        has_preempted_task = False
         for i in range(req_len):
             request = req_dicts[i]
             idx = request.idx
```
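The first hunk adds a third flag alongside the existing prefill/decode trackers; it is raised inside the request loop whenever this step preempted a request.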
```diff
@@ -320,6 +321,7 @@ class GPUModelRunner(ModelRunnerBase):
                 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
                 self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
                 self.share_inputs["is_block_step"][idx : idx + 1] = False
+                has_preempted_task = True
                 continue

             assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens
```
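The second hunk sets that flag on the preemption path: after the preempted request's encoder/decoder sequence lengths are zeroed and its block-step marker cleared, the loop records the preemption and moves on to the next request.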
```diff
@@ -375,6 +377,10 @@ class GPUModelRunner(ModelRunnerBase):

         if has_prefill_task or has_decode_task:
             self.share_inputs["not_need_stop"][0] = True
+        if has_preempted_task:
+            self.share_inputs["not_need_stop"][0] = not (
+                self.share_inputs["stop_flags"].sum() == self.parallel_config.max_num_seqs
+            )
         self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]

     def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: int = None):
```
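The third hunk is the behavioral change: previously `not_need_stop` was raised only when the step carried prefill or decode work, so a step consisting solely of preemptions would presumably leave the runner flagged to stop. With the new branch, a preemption keeps the loop running unless every slot has raised its stop flag. A self-contained sketch of that predicate, using a plain Python list in place of the runner's shared `stop_flags` tensor (the surrounding runner state is assumed):

```python
from typing import List

def compute_not_need_stop(
    stop_flags: List[bool],
    max_num_seqs: int,
    has_prefill_task: bool,
    has_decode_task: bool,
    has_preempted_task: bool,
) -> bool:
    """Mirror of the diff's logic with a plain list standing in for the
    runner's shared tensors (sketch, not FastDeploy's actual code)."""
    not_need_stop = False
    if has_prefill_task or has_decode_task:
        not_need_stop = True
    if has_preempted_task:
        # Only stop once every sequence slot has raised its stop flag.
        not_need_stop = not (sum(stop_flags) == max_num_seqs)
    return not_need_stop

# Example: one of four slots was preempted but is not stopped, so the
# runner keeps stepping even though no prefill or decode work was added.
assert compute_not_need_stop([True, True, True, False], 4, False, False, True)
```

In the runner itself the same test is a tensor reduction, `stop_flags.sum() == max_num_seqs`, whose negation is written into `share_inputs["not_need_stop"][0]`.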