[Feature] Enable prefix caching as default (#3816)

* [Feature] Enable prefix caching as default

* Set prefix caching as default

* skip dynamic load

* fix kill bug

* fix ci

* fix

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Author: chenjian
Date: 2025-09-06 09:51:34 +08:00
Committed by: GitHub
Parent: 11b18e5ef0
Commit: 41cd3e24c9
6 changed files with 37 additions and 5 deletions
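
With prefix caching on by default, KV-cache blocks can be reused across requests that share a prompt prefix, and the diff below suggests the scheduler may preempt running requests when blocks run short. The config change itself is not among the hunks shown here, so the following is only a minimal sketch of the flipped default; the class and field names are assumptions, not FastDeploy's actual code:

    from dataclasses import dataclass

    # Hypothetical illustration of the default flip this commit describes;
    # the real FastDeploy config class and field layout may differ.
    @dataclass
    class CacheConfig:
        # Reuse cached KV blocks for shared prompt prefixes; previously
        # opt-in, now enabled unless the user turns it off.
        enable_prefix_caching: bool = True

    cfg = CacheConfig(enable_prefix_caching=False)  # explicit opt-out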

@@ -221,6 +221,7 @@ class GPUModelRunner(ModelRunnerBase):
         req_len = len(req_dicts)
         has_prefill_task = False
         has_decode_task = False
+        has_preempted_task = False
         for i in range(req_len):
             request = req_dicts[i]
             idx = request.idx
@@ -320,6 +321,7 @@ class GPUModelRunner(ModelRunnerBase):
                 self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0
                 self.share_inputs["seq_lens_encoder"][idx : idx + 1] = 0
                 self.share_inputs["is_block_step"][idx : idx + 1] = False
+                has_preempted_task = True
                 continue
             assert len(request.eos_token_ids) == self.model_config.eos_tokens_lens
@@ -375,6 +377,10 @@ class GPUModelRunner(ModelRunnerBase):
         if has_prefill_task or has_decode_task:
             self.share_inputs["not_need_stop"][0] = True
+        if has_preempted_task:
+            self.share_inputs["not_need_stop"][0] = not (
+                self.share_inputs["stop_flags"].sum() == self.parallel_config.max_num_seqs
+            )
         self.share_inputs["seq_lens_this_time"] = self.seq_lens_this_time_buffer[:num_running_requests]

     def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: int = None):
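
The third hunk appears to be the "fix kill bug" part of this commit: when a step contains only preemptions, `not_need_stop` must stay true unless every slot's stop flag is set, otherwise the engine loop would exit while preempted requests' peers are still running. Below is a standalone sketch of that update logic, assuming numpy-backed `share_inputs` buffers as the diff suggests; the function name and harness are illustrative, not the commit's actual code:

    import numpy as np

    def update_not_need_stop(share_inputs, max_num_seqs,
                             has_prefill_task, has_decode_task,
                             has_preempted_task):
        # Fresh prefill or decode work always keeps the loop running.
        if has_prefill_task or has_decode_task:
            share_inputs["not_need_stop"][0] = True
        # On preemption, stop only if every slot is flagged stopped;
        # otherwise surviving requests still need forward steps.
        if has_preempted_task:
            all_stopped = share_inputs["stop_flags"].sum() == max_num_seqs
            share_inputs["not_need_stop"][0] = not all_stopped

    # Example: 4 slots, one request just preempted, two still running.
    share_inputs = {
        "not_need_stop": np.array([False]),
        "stop_flags": np.array([True, False, False, True]),
    }
    update_not_need_stop(share_inputs, max_num_seqs=4,
                         has_prefill_task=False, has_decode_task=False,
                         has_preempted_task=True)
    print(share_inputs["not_need_stop"][0])  # True -> keep stepping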