[BugFix] support real batch_size (#3109)

* support real bsz

* fix

* fix xpu_model_runner.py, gpu_model_runner.py, gcu_model_runner.py, iluvatar_model_runner.py

* add event_loop_ep

* fix

* Add comments

* fix

* support mtp real_batch_size

* fix

* rename self.tmp_seq_lens_this_time -> self.seq_lens_this_time_buffer

* fix

* fix VL real_seq_lens_this_time

* fix

* fix mtp

* fix

* fix mtp

* fix xpu

* fix
This commit is contained in:
lizexu123
2025-08-05 16:33:54 +08:00
committed by GitHub
parent 55939f7942
commit b01cfd6007
10 changed files with 110 additions and 58 deletions

View File

@@ -145,9 +145,14 @@ class XpuWorker(WorkerBase):
def execute_model(
self,
model_forward_batch: Optional[List[Request]] = None,
is_dummy_run: bool = False,
num_running_requests: Optional[int] = None,
) -> Optional[ModelRunnerOutput]:
""" """
output = self.model_runner.execute_model(model_forward_batch)
if is_dummy_run:
output = self.model_runner.execute_model(model_forward_batch)
else:
output = self.model_runner.execute_model(model_forward_batch, num_running_requests)
return output
def exist_prefill(self):
@@ -156,15 +161,15 @@ class XpuWorker(WorkerBase):
"""
return self.model_runner.exist_prefill()
def preprocess_new_task(self, req_dicts: List[Request]) -> None:
def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int) -> None:
"""Process new requests and then start the decode loop
TODO(gongshaotian):The scheduler should schedule the handling of prefill,
and workers and modelrunners should not perceive it.
"""
if envs.ENABLE_V1_KVCACHE_SCHEDULER:
self.model_runner.insert_tasks_v1(req_dicts=req_dicts)
self.model_runner.insert_tasks_v1(req_dicts=req_dicts, num_running_requests=num_running_requests)
else:
self.model_runner.process_prefill_inputs(req_dicts=req_dicts)
self.model_runner.process_prefill_inputs(req_dicts=req_dicts, num_running_requests=num_running_requests)
def check_health(self) -> bool:
""" """