[BugFix] support real batch_size (#3109)

* support real batch size (bsz)

* fix

* fix xpu_model_runner.py,gpu_model_runner.py,gcu_model_runner.py,iluvatar_model_runner.py

* add event_loop_ep

* fix

* Add comments

* fix

* support mtp real_batch_size

* fix

* rename self.tmp_seq_lens_this_time -> self.seq_lens_this_time_buffer

* fix

* fix VL real_seq_lens_this_time

* fix

* fix mtp

* fix

* fix mtp

* fix xpu

* fix
This commit is contained in:
lizexu123
2025-08-05 16:33:54 +08:00
committed by GitHub
parent 55939f7942
commit b01cfd6007
10 changed files with 110 additions and 58 deletions

View File

@@ -105,17 +105,18 @@ class GcuWorker(WorkerBase):
     def execute_model(
         self,
         model_forward_batch: Optional[List[Request]] = None,
+        num_running_requests: int = None,
     ) -> Optional[ModelRunnerOutput]:
         """ """
-        output = self.model_runner.execute_model(model_forward_batch)
+        output = self.model_runner.execute_model(model_forward_batch, num_running_requests)
         return output

-    def preprocess_new_task(self, req_dicts: List[Request]) -> None:
+    def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int) -> None:
         """Process new requests and then start the decode loop
         TODO(gongshaotian):The scheduler should schedule the handling of prefill,
         and workers and modelrunners should not perceive it.
         """
-        self.model_runner.insert_prefill_inputs(req_dicts=req_dicts)
+        self.model_runner.insert_prefill_inputs(req_dicts=req_dicts, num_running_requests=num_running_requests)

     def graph_optimize_and_warm_up_model(self) -> None:
         """