[BugFix] support real batch_size (#3109)

* support real bsz * fix * fix xpu_model_runner.py,gpu_model_runner.py,gcu_model_runner.py,iluvatar_model_runner.py * add event_loop_ep * fix * Add comments * fix * support mtp real_batch_size * fix * self.tmp_seq_lens_this_time->self.seq_lens_this_time_buffer * fix * fix VL real_seq_lens_this_time * fix * fix mtp * fix * fix mtp * fix xpu * fix
2025-10-05 16:48:03 +08:00 · 2025-08-05 16:33:54 +08:00
parent 55939f7942
commit b01cfd6007
10 changed files with 110 additions and 58 deletions
--- a/fastdeploy/worker/gcu_worker.py
+++ b/fastdeploy/worker/gcu_worker.py
@@ -105,17 +105,18 @@ class GcuWorker(WorkerBase):
    def execute_model(
        self,
        model_forward_batch: Optional[List[Request]] = None,
+        num_running_requests: int = None,
    ) -> Optional[ModelRunnerOutput]:
        """ """
-        output = self.model_runner.execute_model(model_forward_batch)
+        output = self.model_runner.execute_model(model_forward_batch, num_running_requests)
        return output

-    def preprocess_new_task(self, req_dicts: List[Request]) -> None:
+    def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int) -> None:
        """Process new requests and then start the decode loop
        TODO(gongshaotian):The scheduler should schedule the handling of prefill,
        and workers and modelrunners should not perceive it.
        """
-        self.model_runner.insert_prefill_inputs(req_dicts=req_dicts)
+        self.model_runner.insert_prefill_inputs(req_dicts=req_dicts, num_running_requests=num_running_requests)

    def graph_optimize_and_warm_up_model(self) -> None:
        """