[BugFix] support real batch_size (#3109)

* support real bsz
* fix
* fix xpu_model_runner.py, gpu_model_runner.py, gcu_model_runner.py, iluvatar_model_runner.py
* add event_loop_ep
* fix
* Add comments
* fix
* support mtp real_batch_size
* fix
* self.tmp_seq_lens_this_time -> self.seq_lens_this_time_buffer
* fix
* fix VL real_seq_lens_this_time
* fix
* fix mtp
* fix
* fix mtp
* fix xpu
* fix
2025-10-05 08:37:06 +08:00 · 2025-08-05 16:33:54 +08:00
parent 55939f7942
commit b01cfd6007
10 changed files with 110 additions and 58 deletions
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -145,9 +145,14 @@ class XpuWorker(WorkerBase):
    def execute_model(
        self,
        model_forward_batch: Optional[List[Request]] = None,
+        is_dummy_run: bool = False,
+        num_running_requests: Optional[int] = None,
    ) -> Optional[ModelRunnerOutput]:
        """ """
-        output = self.model_runner.execute_model(model_forward_batch)
+        if is_dummy_run:
+            output = self.model_runner.execute_model(model_forward_batch)
+        else:
+            output = self.model_runner.execute_model(model_forward_batch, num_running_requests)
        return output

    def exist_prefill(self):
@@ -156,15 +161,15 @@ class XpuWorker(WorkerBase):
        """
        return self.model_runner.exist_prefill()

-    def preprocess_new_task(self, req_dicts: List[Request]) -> None:
+    def preprocess_new_task(self, req_dicts: List[Request], num_running_requests: int) -> None:
        """Process new requests and then start the decode loop
        TODO(gongshaotian):The scheduler should schedule the handling of prefill,
        and workers and modelrunners should not perceive it.
        """
        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
-            self.model_runner.insert_tasks_v1(req_dicts=req_dicts)
+            self.model_runner.insert_tasks_v1(req_dicts=req_dicts, num_running_requests=num_running_requests)
        else:
-            self.model_runner.process_prefill_inputs(req_dicts=req_dicts)
+            self.model_runner.process_prefill_inputs(req_dicts=req_dicts, num_running_requests=num_running_requests)

    def check_health(self) -> bool:
        """ """