[Optimization] Support multi-batch prefill for Qwen2.5-VL (#5269)

* update

* fix

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix dict access

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Authored by Ayakouji on 2025-12-05 18:22:39 +08:00, committed by GitHub
parent 8f2b85362d · commit a8f8791668
3 changed files with 23 additions and 12 deletions


@@ -654,9 +654,11 @@ class ResourceManagerV1(ResourceManager):
                     break
                 request = self.waiting[0]
-                if (self._is_mm_request(request) and self.exist_mm_prefill(scheduled_reqs)) or (
-                    paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)
-                ):
+                if (
+                    not envs.FD_ENABLE_MAX_PREFILL
+                    and self._is_mm_request(request)
+                    and self.exist_mm_prefill(scheduled_reqs)
+                ) or (paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)):
                     break
                 if request.status == RequestStatus.WAITING:
                     result = self._waiting_async_process(request)
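
The scheduler change above is the core of the optimization: previously, any multimodal request stopped scheduling as soon as one multimodal prefill was already in the step; with FD_ENABLE_MAX_PREFILL set, that per-step limit is lifted. A minimal, self-contained sketch of the resulting guard logic (the function and parameter names below are illustrative, not part of FastDeploy):

def should_stop_scheduling(request, scheduled_reqs, *, max_prefill_enabled,
                           is_mm_request, exist_mm_prefill, is_xpu, exist_prefill):
    # With multi-batch prefill disabled, only one multimodal prefill may be
    # scheduled per step; on XPU builds, only one prefill of any kind.
    mm_limit_hit = (
        not max_prefill_enabled
        and is_mm_request(request)
        and exist_mm_prefill(scheduled_reqs)
    )
    xpu_limit_hit = is_xpu and exist_prefill(scheduled_reqs)
    return mm_limit_hit or xpu_limit_hit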


@@ -48,6 +48,7 @@ from fastdeploy.utils import (
     ParameterError,
     StatefulSemaphore,
     api_server_logger,
+    to_tensor,
 )
@@ -387,6 +388,8 @@ class EngineClient:
         if not self.enable_mm:
             self.zmq_client.send_json(task)
         else:
+            if envs.FD_ENABLE_E2W_TENSOR_CONVERT:
+                to_tensor([task])
             self.zmq_client.send_pyobj(task)
 
     def valid_parameters(self, data):
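
Here the engine-side client optionally converts task payloads to tensors before sending them to the worker over ZMQ. The real fastdeploy.utils.to_tensor is not shown in this diff, so the sketch below only illustrates the assumed idea (numpy arrays in each task converted in place to paddle Tensors); every name other than to_tensor and FD_ENABLE_E2W_TENSOR_CONVERT is hypothetical:

import numpy as np
import paddle

def to_tensor_sketch(tasks):
    # Assumed behavior only: walk each task dict and replace numpy arrays
    # with paddle Tensors so the worker can use them without re-conversion.
    for task in tasks:
        mm_inputs = task.get("multimodal_inputs", {})  # hypothetical key
        if isinstance(mm_inputs, dict):
            for key, value in mm_inputs.items():
                if isinstance(value, np.ndarray):
                    mm_inputs[key] = paddle.to_tensor(value)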


@@ -478,12 +478,14 @@ class GPUModelRunner(ModelRunnerBase):
                 multi_vision_inputs["grid_thw_lst"].extend(
                     inputs["grid_thw"][request.num_image_start : request.num_image_end]
                 )
-                multi_vision_inputs["cu_seqlens"].extend(
-                    inputs["vit_seqlen"][request.num_image_start : request.num_image_end]
-                )
-                multi_vision_inputs["vit_position_ids_lst"].extend(
-                    inputs["vit_position_ids"][request.num_image_start : request.num_image_end]
-                )
+                if "vit_seqlen" in inputs:
+                    multi_vision_inputs["cu_seqlens"].extend(
+                        inputs["vit_seqlen"][request.num_image_start : request.num_image_end]
+                    )
+                if "vit_position_ids" in inputs:
+                    multi_vision_inputs["vit_position_ids_lst"].extend(
+                        inputs["vit_position_ids"][request.num_image_start : request.num_image_end]
+                    )
             else:
                 vision_inputs = inputs
                 if self.encoder_cache:
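
This hunk makes the batch-wide accumulation tolerant of preprocessors that do not emit vit_seqlen or vit_position_ids (the earlier unconditional dict accesses raised KeyError, which is likely the "fix dict access" mentioned in the commit message). A condensed sketch of the accumulation step, assuming multi_vision_inputs is a dict of plain Python lists shared by all prefill requests in the step:

def accumulate_vision_inputs(multi_vision_inputs, inputs, start, end):
    # start/end correspond to request.num_image_start / request.num_image_end.
    # grid_thw is always present; vit_seqlen and vit_position_ids may be
    # missing depending on the preprocessor, hence the membership checks.
    multi_vision_inputs["grid_thw_lst"].extend(inputs["grid_thw"][start:end])
    if "vit_seqlen" in inputs:
        multi_vision_inputs["cu_seqlens"].extend(inputs["vit_seqlen"][start:end])
    if "vit_position_ids" in inputs:
        multi_vision_inputs["vit_position_ids_lst"].extend(inputs["vit_position_ids"][start:end])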
@@ -2737,9 +2739,13 @@ class GPUModelRunner(ModelRunnerBase):
         return image_features
 
     def extract_vision_features_qwen(self, inputs: list[paddle.Tensor]) -> paddle.Tensor:
-        assert inputs["images"] is not None
-        grid_thw = inputs["grid_thw"]
-        images = inputs["images"]
+        if envs.FD_ENABLE_MAX_PREFILL:
+            images = paddle.concat(inputs["images_lst"]).cast("bfloat16")
+            grid_thw = paddle.to_tensor(inputs["grid_thw_lst"], dtype="int64")
+        else:
+            assert inputs["images"] is not None
+            grid_thw = inputs["grid_thw"]
+            images = inputs["images"]
         with paddle.amp.auto_cast(
             True,
             custom_black_list=self.amp_black,
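
The two gpu_model_runner hunks work together: the first collects every scheduled request's image metadata into multi_vision_inputs, and this one fuses those lists into a single ViT forward pass instead of one pass per request. A small sketch of just the fusing step under FD_ENABLE_MAX_PREFILL, assuming the list contents match what was accumulated above (the function name is illustrative):

import paddle

def fuse_batched_vision_inputs(inputs):
    # Mirror of the FD_ENABLE_MAX_PREFILL branch: concatenate all requests'
    # image patches along dim 0, cast to bfloat16 for the ViT, and stack the
    # per-image (t, h, w) grids into one int64 tensor.
    images = paddle.concat(inputs["images_lst"]).cast("bfloat16")
    grid_thw = paddle.to_tensor(inputs["grid_thw_lst"], dtype="int64")
    return images, grid_thw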