mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[Optimization] Qwen2.5-VL support multi-batch prefill (#5269)
* update
* fix

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix dict access

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
@@ -654,9 +654,11 @@ class ResourceManagerV1(ResourceManager):
|
||||
break
|
||||
|
||||
request = self.waiting[0]
|
||||
if (self._is_mm_request(request) and self.exist_mm_prefill(scheduled_reqs)) or (
|
||||
paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)
|
||||
):
|
||||
if (
|
||||
not envs.FD_ENABLE_MAX_PREFILL
|
||||
and self._is_mm_request(request)
|
||||
and self.exist_mm_prefill(scheduled_reqs)
|
||||
) or (paddle.is_compiled_with_xpu() and self.exist_prefill(scheduled_reqs)):
|
||||
break
|
||||
if request.status == RequestStatus.WAITING:
|
||||
result = self._waiting_async_process(request)
|
||||
|
||||
@@ -48,6 +48,7 @@ from fastdeploy.utils import (
|
||||
ParameterError,
|
||||
StatefulSemaphore,
|
||||
api_server_logger,
|
||||
to_tensor,
|
||||
)
|
||||
|
||||
|
||||
@@ -387,6 +388,8 @@ class EngineClient:
|
||||
if not self.enable_mm:
|
||||
self.zmq_client.send_json(task)
|
||||
else:
|
||||
if envs.FD_ENABLE_E2W_TENSOR_CONVERT:
|
||||
to_tensor([task])
|
||||
self.zmq_client.send_pyobj(task)
|
||||
|
||||
def valid_parameters(self, data):
|
||||
|
||||
@@ -478,12 +478,14 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
multi_vision_inputs["grid_thw_lst"].extend(
|
||||
inputs["grid_thw"][request.num_image_start : request.num_image_end]
|
||||
)
|
||||
multi_vision_inputs["cu_seqlens"].extend(
|
||||
inputs["vit_seqlen"][request.num_image_start : request.num_image_end]
|
||||
)
|
||||
multi_vision_inputs["vit_position_ids_lst"].extend(
|
||||
inputs["vit_position_ids"][request.num_image_start : request.num_image_end]
|
||||
)
|
||||
if "vit_seqlen" in inputs:
|
||||
multi_vision_inputs["cu_seqlens"].extend(
|
||||
inputs["vit_seqlen"][request.num_image_start : request.num_image_end]
|
||||
)
|
||||
if "vit_position_ids" in inputs:
|
||||
multi_vision_inputs["vit_position_ids_lst"].extend(
|
||||
inputs["vit_position_ids"][request.num_image_start : request.num_image_end]
|
||||
)
|
||||
else:
|
||||
vision_inputs = inputs
|
||||
if self.encoder_cache:
|
||||
@@ -2737,9 +2739,13 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
return image_features
|
||||
|
||||
def extract_vision_features_qwen(self, inputs: list[paddle.Tensor]) -> paddle.Tensor:
|
||||
assert inputs["images"] is not None
|
||||
grid_thw = inputs["grid_thw"]
|
||||
images = inputs["images"]
|
||||
if envs.FD_ENABLE_MAX_PREFILL:
|
||||
images = paddle.concat(inputs["images_lst"]).cast("bfloat16")
|
||||
grid_thw = paddle.to_tensor(inputs["grid_thw_lst"], dtype="int64")
|
||||
else:
|
||||
assert inputs["images"] is not None
|
||||
grid_thw = inputs["grid_thw"]
|
||||
images = inputs["images"]
|
||||
with paddle.amp.auto_cast(
|
||||
True,
|
||||
custom_black_list=self.amp_black,
|
||||
|
||||
Reference in New Issue
Block a user