[Model] Qwen2.5VL: support --use-cudagraph and add unit tests (#4087)

* [BugFix] Fix qwen2.5vl enable_thinking=true and image_patch_id bugs

* [Docs] Offline inference: add the apply_chat_template add_generation_prompt parameter

* [Model] qwen2.5VL: support --use-cudagraph

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test v2

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test v3

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test v4

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test v5

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test v6

* [Model] qwen2.5VL: support --use-cudagraph buffer and qwenvl test v7
Author: CSWYF3634076
Date: 2025-09-24 19:45:01 +08:00
Committed by: GitHub
Parent: 18f4977aec
Commit: 5ff10c8ced
5 changed files with 1048 additions and 39 deletions
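
The core idea behind the --use-cudagraph support: a captured CUDA graph replays with fixed tensor addresses, so the embedding lookup and multimodal scatter are moved out of the captured region and their result is copied into a persistent buffer each step. Below is a minimal standalone sketch of that pattern, with illustrative names and shapes rather than the actual FastDeploy API:

import paddle

MAX_MODEL_LEN, HIDDEN_SIZE = 8192, 1024

# Allocated once; its device address never changes, which is what CUDA graph
# replay requires of every tensor read inside the captured region.
input_buffer = paddle.zeros([MAX_MODEL_LEN, HIDDEN_SIZE], dtype="float16")

def run_step(token_embeddings: paddle.Tensor, captured_model) -> paddle.Tensor:
    num_tokens = token_embeddings.shape[0]
    # Write in place: replacing the buffer with a freshly allocated tensor
    # would leave the captured graph reading a stale address.
    input_buffer[:num_tokens] = token_embeddings
    return captured_model(input_buffer)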


@@ -27,6 +27,7 @@ from paddleformers.transformers.configuration_utils import PretrainedConfig
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import (
support_graph_optimization,
)
@@ -39,12 +40,6 @@ from fastdeploy.model_executor.models.model_base import (
ModelRegistry,
)
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import extract_text_token_output
from fastdeploy.model_executor.forward_meta import ForwardMeta
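# support_graph_optimization wraps the decorated layer's forward so the graph
# optimization backend can capture and replay it (e.g. under --use-cudagraph).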
@support_graph_optimization
@@ -108,31 +103,17 @@ class Qwen2_5_VLModel(nn.Layer):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)
def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
return self.embed_tokens(ids_remove_padding=ids_remove_padding)
def forward(
self,
input_embeddings: paddle.Tensor,
ids_remove_padding: paddle.Tensor,
image_features: Optional[paddle.Tensor],
forward_meta: ForwardMeta,
):
hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)
# -----------------------
# Replace the image/video placeholder tokens in input_embeds with image_embeds
image_mask = ids_remove_padding == self.image_token_id
image_token_num = image_mask.sum()
video_mask = ids_remove_padding == self.video_token_id
video_token_num = video_mask.sum()
# The framework only provides image_features, so mixing images and videos is not yet supported
# TODO(wangyafeng): consider supporting passing in video_features later
if image_token_num > 0:
hidden_states[image_mask] = image_features.cast(self._dtype)
if video_token_num > 0:
hidden_states[video_mask] = image_features.cast(self._dtype)
# -----------------------
hidden_states = input_embeddings
residual = None
for i in range(self.num_layers):
@@ -144,18 +125,6 @@ class Qwen2_5_VLModel(nn.Layer):
hidden_states = hidden_states + residual
# -----------------------
max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1)
hidden_states = extract_text_token_output(
max_seq_len,
max_seq_len_index.cast("int32"),
image_token_num.cast("int32"),
forward_meta.seq_lens_this_time,
forward_meta.cu_seqlens_q,
hidden_states.cast("float32"),
).cast(self._dtype)
# -----------------------
out = self.norm(hidden_states)
return out
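# Note: the removed extract_text_token_output path above selected text tokens
# using per-step sequence lengths and token counts (dynamic shapes), which a
# captured CUDA graph cannot express; hoisting it out of the model forward is
# presumably part of making this path graph-capturable.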
@@ -183,6 +152,12 @@ class Qwen2_5_VLForConditionalGeneration(ModelForCasualLM):
# ----------- language model -------------
self.model = Qwen2_5_VLModel(fd_config=fd_config)
# Persistent buffers for CUDA graphs.
self._input_embeddings = paddle.zeros(
[fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size],
dtype=fd_config.model_config.dtype,
)
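# Sized to max_model_len up front so any batch fits without reallocation,
# keeping the buffer address stable across capture and replay.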
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
self.lm_head = ParallelLMHead(
@@ -246,14 +221,42 @@ class Qwen2_5_VLForConditionalGeneration(ModelForCasualLM):
self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states)
self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states)
def get_input_embeddings(
self,
ids_remove_padding: paddle.Tensor,
image_features: Optional[paddle.Tensor] = None,
) -> paddle.Tensor:
input_embeddings = self.model.get_input_embeddings(ids_remove_padding=ids_remove_padding)
image_mask = ids_remove_padding == self.model.image_token_id
image_token_num = image_mask.sum()
video_mask = ids_remove_padding == self.model.video_token_id
video_token_num = video_mask.sum()
# The framework only provides image_features, so mixing images and videos is not yet supported
# TODO(wangyafeng): consider supporting passing in video_features later
if image_token_num > 0:
input_embeddings[image_mask] = image_features.cast(self.model._dtype)
if video_token_num > 0:
input_embeddings[video_mask] = image_features.cast(self.model._dtype)
return input_embeddings
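# Note: the boolean-mask assignments above scatter one image_features row per
# placeholder position, in order, so image_features must contain exactly as
# many rows as there are placeholder tokens selected by the mask.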
def forward(
self,
ids_remove_padding: paddle.Tensor,
image_features: Optional[paddle.Tensor],
forward_meta: ForwardMeta,
):
input_embeddings = self.get_input_embeddings(
ids_remove_padding=ids_remove_padding, image_features=image_features
)
self._input_embeddings.copy_(input_embeddings, False)
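# In-place copy into the persistent buffer: the captured graph reads this
# exact tensor address on every replay, so self.model below must consume the
# buffer rather than the freshly computed input_embeddings.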
hidden_states = self.model(
input_embeddings=self._input_embeddings,
ids_remove_padding=ids_remove_padding,
image_features=image_features,
forward_meta=forward_meta,