mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-15 13:10:55 +08:00
[Model] Qwen2.5VL support --use-cudagraph and unit testing (#4087)
* [BugFix] qwen2.5vl enable_thinking=true and image_patch_id bug fix * [Docs]offline infer add apply_chat_template add_generation_prompt parameter * [Model]qwen2.5VL support --use-cudagraph * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v2 * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v3 * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v4 * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v5 * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v6 * [Model]qwen2.5VL support --use-cudagraph buffer and qwenvl test v7
This commit is contained in:
@@ -27,6 +27,7 @@ from paddleformers.transformers.configuration_utils import PretrainedConfig
|
||||
from paddleformers.utils.log import logger
|
||||
|
||||
from fastdeploy.config import FDConfig
|
||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||
from fastdeploy.model_executor.graph_optimization.decorator import (
|
||||
support_graph_optimization,
|
||||
)
|
||||
@@ -39,12 +40,6 @@ from fastdeploy.model_executor.models.model_base import (
|
||||
ModelRegistry,
|
||||
)
|
||||
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer
|
||||
from fastdeploy.platforms import current_platform
|
||||
|
||||
if current_platform.is_cuda():
|
||||
from fastdeploy.model_executor.ops.gpu import extract_text_token_output
|
||||
|
||||
from fastdeploy.model_executor.forward_meta import ForwardMeta
|
||||
|
||||
|
||||
@support_graph_optimization
|
||||
@@ -108,31 +103,17 @@ class Qwen2_5_VLModel(nn.Layer):
|
||||
logger.info(f"Start load layer {i}")
|
||||
self.layers[i].load_state_dict(state_dict)
|
||||
|
||||
def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
    """Look up the token embeddings for the padding-removed input ids.

    Thin delegation to the embedding layer; visual-feature splicing is
    handled by the caller, not here.
    """
    token_embeddings = self.embed_tokens(ids_remove_padding=ids_remove_padding)
    return token_embeddings
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_embeddings: paddle.Tensor,
|
||||
ids_remove_padding: paddle.Tensor,
|
||||
image_features: Optional[paddle.Tensor],
|
||||
forward_meta: ForwardMeta,
|
||||
):
|
||||
|
||||
hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)
|
||||
|
||||
# -----------------------
|
||||
# 将 image_embeds 替换 input_embeds 里的 image video 占位符
|
||||
image_mask = ids_remove_padding == self.image_token_id
|
||||
image_token_num = image_mask.sum()
|
||||
|
||||
video_mask = ids_remove_padding == self.video_token_id
|
||||
video_token_num = video_mask.sum()
|
||||
|
||||
# 由于框架只有 image_features,所以目前不支持图片和视频混合
|
||||
# TODO(wangyafeng) 后续考虑支持传入 video_features
|
||||
if image_token_num > 0:
|
||||
hidden_states[image_mask] = image_features.cast(self._dtype)
|
||||
if video_token_num > 0:
|
||||
hidden_states[video_mask] = image_features.cast(self._dtype)
|
||||
|
||||
# -----------------------
|
||||
hidden_states = input_embeddings
|
||||
|
||||
residual = None
|
||||
for i in range(self.num_layers):
|
||||
@@ -144,18 +125,6 @@ class Qwen2_5_VLModel(nn.Layer):
|
||||
|
||||
hidden_states = hidden_states + residual
|
||||
|
||||
# -----------------------
|
||||
max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1)
|
||||
hidden_states = extract_text_token_output(
|
||||
max_seq_len,
|
||||
max_seq_len_index.cast("int32"),
|
||||
image_token_num.cast("int32"),
|
||||
forward_meta.seq_lens_this_time,
|
||||
forward_meta.cu_seqlens_q,
|
||||
hidden_states.cast("float32"),
|
||||
).cast(self._dtype)
|
||||
# -----------------------
|
||||
|
||||
out = self.norm(hidden_states)
|
||||
|
||||
return out
|
||||
@@ -183,6 +152,12 @@ class Qwen2_5_VLForConditionalGeneration(ModelForCasualLM):
|
||||
# ----------- language model -------------
|
||||
self.model = Qwen2_5_VLModel(fd_config=fd_config)
|
||||
|
||||
# Persistent buffers for CUDA graphs.
|
||||
self._input_embeddings = paddle.zeros(
|
||||
[fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size],
|
||||
dtype=fd_config.model_config.dtype,
|
||||
)
|
||||
|
||||
self.ori_vocab_size = fd_config.model_config.ori_vocab_size
|
||||
|
||||
self.lm_head = ParallelLMHead(
|
||||
@@ -246,14 +221,42 @@ class Qwen2_5_VLForConditionalGeneration(ModelForCasualLM):
|
||||
self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states)
|
||||
self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states)
|
||||
|
||||
def get_input_embeddings(
    self,
    ids_remove_padding: paddle.Tensor,
    image_features: Optional[paddle.Tensor] = None,
) -> paddle.Tensor:
    """Embed the input ids and splice visual features over placeholder tokens.

    Positions whose id equals the model's image/video placeholder token id
    are overwritten with ``image_features`` (cast to the model dtype).

    NOTE: the framework currently only supplies ``image_features``, so mixed
    image + video inputs are not supported — video placeholders are also
    filled from ``image_features``.
    TODO(wangyafeng): support a separate ``video_features`` input later.
    """
    embeddings = self.model.get_input_embeddings(ids_remove_padding=ids_remove_padding)

    # Boolean masks over the flattened (padding-removed) token sequence.
    is_image_token = ids_remove_padding == self.model.image_token_id
    is_video_token = ids_remove_padding == self.model.video_token_id

    if is_image_token.sum() > 0:
        embeddings[is_image_token] = image_features.cast(self.model._dtype)
    if is_video_token.sum() > 0:
        embeddings[is_video_token] = image_features.cast(self.model._dtype)

    return embeddings
|
||||
|
||||
def forward(
|
||||
self,
|
||||
ids_remove_padding: paddle.Tensor,
|
||||
image_features: Optional[paddle.Tensor],
|
||||
forward_meta: ForwardMeta,
|
||||
):
|
||||
input_embeddings = self.get_input_embeddings(
|
||||
ids_remove_padding=ids_remove_padding, image_features=image_features
|
||||
)
|
||||
self._input_embeddings.copy_(input_embeddings, False)
|
||||
|
||||
hidden_states = self.model(
|
||||
input_embeddings=self._input_embeddings,
|
||||
ids_remove_padding=ids_remove_padding,
|
||||
image_features=image_features,
|
||||
forward_meta=forward_meta,
|
||||
|
Reference in New Issue
Block a user