diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 077e945d1..ed5b7ed58 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -27,6 +27,8 @@ from paddleformers.utils.log import logger from fastdeploy.config import FDConfig from fastdeploy.distributed.communication_op import \ tensor_model_parallel_all_reduce +from fastdeploy.model_executor.graph_optimization.decorator import \ + support_graph_optimization from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding from fastdeploy.model_executor.layers.lm_head import ParallelLMHead from fastdeploy.model_executor.layers.moe.moe import FusedMoE @@ -318,6 +320,7 @@ class Ernie4_5_VLDecoderLayer(nn.Layer): return hidden_states, residual +@support_graph_optimization class Ernie4_5_VLModel(nn.Layer): def __init__( @@ -512,7 +515,8 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM): image_features: paddle.Tensor, forward_meta: ForwardMeta, ): - hidden_states = self.model(ids_remove_padding, image_features, - forward_meta) + hidden_states = self.model(ids_remove_padding=ids_remove_padding, + image_features=image_features, + forward_meta=forward_meta) return hidden_states diff --git a/fastdeploy/worker/vl_gpu_model_runner.py b/fastdeploy/worker/vl_gpu_model_runner.py index f48cefe8f..e78c30946 100644 --- a/fastdeploy/worker/vl_gpu_model_runner.py +++ b/fastdeploy/worker/vl_gpu_model_runner.py @@ -13,10 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. """ +import argparse import json import os import random -import argparse +from typing import Optional import numpy as np import paddle @@ -24,6 +25,10 @@ import paddle.distributed.fleet as fleet from paddleformers.transformers.model_utils import load_tp_checkpoint from safetensors import safe_open +from fastdeploy.config import (DeviceConfig, FDConfig, GraphOptimizationConfig, + KVCacheConfig, LoadConfig, ModelConfig, + MoEConfig, MoEPhase, ParallelConfig, + SpeculativeConfig) from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer from fastdeploy.input.mm_processor import DataProcessor from fastdeploy.model_executor.layers.attention import get_attention_backend @@ -44,9 +49,6 @@ from fastdeploy.platforms import current_platform from fastdeploy.worker.forward_meta import ForwardMeta from fastdeploy.worker.utils import check_safetensors_model from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase -from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig, - LoadConfig, ModelConfig, MoEConfig, - MoEPhase, ParallelConfig, SpeculativeConfig) if current_platform.is_cuda() and current_platform.available(): from fastdeploy.model_executor.layers.utils import ( @@ -268,6 +270,10 @@ class GPUVLModelRunner(VLModelRunnerBase): -1) self.image_preprocess = image_preprocess + graph_opt_config = GraphOptimizationConfig( + self.args.enable_static_graph_inference, self.args.use_cudagraph, + self.args.max_capture_batch_size) + fd_config, self.model = build_stream_line_model( self.args.model_name_or_path, self.args.dtype, @@ -275,6 +281,7 @@ class GPUVLModelRunner(VLModelRunnerBase): max_model_len=self.args.max_model_len, tokenizer=tokenizer, quantization=self.args.quantization, + graph_opt_config=graph_opt_config, ) self.model.eval() self.set_state_dict(self.args) @@ -1050,6 +1057,7 @@ def build_stream_line_model( max_model_len: int, tokenizer: ErnieBotTokenizer, quantization: str = "None", + graph_opt_config: Optional[GraphOptimizationConfig] = None ) -> tuple[FDConfig, paddle.nn.layer]: """ build model @@ -1221,6 +1229,7 @@ def build_stream_line_model( moe_config=moe_config, quant_config=quant_config, kv_cache_config=kv_cache_config, + graph_opt_config=graph_opt_config, ) fd_config.parallel_config.max_model_len = max_model_len fd_config.model_config.rope_theta = rope_theta