Enable SOT D2St in Multimodal Model (#2735)
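This commit routes the Ernie4_5_VL multimodal model through FastDeploy's graph-optimization path: the support_graph_optimization decorator is applied to Ernie4_5_VLModel, the model invocation in Ernie4_5_VLMoeForConditionalGeneration switches from positional to keyword arguments, and a GraphOptimizationConfig is threaded through the GPU vision-language runner into build_stream_line_model. SOT D2St is PaddlePaddle's Symbolic Opcode Translator dynamic-to-static conversion, which traces eager-mode Python into a static graph at run time.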
@@ -27,6 +27,8 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.distributed.communication_op import \
     tensor_model_parallel_all_reduce
+from fastdeploy.model_executor.graph_optimization.decorator import \
+    support_graph_optimization
 from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
 from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
@@ -318,6 +320,7 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
         return hidden_states, residual


+@support_graph_optimization
 class Ernie4_5_VLModel(nn.Layer):

     def __init__(
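The implementation of support_graph_optimization is not part of this diff. As a rough orientation, a minimal sketch of the pattern, assuming the decorator lazily converts forward with paddle.jit.to_static, could look like the following; this is an illustration, not FastDeploy's actual code:

import paddle


def support_graph_optimization(cls):
    """Class decorator: route cls.forward through dynamic-to-static (D2St).

    Illustrative sketch only; the real decorator lives in
    fastdeploy.model_executor.graph_optimization.decorator.
    """
    eager_forward = cls.forward

    def forward(self, **kwargs):
        # Convert lazily on first call so that model construction and
        # weight loading still happen in eager mode.
        if getattr(self, "_static_forward", None) is None:
            self._static_forward = paddle.jit.to_static(eager_forward)
        return self._static_forward(self, **kwargs)

    cls.forward = forward
    return cls

Note that a wrapper written this way accepts keyword arguments only, which matches the call-site change in the next hunk.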
@@ -512,7 +515,8 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
         image_features: paddle.Tensor,
         forward_meta: ForwardMeta,
     ):
-        hidden_states = self.model(ids_remove_padding, image_features,
-                                   forward_meta)
+        hidden_states = self.model(ids_remove_padding=ids_remove_padding,
+                                   image_features=image_features,
+                                   forward_meta=forward_meta)

         return hidden_states
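The call into self.model moves from positional to keyword arguments. Assuming the decorator installs a wrapper with a signature like forward(self, **kwargs), as in the sketch above, positional calls would stop working once the class is decorated. The toy code below, which is not FastDeploy code, demonstrates the effect:

class Toy:
    def forward(self, x, y):
        return x + y


def kwargs_only(cls):
    # Same shape as the hypothetical wrapper above: **kwargs only.
    eager_forward = cls.forward

    def forward(self, **kwargs):
        return eager_forward(self, **kwargs)

    cls.forward = forward
    return cls


Toy = kwargs_only(Toy)
t = Toy()
print(t.forward(x=1, y=2))  # OK: prints 3
# t.forward(1, 2)           # TypeError: positional arguments no longer accepted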
@@ -13,10 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+import argparse
 import json
 import os
 import random
-import argparse
+from typing import Optional

 import numpy as np
 import paddle
@@ -24,6 +25,10 @@ import paddle.distributed.fleet as fleet
 from paddleformers.transformers.model_utils import load_tp_checkpoint
 from safetensors import safe_open

+from fastdeploy.config import (DeviceConfig, FDConfig, GraphOptimizationConfig,
+                               KVCacheConfig, LoadConfig, ModelConfig,
+                               MoEConfig, MoEPhase, ParallelConfig,
+                               SpeculativeConfig)
 from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
 from fastdeploy.input.mm_processor import DataProcessor
 from fastdeploy.model_executor.layers.attention import get_attention_backend
@@ -44,9 +49,6 @@ from fastdeploy.platforms import current_platform
 from fastdeploy.worker.forward_meta import ForwardMeta
 from fastdeploy.worker.utils import check_safetensors_model
 from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
-from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig,
-                               LoadConfig, ModelConfig, MoEConfig,
-                               MoEPhase, ParallelConfig, SpeculativeConfig)

 if current_platform.is_cuda() and current_platform.available():
     from fastdeploy.model_executor.layers.utils import (
@@ -268,6 +270,10 @@ class GPUVLModelRunner(VLModelRunnerBase):
                 -1)
         self.image_preprocess = image_preprocess

+        graph_opt_config = GraphOptimizationConfig(
+            self.args.enable_static_graph_inference, self.args.use_cudagraph,
+            self.args.max_capture_batch_size)
+
         fd_config, self.model = build_stream_line_model(
             self.args.model_name_or_path,
             self.args.dtype,
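GraphOptimizationConfig is built from three runner arguments. Its definition lives in fastdeploy.config and is not shown in this diff; inferring only from the call site, a minimal sketch would be:

from dataclasses import dataclass


@dataclass
class GraphOptimizationConfig:
    # Field names are taken from the argparse flags at the call site; the
    # real class may validate values and carry additional fields.
    enable_static_graph_inference: bool = False  # run forward through D2St
    use_cudagraph: bool = False                  # capture/replay with CUDA Graphs
    max_capture_batch_size: int = 0              # largest batch size to capture


cfg = GraphOptimizationConfig(True, False, 64)  # positional, as at the call site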
@@ -275,6 +281,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
             max_model_len=self.args.max_model_len,
             tokenizer=tokenizer,
             quantization=self.args.quantization,
+            graph_opt_config=graph_opt_config,
         )
         self.model.eval()
         self.set_state_dict(self.args)
@@ -1050,6 +1057,7 @@ def build_stream_line_model(
         max_model_len: int,
         tokenizer: ErnieBotTokenizer,
         quantization: str = "None",
+        graph_opt_config: Optional[GraphOptimizationConfig] = None
 ) -> tuple[FDConfig, paddle.nn.layer]:
     """
     build model
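Because graph_opt_config defaults to None, existing callers of build_stream_line_model are unaffected; only call sites that opt in, such as the runner change above, pass a config.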
@@ -1221,6 +1229,7 @@ def build_stream_line_model(
         moe_config=moe_config,
         quant_config=quant_config,
         kv_cache_config=kv_cache_config,
+        graph_opt_config=graph_opt_config,
     )
     fd_config.parallel_config.max_model_len = max_model_len
     fd_config.model_config.rope_theta = rope_theta