Enable SOT D2St in Multimodal Model (#2735)

Author: Ryan
Date: 2025-07-09 12:26:18 +08:00
Committed by: GitHub
Parent: f7cad30a38
Commit: c4718fd693
2 changed files with 19 additions and 6 deletions


@@ -27,6 +27,8 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.distributed.communication_op import \
     tensor_model_parallel_all_reduce
+from fastdeploy.model_executor.graph_optimization.decorator import \
+    support_graph_optimization
 from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
 from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
 from fastdeploy.model_executor.layers.moe.moe import FusedMoE
@@ -318,6 +320,7 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
         return hidden_states, residual
 
 
+@support_graph_optimization
 class Ernie4_5_VLModel(nn.Layer):
 
     def __init__(
@@ -512,7 +515,8 @@ class Ernie4_5_VLMoeForConditionalGeneration(ModelForCasualLM):
         image_features: paddle.Tensor,
         forward_meta: ForwardMeta,
     ):
-        hidden_states = self.model(ids_remove_padding, image_features,
-                                   forward_meta)
+        hidden_states = self.model(ids_remove_padding=ids_remove_padding,
+                                   image_features=image_features,
+                                   forward_meta=forward_meta)
         return hidden_states
 
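
Aside: a minimal, self-contained sketch of the pattern this file now follows, i.e. decorate the inner backbone class and call it with keyword arguments from the wrapper. The toy_graph_optimization decorator below is a hypothetical stand-in for FastDeploy's support_graph_optimization (imported above) and only mimics the calling convention; TinyVLModel and the tensor shapes are illustrative.

import paddle
from paddle import nn


def toy_graph_optimization(cls):
    """Hypothetical stand-in for support_graph_optimization: wraps forward()
    so the model is always invoked with named inputs."""
    original_forward = cls.forward

    def wrapped_forward(self, **kwargs):
        # A real graph-optimization wrapper would capture and replay an
        # optimized static graph here; the toy version just forwards kwargs.
        return original_forward(self, **kwargs)

    cls.forward = wrapped_forward
    return cls


@toy_graph_optimization
class TinyVLModel(nn.Layer):
    """Stand-in for Ernie4_5_VLModel: the decorator sits on the inner
    backbone, not on the outer *ForConditionalGeneration wrapper."""

    def __init__(self, hidden_size: int = 8):
        super().__init__()
        self.proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, ids_remove_padding=None, image_features=None,
                forward_meta=None):
        # The real model fuses token ids and image features; here we only
        # project the image features so the sketch stays runnable.
        return self.proj(image_features)


model = TinyVLModel()
out = model(ids_remove_padding=paddle.zeros([4], dtype="int64"),
            image_features=paddle.rand([4, 8]),
            forward_meta=None)  # keyword call, mirroring the change above
print(out.shape)  # [4, 8]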


@@ -13,10 +13,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
+import argparse
 import json
 import os
 import random
-import argparse
+from typing import Optional
 
 import numpy as np
 import paddle
@@ -24,6 +25,10 @@ import paddle.distributed.fleet as fleet
 from paddleformers.transformers.model_utils import load_tp_checkpoint
 from safetensors import safe_open
 
+from fastdeploy.config import (DeviceConfig, FDConfig, GraphOptimizationConfig,
+                               KVCacheConfig, LoadConfig, ModelConfig,
+                               MoEConfig, MoEPhase, ParallelConfig,
+                               SpeculativeConfig)
 from fastdeploy.input.ernie_tokenizer import ErnieBotTokenizer
 from fastdeploy.input.mm_processor import DataProcessor
 from fastdeploy.model_executor.layers.attention import get_attention_backend
@@ -44,9 +49,6 @@ from fastdeploy.platforms import current_platform
 from fastdeploy.worker.forward_meta import ForwardMeta
 from fastdeploy.worker.utils import check_safetensors_model
 from fastdeploy.worker.vl_model_runner_base import VLModelRunnerBase
-from fastdeploy.config import (DeviceConfig, FDConfig, KVCacheConfig,
-                               LoadConfig, ModelConfig, MoEConfig,
-                               MoEPhase, ParallelConfig, SpeculativeConfig)
 
 if current_platform.is_cuda() and current_platform.available():
     from fastdeploy.model_executor.layers.utils import (
@@ -268,6 +270,10 @@ class GPUVLModelRunner(VLModelRunnerBase):
                                                 -1)
         self.image_preprocess = image_preprocess
 
+        graph_opt_config = GraphOptimizationConfig(
+            self.args.enable_static_graph_inference, self.args.use_cudagraph,
+            self.args.max_capture_batch_size)
+
         fd_config, self.model = build_stream_line_model(
             self.args.model_name_or_path,
             self.args.dtype,
@@ -275,6 +281,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
             max_model_len=self.args.max_model_len,
             tokenizer=tokenizer,
             quantization=self.args.quantization,
+            graph_opt_config=graph_opt_config,
         )
         self.model.eval()
         self.set_state_dict(self.args)
@@ -1050,6 +1057,7 @@ def build_stream_line_model(
     max_model_len: int,
     tokenizer: ErnieBotTokenizer,
     quantization: str = "None",
+    graph_opt_config: Optional[GraphOptimizationConfig] = None
 ) -> tuple[FDConfig, paddle.nn.layer]:
     """
     build model
@@ -1221,6 +1229,7 @@ def build_stream_line_model(
         moe_config=moe_config,
         quant_config=quant_config,
         kv_cache_config=kv_cache_config,
+        graph_opt_config=graph_opt_config,
     )
     fd_config.parallel_config.max_model_len = max_model_len
     fd_config.model_config.rope_theta = rope_theta
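
Taken together, the runner-side changes build a GraphOptimizationConfig from the runner's CLI arguments and thread it through build_stream_line_model into FDConfig. Below is a condensed sketch of that wiring, assuming the three positional arguments of GraphOptimizationConfig are exactly the ones shown above (enable_static_graph_inference, use_cudagraph, max_capture_batch_size); the argparse Namespace and its values are illustrative.

from argparse import Namespace

from fastdeploy.config import GraphOptimizationConfig

# Illustrative stand-in for the runner's parsed CLI arguments.
args = Namespace(enable_static_graph_inference=True,
                 use_cudagraph=False,
                 max_capture_batch_size=64)

# GPUVLModelRunner now derives the graph-optimization settings from its args...
graph_opt_config = GraphOptimizationConfig(
    args.enable_static_graph_inference, args.use_cudagraph,
    args.max_capture_batch_size)

# ...and passes them as a new optional keyword argument of
# build_stream_line_model, which in turn forwards them to FDConfig:
#
#     fd_config, model = build_stream_line_model(
#         args.model_name_or_path, args.dtype, ...,
#         max_model_len=args.max_model_len,
#         tokenizer=tokenizer,
#         quantization=args.quantization,
#         graph_opt_config=graph_opt_config,
#     )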