Simplify the Config code (#2770)

* simplify the code

* fix vl

* delete config

* fix

* perfect code

* fix ci

* fix xpu

* fix xpu

* fix server

* resolve conflict

* fix mtp

* resolve conflict

* fix xpu

* fix xpu

* fix vl

* fix log

* fix qwen moe

* fix qwen moe

* fix qwen moe
YuanRisheng committed on 2025-07-14 19:50:05 +08:00 (committed by GitHub)
parent 2e81792d64
commit 4c7b8bc458
34 changed files with 551 additions and 911 deletions
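At a glance, the hunks below fold the old moe_config fields into model_config and rename a few model_config fields to match the pretrained config's naming (num_hidden_layers, intermediate_size). A minimal sketch of the renames, using only the attribute paths visible in this diff; the helper function is hypothetical and not part of FastDeploy:

# Field moves and renames applied in this commit (taken from the hunks below):
#   fd_config.parallel_config.tensor_parallel_degree -> fd_config.parallel_config.tensor_parallel_size
#   fd_config.moe_config.num_experts                  -> fd_config.model_config.moe_num_experts
#   fd_config.moe_config.top_k                        -> fd_config.model_config.moe_k
#   fd_config.moe_config.moe_intermediate_size        -> fd_config.model_config.moe_intermediate_size
#   fd_config.moe_config.moe_num_shared_experts       -> fd_config.model_config.moe_num_shared_experts
#   fd_config.moe_config.moe_layer_start_index        -> fd_config.model_config.moe_layer_start_index
#   fd_config.model_config.ffn_hidden_size            -> fd_config.model_config.intermediate_size
#   fd_config.model_config.num_layers                 -> fd_config.model_config.num_hidden_layers
#   fd_config.model_config.prefix_name                -> fd_config.model_config.pretrained_config.prefix_name

def summarize_moe_config(fd_config):
    """Hypothetical helper showing only the post-change access pattern."""
    model_cfg = fd_config.model_config
    return {
        "num_hidden_layers": model_cfg.num_hidden_layers,
        "moe_num_experts": model_cfg.moe_num_experts,
        "moe_k": model_cfg.moe_k,
        "moe_layer_start_index": model_cfg.moe_layer_start_index,
        "moe_intermediate_size": model_cfg.moe_intermediate_size,
    }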


@@ -25,7 +25,7 @@ from paddle import nn
 from paddleformers.transformers import PretrainedModel
 from paddleformers.utils.log import logger
 
-from fastdeploy.config import FDConfig, ModelConfig
+from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import \
     support_graph_optimization
@@ -54,7 +54,7 @@ class Ernie4_5_MLP(nn.Layer):
         reduce_results: bool = True,
     ) -> None:
         super().__init__()
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.gate_up_proj = MergedColumnParallelLinear(
             fd_config=fd_config,
             prefix=f"{prefix}.up_gate_proj",
@@ -179,16 +179,16 @@ class Ernie4_5_MoE(nn.Layer):
         self.fused_moe = FusedMoE(
             fd_config=fd_config,
-            moe_intermediate_size=fd_config.moe_config.moe_intermediate_size,
-            num_experts=fd_config.moe_config.num_experts,
-            top_k=fd_config.moe_config.top_k,
+            moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
+            num_experts=fd_config.model_config.moe_num_experts,
+            top_k=fd_config.model_config.moe_k,
             layer_idx=layer_id,
             weight_key_map=weight_key_map,
         )
-        self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
+        self.num_shared_experts = fd_config.model_config.moe_num_shared_experts
         if self.num_shared_experts > 0:
-            shared_experts_hidden_dim = self.num_shared_experts * fd_config.moe_config.moe_intermediate_size
+            shared_experts_hidden_dim = self.num_shared_experts * fd_config.model_config.moe_intermediate_size
             self.shared_experts = Ernie4_5_MLP(
                 fd_config=fd_config,
                 intermediate_size=shared_experts_hidden_dim,
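To make the shared-experts sizing above concrete, a tiny worked example with made-up numbers (they are not taken from any released ERNIE config):

moe_num_shared_experts = 2      # stand-in for fd_config.model_config.moe_num_shared_experts
moe_intermediate_size = 3584    # stand-in for fd_config.model_config.moe_intermediate_size
shared_experts_hidden_dim = moe_num_shared_experts * moe_intermediate_size  # 2 * 3584 = 7168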
@@ -271,8 +271,8 @@ class Ernie4_5_DecoderLayer(nn.Layer):
             prefix=f"{prefix}.self_attn",
         )
-        if (fd_config.moe_config.num_experts is not None
-                and layer_id >= fd_config.moe_config.moe_layer_start_index):
+        if (fd_config.model_config.moe_num_experts is not None
+                and layer_id >= fd_config.model_config.moe_layer_start_index):
             self.mlp = Ernie4_5_MoE(
                 fd_config=fd_config,
                 layer_id=layer_id,
@@ -281,7 +281,7 @@ class Ernie4_5_DecoderLayer(nn.Layer):
         else:
             self.mlp = Ernie4_5_MLP(
                 fd_config=fd_config,
-                intermediate_size=fd_config.model_config.ffn_hidden_size,
+                intermediate_size=fd_config.model_config.intermediate_size,
                 prefix=f"{prefix}.mlp",
             )
@@ -346,20 +346,20 @@ class Ernie4_5_Model(nn.Layer):
         """
         super().__init__()
-        self.num_layers = fd_config.model_config.num_layers
-        fd_config.model_config.prefix_name = "ernie"
+        self.num_layers = fd_config.model_config.num_hidden_layers
+        fd_config.model_config.pretrained_config.prefix_name = "ernie"
         self.embeddings = VocabParallelEmbedding(
             fd_config=fd_config,
             num_embeddings=fd_config.model_config.vocab_size,
             embedding_dim=fd_config.model_config.hidden_size,
             params_dtype=paddle.get_default_dtype(),
-            prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"))
+            prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"))
         self.hidden_layers = nn.LayerList([
             Ernie4_5_DecoderLayer(
                 fd_config=fd_config,
-                prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
+                prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
             for i in range(self.num_layers)
         ])
@@ -367,7 +367,7 @@ class Ernie4_5_Model(nn.Layer):
             fd_config,
             hidden_size=fd_config.model_config.hidden_size,
             eps=fd_config.model_config.rms_norm_eps,
-            prefix=f"{fd_config.model_config.prefix_name}.norm",
+            prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
         )
 
     def load_state_dict(self, state_dict):
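Since prefix_name now lives on model_config.pretrained_config and is set to "ernie" in __init__, the f-strings above expand to plain parameter prefixes; an illustrative expansion (layer index 0 chosen arbitrarily):

prefix_name = fd_config.model_config.pretrained_config.prefix_name  # "ernie" after this commit
f"{prefix_name}.embed_tokens"   # -> "ernie.embed_tokens"
f"{prefix_name}.layers.0"       # -> "ernie.layers.0"  (one per decoder layer)
f"{prefix_name}.norm"           # -> "ernie.norm"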
@@ -466,8 +466,8 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
             shape=[0, self.fd_config.model_config.hidden_size],
             dtype=paddle.get_default_dtype(),
         )
-        for i in range(self.fd_config.moe_config.moe_layer_start_index,
-                       self.fd_config.model_config.num_layers):
+        for i in range(self.fd_config.model_config.moe_layer_start_index,
+                       self.fd_config.model_config.num_hidden_layers):
             self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
 
     def forward(
@@ -559,7 +559,7 @@ class Ernie4_5_PretrainedModel(PretrainedModel):
     ]
 
     @classmethod
-    def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
+    def _get_tensor_parallel_mappings(cls, config, is_split=True):
         """
         get_tensor_parallel_mappings
         """
@@ -603,7 +603,7 @@ class Ernie4_5_PretrainedModel(PretrainedModel):
                 )
             return final_actions
         mappings = get_tensor_parallel_split_mappings(
-            config.num_layers,
+            config.num_hidden_layers,
             config.moe_num_experts,
             config.moe_layer_start_index,
             config.prefix_name,