Simplify the Config code (#2770)

* simplify the code

* fix vl

* delete config

* fix

* perfect code

* fix ci

* fix xpu

* fix xpu

* fix server

* resolve conflict

* fix mtp

* resolve conflict

* fix xpu

* fix xpu

* fix vl

* fix log

* fix qwen moe

* fix qwen moe

* fix qwen moe
YuanRisheng authored 2025-07-14 19:50:05 +08:00, committed by GitHub
parent 2e81792d64
commit 4c7b8bc458
34 changed files with 551 additions and 911 deletions
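Taken together, the hunks below amount to a small set of config attribute renames and relocations on FDConfig. A rough summary sketch in Python (access paths copied from the diffs below; the helper function at the end is purely illustrative and not part of the repository):

# Old access path                                    ->  new access path (per the diffs in this commit)
# fd_config.model_config.num_layers                  ->  fd_config.model_config.num_hidden_layers
# fd_config.parallel_config.tensor_parallel_degree   ->  fd_config.parallel_config.tensor_parallel_size
# fd_config.parallel_config.expert_parallel_degree   ->  fd_config.parallel_config.expert_parallel_size
# fd_config.model_config.deepseekv3.<field>          ->  fd_config.model_config.<field>
# fd_config.moe_config / layer.moe_config.<field>    ->  fd_config.model_config / layer.model_config.<field>

def kv_heads_per_rank(fd_config):
    # Hypothetical helper showing the new spelling: shard KV heads across
    # tensor-parallel ranks, mirroring the Attention hunk below.
    return max(1, fd_config.model_config.num_key_value_heads
               // fd_config.parallel_config.tensor_parallel_size)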

View File

@@ -95,7 +95,7 @@ class AppendAttentionBackend(AttentionBackend):
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
         self.head_dim: int = fd_config.model_config.head_dim
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         self.max_partition_size: int = int(
             os.getenv("FLAGS_max_partition_size", 32768))

View File

@@ -67,10 +67,10 @@ class Attention(nn.Layer):
             ValueError: If the `v_head_dim` is less than 0.
         """
         super().__init__()
-        self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree
+        self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size
         self.head_dim: int = fd_config.model_config.head_dim
         self.kv_num_heads: int = \
-            max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree)
+            max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size)
         self.layer_id: int = layer_id
         self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim
         self.rope_type: str = rope_type

View File

@@ -96,7 +96,7 @@ class FlashAttentionBackend(AttentionBackend):
         self.head_dim = fd_config.model_config.head_dim
         self.hidden_size = fd_config.model_config.hidden_size
         self.block_size = fd_config.parallel_config.block_size
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         self.speculative_method = fd_config.speculative_config.method
         self.use_speculate = self.speculative_method is not None

View File

@@ -102,7 +102,7 @@ class IluvatarAttnBackend(AttentionBackend):
         self.head_dim = head_dim
         # note: scale need to change if using MLA
         self.attention_metadata.scale = 1.0 / sqrt(head_dim)
-        self.num_layers = llm_config.model_config.num_layers
+        self.num_layers = llm_config.model_config.num_hidden_layers
         self.record_block_table_metadata = {}
         self.only_use_flash_attn = int(
             os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1

View File

@@ -113,18 +113,18 @@ class MLAAttentionBackend(AttentionBackend):
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
         self.head_dim: int = fd_config.model_config.head_dim
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         # For Multi Head Latent Attention
-        self.kv_lora_rank: int = fd_config.model_config.deepseekv3.kv_lora_rank
-        self.qk_rope_head_dim: int = fd_config.model_config.deepseekv3.qk_rope_head_dim
-        self.qk_head_dim: int = fd_config.model_config.deepseekv3.qk_nope_head_dim \
-            + fd_config.model_config.deepseekv3.qk_rope_head_dim
+        self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank
+        self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim
+        self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \
+            + fd_config.model_config.qk_rope_head_dim
         self.attn_softmax_scale: float = self.qk_head_dim**-0.5
-        if fd_config.model_config.deepseekv3.rope_scaling:
-            mscale_all_dim = fd_config.model_config.deepseekv3.rope_scaling.get(
+        if fd_config.model_config.rope_scaling:
+            mscale_all_dim = fd_config.model_config.rope_scaling.get(
                 "mscale_all_dim", False) # 1.0
-            scaling_factor = fd_config.model_config.deepseekv3.rope_scaling[
+            scaling_factor = fd_config.model_config.rope_scaling[
                 "factor"] # 40
             mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
             self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale
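For reference, the mscale factor used above comes from the YaRN long-context scaling rule. A minimal standalone sketch, assuming the conventional yarn_get_mscale formula (the repository's helper is not shown in this diff) and using the values hinted by the inline comments (factor 40, mscale_all_dim 1.0):

import math

def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # Conventional YaRN attention-scale correction (assumed, not copied from this repo).
    if scale <= 1.0:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

qk_head_dim = 128 + 64                       # example: qk_nope_head_dim + qk_rope_head_dim
attn_softmax_scale = qk_head_dim ** -0.5     # base 1/sqrt(d) scale
mscale = yarn_get_mscale(40.0, 1.0)          # rope_scaling["factor"], mscale_all_dim
attn_softmax_scale *= mscale * mscale        # same adjustment as in the hunk above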

View File

@@ -22,7 +22,7 @@ def init_rank_and_device_id(fd_config: FDConfig):
     """
     rank = (fd_config.parallel_config.expert_parallel_rank *
-            fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank)
+            fd_config.parallel_config.tensor_parallel_size + fd_config.parallel_config.tensor_parallel_rank)
     cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)

View File

@@ -95,7 +95,7 @@ class XPUAttentionBackend(AttentionBackend):
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
         self.head_dim: int = head_dim
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         # pd_disaggregation
         self.use_pd_disaggregation: int = int(

View File

@@ -88,7 +88,7 @@ class GCUFlashAttnBackend(AttentionBackend):
         self.num_heads = num_heads
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
-        self.num_layers = fd_config.model_config.num_layers
+        self.num_layers = fd_config.model_config.num_hidden_layers
         self.position_ids_base = paddle.arange(self.max_seq_len)
         # TODO(zhengjun): Need to adapt the allocation logic and

View File

@@ -88,7 +88,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
         self.num_heads = num_heads
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
-        self.num_layers = fd_config.model_config.num_layers
+        self.num_layers = fd_config.model_config.num_hidden_layers
         self.position_ids_base = paddle.arange(self.max_seq_len)
         # TODO(zhengjun): Need to adapt the allocation logic and

View File

@@ -59,13 +59,11 @@ class VocabParallelEmbedding(nn.Layer):
         self.world_size: int = hcg.get_model_parallel_world_size()
         self.ring_id: int = hcg.get_model_parallel_group().id
         self.use_rope: bool = fd_config.model_config.use_rope
-        self.rope_head_dim: int = fd_config.model_config.rope_head_dim
         self.use_ep: bool = fd_config.parallel_config.use_ep
         self.hidden_dropout_prob: float = fd_config.model_config.hidden_dropout_prob
         self.initializer_range: float = fd_config.model_config.initializer_range
         self.sequence_parallel: bool = fd_config.parallel_config.sequence_parallel
         self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings
-        self.freeze_embedding: bool = fd_config.model_config.freeze_embedding
         self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings
         self.params_dtype: str = params_dtype
@@ -104,15 +102,7 @@ class VocabParallelEmbedding(nn.Layer):
             )
         self.prefix = prefix
-        if self.freeze_embedding:
-            self.word_embeddings.weight.learning_rate = 0.0
-            if not self.use_rope:
-                self.position_embeddings.weight.learning_rate = 0.0
-        self.dropout = nn.Dropout(self.hidden_dropout_prob)
-        self.rope_head_dim_shape_tensor = paddle.ones((self.rope_head_dim),
-                                                      dtype="int8")
     def load_state_dict(self, state_dict: Dict[str,
                                                paddle.Tensor | np.ndarray]):
@@ -122,6 +112,7 @@ class VocabParallelEmbedding(nn.Layer):
         Args:
             state_dict (dict): A dictionary containing the checkpoint weights and biases.
         """
+        a = state_dict[self.prefix + ".weight"]
         if self.tie_word_embeddings:
             self.word_embeddings.weight.set_value(
                 get_tensor(state_dict[self.prefix + ".weight"]).astype(

View File

@@ -266,7 +266,7 @@ class ColumnParallelLinear(LinearBase):
                          with_bias=with_bias,
                          add_bias=add_bias,
                          skip_quant=skip_quant)
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.input_size = input_size
         self.output_size = divide(
             output_size,
@@ -348,7 +348,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         """
         self.activation = activation
         self.hidden_size = fd_config.model_config.hidden_size
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         super().__init__(fd_config=fd_config,
                          prefix=prefix,
@@ -410,7 +410,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         self.kv_num_heads = fd_config.model_config.num_key_value_heads
         self.hidden_size = fd_config.model_config.hidden_size
         self.head_dim = fd_config.model_config.head_dim
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.num_heads_per_rank = divide(self.num_heads, self.nranks)
         if self.kv_num_heads < self.nranks and self.nranks % self.kv_num_heads == 0:
             self.kv_num_heads_per_rank = 1
@@ -545,7 +545,7 @@ class RowParallelLinear(LinearBase):
                          skip_quant=skip_quant)
         self.fd_config = fd_config
         self.skip_quant = False
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.hidden_size = fd_config.model_config.hidden_size
         self.head_dim = fd_config.model_config.head_dim
         self.num_heads = fd_config.model_config.num_attention_heads // self.nranks
@@ -638,7 +638,7 @@ class KVBatchLinear(LinearBase):
             with_bias (bool): Whether to include bias or not. Defaults to False.
             skip_quant (bool): Whether to skip quantization. Defaults to False.
         """
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.kv_lora_rank = kv_lora_rank
         self.num_attention_heads = num_attention_heads
         self.qk_nope_head_dim = qk_nope_head_dim

View File

@@ -49,7 +49,7 @@ class MoEMethodBase(QuantMethodBase):
             from .ep import EPDecoderRunner
             self.ep_decoder_runner = EPDecoderRunner(
                 layer.top_k, layer.hidden_size, layer.num_experts,
-                layer.moe_config.num_max_dispatch_tokens_per_rank,
+                layer.model_config.num_max_dispatch_tokens_per_rank,
                 layer.ep_size, layer.ep_rank)
         else:
             from .ep import EPPrefillRunner

View File

@@ -14,7 +14,6 @@
 # limitations under the License.
 """
-import numpy as np
 import paddle
 from paddle import nn
 from paddleformers.utils.log import logger
@@ -23,8 +22,8 @@ import fastdeploy
 import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
 from fastdeploy.distributed.communication_op import \
     tensor_model_parallel_all_reduce
-from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
+from fastdeploy.model_executor.layers.utils import get_tensor
+from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
 from ..utils import create_and_set_parameter
 from .fused_moe_backend_base import MoEMethodBase
@@ -242,7 +241,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.moe_config.num_max_dispatch_tokens_per_rank,
+                layer.model_config.num_max_dispatch_tokens_per_rank,
                 layer.moe_intermediate_size * 2,
             ],
             dtype=paddle.bfloat16,
@@ -252,7 +251,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.moe_config.num_max_dispatch_tokens_per_rank,
+                layer.model_config.num_max_dispatch_tokens_per_rank,
                 layer.hidden_size,
             ],
             dtype=paddle.bfloat16,

View File

@@ -72,8 +72,8 @@ class FusedMoE(nn.Layer):
         self.layer_idx = layer_idx
         self.reduce_results = reduce_results
-        self.tp_size = fd_config.parallel_config.tensor_parallel_degree
-        self.ep_size = fd_config.parallel_config.expert_parallel_degree
+        self.tp_size = fd_config.parallel_config.tensor_parallel_size
+        self.ep_size = fd_config.parallel_config.expert_parallel_size
         self.ep_rank = fd_config.parallel_config.expert_parallel_rank
         assert (self.tp_size >= 1 and self.ep_size == 1) or \
@@ -81,7 +81,6 @@ class FusedMoE(nn.Layer):
             'MoE only support parallelism on TP or EP dimension.'
         self.hidden_size = fd_config.model_config.hidden_size
-        self.moe_config = fd_config.moe_config
         self.num_experts = num_experts
         self.num_local_experts = self.num_experts // self.ep_size
@@ -141,7 +140,7 @@ class FusedMoE(nn.Layer):
                 shape=gate_weight_shape,
                 dtype="float32",
             )
-            if self.moe_config.moe_use_aux_free:
+            if self.model_config.moe_use_aux_free:
                 self.gate_correction_bias = self.create_parameter(
                     shape=gate_correction_bias_shape,
                     dtype="float32",