Simplify the Config code (#2770)

* simplify the code

* fix vl

* delete config

* fix

* perfect code

* fix ci

* fix xpu

* fix xpu

* fix server

* resolve conflict

* fix mtp

* resolve conflict

* fix xpu

* fix xpu

* fix vl

* fix log

* fix qwen moe

* fix qwen moe

* fix qwen moe
YuanRisheng
2025-07-14 19:50:05 +08:00
committed by GitHub
parent 2e81792d64
commit 4c7b8bc458
34 changed files with 551 additions and 911 deletions

View File

@@ -21,14 +21,15 @@ from enum import Enum
from typing import Literal, Optional, Union
from paddleformers.transformers.configuration_utils import PretrainedConfig
from paddleformers.trl import llm_utils
from fastdeploy import envs
from fastdeploy.model_executor.layers.quantization.quant_base import \
QuantConfigBase
from fastdeploy.utils import get_logger
logger = get_logger("config", "config.log")
class MoEPhase(Enum):
"""
The generation phase of the moe.
@@ -37,274 +38,228 @@ class MoEPhase(Enum):
PREFILL = 1
DECODER = 2
PRETRAINED_INIT_CONFIGURATION = {
"rope_theta": 10000.0,
"num_key_value_heads":-1,
"start_layer_index": 0,
"moe_num_shared_experts":0,
"moe_layer_start_index": 0,
"num_max_dispatch_tokens_per_rank":256,
"moe_use_aux_free":False,
"vocab_size": -1,
"use_rope": True,
"hidden_dropout_prob":0.0,
"initializer_range":0.02,
"max_position_embeddings":512,
"quantization_config":None,
"use_recompute_resampler":False,
"use_temporal_conv":True,
"resampler_fuse_rms_norm":False,
"freq_allocation":20,
"tie_word_embeddings":False,
"rms_norm_eps":1e-5,
}
class ModelConfig(PretrainedConfig):
class ModelConfig:
"""
The configuration class to store the configuration of an `LLM`.
"""
max_stop_seqs_num = 5
stop_seqs_max_len = 8
architectures: list[str] = []
# NOTE(gongshaotain): from _load_model_init_val()
top_p = 0.0
temperature = 1.0
rope_theta = 10000.0
penalty_score = 1.0
frequency_score = 0.0
presence_score = 0.0
min_length = 1
def __init__(
self,
vocab_size: int = 100224,
hidden_size: int = 4096,
num_layers: int = 48,
num_attention_heads: int = 32,
num_key_value_heads: Optional[int] = None,
hidden_act: str = "swiglu",
hidden_dropout_prob: float = 0.0,
max_position_embeddings: int = 512,
max_seq_len: int = 512,
initializer_range: float = 0.02,
use_rope=True,
rope_theta: int = 10000,
rope_3d: bool = False,
ori_vocab_size: int | None = None,
moe_layer_start_index: Union[int, list[int], None] = None,
moe_num_experts: Union[int, list[int], None] = None,
moe_layer_end_index: Union[int, list[int], None] = None,
moe_num_shared_experts: int | None = None,
num_hidden_layers: int | None = None,
prefix_name="",
freeze_embedding=False,
rope_head_dim=None,
ffn_hidden_size: Optional[int] = None,
dtype="bfloat16",
start_layer_index: int = 0,
head_dim: Optional[int] = None,
tie_word_embeddings: bool = False,
is_quantized: bool = False,
rms_norm_eps: float = 1e-5,
**kwargs,
args,
):
super().__init__(**kwargs)
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_layers = num_layers
if num_hidden_layers is not None:
self.num_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
if head_dim is None:
self.max_stop_seqs_num = 5
self.stop_seqs_max_len = 8
# NOTE(gongshaotain): from _load_model_init_val()
self.top_p = 0.0
self.temperature = 1.0
self.rope_theta = 10000.0
self.penalty_score = 1.0
self.frequency_score = 0.0
self.presence_score = 0.0
self.min_length = 1
self.model_name_or_path = ""
self.im_patch_id = (
100295 # multimodality, TODO(liuyuanle): read from config.json
)
self.is_quantized = False
self.max_model_len = 0
self.dtype = ""
self.enable_logprob = False
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path)
self.pretrained_config = PretrainedConfig.from_dict(pretrained_config)
# set attribute from pretrained_config
for key, value in pretrained_config.items():
setattr(self, key, value)
# we need set default value when not exist
for key, value in PRETRAINED_INIT_CONFIGURATION.items():
if not hasattr(self, key):
setattr(self, key, value)
if not hasattr(self, "head_dim"):
self.head_dim = self.hidden_size // self.num_attention_heads
else:
self.head_dim = head_dim
self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob
self.max_position_embeddings = max_position_embeddings
self.initializer_range = initializer_range
self.use_rope = use_rope
self.rope_theta = rope_theta
self.ori_vocab_size = ori_vocab_size or vocab_size
self.max_seq_len = max_seq_len
self.prefix_name = prefix_name
self.freeze_embedding = freeze_embedding
self.rope_head_dim = rope_head_dim
self.moe_layer_start_index = moe_layer_start_index
self.moe_num_experts = moe_num_experts
self.moe_num_shared_experts = moe_num_shared_experts
self.moe_layer_end_index = moe_layer_end_index
self.ffn_hidden_size = ffn_hidden_size
self.rope_3d = rope_3d
self.start_layer_index = start_layer_index
self.dtype = dtype
self.tie_word_embeddings = tie_word_embeddings
self.is_quantized = is_quantized
self.rms_norm_eps = rms_norm_eps
if hasattr(self, "vision_config"):
self.vision_config = PretrainedConfig.from_dict(self.vision_config)
@dataclass
class MoEConfig:
"""
Configuration for MoE.
"""
num_experts: Union[int, list[int], None] = None
top_k: int = 8
moe_intermediate_size: int = -1
num_experts_per_rank: int = -1
num_experts_start_offset: int = -1
self.ori_vocab_size = self.vocab_size
if "Ernie4_5_ForCausalLM" in self.architectures or "Ernie4_5_MoeForCausalLM" in self.architectures:
self.ori_vocab_size = args["ori_vocab_size"]
moe_num_shared_experts = (0, )
moe_layer_start_index: Union[int, list[int], None] = None
moe_layer_end_index: Union[int, list[int], None] = None
moe_use_aux_free: bool = False
num_max_dispatch_tokens_per_rank = 256
im_patch_id = (
100295 # multimodality, TODO(liuyuanle): read from config.json
)
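Taken together, the rewritten ModelConfig assembles its attributes in three layers: the incoming args dict, the checkpoint's pretrained config (config.json), and the PRETRAINED_INIT_CONFIGURATION fallbacks. A minimal usage sketch, with a placeholder checkpoint path and illustrative arg values (a readable config.json is assumed at that path):

args = {
    "model_name_or_path": "./my_model",  # placeholder checkpoint directory
    "dtype": "bfloat16",
    "max_model_len": 4096,
}
model_config = ModelConfig(args)
# Attribute resolution order: args dict -> config.json -> PRETRAINED_INIT_CONFIGURATION defaults.
print(model_config.num_hidden_layers, model_config.head_dim, model_config.rms_norm_eps)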
@dataclass
class ParallelConfig:
"""Configuration for the distributed execution."""
block_size = 16 # The block size for processing.
sequence_parallel = False # Whether to enable sequence parallelism.
use_ep = False # Whether to enable Expert Parallelism
moe_phase = MoEPhase.PREFILL # Generation phase
msg_queue_id = 1 # message queue id
tensor_parallel_rank = None # TP rank ID
tensor_parallel_degree = None # TP degree
expert_parallel_rank = None # EP rank ID
expert_parallel_degree = None # EP degree
# Whether the embedding weight distributed across your GPU cards is split by row or by column.
# Defaults to False, meaning split by row. When vocab_size is not divisible by world_size
# but hidden_size is, consider splitting the embedding weight by column.
"""
From old version worker args
TODO(gongshaotian): Reclassify
"""
model_name_or_path: str = "./output"
max_num_seqs: int = 34
# Set default block num for profile run
max_block_num: int = 2000
# block size
block_size: int = 64
# Engine worker queue port
engine_worker_queue_port: int = 9923
# Max model len
max_model_len: int = 3072 # max_seq_len
# cuda visible devices
device_ids: str = "0"
# Input dtype
dtype: str = "bfloat16"
# Encoder's decoder num
enc_dec_block_num: int = 1
# KV cache ratio for input
kv_cache_ratio: float = 0.7
# First token id
first_token_id: int = 1
# Gpu memory utilization
gpu_memory_utilization: float = 0.9
# Process ID of engine
engine_pid: Optional[int] = None
# Do profile or not
do_profile: bool = False
#
pad_token_id: int = -1
#
eos_tokens_lens: int = 2
# Enable chunked prefill
enable_chunked_prefill: str = "store_true"
def __init__(
self,
args,
):
self.sequence_parallel = False # Whether to enable sequence parallelism.
self.use_ep = False # Whether to enable Expert Parallelism
self.moe_phase = MoEPhase.PREFILL # Generation phase
self.msg_queue_id = 1 # message queue id
max_num_batched_tokens: int = 2048
# enable prefix cache
enable_prefix_caching = None
# splitwise role
splitwise_role: str = "mixed"
# guided decoding backend
guided_decoding_backend: str = None
# disable any whitespace for guided decoding
disable_any_whitespace: bool = True
# Enable the custom all-reduce kernel; otherwise fall back to NCCL (dist.all_reduce).
enable_custom_all_reduce: str = "store_true"
tensor_parallel_rank, tensor_parallel_size = llm_utils.init_dist_env()
self.tensor_parallel_rank = tensor_parallel_rank # TP rank ID
self.tensor_parallel_size = tensor_parallel_size # TP degree
self.expert_parallel_rank = int(tensor_parallel_rank / tensor_parallel_size) # EP rank ID
self.expert_parallel_size = 1 # EP degree
# Whether the embedding weight distributed across your GPU cards is split by row or by column.
# Defaults to False, meaning split by row. When vocab_size is not divisible by world_size
# but hidden_size is, consider splitting the embedding weight by column.
"""
From old version worker args
TODO(gongshaotian): Reclassify
"""
self.model_name_or_path: str = "./output"
self.max_num_seqs: int = 34
# Set default block num for profile run
self.max_block_num: int = 2000
# block size
self.block_size: int = 64
# Engine worker queue port
self.engine_worker_queue_port: int = 9923
# Max model len
self.max_model_len: int = 3072 # max_seq_len
# cuda visible devices
self.device_ids: str = "0"
# Input dtype
self.dtype: str = "bfloat16"
# Encoder's decoder num
self.enc_dec_block_num: int = 1
# KV cache ratio for input
self.kv_cache_ratio: float = 0.7
# First token id
self.first_token_id: int = 1
# Gpu memory utilization
self.gpu_memory_utilization: float = 0.9
# Process ID of engine
self.engine_pid: Optional[int] = None
# Do profile or not
self.do_profile: bool = False
#
self.pad_token_id: int = -1
#
self.eos_tokens_lens: int = 2
# Enable chunked prefill
self.enable_chunked_prefill: bool = False
self.max_num_batched_tokens: int = 2048
# enable prefix cache
self.enable_prefix_caching = None
# splitwise role
self.splitwise_role: str = "mixed"
# guided decoding backend
self.guided_decoding_backend: str = None
# disable any whitespace for guided decoding
self.disable_any_whitespace: bool = True
self.pod_ip: str = None
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
self.use_ep = args["expert_parallel_size"] > 1
if self.splitwise_role == "mixed":
self.moe_phase = MoEPhase.PREFILL
elif self.splitwise_role == "prefill":
self.moe_phase = MoEPhase.PREFILL
elif self.splitwise_role == "decode":
self.moe_phase = MoEPhase.DECODER
else:
raise NotImplementedError
# Enable the custom all-reduce kernel; otherwise fall back to NCCL (dist.all_reduce).
self.enable_custom_all_reduce: bool = False
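For reference, the splitwise_role handling above reduces to a small role-to-phase table; a standalone sketch of that mapping together with the EP check (values are placeholders):

args = {"expert_parallel_size": 2, "splitwise_role": "decode"}  # placeholder values
use_ep = args["expert_parallel_size"] > 1  # True: expert parallelism is enabled
role_to_phase = {
    "mixed": MoEPhase.PREFILL,
    "prefill": MoEPhase.PREFILL,
    "decode": MoEPhase.DECODER,
}
moe_phase = role_to_phase[args["splitwise_role"]]  # any other role raises NotImplementedError above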
@dataclass
class SpeculativeConfig:
"""
Configuration for speculative decoding.
"""
# speculative method, choose in [None, "ngram_match", "mtp"]
method: Optional[str] = None
# the max length of speculative tokens
num_speculative_tokens: int = 1
# the max length of candidate tokens for speculative method
max_candidate_len: int = 5
# the max length of verify window for speculative method
verify_window: int = 2
# ngram match
max_ngram_size: int = 5
# model for mtp/eagle/draft_model
model_name_or_path: Optional[str] = None
# quantization of model
quantization: Optional[str] = None
# allocate more blocks to prevent mtp from finishing the block earlier than the main model
# Fixed now
num_gpu_block_expand_ratio: Optional[float] = 1
# To distinguish the main model and draft model(mtp/eagle/draftmodel)
# ["main", "mtp"]
model_type: Optional[str] = "main"
# TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers.
# A trick method is currently used to enable this sharing.
# This will be replaced with a more standardized solution in the future.
sharing_model = None
# During benchmarking, we need to enforce that the number of accepted tokens is 1.
# This means no tokens from MTP are accepted.
# This ensures that the specified simulation acceptance rate is not affected.
benchmark_mode: bool = False
def __init__(
self,
args,
):
# speculative method, choose in [None, "ngram_match", "mtp"]
self.method: Optional[str] = None
# the max length of speculative tokens
self.num_speculative_tokens: int = 1
# the max length of candidate tokens for speculative method
self.max_candidate_len: int = 5
# the max length of verify window for speculative method
self.verify_window: int = 2
# ngram match
self.max_ngram_size: int = 5
# model for mtp/eagle/draft_model
self.model_name_or_path: Optional[str] = None
# quantization of model
self.quantization: Optional[str] = None
# allocate more blocks to prevent mtp from finishing the block earlier than the main model
# Fixed now
self.num_gpu_block_expand_ratio: Optional[float] = 1
# To distinguish the main model and draft model(mtp/eagle/draftmodel)
# ["main", "mtp"]
self.model_type: Optional[str] = "main"
# TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers.
# A trick method is currently used to enable this sharing.
# This will be replaced with a more standardized solution in the future.
self.sharing_model = None
# During benchmarking, we need to enforce that the number of accepted tokens is 1.
# This means no tokens from MTP are accepted.
# This ensures that the specified simulation acceptance rate is not affected.
self.benchmark_mode: bool = False
# TODO(YuanRisheng): The names of the server args differ from the names of the SpeculativeConfig fields.
# We temporarily add the name map here and will delete it in the future.
name_map = {"speculative_method": "method",
"speculative_max_draft_token_num": "num_speculative_tokens",
"speculative_model_name_or_path": "model_name_or_path",
"speculative_model_quantization": "quantization",
"speculative_benchmark_mode": "benchmark_mode"}
for key, value in args.items():
if key in name_map.keys() and hasattr(self, name_map[key]):
setattr(self, name_map[key], value)
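A sketch of how the name map rewrites server-side argument names onto SpeculativeConfig fields (argument values and the draft-model path are placeholders):

args = {
    "speculative_method": "mtp",                  # -> .method
    "speculative_max_draft_token_num": 2,         # -> .num_speculative_tokens
    "speculative_model_name_or_path": "./draft",  # placeholder path, -> .model_name_or_path
}
spec_config = SpeculativeConfig(args)
assert spec_config.method == "mtp"
assert spec_config.num_speculative_tokens == 2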
@dataclass
class DeviceConfig:
"""
Configuration for device settings.
"""
device_type = "cuda"
def __init__(
self,
args,
):
self.device_type = "cuda"
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
class GraphOptimizationConfig:
"""The Top-level graph optimization contral corresponds to different backends.
- 0: dyncmic graph
- 1: static graph
- 2: static graph + cinn compilation backend
"""
graph_opt_level: int = 0
# CUDA Graph Config
""" Whether to use cudagraph.
- False: cudagraph is not used.
- True: cudagraph is used.
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dynamic graph backend: ...
- With static graph backend: WIP
"""
use_cudagraph: bool = False
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from llm config.
- list[int]: capture sizes are specified as given."""
cudagraph_capture_sizes: Optional[list[int]] = None
""" Number of warmup runs for cudagraph. """
cudagraph_num_of_warmups: int = 2
"""Whether to copy input tensors for cudagraph.
If the caller can guarantee that the same input buffers
are always used, it can set this to False. Otherwise, it should
set this to True."""
cudagraph_copy_inputs: bool = False
""" In static graph, this is an operation list that does not need to be captured by the CUDA graph.
CudaGraphBackend will split these operations from the static graph.
Example usage:
cudagraph_splitting_ops = ["paddle.unified_attention"]
Note: If you want to use subgraph capture functionality in a dynamic graph,
you can manually split the model into multiple layers and apply the @support_cuda_graph decorator
only to the layer where CUDA graph functionality is required.
"""
cudagraph_splitting_ops = Optional[list[str]]
""""whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs.
Thus this flag cannot be used together with splitting_ops."""
full_cuda_graph: bool = False
max_capture_size: int = field(default=None, init=False) # type: ignore
batch_size_to_captured_size: dict[int,
int] = field(default=None,
init=False) # type: ignore
# CINN Config ...
def init_with_cudagrpah_size(self,
cudagraph_capture_sizes: list[int]) -> None:
"""To complete the initialization of config,
@@ -338,18 +293,67 @@ class GraphOptimizationConfig:
def __init__(self,
enable_static_graph_inference: bool = False,
use_cudagraph: bool = False,
max_capture_batch_size: int = 64):
""" """
max_capture_batch_size: int = 64,
args = None):
"""The Top-level graph optimization contral corresponds to different backends.
- 0: dyncmic graph
- 1: static graph
- 2: static graph + cinn compilation backend
"""
self.graph_opt_level: int = 0
# CUDA Graph Config
""" Whether to use cudagraph.
- False: cudagraph is not used.
- True: cudagraph is used.
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dynamic graph backend: ...
- With static graph backend: WIP
"""
self.use_cudagraph: bool = False
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from llm config.
- list[int]: capture sizes are specified as given."""
self.cudagraph_capture_sizes: Optional[list[int]] = None
""" Number of warmup runs for cudagraph. """
self.cudagraph_num_of_warmups: int = 2
"""Whether to copy input tensors for cudagraph.
If the caller can guarantee that the same input buffers
are always used, it can set this to False. Otherwise, it should
set this to True."""
self.cudagraph_copy_inputs: bool = False
""" In static graph, this is an operation list that does not need to be captured by the CUDA graph.
CudaGraphBackend will split these operations from the static graph.
Example usage:
cudagraph_splitting_ops = ["paddle.unified_attention"]
Note: If you want to use subgraph capture functionality in a dynamic graph,
you can manually split the model into multiple layers and apply the @support_cuda_graph decorator
only to the layer where CUDA graph functionality is required.
"""
self.cudagraph_splitting_ops = Optional[list[str]]
""""whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs.
Thus this flag cannot be used together with splitting_ops."""
self.full_cuda_graph: bool = False
self.max_capture_size: int = field(default=None, init=False) # type: ignore
self.batch_size_to_captured_size: dict[int,
int] = field(default=None,
init=False) # type: ignore
# CINN Config ...
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
capture_size = [i for i in range(1, max_capture_batch_size + 1)]
self.init_with_cudagrpah_size(cudagraph_capture_sizes=capture_size)
self.use_cudagraph = use_cudagraph
#TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
if enable_static_graph_inference:
self.graph_opt_level = 1
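A construction sketch for the rewritten GraphOptimizationConfig; an empty args dict is passed just to satisfy the new signature, and the batch-size value is illustrative:

graph_opt_config = GraphOptimizationConfig(
    enable_static_graph_inference=False,  # keep graph_opt_level == 0 (dynamic graph)
    use_cudagraph=True,
    max_capture_batch_size=32,            # capture sizes become [1, 2, ..., 32]
    args={},
)
# init_with_cudagrpah_size() is then expected to fill max_capture_size and
# batch_size_to_captured_size from that capture list.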
@dataclass
class LoadConfig:
"""
Configuration for dynamic weight loading strategies
@@ -363,37 +367,39 @@ class LoadConfig:
- 'meta': provide RL training worker, no_weights_load
- None: No dynamic loading
"""
use_fastsafetensor: bool = False
dynamic_load_weight: bool = False
load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None
def __init__(
self,
args,
):
self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
self.dynamic_load_weight: bool = False
self.load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
def __post_init__(self):
if self.load_strategy is not None and not self.dynamic_load_weight:
raise ValueError("Load strategy requires dynamic_load_weight=True")
if self.dynamic_load_weight and self.load_strategy is None:
raise ValueError("Must specify load_strategy when dynamic_load_weight is True")
@dataclass
class LoRAConfig:
""" LoRA Config """
pass
@dataclass
class KVCacheConfig:
""" KV Cache Config """
cache_quant_dtype: str = "none"
@dataclass
class DecodingConfig:
"""
Configuration for decoding
"""
pad_token_id = None
def __init__(
self,
args,
):
self.pad_token_id = None
for key, value in args.items():
if hasattr(self, key):
setattr(self, key, value)
@dataclass
class FDConfig:
@@ -411,7 +417,6 @@ class FDConfig:
load_config: LoadConfig = field(default=None, init=True)
quant_config: Optional[QuantConfigBase] = None
graph_opt_config: Optional[GraphOptimizationConfig] = None
moe_config: MoEConfig = field(default=None, init=True) # type: ignore
decoding_config: DecodingConfig = field(default=None,
init=True) # type: ignore
kv_cache_config: KVCacheConfig = field(default=None,

View File

@@ -95,7 +95,7 @@ class AppendAttentionBackend(AttentionBackend):
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
self.num_layers: int = fd_config.model_config.num_layers
self.num_layers: int = fd_config.model_config.num_hidden_layers
self.max_partition_size: int = int(
os.getenv("FLAGS_max_partition_size", 32768))

View File

@@ -67,10 +67,10 @@ class Attention(nn.Layer):
ValueError: If the `v_head_dim` is less than 0.
"""
super().__init__()
self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree
self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size
self.head_dim: int = fd_config.model_config.head_dim
self.kv_num_heads: int = \
max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree)
max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size)
self.layer_id: int = layer_id
self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim
self.rope_type: str = rope_type

View File

@@ -96,7 +96,7 @@ class FlashAttentionBackend(AttentionBackend):
self.head_dim = fd_config.model_config.head_dim
self.hidden_size = fd_config.model_config.hidden_size
self.block_size = fd_config.parallel_config.block_size
self.num_layers: int = fd_config.model_config.num_layers
self.num_layers: int = fd_config.model_config.num_hidden_layers
self.speculative_method = fd_config.speculative_config.method
self.use_speculate = self.speculative_method is not None

View File

@@ -102,7 +102,7 @@ class IluvatarAttnBackend(AttentionBackend):
self.head_dim = head_dim
# note: scale need to change if using MLA
self.attention_metadata.scale = 1.0 / sqrt(head_dim)
self.num_layers = llm_config.model_config.num_layers
self.num_layers = llm_config.model_config.num_hidden_layers
self.record_block_table_metadata = {}
self.only_use_flash_attn = int(
os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1

View File

@@ -113,18 +113,18 @@ class MLAAttentionBackend(AttentionBackend):
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = fd_config.model_config.head_dim
self.num_layers: int = fd_config.model_config.num_layers
self.num_layers: int = fd_config.model_config.num_hidden_layers
# For Multi Head Latent Attention
self.kv_lora_rank: int = fd_config.model_config.deepseekv3.kv_lora_rank
self.qk_rope_head_dim: int = fd_config.model_config.deepseekv3.qk_rope_head_dim
self.qk_head_dim: int = fd_config.model_config.deepseekv3.qk_nope_head_dim \
+ fd_config.model_config.deepseekv3.qk_rope_head_dim
self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank
self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim
self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \
+ fd_config.model_config.qk_rope_head_dim
self.attn_softmax_scale: float = self.qk_head_dim**-0.5
if fd_config.model_config.deepseekv3.rope_scaling:
mscale_all_dim = fd_config.model_config.deepseekv3.rope_scaling.get(
if fd_config.model_config.rope_scaling:
mscale_all_dim = fd_config.model_config.rope_scaling.get(
"mscale_all_dim", False) # 1.0
scaling_factor = fd_config.model_config.deepseekv3.rope_scaling[
scaling_factor = fd_config.model_config.rope_scaling[
"factor"] # 40
mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale
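For orientation, a hedged numeric sketch of this scale adjustment, assuming yarn_get_mscale follows the usual YaRN form 0.1 * mscale_all_dim * ln(factor) + 1.0 (the head dims are placeholders; 1.0 and 40 echo the inline comments above):

import math

qk_nope_head_dim, qk_rope_head_dim = 128, 64      # placeholder dims
qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
attn_softmax_scale = qk_head_dim ** -0.5
mscale_all_dim, factor = 1.0, 40.0                # values suggested by the comments above
mscale = 0.1 * mscale_all_dim * math.log(factor) + 1.0  # assumed yarn_get_mscale formula
attn_softmax_scale *= mscale * mscale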

View File

@@ -22,7 +22,7 @@ def init_rank_and_device_id(fd_config: FDConfig):
"""
rank = (fd_config.parallel_config.expert_parallel_rank *
fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank)
fd_config.parallel_config.tensor_parallel_size + fd_config.parallel_config.tensor_parallel_rank)
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)

View File

@@ -95,7 +95,7 @@ class XPUAttentionBackend(AttentionBackend):
self.kv_num_heads: int = kv_num_heads
self.num_heads: int = num_heads
self.head_dim: int = head_dim
self.num_layers: int = fd_config.model_config.num_layers
self.num_layers: int = fd_config.model_config.num_hidden_layers
# pd_disaggregation
self.use_pd_disaggregation: int = int(

View File

@@ -88,7 +88,7 @@ class GCUFlashAttnBackend(AttentionBackend):
self.num_heads = num_heads
self.head_dim = head_dim
self.scaling = 1.0 / (self.head_dim**0.5)
self.num_layers = fd_config.model_config.num_layers
self.num_layers = fd_config.model_config.num_hidden_layers
self.position_ids_base = paddle.arange(self.max_seq_len)
# TODO(zhengjun): Need to adapt the allocation logic and

View File

@@ -88,7 +88,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
self.num_heads = num_heads
self.head_dim = head_dim
self.scaling = 1.0 / (self.head_dim**0.5)
self.num_layers = fd_config.model_config.num_layers
self.num_layers = fd_config.model_config.num_hidden_layers
self.position_ids_base = paddle.arange(self.max_seq_len)
# TODO(zhengjun): Need to adapt the allocation logic and

View File

@@ -59,13 +59,11 @@ class VocabParallelEmbedding(nn.Layer):
self.world_size: int = hcg.get_model_parallel_world_size()
self.ring_id: int = hcg.get_model_parallel_group().id
self.use_rope: bool = fd_config.model_config.use_rope
self.rope_head_dim: int = fd_config.model_config.rope_head_dim
self.use_ep: bool = fd_config.parallel_config.use_ep
self.hidden_dropout_prob: float = fd_config.model_config.hidden_dropout_prob
self.initializer_range: float = fd_config.model_config.initializer_range
self.sequence_parallel: bool = fd_config.parallel_config.sequence_parallel
self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings
self.freeze_embedding: bool = fd_config.model_config.freeze_embedding
self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings
self.params_dtype: str = params_dtype
@@ -104,15 +102,7 @@ class VocabParallelEmbedding(nn.Layer):
)
self.prefix = prefix
if self.freeze_embedding:
self.word_embeddings.weight.learning_rate = 0.0
if not self.use_rope:
self.position_embeddings.weight.learning_rate = 0.0
self.dropout = nn.Dropout(self.hidden_dropout_prob)
self.rope_head_dim_shape_tensor = paddle.ones((self.rope_head_dim),
dtype="int8")
def load_state_dict(self, state_dict: Dict[str,
paddle.Tensor | np.ndarray]):
@@ -122,6 +112,7 @@ class VocabParallelEmbedding(nn.Layer):
Args:
state_dict (dict): A dictionary containing the checkpoint weights and biases.
"""
a = state_dict[self.prefix + ".weight"]
if self.tie_word_embeddings:
self.word_embeddings.weight.set_value(
get_tensor(state_dict[self.prefix + ".weight"]).astype(

View File

@@ -266,7 +266,7 @@ class ColumnParallelLinear(LinearBase):
with_bias=with_bias,
add_bias=add_bias,
skip_quant=skip_quant)
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
self.input_size = input_size
self.output_size = divide(
output_size,
@@ -348,7 +348,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
"""
self.activation = activation
self.hidden_size = fd_config.model_config.hidden_size
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
super().__init__(fd_config=fd_config,
prefix=prefix,
@@ -410,7 +410,7 @@ class QKVParallelLinear(ColumnParallelLinear):
self.kv_num_heads = fd_config.model_config.num_key_value_heads
self.hidden_size = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
self.num_heads_per_rank = divide(self.num_heads, self.nranks)
if self.kv_num_heads < self.nranks and self.nranks % self.kv_num_heads == 0:
self.kv_num_heads_per_rank = 1
@@ -545,7 +545,7 @@ class RowParallelLinear(LinearBase):
skip_quant=skip_quant)
self.fd_config = fd_config
self.skip_quant = False
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
self.hidden_size = fd_config.model_config.hidden_size
self.head_dim = fd_config.model_config.head_dim
self.num_heads = fd_config.model_config.num_attention_heads // self.nranks
@@ -638,7 +638,7 @@ class KVBatchLinear(LinearBase):
with_bias (bool): Whether to include bias or not. Defaults to False.
skip_quant (bool): Whether to skip quantization. Defaults to False.
"""
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
self.kv_lora_rank = kv_lora_rank
self.num_attention_heads = num_attention_heads
self.qk_nope_head_dim = qk_nope_head_dim

View File

@@ -49,7 +49,7 @@ class MoEMethodBase(QuantMethodBase):
from .ep import EPDecoderRunner
self.ep_decoder_runner = EPDecoderRunner(
layer.top_k, layer.hidden_size, layer.num_experts,
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.model_config.num_max_dispatch_tokens_per_rank,
layer.ep_size, layer.ep_rank)
else:
from .ep import EPPrefillRunner

View File

@@ -14,7 +14,6 @@
# limitations under the License.
"""
import numpy as np
import paddle
from paddle import nn
from paddleformers.utils.log import logger
@@ -23,8 +22,8 @@ import fastdeploy
import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from fastdeploy.model_executor.layers.utils import get_tensor
from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
from ..utils import create_and_set_parameter
from .fused_moe_backend_base import MoEMethodBase
@@ -242,7 +241,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
[
layer.num_local_experts,
layer.ep_size *
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.model_config.num_max_dispatch_tokens_per_rank,
layer.moe_intermediate_size * 2,
],
dtype=paddle.bfloat16,
@@ -252,7 +251,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
[
layer.num_local_experts,
layer.ep_size *
layer.moe_config.num_max_dispatch_tokens_per_rank,
layer.model_config.num_max_dispatch_tokens_per_rank,
layer.hidden_size,
],
dtype=paddle.bfloat16,

View File

@@ -72,8 +72,8 @@ class FusedMoE(nn.Layer):
self.layer_idx = layer_idx
self.reduce_results = reduce_results
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
self.ep_size = fd_config.parallel_config.expert_parallel_degree
self.tp_size = fd_config.parallel_config.tensor_parallel_size
self.ep_size = fd_config.parallel_config.expert_parallel_size
self.ep_rank = fd_config.parallel_config.expert_parallel_rank
assert (self.tp_size >= 1 and self.ep_size == 1) or \
@@ -81,7 +81,6 @@ class FusedMoE(nn.Layer):
'MoE only support parallelism on TP or EP dimension.'
self.hidden_size = fd_config.model_config.hidden_size
self.moe_config = fd_config.moe_config
self.num_experts = num_experts
self.num_local_experts = self.num_experts // self.ep_size
@@ -141,7 +140,7 @@ class FusedMoE(nn.Layer):
shape=gate_weight_shape,
dtype="float32",
)
if self.moe_config.moe_use_aux_free:
if self.model_config.moe_use_aux_free:
self.gate_correction_bias = self.create_parameter(
shape=gate_correction_bias_shape,
dtype="float32",

View File

@@ -43,7 +43,7 @@ def load_ep_checkpoint(model_path: str,
filtered_map = {k: v for k, v in weight_list.items() if "experts" not in k}
num_local_ffn_keys = []
for i in range(config.moe_layer_start_index, config.num_layers):
for i in range(config.moe_layer_start_index, config.num_hidden_layers):
for j in range(
config.num_experts_start_offset,
config.num_experts_start_offset + config.num_experts_per_rank,
@@ -261,7 +261,7 @@ def load_composite_checkpoint(
and os.path.isdir(os.path.join(model_path, f))
]
if len(rank_dirs) > 1:
if fd_config.parallel_config.tensor_parallel_degree != len(
if fd_config.parallel_config.tensor_parallel_size != len(
rank_dirs):
raise ValueError(
f"Your model only supports loading with tp{len(rank_dirs)}"
@@ -283,7 +283,7 @@ def load_composite_checkpoint(
else:
state_dict = load_tp_checkpoint(model_path,
cls,
fd_config.model_config,
fd_config.model_config.pretrained_config,
return_numpy=return_numpy)
if not state_dict:
raise ValueError("weight not found in state_dict !")

View File

@@ -27,6 +27,7 @@ from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig
from fastdeploy.distributed.communication_op import \
tensor_model_parallel_all_reduce
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
@@ -40,7 +41,6 @@ from fastdeploy.model_executor.layers.rotary_embedding import \
DeepseekScalingRotaryEmbedding
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.platforms import current_platform
from fastdeploy.model_executor.forward_meta import ForwardMeta
if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import \
@@ -109,7 +109,7 @@ class DeepSeekV3MoE(nn.Layer):
prefix: str) -> None:
super().__init__()
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
self.tp_size = fd_config.parallel_config.tensor_parallel_size
weight_key_map = {
"gate_weight_key": f"{prefix}.gate.weight",
@@ -124,23 +124,23 @@ class DeepSeekV3MoE(nn.Layer):
self.fused_moe = FusedMoE(
fd_config=fd_config,
reduce_results=False,
moe_intermediate_size=fd_config.model_config.deepseekv3.
moe_intermediate_size=fd_config.model_config.
moe_intermediate_size,
num_experts=fd_config.model_config.deepseekv3.n_routed_experts,
top_k=fd_config.model_config.deepseekv3.num_experts_per_tok,
topk_method=fd_config.model_config.deepseekv3.topk_method,
topk_group=fd_config.model_config.deepseekv3.topk_group,
n_group=fd_config.model_config.deepseekv3.n_group,
routed_scaling_factor=fd_config.model_config.deepseekv3.
num_experts=fd_config.model_config.n_routed_experts,
top_k=fd_config.model_config.num_experts_per_tok,
topk_method=fd_config.model_config.topk_method,
topk_group=fd_config.model_config.topk_group,
n_group=fd_config.model_config.n_group,
routed_scaling_factor=fd_config.model_config.
routed_scaling_factor,
layer_idx=layer_id,
weight_key_map=weight_key_map,
)
self.num_shared_experts = fd_config.model_config.deepseekv3.n_shared_experts
self.num_shared_experts = fd_config.model_config.n_shared_experts
shared_experts_intermediate_size = (
self.num_shared_experts *
fd_config.model_config.deepseekv3.moe_intermediate_size)
fd_config.model_config.moe_intermediate_size)
self.shared_experts = DeepSeekV3MLP(
fd_config=fd_config,
@@ -178,18 +178,18 @@ class DeepseekV3MLAAttention(nn.Layer):
prefix: str = "") -> None:
super().__init__()
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
self.tp_size = fd_config.parallel_config.tensor_parallel_size
self.hidden_size = fd_config.model_config.hidden_size
self.num_attention_heads = fd_config.model_config.num_attention_heads
self.num_attention_heads_tp = self.num_attention_heads // self.tp_size
# MLA
self.qk_nope_head_dim = fd_config.model_config.deepseekv3.qk_nope_head_dim
self.qk_rope_head_dim = fd_config.model_config.deepseekv3.qk_rope_head_dim
self.qk_nope_head_dim = fd_config.model_config.qk_nope_head_dim
self.qk_rope_head_dim = fd_config.model_config.qk_rope_head_dim
self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
self.v_head_dim = fd_config.model_config.deepseekv3.v_head_dim
self.q_lora_rank = fd_config.model_config.deepseekv3.q_lora_rank
self.kv_lora_rank = fd_config.model_config.deepseekv3.kv_lora_rank
self.v_head_dim = fd_config.model_config.v_head_dim
self.q_lora_rank = fd_config.model_config.q_lora_rank
self.kv_lora_rank = fd_config.model_config.kv_lora_rank
self.attn_softmax_scale = self.qk_head_dim**-0.5
self.rope_theta = fd_config.model_config.rope_theta
@@ -255,7 +255,7 @@ class DeepseekV3MLAAttention(nn.Layer):
qk_nope_head_dim=self.qk_nope_head_dim,
v_head_dim=self.v_head_dim)
self.rope_scaling = fd_config.model_config.deepseekv3.rope_scaling
self.rope_scaling = fd_config.model_config.rope_scaling
if self.rope_scaling:
mscale_all_dim = self.rope_scaling.get("mscale_all_dim", False)
scaling_factor = self.rope_scaling["factor"]
@@ -449,9 +449,9 @@ class DeepSeekV3DecoderLayer(nn.Layer):
prefix=f"{prefix}.self_attn",
)
if (fd_config.model_config.deepseekv3.n_routed_experts is not None
if (fd_config.model_config.n_routed_experts is not None
and layer_id
>= fd_config.model_config.deepseekv3.first_k_dense_replace):
>= fd_config.model_config.first_k_dense_replace):
self.mlp = DeepSeekV3MoE(
fd_config=fd_config,
layer_id=layer_id,
@@ -525,8 +525,8 @@ class DeepSeekV3Model(nn.Layer):
Initializer for the DeepSeekV3Model class.
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "deepseek_v3"
self.num_layers = fd_config.model_config.num_hidden_layers
fd_config.model_config.pretrained_config.prefix_name = "deepseek_v3"
self.embeddings = VocabParallelEmbedding(
fd_config,
@@ -539,7 +539,7 @@ class DeepSeekV3Model(nn.Layer):
self.decoder_layers = nn.LayerList([
DeepSeekV3DecoderLayer(
fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -755,5 +755,5 @@ class DeepSeekV3PretrainedModel(PretrainedModel):
return final_actions
mappings = get_tensor_parallel_split_mappings(config.num_layers)
mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
return mappings

View File

@@ -25,7 +25,7 @@ from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
@@ -54,7 +54,7 @@ class Ernie4_5_MLP(nn.Layer):
reduce_results: bool = True,
) -> None:
super().__init__()
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
self.gate_up_proj = MergedColumnParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.up_gate_proj",
@@ -179,16 +179,16 @@ class Ernie4_5_MoE(nn.Layer):
self.fused_moe = FusedMoE(
fd_config=fd_config,
moe_intermediate_size=fd_config.moe_config.moe_intermediate_size,
num_experts=fd_config.moe_config.num_experts,
top_k=fd_config.moe_config.top_k,
moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
num_experts=fd_config.model_config.moe_num_experts,
top_k=fd_config.model_config.moe_k,
layer_idx=layer_id,
weight_key_map=weight_key_map,
)
self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
self.num_shared_experts = fd_config.model_config.moe_num_shared_experts
if self.num_shared_experts > 0:
shared_experts_hidden_dim = self.num_shared_experts * fd_config.moe_config.moe_intermediate_size
shared_experts_hidden_dim = self.num_shared_experts * fd_config.model_config.moe_intermediate_size
self.shared_experts = Ernie4_5_MLP(
fd_config=fd_config,
intermediate_size=shared_experts_hidden_dim,
@@ -271,8 +271,8 @@ class Ernie4_5_DecoderLayer(nn.Layer):
prefix=f"{prefix}.self_attn",
)
if (fd_config.moe_config.num_experts is not None
and layer_id >= fd_config.moe_config.moe_layer_start_index):
if (fd_config.model_config.moe_num_experts is not None
and layer_id >= fd_config.model_config.moe_layer_start_index):
self.mlp = Ernie4_5_MoE(
fd_config=fd_config,
layer_id=layer_id,
@@ -281,7 +281,7 @@ class Ernie4_5_DecoderLayer(nn.Layer):
else:
self.mlp = Ernie4_5_MLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
intermediate_size=fd_config.model_config.intermediate_size,
prefix=f"{prefix}.mlp",
)
@@ -346,20 +346,20 @@ class Ernie4_5_Model(nn.Layer):
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "ernie"
self.num_layers = fd_config.model_config.num_hidden_layers
fd_config.model_config.pretrained_config.prefix_name = "ernie"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype(),
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"))
prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"))
self.hidden_layers = nn.LayerList([
Ernie4_5_DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -367,7 +367,7 @@ class Ernie4_5_Model(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=fd_config.model_config.rms_norm_eps,
prefix=f"{fd_config.model_config.prefix_name}.norm",
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -466,8 +466,8 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
shape=[0, self.fd_config.model_config.hidden_size],
dtype=paddle.get_default_dtype(),
)
for i in range(self.fd_config.moe_config.moe_layer_start_index,
self.fd_config.model_config.num_layers):
for i in range(self.fd_config.model_config.moe_layer_start_index,
self.fd_config.model_config.num_hidden_layers):
self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
def forward(
@@ -559,7 +559,7 @@ class Ernie4_5_PretrainedModel(PretrainedModel):
]
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
def _get_tensor_parallel_mappings(cls, config, is_split=True):
"""
get_tensor_parallel_mappings
"""
@@ -603,7 +603,7 @@ class Ernie4_5_PretrainedModel(PretrainedModel):
)
return final_actions
mappings = get_tensor_parallel_split_mappings(
config.num_layers,
config.num_hidden_layers,
config.moe_num_experts,
config.moe_layer_start_index,
config.prefix_name,

View File

@@ -25,12 +25,12 @@ from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.forward_meta import ForwardMeta
class Ernie4_5_MTPPretrainedModel(PretrainedModel):
@@ -47,7 +47,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel):
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
def _get_tensor_parallel_mappings(cls, config, is_split=True):
"""
get_tensor_parallel_mappings
"""
@@ -237,7 +237,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel):
moe_num_experts = 0
mappings = get_tensor_parallel_split_mappings(
config.num_layers,
config.num_hidden_layers,
moe_num_experts,
config.moe_layer_start_index,
)
@@ -262,13 +262,13 @@ class Ernie4_5_MTPModel(nn.Layer):
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
self.num_layers = fd_config.model_config.num_hidden_layers
self.embeddings = fd_config.speculative_config.sharing_model.model.embeddings
self.hidden_layers = nn.LayerList([
Ernie4_5_DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.{i}")
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.{i}")
for i in range(self.num_layers)
])
@@ -398,8 +398,8 @@ class Ernie4_5_MTPForCausalLM(ModelForCasualLM):
shape=[0, self.fd_config.model_config.hidden_size],
dtype=paddle.get_default_dtype(),
)
for i in range(self.fd_config.moe_config.moe_layer_start_index,
self.fd_config.model_config.num_layers):
for i in range(self.fd_config.model_config.moe_layer_start_index,
self.fd_config.model_config.num_hidden_layers):
self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
def forward(

View File

@@ -1,167 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import copy
from fastdeploy.config import ModelConfig
from .dfnrope.modeling import DFNRopeVisionTransformerConfig
__all__ = [
"Ernie4_5_VLMoeConfig",
]
class Ernie4_5_VLMoeConfig(ModelConfig):
r"""
This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Ernie-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
Example:
```python
>>> from paddleformers.transformer import ErnieModel, ErnieConfig
>>> # Initializing a Ernie ernie-7b style configuration
>>> configuration = ErnieConfig()
>>> # Initializing a model from the ernie-7b style configuration
>>> model = ErnieModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "erniemoevl"
attribute_map = {
"n_positions": "max_position_embeddings",
"n_embd": "hidden_size",
"n_layer": "num_hidden_layers",
"n_head": "num_attention_heads",
"n_inner": "intermediate_size",
"activation_function": "hidden_act",
}
def __init__(
self,
vision_config=None,
im_patch_id=None,
pixel_hidden_size=None, # None for fuyu
modality_detach=False,
temporal_conv_size=2,
spatial_conv_size=2,
mm_vocab_size=0, # vocab for mm specialtokens
max_text_id=None,
use_temporal_conv=True,
moe_use_size_all2all=False,
moe_num_attn_experts=False,
moe_dense_experts_token_type_id: int = 3,
moe_use_hard_gate: bool = True,
moe_fuse_experts: bool = False,
moe_use_token_type_bias: bool = False,
disable_ffn_model_parallel=False,
fuse_attn_ffn=True,
rope_3d=True,
freq_allocation=20,
using_precision_check=False,
use_recompute_resampler=False,
resampler_fuse_rms_norm=False,
moe_layer_feed_fake_token=False,
moe_num_experts=0,
**kwargs,
):
super().__init__(**kwargs)
self.vision_config = DFNRopeVisionTransformerConfig(
**vision_config) if vision_config else None
self.im_patch_id = im_patch_id
self.pixel_hidden_size = pixel_hidden_size
self.modality_detach = modality_detach
self.temporal_conv_size = temporal_conv_size
self.spatial_conv_size = spatial_conv_size
self.mm_vocab_size = mm_vocab_size
self.max_text_id = max_text_id
self.use_temporal_conv = use_temporal_conv
self.moe_use_size_all2all = moe_use_size_all2all
self.moe_num_attn_experts = moe_num_attn_experts
self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
self.moe_use_hard_gate = moe_use_hard_gate
self.moe_fuse_experts = moe_fuse_experts
self.moe_use_token_type_bias = moe_use_token_type_bias
self.disable_ffn_model_parallel = disable_ffn_model_parallel
self.fuse_attn_ffn = fuse_attn_ffn
self.rope_3d = rope_3d
self.freq_allocation = freq_allocation
self.using_precision_check = using_precision_check
self.use_recompute_resampler = use_recompute_resampler
self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
self.moe_num_experts = moe_num_experts
@property
def multimodel_experts(self) -> bool:
"""是否有多种类型的experts."""
return isinstance(self.moe_num_experts,
(tuple, list)) and len(self.moe_num_experts) > 1
@property
def use_moe(self) -> bool:
"""
Check if model is using MoE architecture.
Returns:
bool: True if moe_num_experts > 0, False otherwise
"""
return sum(
self.moe_num_experts
) > 0 if self.multimodel_experts else self.moe_num_experts > 0
def to_dict(self, saving_file=False):
"""to_dict"""
output = copy.deepcopy(self.__dict__)
if self.vision_config:
output["vision_config"] = (
self.vision_config.to_diff_dict() if isinstance(
self.vision_config,
(DFNRopeVisionTransformerConfig)) else self.vision_config)
output["model_type"] = self.__class__.model_type
return output

View File

@@ -72,8 +72,8 @@ class Ernie4_5_VLMoE(nn.Layer):
prefix: str) -> None:
super().__init__()
self.tp_size = fd_config.parallel_config.tensor_parallel_degree
moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
self.tp_size = fd_config.parallel_config.tensor_parallel_size
moe_layer_start_index = fd_config.model_config.moe_layer_start_index
if isinstance(moe_layer_start_index, int):
text_moe_layer_start_index = moe_layer_start_index
image_moe_layer_start_index = moe_layer_start_index
@@ -81,10 +81,10 @@ class Ernie4_5_VLMoE(nn.Layer):
text_moe_layer_start_index = moe_layer_start_index[0]
image_moe_layer_start_index = moe_layer_start_index[1]
moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
moe_layer_end_index = fd_config.model_config.moe_layer_end_index
if moe_layer_end_index is None:
text_moe_layer_end_index = fd_config.model_config.num_layers
image_moe_layer_end_index = fd_config.model_config.num_layers
text_moe_layer_end_index = fd_config.model_config.num_hidden_layers
image_moe_layer_end_index = fd_config.model_config.num_hidden_layers
elif isinstance(moe_layer_end_index, int):
text_moe_layer_end_index = moe_layer_end_index
image_moe_layer_end_index = moe_layer_end_index
@@ -107,11 +107,11 @@ class Ernie4_5_VLMoE(nn.Layer):
self.mlp_text = FusedMoE(
fd_config=fd_config,
reduce_results=False,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size=fd_config.model_config.
moe_intermediate_size[0],
num_experts=fd_config.moe_config.num_experts[0],
num_experts=fd_config.model_config.moe_num_experts[0],
expert_id_offset=0,
top_k=fd_config.moe_config.top_k,
top_k=fd_config.model_config.moe_k,
layer_idx=layer_id,
moe_tag="Text",
weight_key_map=weight_key_map,
@@ -120,7 +120,7 @@ class Ernie4_5_VLMoE(nn.Layer):
else:
self.mlp_text = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
intermediate_size=fd_config.model_config.intermediate_size,
prefix=f"{prefix}",
)
@@ -139,11 +139,11 @@ class Ernie4_5_VLMoE(nn.Layer):
self.mlp_image = FusedMoE(
fd_config=fd_config,
reduce_results=False,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size=fd_config.model_config.
moe_intermediate_size[1],
num_experts=fd_config.moe_config.num_experts[1],
expert_id_offset=fd_config.moe_config.num_experts[0],
top_k=fd_config.moe_config.top_k,
num_experts=fd_config.model_config.moe_num_experts[1],
expert_id_offset=fd_config.model_config.moe_num_experts[0],
top_k=fd_config.model_config.moe_k,
layer_idx=layer_id,
moe_tag="Image",
weight_key_map=weight_key_map,
@@ -152,16 +152,16 @@ class Ernie4_5_VLMoE(nn.Layer):
else:
self.mlp_image = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
intermediate_size=fd_config.model_config.intermediate_size,
prefix=f"{prefix}",
)
self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
self.num_shared_experts = fd_config.model_config.moe_num_shared_experts
if self.num_shared_experts > 0:
self.share_experts = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=self.num_shared_experts *
fd_config.moe_config.moe_intermediate_size[0],
fd_config.model_config.moe_intermediate_size[0],
prefix=f"{prefix}.shared_experts",
reduce_results=False,
)
@@ -235,15 +235,15 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
moe_layer_start_index = fd_config.model_config.moe_layer_start_index
if isinstance(moe_layer_start_index, list):
min_moe_layer_start_index = min(moe_layer_start_index)
else:
min_moe_layer_start_index = moe_layer_start_index
max_moe_layer_end_index = fd_config.model_config.num_layers
if fd_config.moe_config.moe_layer_end_index is not None:
moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
max_moe_layer_end_index = fd_config.model_config.num_hidden_layers
if fd_config.model_config.moe_layer_end_index is not None:
moe_layer_end_index = fd_config.model_config.moe_layer_end_index
if isinstance(moe_layer_start_index, list):
max_moe_layer_end_index = max(moe_layer_end_index)
else:
@@ -257,7 +257,7 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
assert min_moe_layer_start_index <= max_moe_layer_end_index
if (fd_config.moe_config.num_experts is not None
if (fd_config.model_config.moe_num_experts is not None
and layer_id >= min_moe_layer_start_index
and layer_id <= max_moe_layer_end_index):
self.mlp = Ernie4_5_VLMoE(
@@ -268,7 +268,7 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
else:
self.mlp = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=fd_config.model_config.ffn_hidden_size,
intermediate_size=fd_config.model_config.intermediate_size,
prefix=f"{prefix}.mlp",
)
@@ -337,23 +337,23 @@ class Ernie4_5_VLModel(nn.Layer):
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
self.im_patch_id = fd_config.moe_config.im_patch_id
self.num_layers = fd_config.model_config.num_hidden_layers
self.im_patch_id = fd_config.model_config.im_patch_id
self._dtype = fd_config.model_config.dtype
fd_config.model_config.prefix_name = "ernie"
fd_config.model_config.pretrained_config.prefix_name = "ernie"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.hidden_layers = nn.LayerList([
Ernie4_5_VLDecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -361,7 +361,7 @@ class Ernie4_5_VLModel(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=fd_config.model_config.rms_norm_eps,
prefix=f"{fd_config.model_config.prefix_name}.norm",
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -748,7 +748,7 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel):
moe_layer_start_index = config.moe_layer_start_index
mappings = get_tensor_parallel_split_mappings(
config.num_layers,
config.num_hidden_layers,
config.moe_num_experts,
moe_layer_start_index,
config.prefix_name,

View File

@@ -53,7 +53,7 @@ class ModelForCasualLM(nn.Layer, ABC):
"""
Args:
configs (dict): Configurations including parameters such as max_dec_len, min_dec_len, decode_strategy,
ori_vocab_size, use_topp_sampling, etc.
vocab_size, use_topp_sampling, etc.
"""
super(ModelForCasualLM, self).__init__()
self.fd_config = configs

View File

@@ -24,6 +24,7 @@ from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
@@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.forward_meta import ForwardMeta
class Qwen2MLP(nn.Layer):
@@ -47,12 +47,12 @@ class Qwen2MLP(nn.Layer):
prefix: str = "",
) -> None:
super().__init__()
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
self.gate_up_proj = MergedColumnParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.ffn_hidden_size * 2,
output_size=fd_config.model_config.intermediate_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
)
@@ -60,7 +60,7 @@ class Qwen2MLP(nn.Layer):
self.down_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.down_proj",
input_size=fd_config.model_config.ffn_hidden_size,
input_size=fd_config.model_config.intermediate_size,
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
@@ -227,21 +227,21 @@ class Qwen2Model(nn.Layer):
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "qwen2"
self.num_layers = fd_config.model_config.num_hidden_layers
fd_config.model_config.pretrained_config.prefix_name = "qwen2"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen2DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -249,7 +249,7 @@ class Qwen2Model(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=fd_config.model_config.rms_norm_eps,
prefix=f"{fd_config.model_config.prefix_name}.norm",
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -427,6 +427,6 @@ class Qwen2PretrainedModel(PretrainedModel):
return final_actions
mappings = get_tensor_parallel_split_mappings(config.num_layers)
mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
return mappings
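
get_tensor_parallel_split_mappings, now called with config.num_hidden_layers, expands per-layer key templates into one split action per decoder layer. A simplified sketch of that expansion with a made-up action table (the real table maps weight names to split functions from paddleformers):

def expand_layer_mappings(num_hidden_layers: int) -> dict:
    """Replicate 'layers.0.*' template keys for every decoder layer."""
    # Hypothetical stand-ins for the real split functions.
    base_actions = {
        "layers.0.self_attn.qkv_proj.weight": "split_column",
        "layers.0.mlp.up_gate_proj.weight": "split_column",
        "layers.0.mlp.down_proj.weight": "split_row",
    }
    final_actions = {}
    for key, action in base_actions.items():
        for i in range(num_hidden_layers):
            final_actions[key.replace("layers.0.", f"layers.{i}.")] = action
    return final_actions

mappings = expand_layer_mappings(2)
assert "layers.1.mlp.down_proj.weight" in mappings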

View File

@@ -23,7 +23,8 @@ from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.attention.attention import Attention
@@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
from fastdeploy.model_executor.forward_meta import ForwardMeta
class Qwen3MLP(Qwen2MLP):
@@ -59,7 +59,7 @@ class Qwen3Attention(nn.Layer):
self.qkv_proj = QKVParallelLinear(fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=False)
nranks = fd_config.parallel_config.tensor_parallel_degree
nranks = fd_config.parallel_config.tensor_parallel_size
self.o_proj = RowParallelLinear(
fd_config,
@@ -85,7 +85,7 @@ class Qwen3Attention(nn.Layer):
prefix=f"{prefix}.k_norm",
begin_norm_axis=2)
nranks = fd_config.parallel_config.tensor_parallel_degree
nranks = fd_config.parallel_config.tensor_parallel_size
num_kv_heads_replicas = max(1, nranks // fd_config.model_config.num_key_value_heads)
self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks
self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim * num_kv_heads_replicas // nranks
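
The attention hunk above sizes the fused QKV projection per rank: query heads split evenly across ranks, while KV heads are replicated whenever there are more ranks than KV heads. A small sketch of those per-rank widths, assuming the same formulas:

def qkv_sizes_per_rank(num_attention_heads: int, num_key_value_heads: int,
                       head_dim: int, nranks: int) -> tuple:
    """Per-rank query and key/value widths of the fused QKV projection."""
    num_kv_heads_replicas = max(1, nranks // num_key_value_heads)
    q_size = num_attention_heads * head_dim // nranks
    kv_size = num_key_value_heads * head_dim * num_kv_heads_replicas // nranks
    return q_size, kv_size

# 32 query heads, 8 KV heads, head_dim=128, 16 ranks: each rank keeps
# 2 query heads (width 256) and one replicated KV head (width 128).
assert qkv_sizes_per_rank(32, 8, 128, 16) == (256, 128)
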
@@ -163,21 +163,21 @@ class Qwen3Model(nn.Layer):
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "model"
self.num_layers = fd_config.model_config.num_hidden_layers
fd_config.model_config.pretrained_config.prefix_name = "model"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config=fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -185,7 +185,7 @@ class Qwen3Model(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=fd_config.model_config.rms_norm_eps,
prefix=f"{fd_config.model_config.prefix_name}.norm",
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -307,7 +307,7 @@ class Qwen3PretrainedModel(PretrainedModel):
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
def _get_tensor_parallel_mappings(cls, config, is_split=True):
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
@@ -358,5 +358,5 @@ class Qwen3PretrainedModel(PretrainedModel):
return final_actions
mappings = get_tensor_parallel_split_mappings(config.num_layers)
mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
return mappings

View File

@@ -23,20 +23,19 @@ from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (
MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
MergedColumnParallelLinear, RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.qwen3 import Qwen3Attention
from fastdeploy.model_executor.forward_meta import ForwardMeta
class Qwen3MLP(nn.Layer):
@@ -49,13 +48,13 @@ class Qwen3MLP(nn.Layer):
prefix: str = "",
) -> None:
super().__init__()
self.nranks = fd_config.parallel_config.tensor_parallel_degree
self.nranks = fd_config.parallel_config.tensor_parallel_size
self.gate_up_proj = MergedColumnParallelLinear(
fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
output_size=fd_config.model_config.ffn_hidden_size * 2,
output_size=fd_config.model_config.intermediate_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
)
@@ -63,7 +62,7 @@ class Qwen3MLP(nn.Layer):
self.down_proj = RowParallelLinear(
fd_config,
prefix=f"{prefix}.down_proj",
input_size=fd_config.model_config.ffn_hidden_size,
input_size=fd_config.model_config.intermediate_size,
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
@@ -115,14 +114,14 @@ class Qwen3DecoderLayer(nn.Layer):
f"{prefix}.mlp.experts.{{}}.down_proj.weight",
}
if (fd_config.moe_config.num_experts is not None
and layer_id >= fd_config.moe_config.moe_layer_start_index):
if (fd_config.model_config.moe_num_experts is not None
and layer_id >= fd_config.model_config.moe_layer_start_index):
self.mlp = FusedMoE(fd_config,
moe_intermediate_size=fd_config.moe_config.
moe_intermediate_size=fd_config.model_config.
moe_intermediate_size,
num_experts=fd_config.moe_config.num_experts,
top_k=fd_config.moe_config.top_k,
num_experts=fd_config.model_config.moe_num_experts,
top_k=fd_config.model_config.moe_topk,
layer_idx=layer_id,
weight_key_map=weight_key_map)
else:
@@ -199,21 +198,21 @@ class Qwen3MoeModel(nn.Layer):
"""
super().__init__()
self.num_layers = fd_config.model_config.num_layers
fd_config.model_config.prefix_name = "model"
self.num_layers = fd_config.model_config.num_hidden_layers
fd_config.model_config.pretrained_config.prefix_name = "model"
self.embeddings = VocabParallelEmbedding(
fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config,
prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -221,7 +220,7 @@ class Qwen3MoeModel(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
prefix=f"{fd_config.model_config.prefix_name}.norm",
prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -338,7 +337,7 @@ class Qwen3MoePretrainedModel(PretrainedModel):
return None
@classmethod
def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
def _get_tensor_parallel_mappings(cls, config, is_split=True):
# TODO not support TP split now, next PR will support TP.
from paddleformers.transformers.conversion_utils import \
@@ -351,7 +350,7 @@ class Qwen3MoePretrainedModel(PretrainedModel):
num_attention_heads=config.num_attention_heads,
)
def get_tensor_parallel_split_mappings(num_layers, moe_num_experts):
def get_tensor_parallel_split_mappings(num_layers, num_experts):
final_actions = {}
base_actions = {
@@ -402,23 +401,23 @@ class Qwen3MoePretrainedModel(PretrainedModel):
for key, action in base_actions.items():
for i in range(num_layers):
newkey = key.replace("layers.0.", f"layers.{i}.")
for j in range(moe_num_experts):
for j in range(num_experts):
newkey2 = newkey.replace("experts.0.", f"experts.{j}.")
final_actions[newkey2] = action
return final_actions
moe_num_experts = 0
num_experts = 0
if isinstance(config.moe_num_experts, list):
moe_num_experts = sum(config.moe_num_experts)
num_experts = sum(config.moe_num_experts)
elif isinstance(config.moe_num_experts, int):
moe_num_experts = config.moe_num_experts
num_experts = config.moe_num_experts
else:
raise ValueError(
f"Not support type of moe_num_experts [{type(config.moe_num_experts)}]"
f"Not support type of num_experts [{type(config.moe_num_experts)}]"
)
mappings = get_tensor_parallel_split_mappings(config.num_layers,
moe_num_experts)
mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers,
num_experts)
return mappings
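
As the last hunk shows, moe_num_experts may be a single int or a per-modality list, and the mapping builder needs a flat count. A short sketch of that normalization under the same assumptions:

from typing import List, Union

def total_num_experts(moe_num_experts: Union[int, List[int]]) -> int:
    """Flatten moe_num_experts into one expert count for weight mappings."""
    if isinstance(moe_num_experts, list):
        return sum(moe_num_experts)
    if isinstance(moe_num_experts, int):
        return moe_num_experts
    raise ValueError(
        f"Unsupported type of num_experts [{type(moe_num_experts)}]")

assert total_num_experts(64) == 64
assert total_num_experts([64, 64]) == 128   # e.g. text + image experts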

View File

@@ -36,10 +36,9 @@ def check_tensor_parallel_prerequisites(
safetensor_keys: List[str],
) -> None:
"""check_tensor_parallel_prerequisites"""
if fd_config.parallel_config.tensor_parallel_degree > 1:
if fd_config.parallel_config.tensor_parallel_size > 1:
tensor_parallel_map = cls._get_tensor_parallel_mappings(
fd_config.model_config, is_split=True
)
fd_config.model_config.pretrained_config, is_split=True)
if not tensor_parallel_map:
logger.error(
"filtered_quant_map should not be empty. \

View File

@@ -165,7 +165,7 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = \
f"{train_base_name}.{layer_idx}.mlp.gate.weight"
if self.fd_config.moe_config.moe_use_aux_free:
if self.fd_config.model_config.moe_use_aux_free:
infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = \
f"{train_base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
@@ -178,7 +178,7 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
f"{train_base_name}.{layer_idx}.mlp.shared_experts.down_proj.weight"
# MoE experts mappings
for expert_idx in range(self.fd_config.moe_config.num_experts):
for expert_idx in range(self.fd_config.model_config.moe_num_experts):
for ph in place_holders:
# FFN1 (up_gate_proj)
ffn1_key = f"{infer_base_name}.{layer_idx}.mlp.fused_moe.moe_ffn1_weight"
@@ -198,12 +198,12 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
# Process non-MoE layers
for layer_idx in range(
self.fd_config.moe_config.moe_layer_start_index):
self.fd_config.model_config.moe_layer_start_index):
_add_layer_mappings(layer_idx, is_moe_layer=False)
# Process MoE layers
for layer_idx in range(self.fd_config.moe_config.moe_layer_start_index,
self.fd_config.model_config.num_layers):
for layer_idx in range(self.fd_config.model_config.moe_layer_start_index,
self.fd_config.model_config.num_hidden_layers):
_add_layer_mappings(layer_idx, is_moe_layer=True)
return infer_to_train
@@ -278,7 +278,7 @@ class Qwen2ForCausalLMRL(Qwen2ForCausalLM):
f"{train_base_name}.{layer_idx}.mlp.down_proj.{ph}"
for layer_idx in range(
self.fd_config.model_config.num_layers):
self.fd_config.model_config.num_hidden_layers):
_add_layer_mappings(layer_idx)
return infer_to_train
@@ -396,7 +396,7 @@ class Qwen3MoeForCausalLMRL(Qwen3MoeForCausalLM):
)
# Process MoE layers
for layer_idx in range(self.fd_config.model_config.num_layers):
for layer_idx in range(self.fd_config.model_config.num_hidden_layers):
_add_layer_mappings(layer_idx, is_moe_layer=True)
return infer_to_train
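
The RL mappings above walk every expert of every MoE layer to pair fused inference weight names with per-expert training weight names. A condensed sketch of that double loop, with "model." as a stand-in base prefix (the real prefixes come from the model classes):

def build_moe_infer_to_train(moe_layer_start_index: int,
                             num_hidden_layers: int,
                             moe_num_experts: int) -> dict:
    """Pair fused-MoE inference weights with per-expert training weights."""
    infer_to_train = {}
    for layer_idx in range(moe_layer_start_index, num_hidden_layers):
        infer_to_train[f"model.layers.{layer_idx}.mlp.fused_moe.gate_weight"] = \
            f"model.layers.{layer_idx}.mlp.gate.weight"
        # The fused FFN1 weight gathers every expert's up_gate_proj.
        infer_to_train[f"model.layers.{layer_idx}.mlp.fused_moe.moe_ffn1_weight"] = [
            f"model.layers.{layer_idx}.mlp.experts.{e}.up_gate_proj.weight"
            for e in range(moe_num_experts)
        ]
    return infer_to_train

mapping = build_moe_infer_to_train(1, 3, 2)
assert len(mapping["model.layers.1.mlp.fused_moe.moe_ffn1_weight"]) == 2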

View File

@@ -21,6 +21,7 @@ import numpy as np
import paddle
from fastdeploy.engine.request import Request
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.attention import get_attention_backend
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
AttentionBackend
@@ -36,7 +37,6 @@ from fastdeploy.model_executor.ops.gpu import (draft_model_postprocess,
share_external_data)
from fastdeploy.model_executor.pre_and_post_process import (pre_process,
rebuild_padding)
from fastdeploy.model_executor.forward_meta import ForwardMeta
from .base import Proposer
@@ -49,7 +49,7 @@ class MTPProposer(Proposer):
def __init__(self, cfg, main_model, local_rank, device_id,
main_model_inputs):
super().__init__(cfg)
self.num_main_model_layers = self.model_config.num_layers
self.num_main_model_layers = self.model_config.num_hidden_layers
self.local_rank = local_rank
self.device_id = device_id
self._update_cfg(main_model)
@@ -70,10 +70,10 @@ class MTPProposer(Proposer):
"""
self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM"
self.speculative_config.sharing_model = main_model
self.model_config.num_layers = 1
self.model_config.num_hidden_layers = 1
self.parallel_config.model_name_or_path = (
self.speculative_config.model_name_or_path)
self.model_config.prefix_name = "ernie.mtp_block"
self.model_config.pretrained_config.prefix_name = "ernie.mtp_block"
if self.speculative_config.quantization != "":
self.model_config.quantization = (
self.speculative_config.quantization)
@@ -145,7 +145,7 @@ class MTPProposer(Proposer):
cache_kvs_list = []
for i in range(
self.num_main_model_layers,
self.num_main_model_layers + self.model_config.num_layers):
self.num_main_model_layers + self.model_config.num_hidden_layers):
key_cache = paddle.empty(shape=[], dtype=cache_type)
key_cache_name = f"key_caches_{i}_rank{self.local_rank}.device{self.device_id}"
val_cache_name = f"value_caches_{i}_rank{self.local_rank}.device{self.device_id}"
@@ -159,7 +159,7 @@ class MTPProposer(Proposer):
self.model_inputs["caches"] = cache_kvs_list
else:
for i in range(self.model_config.num_layers):
for i in range(self.model_config.num_hidden_layers):
self.cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
fill_value=0,
@@ -183,10 +183,10 @@ class MTPProposer(Proposer):
# TODO(gongshaotian): Get rank from config
num_heads = (self.model_config.num_attention_heads //
self.parallel_config.tensor_parallel_degree)
self.parallel_config.tensor_parallel_size)
self.model_config.kv_num_heads = (
int(self.model_config.num_key_value_heads) //
self.parallel_config.tensor_parallel_degree)
self.parallel_config.tensor_parallel_size)
head_dim = self.model_config.head_dim
# Get the attention backend
@@ -608,7 +608,7 @@ class MTPProposer(Proposer):
self.model_inputs,
)
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampled_token_ids, 0)
self._post_process(sampled_token_ids)
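
The cache-allocation hunks above index the draft model's KV caches after the main model's layers, from num_main_model_layers up to num_main_model_layers + num_hidden_layers. A tiny sketch of that naming scheme, using placeholder rank and device ids:

def draft_cache_names(num_main_model_layers: int,
                      num_draft_layers: int,
                      local_rank: int = 0,
                      device_id: int = 0) -> list:
    """Name the KV caches the draft (MTP) model appends after the main model."""
    names = []
    for i in range(num_main_model_layers,
                   num_main_model_layers + num_draft_layers):
        names.append(f"key_caches_{i}_rank{local_rank}.device{device_id}")
        names.append(f"value_caches_{i}_rank{local_rank}.device{device_id}")
    return names

# A 28-layer main model with a single-layer MTP head adds caches for layer 28.
assert draft_cache_names(28, 1)[0] == "key_caches_28_rank0.device0"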

View File

@@ -670,7 +670,7 @@ class GCUModelRunner(ModelRunnerBase):
# Get kv cache shape
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
max_num_blocks=max_block_num)
# local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree
# local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
if not self.parallel_config.do_profile and (
self.parallel_config.enable_prefix_caching \
@@ -679,7 +679,7 @@ class GCUModelRunner(ModelRunnerBase):
"prefix_caching is not support by GCUModelRunner."
)
else:
for i in range(self.model_config.num_layers):
for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
@@ -701,10 +701,10 @@ class GCUModelRunner(ModelRunnerBase):
"""
assert len(self.attn_backends) == 0
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = int(
self.model_config.num_key_value_heads
) // self.parallel_config.tensor_parallel_degree
) // self.parallel_config.tensor_parallel_size
head_dim = self.model_config.head_dim
# Get the attention backend
@@ -783,14 +783,14 @@ class GCUModelRunner(ModelRunnerBase):
)
sampler_output = self.sampler(logits,
self.sampling_metadata)
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len,
self.share_inputs)
sampler_output = None
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(
@@ -1016,14 +1016,14 @@ class GCUModelRunner(ModelRunnerBase):
self.sampling_metadata,
skip_idx_list,
)
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len, self.share_inputs)
sampler_output = None
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(self.share_inputs["accept_num"],
@@ -1192,11 +1192,11 @@ class GCUModelRunner(ModelRunnerBase):
byte_of_dtype = 2
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
num_layers = self.model_config.num_layers + \
num_layers = self.model_config.num_hidden_layers + \
self.speculative_config.num_gpu_block_expand_ratio if \
self.speculative_method in [
"mtp"
] else self.model_config.num_layers
] else self.model_config.num_hidden_layers
required_memory = (
byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) * num_layers)
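
The required_memory expression above is the per-block KV footprint: bytes per element, key plus value, block_size tokens, hidden_dim = head_dim * kv_num_heads, across all layers (with the extra MTP layers added when speculative decoding is enabled). A worked sketch with illustrative sizes:

def kv_block_bytes(head_dim: int, kv_num_heads: int, block_size: int,
                   num_hidden_layers: int, byte_of_dtype: int = 2) -> int:
    """Bytes of KV cache one block consumes across every layer."""
    hidden_dim = head_dim * kv_num_heads
    return byte_of_dtype * 2 * (block_size * hidden_dim) * num_hidden_layers

# e.g. head_dim=128, 8 KV heads, 64-token blocks, 28 layers -> 7 MiB per block.
assert kv_block_bytes(128, 8, 64, 28) == 2 * 2 * 64 * 128 * 8 * 28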

View File

@@ -259,7 +259,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["min_dec_len"][idx:idx + 1] = request.get(
"min_tokens", 1)
self.share_inputs["max_dec_len"][idx:idx + 1] = request.get(
"max_tokens", self.model_config.max_length)
"max_tokens", self.model_config.max_model_len)
self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["first_token_ids"][
@@ -375,11 +375,11 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["min_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.max_length, dtype='int64')
[max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["min_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.max_length, dtype='int64')
[max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs,
0,
dtype='int32')
@@ -666,13 +666,13 @@ class GPUModelRunner(ModelRunnerBase):
# Get kv cache shape
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
max_num_blocks=max_block_num)
local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree
local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
if not self.parallel_config.do_profile and (
self.parallel_config.enable_prefix_caching \
or self.parallel_config.splitwise_role != "mixed"):
cache_kvs_list = []
for i in range(self.model_config.num_layers):
for i in range(self.model_config.num_hidden_layers):
key_cache = paddle.empty(shape=[], dtype=cache_type)
key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}"
val_cache_name = f"value_caches_{i}_rank{local_rank}.device{self.device_id}"
@@ -687,7 +687,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["caches"] = cache_kvs_list
else:
for i in range(self.model_config.num_layers):
for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
@@ -710,10 +710,10 @@ class GPUModelRunner(ModelRunnerBase):
"""
assert len(self.attn_backends) == 0
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = max(1, int(
self.model_config.num_key_value_heads
) // self.parallel_config.tensor_parallel_degree)
) // self.parallel_config.tensor_parallel_size)
head_dim = self.model_config.head_dim
# Get the attention backend
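
The attention-backend hunk above divides both query heads and KV heads by the tensor-parallel size, clamping KV heads to at least one when the GQA group is replicated across ranks. A minimal sketch of that split:

def split_heads(num_attention_heads: int, num_key_value_heads: int,
                tensor_parallel_size: int) -> tuple:
    """Per-rank query and KV head counts under tensor parallelism."""
    num_heads = num_attention_heads // tensor_parallel_size
    kv_num_heads = max(1, num_key_value_heads // tensor_parallel_size)
    return num_heads, kv_num_heads

# 32 query heads and 4 KV heads on 8 ranks: each rank keeps 4 query heads
# and replicates a single KV head.
assert split_heads(32, 4, 8) == (4, 1)
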
@@ -787,14 +787,14 @@ class GPUModelRunner(ModelRunnerBase):
)
sampler_output = self.sampler(logits,
self.sampling_metadata)
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len,
self.share_inputs)
sampler_output = None
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(
@@ -1021,14 +1021,14 @@ class GPUModelRunner(ModelRunnerBase):
self.sampling_metadata,
skip_idx_list,
)
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len, self.share_inputs)
sampler_output = None
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(self.share_inputs["accept_num"],
@@ -1206,11 +1206,11 @@ class GPUModelRunner(ModelRunnerBase):
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
# NOTE(liuzichang): Implement multi-layer MTP architecture in the future
num_layers = self.model_config.num_layers + \
num_layers = self.model_config.num_hidden_layers + \
self.speculative_config.num_gpu_block_expand_ratio if \
self.speculative_method in [
"mtp"
] else self.model_config.num_layers
] else self.model_config.num_hidden_layers
required_memory = (
byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) * num_layers)

View File

@@ -648,7 +648,7 @@ class IluvatarModelRunner(ModelRunnerBase):
or self.parallel_config.splitwise_role != "mixed"):
raise NotImplementedError("Iluvatar does not support yet")
else:
for i in range(self.model_config.num_layers):
for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
@@ -672,11 +672,11 @@ class IluvatarModelRunner(ModelRunnerBase):
assert len(self.attn_backends) == 0
# TODO(gongshaotian): Get rank from config
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = max(
1,
int(self.model_config.num_key_value_heads) //
self.parallel_config.tensor_parallel_degree)
self.parallel_config.tensor_parallel_size)
head_dim = self.model_config.head_dim
# Get the attention backend
@@ -748,14 +748,14 @@ class IluvatarModelRunner(ModelRunnerBase):
)
sampled_token_ids = self.sampler(logits,
self.sampling_metadata)
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len,
self.share_inputs)
sampled_token_ids = None
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(
@@ -977,14 +977,14 @@ class IluvatarModelRunner(ModelRunnerBase):
self.sampling_metadata,
skip_idx_list,
)
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len, self.share_inputs)
sampled_token_ids = None
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(self.share_inputs["accept_num"],
@@ -1145,11 +1145,11 @@ class IluvatarModelRunner(ModelRunnerBase):
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
# NOTE(liuzichang): Implement multi-layer MTP architecture in the future
num_layers = self.model_config.num_layers + \
num_layers = self.model_config.num_hidden_layers + \
self.speculative_config.num_gpu_block_expand_ratio if \
self.speculative_method in [
"mtp"
] else self.model_config.num_layers
] else self.model_config.num_hidden_layers
required_memory = (
byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) * num_layers)

View File

@@ -29,8 +29,6 @@ from fastdeploy.model_executor.layers.attention import get_attention_backend
from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.sampler import Sampler
from fastdeploy.model_executor.models.ernie4_5_vl.configuration import \
Ernie4_5_VLMoeConfig
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \
ScatterOp
from fastdeploy.platforms import current_platform
@@ -221,9 +219,9 @@ class GPUVLModelRunner(VLModelRunnerBase):
fd_config = initialize_fd_config(
self.args, self.tensor_parallel_degree, self.tensor_parallel_rank
)
fd_config.model_config = Ernie4_5_VLMoeConfig(
**fd_config.model_config.__dict__
)
fd_config.model_config.tensor_parallel_degree=self.tensor_parallel_degree
fd_config.model_config.tensor_parallel_rank=self.tensor_parallel_rank
fd_config.model_config.moe_group="dummy"
fd_config.parallel_config.column_cut = False
vision_config = fd_config.model_config.vision_config
vision_config.attn_sep = False
@@ -237,8 +235,8 @@ class GPUVLModelRunner(VLModelRunnerBase):
fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"]
fd_config.model_config.max_text_id = fd_config.model_config.im_patch_id
fd_config.model_config.sequence_parallel = False
# TODO (bukejiyu): Remove the assignment
fd_config.moe_config.top_k = 8
# TODO(YuanRisheng): moe_k is currently hard-coded to 8 on develop; it should be read from the JSON config
fd_config.model_config.moe_k = 8
self.fd_config = fd_config
self.model_cfg = self.fd_config.model_config
self.image_preprocess = self._init_image_preprocess(
@@ -250,10 +248,10 @@ class GPUVLModelRunner(VLModelRunnerBase):
self.model = get_model_from_loader(self.fd_config)
attn_backend_cls = get_attention_backend()
num_heads = self.fd_config.model_config.num_attention_heads // \
self.fd_config.parallel_config.tensor_parallel_degree
self.fd_config.parallel_config.tensor_parallel_size
self.fd_config.model_config.kv_num_heads = int(
self.fd_config.model_config.num_key_value_heads
) // self.fd_config.parallel_config.tensor_parallel_degree
) // self.fd_config.parallel_config.tensor_parallel_size
head_dim = self.fd_config.model_config.head_dim
self.attn_backend = attn_backend_cls(
self.fd_config,
@@ -305,14 +303,10 @@ class GPUVLModelRunner(VLModelRunnerBase):
"""
cache_kvs = {}
total_block_num = self.num_gpu_blocks
num_layers = self.model_cfg.get("num_layers",
None) or self.model_cfg.get(
"num_hidden_layers", None)
num_layers = self.model_cfg.num_hidden_layers
kv_num_head = self.model_cfg.num_key_value_heads if self.model_cfg.num_key_value_heads != -1 else self.model_cfg.num_attention_heads
kv_num_head = self.model_cfg.get(
"num_key_value_heads",
self.model_cfg.num_attention_heads,
)
kv_num_head = kv_num_head // self.tensor_parallel_degree
self.model_cfg.kv_num_head = kv_num_head
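
With the simplified ModelConfig, num_key_value_heads defaults to -1 instead of being looked up with .get(); the VL runner above falls back to num_attention_heads in that case before splitting across ranks. A short sketch of that fallback under the same convention:

def resolve_kv_heads(num_key_value_heads: int, num_attention_heads: int,
                     tensor_parallel_degree: int) -> int:
    """Resolve the per-rank KV head count when the config uses -1 as 'unset'."""
    kv_num_head = (num_key_value_heads
                   if num_key_value_heads != -1 else num_attention_heads)
    return kv_num_head // tensor_parallel_degree

assert resolve_kv_heads(-1, 32, 4) == 8   # MHA checkpoint, no GQA field
assert resolve_kv_heads(8, 32, 4) == 2    # GQA checkpoint
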
@@ -647,7 +641,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
)
# sampler & save_output
sampler_output = self.sampler(logits, self.sampling_metadata)
if self.fd_config.parallel_config.tensor_parallel_degree > 1:
if self.fd_config.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
self.post_process(sampler_output)
@@ -740,9 +734,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
"""
Calculate the theoretical KV cache size
"""
num_layers = self.model_cfg.get("num_layers",
None) or self.model_cfg.get(
"num_hidden_layers", None)
num_layers = self.model_cfg.num_hidden_layers
byte_of_cache = 2
# support c8 c4

View File

@@ -22,11 +22,9 @@ import paddle
import paddle.distributed as dist
import paddle.distributed.fleet as fleet
from fastdeploy import envs
from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig,
GraphOptimizationConfig, LoadConfig,
ModelConfig, MoEConfig, MoEPhase,
ParallelConfig, SpeculativeConfig)
ModelConfig, ParallelConfig, SpeculativeConfig)
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.model_executor.layers.quantization import \
@@ -122,7 +120,7 @@ class PaddleDisWorkerProc():
self.task_queue = TaskQueue(
address=task_address,
is_server=False,
num_client=self.parallel_config.tensor_parallel_degree,
num_client=self.parallel_config.tensor_parallel_size,
client_id=self.parallel_config.tensor_parallel_rank,
local_data_parallel_id=self.parallel_config.expert_parallel_rank)
@@ -139,8 +137,8 @@ class PaddleDisWorkerProc():
# init worker_ready_signal
max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
array_size = min(
max_chips_per_node, self.parallel_config.tensor_parallel_degree *
self.parallel_config.expert_parallel_degree)
max_chips_per_node, self.parallel_config.tensor_parallel_size *
self.parallel_config.expert_parallel_size)
workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_ready_signal = IPCSignal(
name="worker_ready_signal",
@@ -173,7 +171,7 @@ class PaddleDisWorkerProc():
# init exist_task_signal
workers_exist_task = np.zeros(
[self.parallel_config.expert_parallel_degree], dtype=np.int32)
[self.parallel_config.expert_parallel_size], dtype=np.int32)
self.exist_task_signal = IPCSignal(
name="exist_task_signal",
array=workers_exist_task,
@@ -183,7 +181,7 @@ class PaddleDisWorkerProc():
# init exist_swapped_task_signal
workers_swapped_task = np.zeros(
shape=[self.parallel_config.expert_parallel_degree],
shape=[self.parallel_config.expert_parallel_size],
dtype=np.int32)
self.exist_swapped_task_signal = IPCSignal(
name="exist_swapped_task_signal",
@@ -231,8 +229,8 @@ class PaddleDisWorkerProc():
TODO(gongshaotian): support remote calling of functions that control worker.
"""
# Currently, only support single node
self.nnode = int((self.parallel_config.tensor_parallel_degree + 7) // 8)
mp_num_per_node = self.parallel_config.tensor_parallel_degree // self.nnode
self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8)
mp_num_per_node = self.parallel_config.tensor_parallel_size // self.nnode
req_ids = []
while True:
if self.local_rank == 0:
@@ -241,7 +239,7 @@ class PaddleDisWorkerProc():
else:
self.exist_task_signal.value[0] = 0
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
# Synchronize before updating weights
paddle.distributed.barrier()
@@ -259,7 +257,7 @@ class PaddleDisWorkerProc():
self.fd_config.parallel_config.
expert_parallel_rank] = 1
if self.parallel_config.tensor_parallel_degree > 1:
if self.parallel_config.tensor_parallel_size > 1:
# Synchronize the signal for other workers
# TODO(@wufeisheng): Split TP group and EP group
paddle.distributed.barrier()
@@ -479,8 +477,8 @@ def parse_args():
)
parser.add_argument(
"--speculative_benchmark_mode",
default="false",
type=str,
default=False,
type=bool,
)
parser.add_argument("--max_num_batched_tokens",
type=int,
@@ -559,7 +557,7 @@ def parse_args():
return args
def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
"""Initialize FDConfig from either RolloutModelConfig or argparse.Namespace
Args:
@@ -568,196 +566,37 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) ->
Returns:
FDConfig: Initialized FastDeploy configuration object
"""
# Get model config from model directory
model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path)
# Handle MoE related configs
if 'num_experts' in model_config_dict:
model_config_dict['moe_num_experts'] = model_config_dict.pop('num_experts')
if 'num_experts_per_tok' in model_config_dict:
model_config_dict['moe_topk'] = model_config_dict.pop('num_experts_per_tok')
# Set default values for model config
model_config_dict["head_dim"] = model_config_dict.get(
"head_dim", model_config_dict["hidden_size"] // model_config_dict["num_attention_heads"])
model_config_dict["rope_theta"] = model_config_dict.get("rope_theta", 10000.0)
# Create model config object
model_config = ModelConfig.from_dict(model_config_dict)
model_config.head_dim = model_config_dict["head_dim"]
paddle.set_default_dtype(config_or_args.dtype)
if 'tie_word_embeddings' in model_config_dict:
model_config.tie_word_embeddings = model_config_dict['tie_word_embeddings']
# Initialize all config components
device_config = DeviceConfig()
decoding_config = DecodingConfig()
speculative_config = SpeculativeConfig()
parallel_config = ParallelConfig()
load_config = LoadConfig()
moe_config = MoEConfig()
# Handle graph optimization config (check for attribute existence for backward compatibility)
enable_static_graph_inference = getattr(config_or_args, 'enable_static_graph_inference', False)
use_cudagraph = getattr(config_or_args, 'use_cudagraph', False)
max_capture_batch_size = getattr(config_or_args, 'max_capture_batch_size', 0)
paddle.set_default_dtype(args.dtype)
model_config = ModelConfig(vars(args))
device_config = DeviceConfig(vars(args))
decoding_config = DecodingConfig(vars(args))
speculative_config = SpeculativeConfig(vars(args))
parallel_config = ParallelConfig(vars(args))
load_config = LoadConfig(vars(args))
graph_opt_config = GraphOptimizationConfig(
enable_static_graph_inference,
use_cudagraph,
max_capture_batch_size
)
args.enable_static_graph_inference,
args.max_capture_batch_size,
vars(args))
# Handle quantization (check for attribute existence)
model_config.quantization = getattr(config_or_args, 'quantization', None)
# Note(tangbinhan): used for load_checkpoint
model_config.pretrained_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank
model_config.pretrained_config.tensor_parallel_degree = parallel_config.tensor_parallel_size
model_config.pretrained_config.is_mtp = False
model_config.pretrained_config.head_dim = model_config.head_dim
# Update speculative config_or_args
speculative_config.method = getattr(config_or_args, 'speculative_method', None)
speculative_config.num_speculative_tokens = getattr(config_or_args, 'speculative_max_draft_token_num', 0)
speculative_config.model_name_or_path = getattr(config_or_args, 'speculative_model_name_or_path', None)
speculative_config.quantization = getattr(config_or_args, 'speculative_model_quantization', None)
speculative_config.benchmark_mode = (
getattr(config_or_args, "speculative_benchmark_mode", "false").lower() == "true"
)
# Update parallel config
parallel_config.engine_pid = getattr(config_or_args, 'engine_pid', None)
parallel_config.model_name_or_path = config_or_args.model_name_or_path
parallel_config.max_num_seqs = getattr(config_or_args, 'max_num_seqs', 0)
parallel_config.max_block_num = getattr(config_or_args, 'total_block_num', 0)
parallel_config.block_size = getattr(config_or_args, 'block_size', 64)
parallel_config.pod_ip = getattr(config_or_args, 'pod_ip', None)
parallel_config.engine_worker_queue_port = getattr(config_or_args, 'engine_worker_queue_port', 0)
parallel_config.max_model_len = getattr(config_or_args, 'max_model_len', 0)
model_config.max_seq_len = getattr(config_or_args, 'max_model_len', 0)
model_config.max_length = getattr(config_or_args, 'max_model_len', 0)
parallel_config.device_ids = getattr(config_or_args, 'device_ids', [])
parallel_config.dtype = config_or_args.dtype
parallel_config.enc_dec_block_num = getattr(config_or_args, 'enc_dec_block_num', 0)
parallel_config.kv_cache_ratio = getattr(config_or_args, 'kv_cache_ratio', 1.0)
parallel_config.first_token_id = getattr(config_or_args, 'first_token_id', None)
parallel_config.gpu_memory_utilization = getattr(config_or_args, 'gpu_memory_utilization', 0.9)
parallel_config.do_profile = getattr(config_or_args, 'do_profile', False)
parallel_config.dynamic_load_weight = getattr(config_or_args, 'dynamic_load_weight', False)
parallel_config.pad_token_id = getattr(config_or_args, 'pad_token_id', None)
parallel_config.eos_tokens_lens = getattr(config_or_args, 'eos_tokens_lens', 0)
parallel_config.enable_chunked_prefill = getattr(config_or_args, 'enable_chunked_prefill', False)
parallel_config.max_num_batched_tokens = getattr(config_or_args, 'max_num_batched_tokens', 0)
parallel_config.enable_prefix_caching = getattr(config_or_args, 'enable_prefix_caching', False)
parallel_config.enable_custom_all_reduce = getattr(config_or_args, 'enable_custom_all_reduce', False)
parallel_config.use_ep = getattr(config_or_args, 'enable_expert_parallell', False)
parallel_config.tensor_parallel_degree = getattr(config_or_args, 'tensor_parallel_size', 1)
parallel_config.expert_parallel_degree = getattr(config_or_args, 'expert_parallel_size', 1)
parallel_config.splitwise_role = getattr(config_or_args, 'splitwise_role', None)
parallel_config.guided_decoding_backend = getattr(config_or_args, 'guided_decoding_backend', None)
parallel_config.disable_any_whitespace = getattr(config_or_args, 'disable_any_whitespace', False)
# Log parallel config info
logger.info(f"parallel_config.use_ep {parallel_config.use_ep}")
logger.info(f"parallel_config.tensor_parallel_degree {parallel_config.tensor_parallel_degree}")
logger.info(f"splitwise_role {parallel_config.splitwise_role}")
logger.info(
f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}"
)
logger.info(
f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}"
)
# Set MoE phase based on splitwise role
if parallel_config.splitwise_role == "mixed":
parallel_config.moe_phase = MoEPhase.PREFILL
elif parallel_config.splitwise_role == "prefill":
parallel_config.moe_phase = MoEPhase.PREFILL
elif parallel_config.splitwise_role == "decode":
parallel_config.moe_phase = MoEPhase.DECODER
elif parallel_config.splitwise_role is not None:
raise NotImplementedError
if getattr(model_config, 'num_hidden_layers', None) is None:
raise ValueError("num_hidden_layers is None")
# Handle model architecture specific configurations
num_key_value_heads = model_config_dict.get("num_key_value_heads", -1)
if num_key_value_heads is None:
num_key_value_heads = -1
# Calculate FFN hidden size
if model_config_dict.get("ffn_hidden_size", None) is not None:
ffn_hidden_size = model_config_dict["ffn_hidden_size"]
elif model_config_dict.get("intermediate_size", None) is not None:
ffn_hidden_size = model_config_dict["intermediate_size"]
else:
ffn_hidden_size = 4 * model_config_dict["hidden_size"]
if model_config_dict["hidden_act"].lower() == "swiglu":
if paddle.distributed.get_world_size() > 1:
multiple_of = 8 * model_config_dict["num_attention_heads"]
else:
multiple_of = 4 * model_config_dict["num_attention_heads"]
ffn_hidden_size = multiple_of * (
(int(2 * ffn_hidden_size / 3) + multiple_of - 1) //
multiple_of)
# Get number of layers
num_layers = model_config_dict.get("num_layers", None) or model_config_dict.get(
"num_hidden_layers", None)
if num_layers is None:
raise ValueError(f"num_layers<{num_layers}> is invalid")
if "moe_layer_start_index" in model_config_dict:
moe_layer_start_index = model_config_dict["moe_layer_start_index"]
use_moe = (
isinstance(moe_layer_start_index, int)
and moe_layer_start_index < num_layers
) or (
isinstance(moe_layer_start_index, list)
and min(moe_layer_start_index) < num_layers
)
else:
use_moe = False
# Update model config
model_config.ffn_hidden_size = ffn_hidden_size
model_config.num_layers = num_layers
model_config.num_key_value_heads = num_key_value_heads
model_config.start_layer_index = model_config_dict.get("start_layer_index", 0)
# Update MoE config
moe_config.num_experts = model_config_dict.get("moe_num_experts", None)
moe_config.moe_intermediate_size = model_config_dict.get("moe_intermediate_size", None)
moe_config.top_k = model_config_dict.get("moe_k", model_config_dict.get("moe_topk", 8))
moe_config.moe_num_shared_experts = model_config_dict.get("moe_num_shared_experts", 0)
moe_config.moe_layer_start_index = model_config_dict.get("moe_layer_start_index", 0)
moe_config.num_max_dispatch_tokens_per_rank = model_config_dict.get(
"num_max_dispatch_tokens_per_rank", 256)
moe_config.moe_use_aux_free = model_config_dict.get("moe_use_aux_free", False)
# Handle vocabulary size
model_config.ori_vocab_size = model_config_dict.get("vocab_size", -1)
archs = model_config_dict.get("architectures", [])
if "Ernie4_5_ForCausalLM" in archs or "Ernie4_5_MoeForCausalLM" in archs:
model_config.ori_vocab_size = getattr(config_or_args, 'ori_vocab_size', model_config.ori_vocab_size)
# Handle DeepseekV3 specific config
if "DeepseekV3ForCausalLM" in model_config_dict.get("architectures", []):
from paddleformers.transformers import AutoConfig
model_config.deepseekv3 = AutoConfig.from_pretrained(
config_or_args.model_name_or_path)
assert parallel_config.tensor_parallel_degree * parallel_config.expert_parallel_degree == ranks
parallel_config.tensor_parallel_rank = \
local_rank % parallel_config.tensor_parallel_degree
parallel_config.expert_parallel_rank = \
int(local_rank / parallel_config.tensor_parallel_degree)
if parallel_config.use_ep:
moe_config.num_experts_per_rank = \
moe_config.num_experts // parallel_config.expert_parallel_degree
moe_config.num_experts_start_offset = \
parallel_config.expert_parallel_rank * moe_config.num_experts_per_rank
# For auto TP split
model_config.tensor_parallel_degree = parallel_config.tensor_parallel_degree
model_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank
model_config.use_ep = parallel_config.use_ep
if parallel_config.use_ep:
model_config.num_experts_per_rank = moe_config.num_experts_per_rank
model_config.num_experts_start_offset = moe_config.num_experts_start_offset
# Handle quantization config
quantization_config = model_config_dict.get("quantization_config", None)
quantization_config = model_config.quantization_config
if not model_config.is_quantized:
if quantization_config is not None:
if "kv_cache_quant_type" not in quantization_config:
@@ -772,16 +611,15 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) ->
if quantization_config is not None:
quant_config_name = quantization_config["quantization"]
elif getattr(config_or_args, 'quantization', None) != "None":
elif args.quantization != "None":
quantization_config = {}
quant_config_name = getattr(config_or_args, 'quantization', None)
quant_config_name = args.quantization
quantization_config["quantization"] = quant_config_name
# Special handling for Ernie models
is_ernie = "Ernie4_5_ForCausalLM" in model_config_dict.get("architectures", []) or \
"Ernie4_5_MoeForCausalLM" in model_config_dict.get("architectures", []) or \
"Ernie4_5_VLMoeForConditionalGeneration" in model_config_dict.get(
"architectures", [])
if use_moe and quant_config_name == "wint4" and is_ernie:
is_ernie = "Ernie4_5_ForCausalLM" in model_config.architectures or \
"Ernie4_5_MoeForCausalLM" in model_config.architectures or \
"Ernie4_5_VLMoeForConditionalGeneration" in model_config.architectures
if quant_config_name == "wint4" and is_ernie:
quantization_config["dense_quant_type"] = "wint8"
quantization_config["moe_quant_type"] = "wint4"
quantization_config["quantization"] = "mix_quant"
@@ -806,38 +644,23 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) ->
logger.info(
"Model Status: Original (will apply online quantization)")
logger.info(f"Quantization Method: {getattr(config_or_args, 'quantization', 'None')}")
logger.info(f"{quantization_config}")
else:
logger.info(
"No quantization config found and use original weight and act dtype."
)
model_config.enable_logprob = config_or_args.enable_logprob
model_config.architectures = model_config_dict.get("architectures")
# Update load config
logger.info("===========load_config==============")
# Handle load config (check for environment variable)
load_config.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
load_config.dynamic_load_weight = getattr(config_or_args, 'dynamic_load_weight', False)
load_config.load_strategy = getattr(config_or_args, 'load_strategy', None)
logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
logger.info(f"- Load strategy: {load_config.load_strategy}")
logger.info(f"- Use fastsafetensor: {load_config.use_fastsafetensor}")
# Create and return FDConfig
fd_config = FDConfig(
model_config=model_config,
parallel_config=parallel_config,
speculative_config=speculative_config,
device_config=device_config,
load_config=load_config,
moe_config=moe_config,
decoding_config=decoding_config,
quant_config=quant_config,
graph_opt_config=graph_opt_config
)
fd_config = FDConfig(model_config=model_config,
parallel_config=parallel_config,
speculative_config=speculative_config,
device_config=device_config,
load_config=load_config,
decoding_config=decoding_config,
quant_config=quant_config,
graph_opt_config=graph_opt_config)
return fd_config

View File

@@ -314,7 +314,7 @@ class XPUModelRunner(ModelRunnerBase):
"min_tokens", 1)
self.share_inputs["max_dec_len"][idx:idx + 1] = request.get(
"max_tokens", self.model_config.max_length)
"max_tokens", self.model_config.max_model_len)
self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["first_token_ids"][
@@ -387,11 +387,11 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["min_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.max_length, dtype='int64')
[max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["min_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.max_length, dtype='int64')
[max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs,
0,
dtype='int32')
@@ -574,7 +574,7 @@ class XPUModelRunner(ModelRunnerBase):
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
max_num_blocks=max_block_num)
for i in range(self.model_config.num_layers):
for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
fill_value=0,
@@ -597,10 +597,10 @@ class XPUModelRunner(ModelRunnerBase):
assert len(self.attn_backends) == 0
# TODO(gongshaotian): Get rank from config
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = int(
self.model_config.num_key_value_heads
) // self.parallel_config.tensor_parallel_degree
) // self.parallel_config.tensor_parallel_size
head_dim = self.model_config.head_dim
# Get the attention backend
@@ -803,7 +803,7 @@ class XPUModelRunner(ModelRunnerBase):
required_memory = (
byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) *
self.model_config.num_layers)
self.model_config.num_hidden_layers)
return required_memory
def update_share_input_block_num(self, num_gpu_blocks: int) -> None: