Simplify the Config code (#2770)

* simplify the code

* fix vl

* delete config

* fix

* perfect code

* fix ci

* fix xpu

* fix xpu

* fix server

* resolve conflict

* fix mtp

* resolve conflict

* fix xpu

* fix xpu

* fix vl

* fix log

* fix qwen moe

* fix qwen moe

* fix qwen moe
YuanRisheng authored 2025-07-14 19:50:05 +08:00 · committed by GitHub
parent 2e81792d64 · commit 4c7b8bc458
34 changed files with 551 additions and 911 deletions


@@ -21,14 +21,15 @@ from enum import Enum
 from typing import Literal, Optional, Union
 from paddleformers.transformers.configuration_utils import PretrainedConfig
+from paddleformers.trl import llm_utils
+from fastdeploy import envs
 from fastdeploy.model_executor.layers.quantization.quant_base import \
     QuantConfigBase
 from fastdeploy.utils import get_logger
 logger = get_logger("config", "config.log")
 class MoEPhase(Enum):
     """
     The generation phase of the moe.
@@ -37,274 +38,228 @@ class MoEPhase(Enum):
     PREFILL = 1
     DECODER = 2
+PRETRAINED_INIT_CONFIGURATION = {
+    "rope_theta": 10000.0,
+    "num_key_value_heads":-1,
+    "start_layer_index": 0,
+    "moe_num_shared_experts":0,
+    "moe_layer_start_index": 0,
+    "num_max_dispatch_tokens_per_rank":256,
+    "moe_use_aux_free":False,
+    "vocab_size": -1,
+    "use_rope": True,
+    "hidden_dropout_prob":0.0,
+    "initializer_range":0.02,
+    "max_position_embeddings":512,
+    "quantization_config":None,
+    "use_recompute_resampler":False,
+    "use_temporal_conv":True,
+    "resampler_fuse_rms_norm":False,
+    "freq_allocation":20,
+    "tie_word_embeddings":False,
+    "rms_norm_eps":1e-5,
+}
-class ModelConfig(PretrainedConfig):
+class ModelConfig:
     """
     The configuration class to store the configuration of a `LLM`.
     """
-    max_stop_seqs_num = 5
-    stop_seqs_max_len = 8
-    architectures: list[str] = []
-    # NOTE(gongshaotain): form _load_model_init_val()
-    top_p = 0.0
-    temperature = 1.0
-    rope_theta = 10000.0
-    penalty_score = 1.0
-    frequency_score = 0.0
-    presence_score = 0.0
-    min_length = 1
     def __init__(
         self,
-        vocab_size: int = 100224,
-        hidden_size: int = 4096,
-        num_layers: int = 48,
-        num_attention_heads: int = 32,
-        num_key_value_heads: Optional[int] = None,
-        hidden_act: str = "swiglu",
-        hidden_dropout_prob: float = 0.0,
-        max_position_embeddings: int = 512,
-        max_seq_len: int = 512,
-        initializer_range: float = 0.02,
-        use_rope=True,
-        rope_theta: int = 10000,
-        rope_3d: bool = False,
-        ori_vocab_size: int | None = None,
-        moe_layer_start_index: Union[int, list[int], None] = None,
-        moe_num_experts: Union[int, list[int], None] = None,
-        moe_layer_end_index: Union[int, list[int], None] = None,
-        moe_num_shared_experts: int | None = None,
-        num_hidden_layers: int | None = None,
-        prefix_name="",
-        freeze_embedding=False,
-        rope_head_dim=None,
-        ffn_hidden_size: Optional[int] = None,
-        dtype="bfloat16",
-        start_layer_index: int = 0,
-        head_dim: Optional[int] = None,
-        tie_word_embeddings: bool = False,
-        is_quantized: bool = False,
-        rms_norm_eps: float = 1e-5,
-        **kwargs,
+        args,
     ):
-        super().__init__(**kwargs)
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_layers = num_layers
-        if num_hidden_layers is not None:
-            self.num_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.num_key_value_heads = num_key_value_heads
-        if head_dim is None:
+        self.max_stop_seqs_num = 5
+        self.stop_seqs_max_len = 8
+        # NOTE(gongshaotain): form _load_model_init_val()
+        self.top_p = 0.0
+        self.temperature = 1.0
+        self.rope_theta = 10000.0
+        self.penalty_score = 1.0
+        self.frequency_score = 0.0
+        self.presence_score = 0.0
+        self.min_length = 1
+        self.model_name_or_path = ""
+        self.im_patch_id = (
+            100295  # multimodality, TODO(liuyuanle): read from config.json
+        )
+        self.is_quantized = False
+        self.max_model_len = 0
+        self.dtype = ""
+        self.enable_logprob = False
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+        pretrained_config, _ = PretrainedConfig.get_config_dict(self.model_name_or_path)
+        self.pretrained_config = PretrainedConfig.from_dict(pretrained_config)
+        # set attribute from pretrained_config
+        for key, value in pretrained_config.items():
+            setattr(self, key, value)
+        # we need set default value when not exist
+        for key, value in PRETRAINED_INIT_CONFIGURATION.items():
+            if not hasattr(self, key):
+                setattr(self, key, value)
+        if not hasattr(self, "head_dim"):
             self.head_dim = self.hidden_size // self.num_attention_heads
-        else:
-            self.head_dim = head_dim
-        self.hidden_act = hidden_act
-        self.hidden_dropout_prob = hidden_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.initializer_range = initializer_range
-        self.use_rope = use_rope
-        self.rope_theta = rope_theta
-        self.ori_vocab_size = ori_vocab_size or vocab_size
-        self.max_seq_len = max_seq_len
-        self.prefix_name = prefix_name
-        self.freeze_embedding = freeze_embedding
-        self.rope_head_dim = rope_head_dim
-        self.moe_layer_start_index = moe_layer_start_index
-        self.moe_num_experts = moe_num_experts
-        self.moe_num_shared_experts = moe_num_shared_experts
-        self.moe_layer_end_index = moe_layer_end_index
-        self.ffn_hidden_size = ffn_hidden_size
-        self.rope_3d = rope_3d
-        self.start_layer_index = start_layer_index
-        self.dtype = dtype
-        self.tie_word_embeddings = tie_word_embeddings
-        self.is_quantized = is_quantized
-        self.rms_norm_eps = rms_norm_eps
+        if hasattr(self, "vision_config"):
+            self.vision_config = PretrainedConfig.from_dict(self.vision_config)
+        self.ori_vocab_size = self.vocab_size
+        if "Ernie4_5_ForCausalLM" in self.architectures or "Ernie4_5_MoeForCausalLM" in self.architectures:
+            self.ori_vocab_size = args["ori_vocab_size"]
-@dataclass
-class MoEConfig:
-    """
-    Configuration for MoE.
-    """
-    num_experts: Union[int, list[int], None] = None
-    top_k: int = 8
-    moe_intermediate_size: int = -1
-    num_experts_per_rank: int = -1
-    num_experts_start_offset: int = -1
-    moe_num_shared_experts = (0, )
-    moe_layer_start_index: Union[int, list[int], None] = None
-    moe_layer_end_index: Union[int, list[int], None] = None
-    moe_use_aux_free: bool = False
-    num_max_dispatch_tokens_per_rank = 256
-    im_patch_id = (
-        100295  # multimodality, TODO(liuyuanle): read from config.json
-    )
-@dataclass
 class ParallelConfig:
     """Configuration for the distributed execution."""
-    block_size = 16  # The block size for processing.
-    sequence_parallel = False  # Whether to enable sequence parallelism.
-    use_ep = False  # Whether to enable Expert Parallelism
-    moe_phase = MoEPhase.PREFILL  # Generation phase
-    msg_queue_id = 1  # mesage queue id
-    tensor_parallel_rank = None  # TP rank ID
-    tensor_parallel_degree = None  # TP degree
-    expert_parallel_rank = None  # EP rank ID
-    expert_parallel_degree = None  # EP degree
-    # The embedding weight distributed on your gpu cards is divided by row or column.
-    # Defaults to False means divide by row. When vocab_size can not be divided by world_size
-    # but hidden_size can, we can consider split embedding weight by column.
-    """
-    From old wersion worker args
-    TODO(gongshaotian): Reclassify
-    """
-    model_name_or_path: str = "./output"
-    max_num_seqs: int = 34
-    # Set default block num for profile run
-    max_block_num: int = 2000
-    # block size
-    block_size: int = 64
-    # Engine worker queue port
-    engine_worker_queue_port: int = 9923
-    # Max model len
-    max_model_len: int = 3072  # max_seq_len
-    # cuda visible devices
-    device_ids: str = "0"
-    # Input dtype
-    dtype: str = "bfloat16"
-    # Encoder's decoder num
-    enc_dec_block_num: int = 1
-    # KV cache ratio for input
-    kv_cache_ratio: float = 0.7
-    # First token id
-    first_token_id: int = 1
-    # Gpu memory utilization
-    gpu_memory_utilization: float = 0.9
-    # Process ID of engine
-    engine_pid: Optional[int] = None
-    # Do profile or not
-    do_profile: bool = False
-    #
-    pad_token_id: int = -1
-    #
-    eos_tokens_lens: int = 2
-    # Enable chunked prefill
-    enable_chunked_prefill: str = "store_true"
-    max_num_batched_tokens: int = 2048
-    # enable prefix cache
-    enable_prefix_caching = None
-    # splitwise role
-    splitwise_role: str = "mixed"
-    # guided decoding backend
-    guided_decoding_backend: str = None
-    # disable any whitespace for guided decoding
-    disable_any_whitespace: bool = True
-    # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
-    enable_custom_all_reduce: str = "store_true"
+    def __init__(
+        self,
+        args,
+    ):
+        self.sequence_parallel = False  # Whether to enable sequence parallelism.
+        self.use_ep = False  # Whether to enable Expert Parallelism
+        self.moe_phase = MoEPhase.PREFILL  # Generation phase
+        self.msg_queue_id = 1  # mesage queue id
+        tensor_parallel_rank, tensor_parallel_size = llm_utils.init_dist_env()
+        self.tensor_parallel_rank = tensor_parallel_rank  # TP rank ID
+        self.tensor_parallel_size = tensor_parallel_size  # TP degree
+        self.expert_parallel_rank = int(tensor_parallel_rank / tensor_parallel_size)  # EP rank ID
+        self.expert_parallel_size = 1  # EP degree
+        # The embedding weight distributed on your gpu cards is divided by row or column.
+        # Defaults to False means divide by row. When vocab_size can not be divided by world_size
+        # but hidden_size can, we can consider split embedding weight by column.
+        """
+        From old wersion worker args
+        TODO(gongshaotian): Reclassify
+        """
+        self.model_name_or_path: str = "./output"
+        self.max_num_seqs: int = 34
+        # Set default block num for profile run
+        self.max_block_num: int = 2000
+        # block size
+        self.block_size: int = 64
+        # Engine worker queue port
+        self.engine_worker_queue_port: int = 9923
+        # Max model len
+        self.max_model_len: int = 3072  # max_seq_len
+        # cuda visible devices
+        self.device_ids: str = "0"
+        # Input dtype
+        self.dtype: str = "bfloat16"
+        # Encoder's decoder num
+        self.enc_dec_block_num: int = 1
+        # KV cache ratio for input
+        self.kv_cache_ratio: float = 0.7
+        # First token id
+        self.first_token_id: int = 1
+        # Gpu memory utilization
+        self.gpu_memory_utilization: float = 0.9
+        # Process ID of engine
+        self.engine_pid: Optional[int] = None
+        # Do profile or not
+        self.do_profile: bool = False
+        #
+        self.pad_token_id: int = -1
+        #
+        self.eos_tokens_lens: int = 2
+        # Enable chunked prefill
+        self.enable_chunked_prefill: bool = False
+        self.max_num_batched_tokens: int = 2048
+        # enable prefix cache
+        self.enable_prefix_caching = None
+        # splitwise role
+        self.splitwise_role: str = "mixed"
+        # guided decoding backend
+        self.guided_decoding_backend: str = None
+        # disable any whitespace for guided decoding
+        self.disable_any_whitespace: bool = True
+        self.pod_ip: str = None
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
+        self.use_ep = args["expert_parallel_size"] > 1
+        if self.splitwise_role == "mixed":
+            self.moe_phase = MoEPhase.PREFILL
+        elif self.splitwise_role == "prefill":
+            self.moe_phase = MoEPhase.PREFILL
+        elif self.splitwise_role == "decode":
+            self.moe_phase = MoEPhase.DECODER
+        else:
+            raise NotImplementedError
+        # enable the custom all-reduce kernel and fall back to NCCL(dist.all_reduce).
+        self.enable_custom_all_reduce: bool = False
-@dataclass
 class SpeculativeConfig:
     """
     Configuration for speculative decoding.
     """
-    # speculative method, choose in [None, "ngram_match", "mtp"]
-    method: Optional[str] = None
-    # the max length of speculative tokens
-    num_speculative_tokens: int = 1
-    # the max length of candidate tokens for speculative method
-    max_candidate_len: int = 5
-    # the max length of verify window for speculative method
-    verify_window: int = 2
-    # ngram match
-    max_ngram_size: int = 5
-    # model for mtp/eagle/draft_model
-    model_name_or_path: Optional[str] = None
-    # quantization of model
-    quantization: Optional[str] = None
-    # allocate more blocks to prevent mtp from finishing the block earlier than the main model
-    # Fixed now
-    num_gpu_block_expand_ratio: Optional[float] = 1
-    # To distinguish the main model and draft model(mtp/eagle/draftmodel)
-    # ["main", "mtp"]
-    model_type: Optional[str] = "main"
-    # TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers.
-    # A trick method is currently used to enable this sharing.
-    # This will be replaced with a more standardized solution in the future.
-    sharing_model = None
-    # During benchmarking, we need to enforce that the number of accepted tokens is 1.
-    # This means no tokens from MTP are accepted.
-    # This ensures that the specified simulation acceptance rate is not affected.
-    benchmark_mode: bool = False
+    def __init__(
+        self,
+        args,
+    ):
+        # speculative method, choose in [None, "ngram_match", "mtp"]
+        self.method: Optional[str] = None
+        # the max length of speculative tokens
+        self.num_speculative_tokens: int = 1
+        # the max length of candidate tokens for speculative method
+        self.max_candidate_len: int = 5
+        # the max length of verify window for speculative method
+        self.verify_window: int = 2
+        # ngram match
+        self.max_ngram_size: int = 5
+        # model for mtp/eagle/draft_model
+        self.model_name_or_path: Optional[str] = None
+        # quantization of model
+        self.quantization: Optional[str] = None
+        # allocate more blocks to prevent mtp from finishing the block earlier than the main model
+        # Fixed now
+        self.num_gpu_block_expand_ratio: Optional[float] = 1
+        # To distinguish the main model and draft model(mtp/eagle/draftmodel)
+        # ["main", "mtp"]
+        self.model_type: Optional[str] = "main"
+        # TODO(liuzichang): To reduce memory usage, MTP shares the main model's lm_head and embedding layers.
+        # A trick method is currently used to enable this sharing.
+        # This will be replaced with a more standardized solution in the future.
+        self.sharing_model = None
+        # During benchmarking, we need to enforce that the number of accepted tokens is 1.
+        # This means no tokens from MTP are accepted.
+        # This ensures that the specified simulation acceptance rate is not affected.
+        self.benchmark_mode: bool = False
+        #TODO(YuanRisheng): The name of the server args is different from the name of the SpeculativeConfig.
+        #We temperately add the name map here and will delete it in future.
+        name_map = {"speculative_method": "method",
+                    "speculative_max_draft_token_num": "num_speculative_tokens",
+                    "speculative_model_name_or_path": "model_name_or_path",
+                    "speculative_model_quantization": "quantization",
+                    "speculative_benchmark_mode": "benchmark_mode"}
+        for key, value in args.items():
+            if key in name_map.keys() and hasattr(self, name_map[key]):
+                setattr(self, name_map[key], value)
-@dataclass
 class DeviceConfig:
     """
     Configuration for device settings.
     """
-    device_type = "cuda"
+    def __init__(
+        self,
+        args,
+    ):
+        self.device_type = "cuda"
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
 class GraphOptimizationConfig:
"""The Top-level graph optimization contral corresponds to different backends.
- 0: dyncmic graph
- 1: static graph
- 2: static graph + cinn compilation backend
"""
graph_opt_level: int = 0
# CUDA Graph Config
""" Whether to use cudagraph.
- False: cudagraph is not used.
- True: cudagraph is used.
It requires that all input buffers have fixed addresses, and all
splitting ops write their outputs to input buffers.
- With dyncmic graph backend: ...
- With static grpah backend: WIP
"""
use_cudagraph: bool = False
"""Sizes to capture cudagraph.
- None (default): capture sizes are inferred from llm config.
- list[int]: capture sizes are specified as given."""
cudagraph_capture_sizes: Optional[list[int]] = None
""" Number of warmup runs for cudagraph. """
cudagraph_num_of_warmups: int = 2
"""Whether to copy input tensors for cudagraph.
If the caller can guarantee that the same input buffers
are always used, it can set this to False. Otherwise, it should
set this to True."""
cudagraph_copy_inputs: bool = False
""" In static graph, this is an operation list that does not need to be captured by the CUDA graph.
CudaGraphBackend will split these operations from the static graph.
Example usage:
cudagraph_splitting_ops = ["paddle.unified_attention"]
Note: If want to use subgraph capture functionality in a dynamic graph,
can manually split the model into multiple layers and apply the @support_cuda_graph decorator
only to the layer where CUDA graph functionality is required.
"""
cudagraph_splitting_ops = Optional[list[str]]
""""whether to use a full cuda graph for the entire forward pass rather than
splitting certain operations such as attention into subgraphs.
Thus this flag cannot be used together with splitting_ops."""
full_cuda_graph: bool = False
max_capture_size: int = field(default=None, init=False) # type: ignore
batch_size_to_captured_size: dict[int,
int] = field(default=None,
init=False) # type: ignore
# CINN Config ...
def init_with_cudagrpah_size(self, def init_with_cudagrpah_size(self,
cudagraph_capture_sizes: list[int]) -> None: cudagraph_capture_sizes: list[int]) -> None:
"""To complete the initialization of config, """To complete the initialization of config,
@@ -338,18 +293,67 @@ class GraphOptimizationConfig:
     def __init__(self,
                  enable_static_graph_inference: bool = False,
-                 use_cudagraph: bool = False,
-                 max_capture_batch_size: int = 64):
-        """ """
+                 max_capture_batch_size: int = 64,
+                 args = None):
+        """The Top-level graph optimization contral corresponds to different backends.
+        - 0: dyncmic graph
+        - 1: static graph
+        - 2: static graph + cinn compilation backend
+        """
+        self.graph_opt_level: int = 0
+        # CUDA Graph Config
+        """ Whether to use cudagraph.
+        - False: cudagraph is not used.
+        - True: cudagraph is used.
+        It requires that all input buffers have fixed addresses, and all
+        splitting ops write their outputs to input buffers.
+        - With dyncmic graph backend: ...
+        - With static grpah backend: WIP
+        """
+        self.use_cudagraph: bool = False
+        """Sizes to capture cudagraph.
+        - None (default): capture sizes are inferred from llm config.
+        - list[int]: capture sizes are specified as given."""
+        self.cudagraph_capture_sizes: Optional[list[int]] = None
+        """ Number of warmup runs for cudagraph. """
+        self.cudagraph_num_of_warmups: int = 2
+        """Whether to copy input tensors for cudagraph.
+        If the caller can guarantee that the same input buffers
+        are always used, it can set this to False. Otherwise, it should
+        set this to True."""
+        self.cudagraph_copy_inputs: bool = False
+        """ In static graph, this is an operation list that does not need to be captured by the CUDA graph.
+        CudaGraphBackend will split these operations from the static graph.
+        Example usage:
+        cudagraph_splitting_ops = ["paddle.unified_attention"]
+        Note: If want to use subgraph capture functionality in a dynamic graph,
+        can manually split the model into multiple layers and apply the @support_cuda_graph decorator
+        only to the layer where CUDA graph functionality is required.
+        """
+        self.cudagraph_splitting_ops = Optional[list[str]]
+        """"whether to use a full cuda graph for the entire forward pass rather than
+        splitting certain operations such as attention into subgraphs.
+        Thus this flag cannot be used together with splitting_ops."""
+        self.full_cuda_graph: bool = False
+        self.max_capture_size: int = field(default=None, init=False)  # type: ignore
+        self.batch_size_to_captured_size: dict[int,
+                                               int] = field(default=None,
+                                                            init=False)  # type: ignore
+        # CINN Config ...
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
         capture_size = [i for i in range(1, max_capture_batch_size + 1)]
         self.init_with_cudagrpah_size(cudagraph_capture_sizes=capture_size)
-        self.use_cudagraph = use_cudagraph
         #TODO(wangmingkai02): change graph_opt_level=2 when using static mode with cinn
         if enable_static_graph_inference:
             self.graph_opt_level = 1
-@dataclass
 class LoadConfig:
     """
     Configuration for dynamic weight loading strategies
@@ -363,37 +367,39 @@ class LoadConfig:
         - 'meta': provide RL traing worker, no_weights_load
         - None: No dynamic loading
     """
-    use_fastsafetensor: bool = False
-    dynamic_load_weight: bool = False
-    load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None
+    def __init__(
+        self,
+        args,
+    ):
+        self.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
+        self.dynamic_load_weight: bool = False
+        self.load_strategy: Optional[Literal['ipc', 'ipc_no_reshard', 'ipc_snapshot', 'meta']] = None
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
-    def __post_init__(self):
-        if self.load_strategy is not None and not self.dynamic_load_weight:
-            raise ValueError("Load strategy requires dynamic_load_weight=True")
-        if self.dynamic_load_weight and self.load_strategy is None:
-            raise ValueError("Must specify load_strategy when dynamic_load_weight is True")
-@dataclass
 class LoRAConfig:
     """ LoRA Config """
     pass
-@dataclass
 class KVCacheConfig:
     """ KV Cache Config """
     cache_quant_dtype: str = "none"
-@dataclass
 class DecodingConfig:
     """
    Configuration for decoding
     """
-    pad_token_id = None
+    def __init__(
+        self,
+        args,
+    ):
+        self.pad_token_id = None
+        for key, value in args.items():
+            if hasattr(self, key):
+                setattr(self, key, value)
 @dataclass
 class FDConfig:
@@ -411,7 +417,6 @@ class FDConfig:
     load_config: LoadConfig = field(default=None, init=True)
     quant_config: Optional[QuantConfigBase] = None
     graph_opt_config: Optional[GraphOptimizationConfig] = None
-    moe_config: MoEConfig = field(default=None, init=True)  # type: ignore
     decoding_config: DecodingConfig = field(default=None,
                                             init=True)  # type: ignore
     kv_cache_config: KVCacheConfig = field(default=None,
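The hunks above replace the old keyword-argument constructors with classes that each take a single args dict and copy only the keys they already define (the hasattr/setattr loop), while ModelConfig pulls everything else from the model directory's config.json via PretrainedConfig.get_config_dict. A rough usage sketch, assuming an installed fastdeploy, an initialized distributed environment, and illustrative keys/paths that are not taken from this diff:

    from fastdeploy.config import ModelConfig, ParallelConfig, SpeculativeConfig

    args = {
        "model_name_or_path": "./output",   # hypothetical local dir containing config.json
        "max_model_len": 4096,
        "dtype": "bfloat16",
        "expert_parallel_size": 1,          # read directly by ParallelConfig to decide use_ep
        "splitwise_role": "mixed",          # picks the MoE phase (prefill/decode)
        "speculative_method": None,         # mapped to SpeculativeConfig.method through name_map
    }

    model_config = ModelConfig(args)             # known keys first, then config.json, then defaults
    parallel_config = ParallelConfig(args)       # also derives TP/EP ranks via llm_utils.init_dist_env()
    speculative_config = SpeculativeConfig(args) # server-arg names are translated through name_map

Unknown entries in the dict are silently ignored, so the same dict can be handed to every config class.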


@@ -95,7 +95,7 @@ class AppendAttentionBackend(AttentionBackend):
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
         self.head_dim: int = fd_config.model_config.head_dim
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         self.max_partition_size: int = int(
             os.getenv("FLAGS_max_partition_size", 32768))


@@ -67,10 +67,10 @@ class Attention(nn.Layer):
             ValueError: If the `v_head_dim` is less than 0.
         """
         super().__init__()
-        self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_degree
+        self.num_heads: int = fd_config.model_config.num_attention_heads // fd_config.parallel_config.tensor_parallel_size
         self.head_dim: int = fd_config.model_config.head_dim
         self.kv_num_heads: int = \
-            max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_degree)
+            max(1, fd_config.model_config.num_key_value_heads // fd_config.parallel_config.tensor_parallel_size)
         self.layer_id: int = layer_id
         self.v_head_dim: int = v_head_dim if v_head_dim > 0 else self.head_dim
         self.rope_type: str = rope_type


@@ -96,7 +96,7 @@ class FlashAttentionBackend(AttentionBackend):
         self.head_dim = fd_config.model_config.head_dim
         self.hidden_size = fd_config.model_config.hidden_size
         self.block_size = fd_config.parallel_config.block_size
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         self.speculative_method = fd_config.speculative_config.method
         self.use_speculate = self.speculative_method is not None


@@ -102,7 +102,7 @@ class IluvatarAttnBackend(AttentionBackend):
         self.head_dim = head_dim
         # note: scale need to change if using MLA
         self.attention_metadata.scale = 1.0 / sqrt(head_dim)
-        self.num_layers = llm_config.model_config.num_layers
+        self.num_layers = llm_config.model_config.num_hidden_layers
         self.record_block_table_metadata = {}
         self.only_use_flash_attn = int(
             os.getenv("FD_ILUVATAR_ONLY_USE_FLASH_ATTN", 0)) == 1


@@ -113,18 +113,18 @@ class MLAAttentionBackend(AttentionBackend):
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
         self.head_dim: int = fd_config.model_config.head_dim
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         # For Multi Head Latent Attention
-        self.kv_lora_rank: int = fd_config.model_config.deepseekv3.kv_lora_rank
+        self.kv_lora_rank: int = fd_config.model_config.kv_lora_rank
-        self.qk_rope_head_dim: int = fd_config.model_config.deepseekv3.qk_rope_head_dim
+        self.qk_rope_head_dim: int = fd_config.model_config.qk_rope_head_dim
-        self.qk_head_dim: int = fd_config.model_config.deepseekv3.qk_nope_head_dim \
-            + fd_config.model_config.deepseekv3.qk_rope_head_dim
+        self.qk_head_dim: int = fd_config.model_config.qk_nope_head_dim \
+            + fd_config.model_config.qk_rope_head_dim
         self.attn_softmax_scale: float = self.qk_head_dim**-0.5
-        if fd_config.model_config.deepseekv3.rope_scaling:
+        if fd_config.model_config.rope_scaling:
-            mscale_all_dim = fd_config.model_config.deepseekv3.rope_scaling.get(
+            mscale_all_dim = fd_config.model_config.rope_scaling.get(
                 "mscale_all_dim", False)  # 1.0
-            scaling_factor = fd_config.model_config.deepseekv3.rope_scaling[
+            scaling_factor = fd_config.model_config.rope_scaling[
                 "factor"]  # 40
             mscale = yarn_get_mscale(scaling_factor, float(mscale_all_dim))
             self.attn_softmax_scale = self.attn_softmax_scale * mscale * mscale
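The rope_scaling branch above only rescales the attention softmax scale. A small standalone sketch of that computation, assuming yarn_get_mscale follows the usual YaRN formula (the real helper is not shown in this diff and may differ), with illustrative head dimensions:

    import math

    def yarn_get_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
        # Assumed YaRN magnitude-scaling formula.
        if scale <= 1.0:
            return 1.0
        return 0.1 * mscale * math.log(scale) + 1.0

    qk_nope_head_dim, qk_rope_head_dim = 128, 64          # illustrative values
    qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
    attn_softmax_scale = qk_head_dim ** -0.5

    rope_scaling = {"factor": 40, "mscale_all_dim": 1.0}  # e.g. a DeepSeek-V3-style config
    if rope_scaling:
        mscale = yarn_get_mscale(rope_scaling["factor"],
                                 float(rope_scaling.get("mscale_all_dim", False)))
        attn_softmax_scale *= mscale * mscale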


@@ -22,7 +22,7 @@ def init_rank_and_device_id(fd_config: FDConfig):
     """
    rank = (fd_config.parallel_config.expert_parallel_rank *
-            fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank)
+            fd_config.parallel_config.tensor_parallel_size + fd_config.parallel_config.tensor_parallel_rank)
    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)


@@ -95,7 +95,7 @@ class XPUAttentionBackend(AttentionBackend):
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
         self.head_dim: int = head_dim
-        self.num_layers: int = fd_config.model_config.num_layers
+        self.num_layers: int = fd_config.model_config.num_hidden_layers
         # pd_disaggregation
         self.use_pd_disaggregation: int = int(


@@ -88,7 +88,7 @@ class GCUFlashAttnBackend(AttentionBackend):
         self.num_heads = num_heads
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
-        self.num_layers = fd_config.model_config.num_layers
+        self.num_layers = fd_config.model_config.num_hidden_layers
         self.position_ids_base = paddle.arange(self.max_seq_len)
         # TODO(zhengjun): Need to adapt the allocation logic and


@@ -88,7 +88,7 @@ class GCUMemEfficientAttnBackend(AttentionBackend):
         self.num_heads = num_heads
         self.head_dim = head_dim
         self.scaling = 1.0 / (self.head_dim**0.5)
-        self.num_layers = fd_config.model_config.num_layers
+        self.num_layers = fd_config.model_config.num_hidden_layers
         self.position_ids_base = paddle.arange(self.max_seq_len)
         # TODO(zhengjun): Need to adapt the allocation logic and


@@ -59,13 +59,11 @@ class VocabParallelEmbedding(nn.Layer):
         self.world_size: int = hcg.get_model_parallel_world_size()
         self.ring_id: int = hcg.get_model_parallel_group().id
         self.use_rope: bool = fd_config.model_config.use_rope
-        self.rope_head_dim: int = fd_config.model_config.rope_head_dim
         self.use_ep: bool = fd_config.parallel_config.use_ep
         self.hidden_dropout_prob: float = fd_config.model_config.hidden_dropout_prob
         self.initializer_range: float = fd_config.model_config.initializer_range
         self.sequence_parallel: bool = fd_config.parallel_config.sequence_parallel
         self.max_position_embeddings: int = fd_config.model_config.max_position_embeddings
-        self.freeze_embedding: bool = fd_config.model_config.freeze_embedding
         self.tie_word_embeddings: bool = fd_config.model_config.tie_word_embeddings
         self.params_dtype: str = params_dtype
@@ -104,15 +102,7 @@ class VocabParallelEmbedding(nn.Layer):
             )
         self.prefix = prefix
-        if self.freeze_embedding:
-            self.word_embeddings.weight.learning_rate = 0.0
-            if not self.use_rope:
-                self.position_embeddings.weight.learning_rate = 0.0
         self.dropout = nn.Dropout(self.hidden_dropout_prob)
-        self.rope_head_dim_shape_tensor = paddle.ones((self.rope_head_dim),
-                                                      dtype="int8")
     def load_state_dict(self, state_dict: Dict[str,
                                                 paddle.Tensor | np.ndarray]):
@@ -122,6 +112,7 @@ class VocabParallelEmbedding(nn.Layer):
         Args:
             state_dict (dict): A dictionary containing the checkpoint weights and biases.
         """
+        a = state_dict[self.prefix + ".weight"]
         if self.tie_word_embeddings:
             self.word_embeddings.weight.set_value(
                 get_tensor(state_dict[self.prefix + ".weight"]).astype(


@@ -266,7 +266,7 @@ class ColumnParallelLinear(LinearBase):
                          with_bias=with_bias,
                          add_bias=add_bias,
                          skip_quant=skip_quant)
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.input_size = input_size
         self.output_size = divide(
             output_size,
@@ -348,7 +348,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
         """
         self.activation = activation
         self.hidden_size = fd_config.model_config.hidden_size
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         super().__init__(fd_config=fd_config,
                          prefix=prefix,
@@ -410,7 +410,7 @@ class QKVParallelLinear(ColumnParallelLinear):
         self.kv_num_heads = fd_config.model_config.num_key_value_heads
         self.hidden_size = fd_config.model_config.hidden_size
         self.head_dim = fd_config.model_config.head_dim
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.num_heads_per_rank = divide(self.num_heads, self.nranks)
         if self.kv_num_heads < self.nranks and self.nranks % self.kv_num_heads == 0:
             self.kv_num_heads_per_rank = 1
@@ -545,7 +545,7 @@ class RowParallelLinear(LinearBase):
                          skip_quant=skip_quant)
         self.fd_config = fd_config
         self.skip_quant = False
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.hidden_size = fd_config.model_config.hidden_size
         self.head_dim = fd_config.model_config.head_dim
         self.num_heads = fd_config.model_config.num_attention_heads // self.nranks
@@ -638,7 +638,7 @@ class KVBatchLinear(LinearBase):
             with_bias (bool): Whether to include bias or not. Defaults to False.
             skip_quant (bool): Whether to skip quantization. Defaults to False.
         """
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.kv_lora_rank = kv_lora_rank
         self.num_attention_heads = num_attention_heads
         self.qk_nope_head_dim = qk_nope_head_dim
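Only the attribute name changes in this file (tensor_parallel_degree becomes tensor_parallel_size); the partitioning math is untouched. A small self-contained sketch of that math, mirroring the QKVParallelLinear logic above, with made-up head counts:

    def split_heads(num_heads: int, kv_num_heads: int, nranks: int):
        # Query heads are divided evenly across tensor-parallel ranks.
        num_heads_per_rank = num_heads // nranks
        # GQA: with fewer KV heads than ranks, each rank keeps a single replica;
        # otherwise KV heads are divided like the query heads.
        if kv_num_heads < nranks and nranks % kv_num_heads == 0:
            kv_num_heads_per_rank = 1
        else:
            kv_num_heads_per_rank = kv_num_heads // nranks
        return num_heads_per_rank, kv_num_heads_per_rank

    print(split_heads(num_heads=64, kv_num_heads=8, nranks=8))   # (8, 1)
    print(split_heads(num_heads=64, kv_num_heads=16, nranks=8))  # (8, 2)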


@@ -49,7 +49,7 @@ class MoEMethodBase(QuantMethodBase):
             from .ep import EPDecoderRunner
             self.ep_decoder_runner = EPDecoderRunner(
                 layer.top_k, layer.hidden_size, layer.num_experts,
-                layer.moe_config.num_max_dispatch_tokens_per_rank,
+                layer.model_config.num_max_dispatch_tokens_per_rank,
                 layer.ep_size, layer.ep_rank)
         else:
             from .ep import EPPrefillRunner


@@ -14,7 +14,6 @@
 # limitations under the License.
 """
-import numpy as np
 import paddle
 from paddle import nn
 from paddleformers.utils.log import logger
@@ -23,8 +22,8 @@ import fastdeploy
 import fastdeploy.model_executor.ops.gpu.deep_gemm as deep_gemm
 from fastdeploy.distributed.communication_op import \
     tensor_model_parallel_all_reduce
-from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
 from fastdeploy.model_executor.layers.utils import get_tensor
+from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func
 from ..utils import create_and_set_parameter
 from .fused_moe_backend_base import MoEMethodBase
@@ -242,7 +241,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.moe_config.num_max_dispatch_tokens_per_rank,
+                layer.model_config.num_max_dispatch_tokens_per_rank,
                 layer.moe_intermediate_size * 2,
             ],
             dtype=paddle.bfloat16,
@@ -252,7 +251,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
             [
                 layer.num_local_experts,
                 layer.ep_size *
-                layer.moe_config.num_max_dispatch_tokens_per_rank,
+                layer.model_config.num_max_dispatch_tokens_per_rank,
                 layer.hidden_size,
             ],
             dtype=paddle.bfloat16,
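Both buffers above are now sized from model_config.num_max_dispatch_tokens_per_rank (default 256 in PRETRAINED_INIT_CONFIGURATION). A quick sketch of the resulting shapes with example values (all sizes illustrative, not taken from this diff):

    num_experts, ep_size = 64, 8
    num_local_experts = num_experts // ep_size
    num_max_dispatch_tokens_per_rank = 256
    moe_intermediate_size, hidden_size = 1024, 4096

    ffn_in_shape = (num_local_experts,
                    ep_size * num_max_dispatch_tokens_per_rank,
                    moe_intermediate_size * 2)
    ffn_out_shape = (num_local_experts,
                     ep_size * num_max_dispatch_tokens_per_rank,
                     hidden_size)
    print(ffn_in_shape, ffn_out_shape)  # (8, 2048, 2048) (8, 2048, 4096)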


@@ -72,8 +72,8 @@ class FusedMoE(nn.Layer):
         self.layer_idx = layer_idx
         self.reduce_results = reduce_results
-        self.tp_size = fd_config.parallel_config.tensor_parallel_degree
+        self.tp_size = fd_config.parallel_config.tensor_parallel_size
-        self.ep_size = fd_config.parallel_config.expert_parallel_degree
+        self.ep_size = fd_config.parallel_config.expert_parallel_size
         self.ep_rank = fd_config.parallel_config.expert_parallel_rank
         assert (self.tp_size >= 1 and self.ep_size == 1) or \
@@ -81,7 +81,6 @@ class FusedMoE(nn.Layer):
             'MoE only support parallelism on TP or EP dimension.'
         self.hidden_size = fd_config.model_config.hidden_size
-        self.moe_config = fd_config.moe_config
         self.num_experts = num_experts
         self.num_local_experts = self.num_experts // self.ep_size
@@ -141,7 +140,7 @@ class FusedMoE(nn.Layer):
                 shape=gate_weight_shape,
                 dtype="float32",
             )
-            if self.moe_config.moe_use_aux_free:
+            if self.model_config.moe_use_aux_free:
                 self.gate_correction_bias = self.create_parameter(
                     shape=gate_correction_bias_shape,
                     dtype="float32",


@@ -43,7 +43,7 @@ def load_ep_checkpoint(model_path: str,
     filtered_map = {k: v for k, v in weight_list.items() if "experts" not in k}
     num_local_ffn_keys = []
-    for i in range(config.moe_layer_start_index, config.num_layers):
+    for i in range(config.moe_layer_start_index, config.num_hidden_layers):
         for j in range(
                 config.num_experts_start_offset,
                 config.num_experts_start_offset + config.num_experts_per_rank,
@@ -261,7 +261,7 @@ def load_composite_checkpoint(
             and os.path.isdir(os.path.join(model_path, f))
         ]
         if len(rank_dirs) > 1:
-            if fd_config.parallel_config.tensor_parallel_degree != len(
+            if fd_config.parallel_config.tensor_parallel_size != len(
                     rank_dirs):
                 raise ValueError(
                     f"Your model only supports loading with tp{len(rank_dirs)}"
@@ -283,7 +283,7 @@ def load_composite_checkpoint(
     else:
         state_dict = load_tp_checkpoint(model_path,
                                         cls,
-                                        fd_config.model_config,
+                                        fd_config.model_config.pretrained_config,
                                         return_numpy=return_numpy)
         if not state_dict:
             raise ValueError("weight not found in state_dict !")


@@ -27,6 +27,7 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.distributed.communication_op import \
     tensor_model_parallel_all_reduce
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.layers.activation import SiluAndMul
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
@@ -40,7 +41,6 @@ from fastdeploy.model_executor.layers.rotary_embedding import \
     DeepseekScalingRotaryEmbedding
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
 from fastdeploy.platforms import current_platform
-from fastdeploy.model_executor.forward_meta import ForwardMeta
 if current_platform.is_cuda():
     from fastdeploy.model_executor.ops.gpu import \
@@ -109,7 +109,7 @@ class DeepSeekV3MoE(nn.Layer):
                  prefix: str) -> None:
         super().__init__()
-        self.tp_size = fd_config.parallel_config.tensor_parallel_degree
+        self.tp_size = fd_config.parallel_config.tensor_parallel_size
         weight_key_map = {
             "gate_weight_key": f"{prefix}.gate.weight",
@@ -124,23 +124,23 @@ class DeepSeekV3MoE(nn.Layer):
         self.fused_moe = FusedMoE(
             fd_config=fd_config,
             reduce_results=False,
-            moe_intermediate_size=fd_config.model_config.deepseekv3.
+            moe_intermediate_size=fd_config.model_config.
             moe_intermediate_size,
-            num_experts=fd_config.model_config.deepseekv3.n_routed_experts,
+            num_experts=fd_config.model_config.n_routed_experts,
-            top_k=fd_config.model_config.deepseekv3.num_experts_per_tok,
+            top_k=fd_config.model_config.num_experts_per_tok,
-            topk_method=fd_config.model_config.deepseekv3.topk_method,
+            topk_method=fd_config.model_config.topk_method,
-            topk_group=fd_config.model_config.deepseekv3.topk_group,
+            topk_group=fd_config.model_config.topk_group,
-            n_group=fd_config.model_config.deepseekv3.n_group,
+            n_group=fd_config.model_config.n_group,
-            routed_scaling_factor=fd_config.model_config.deepseekv3.
+            routed_scaling_factor=fd_config.model_config.
             routed_scaling_factor,
             layer_idx=layer_id,
             weight_key_map=weight_key_map,
         )
-        self.num_shared_experts = fd_config.model_config.deepseekv3.n_shared_experts
+        self.num_shared_experts = fd_config.model_config.n_shared_experts
         shared_experts_intermediate_size = (
             self.num_shared_experts *
-            fd_config.model_config.deepseekv3.moe_intermediate_size)
+            fd_config.model_config.moe_intermediate_size)
         self.shared_experts = DeepSeekV3MLP(
             fd_config=fd_config,
@@ -178,18 +178,18 @@ class DeepseekV3MLAAttention(nn.Layer):
                  prefix: str = "") -> None:
         super().__init__()
-        self.tp_size = fd_config.parallel_config.tensor_parallel_degree
+        self.tp_size = fd_config.parallel_config.tensor_parallel_size
         self.hidden_size = fd_config.model_config.hidden_size
         self.num_attention_heads = fd_config.model_config.num_attention_heads
         self.num_attention_heads_tp = self.num_attention_heads // self.tp_size
         # MLA
-        self.qk_nope_head_dim = fd_config.model_config.deepseekv3.qk_nope_head_dim
+        self.qk_nope_head_dim = fd_config.model_config.qk_nope_head_dim
-        self.qk_rope_head_dim = fd_config.model_config.deepseekv3.qk_rope_head_dim
+        self.qk_rope_head_dim = fd_config.model_config.qk_rope_head_dim
         self.qk_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
-        self.v_head_dim = fd_config.model_config.deepseekv3.v_head_dim
+        self.v_head_dim = fd_config.model_config.v_head_dim
-        self.q_lora_rank = fd_config.model_config.deepseekv3.q_lora_rank
+        self.q_lora_rank = fd_config.model_config.q_lora_rank
-        self.kv_lora_rank = fd_config.model_config.deepseekv3.kv_lora_rank
+        self.kv_lora_rank = fd_config.model_config.kv_lora_rank
         self.attn_softmax_scale = self.qk_head_dim**-0.5
         self.rope_theta = fd_config.model_config.rope_theta
@@ -255,7 +255,7 @@ class DeepseekV3MLAAttention(nn.Layer):
             qk_nope_head_dim=self.qk_nope_head_dim,
             v_head_dim=self.v_head_dim)
-        self.rope_scaling = fd_config.model_config.deepseekv3.rope_scaling
+        self.rope_scaling = fd_config.model_config.rope_scaling
         if self.rope_scaling:
             mscale_all_dim = self.rope_scaling.get("mscale_all_dim", False)
             scaling_factor = self.rope_scaling["factor"]
@@ -449,9 +449,9 @@ class DeepSeekV3DecoderLayer(nn.Layer):
             prefix=f"{prefix}.self_attn",
         )
-        if (fd_config.model_config.deepseekv3.n_routed_experts is not None
+        if (fd_config.model_config.n_routed_experts is not None
                 and layer_id
-                >= fd_config.model_config.deepseekv3.first_k_dense_replace):
+                >= fd_config.model_config.first_k_dense_replace):
             self.mlp = DeepSeekV3MoE(
                 fd_config=fd_config,
                 layer_id=layer_id,
@@ -525,8 +525,8 @@ class DeepSeekV3Model(nn.Layer):
         Initializer for the DeepSeekV3Model class.
         """
         super().__init__()
-        self.num_layers = fd_config.model_config.num_layers
+        self.num_layers = fd_config.model_config.num_hidden_layers
-        fd_config.model_config.prefix_name = "deepseek_v3"
+        fd_config.model_config.pretrained_config.prefix_name = "deepseek_v3"
         self.embeddings = VocabParallelEmbedding(
             fd_config,
@@ -539,7 +539,7 @@ class DeepSeekV3Model(nn.Layer):
         self.decoder_layers = nn.LayerList([
             DeepSeekV3DecoderLayer(
                 fd_config,
-                prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
+                prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
             for i in range(self.num_layers)
         ])
@@ -755,5 +755,5 @@ class DeepSeekV3PretrainedModel(PretrainedModel):
         return final_actions
-    mappings = get_tensor_parallel_split_mappings(config.num_layers)
+    mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
     return mappings


@@ -25,7 +25,7 @@ from paddle import nn
 from paddleformers.transformers import PretrainedModel
 from paddleformers.utils.log import logger
-from fastdeploy.config import FDConfig, ModelConfig
+from fastdeploy.config import FDConfig
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.graph_optimization.decorator import \
     support_graph_optimization
@@ -54,7 +54,7 @@ class Ernie4_5_MLP(nn.Layer):
         reduce_results: bool = True,
     ) -> None:
         super().__init__()
-        self.nranks = fd_config.parallel_config.tensor_parallel_degree
+        self.nranks = fd_config.parallel_config.tensor_parallel_size
         self.gate_up_proj = MergedColumnParallelLinear(
             fd_config=fd_config,
             prefix=f"{prefix}.up_gate_proj",
@@ -179,16 +179,16 @@ class Ernie4_5_MoE(nn.Layer):
         self.fused_moe = FusedMoE(
             fd_config=fd_config,
-            moe_intermediate_size=fd_config.moe_config.moe_intermediate_size,
+            moe_intermediate_size=fd_config.model_config.moe_intermediate_size,
-            num_experts=fd_config.moe_config.num_experts,
+            num_experts=fd_config.model_config.moe_num_experts,
-            top_k=fd_config.moe_config.top_k,
+            top_k=fd_config.model_config.moe_k,
             layer_idx=layer_id,
             weight_key_map=weight_key_map,
         )
-        self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
+        self.num_shared_experts = fd_config.model_config.moe_num_shared_experts
         if self.num_shared_experts > 0:
-            shared_experts_hidden_dim = self.num_shared_experts * fd_config.moe_config.moe_intermediate_size
+            shared_experts_hidden_dim = self.num_shared_experts * fd_config.model_config.moe_intermediate_size
             self.shared_experts = Ernie4_5_MLP(
                 fd_config=fd_config,
                 intermediate_size=shared_experts_hidden_dim,
@@ -271,8 +271,8 @@ class Ernie4_5_DecoderLayer(nn.Layer):
             prefix=f"{prefix}.self_attn",
         )
-        if (fd_config.moe_config.num_experts is not None
+        if (fd_config.model_config.moe_num_experts is not None
-                and layer_id >= fd_config.moe_config.moe_layer_start_index):
+                and layer_id >= fd_config.model_config.moe_layer_start_index):
             self.mlp = Ernie4_5_MoE(
                 fd_config=fd_config,
                 layer_id=layer_id,
@@ -281,7 +281,7 @@ class Ernie4_5_DecoderLayer(nn.Layer):
         else:
             self.mlp = Ernie4_5_MLP(
                 fd_config=fd_config,
-                intermediate_size=fd_config.model_config.ffn_hidden_size,
+                intermediate_size=fd_config.model_config.intermediate_size,
                 prefix=f"{prefix}.mlp",
             )
@@ -346,20 +346,20 @@ class Ernie4_5_Model(nn.Layer):
         """
         super().__init__()
-        self.num_layers = fd_config.model_config.num_layers
+        self.num_layers = fd_config.model_config.num_hidden_layers
-        fd_config.model_config.prefix_name = "ernie"
+        fd_config.model_config.pretrained_config.prefix_name = "ernie"
         self.embeddings = VocabParallelEmbedding(
             fd_config=fd_config,
             num_embeddings=fd_config.model_config.vocab_size,
             embedding_dim=fd_config.model_config.hidden_size,
             params_dtype=paddle.get_default_dtype(),
-            prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"))
+            prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"))
         self.hidden_layers = nn.LayerList([
             Ernie4_5_DecoderLayer(
                 fd_config=fd_config,
-                prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
+                prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
             for i in range(self.num_layers)
         ])
@@ -367,7 +367,7 @@ class Ernie4_5_Model(nn.Layer):
             fd_config,
             hidden_size=fd_config.model_config.hidden_size,
             eps=fd_config.model_config.rms_norm_eps,
-            prefix=f"{fd_config.model_config.prefix_name}.norm",
+            prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
         )
     def load_state_dict(self, state_dict):
@@ -466,8 +466,8 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
             shape=[0, self.fd_config.model_config.hidden_size],
             dtype=paddle.get_default_dtype(),
         )
-        for i in range(self.fd_config.moe_config.moe_layer_start_index,
+        for i in range(self.fd_config.model_config.moe_layer_start_index,
-                       self.fd_config.model_config.num_layers):
+                       self.fd_config.model_config.num_hidden_layers):
             self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
     def forward(
@@ -559,7 +559,7 @@ class Ernie4_5_PretrainedModel(PretrainedModel):
     ]
     @classmethod
-    def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
+    def _get_tensor_parallel_mappings(cls, config, is_split=True):
         """
         get_tensor_parallel_mappings
         """
@@ -603,7 +603,7 @@ class Ernie4_5_PretrainedModel(PretrainedModel):
             )
         return final_actions
     mappings = get_tensor_parallel_split_mappings(
-        config.num_layers,
+        config.num_hidden_layers,
         config.moe_num_experts,
         config.moe_layer_start_index,
         config.prefix_name,
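Ernie4_5_DecoderLayer above now reads the MoE layout straight from model_config; per layer, the choice between the MoE block and the dense MLP boils down to the check sketched here (field names from the diff, values invented):

    moe_num_experts = 64       # illustrative
    moe_layer_start_index = 1
    num_hidden_layers = 4

    for layer_id in range(num_hidden_layers):
        if moe_num_experts is not None and layer_id >= moe_layer_start_index:
            kind = "Ernie4_5_MoE"
        else:
            kind = "Ernie4_5_MLP"
        print(layer_id, kind)  # layer 0 stays dense, layers 1-3 use MoE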


@@ -25,12 +25,12 @@ from paddle import nn
 from paddleformers.transformers import PretrainedModel
 from paddleformers.utils.log import logger
-from fastdeploy.config import FDConfig, ModelConfig
+from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.layers.mtp_linear import ParallelEHProjection
 from fastdeploy.model_executor.layers.normalization import RMSNorm
 from fastdeploy.model_executor.models.ernie4_5_moe import Ernie4_5_DecoderLayer
 from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.model_executor.forward_meta import ForwardMeta
 class Ernie4_5_MTPPretrainedModel(PretrainedModel):
@@ -47,7 +47,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel):
         return None
     @classmethod
-    def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
+    def _get_tensor_parallel_mappings(cls, config, is_split=True):
         """
         get_tensor_parallel_mappings
         """
@@ -237,7 +237,7 @@ class Ernie4_5_MTPPretrainedModel(PretrainedModel):
             moe_num_experts = 0
         mappings = get_tensor_parallel_split_mappings(
-            config.num_layers,
+            config.num_hidden_layers,
             moe_num_experts,
             config.moe_layer_start_index,
         )
@@ -262,13 +262,13 @@ class Ernie4_5_MTPModel(nn.Layer):
         """
         super().__init__()
-        self.num_layers = fd_config.model_config.num_layers
+        self.num_layers = fd_config.model_config.num_hidden_layers
         self.embeddings = fd_config.speculative_config.sharing_model.model.embeddings
         self.hidden_layers = nn.LayerList([
             Ernie4_5_DecoderLayer(
                 fd_config=fd_config,
-                prefix=f"{fd_config.model_config.prefix_name}.{i}")
+                prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.{i}")
             for i in range(self.num_layers)
         ])
@@ -398,8 +398,8 @@ class Ernie4_5_MTPForCausalLM(ModelForCasualLM):
shape=[0, self.fd_config.model_config.hidden_size], shape=[0, self.fd_config.model_config.hidden_size],
dtype=paddle.get_default_dtype(), dtype=paddle.get_default_dtype(),
) )
for i in range(self.fd_config.moe_config.moe_layer_start_index, for i in range(self.fd_config.model_config.moe_layer_start_index,
self.fd_config.model_config.num_layers): self.fd_config.model_config.num_hidden_layers):
self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states) self.model.hidden_layers[i].mlp.fused_moe(fake_hidden_states)
def forward( def forward(

View File

@@ -1,167 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import copy
from fastdeploy.config import ModelConfig
from .dfnrope.modeling import DFNRopeVisionTransformerConfig
__all__ = [
"Ernie4_5_VLMoeConfig",
]
class Ernie4_5_VLMoeConfig(ModelConfig):
r"""
This is the configuration class to store the configuration of a [`~ErnieModel`]. It is used to instantiate an Ernie
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Ernie-7B.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the Ernie model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`~ErnieModel`] or [`~TFErnieModel`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings(`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
Example:
```python
>>> from paddleformers.transformer import ErnieModel, ErnieConfig
>>> # Initializing a Ernie ernie-7b style configuration
>>> configuration = ErnieConfig()
>>> # Initializing a model from the ernie-7b style configuration
>>> model = ErnieModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "erniemoevl"
attribute_map = {
"n_positions": "max_position_embeddings",
"n_embd": "hidden_size",
"n_layer": "num_hidden_layers",
"n_head": "num_attention_heads",
"n_inner": "intermediate_size",
"activation_function": "hidden_act",
}
def __init__(
self,
vision_config=None,
im_patch_id=None,
pixel_hidden_size=None, # None for fuyu
modality_detach=False,
temporal_conv_size=2,
spatial_conv_size=2,
mm_vocab_size=0, # vocab for mm specialtokens
max_text_id=None,
use_temporal_conv=True,
moe_use_size_all2all=False,
moe_num_attn_experts=False,
moe_dense_experts_token_type_id: int = 3,
moe_use_hard_gate: bool = True,
moe_fuse_experts: bool = False,
moe_use_token_type_bias: bool = False,
disable_ffn_model_parallel=False,
fuse_attn_ffn=True,
rope_3d=True,
freq_allocation=20,
using_precision_check=False,
use_recompute_resampler=False,
resampler_fuse_rms_norm=False,
moe_layer_feed_fake_token=False,
moe_num_experts=0,
**kwargs,
):
super().__init__(**kwargs)
self.vision_config = DFNRopeVisionTransformerConfig(
**vision_config) if vision_config else None
self.im_patch_id = im_patch_id
self.pixel_hidden_size = pixel_hidden_size
self.modality_detach = modality_detach
self.temporal_conv_size = temporal_conv_size
self.spatial_conv_size = spatial_conv_size
self.mm_vocab_size = mm_vocab_size
self.max_text_id = max_text_id
self.use_temporal_conv = use_temporal_conv
self.moe_use_size_all2all = moe_use_size_all2all
self.moe_num_attn_experts = moe_num_attn_experts
self.moe_dense_experts_token_type_id = moe_dense_experts_token_type_id
self.moe_use_hard_gate = moe_use_hard_gate
self.moe_fuse_experts = moe_fuse_experts
self.moe_use_token_type_bias = moe_use_token_type_bias
self.disable_ffn_model_parallel = disable_ffn_model_parallel
self.fuse_attn_ffn = fuse_attn_ffn
self.rope_3d = rope_3d
self.freq_allocation = freq_allocation
self.using_precision_check = using_precision_check
self.use_recompute_resampler = use_recompute_resampler
self.resampler_fuse_rms_norm = resampler_fuse_rms_norm
self.moe_layer_feed_fake_token = moe_layer_feed_fake_token
self.moe_num_experts = moe_num_experts
@property
def multimodel_experts(self) -> bool:
"""是否有多种类型的experts."""
return isinstance(self.moe_num_experts,
(tuple, list)) and len(self.moe_num_experts) > 1
@property
def use_moe(self) -> bool:
"""
Check if model is using MoE architecture.
Returns:
bool: True if moe_num_experts > 0, False otherwise
"""
return sum(
self.moe_num_experts
) > 0 if self.multimodel_experts else self.moe_num_experts > 0
def to_dict(self, saving_file=False):
"""to_dict"""
output = copy.deepcopy(self.__dict__)
if self.vision_config:
output["vision_config"] = (
self.vision_config.to_diff_dict() if isinstance(
self.vision_config,
(DFNRopeVisionTransformerConfig)) else self.vision_config)
output["model_type"] = self.__class__.model_type
return output
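For readers following the refactor, the two properties of the deleted config above gate the MoE code path purely on `moe_num_experts`. A minimal standalone sketch (hypothetical values, not part of this commit) of how they evaluate:

# Standalone sketch of the multimodel_experts / use_moe logic above
# (hypothetical values; not part of this commit).
def multimodel_experts(moe_num_experts):
    return isinstance(moe_num_experts, (tuple, list)) and len(moe_num_experts) > 1

def use_moe(moe_num_experts):
    if multimodel_experts(moe_num_experts):
        return sum(moe_num_experts) > 0
    return moe_num_experts > 0

print(use_moe(0))         # False -> dense model
print(use_moe(64))        # True  -> one shared expert pool
print(use_moe([64, 64]))  # True  -> separate text / image expert pools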

View File

@@ -72,8 +72,8 @@ class Ernie4_5_VLMoE(nn.Layer):
prefix: str) -> None:
super().__init__()
-self.tp_size = fd_config.parallel_config.tensor_parallel_degree
+self.tp_size = fd_config.parallel_config.tensor_parallel_size
-moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
+moe_layer_start_index = fd_config.model_config.moe_layer_start_index
if isinstance(moe_layer_start_index, int):
text_moe_layer_start_index = moe_layer_start_index
image_moe_layer_start_index = moe_layer_start_index
@@ -81,10 +81,10 @@ class Ernie4_5_VLMoE(nn.Layer):
text_moe_layer_start_index = moe_layer_start_index[0]
image_moe_layer_start_index = moe_layer_start_index[1]
-moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
+moe_layer_end_index = fd_config.model_config.moe_layer_end_index
if moe_layer_end_index is None:
-text_moe_layer_end_index = fd_config.model_config.num_layers
+text_moe_layer_end_index = fd_config.model_config.num_hidden_layers
-image_moe_layer_end_index = fd_config.model_config.num_layers
+image_moe_layer_end_index = fd_config.model_config.num_hidden_layers
elif isinstance(moe_layer_end_index, int):
text_moe_layer_end_index = moe_layer_end_index
image_moe_layer_end_index = moe_layer_end_index
@@ -107,11 +107,11 @@ class Ernie4_5_VLMoE(nn.Layer):
self.mlp_text = FusedMoE(
fd_config=fd_config,
reduce_results=False,
-moe_intermediate_size=fd_config.moe_config.
+moe_intermediate_size=fd_config.model_config.
moe_intermediate_size[0],
-num_experts=fd_config.moe_config.num_experts[0],
+num_experts=fd_config.model_config.moe_num_experts[0],
expert_id_offset=0,
-top_k=fd_config.moe_config.top_k,
+top_k=fd_config.model_config.moe_k,
layer_idx=layer_id,
moe_tag="Text",
weight_key_map=weight_key_map,
@@ -120,7 +120,7 @@ class Ernie4_5_VLMoE(nn.Layer):
else:
self.mlp_text = Ernie4_5_VLMLP(
fd_config=fd_config,
-intermediate_size=fd_config.model_config.ffn_hidden_size,
+intermediate_size=fd_config.model_config.intermediate_size,
prefix=f"{prefix}",
)
@@ -139,11 +139,11 @@ class Ernie4_5_VLMoE(nn.Layer):
self.mlp_image = FusedMoE(
fd_config=fd_config,
reduce_results=False,
-moe_intermediate_size=fd_config.moe_config.
+moe_intermediate_size=fd_config.model_config.
moe_intermediate_size[1],
-num_experts=fd_config.moe_config.num_experts[1],
+num_experts=fd_config.model_config.moe_num_experts[1],
-expert_id_offset=fd_config.moe_config.num_experts[0],
+expert_id_offset=fd_config.model_config.moe_num_experts[0],
-top_k=fd_config.moe_config.top_k,
+top_k=fd_config.model_config.moe_k,
layer_idx=layer_id,
moe_tag="Image",
weight_key_map=weight_key_map,
@@ -152,16 +152,16 @@ class Ernie4_5_VLMoE(nn.Layer):
else:
self.mlp_image = Ernie4_5_VLMLP(
fd_config=fd_config,
-intermediate_size=fd_config.model_config.ffn_hidden_size,
+intermediate_size=fd_config.model_config.intermediate_size,
prefix=f"{prefix}",
)
-self.num_shared_experts = fd_config.moe_config.moe_num_shared_experts
+self.num_shared_experts = fd_config.model_config.moe_num_shared_experts
if self.num_shared_experts > 0:
self.share_experts = Ernie4_5_VLMLP(
fd_config=fd_config,
intermediate_size=self.num_shared_experts *
-fd_config.moe_config.moe_intermediate_size[0],
+fd_config.model_config.moe_intermediate_size[0],
prefix=f"{prefix}.shared_experts",
reduce_results=False,
)
@@ -235,15 +235,15 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
super().__init__()
layer_id = int(prefix.split(sep='.')[-1])
-moe_layer_start_index = fd_config.moe_config.moe_layer_start_index
+moe_layer_start_index = fd_config.model_config.moe_layer_start_index
if isinstance(moe_layer_start_index, list):
min_moe_layer_start_index = min(moe_layer_start_index)
else:
min_moe_layer_start_index = moe_layer_start_index
-max_moe_layer_end_index = fd_config.model_config.num_layers
+max_moe_layer_end_index = fd_config.model_config.num_hidden_layers
-if fd_config.moe_config.moe_layer_end_index is not None:
+if fd_config.model_config.moe_layer_end_index is not None:
-moe_layer_end_index = fd_config.moe_config.moe_layer_end_index
+moe_layer_end_index = fd_config.model_config.moe_layer_end_index
if isinstance(moe_layer_start_index, list):
max_moe_layer_end_index = max(moe_layer_end_index)
else:
@@ -257,7 +257,7 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
assert min_moe_layer_start_index <= max_moe_layer_end_index
-if (fd_config.moe_config.num_experts is not None
+if (fd_config.model_config.moe_num_experts is not None
and layer_id >= min_moe_layer_start_index
and layer_id <= max_moe_layer_end_index):
self.mlp = Ernie4_5_VLMoE(
@@ -268,7 +268,7 @@ class Ernie4_5_VLDecoderLayer(nn.Layer):
else:
self.mlp = Ernie4_5_VLMLP(
fd_config=fd_config,
-intermediate_size=fd_config.model_config.ffn_hidden_size,
+intermediate_size=fd_config.model_config.intermediate_size,
prefix=f"{prefix}.mlp",
)
@@ -337,23 +337,23 @@ class Ernie4_5_VLModel(nn.Layer):
"""
super().__init__()
-self.num_layers = fd_config.model_config.num_layers
+self.num_layers = fd_config.model_config.num_hidden_layers
-self.im_patch_id = fd_config.moe_config.im_patch_id
+self.im_patch_id = fd_config.model_config.im_patch_id
self._dtype = fd_config.model_config.dtype
-fd_config.model_config.prefix_name = "ernie"
+fd_config.model_config.pretrained_config.prefix_name = "ernie"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
-prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
+prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.hidden_layers = nn.LayerList([
Ernie4_5_VLDecoderLayer(
fd_config=fd_config,
-prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -361,7 +361,7 @@ class Ernie4_5_VLModel(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=fd_config.model_config.rms_norm_eps,
-prefix=f"{fd_config.model_config.prefix_name}.norm",
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -748,7 +748,7 @@ class Ernie4_5_VLPretrainedModel(PretrainedModel):
moe_layer_start_index = config.moe_layer_start_index
mappings = get_tensor_parallel_split_mappings(
-config.num_layers,
+config.num_hidden_layers,
config.moe_num_experts,
moe_layer_start_index,
config.prefix_name,
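The int-or-list handling in the `Ernie4_5_VLMoE` constructor above is hard to follow in diff form. A small standalone sketch (hypothetical helper name and values, mirroring the logic shown rather than any FastDeploy API) of how the per-modality layer ranges resolve:

# Sketch of the text/image MoE layer-range resolution used above
# (hypothetical helper; mirrors the constructor logic, not a FastDeploy API).
def resolve_moe_ranges(moe_layer_start_index, moe_layer_end_index, num_hidden_layers):
    if isinstance(moe_layer_start_index, int):
        text_start = image_start = moe_layer_start_index
    else:
        text_start, image_start = moe_layer_start_index[0], moe_layer_start_index[1]
    if moe_layer_end_index is None:
        text_end = image_end = num_hidden_layers
    elif isinstance(moe_layer_end_index, int):
        text_end = image_end = moe_layer_end_index
    else:
        text_end, image_end = moe_layer_end_index[0], moe_layer_end_index[1]
    return (text_start, text_end), (image_start, image_end)

# e.g. a 28-layer model whose text experts start at layer 1 and image experts at layer 3
print(resolve_moe_ranges([1, 3], None, 28))  # ((1, 28), (3, 28))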

View File

@@ -53,7 +53,7 @@ class ModelForCasualLM(nn.Layer, ABC):
"""
Args:
configs (dict): Configurations including parameters such as max_dec_len, min_dec_len, decode_strategy,
-ori_vocab_size, use_topp_sampling, etc.
+vocab_size, use_topp_sampling, etc.
"""
super(ModelForCasualLM, self).__init__()
self.fd_config = configs

View File

@@ -24,6 +24,7 @@ from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
from fastdeploy.config import FDConfig, ModelConfig
+from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
@@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
-from fastdeploy.model_executor.forward_meta import ForwardMeta
class Qwen2MLP(nn.Layer):
@@ -47,12 +47,12 @@ class Qwen2MLP(nn.Layer):
prefix: str = "",
) -> None:
super().__init__()
-self.nranks = fd_config.parallel_config.tensor_parallel_degree
+self.nranks = fd_config.parallel_config.tensor_parallel_size
self.gate_up_proj = MergedColumnParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
-output_size=fd_config.model_config.ffn_hidden_size * 2,
+output_size=fd_config.model_config.intermediate_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
)
@@ -60,7 +60,7 @@ class Qwen2MLP(nn.Layer):
self.down_proj = RowParallelLinear(
fd_config=fd_config,
prefix=f"{prefix}.down_proj",
-input_size=fd_config.model_config.ffn_hidden_size,
+input_size=fd_config.model_config.intermediate_size,
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
@@ -227,21 +227,21 @@ class Qwen2Model(nn.Layer):
"""
super().__init__()
-self.num_layers = fd_config.model_config.num_layers
+self.num_layers = fd_config.model_config.num_hidden_layers
-fd_config.model_config.prefix_name = "qwen2"
+fd_config.model_config.pretrained_config.prefix_name = "qwen2"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
-prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
+prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen2DecoderLayer(
fd_config=fd_config,
-prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -249,7 +249,7 @@ class Qwen2Model(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=fd_config.model_config.rms_norm_eps,
-prefix=f"{fd_config.model_config.prefix_name}.norm",
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -427,6 +427,6 @@ class Qwen2PretrainedModel(PretrainedModel):
return final_actions
-mappings = get_tensor_parallel_split_mappings(config.num_layers)
+mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
return mappings

View File

@@ -23,7 +23,8 @@ from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
-from fastdeploy.config import FDConfig, ModelConfig
+from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.attention.attention import Attention
@@ -34,7 +35,6 @@ from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer, Qwen2MLP
-from fastdeploy.model_executor.forward_meta import ForwardMeta
class Qwen3MLP(Qwen2MLP):
@@ -59,7 +59,7 @@ class Qwen3Attention(nn.Layer):
self.qkv_proj = QKVParallelLinear(fd_config,
prefix=f"{prefix}.qkv_proj",
with_bias=False)
-nranks = fd_config.parallel_config.tensor_parallel_degree
+nranks = fd_config.parallel_config.tensor_parallel_size
self.o_proj = RowParallelLinear(
fd_config,
@@ -85,7 +85,7 @@ class Qwen3Attention(nn.Layer):
prefix=f"{prefix}.k_norm",
begin_norm_axis=2)
-nranks = fd_config.parallel_config.tensor_parallel_degree
+nranks = fd_config.parallel_config.tensor_parallel_size
num_kv_heads_replicas = max(1, nranks // fd_config.model_config.num_key_value_heads)
self.q_size = fd_config.model_config.num_attention_heads * self.head_dim // nranks
self.kv_size = fd_config.model_config.num_key_value_heads * self.head_dim * num_kv_heads_replicas // nranks
@@ -163,21 +163,21 @@ class Qwen3Model(nn.Layer):
"""
super().__init__()
-self.num_layers = fd_config.model_config.num_layers
+self.num_layers = fd_config.model_config.num_hidden_layers
-fd_config.model_config.prefix_name = "model"
+fd_config.model_config.pretrained_config.prefix_name = "model"
self.embeddings = VocabParallelEmbedding(
fd_config=fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
-prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
+prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config=fd_config,
-prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -185,7 +185,7 @@ class Qwen3Model(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=fd_config.model_config.rms_norm_eps,
-prefix=f"{fd_config.model_config.prefix_name}.norm",
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -307,7 +307,7 @@ class Qwen3PretrainedModel(PretrainedModel):
return None
@classmethod
-def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
+def _get_tensor_parallel_mappings(cls, config, is_split=True):
from paddleformers.transformers.conversion_utils import \
split_or_merge_func
@@ -358,5 +358,5 @@ class Qwen3PretrainedModel(PretrainedModel):
return final_actions
-mappings = get_tensor_parallel_split_mappings(config.num_layers)
+mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers)
return mappings
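The per-rank attention sizing above (`num_kv_heads_replicas`, `q_size`, `kv_size`) is easiest to see with numbers. A worked example with assumed Qwen3-like values (32 query heads, 8 KV heads, head_dim 128; these numbers are not taken from this commit):

# Worked example of the per-rank head sizing above
# (hypothetical Qwen3-like numbers: 32 query heads, 8 KV heads, head_dim 128).
num_attention_heads, num_key_value_heads, head_dim = 32, 8, 128

for nranks in (1, 4, 16):
    num_kv_heads_replicas = max(1, nranks // num_key_value_heads)
    q_size = num_attention_heads * head_dim // nranks
    kv_size = num_key_value_heads * head_dim * num_kv_heads_replicas // nranks
    print(nranks, q_size, kv_size)
# 1  -> q_size 4096, kv_size 1024
# 4  -> q_size 1024, kv_size  256
# 16 -> q_size  256, kv_size  128  (KV heads replicated 2x so every rank still holds one)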

View File

@@ -23,20 +23,19 @@ from paddle import nn
from paddleformers.transformers import PretrainedModel
from paddleformers.utils.log import logger
-from fastdeploy.config import FDConfig, ModelConfig
+from fastdeploy.config import FDConfig
+from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import \
support_graph_optimization
from fastdeploy.model_executor.layers.activation import SiluAndMul
-from fastdeploy.model_executor.layers.attention.attention import Attention
from fastdeploy.model_executor.layers.embeddings import VocabParallelEmbedding
from fastdeploy.model_executor.layers.linear import (
-MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear)
+MergedColumnParallelLinear, RowParallelLinear)
from fastdeploy.model_executor.layers.lm_head import ParallelLMHead
from fastdeploy.model_executor.layers.moe.moe import FusedMoE
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.qwen3 import Qwen3Attention
-from fastdeploy.model_executor.forward_meta import ForwardMeta
class Qwen3MLP(nn.Layer):
@@ -49,13 +48,13 @@ class Qwen3MLP(nn.Layer):
prefix: str = "",
) -> None:
super().__init__()
-self.nranks = fd_config.parallel_config.tensor_parallel_degree
+self.nranks = fd_config.parallel_config.tensor_parallel_size
self.gate_up_proj = MergedColumnParallelLinear(
fd_config,
prefix=f"{prefix}.up_gate_proj",
input_size=fd_config.model_config.hidden_size,
-output_size=fd_config.model_config.ffn_hidden_size * 2,
+output_size=fd_config.model_config.intermediate_size * 2,
with_bias=False,
activation=fd_config.model_config.hidden_act,
)
@@ -63,7 +62,7 @@ class Qwen3MLP(nn.Layer):
self.down_proj = RowParallelLinear(
fd_config,
prefix=f"{prefix}.down_proj",
-input_size=fd_config.model_config.ffn_hidden_size,
+input_size=fd_config.model_config.intermediate_size,
output_size=fd_config.model_config.hidden_size,
with_bias=False,
)
@@ -115,14 +114,14 @@ class Qwen3DecoderLayer(nn.Layer):
f"{prefix}.mlp.experts.{{}}.down_proj.weight",
}
-if (fd_config.moe_config.num_experts is not None
+if (fd_config.model_config.moe_num_experts is not None
-and layer_id >= fd_config.moe_config.moe_layer_start_index):
+and layer_id >= fd_config.model_config.moe_layer_start_index):
self.mlp = FusedMoE(fd_config,
-moe_intermediate_size=fd_config.moe_config.
+moe_intermediate_size=fd_config.model_config.
moe_intermediate_size,
-num_experts=fd_config.moe_config.num_experts,
+num_experts=fd_config.model_config.moe_num_experts,
-top_k=fd_config.moe_config.top_k,
+top_k=fd_config.model_config.moe_topk,
layer_idx=layer_id,
weight_key_map=weight_key_map)
else:
@@ -199,21 +198,21 @@ class Qwen3MoeModel(nn.Layer):
"""
super().__init__()
-self.num_layers = fd_config.model_config.num_layers
+self.num_layers = fd_config.model_config.num_hidden_layers
-fd_config.model_config.prefix_name = "model"
+fd_config.model_config.pretrained_config.prefix_name = "model"
self.embeddings = VocabParallelEmbedding(
fd_config,
num_embeddings=fd_config.model_config.vocab_size,
embedding_dim=fd_config.model_config.hidden_size,
params_dtype=paddle.get_default_dtype,
-prefix=(f"{fd_config.model_config.prefix_name}.embed_tokens"),
+prefix=(f"{fd_config.model_config.pretrained_config.prefix_name}.embed_tokens"),
)
self.layers = nn.LayerList([
Qwen3DecoderLayer(
fd_config,
-prefix=f"{fd_config.model_config.prefix_name}.layers.{i}")
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.layers.{i}")
for i in range(self.num_layers)
])
@@ -221,7 +220,7 @@ class Qwen3MoeModel(nn.Layer):
fd_config,
hidden_size=fd_config.model_config.hidden_size,
eps=1e-6,
-prefix=f"{fd_config.model_config.prefix_name}.norm",
+prefix=f"{fd_config.model_config.pretrained_config.prefix_name}.norm",
)
def load_state_dict(self, state_dict):
@@ -338,7 +337,7 @@ class Qwen3MoePretrainedModel(PretrainedModel):
return None
@classmethod
-def _get_tensor_parallel_mappings(cls, config: ModelConfig, is_split=True):
+def _get_tensor_parallel_mappings(cls, config, is_split=True):
# TODO not support TP split now, next PR will support TP.
from paddleformers.transformers.conversion_utils import \
@@ -351,7 +350,7 @@ class Qwen3MoePretrainedModel(PretrainedModel):
num_attention_heads=config.num_attention_heads,
)
-def get_tensor_parallel_split_mappings(num_layers, moe_num_experts):
+def get_tensor_parallel_split_mappings(num_layers, num_experts):
final_actions = {}
base_actions = {
@@ -402,23 +401,23 @@ class Qwen3MoePretrainedModel(PretrainedModel):
for key, action in base_actions.items():
for i in range(num_layers):
newkey = key.replace("layers.0.", f"layers.{i}.")
-for j in range(moe_num_experts):
+for j in range(num_experts):
newkey2 = newkey.replace("experts.0.", f"experts.{j}.")
final_actions[newkey2] = action
return final_actions
-moe_num_experts = 0
+num_experts = 0
if isinstance(config.moe_num_experts, list):
-moe_num_experts = sum(config.moe_num_experts)
+num_experts = sum(config.moe_num_experts)
elif isinstance(config.moe_num_experts, int):
-moe_num_experts = config.moe_num_experts
+num_experts = config.moe_num_experts
else:
raise ValueError(
-f"Not support type of moe_num_experts [{type(config.moe_num_experts)}]"
+f"Not support type of num_experts [{type(config.moe_num_experts)}]"
)
-mappings = get_tensor_parallel_split_mappings(config.num_layers,
+mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers,
-moe_num_experts)
+num_experts)
return mappings
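The renamed `get_tensor_parallel_split_mappings` helper expands the `layers.0.` / `experts.0.` template keys across every layer and expert. A standalone sketch of that expansion (hypothetical key and action values; the real actions are `split_or_merge_func` partials):

# Sketch of how the split-mapping templates above are expanded across layers
# and experts (hypothetical key/action values; the real actions are
# split_or_merge_func partials).
def expand_mappings(base_actions, num_layers, num_experts):
    final_actions = {}
    for key, action in base_actions.items():
        for i in range(num_layers):
            newkey = key.replace("layers.0.", f"layers.{i}.")
            for j in range(num_experts):
                newkey2 = newkey.replace("experts.0.", f"experts.{j}.")
                final_actions[newkey2] = action
    return final_actions

actions = expand_mappings(
    {"model.layers.0.mlp.experts.0.up_proj.weight": "split_column"},
    num_layers=2, num_experts=2)
print(sorted(actions))
# ['model.layers.0.mlp.experts.0.up_proj.weight',
#  'model.layers.0.mlp.experts.1.up_proj.weight',
#  'model.layers.1.mlp.experts.0.up_proj.weight',
#  'model.layers.1.mlp.experts.1.up_proj.weight']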

View File

@@ -36,10 +36,9 @@ def check_tensor_parallel_prerequisites(
safetensor_keys: List[str],
) -> None:
"""check_tensor_parallel_prerequisites"""
-if fd_config.parallel_config.tensor_parallel_degree > 1:
+if fd_config.parallel_config.tensor_parallel_size > 1:
tensor_parallel_map = cls._get_tensor_parallel_mappings(
-fd_config.model_config, is_split=True
-)
+fd_config.model_config.pretrained_config, is_split=True)
if not tensor_parallel_map:
logger.error(
"filtered_quant_map should not be empty. \

View File

@@ -165,7 +165,7 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = \
f"{train_base_name}.{layer_idx}.mlp.gate.weight"
-if self.fd_config.moe_config.moe_use_aux_free:
+if self.fd_config.model_config.moe_use_aux_free:
infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_correction_bias"] = \
f"{train_base_name}.{layer_idx}.mlp.moe_statics.e_score_correction_bias"
@@ -178,7 +178,7 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
f"{train_base_name}.{layer_idx}.mlp.shared_experts.down_proj.weight"
# MoE experts mappings
-for expert_idx in range(self.fd_config.moe_config.num_experts):
+for expert_idx in range(self.fd_config.model_config.moe_num_experts):
for ph in place_holders:
# FFN1 (up_gate_proj)
ffn1_key = f"{infer_base_name}.{layer_idx}.mlp.fused_moe.moe_ffn1_weight"
@@ -198,12 +198,12 @@ class Ernie4_5_MoeForCausalLMRL(Ernie4_5_MoeForCausalLM):
# Process non-MoE layers
for layer_idx in range(
-self.fd_config.moe_config.moe_layer_start_index):
+self.fd_config.model_config.moe_layer_start_index):
_add_layer_mappings(layer_idx, is_moe_layer=False)
# Process MoE layers
-for layer_idx in range(self.fd_config.moe_config.moe_layer_start_index,
+for layer_idx in range(self.fd_config.model_config.moe_layer_start_index,
-self.fd_config.model_config.num_layers):
+self.fd_config.model_config.num_hidden_layers):
_add_layer_mappings(layer_idx, is_moe_layer=True)
return infer_to_train
@@ -278,7 +278,7 @@ class Qwen2ForCausalLMRL(Qwen2ForCausalLM):
f"{train_base_name}.{layer_idx}.mlp.down_proj.{ph}"
for layer_idx in range(
-self.fd_config.model_config.num_layers):
+self.fd_config.model_config.num_hidden_layers):
_add_layer_mappings(layer_idx)
return infer_to_train
@@ -396,7 +396,7 @@ class Qwen3MoeForCausalLMRL(Qwen3MoeForCausalLM):
)
# Process MoE layers
-for layer_idx in range(self.fd_config.model_config.num_layers):
+for layer_idx in range(self.fd_config.model_config.num_hidden_layers):
_add_layer_mappings(layer_idx, is_moe_layer=True)
return infer_to_train
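The rollout mappings above walk the dense layers first and the MoE layers afterwards. A tiny sketch of the resulting `infer_to_train` entries for the gate weight (hypothetical prefixes and layer counts; `_add_layer_mappings` itself is elided):

# Sketch of the infer->train key mapping built above (hypothetical prefixes
# and layer counts; only the gate-weight entry is shown).
infer_base_name = "ernie.layers"
train_base_name = "ernie.layers"
moe_layer_start_index, num_hidden_layers = 1, 3

infer_to_train = {}
for layer_idx in range(moe_layer_start_index, num_hidden_layers):
    infer_to_train[f"{infer_base_name}.{layer_idx}.mlp.fused_moe.gate_weight"] = \
        f"{train_base_name}.{layer_idx}.mlp.gate.weight"
print(infer_to_train)
# {'ernie.layers.1.mlp.fused_moe.gate_weight': 'ernie.layers.1.mlp.gate.weight',
#  'ernie.layers.2.mlp.fused_moe.gate_weight': 'ernie.layers.2.mlp.gate.weight'}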

View File

@@ -21,6 +21,7 @@ import numpy as np
import paddle
from fastdeploy.engine.request import Request
+from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.layers.attention import get_attention_backend
from fastdeploy.model_executor.layers.attention.base_attention_backend import \
AttentionBackend
@@ -36,7 +37,6 @@ from fastdeploy.model_executor.ops.gpu import (draft_model_postprocess,
share_external_data)
from fastdeploy.model_executor.pre_and_post_process import (pre_process,
rebuild_padding)
-from fastdeploy.model_executor.forward_meta import ForwardMeta
from .base import Proposer
@@ -49,7 +49,7 @@ class MTPProposer(Proposer):
def __init__(self, cfg, main_model, local_rank, device_id,
main_model_inputs):
super().__init__(cfg)
-self.num_main_model_layers = self.model_config.num_layers
+self.num_main_model_layers = self.model_config.num_hidden_layers
self.local_rank = local_rank
self.device_id = device_id
self._update_cfg(main_model)
@@ -70,10 +70,10 @@ class MTPProposer(Proposer):
"""
self.model_config.architectures[0] = "Ernie4_5_MTPForCausalLM"
self.speculative_config.sharing_model = main_model
-self.model_config.num_layers = 1
+self.model_config.num_hidden_layers = 1
self.parallel_config.model_name_or_path = (
self.speculative_config.model_name_or_path)
-self.model_config.prefix_name = "ernie.mtp_block"
+self.model_config.pretrained_config.prefix_name = "ernie.mtp_block"
if self.speculative_config.quantization != "":
self.model_config.quantization = (
self.speculative_config.quantization)
@@ -145,7 +145,7 @@ class MTPProposer(Proposer):
cache_kvs_list = []
for i in range(
self.num_main_model_layers,
-self.num_main_model_layers + self.model_config.num_layers):
+self.num_main_model_layers + self.model_config.num_hidden_layers):
key_cache = paddle.empty(shape=[], dtype=cache_type)
key_cache_name = f"key_caches_{i}_rank{self.local_rank}.device{self.device_id}"
val_cache_name = f"value_caches_{i}_rank{self.local_rank}.device{self.device_id}"
@@ -159,7 +159,7 @@ class MTPProposer(Proposer):
self.model_inputs["caches"] = cache_kvs_list
else:
-for i in range(self.model_config.num_layers):
+for i in range(self.model_config.num_hidden_layers):
self.cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
fill_value=0,
@@ -183,10 +183,10 @@ class MTPProposer(Proposer):
# TODO(gongshaotian): Get rank from config
num_heads = (self.model_config.num_attention_heads //
-self.parallel_config.tensor_parallel_degree)
+self.parallel_config.tensor_parallel_size)
self.model_config.kv_num_heads = (
int(self.model_config.num_key_value_heads) //
-self.parallel_config.tensor_parallel_degree)
+self.parallel_config.tensor_parallel_size)
head_dim = self.model_config.head_dim
# Get the attention backend
@@ -608,7 +608,7 @@ class MTPProposer(Proposer):
self.model_inputs,
)
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampled_token_ids, 0)
self._post_process(sampled_token_ids)
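One detail worth calling out in the proposer above: the draft-model caches are indexed starting at `num_main_model_layers`, so their names never collide with the main model's caches. A sketch with assumed sizes (not taken from this commit):

# Sketch of the draft-model KV-cache naming above: MTP cache indices start at
# num_main_model_layers so they never collide with the main model's caches
# (hypothetical sizes, not taken from this commit).
num_main_model_layers = 28   # assumed main-model depth
num_hidden_layers = 1        # the MTP draft model is reduced to a single layer
local_rank, device_id = 0, 0

for i in range(num_main_model_layers, num_main_model_layers + num_hidden_layers):
    print(f"key_caches_{i}_rank{local_rank}.device{device_id}")
# key_caches_28_rank0.device0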

View File

@@ -670,7 +670,7 @@ class GCUModelRunner(ModelRunnerBase):
# Get kv cache shape
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
max_num_blocks=max_block_num)
-# local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree
+# local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
if not self.parallel_config.do_profile and (
self.parallel_config.enable_prefix_caching \
@@ -679,7 +679,7 @@ class GCUModelRunner(ModelRunnerBase):
"prefix_caching is not support by GCUModelRunner."
)
else:
-for i in range(self.model_config.num_layers):
+for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
@@ -701,10 +701,10 @@ class GCUModelRunner(ModelRunnerBase):
"""
assert len(self.attn_backends) == 0
-num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree
+num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = int(
self.model_config.num_key_value_heads
-) // self.parallel_config.tensor_parallel_degree
+) // self.parallel_config.tensor_parallel_size
head_dim = self.model_config.head_dim
# Get the attention backend
@@ -783,14 +783,14 @@ class GCUModelRunner(ModelRunnerBase):
)
sampler_output = self.sampler(logits,
self.sampling_metadata)
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len,
self.share_inputs)
sampler_output = None
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(
@@ -1016,14 +1016,14 @@ class GCUModelRunner(ModelRunnerBase):
self.sampling_metadata,
skip_idx_list,
)
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len, self.share_inputs)
sampler_output = None
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(self.share_inputs["accept_num"],
@@ -1192,11 +1192,11 @@ class GCUModelRunner(ModelRunnerBase):
byte_of_dtype = 2
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
-num_layers = self.model_config.num_layers + \
+num_layers = self.model_config.num_hidden_layers + \
self.speculative_config.num_gpu_block_expand_ratio if \
self.speculative_method in [
"mtp"
-] else self.model_config.num_layers
+] else self.model_config.num_hidden_layers
required_memory = (
byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) * num_layers)
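The `required_memory` expression above estimates how many bytes one KV-cache block occupies per rank. A worked example with assumed sizes (bf16 cache, 64-token blocks, head_dim 128, 8 KV heads per rank, 28 layers, no MTP expansion; none of these values come from this commit):

# Worked example of the per-block KV-cache memory estimate above
# (hypothetical sizes: bf16 cache, 64-token blocks, head_dim 128,
# 8 KV heads per rank, 28 layers, no MTP expansion).
byte_of_dtype = 2                     # bf16 / fp16
block_size = 64
head_dim, kv_num_heads = 128, 8
num_layers = 28

hidden_dim = head_dim * kv_num_heads  # 1024
required_memory = byte_of_dtype * 2 * (block_size * hidden_dim) * num_layers
print(required_memory)                # 7340032 bytes, i.e. 7 MiB per cache block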

View File

@@ -259,7 +259,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["min_dec_len"][idx:idx + 1] = request.get(
"min_tokens", 1)
self.share_inputs["max_dec_len"][idx:idx + 1] = request.get(
-"max_tokens", self.model_config.max_length)
+"max_tokens", self.model_config.max_model_len)
self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["first_token_ids"][
@@ -375,11 +375,11 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["min_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_dec_len"] = paddle.full(
-[max_num_seqs, 1], self.model_config.max_length, dtype='int64')
+[max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["min_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_length"] = paddle.full(
-[max_num_seqs, 1], self.model_config.max_length, dtype='int64')
+[max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs,
0,
dtype='int32')
@@ -666,13 +666,13 @@ class GPUModelRunner(ModelRunnerBase):
# Get kv cache shape
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
max_num_blocks=max_block_num)
-local_rank = self.local_rank % self.parallel_config.tensor_parallel_degree
+local_rank = self.local_rank % self.parallel_config.tensor_parallel_size
if not self.parallel_config.do_profile and (
self.parallel_config.enable_prefix_caching \
or self.parallel_config.splitwise_role != "mixed"):
cache_kvs_list = []
-for i in range(self.model_config.num_layers):
+for i in range(self.model_config.num_hidden_layers):
key_cache = paddle.empty(shape=[], dtype=cache_type)
key_cache_name = f"key_caches_{i}_rank{local_rank}.device{self.device_id}"
val_cache_name = f"value_caches_{i}_rank{local_rank}.device{self.device_id}"
@@ -687,7 +687,7 @@ class GPUModelRunner(ModelRunnerBase):
self.share_inputs["caches"] = cache_kvs_list
else:
-for i in range(self.model_config.num_layers):
+for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape,
@@ -710,10 +710,10 @@ class GPUModelRunner(ModelRunnerBase):
"""
assert len(self.attn_backends) == 0
-num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree
+num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = max(1, int(
self.model_config.num_key_value_heads
-) // self.parallel_config.tensor_parallel_degree)
+) // self.parallel_config.tensor_parallel_size)
head_dim = self.model_config.head_dim
# Get the attention backend
@@ -787,14 +787,14 @@ class GPUModelRunner(ModelRunnerBase):
)
sampler_output = self.sampler(logits,
self.sampling_metadata)
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len,
self.share_inputs)
sampler_output = None
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(
@@ -1021,14 +1021,14 @@ class GPUModelRunner(ModelRunnerBase):
self.sampling_metadata,
skip_idx_list,
)
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
else:
self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len, self.share_inputs)
sampler_output = None
-if self.parallel_config.tensor_parallel_degree > 1:
+if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(self.share_inputs["accept_num"],
@@ -1206,11 +1206,11 @@ class GPUModelRunner(ModelRunnerBase):
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
# NOTE(liuzichang): Implement multi-layer MTP architecture in the future
-num_layers = self.model_config.num_layers + \
+num_layers = self.model_config.num_hidden_layers + \
self.speculative_config.num_gpu_block_expand_ratio if \
self.speculative_method in [
"mtp"
-] else self.model_config.num_layers
+] else self.model_config.num_hidden_layers
required_memory = (
byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) * num_layers)

View File

@@ -648,7 +648,7 @@ class IluvatarModelRunner(ModelRunnerBase):
or self.parallel_config.splitwise_role != "mixed"): or self.parallel_config.splitwise_role != "mixed"):
raise NotImplementedError("Iluvatar does not support yet") raise NotImplementedError("Iluvatar does not support yet")
else: else:
for i in range(self.model_config.num_layers): for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full( cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape, shape=kv_cache_shape,
@@ -672,11 +672,11 @@ class IluvatarModelRunner(ModelRunnerBase):
assert len(self.attn_backends) == 0 assert len(self.attn_backends) == 0
# TODO(gongshaotian): Get rank from config # TODO(gongshaotian): Get rank from config
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = max( self.model_config.kv_num_heads = max(
1, 1,
int(self.model_config.num_key_value_heads) // int(self.model_config.num_key_value_heads) //
self.parallel_config.tensor_parallel_degree) self.parallel_config.tensor_parallel_size)
head_dim = self.model_config.head_dim head_dim = self.model_config.head_dim
# Get the attention backend # Get the attention backend
@@ -748,14 +748,14 @@ class IluvatarModelRunner(ModelRunnerBase):
) )
sampled_token_ids = self.sampler(logits, sampled_token_ids = self.sampler(logits,
self.sampling_metadata) self.sampling_metadata)
if self.parallel_config.tensor_parallel_degree > 1: if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampled_token_ids, 0) paddle.distributed.broadcast(sampled_token_ids, 0)
else: else:
self.sampler(logits, self.sampling_metadata, self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len, self.parallel_config.max_model_len,
self.share_inputs) self.share_inputs)
sampled_token_ids = None sampled_token_ids = None
if self.parallel_config.tensor_parallel_degree > 1: if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast( paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0) self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast( paddle.distributed.broadcast(
@@ -977,14 +977,14 @@ class IluvatarModelRunner(ModelRunnerBase):
self.sampling_metadata, self.sampling_metadata,
skip_idx_list, skip_idx_list,
) )
if self.parallel_config.tensor_parallel_degree > 1: if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampled_token_ids, 0) paddle.distributed.broadcast(sampled_token_ids, 0)
else: else:
self.sampler(logits, self.sampling_metadata, self.sampler(logits, self.sampling_metadata,
self.parallel_config.max_model_len, self.share_inputs) self.parallel_config.max_model_len, self.share_inputs)
sampled_token_ids = None sampled_token_ids = None
if self.parallel_config.tensor_parallel_degree > 1: if self.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast( paddle.distributed.broadcast(
self.share_inputs["accept_tokens"], 0) self.share_inputs["accept_tokens"], 0)
paddle.distributed.broadcast(self.share_inputs["accept_num"], paddle.distributed.broadcast(self.share_inputs["accept_num"],
@@ -1145,11 +1145,11 @@ class IluvatarModelRunner(ModelRunnerBase):
hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads hidden_dim = self.model_config.head_dim * self.model_config.kv_num_heads
# NOTE(liuzichang): Implement multi-layer MTP architecture in the future # NOTE(liuzichang): Implement multi-layer MTP architecture in the future
num_layers = self.model_config.num_layers + \ num_layers = self.model_config.num_hidden_layers + \
self.speculative_config.num_gpu_block_expand_ratio if \ self.speculative_config.num_gpu_block_expand_ratio if \
self.speculative_method in [ self.speculative_method in [
"mtp" "mtp"
] else self.model_config.num_layers ] else self.model_config.num_hidden_layers
required_memory = ( required_memory = (
byte_of_dtype * 2 * # k + v byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) * num_layers) (self.parallel_config.block_size * hidden_dim) * num_layers)
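The required_memory expression above is the standard KV-cache budget per cache block: bytes per element, times two for K and V, times the per-token cache width (head_dim * kv_num_heads), times block_size tokens, times the number of cached layers (num_hidden_layers, plus the extra MTP draft layers when speculative decoding is enabled). A worked sketch with illustrative numbers, not any particular model's config:

def kv_cache_bytes_per_block(byte_of_dtype, block_size, head_dim, kv_num_heads, num_layers):
    hidden_dim = head_dim * kv_num_heads                               # per-token KV width per layer
    return byte_of_dtype * 2 * (block_size * hidden_dim) * num_layers  # x2 for K and V

# bf16 cache (2 bytes), 64-token blocks, head_dim 128, 8 KV heads per rank, 48 layers
print(kv_cache_bytes_per_block(2, 64, 128, 8, 48))  # 12582912 bytes, i.e. 12 MiB per block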


@@ -29,8 +29,6 @@ from fastdeploy.model_executor.layers.attention import get_attention_backend
from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d from fastdeploy.model_executor.layers.rotary_embedding import get_rope_3d
from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata from fastdeploy.model_executor.layers.sample.meta_data import SamplingMetadata
from fastdeploy.model_executor.layers.sample.sampler import Sampler from fastdeploy.model_executor.layers.sample.sampler import Sampler
from fastdeploy.model_executor.models.ernie4_5_vl.configuration import \
Ernie4_5_VLMoeConfig
from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \ from fastdeploy.model_executor.models.ernie4_5_vl.modeling_resampler import \
ScatterOp ScatterOp
from fastdeploy.platforms import current_platform from fastdeploy.platforms import current_platform
@@ -221,9 +219,9 @@ class GPUVLModelRunner(VLModelRunnerBase):
fd_config = initialize_fd_config( fd_config = initialize_fd_config(
self.args, self.tensor_parallel_degree, self.tensor_parallel_rank self.args, self.tensor_parallel_degree, self.tensor_parallel_rank
) )
fd_config.model_config = Ernie4_5_VLMoeConfig( fd_config.model_config.tensor_parallel_degree=self.tensor_parallel_degree
**fd_config.model_config.__dict__ fd_config.model_config.tensor_parallel_rank=self.tensor_parallel_rank
) fd_config.model_config.moe_group="dummy"
fd_config.parallel_config.column_cut = False fd_config.parallel_config.column_cut = False
vision_config = fd_config.model_config.vision_config vision_config = fd_config.model_config.vision_config
vision_config.attn_sep = False vision_config.attn_sep = False
@@ -237,8 +235,8 @@ class GPUVLModelRunner(VLModelRunnerBase):
fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"] fd_config.model_config.think_end_id = tokenizer.get_vocab()["</think>"]
fd_config.model_config.max_text_id = fd_config.model_config.im_patch_id fd_config.model_config.max_text_id = fd_config.model_config.im_patch_id
fd_config.model_config.sequence_parallel = False fd_config.model_config.sequence_parallel = False
# TODO (bukejiyu): Remove the assignment # TODO(YuanRisheng) The moe_k in develop is fixed to 8, need to be changed according to json config
fd_config.moe_config.top_k = 8 fd_config.model_config.moe_k = 8
self.fd_config = fd_config self.fd_config = fd_config
self.model_cfg = self.fd_config.model_config self.model_cfg = self.fd_config.model_config
self.image_preprocess = self._init_image_preprocess( self.image_preprocess = self._init_image_preprocess(
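With the Ernie4_5_VLMoeConfig re-wrapping gone, the VL runner now just annotates the ModelConfig returned by initialize_fd_config with its own parallel state and the temporarily hard-coded moe_k. A rough sketch of that shape, using a stand-in namespace and hypothetical values rather than the real runner state:

from types import SimpleNamespace

# Stand-in for the config object produced by initialize_fd_config (illustrative only).
model_config = SimpleNamespace(vision_config=SimpleNamespace())

tensor_parallel_degree, tensor_parallel_rank = 2, 0   # hypothetical runner state
model_config.tensor_parallel_degree = tensor_parallel_degree
model_config.tensor_parallel_rank = tensor_parallel_rank
model_config.moe_group = "dummy"          # VL path does not join a real MoE group here
model_config.moe_k = 8                    # fixed for now, per the TODO above
model_config.vision_config.attn_sep = False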
@@ -250,10 +248,10 @@ class GPUVLModelRunner(VLModelRunnerBase):
self.model = get_model_from_loader(self.fd_config) self.model = get_model_from_loader(self.fd_config)
attn_backend_cls = get_attention_backend() attn_backend_cls = get_attention_backend()
num_heads = self.fd_config.model_config.num_attention_heads // \ num_heads = self.fd_config.model_config.num_attention_heads // \
self.fd_config.parallel_config.tensor_parallel_degree self.fd_config.parallel_config.tensor_parallel_size
self.fd_config.model_config.kv_num_heads = int( self.fd_config.model_config.kv_num_heads = int(
self.fd_config.model_config.num_key_value_heads self.fd_config.model_config.num_key_value_heads
) // self.fd_config.parallel_config.tensor_parallel_degree ) // self.fd_config.parallel_config.tensor_parallel_size
head_dim = self.fd_config.model_config.head_dim head_dim = self.fd_config.model_config.head_dim
self.attn_backend = attn_backend_cls( self.attn_backend = attn_backend_cls(
self.fd_config, self.fd_config,
@@ -305,14 +303,10 @@ class GPUVLModelRunner(VLModelRunnerBase):
""" """
cache_kvs = {} cache_kvs = {}
total_block_num = self.num_gpu_blocks total_block_num = self.num_gpu_blocks
num_layers = self.model_cfg.get("num_layers", num_layers = self.model_cfg.num_hidden_layers
None) or self.model_cfg.get(
"num_hidden_layers", None) kv_num_head = self.model_cfg.num_key_value_heads if self.model_cfg.num_key_value_heads != -1 else self.model_cfg.num_attention_heads
kv_num_head = self.model_cfg.get(
"num_key_value_heads",
self.model_cfg.num_attention_heads,
)
kv_num_head = kv_num_head // self.tensor_parallel_degree kv_num_head = kv_num_head // self.tensor_parallel_degree
self.model_cfg.kv_num_head = kv_num_head self.model_cfg.kv_num_head = kv_num_head
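The new kv_num_head line relies on the simplified ModelConfig treating -1 as "not set in config.json" for num_key_value_heads, in which case the model is plain MHA and the KV head count equals num_attention_heads. A small sketch of that resolution followed by the tensor-parallel split:

def resolve_kv_heads(num_attention_heads, num_key_value_heads, tensor_parallel_degree):
    # -1 is the sentinel for "not defined in the checkpoint's json":
    # fall back to MHA, i.e. one KV head per attention head.
    if num_key_value_heads == -1:
        num_key_value_heads = num_attention_heads
    return num_key_value_heads // tensor_parallel_degree

assert resolve_kv_heads(32, -1, 4) == 8   # MHA checkpoint
assert resolve_kv_heads(32, 8, 4) == 2    # GQA checkpoint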
@@ -647,7 +641,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
) )
# sampler & save_output # sampler & save_output
sampler_output = self.sampler(logits, self.sampling_metadata) sampler_output = self.sampler(logits, self.sampling_metadata)
if self.fd_config.parallel_config.tensor_parallel_degree > 1: if self.fd_config.parallel_config.tensor_parallel_size > 1:
paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0) paddle.distributed.broadcast(sampler_output.sampled_token_ids, 0)
self.post_process(sampler_output) self.post_process(sampler_output)
@@ -740,9 +734,7 @@ class GPUVLModelRunner(VLModelRunnerBase):
""" """
Calculate the size of kvcache for computational theory Calculate the size of kvcache for computational theory
""" """
num_layers = self.model_cfg.get("num_layers", num_layers = self.model_cfg.num_hidden_layers
None) or self.model_cfg.get(
"num_hidden_layers", None)
byte_of_cache = 2 byte_of_cache = 2
# support c8 c4 # support c8 c4
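byte_of_cache = 2 corresponds to fp16/bf16 KV caches; the "support c8 c4" comment presumably refers to the int8 and int4 cache-quantization paths, where the per-element cost drops to one byte and half a byte respectively. A tiny, hedged sketch of that mapping (the function and keys here are illustrative, not fields of the runner):

def bytes_per_cache_element(kv_cache_quant_type=None):
    # None: cache stays in the compute dtype (fp16/bf16) -> 2 bytes per element.
    # "int8" (c8) -> 1 byte; "int4" (c4) -> 0.5 bytes.
    return {None: 2.0, "int8": 1.0, "int4": 0.5}[kv_cache_quant_type]

assert bytes_per_cache_element() == 2.0
assert bytes_per_cache_element("int8") == 1.0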


@@ -22,11 +22,9 @@ import paddle
import paddle.distributed as dist import paddle.distributed as dist
import paddle.distributed.fleet as fleet import paddle.distributed.fleet as fleet
from fastdeploy import envs
from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig, from fastdeploy.config import (DecodingConfig, DeviceConfig, FDConfig,
GraphOptimizationConfig, LoadConfig, GraphOptimizationConfig, LoadConfig,
ModelConfig, MoEConfig, MoEPhase, ModelConfig, ParallelConfig, SpeculativeConfig)
ParallelConfig, SpeculativeConfig)
from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue from fastdeploy.inter_communicator import EngineWorkerQueue as TaskQueue
from fastdeploy.inter_communicator import IPCSignal from fastdeploy.inter_communicator import IPCSignal
from fastdeploy.model_executor.layers.quantization import \ from fastdeploy.model_executor.layers.quantization import \
@@ -122,7 +120,7 @@ class PaddleDisWorkerProc():
self.task_queue = TaskQueue( self.task_queue = TaskQueue(
address=task_address, address=task_address,
is_server=False, is_server=False,
num_client=self.parallel_config.tensor_parallel_degree, num_client=self.parallel_config.tensor_parallel_size,
client_id=self.parallel_config.tensor_parallel_rank, client_id=self.parallel_config.tensor_parallel_rank,
local_data_parallel_id=self.parallel_config.expert_parallel_rank) local_data_parallel_id=self.parallel_config.expert_parallel_rank)
@@ -139,8 +137,8 @@ class PaddleDisWorkerProc():
# init worker_ready_signal # init worker_ready_signal
max_chips_per_node = 16 if current_platform.is_iluvatar() else 8 max_chips_per_node = 16 if current_platform.is_iluvatar() else 8
array_size = min( array_size = min(
max_chips_per_node, self.parallel_config.tensor_parallel_degree * max_chips_per_node, self.parallel_config.tensor_parallel_size *
self.parallel_config.expert_parallel_degree) self.parallel_config.expert_parallel_size)
workers_ready = np.zeros(shape=[array_size], dtype=np.int32) workers_ready = np.zeros(shape=[array_size], dtype=np.int32)
self.worker_ready_signal = IPCSignal( self.worker_ready_signal = IPCSignal(
name="worker_ready_signal", name="worker_ready_signal",
@@ -173,7 +171,7 @@ class PaddleDisWorkerProc():
# init exist_task_signal # init exist_task_signal
workers_exist_task = np.zeros( workers_exist_task = np.zeros(
[self.parallel_config.expert_parallel_degree], dtype=np.int32) [self.parallel_config.expert_parallel_size], dtype=np.int32)
self.exist_task_signal = IPCSignal( self.exist_task_signal = IPCSignal(
name="exist_task_signal", name="exist_task_signal",
array=workers_exist_task, array=workers_exist_task,
@@ -183,7 +181,7 @@ class PaddleDisWorkerProc():
# init exist_swapped_task_signal # init exist_swapped_task_signal
workers_swapped_task = np.zeros( workers_swapped_task = np.zeros(
shape=[self.parallel_config.expert_parallel_degree], shape=[self.parallel_config.expert_parallel_size],
dtype=np.int32) dtype=np.int32)
self.exist_swapped_task_signal = IPCSignal( self.exist_swapped_task_signal = IPCSignal(
name="exist_swapped_task_signal", name="exist_swapped_task_signal",
@@ -231,8 +229,8 @@ class PaddleDisWorkerProc():
TODO(gongshaotian): support remote calling of functions that control worker. TODO(gongshaotian): support remote calling of functions that control worker.
""" """
# Currently, only support single node # Currently, only support single node
self.nnode = int((self.parallel_config.tensor_parallel_degree + 7) // 8) self.nnode = int((self.parallel_config.tensor_parallel_size + 7) // 8)
mp_num_per_node = self.parallel_config.tensor_parallel_degree // self.nnode mp_num_per_node = self.parallel_config.tensor_parallel_size // self.nnode
req_ids = [] req_ids = []
while True: while True:
if self.local_rank == 0: if self.local_rank == 0:
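The node arithmetic above assumes at most eight ranks per machine: nnode is the tensor-parallel size rounded up to whole 8-GPU nodes, and mp_num_per_node is how many model-parallel ranks each node hosts (the comment in the source notes that only the single-node case is supported today). A worked sketch:

def node_layout(tensor_parallel_size):
    nnode = int((tensor_parallel_size + 7) // 8)       # ceil(tensor_parallel_size / 8)
    mp_num_per_node = tensor_parallel_size // nnode    # ranks hosted on each node
    return nnode, mp_num_per_node

assert node_layout(4) == (1, 4)
assert node_layout(16) == (2, 8)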
@@ -241,7 +239,7 @@ class PaddleDisWorkerProc():
else: else:
self.exist_task_signal.value[0] = 0 self.exist_task_signal.value[0] = 0
if self.parallel_config.tensor_parallel_degree > 1: if self.parallel_config.tensor_parallel_size > 1:
# Synchronize before updating weights # Synchronize before updating weights
paddle.distributed.barrier() paddle.distributed.barrier()
@@ -259,7 +257,7 @@ class PaddleDisWorkerProc():
self.fd_config.parallel_config. self.fd_config.parallel_config.
expert_parallel_rank] = 1 expert_parallel_rank] = 1
if self.parallel_config.tensor_parallel_degree > 1: if self.parallel_config.tensor_parallel_size > 1:
# Synchronize the signal for other workers # Synchronize the signal for other workers
# TODO(@wufeisheng): Split TP group and EP group # TODO(@wufeisheng): Split TP group and EP group
paddle.distributed.barrier() paddle.distributed.barrier()
@@ -479,8 +477,8 @@ def parse_args():
) )
parser.add_argument( parser.add_argument(
"--speculative_benchmark_mode", "--speculative_benchmark_mode",
default="false", default=False,
type=str, type=bool,
) )
parser.add_argument("--max_num_batched_tokens", parser.add_argument("--max_num_batched_tokens",
type=int, type=int,
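One caveat with the new --speculative_benchmark_mode definition: argparse's type=bool just calls bool() on the raw string, and bool() is True for any non-empty string, including "false" and "0", so only omitting the flag yields False. A hedged sketch of an explicit string-to-bool parser that avoids the pitfall (an alternative, not what this commit does):

import argparse

def str2bool(value):
    # Accept the usual spellings; anything else is a parse error instead of silently True.
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--speculative_benchmark_mode", type=str2bool, default=False)
print(parser.parse_args(["--speculative_benchmark_mode", "false"]).speculative_benchmark_mode)  # False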
@@ -559,7 +557,7 @@ def parse_args():
return args return args
def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) -> FDConfig: def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
"""Initialize FDConfig from either RolloutModelConfig or argparse.Namespace """Initialize FDConfig from either RolloutModelConfig or argparse.Namespace
Args: Args:
@@ -568,196 +566,37 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) ->
Returns: Returns:
FDConfig: Initialized FastDeploy configuration object FDConfig: Initialized FastDeploy configuration object
""" """
# Get model config from model directory paddle.set_default_dtype(args.dtype)
model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path) model_config = ModelConfig(vars(args))
device_config = DeviceConfig(vars(args))
# Handle MoE related configs decoding_config = DecodingConfig(vars(args))
if 'num_experts' in model_config_dict: speculative_config = SpeculativeConfig(vars(args))
model_config_dict['moe_num_experts'] = model_config_dict.pop('num_experts') parallel_config = ParallelConfig(vars(args))
if 'num_experts_per_tok' in model_config_dict: load_config = LoadConfig(vars(args))
model_config_dict['moe_topk'] = model_config_dict.pop('num_experts_per_tok')
# Set default values for model config
model_config_dict["head_dim"] = model_config_dict.get(
"head_dim", model_config_dict["hidden_size"] // model_config_dict["num_attention_heads"])
model_config_dict["rope_theta"] = model_config_dict.get("rope_theta", 10000.0)
# Create model config object
model_config = ModelConfig.from_dict(model_config_dict)
model_config.head_dim = model_config_dict["head_dim"]
paddle.set_default_dtype(config_or_args.dtype)
if 'tie_word_embeddings' in model_config_dict:
model_config.tie_word_embeddings = model_config_dict['tie_word_embeddings']
# Initialize all config components
device_config = DeviceConfig()
decoding_config = DecodingConfig()
speculative_config = SpeculativeConfig()
parallel_config = ParallelConfig()
load_config = LoadConfig()
moe_config = MoEConfig()
# Handle graph optimization config (check for attribute existence for backward compatibility)
enable_static_graph_inference = getattr(config_or_args, 'enable_static_graph_inference', False)
use_cudagraph = getattr(config_or_args, 'use_cudagraph', False)
max_capture_batch_size = getattr(config_or_args, 'max_capture_batch_size', 0)
graph_opt_config = GraphOptimizationConfig( graph_opt_config = GraphOptimizationConfig(
enable_static_graph_inference, args.enable_static_graph_inference,
use_cudagraph, args.max_capture_batch_size,
max_capture_batch_size vars(args))
)
# Handle quantization (check for attribute existence) # Note(tangbinhan): used for load_checkpoint
model_config.quantization = getattr(config_or_args, 'quantization', None) model_config.pretrained_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank
model_config.pretrained_config.tensor_parallel_degree = parallel_config.tensor_parallel_size
model_config.pretrained_config.is_mtp = False
model_config.pretrained_config.head_dim = model_config.head_dim
# Update speculative config_or_args
speculative_config.method = getattr(config_or_args, 'speculative_method', None)
speculative_config.num_speculative_tokens = getattr(config_or_args, 'speculative_max_draft_token_num', 0)
speculative_config.model_name_or_path = getattr(config_or_args, 'speculative_model_name_or_path', None)
speculative_config.quantization = getattr(config_or_args, 'speculative_model_quantization', None)
speculative_config.benchmark_mode = (
getattr(config_or_args, "speculative_benchmark_mode", "false").lower() == "true"
)
# Update parallel config
parallel_config.engine_pid = getattr(config_or_args, 'engine_pid', None)
parallel_config.model_name_or_path = config_or_args.model_name_or_path
parallel_config.max_num_seqs = getattr(config_or_args, 'max_num_seqs', 0)
parallel_config.max_block_num = getattr(config_or_args, 'total_block_num', 0)
parallel_config.block_size = getattr(config_or_args, 'block_size', 64)
parallel_config.pod_ip = getattr(config_or_args, 'pod_ip', None)
parallel_config.engine_worker_queue_port = getattr(config_or_args, 'engine_worker_queue_port', 0)
parallel_config.max_model_len = getattr(config_or_args, 'max_model_len', 0)
model_config.max_seq_len = getattr(config_or_args, 'max_model_len', 0)
model_config.max_length = getattr(config_or_args, 'max_model_len', 0)
parallel_config.device_ids = getattr(config_or_args, 'device_ids', [])
parallel_config.dtype = config_or_args.dtype
parallel_config.enc_dec_block_num = getattr(config_or_args, 'enc_dec_block_num', 0)
parallel_config.kv_cache_ratio = getattr(config_or_args, 'kv_cache_ratio', 1.0)
parallel_config.first_token_id = getattr(config_or_args, 'first_token_id', None)
parallel_config.gpu_memory_utilization = getattr(config_or_args, 'gpu_memory_utilization', 0.9)
parallel_config.do_profile = getattr(config_or_args, 'do_profile', False)
parallel_config.dynamic_load_weight = getattr(config_or_args, 'dynamic_load_weight', False)
parallel_config.pad_token_id = getattr(config_or_args, 'pad_token_id', None)
parallel_config.eos_tokens_lens = getattr(config_or_args, 'eos_tokens_lens', 0)
parallel_config.enable_chunked_prefill = getattr(config_or_args, 'enable_chunked_prefill', False)
parallel_config.max_num_batched_tokens = getattr(config_or_args, 'max_num_batched_tokens', 0)
parallel_config.enable_prefix_caching = getattr(config_or_args, 'enable_prefix_caching', False)
parallel_config.enable_custom_all_reduce = getattr(config_or_args, 'enable_custom_all_reduce', False)
parallel_config.use_ep = getattr(config_or_args, 'enable_expert_parallell', False)
parallel_config.tensor_parallel_degree = getattr(config_or_args, 'tensor_parallel_size', 1)
parallel_config.expert_parallel_degree = getattr(config_or_args, 'expert_parallel_size', 1)
parallel_config.splitwise_role = getattr(config_or_args, 'splitwise_role', None)
parallel_config.guided_decoding_backend = getattr(config_or_args, 'guided_decoding_backend', None)
parallel_config.disable_any_whitespace = getattr(config_or_args, 'disable_any_whitespace', False)
# Log parallel config info
logger.info(f"parallel_config.use_ep {parallel_config.use_ep}") logger.info(f"parallel_config.use_ep {parallel_config.use_ep}")
logger.info(f"parallel_config.tensor_parallel_degree {parallel_config.tensor_parallel_degree}") logger.info(
logger.info(f"splitwise_role {parallel_config.splitwise_role}") f"parallel_config.tensor_parallel_size {parallel_config.tensor_parallel_size}"
)
logger.info(
f"parallel_config.tensor_parallel_rank {parallel_config.tensor_parallel_rank}"
)
# Set MoE phase based on splitwise role if getattr(model_config, 'num_hidden_layers', None) is None:
if parallel_config.splitwise_role == "mixed": raise ValueError("num_hidden_layers is None")
parallel_config.moe_phase = MoEPhase.PREFILL
elif parallel_config.splitwise_role == "prefill":
parallel_config.moe_phase = MoEPhase.PREFILL
elif parallel_config.splitwise_role == "decode":
parallel_config.moe_phase = MoEPhase.DECODER
elif parallel_config.splitwise_role is not None:
raise NotImplementedError
# Handle model architecture specific configurations quantization_config = model_config.quantization_config
num_key_value_heads = model_config_dict.get("num_key_value_heads", -1)
if num_key_value_heads is None:
num_key_value_heads = -1
# Calculate FFN hidden size
if model_config_dict.get("ffn_hidden_size", None) is not None:
ffn_hidden_size = model_config_dict["ffn_hidden_size"]
elif model_config_dict.get("intermediate_size", None) is not None:
ffn_hidden_size = model_config_dict["intermediate_size"]
else:
ffn_hidden_size = 4 * model_config_dict["hidden_size"]
if model_config_dict["hidden_act"].lower() == "swiglu":
if paddle.distributed.get_world_size() > 1:
multiple_of = 8 * model_config_dict["num_attention_heads"]
else:
multiple_of = 4 * model_config_dict["num_attention_heads"]
ffn_hidden_size = multiple_of * (
(int(2 * ffn_hidden_size / 3) + multiple_of - 1) //
multiple_of)
# Get number of layers
num_layers = model_config_dict.get("num_layers", None) or model_config_dict.get(
"num_hidden_layers", None)
if num_layers is None:
raise ValueError(f"num_layers<{num_layers}> is invalid")
if "moe_layer_start_index" in model_config_dict:
moe_layer_start_index = model_config_dict["moe_layer_start_index"]
use_moe = (
isinstance(moe_layer_start_index, int)
and moe_layer_start_index < num_layers
) or (
isinstance(moe_layer_start_index, list)
and min(moe_layer_start_index) < num_layers
)
else:
use_moe = False
# Update model config
model_config.ffn_hidden_size = ffn_hidden_size
model_config.num_layers = num_layers
model_config.num_key_value_heads = num_key_value_heads
model_config.start_layer_index = model_config_dict.get("start_layer_index", 0)
# Update MoE config
moe_config.num_experts = model_config_dict.get("moe_num_experts", None)
moe_config.moe_intermediate_size = model_config_dict.get("moe_intermediate_size", None)
moe_config.top_k = model_config_dict.get("moe_k", model_config_dict.get("moe_topk", 8))
moe_config.moe_num_shared_experts = model_config_dict.get("moe_num_shared_experts", 0)
moe_config.moe_layer_start_index = model_config_dict.get("moe_layer_start_index", 0)
moe_config.num_max_dispatch_tokens_per_rank = model_config_dict.get(
"num_max_dispatch_tokens_per_rank", 256)
moe_config.moe_use_aux_free = model_config_dict.get("moe_use_aux_free", False)
# Handle vocabulary size
model_config.ori_vocab_size = model_config_dict.get("vocab_size", -1)
archs = model_config_dict.get("architectures", [])
if "Ernie4_5_ForCausalLM" in archs or "Ernie4_5_MoeForCausalLM" in archs:
model_config.ori_vocab_size = getattr(config_or_args, 'ori_vocab_size', model_config.ori_vocab_size)
# Handle DeepseekV3 specific config
if "DeepseekV3ForCausalLM" in model_config_dict.get("architectures", []):
from paddleformers.transformers import AutoConfig
model_config.deepseekv3 = AutoConfig.from_pretrained(
config_or_args.model_name_or_path)
assert parallel_config.tensor_parallel_degree * parallel_config.expert_parallel_degree == ranks
parallel_config.tensor_parallel_rank = \
local_rank % parallel_config.tensor_parallel_degree
parallel_config.expert_parallel_rank = \
int(local_rank / parallel_config.tensor_parallel_degree)
if parallel_config.use_ep:
moe_config.num_experts_per_rank = \
moe_config.num_experts // parallel_config.expert_parallel_degree
moe_config.num_experts_start_offset = \
parallel_config.expert_parallel_rank * moe_config.num_experts_per_rank
# For auto TP split
model_config.tensor_parallel_degree = parallel_config.tensor_parallel_degree
model_config.tensor_parallel_rank = parallel_config.tensor_parallel_rank
model_config.use_ep = parallel_config.use_ep
if parallel_config.use_ep:
model_config.num_experts_per_rank = moe_config.num_experts_per_rank
model_config.num_experts_start_offset = moe_config.num_experts_start_offset
# Handle quantization config
quantization_config = model_config_dict.get("quantization_config", None)
if not model_config.is_quantized: if not model_config.is_quantized:
if quantization_config is not None: if quantization_config is not None:
if "kv_cache_quant_type" not in quantization_config: if "kv_cache_quant_type" not in quantization_config:
@@ -772,16 +611,15 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) ->
if quantization_config is not None: if quantization_config is not None:
quant_config_name = quantization_config["quantization"] quant_config_name = quantization_config["quantization"]
elif getattr(config_or_args, 'quantization', None) != "None": elif args.quantization != "None":
quantization_config = {} quantization_config = {}
quant_config_name = getattr(config_or_args, 'quantization', None) quant_config_name = args.quantization
quantization_config["quantization"] = quant_config_name quantization_config["quantization"] = quant_config_name
# Special handling for Ernie models # Special handling for Ernie models
is_ernie = "Ernie4_5_ForCausalLM" in model_config_dict.get("architectures", []) or \ is_ernie = "Ernie4_5_ForCausalLM" in model_config.architectures or \
"Ernie4_5_MoeForCausalLM" in model_config_dict.get("architectures", []) or \ "Ernie4_5_MoeForCausalLM" in model_config.architectures or \
"Ernie4_5_VLMoeForConditionalGeneration" in model_config_dict.get( "Ernie4_5_VLMoeForConditionalGeneration" in model_config.architectures
"architectures", []) if quant_config_name == "wint4" and is_ernie:
if use_moe and quant_config_name == "wint4" and is_ernie:
quantization_config["dense_quant_type"] = "wint8" quantization_config["dense_quant_type"] = "wint8"
quantization_config["moe_quant_type"] = "wint4" quantization_config["moe_quant_type"] = "wint4"
quantization_config["quantization"] = "mix_quant" quantization_config["quantization"] = "mix_quant"
@@ -806,38 +644,23 @@ def initialize_fd_config(config_or_args, ranks: int = 1, local_rank: int = 0) ->
logger.info( logger.info(
"Model Status: Original (will apply online quantization)") "Model Status: Original (will apply online quantization)")
logger.info(f"Quantization Method: {getattr(config_or_args, 'quantization', 'None')}") logger.info(f"{quantization_config}")
else: else:
logger.info( logger.info(
"No quantization config found and use original weight and act dtype." "No quantization config found and use original weight and act dtype."
) )
model_config.enable_logprob = config_or_args.enable_logprob
model_config.architectures = model_config_dict.get("architectures")
# Update load config
logger.info("===========load_config==============")
# Handle load config (check for environment variable)
load_config.use_fastsafetensor = int(envs.FD_USE_FASTSAFETENSOR) == 1
load_config.dynamic_load_weight = getattr(config_or_args, 'dynamic_load_weight', False)
load_config.load_strategy = getattr(config_or_args, 'load_strategy', None)
logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}") logger.info(f"- Dynamic load weight: {load_config.dynamic_load_weight}")
logger.info(f"- Load strategy: {load_config.load_strategy}") logger.info(f"- Load strategy: {load_config.load_strategy}")
logger.info(f"- Use fastsafetensor: {load_config.use_fastsafetensor}")
# Create and return FDConfig fd_config = FDConfig(model_config=model_config,
fd_config = FDConfig( parallel_config=parallel_config,
model_config=model_config, speculative_config=speculative_config,
parallel_config=parallel_config, device_config=device_config,
speculative_config=speculative_config, load_config=load_config,
device_config=device_config, decoding_config=decoding_config,
load_config=load_config, quant_config=quant_config,
moe_config=moe_config, graph_opt_config=graph_opt_config)
decoding_config=decoding_config,
quant_config=quant_config,
graph_opt_config=graph_opt_config
)
return fd_config return fd_config
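Net effect of the rewrite above: each sub-config is now built directly from the worker's argument namespace, and initialize_fd_config only validates num_hidden_layers, resolves the quantization config, and bundles everything into one FDConfig. A condensed, hedged sketch of that flow using stand-in classes (the real ones live in fastdeploy.config and consume the same vars(args) dict):

from types import SimpleNamespace

class _Config(SimpleNamespace):
    """Stand-in for ModelConfig / ParallelConfig / ...; each takes the args dict."""
    def __init__(self, args):
        super().__init__(**args)

def initialize_fd_config_sketch(args):
    arg_dict = vars(args)
    model_config = _Config(arg_dict)      # ModelConfig(vars(args)) in the real code
    parallel_config = _Config(arg_dict)   # ParallelConfig(vars(args)), etc.
    if getattr(model_config, "num_hidden_layers", None) is None:
        raise ValueError("num_hidden_layers is None")
    # All sub-configs end up bundled into a single FDConfig-style object.
    return SimpleNamespace(model_config=model_config, parallel_config=parallel_config)

cfg = initialize_fd_config_sketch(SimpleNamespace(num_hidden_layers=48, dtype="bfloat16"))
print(cfg.model_config.num_hidden_layers)  # 48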


@@ -314,7 +314,7 @@ class XPUModelRunner(ModelRunnerBase):
"min_tokens", 1) "min_tokens", 1)
self.share_inputs["max_dec_len"][idx:idx + 1] = request.get( self.share_inputs["max_dec_len"][idx:idx + 1] = request.get(
"max_tokens", self.model_config.max_length) "max_tokens", self.model_config.max_model_len)
self.share_inputs["stop_flags"][idx:idx + 1] = False self.share_inputs["stop_flags"][idx:idx + 1] = False
self.share_inputs["first_token_ids"][ self.share_inputs["first_token_ids"][
@@ -387,11 +387,11 @@ class XPUModelRunner(ModelRunnerBase):
self.share_inputs["min_dec_len"] = paddle.full( self.share_inputs["min_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64') [max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_dec_len"] = paddle.full( self.share_inputs["max_dec_len"] = paddle.full(
[max_num_seqs, 1], self.model_config.max_length, dtype='int64') [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["min_length"] = paddle.full( self.share_inputs["min_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.min_length, dtype='int64') [max_num_seqs, 1], self.model_config.min_length, dtype='int64')
self.share_inputs["max_length"] = paddle.full( self.share_inputs["max_length"] = paddle.full(
[max_num_seqs, 1], self.model_config.max_length, dtype='int64') [max_num_seqs, 1], self.model_config.max_model_len, dtype='int64')
self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs, self.share_inputs["seq_lens_this_time"] = paddle.full(max_num_seqs,
0, 0,
dtype='int32') dtype='int32')
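Since the simplified ModelConfig no longer carries a separate max_length, the XPU runner's decode-length buffers are pre-filled from max_model_len instead. paddle.full is a plain constant-fill allocator, so the buffers look like this (illustrative sizes, not the runner's actual configuration):

import paddle

max_num_seqs, max_model_len, min_length = 4, 2048, 1
share_inputs = {
    # one row per sequence slot, pre-filled with the configured decode bounds
    "min_dec_len": paddle.full([max_num_seqs, 1], min_length, dtype='int64'),
    "max_dec_len": paddle.full([max_num_seqs, 1], max_model_len, dtype='int64'),
}
print(share_inputs["max_dec_len"].shape)  # [4, 1]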
@@ -574,7 +574,7 @@ class XPUModelRunner(ModelRunnerBase):
kv_cache_shape = self.attn_backends[0].get_kv_cache_shape( kv_cache_shape = self.attn_backends[0].get_kv_cache_shape(
max_num_blocks=max_block_num) max_num_blocks=max_block_num)
for i in range(self.model_config.num_layers): for i in range(self.model_config.num_hidden_layers):
cache_kvs["key_caches_{}".format(i)] = paddle.full( cache_kvs["key_caches_{}".format(i)] = paddle.full(
shape=kv_cache_shape, shape=kv_cache_shape,
fill_value=0, fill_value=0,
@@ -597,10 +597,10 @@ class XPUModelRunner(ModelRunnerBase):
assert len(self.attn_backends) == 0 assert len(self.attn_backends) == 0
# TODO(gongshaotian): Get rank from config # TODO(gongshaotian): Get rank from config
num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_degree num_heads = self.model_config.num_attention_heads // self.parallel_config.tensor_parallel_size
self.model_config.kv_num_heads = int( self.model_config.kv_num_heads = int(
self.model_config.num_key_value_heads self.model_config.num_key_value_heads
) // self.parallel_config.tensor_parallel_degree ) // self.parallel_config.tensor_parallel_size
head_dim = self.model_config.head_dim head_dim = self.model_config.head_dim
# Get the attention backend # Get the attention backend
@@ -803,7 +803,7 @@ class XPUModelRunner(ModelRunnerBase):
required_memory = ( required_memory = (
byte_of_dtype * 2 * # k + v byte_of_dtype * 2 * # k + v
(self.parallel_config.block_size * hidden_dim) * (self.parallel_config.block_size * hidden_dim) *
self.model_config.num_layers) self.model_config.num_hidden_layers)
return required_memory return required_memory
def update_share_input_block_num(self, num_gpu_blocks: int) -> None: def update_share_input_block_num(self, num_gpu_blocks: int) -> None: