diff --git a/fastdeploy/config.py b/fastdeploy/config.py index ceeb7c4a8..3d7b0caad 100644 --- a/fastdeploy/config.py +++ b/fastdeploy/config.py @@ -84,6 +84,7 @@ class ModelConfig(PretrainedConfig): head_dim: Optional[int] = None, tie_word_embeddings: bool = False, is_quantized: bool = False, + rms_norm_eps: float = 1e-5, **kwargs, ): super().__init__(**kwargs) @@ -123,6 +124,7 @@ class ModelConfig(PretrainedConfig): self.dtype = dtype self.tie_word_embeddings = tie_word_embeddings self.is_quantized = is_quantized + self.rms_norm_eps = rms_norm_eps @dataclass diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index f6b73622a..a6d064043 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -288,14 +288,14 @@ class Ernie4_5_DecoderLayer(nn.Layer): self.input_layernorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.input_layernorm", ) self.post_attention_layernorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.post_attention_layernorm", ) @@ -366,7 +366,7 @@ class Ernie4_5_Model(nn.Layer): self.norm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{fd_config.model_config.prefix_name}.norm", ) diff --git a/fastdeploy/model_executor/models/ernie4_5_mtp.py b/fastdeploy/model_executor/models/ernie4_5_mtp.py index 029becc1e..7920155ec 100644 --- a/fastdeploy/model_executor/models/ernie4_5_mtp.py +++ b/fastdeploy/model_executor/models/ernie4_5_mtp.py @@ -275,14 +275,14 @@ class Ernie4_5_MTPModel(nn.Layer): self.enorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix="ernie.mtp_emb_norm.0", ) self.hnorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix="ernie.mtp_hidden_norm.0", ) diff --git a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py index 9ed28bc0b..b6de4a2f8 100644 --- a/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_vl/ernie4_5_vl_moe.py @@ -273,14 +273,14 @@ class Ernie4_5_VLDecoderLayer(nn.Layer): self.input_layernorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.input_layernorm", ) self.post_attention_layernorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.post_attention_layernorm", ) @@ -358,7 +358,7 @@ class Ernie4_5_VLModel(nn.Layer): self.norm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{fd_config.model_config.prefix_name}.norm", ) diff --git a/fastdeploy/model_executor/models/qwen2.py b/fastdeploy/model_executor/models/qwen2.py index 242d6f9da..4fab1e30b 100644 --- a/fastdeploy/model_executor/models/qwen2.py +++ b/fastdeploy/model_executor/models/qwen2.py @@ -161,14 +161,14 @@ class Qwen2DecoderLayer(nn.Layer): self.input_layernorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-6, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.input_layernorm", ) self.post_attention_layernorm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-6, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.post_attention_layernorm", ) @@ -248,7 +248,7 @@ class Qwen2Model(nn.Layer): self.norm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-5, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{fd_config.model_config.prefix_name}.norm", ) diff --git a/fastdeploy/model_executor/models/qwen3.py b/fastdeploy/model_executor/models/qwen3.py index ef0ef9a9c..8c734e422 100644 --- a/fastdeploy/model_executor/models/qwen3.py +++ b/fastdeploy/model_executor/models/qwen3.py @@ -79,12 +79,12 @@ class Qwen3Attention(nn.Layer): self.q_norm = RMSNorm(fd_config=fd_config, hidden_size=fd_config.model_config.head_dim, - eps=1e-6, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.q_norm", begin_norm_axis=2) self.k_norm = RMSNorm(fd_config=fd_config, hidden_size=fd_config.model_config.head_dim, - eps=1e-6, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.k_norm", begin_norm_axis=2) @@ -183,7 +183,7 @@ class Qwen3Model(nn.Layer): self.norm = RMSNorm( fd_config, hidden_size=fd_config.model_config.hidden_size, - eps=1e-6, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{fd_config.model_config.prefix_name}.norm", ) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index c4d01ef6e..9962fa1ee 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -121,12 +121,12 @@ class Qwen3Attention(nn.Layer): self.q_norm = RMSNorm(fd_config, hidden_size=self.head_dim, - eps=1e-6, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.q_norm", begin_norm_axis=2) self.k_norm = RMSNorm(fd_config, hidden_size=self.head_dim, - eps=1e-6, + eps=fd_config.model_config.rms_norm_eps, prefix=f"{prefix}.k_norm", begin_norm_axis=2) diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py index 8988a68f5..e30800260 100644 --- a/fastdeploy/worker/worker_process.py +++ b/fastdeploy/worker/worker_process.py @@ -594,7 +594,6 @@ def initialize_fd_config(config_or_args) -> FDConfig: model_config_dict, _ = ModelConfig.get_config_dict(config_or_args.model_name_or_path) - # Handle MoE related configs if 'num_experts' in model_config_dict: model_config_dict['moe_num_experts'] = model_config_dict.pop('num_experts')