[Sync] Update to latest code (#2679)

* [Sync] Update to latest code * Add new code files * Add new code files * update code * Try to fix build.sh * Try to fix build.sh * Update code * Update requirements.txt * Update code --------- Co-authored-by: Jiang-Jia-Jun <jiangjiajun@baidu.com>
2025-10-05 16:48:03 +08:00 · 2025-07-03 15:43:53 +08:00
parent d222248d00
commit 05c670e593
95 changed files with 9916 additions and 1312 deletions
--- a/fastdeploy/model_executor/layers/normalization.py
+++ b/fastdeploy/model_executor/layers/normalization.py
@@ -14,10 +14,15 @@
 # limitations under the License.
 """

+from typing import Callable, Dict, Optional
+
+import numpy as np
 import paddle
 from paddle import nn
 from paddle.incubate.nn.functional import fused_layer_norm, fused_rms_norm

+from fastdeploy.config import FDConfig
+
 from .utils import get_tensor


@@ -28,16 +33,16 @@ class RMSNorm(nn.Layer):

    def __init__(
        self,
-        fd_config,
-        hidden_size,
-        eps=1e-5,
-        prefix="",
-        linear_bias=None,
-        quant_scale=None,
-        begin_norm_axis=1,
-    ):
+        fd_config: FDConfig,
+        hidden_size: int,
+        eps: float = 1e-5,
+        prefix: str = "",
+        linear_bias: paddle.Tensor = None,
+        quant_scale: float = None,
+        begin_norm_axis: int = 1,
+    ) -> None:
        """
-        Initializes the normalization layer.
+        Initializes the RMSNormalization layer.

        Args:
            fd_config (FDConfig): Arguments related to inference, containing
@@ -45,33 +50,33 @@ class RMSNorm(nn.Layer):
                num_attention_heads, and ffn_hidden_size.
            hidden_size (int) : size of hidden state.
            eps:(float, optional): Small value added to the variance to avoid division by zero. Defaults to 1e-5.
-            weight_key (str): Key name of weight in the pdparams state dict. Defaults to None, means no weight.
-            bias_key (str): Key name of bias in the pdparams state dict. Defaults to None, means no bias.
-            linear_bias (float, optional): Initial bias value for the linear layer (if used). Defaults to None.
+            prefix(str,optional):The name of current layer. Defaults to "".
+            linear_bias (paddle.Tensor,optional): Initial bias value for the linear layer (if used). Defaults to None.
+            quant_scale(float,optional):Quantization scale, used in quantization scenarios. Defaults to -1, indicating no quantization.
+            begin_norm_axis (int, optional): The axis along which to perform normalization. Defaults to 1.

        Raises:
            NotImplementedError: If the specified norm_type is not supported.
        """
        super().__init__()
        self.fd_config = fd_config
-        self.prefix = prefix
-        self.hidden_size = hidden_size
+        self.prefix: str = prefix
+        self.hidden_size: int = hidden_size
        if len(prefix) == 0:
-            self.weight_key = None
+            self.weight_key: Optional[str] = None
        else:
-            self.weight_key = f"{prefix}.weight"
-        self.with_weight = self.weight_key is not None
-        self.eps = eps
-        self.norm_func = fused_rms_norm
-        self.linear_bias = linear_bias
-        self.quant_scale = quant_scale
-        self._dtype = self._helper.get_default_dtype()
-        self._norm_weight_dtype = self._dtype
-        self.begin_norm_axis = begin_norm_axis
-        self.quant_round_type = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
-        self.quant_max_bound = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
-        self.quant_min_bound = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
-        self.begin_norm_axis = begin_norm_axis
+            self.weight_key: Optional[str] = f"{prefix}.weight"
+        self.with_weight: bool = self.weight_key is not None
+        self.eps: float = eps
+        self.norm_func: Callable = fused_rms_norm
+        self.linear_bias: Optional[paddle.Tensor] = linear_bias
+        self.quant_scale: Optional[float] = quant_scale
+        self._dtype: str = self._helper.get_default_dtype()
+        self._norm_weight_dtype: str = self._dtype
+        self.quant_round_type: int = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
+        self.quant_max_bound: int = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
+        self.quant_min_bound: int = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
+        self.begin_norm_axis: int = begin_norm_axis

        self.init_weight()

@@ -88,7 +93,8 @@ class RMSNorm(nn.Layer):
                dtype=self._norm_weight_dtype,
            )

-    def load_state_dict(self, state_dict):
+    def load_state_dict(self, state_dict: Dict[str,
+                                               paddle.Tensor | np.ndarray]):
        """
        Load the checkpoint state dictionary into the layer.

@@ -102,7 +108,10 @@ class RMSNorm(nn.Layer):
            self._norm_weight_dtype)
        self.ln_weight.set_value(weight_tensor)

-    def forward(self, x, residual_input=None):
+    def forward(
+            self,
+            x,
+            residual_input: Optional[paddle.Tensor] = None) -> paddle.Tensor:
        """
        Defines the forward computation of the layer.

@@ -140,18 +149,18 @@ class RMSNorm(nn.Layer):

 class LayerNorm(nn.Layer):
    """
-    Normalization layer.
+    Initializes the LayerNormalization layer
    """

    def __init__(
        self,
-        fd_config,
-        hidden_size,
-        eps=1e-5,
+        fd_config: FDConfig,
+        hidden_size: int,
+        eps: float = 1e-5,
        prefix="",
-        linear_bias=None,
-        quant_scale=None,
-        with_bias=False,
+        linear_bias: paddle.Tensor = None,
+        quant_scale: float = None,
+        with_bias: bool = False,
    ):
        """
        Initializes the normalization layer.
@@ -160,35 +169,37 @@ class LayerNorm(nn.Layer):
            fd_config (FDConfig): Arguments related to inference, containing
                attributes such as weight_dtype, act_dtype, mp_size, hidden_size, head_dim,
                num_attention_heads, and ffn_hidden_size.
-            prefix (str): Unique name of the layer, used for naming internal attributes,
-                you can give it any name you like.
            hidden_size (int) : size of hidden state.
            eps:(float, optional): Small value added to the variance to avoid division by zero. Defaults to 1e-5.
+            prefix (str): Unique name of the layer, used for naming internal attributes,
+                you can give it any name you like.
            linear_bias (float, optional): Initial bias value for the linear layer (if used). Defaults to None.
+            quant_scale(float,optional):Quantization scale, used in quantization scenarios. Defaults to -1, indicating no quantization.
+            with_bias (bool):Whether to include bias or not. Defaults to False.
        Raises:
            NotImplementedError: If the specified norm_type is not supported.
        """
        super().__init__()
        self.fd_config = fd_config
-        self.prefix = prefix
-        self.hidden_size = hidden_size
+        self.prefix: str = prefix
+        self.hidden_size: int = hidden_size
        if len(prefix) == 0:
-            self.weight_key = None
+            self.weight_key: Optional[str] = None
        else:
-            self.weight_key = f"{prefix}.weight"
-        self.with_weight = self.weight_key is not None
-        self.bias_key = f"{prefix}.bias"
-        self.with_bias = with_bias
-        self.eps = eps
+            self.weight_key: Optional[str] = f"{prefix}.weight"
+        self.with_weight: bool = self.weight_key is not None
+        self.bias_key: str = f"{prefix}.bias"
+        self.with_bias: bool = with_bias
+        self.eps: float = eps
+        self.quant_scale: float = quant_scale
+        self.norm_func: Callable = fused_layer_norm
+        self.linear_bias: Optional[paddle.Tensor] = linear_bias
+        self._dtype: str = self._helper.get_default_dtype()
+        self._norm_weight_dtype: str = "float32"

-        self.norm_func = fused_layer_norm
-        self.linear_bias = linear_bias
-        self._dtype = self._helper.get_default_dtype()
-        self._norm_weight_dtype = "float32"
-
-        self.quant_round_type = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
-        self.quant_max_bound = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
-        self.quant_min_bound = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0
+        self.quant_round_type: int = self.fd_config.quant_config.quant_round_type if fd_config.quant_config else 0
+        self.quant_max_bound: int = self.fd_config.quant_config.quant_max_bound if fd_config.quant_config else 0
+        self.quant_min_bound: int = self.fd_config.quant_config.quant_min_bound if fd_config.quant_config else 0

        self.init_weight()

@@ -212,7 +223,8 @@ class LayerNorm(nn.Layer):
                dtype=self._norm_weight_dtype,
            )

-    def load_state_dict(self, state_dict):
+    def load_state_dict(self, state_dict: Dict[str,
+                                               paddle.Tensor | np.ndarray]):
        """
        Load the checkpoint state dictionary into the layer.

@@ -233,7 +245,10 @@ class LayerNorm(nn.Layer):
                self._norm_weight_dtype)
            self.ln_bias.set_value(bias_tensor)

-    def forward(self, x, residual_input=None):
+    def forward(
+            self,
+            x,
+            residual_input: Optional[paddle.Tensor] = None) -> paddle.Tensor:
        """
        Defines the forward computation of the layer.

@@ -259,7 +274,7 @@ class LayerNorm(nn.Layer):
            begin_norm_axis=1,
            bias=self.linear_bias,
            residual=residual_input,
-            quant_scale=-1,
+            quant_scale=-1 if self.quant_scale is None else self.quant_scale,
            quant_round_type=self.quant_round_type,
            quant_max_bound=self.quant_max_bound,
            quant_min_bound=self.quant_min_bound,