Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
refactor rl get_name_mappings_to_training (#2847)
* refactor rl get_name_mappings_to_training
* fix tp>1
* change variable name (ffn1 -> up_gate_proj / ffn2 -> down_proj)
* change variable name (linear_weight -> weight / linear_bias -> bias)
* add rl names mapping for vl
* fix ernie 0.3B error
* fix develop code
* fix
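The renames in this commit are visible in the hunks below: the norm layers drop the `ln_weight` / `ln_bias` / `linear_bias` attribute names in favor of plain `weight` and `bias`, which lines inference-side parameter names up with the training-side names that the RL name mapping has to produce. The snippet below is a minimal, hypothetical sketch of that mapping idea; only the old -> new pairs come from the commit message and the diff, while the dictionary layout and the `map_legacy_name` helper are illustrative and not FastDeploy's actual `get_name_mappings_to_training` implementation.

# Hypothetical sketch of the inference -> training name alignment this
# refactor is built around. Only the old/new pairs are taken from the
# commit message and diff; the helper itself is illustrative.
LEGACY_TO_NEW = {
    "ffn1": "up_gate_proj",
    "ffn2": "down_proj",
    "linear_weight": "weight",
    "linear_bias": "bias",
    "ln_weight": "weight",
    "ln_bias": "bias",
}

def map_legacy_name(param_name: str) -> str:
    """Rewrite each dotted segment of a legacy parameter name, e.g.
    'mlp.ffn1.linear_weight' -> 'mlp.up_gate_proj.weight'."""
    return ".".join(LEGACY_TO_NEW.get(seg, seg) for seg in param_name.split("."))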
@@ -43,7 +43,7 @@ class RMSNorm(nn.Layer):
         hidden_size: int,
         eps: float = 1e-5,
         prefix: str = "",
-        linear_bias: paddle.Tensor = None,
+        bias: paddle.Tensor = None,
         quant_scale: float = None,
         begin_norm_axis: int = 1,
     ) -> None:
@@ -57,7 +57,7 @@ class RMSNorm(nn.Layer):
             hidden_size (int) : size of hidden state.
             eps:(float, optional): Small value added to the variance to avoid division by zero. Defaults to 1e-5.
             prefix(str,optional):The name of current layer. Defaults to "".
-            linear_bias (paddle.Tensor,optional): Initial bias value for the linear layer (if used). Defaults to None.
+            bias (paddle.Tensor,optional): Initial bias value for the linear layer (if used). Defaults to None.
             quant_scale(float,optional):Quantization scale, used in quantization scenarios. Defaults to -1, indicating no quantization.
             begin_norm_axis (int, optional): The axis along which to perform normalization. Defaults to 1.

@@ -78,7 +78,7 @@ class RMSNorm(nn.Layer):
             self.norm_func: Callable = fused_add_rms_norm
         else:
             self.norm_func: Callable = fused_rms_norm
-        self.linear_bias: Optional[paddle.Tensor] = linear_bias
+        self.bias: Optional[paddle.Tensor] = bias
         self.quant_scale: Optional[float] = quant_scale
         self._dtype: str = self._helper.get_default_dtype()
         self._norm_weight_dtype: str = self._dtype
@@ -94,9 +94,9 @@ class RMSNorm(nn.Layer):
         Initialize the weights and biases.
         """

-        self.ln_weight = None
+        self.weight = None
         if self.with_weight:
-            self.ln_weight = self.create_parameter(
+            self.weight = self.create_parameter(
                 shape=[self.hidden_size],
                 default_initializer=nn.initializer.Constant(value=1.0),
                 dtype=self._norm_weight_dtype,
@@ -115,7 +115,7 @@ class RMSNorm(nn.Layer):
         weight_tensor = paddle.cast(
             get_tensor(state_dict.pop(self.weight_key)),
             self._norm_weight_dtype)
-        self.ln_weight.set_value(weight_tensor)
+        self.weight.set_value(weight_tensor)

     def forward(
         self,
@@ -139,18 +139,18 @@ class RMSNorm(nn.Layer):
         """
         if current_platform.is_gcu():
             if residual_input is None:
-                return rms_norm(x, self.ln_weight, self.eps)
+                return rms_norm(x, self.weight, self.eps)
             norm_out = self.norm_func(
-                x, residual_input, self.ln_weight, self.eps
+                x, residual_input, self.weight, self.eps
             )
         else:
             norm_out = self.norm_func(
                 x,
-                norm_weight=self.ln_weight,
+                norm_weight=self.weight,
                 norm_bias=None,
                 epsilon=self.eps,
                 begin_norm_axis=self.begin_norm_axis,
-                bias=self.linear_bias,
+                bias=self.bias,
                 residual=residual_input,
                 quant_scale=-1 if self.quant_scale is None else self.quant_scale,
                 quant_round_type=self.quant_round_type,
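With the RMSNorm hunks above applied, callers read and write `norm.weight` / `norm.bias` instead of `norm.ln_weight` / `norm.linear_bias`. A minimal sketch of the call-site change follows; the `sync_norm_weight` helper is hypothetical, and layer construction is omitted because the full constructor signature is not shown in this diff.

import paddle

def sync_norm_weight(norm, weight_tensor: paddle.Tensor) -> None:
    """Push an externally produced tensor into the renamed parameter.
    Before this commit the same call targeted norm.ln_weight."""
    norm.weight.set_value(paddle.cast(weight_tensor, norm.weight.dtype))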
@@ -174,7 +174,7 @@ class LayerNorm(nn.Layer):
         hidden_size: int,
         eps: float = 1e-5,
         prefix="",
-        linear_bias: paddle.Tensor = None,
+        bias: paddle.Tensor = None,
         quant_scale: float = None,
         with_bias: bool = False,
     ):
@@ -189,7 +189,7 @@ class LayerNorm(nn.Layer):
             eps:(float, optional): Small value added to the variance to avoid division by zero. Defaults to 1e-5.
             prefix (str): Unique name of the layer, used for naming internal attributes,
                 you can give it any name you like.
-            linear_bias (float, optional): Initial bias value for the linear layer (if used). Defaults to None.
+            bias (float, optional): Initial bias value for the linear layer (if used). Defaults to None.
             quant_scale(float,optional):Quantization scale, used in quantization scenarios. Defaults to -1, indicating no quantization.
             with_bias (bool):Whether to include bias or not. Defaults to False.
         Raises:
@@ -212,7 +212,7 @@ class LayerNorm(nn.Layer):
             self.norm_func: Callable = paddle.nn.functional.layer_norm
         else:
             self.norm_func: Callable = fused_layer_norm
-        self.linear_bias: Optional[paddle.Tensor] = linear_bias
+        self.bias: Optional[paddle.Tensor] = bias
         self._dtype: str = self._helper.get_default_dtype()
         self._norm_weight_dtype: str = "float32"

@@ -227,16 +227,16 @@ class LayerNorm(nn.Layer):
         Initialize the weights and biases.
         """

-        self.ln_weight = None
+        self.weight = None
         if self.with_weight:
-            self.ln_weight = self.create_parameter(
+            self.weight = self.create_parameter(
                 shape=[self.hidden_size],
                 default_initializer=nn.initializer.Constant(value=1.0),
                 dtype=self._norm_weight_dtype,
             )
-        self.ln_bias = None
+        self.bias = None
         if self.with_bias:
-            self.ln_bias = self.create_parameter(
+            self.bias = self.create_parameter(
                 shape=[self.hidden_size],
                 is_bias=True,
                 dtype=self._norm_weight_dtype,
@@ -255,14 +255,14 @@ class LayerNorm(nn.Layer):
             weight_tensor = paddle.cast(
                 get_tensor(state_dict.pop(self.weight_key)),
                 self._norm_weight_dtype)
-            self.ln_weight.set_value(weight_tensor)
+            self.weight.set_value(weight_tensor)

         # bias
         if self.with_bias:
             bias_tensor = paddle.cast(
                 get_tensor(state_dict.pop(self.bias_key)),
                 self._norm_weight_dtype)
-            self.ln_bias.set_value(bias_tensor)
+            self.bias.set_value(bias_tensor)

     def forward(
         self,
@@ -285,10 +285,10 @@ class LayerNorm(nn.Layer):
             operations (like linear transformation) on the `residual_input`.
         """
         if current_platform.is_iluvatar():
-            if self.ln_weight is None and self.ln_bias is None:
+            if self.weight is None and self.bias is None:
                 out = x
-                if self.linear_bias is not None:
-                    out += self.linear_bias
+                if self.bias is not None:
+                    out += self.bias
                 if residual_input is not None:
                     out += residual_input
                 return out, out
@@ -303,8 +303,8 @@ class LayerNorm(nn.Layer):
                 out = self.norm_func(
                     x=y,
                     normalized_shape=y.shape[1:],
-                    weight=self.ln_weight,
-                    bias=self.linear_bias,
+                    weight=self.weight,
+                    bias=self.bias,
                     epsilon=self.eps,
                 )
                 return out, y
@@ -312,19 +312,19 @@ class LayerNorm(nn.Layer):
                 out = self.norm_func(
                     x=x,
                     normalized_shape=x.shape[1:],
-                    weight=self.ln_weight,
-                    bias=self.linear_bias,
+                    weight=self.weight,
+                    bias=self.bias,
                     epsilon=self.eps,
                 )
                 return out
         else:
             norm_out = self.norm_func(
                 x,
-                norm_weight=self.ln_weight,
-                norm_bias=self.ln_bias,
+                norm_weight=self.weight,
+                norm_bias=self.bias,
                 epsilon=self.eps,
                 begin_norm_axis=1,
-                bias=self.linear_bias,
+                bias=self.bias,
                 residual=residual_input,
                 quant_scale=-1 if self.quant_scale is None else self.quant_scale,
                 quant_round_type=self.quant_round_type,
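One practical effect of the attribute renames, presumably the point of the RL refactor, is that the structured names Paddle derives from attribute paths now end in plain `.weight` / `.bias`, matching training-side naming. A quick way to inspect this, as a hedged sketch (the model object and its sublayer layout are placeholders, not tied to a specific FastDeploy model):

import paddle

def dump_norm_param_names(model: paddle.nn.Layer) -> None:
    """Print hierarchical parameter names; after this diff, norm parameters
    appear as '<prefix>.weight' / '<prefix>.bias' instead of
    '<prefix>.ln_weight' / '<prefix>.ln_bias'."""
    for name, param in model.named_parameters():
        if name.endswith((".weight", ".bias")):
            print(name, tuple(param.shape))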