[V1 Loader] Support parameter creation and loading for the wint2 and XPU backends (#3581)

* support wint2 backend

* [V1 Loader]support param create and load for wint2 and xpu backend

* update weight shape name

* update

* update

* update baseline.txt

* update model name

* update baseline.txt

* fix codestyle

* remove debug code
Zero Rains
2025-08-28 09:49:36 +08:00
committed by GitHub
parent b28a0343a6
commit e37e86b3b8
9 changed files with 307 additions and 326 deletions

View File

@@ -1,44 +0,0 @@
"""
# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
CutlassMoEMethod,
)
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
DeepGemmFusedMoeMethod,
)
from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import (
MarlinWeightOnlyMoEMethod,
)
from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
BlockWiseFP8MoEMethod,
TensorWiseFP8MoEMethod,
TritonWeightOnlyMoEMethod,
)
pre_create_weights_list = (
CutlassMoEMethod,
TensorWiseFP8MoEMethod,
BlockWiseFP8MoEMethod,
TritonWeightOnlyMoEMethod,
DeepGemmFusedMoeMethod,
MarlinWeightOnlyMoEMethod,
)
def is_supported_moe_backend(quant_method: MoEMethodBase):
return isinstance(quant_method, pre_create_weights_list)
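
This helper module is deleted by the commit, since every MoE backend now pre-creates its parameters and the isinstance gate is no longer needed. For context, a minimal sketch of the check it used to drive, condensed from the old FusedMoE code further down in this diff (the surrounding wiring is illustrative, not committed code):

# Supported backends registered empty parameters up front and let the generic
# weight loader fill them; other backends built their weights from state_dict
# at load time instead.
if is_supported_moe_backend is not None and is_supported_moe_backend(self.quant_method):
    self.quant_method.create_weights(self, weight_loader=self.weight_loader)  # pre-create
else:
    self.quant_method.create_weights(self, state_dict)  # legacy: create during load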

View File

@@ -145,12 +145,12 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):
up_gate_proj_weight_name = self.added_weight_attrs[0]
down_proj_weight_name = self.added_weight_attrs[1]
self.ffn1_weight_shape = [
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size // 16,
layer.moe_intermediate_size * 4,
]
self.ffn2_weight_shape = [
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size // 16,
layer.hidden_size * 2,
@@ -159,7 +159,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):
layer,
up_gate_proj_weight_name,
layer.create_parameter(
shape=self.ffn1_weight_shape,
shape=self.up_gate_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
@@ -168,7 +168,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):
layer,
down_proj_weight_name,
layer.create_parameter(
shape=self.ffn2_weight_shape,
shape=self.down_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
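
The same ffn1_/ffn2_ to up_gate_proj_/down_proj_ rename is applied to the Triton and tensor-wise FP8 backends in the next file, so one example covers all three. A sketch of the pattern these hunks converge on (the loop itself is illustrative; the attribute names, create_parameter call, and initializer are the ones shown in the diff):

for attr_name, shape in (
    (up_gate_proj_weight_name, self.up_gate_proj_weight_shape),
    (down_proj_weight_name, self.down_proj_weight_shape),
):
    setattr(
        layer,
        attr_name,
        layer.create_parameter(
            shape=shape,
            dtype=self.weight_dtype,
            default_initializer=paddle.nn.initializer.Constant(0),
        ),
    )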

View File

@@ -61,12 +61,12 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
self.default_dtype = layer._helper.get_default_dtype()
up_gate_proj_weight_name = self.added_weight_attrs[0]
down_proj_weight_name = self.added_weight_attrs[1]
self.ffn1_weight_shape = [
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size,
layer.moe_intermediate_size * 2,
]
self.ffn2_weight_shape = [
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size,
layer.hidden_size,
@@ -75,7 +75,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
layer,
up_gate_proj_weight_name,
layer.create_parameter(
shape=self.ffn1_weight_shape,
shape=self.up_gate_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
@@ -84,7 +84,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
layer,
down_proj_weight_name,
layer.create_parameter(
shape=self.ffn2_weight_shape,
shape=self.down_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
@@ -364,12 +364,12 @@ class TensorWiseFP8MoEMethod(QuantMethodBase):
self.default_dtype = layer._helper.get_default_dtype()
up_gate_proj_weight_name = self.added_wfp8afp8_attrs[0]
down_proj_weight_name = self.added_wfp8afp8_attrs[1]
self.ffn1_weight_shape = [
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size * 2,
layer.hidden_size,
]
self.ffn2_weight_shape = [
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size,
layer.moe_intermediate_size,
@@ -378,7 +378,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase):
layer,
up_gate_proj_weight_name,
layer.create_parameter(
shape=self.ffn1_weight_shape,
shape=self.up_gate_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
@@ -387,7 +387,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase):
layer,
down_proj_weight_name,
layer.create_parameter(
shape=self.ffn2_weight_shape,
shape=self.down_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),

View File

@@ -22,7 +22,7 @@ from fastdeploy.distributed.communication import tensor_model_parallel_all_reduc
from fastdeploy.utils import ceil_div
from ..quantization.quant_base import QuantMethodBase
from ..utils import create_and_set_parameter, get_tensor
from ..utils import get_tensor
class Wint2MoeMethod(QuantMethodBase):
@@ -33,6 +33,11 @@ class Wint2MoeMethod(QuantMethodBase):
def __init__(self, quant_config):
super().__init__()
self.moe_quant_type = quant_config.moe_quant_type
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
self.added_scale_attrs = [
"up_gate_proj_weight_scale",
"down_proj_weight_scale",
]
def process_loaded_weights(self, layer, weights) -> None:
"""
@@ -51,11 +56,102 @@ class Wint2MoeMethod(QuantMethodBase):
len(down_proj_weights) == layer.num_local_experts
), "down_proj_weights length should be equal to num_local_experts."
def create_weights(self, layer: nn.Layer, state_dict):
def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
"""
Paddle cutlass create weight process.
"""
pass
self.weight_dtype = "uint8"
self.default_dtype = layer._helper.get_default_dtype()
setattr(
layer,
"up_gate_proj_weight",
layer.create_parameter(
shape=[layer.num_local_experts, layer.hidden_size // 4, layer.moe_intermediate_size * 2],
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"down_proj_weight",
layer.create_parameter(
shape=[layer.num_local_experts, layer.moe_intermediate_size // 4, layer.hidden_size],
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"up_gate_proj_weight_scale",
layer.create_parameter(
shape=[layer.num_local_experts, layer.hidden_size // 128, layer.moe_intermediate_size * 2],
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"down_proj_weight_scale",
layer.create_parameter(
shape=[layer.num_local_experts, layer.moe_intermediate_size // 128, layer.hidden_size],
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"up_gate_proj_super_scales",
layer.create_parameter(
shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
dtype=self.default_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"down_proj_super_scales",
layer.create_parameter(
shape=[layer.num_local_experts, layer.hidden_size],
dtype=self.default_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"up_gate_proj_code_scale",
layer.create_parameter(
shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"down_proj_code_scale",
layer.create_parameter(
shape=[layer.num_local_experts, layer.hidden_size],
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"up_gate_proj_code_zp",
layer.create_parameter(
shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
"down_proj_code_zp",
layer.create_parameter(
shape=[layer.num_local_experts, layer.hidden_size],
dtype="float32",
default_initializer=paddle.nn.initializer.Constant(0),
),
)
class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
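
The new create_weights above registers ten empty parameters one by one. A condensed, table-driven sketch of the same shapes and dtypes, useful for scanning what the wint2 layout expects (the helper is illustrative only, not part of the commit; the layer attributes are the ones used above):

import paddle

def create_wint2_moe_params(layer, default_dtype):
    # Illustrative spec table: name -> (shape, dtype).
    # Weights pack four 2-bit values per uint8 (hence // 4); the // 128 suggests
    # one group-wise scale per 128 input elements (assumed group size).
    E, H, I = layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size
    specs = {
        "up_gate_proj_weight":       ([E, H // 4,   I * 2], "uint8"),
        "down_proj_weight":          ([E, I // 4,   H],     "uint8"),
        "up_gate_proj_weight_scale": ([E, H // 128, I * 2], "uint8"),
        "down_proj_weight_scale":    ([E, I // 128, H],     "uint8"),
        "up_gate_proj_super_scales": ([E, I * 2], default_dtype),
        "down_proj_super_scales":    ([E, H],     default_dtype),
        "up_gate_proj_code_scale":   ([E, I * 2], "float32"),
        "down_proj_code_scale":      ([E, H],     "float32"),
        "up_gate_proj_code_zp":      ([E, I * 2], "float32"),
        "down_proj_code_zp":         ([E, H],     "float32"),
    }
    for name, (shape, dtype) in specs.items():
        setattr(
            layer,
            name,
            layer.create_parameter(
                shape=shape,
                dtype=dtype,
                default_initializer=paddle.nn.initializer.Constant(0),
            ),
        )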
@@ -65,7 +161,6 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
def __init__(self, quant_config):
super().__init__(quant_config)
self.moe_quant_type = quant_config.moe_quant_type
def process_loaded_weights(self, layer, weights) -> None:
"""
@@ -159,13 +254,7 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
"down_proj_code_zp": down_proj_code_zp,
}
for name, tensor in name_tensor_map.items():
create_and_set_parameter(layer, name, tensor)
def create_weights(self, layer: nn.Layer, state_dict):
"""
Paddle cutlass create weight process.
"""
pass
getattr(layer, name).set_value(tensor)
def apply(
self,

View File

@@ -14,8 +14,6 @@
# limitations under the License.
"""
from typing import Dict
import paddle
from paddle import nn
@@ -114,11 +112,86 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
super().__init__()
self.quant_config = quant_config
self.moe_quant_type = self.quant_config.algo
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
self.added_scale_attrs = [
"up_gate_proj_weight_scale",
"down_proj_weight_scale",
]
def create_weights(self, layer: nn.Layer, state_dict: Dict[str, paddle.Tensor]):
def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
"""
Paddle cutlass create weight process.
"""
self.default_dtype = "float32"
self.weight_dtype = "int8"
if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size * 2,
layer.hidden_size // 2,
]
else:
self.up_gate_proj_weight_shape = [
layer.num_local_experts,
layer.moe_intermediate_size * 2,
layer.hidden_size,
]
if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size,
layer.moe_intermediate_size // 2,
]
else:
self.down_proj_weight_shape = [
layer.num_local_experts,
layer.hidden_size,
layer.moe_intermediate_size,
]
setattr(
layer,
self.added_weight_attrs[0],
layer.create_parameter(
shape=self.up_gate_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
self.added_weight_attrs[1],
layer.create_parameter(
shape=self.down_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
# weight_scale
setattr(
layer,
self.added_scale_attrs[0],
layer.create_parameter(
shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
dtype=self.default_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
setattr(
layer,
self.added_scale_attrs[1],
layer.create_parameter(
shape=[layer.num_local_experts, layer.hidden_size],
dtype=self.default_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
def process_loaded_weights(self, layer: nn.Layer, state_dict):
"""
Paddle xpu load weight process.
"""
up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
assert len(up_gate_proj_weights) == layer.num_local_experts
assert len(down_proj_weights) == layer.num_local_experts
@@ -131,15 +204,9 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
layer.hidden_size,
]
added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
added_scale_attrs = [
"up_gate_proj_weight_scale",
"down_proj_weight_scale",
]
for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
weight_name = added_weight_attrs[idx]
scale_name = added_scale_attrs[idx]
weight_name = self.added_weight_attrs[idx]
scale_name = self.added_scale_attrs[idx]
weight_list = []
weight_scale_list = []
@@ -150,26 +217,9 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
weight_list.append(quant_weight.transpose([1, 0])) # transpose weight to [n,k]
weight_scale_list.append(scale)
quanted_weight = paddle.stack(weight_list, axis=0)
setattr(
layer,
weight_name,
layer.create_parameter(
shape=quanted_weight.shape,
dtype=quanted_weight.dtype,
default_initializer=paddle.nn.initializer.Constant(0),
),
)
getattr(layer, weight_name).set_value(quanted_weight)
quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
setattr(
layer,
scale_name,
layer.create_parameter(
shape=quanted_weight_scale.shape,
dtype=quanted_weight_scale.dtype,
),
)
getattr(layer, scale_name).set_value(quanted_weight_scale)
def apply(
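
In the int4 branches above the innermost dimension is halved because two 4-bit values are packed into one int8 element. A small helper-style sketch of that shape selection (the function is hypothetical; only the layer attributes and quant type strings come from the diff):

def xpu_moe_weight_shapes(layer, moe_quant_type):
    # Per-expert [n, k] weight shapes for the XPU weight-only MoE path.
    pack = 2 if moe_quant_type in ("weight_only_int4", "w4a8") else 1  # two int4 per int8
    up_gate_proj_shape = [
        layer.num_local_experts,
        layer.moe_intermediate_size * 2,
        layer.hidden_size // pack,
    ]
    down_proj_shape = [
        layer.num_local_experts,
        layer.hidden_size,
        layer.moe_intermediate_size // pack,
    ]
    return up_gate_proj_shape, down_proj_shape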

View File

@@ -27,17 +27,11 @@ from fastdeploy.model_executor.utils import slice_fn
from fastdeploy.platforms import current_platform
from fastdeploy.worker.experts_manager import RedundantExpertManger
# TODO(lulinjun): remove this import after supporting all backends
is_supported_moe_backend = None
if current_platform.is_cuda():
from .check_backend_supported import is_supported_moe_backend
def get_moe_method():
"""
return moe method based on device platform
"""
from fastdeploy.platforms import current_platform
if current_platform.is_cuda():
from .fused_moe_cutlass_backend import CutlassMoEMethod
@@ -152,24 +146,12 @@ class FusedMoE(nn.Layer):
if self.ep_size > 1:
self.quant_method.init_ep(self)
if fd_config.load_config.dynamic_load_weight:
# It's for RL to build model
self.init_moe_weights()
# Merge normal and RL build model
if gate_correction_bias is not None:
self.gate_correction_bias = gate_correction_bias
else:
if gate_correction_bias is not None:
self.gate_correction_bias = gate_correction_bias
else:
self.gate_correction_bias = None
if moe_quant_config:
if (
moe_quant_config
and is_supported_moe_backend is not None
and is_supported_moe_backend(self.quant_method)
):
self.quant_method.create_weights(self, weight_loader=self.weight_loader)
else:
# w_fp16 a_fp16
self.quant_method.create_weights(self, weight_loader=self.weight_loader)
self.gate_correction_bias = None
self.quant_method.create_weights(self, weight_loader=self.weight_loader)
logger.info(
f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset + self.num_local_experts}), \
@@ -179,7 +161,6 @@ class FusedMoE(nn.Layer):
)
def weight_loader(self, param, loaded_weight, expert_id, shard_id: Optional[str] = None):
from fastdeploy.platforms import current_platform
if hasattr(param, "SHARD_ID_TO_SHARDED_DIM"):
SHARD_ID_TO_SHARDED_DIM = param.SHARD_ID_TO_SHARDED_DIM
@@ -332,86 +313,6 @@ class FusedMoE(nn.Layer):
for shard_id, weight_name in param_name_maping
]
def init_moe_weights(self):
"""
Initialize the weight shapes and parameters for the MoE layer.
Combines weight shape initialization and parameter creation into a single function.
"""
# Initialize weight shapes
up_gate_proj_output_dim = self.moe_intermediate_size * 2
if self.moe_quant_type in ["block_wise_fp8", "wint8"]:
up_gate_proj_weight_shape = [
self.num_local_experts,
up_gate_proj_output_dim,
self.hidden_size,
]
down_proj_weight_shape = [
self.num_local_experts,
self.hidden_size,
self.moe_intermediate_size,
]
else:
up_gate_proj_weight_shape = [
self.num_local_experts,
self.hidden_size,
up_gate_proj_output_dim,
]
down_proj_weight_shape = [
self.num_local_experts,
self.moe_intermediate_size,
self.hidden_size,
]
# Create parameters
if self.moe_quant_type == "block_wise_fp8":
# (TODO:gaoziyuan)
self.weight_dtype = "float8_e4m3fn"
self.init_block_wise_fp8_scale()
elif self.moe_quant_type == "wint8":
self.weight_dtype = "int8"
self.init_weight_only_scale()
# up_gate_proj parameters
self.up_gate_proj_weight = self.create_parameter(
shape=up_gate_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
)
# down_proj parameters
self.down_proj_weight = self.create_parameter(
shape=down_proj_weight_shape,
dtype=self.weight_dtype,
default_initializer=paddle.nn.initializer.Constant(0),
)
def init_weight_only_scale(self):
"""
Initialize the weight scale.
"""
self.up_gate_proj_weight_scale = self.create_parameter(
shape=[self.num_local_experts, self.moe_intermediate_size * 2],
dtype=self._dtype,
)
self.down_proj_weight_scale = self.create_parameter(
shape=[self.num_local_experts, self.hidden_size],
dtype=self._dtype,
)
def init_block_wise_fp8_scale(self):
"""
Initialize the weight scale.
"""
self.up_gate_proj_weight_scale = self.create_parameter(
shape=[self.num_local_experts, self.moe_intermediate_size * 2 // 128, self.hidden_size // 128],
dtype="float32",
is_bias=False,
)
self.down_proj_weight_scale = self.create_parameter(
shape=[self.num_local_experts, self.hidden_size // 128, self.moe_intermediate_size // 128],
dtype="float32",
is_bias=False,
)
def load_experts_weight(
self,
state_dict: dict,
@@ -560,26 +461,13 @@ class FusedMoE(nn.Layer):
"""
load_state_dict function.
"""
if is_supported_moe_backend is not None and is_supported_moe_backend(self.quant_method):
if self.fd_config.model_config.is_quantized:
if getattr(self.fd_config.quant_config, "is_permuted", True):
self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
else:
self.quant_method.process_loaded_weights(self, state_dict)
if self.fd_config.model_config.is_quantized:
if getattr(self.fd_config.quant_config, "is_permuted", True):
self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
else:
self.quant_method.process_loaded_weights(self, state_dict)
else:
if self.fd_config.model_config.is_quantized:
if getattr(self.fd_config.quant_config, "is_permuted", True):
self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
else:
self.quant_method.create_weights(self, state_dict)
else:
if self.moe_quant_config:
self.quant_method.create_weights(self, state_dict)
else:
# w_fp16 a_fp16
self.quant_method.process_loaded_weights(self, state_dict)
self.quant_method.process_loaded_weights(self, state_dict)
def forward(self, x: paddle.Tensor, gate: nn.Layer):
"""

View File

@@ -1752,7 +1752,6 @@ class GPUModelRunner(ModelRunnerBase):
token_type_ids_w_video = token_type_ids
input_ids = inputs["input_ids"]
# convert to img patch id
# TODO(lulinjun): may need to check model_config and model_cfg
image_mask = input_ids == self.model_config.im_patch_id
image_type_ids = inputs["image_type_ids"]
with paddle.amp.auto_cast(

View File

@@ -1664,7 +1664,6 @@ class MetaxModelRunner(ModelRunnerBase):
token_type_ids_w_video = token_type_ids
input_ids = inputs["input_ids"]
# convert to img patch id
# TODO(lulinjun): may need to check model_config and model_cfg
image_mask = input_ids == self.model_config.im_patch_id
image_type_ids = inputs["image_type_ids"]
with paddle.amp.auto_cast(

View File

@@ -416,15 +416,15 @@ ernie.layers.1.self_attn.qkv_proj.weight_scale
ernie.layers.1.self_attn.o_proj.weight
ernie.layers.1.self_attn.o_proj.weight_scale
ernie.layers.1.mlp.gate_correction_bias
ernie.layers.1.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.1.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.1.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.1.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.1.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.1.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.1.mlp.text_fused_moe.gate.weight
ernie.layers.1.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.1.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.1.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.1.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.1.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.1.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.1.mlp.image_fused_moe.gate.weight
ernie.layers.1.mlp.shared_experts.up_gate_proj.weight
ernie.layers.1.mlp.shared_experts.up_gate_proj.weight_scale
@@ -437,15 +437,15 @@ ernie.layers.2.self_attn.qkv_proj.weight_scale
ernie.layers.2.self_attn.o_proj.weight
ernie.layers.2.self_attn.o_proj.weight_scale
ernie.layers.2.mlp.gate_correction_bias
ernie.layers.2.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.2.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.2.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.2.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.2.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.2.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.2.mlp.text_fused_moe.gate.weight
ernie.layers.2.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.2.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.2.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.2.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.2.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.2.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.2.mlp.image_fused_moe.gate.weight
ernie.layers.2.mlp.shared_experts.up_gate_proj.weight
ernie.layers.2.mlp.shared_experts.up_gate_proj.weight_scale
@@ -458,15 +458,15 @@ ernie.layers.3.self_attn.qkv_proj.weight_scale
ernie.layers.3.self_attn.o_proj.weight
ernie.layers.3.self_attn.o_proj.weight_scale
ernie.layers.3.mlp.gate_correction_bias
ernie.layers.3.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.3.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.3.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.3.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.3.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.3.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.3.mlp.text_fused_moe.gate.weight
ernie.layers.3.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.3.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.3.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.3.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.3.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.3.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.3.mlp.image_fused_moe.gate.weight
ernie.layers.3.mlp.shared_experts.up_gate_proj.weight
ernie.layers.3.mlp.shared_experts.up_gate_proj.weight_scale
@@ -479,15 +479,15 @@ ernie.layers.4.self_attn.qkv_proj.weight_scale
ernie.layers.4.self_attn.o_proj.weight
ernie.layers.4.self_attn.o_proj.weight_scale
ernie.layers.4.mlp.gate_correction_bias
ernie.layers.4.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.4.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.4.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.4.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.4.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.4.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.4.mlp.text_fused_moe.gate.weight
ernie.layers.4.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.4.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.4.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.4.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.4.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.4.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.4.mlp.image_fused_moe.gate.weight
ernie.layers.4.mlp.shared_experts.up_gate_proj.weight
ernie.layers.4.mlp.shared_experts.up_gate_proj.weight_scale
@@ -500,15 +500,15 @@ ernie.layers.5.self_attn.qkv_proj.weight_scale
ernie.layers.5.self_attn.o_proj.weight
ernie.layers.5.self_attn.o_proj.weight_scale
ernie.layers.5.mlp.gate_correction_bias
ernie.layers.5.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.5.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.5.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.5.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.5.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.5.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.5.mlp.text_fused_moe.gate.weight
ernie.layers.5.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.5.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.5.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.5.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.5.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.5.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.5.mlp.image_fused_moe.gate.weight
ernie.layers.5.mlp.shared_experts.up_gate_proj.weight
ernie.layers.5.mlp.shared_experts.up_gate_proj.weight_scale
@@ -521,15 +521,15 @@ ernie.layers.6.self_attn.qkv_proj.weight_scale
ernie.layers.6.self_attn.o_proj.weight
ernie.layers.6.self_attn.o_proj.weight_scale
ernie.layers.6.mlp.gate_correction_bias
ernie.layers.6.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.6.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.6.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.6.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.6.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.6.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.6.mlp.text_fused_moe.gate.weight
ernie.layers.6.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.6.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.6.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.6.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.6.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.6.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.6.mlp.image_fused_moe.gate.weight
ernie.layers.6.mlp.shared_experts.up_gate_proj.weight
ernie.layers.6.mlp.shared_experts.up_gate_proj.weight_scale
@@ -542,15 +542,15 @@ ernie.layers.7.self_attn.qkv_proj.weight_scale
ernie.layers.7.self_attn.o_proj.weight
ernie.layers.7.self_attn.o_proj.weight_scale
ernie.layers.7.mlp.gate_correction_bias
ernie.layers.7.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.7.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.7.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.7.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.7.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.7.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.7.mlp.text_fused_moe.gate.weight
ernie.layers.7.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.7.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.7.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.7.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.7.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.7.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.7.mlp.image_fused_moe.gate.weight
ernie.layers.7.mlp.shared_experts.up_gate_proj.weight
ernie.layers.7.mlp.shared_experts.up_gate_proj.weight_scale
@@ -563,15 +563,15 @@ ernie.layers.8.self_attn.qkv_proj.weight_scale
ernie.layers.8.self_attn.o_proj.weight
ernie.layers.8.self_attn.o_proj.weight_scale
ernie.layers.8.mlp.gate_correction_bias
ernie.layers.8.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.8.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.8.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.8.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.8.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.8.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.8.mlp.text_fused_moe.gate.weight
ernie.layers.8.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.8.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.8.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.8.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.8.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.8.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.8.mlp.image_fused_moe.gate.weight
ernie.layers.8.mlp.shared_experts.up_gate_proj.weight
ernie.layers.8.mlp.shared_experts.up_gate_proj.weight_scale
@@ -584,15 +584,15 @@ ernie.layers.9.self_attn.qkv_proj.weight_scale
ernie.layers.9.self_attn.o_proj.weight
ernie.layers.9.self_attn.o_proj.weight_scale
ernie.layers.9.mlp.gate_correction_bias
ernie.layers.9.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.9.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.9.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.9.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.9.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.9.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.9.mlp.text_fused_moe.gate.weight
ernie.layers.9.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.9.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.9.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.9.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.9.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.9.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.9.mlp.image_fused_moe.gate.weight
ernie.layers.9.mlp.shared_experts.up_gate_proj.weight
ernie.layers.9.mlp.shared_experts.up_gate_proj.weight_scale
@@ -605,15 +605,15 @@ ernie.layers.10.self_attn.qkv_proj.weight_scale
ernie.layers.10.self_attn.o_proj.weight
ernie.layers.10.self_attn.o_proj.weight_scale
ernie.layers.10.mlp.gate_correction_bias
ernie.layers.10.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.10.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.10.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.10.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.10.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.10.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.10.mlp.text_fused_moe.gate.weight
ernie.layers.10.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.10.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.10.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.10.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.10.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.10.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.10.mlp.image_fused_moe.gate.weight
ernie.layers.10.mlp.shared_experts.up_gate_proj.weight
ernie.layers.10.mlp.shared_experts.up_gate_proj.weight_scale
@@ -626,15 +626,15 @@ ernie.layers.11.self_attn.qkv_proj.weight_scale
ernie.layers.11.self_attn.o_proj.weight
ernie.layers.11.self_attn.o_proj.weight_scale
ernie.layers.11.mlp.gate_correction_bias
ernie.layers.11.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.11.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.11.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.11.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.11.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.11.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.11.mlp.text_fused_moe.gate.weight
ernie.layers.11.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.11.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.11.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.11.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.11.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.11.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.11.mlp.image_fused_moe.gate.weight
ernie.layers.11.mlp.shared_experts.up_gate_proj.weight
ernie.layers.11.mlp.shared_experts.up_gate_proj.weight_scale
@@ -647,15 +647,15 @@ ernie.layers.12.self_attn.qkv_proj.weight_scale
ernie.layers.12.self_attn.o_proj.weight
ernie.layers.12.self_attn.o_proj.weight_scale
ernie.layers.12.mlp.gate_correction_bias
ernie.layers.12.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.12.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.12.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.12.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.12.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.12.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.12.mlp.text_fused_moe.gate.weight
ernie.layers.12.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.12.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.12.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.12.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.12.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.12.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.12.mlp.image_fused_moe.gate.weight
ernie.layers.12.mlp.shared_experts.up_gate_proj.weight
ernie.layers.12.mlp.shared_experts.up_gate_proj.weight_scale
@@ -668,15 +668,15 @@ ernie.layers.13.self_attn.qkv_proj.weight_scale
ernie.layers.13.self_attn.o_proj.weight
ernie.layers.13.self_attn.o_proj.weight_scale
ernie.layers.13.mlp.gate_correction_bias
ernie.layers.13.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.13.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.13.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.13.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.13.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.13.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.13.mlp.text_fused_moe.gate.weight
ernie.layers.13.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.13.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.13.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.13.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.13.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.13.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.13.mlp.image_fused_moe.gate.weight
ernie.layers.13.mlp.shared_experts.up_gate_proj.weight
ernie.layers.13.mlp.shared_experts.up_gate_proj.weight_scale
@@ -689,15 +689,15 @@ ernie.layers.14.self_attn.qkv_proj.weight_scale
ernie.layers.14.self_attn.o_proj.weight
ernie.layers.14.self_attn.o_proj.weight_scale
ernie.layers.14.mlp.gate_correction_bias
ernie.layers.14.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.14.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.14.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.14.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.14.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.14.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.14.mlp.text_fused_moe.gate.weight
ernie.layers.14.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.14.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.14.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.14.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.14.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.14.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.14.mlp.image_fused_moe.gate.weight
ernie.layers.14.mlp.shared_experts.up_gate_proj.weight
ernie.layers.14.mlp.shared_experts.up_gate_proj.weight_scale
@@ -710,15 +710,15 @@ ernie.layers.15.self_attn.qkv_proj.weight_scale
ernie.layers.15.self_attn.o_proj.weight
ernie.layers.15.self_attn.o_proj.weight_scale
ernie.layers.15.mlp.gate_correction_bias
ernie.layers.15.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.15.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.15.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.15.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.15.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.15.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.15.mlp.text_fused_moe.gate.weight
ernie.layers.15.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.15.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.15.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.15.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.15.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.15.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.15.mlp.image_fused_moe.gate.weight
ernie.layers.15.mlp.shared_experts.up_gate_proj.weight
ernie.layers.15.mlp.shared_experts.up_gate_proj.weight_scale
@@ -731,15 +731,15 @@ ernie.layers.16.self_attn.qkv_proj.weight_scale
ernie.layers.16.self_attn.o_proj.weight
ernie.layers.16.self_attn.o_proj.weight_scale
ernie.layers.16.mlp.gate_correction_bias
ernie.layers.16.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.16.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.16.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.16.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.16.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.16.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.16.mlp.text_fused_moe.gate.weight
ernie.layers.16.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.16.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.16.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.16.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.16.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.16.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.16.mlp.image_fused_moe.gate.weight
ernie.layers.16.mlp.shared_experts.up_gate_proj.weight
ernie.layers.16.mlp.shared_experts.up_gate_proj.weight_scale
@@ -752,15 +752,15 @@ ernie.layers.17.self_attn.qkv_proj.weight_scale
ernie.layers.17.self_attn.o_proj.weight
ernie.layers.17.self_attn.o_proj.weight_scale
ernie.layers.17.mlp.gate_correction_bias
ernie.layers.17.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.17.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.17.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.17.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.17.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.17.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.17.mlp.text_fused_moe.gate.weight
ernie.layers.17.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.17.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.17.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.17.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.17.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.17.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.17.mlp.image_fused_moe.gate.weight
ernie.layers.17.mlp.shared_experts.up_gate_proj.weight
ernie.layers.17.mlp.shared_experts.up_gate_proj.weight_scale
@@ -773,15 +773,15 @@ ernie.layers.18.self_attn.qkv_proj.weight_scale
ernie.layers.18.self_attn.o_proj.weight
ernie.layers.18.self_attn.o_proj.weight_scale
ernie.layers.18.mlp.gate_correction_bias
ernie.layers.18.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.18.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.18.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.18.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.18.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.18.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.18.mlp.text_fused_moe.gate.weight
ernie.layers.18.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.18.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.18.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.18.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.18.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.18.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.18.mlp.image_fused_moe.gate.weight
ernie.layers.18.mlp.shared_experts.up_gate_proj.weight
ernie.layers.18.mlp.shared_experts.up_gate_proj.weight_scale
@@ -794,15 +794,15 @@ ernie.layers.19.self_attn.qkv_proj.weight_scale
ernie.layers.19.self_attn.o_proj.weight
ernie.layers.19.self_attn.o_proj.weight_scale
ernie.layers.19.mlp.gate_correction_bias
ernie.layers.19.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.19.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.19.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.19.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.19.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.19.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.19.mlp.text_fused_moe.gate.weight
ernie.layers.19.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.19.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.19.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.19.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.19.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.19.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.19.mlp.image_fused_moe.gate.weight
ernie.layers.19.mlp.shared_experts.up_gate_proj.weight
ernie.layers.19.mlp.shared_experts.up_gate_proj.weight_scale
@@ -815,15 +815,15 @@ ernie.layers.20.self_attn.qkv_proj.weight_scale
ernie.layers.20.self_attn.o_proj.weight
ernie.layers.20.self_attn.o_proj.weight_scale
ernie.layers.20.mlp.gate_correction_bias
ernie.layers.20.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.20.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.20.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.20.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.20.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.20.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.20.mlp.text_fused_moe.gate.weight
ernie.layers.20.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.20.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.20.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.20.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.20.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.20.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.20.mlp.image_fused_moe.gate.weight
ernie.layers.20.mlp.shared_experts.up_gate_proj.weight
ernie.layers.20.mlp.shared_experts.up_gate_proj.weight_scale
@@ -836,15 +836,15 @@ ernie.layers.21.self_attn.qkv_proj.weight_scale
ernie.layers.21.self_attn.o_proj.weight
ernie.layers.21.self_attn.o_proj.weight_scale
ernie.layers.21.mlp.gate_correction_bias
ernie.layers.21.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.21.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.21.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.21.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.21.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.21.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.21.mlp.text_fused_moe.gate.weight
ernie.layers.21.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.21.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.21.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.21.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.21.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.21.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.21.mlp.image_fused_moe.gate.weight
ernie.layers.21.mlp.shared_experts.up_gate_proj.weight
ernie.layers.21.mlp.shared_experts.up_gate_proj.weight_scale
@@ -857,15 +857,15 @@ ernie.layers.22.self_attn.qkv_proj.weight_scale
ernie.layers.22.self_attn.o_proj.weight
ernie.layers.22.self_attn.o_proj.weight_scale
ernie.layers.22.mlp.gate_correction_bias
ernie.layers.22.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.22.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.22.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.22.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.22.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.22.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.22.mlp.text_fused_moe.gate.weight
ernie.layers.22.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.22.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.22.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.22.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.22.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.22.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.22.mlp.image_fused_moe.gate.weight
ernie.layers.22.mlp.shared_experts.up_gate_proj.weight
ernie.layers.22.mlp.shared_experts.up_gate_proj.weight_scale
@@ -878,15 +878,15 @@ ernie.layers.23.self_attn.qkv_proj.weight_scale
ernie.layers.23.self_attn.o_proj.weight
ernie.layers.23.self_attn.o_proj.weight_scale
ernie.layers.23.mlp.gate_correction_bias
ernie.layers.23.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.23.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.23.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.23.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.23.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.23.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.23.mlp.text_fused_moe.gate.weight
ernie.layers.23.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.23.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.23.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.23.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.23.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.23.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.23.mlp.image_fused_moe.gate.weight
ernie.layers.23.mlp.shared_experts.up_gate_proj.weight
ernie.layers.23.mlp.shared_experts.up_gate_proj.weight_scale
@@ -899,15 +899,15 @@ ernie.layers.24.self_attn.qkv_proj.weight_scale
ernie.layers.24.self_attn.o_proj.weight
ernie.layers.24.self_attn.o_proj.weight_scale
ernie.layers.24.mlp.gate_correction_bias
ernie.layers.24.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.24.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.24.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.24.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.24.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.24.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.24.mlp.text_fused_moe.gate.weight
ernie.layers.24.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.24.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.24.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.24.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.24.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.24.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.24.mlp.image_fused_moe.gate.weight
ernie.layers.24.mlp.shared_experts.up_gate_proj.weight
ernie.layers.24.mlp.shared_experts.up_gate_proj.weight_scale
@@ -920,15 +920,15 @@ ernie.layers.25.self_attn.qkv_proj.weight_scale
ernie.layers.25.self_attn.o_proj.weight
ernie.layers.25.self_attn.o_proj.weight_scale
ernie.layers.25.mlp.gate_correction_bias
ernie.layers.25.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.25.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.25.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.25.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.25.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.25.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.25.mlp.text_fused_moe.gate.weight
ernie.layers.25.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.25.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.25.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.25.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.25.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.25.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.25.mlp.image_fused_moe.gate.weight
ernie.layers.25.mlp.shared_experts.up_gate_proj.weight
ernie.layers.25.mlp.shared_experts.up_gate_proj.weight_scale
@@ -941,15 +941,15 @@ ernie.layers.26.self_attn.qkv_proj.weight_scale
ernie.layers.26.self_attn.o_proj.weight
ernie.layers.26.self_attn.o_proj.weight_scale
ernie.layers.26.mlp.gate_correction_bias
ernie.layers.26.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.26.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.26.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.26.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.26.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.26.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.26.mlp.text_fused_moe.gate.weight
ernie.layers.26.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.26.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.26.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.26.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.26.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.26.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.26.mlp.image_fused_moe.gate.weight
ernie.layers.26.mlp.shared_experts.up_gate_proj.weight
ernie.layers.26.mlp.shared_experts.up_gate_proj.weight_scale
@@ -962,15 +962,15 @@ ernie.layers.27.self_attn.qkv_proj.weight_scale
ernie.layers.27.self_attn.o_proj.weight
ernie.layers.27.self_attn.o_proj.weight_scale
ernie.layers.27.mlp.gate_correction_bias
ernie.layers.27.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.27.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.27.mlp.text_fused_moe.experts.up_gate_proj_weight
ernie.layers.27.mlp.text_fused_moe.experts.down_proj_weight
ernie.layers.27.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.27.mlp.text_fused_moe.experts.down_proj_weight_scale
ernie.layers.27.mlp.text_fused_moe.gate.weight
ernie.layers.27.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.27.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.27.mlp.image_fused_moe.experts.up_gate_proj_weight
ernie.layers.27.mlp.image_fused_moe.experts.down_proj_weight
ernie.layers.27.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
ernie.layers.27.mlp.image_fused_moe.experts.down_proj_weight_scale
ernie.layers.27.mlp.image_fused_moe.gate.weight
ernie.layers.27.mlp.shared_experts.up_gate_proj.weight
ernie.layers.27.mlp.shared_experts.up_gate_proj.weight_scale