Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-09-26 20:41:53 +08:00
[V1 Loader] Support param create and load for wint2 and xpu backend (#3581)
* support wint2 backend
* [V1 Loader] support param create and load for wint2 and xpu backend
* update weight shape name
* update
* update
* update baseline.txt
* update model name
* update baseline.txt
* fix codestyle
* remove debug code
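The change follows the v1 loader's two-phase contract: create_weights pre-allocates zero-initialized parameters at their final shapes, and process_loaded_weights later copies checkpoint tensors into them with set_value. A minimal sketch of that contract, with made-up layer names and sizes (not FastDeploy code):

# Minimal sketch (not FastDeploy code) of the two-phase pattern this PR
# applies to the wint2 and XPU MoE backends: create_weights() pre-allocates
# empty parameters, and process_loaded_weights() fills them via set_value().
import paddle
from paddle import nn


class DemoMoELayer(nn.Layer):
    def __init__(self, num_local_experts=2, hidden_size=8, moe_intermediate_size=16):
        super().__init__()
        self.num_local_experts = num_local_experts
        self.hidden_size = hidden_size
        self.moe_intermediate_size = moe_intermediate_size

    def create_weights(self):
        # Phase 1: allocate a zero-filled parameter so the checkpoint loader
        # has a destination tensor to copy into (no weights read yet).
        self.up_gate_proj_weight = self.create_parameter(
            shape=[self.num_local_experts, self.hidden_size, self.moe_intermediate_size * 2],
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )

    def process_loaded_weights(self, loaded):
        # Phase 2: copy checkpoint tensors into the pre-created parameter.
        self.up_gate_proj_weight.set_value(loaded["up_gate_proj_weight"])


layer = DemoMoELayer()
layer.create_weights()
layer.process_loaded_weights({"up_gate_proj_weight": paddle.ones([2, 8, 32], dtype="float32")})
print(layer.up_gate_proj_weight.shape)  # [2, 8, 32]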
@@ -1,44 +0,0 @@
-"""
-# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-
-from fastdeploy.model_executor.layers.moe.fused_moe_backend_base import MoEMethodBase
-from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
-    CutlassMoEMethod,
-)
-from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
-    DeepGemmFusedMoeMethod,
-)
-from fastdeploy.model_executor.layers.moe.fused_moe_marlin_backend import (
-    MarlinWeightOnlyMoEMethod,
-)
-from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
-    BlockWiseFP8MoEMethod,
-    TensorWiseFP8MoEMethod,
-    TritonWeightOnlyMoEMethod,
-)
-
-pre_create_weights_list = (
-    CutlassMoEMethod,
-    TensorWiseFP8MoEMethod,
-    BlockWiseFP8MoEMethod,
-    TritonWeightOnlyMoEMethod,
-    DeepGemmFusedMoeMethod,
-    MarlinWeightOnlyMoEMethod,
-)
-
-
-def is_supported_moe_backend(quant_method: MoEMethodBase):
-    return isinstance(quant_method, pre_create_weights_list)
@@ -145,12 +145,12 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):

         up_gate_proj_weight_name = self.added_weight_attrs[0]
         down_proj_weight_name = self.added_weight_attrs[1]
-        self.ffn1_weight_shape = [
+        self.up_gate_proj_weight_shape = [
             layer.num_local_experts,
             layer.hidden_size // 16,
             layer.moe_intermediate_size * 4,
         ]
-        self.ffn2_weight_shape = [
+        self.down_proj_weight_shape = [
             layer.num_local_experts,
             layer.moe_intermediate_size // 16,
             layer.hidden_size * 2,
@@ -159,7 +159,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):
             layer,
             up_gate_proj_weight_name,
             layer.create_parameter(
-                shape=self.ffn1_weight_shape,
+                shape=self.up_gate_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
@@ -168,7 +168,7 @@ class MarlinWeightOnlyMoEMethod(QuantMethodBase):
             layer,
             down_proj_weight_name,
             layer.create_parameter(
-                shape=self.ffn2_weight_shape,
+                shape=self.down_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
@@ -61,12 +61,12 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
         self.default_dtype = layer._helper.get_default_dtype()
         up_gate_proj_weight_name = self.added_weight_attrs[0]
         down_proj_weight_name = self.added_weight_attrs[1]
-        self.ffn1_weight_shape = [
+        self.up_gate_proj_weight_shape = [
             layer.num_local_experts,
             layer.hidden_size,
             layer.moe_intermediate_size * 2,
         ]
-        self.ffn2_weight_shape = [
+        self.down_proj_weight_shape = [
             layer.num_local_experts,
             layer.moe_intermediate_size,
             layer.hidden_size,
@@ -75,7 +75,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
             layer,
             up_gate_proj_weight_name,
             layer.create_parameter(
-                shape=self.ffn1_weight_shape,
+                shape=self.up_gate_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
@@ -84,7 +84,7 @@ class TritonWeightOnlyMoEMethod(QuantMethodBase):
             layer,
             down_proj_weight_name,
             layer.create_parameter(
-                shape=self.ffn2_weight_shape,
+                shape=self.down_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
@@ -364,12 +364,12 @@ class TensorWiseFP8MoEMethod(QuantMethodBase):
         self.default_dtype = layer._helper.get_default_dtype()
         up_gate_proj_weight_name = self.added_wfp8afp8_attrs[0]
         down_proj_weight_name = self.added_wfp8afp8_attrs[1]
-        self.ffn1_weight_shape = [
+        self.up_gate_proj_weight_shape = [
             layer.num_local_experts,
             layer.moe_intermediate_size * 2,
             layer.hidden_size,
         ]
-        self.ffn2_weight_shape = [
+        self.down_proj_weight_shape = [
             layer.num_local_experts,
             layer.hidden_size,
             layer.moe_intermediate_size,
@@ -378,7 +378,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase):
             layer,
             up_gate_proj_weight_name,
             layer.create_parameter(
-                shape=self.ffn1_weight_shape,
+                shape=self.up_gate_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
@@ -387,7 +387,7 @@ class TensorWiseFP8MoEMethod(QuantMethodBase):
             layer,
             down_proj_weight_name,
             layer.create_parameter(
-                shape=self.ffn2_weight_shape,
+                shape=self.down_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
@@ -22,7 +22,7 @@ from fastdeploy.distributed.communication import tensor_model_parallel_all_reduc
 from fastdeploy.utils import ceil_div

 from ..quantization.quant_base import QuantMethodBase
-from ..utils import create_and_set_parameter, get_tensor
+from ..utils import get_tensor


 class Wint2MoeMethod(QuantMethodBase):
@@ -33,6 +33,11 @@ class Wint2MoeMethod(QuantMethodBase):
     def __init__(self, quant_config):
         super().__init__()
         self.moe_quant_type = quant_config.moe_quant_type
+        self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
+        self.added_scale_attrs = [
+            "up_gate_proj_weight_scale",
+            "down_proj_weight_scale",
+        ]

     def process_loaded_weights(self, layer, weights) -> None:
         """
@@ -51,11 +56,102 @@ class Wint2MoeMethod(QuantMethodBase):
             len(down_proj_weights) == layer.num_local_experts
         ), "down_proj_weights length should be equal to num_local_experts."

-    def create_weights(self, layer: nn.Layer, state_dict):
+    def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         """
         Paddle cutlass create weight process.
         """
-        pass
+        self.weight_dtype = "uint8"
+        self.default_dtype = layer._helper.get_default_dtype()
+        setattr(
+            layer,
+            "up_gate_proj_weight",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.hidden_size // 4, layer.moe_intermediate_size * 2],
+                dtype=self.weight_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "down_proj_weight",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.moe_intermediate_size // 4, layer.hidden_size],
+                dtype=self.weight_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "up_gate_proj_weight_scale",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.hidden_size // 128, layer.moe_intermediate_size * 2],
+                dtype=self.weight_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "down_proj_weight_scale",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.moe_intermediate_size // 128, layer.hidden_size],
+                dtype=self.weight_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "up_gate_proj_super_scales",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
+                dtype=self.default_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "down_proj_super_scales",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.hidden_size],
+                dtype=self.default_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "up_gate_proj_code_scale",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
+                dtype="float32",
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "down_proj_code_scale",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.hidden_size],
+                dtype="float32",
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "up_gate_proj_code_zp",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
+                dtype="float32",
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            "down_proj_code_zp",
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.hidden_size],
+                dtype="float32",
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )


 class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
@@ -65,7 +161,6 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod):

     def __init__(self, quant_config):
         super().__init__(quant_config)
-        self.moe_quant_type = quant_config.moe_quant_type

     def process_loaded_weights(self, layer, weights) -> None:
         """
@@ -159,13 +254,7 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod):
             "down_proj_code_zp": down_proj_code_zp,
         }
         for name, tensor in name_tensor_map.items():
-            create_and_set_parameter(layer, name, tensor)
-
-    def create_weights(self, layer: nn.Layer, state_dict):
-        """
-        Paddle cutlass create weight process.
-        """
-        pass
+            getattr(layer, name).set_value(tensor)

     def apply(
         self,
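The wint2 shapes above follow directly from the packing: four 2-bit weights fit in one uint8 byte, so the packed reduction dimension is hidden_size // 4, and per-group scales use a 128-element group size (hidden_size // 128 rows). A quick arithmetic sketch with illustrative sizes:

# Arithmetic behind the wint2 parameter shapes created above. Four 2-bit
# weights pack into one uint8 byte; scales are per 128-element group along
# the reduction dimension. All sizes here are illustrative.
num_local_experts = 4
hidden_size = 1024
moe_intermediate_size = 2048

packed_k = hidden_size // 4      # 2-bit packing: 4 weights per uint8 byte
scale_rows = hidden_size // 128  # one scale row per 128-element group

up_gate_proj_weight_shape = [num_local_experts, packed_k, moe_intermediate_size * 2]
up_gate_proj_scale_shape = [num_local_experts, scale_rows, moe_intermediate_size * 2]
print(up_gate_proj_weight_shape)  # [4, 256, 4096]
print(up_gate_proj_scale_shape)   # [4, 8, 4096]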
@@ -14,8 +14,6 @@
 # limitations under the License.
 """

-from typing import Dict
-
 import paddle
 from paddle import nn

@@ -114,11 +112,86 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
         super().__init__()
         self.quant_config = quant_config
         self.moe_quant_type = self.quant_config.algo
+        self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
+        self.added_scale_attrs = [
+            "up_gate_proj_weight_scale",
+            "down_proj_weight_scale",
+        ]

-    def create_weights(self, layer: nn.Layer, state_dict: Dict[str, paddle.Tensor]):
+    def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         """
         Paddle cutlass create weight process.
         """
+        self.default_dtype = "float32"
+        self.weight_dtype = "int8"
+
+        if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size // 2,
+            ]
+        else:
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+        if self.moe_quant_type in ["weight_only_int4", "w4a8"]:
+            self.down_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size // 2,
+            ]
+        else:
+            self.down_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size,
+            ]
+
+        setattr(
+            layer,
+            self.added_weight_attrs[0],
+            layer.create_parameter(
+                shape=self.up_gate_proj_weight_shape,
+                dtype=self.weight_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            self.added_weight_attrs[1],
+            layer.create_parameter(
+                shape=self.down_proj_weight_shape,
+                dtype=self.weight_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        # weight_scale
+        setattr(
+            layer,
+            self.added_scale_attrs[0],
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.moe_intermediate_size * 2],
+                dtype=self.default_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+        setattr(
+            layer,
+            self.added_scale_attrs[1],
+            layer.create_parameter(
+                shape=[layer.num_local_experts, layer.hidden_size],
+                dtype=self.default_dtype,
+                default_initializer=paddle.nn.initializer.Constant(0),
+            ),
+        )
+
+    def process_loaded_weights(self, layer: nn.Layer, state_dict):
+        """
+        Paddle xpu load weight process.
+        """
         up_gate_proj_weights, down_proj_weights, _, _ = layer.extract_moe_ffn_weights(state_dict)
         assert len(up_gate_proj_weights) == layer.num_local_experts
         assert len(down_proj_weights) == layer.num_local_experts
@@ -131,15 +204,9 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
             layer.hidden_size,
         ]

-        added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
-        added_scale_attrs = [
-            "up_gate_proj_weight_scale",
-            "down_proj_weight_scale",
-        ]
-
         for idx, weight_tensor in enumerate([up_gate_proj_weights, down_proj_weights]):
-            weight_name = added_weight_attrs[idx]
-            scale_name = added_scale_attrs[idx]
+            weight_name = self.added_weight_attrs[idx]
+            scale_name = self.added_scale_attrs[idx]

             weight_list = []
             weight_scale_list = []
@@ -150,26 +217,9 @@ class XPUWeightOnlyMoEMethod(QuantMethodBase):
                 weight_list.append(quant_weight.transpose([1, 0]))  # transpose weight to [n,k]
                 weight_scale_list.append(scale)
             quanted_weight = paddle.stack(weight_list, axis=0)
-            setattr(
-                layer,
-                weight_name,
-                layer.create_parameter(
-                    shape=quanted_weight.shape,
-                    dtype=quanted_weight.dtype,
-                    default_initializer=paddle.nn.initializer.Constant(0),
-                ),
-            )
             getattr(layer, weight_name).set_value(quanted_weight)

             quanted_weight_scale = paddle.stack(weight_scale_list, axis=0)
-            setattr(
-                layer,
-                scale_name,
-                layer.create_parameter(
-                    shape=quanted_weight_scale.shape,
-                    dtype=quanted_weight_scale.dtype,
-                ),
-            )
             getattr(layer, scale_name).set_value(quanted_weight_scale)

     def apply(
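In the XPU path above, each expert's quantized [n, k] weight is stacked along a new expert axis and copied into the parameter that create_weights pre-created. A minimal sketch of that stack-and-set step, using stand-in tensors and float32 in place of the backend's int8:

# Minimal sketch of the stack-and-set step used by the XPU backend above:
# per-expert [n, k] tensors become one [num_experts, n, k] tensor, which is
# copied into a pre-created parameter via set_value(). Stand-in sizes/dtype.
import paddle

num_local_experts, n, k = 2, 6, 4
per_expert = [paddle.ones([n, k]) for _ in range(num_local_experts)]
quanted_weight = paddle.stack(per_expert, axis=0)  # shape [2, 6, 4]

dest = paddle.create_parameter(
    shape=[num_local_experts, n, k],
    dtype="float32",
    default_initializer=paddle.nn.initializer.Constant(0),
)
dest.set_value(quanted_weight)
print(dest.shape)  # [2, 6, 4]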
@@ -27,17 +27,11 @@ from fastdeploy.model_executor.utils import slice_fn
 from fastdeploy.platforms import current_platform
 from fastdeploy.worker.experts_manager import RedundantExpertManger

-# TODO(lulinjun): remove this import after supporting all backends
-is_supported_moe_backend = None
-if current_platform.is_cuda():
-    from .check_backend_supported import is_supported_moe_backend
-

 def get_moe_method():
     """
     return moe method based on device platform
     """
-    from fastdeploy.platforms import current_platform
-
     if current_platform.is_cuda():
         from .fused_moe_cutlass_backend import CutlassMoEMethod
@@ -152,24 +146,12 @@ class FusedMoE(nn.Layer):
         if self.ep_size > 1:
             self.quant_method.init_ep(self)

-        if fd_config.load_config.dynamic_load_weight:
-            # It's for RL to build model
-            self.init_moe_weights()
-        else:
-            if gate_correction_bias is not None:
-                self.gate_correction_bias = gate_correction_bias
-            else:
-                self.gate_correction_bias = None
-            if moe_quant_config:
-                if (
-                    moe_quant_config
-                    and is_supported_moe_backend is not None
-                    and is_supported_moe_backend(self.quant_method)
-                ):
-                    self.quant_method.create_weights(self, weight_loader=self.weight_loader)
-                else:
-                    # w_fp16 a_fp16
-                    self.quant_method.create_weights(self, weight_loader=self.weight_loader)
+        # Merge normal and RL build model
+        if gate_correction_bias is not None:
+            self.gate_correction_bias = gate_correction_bias
+        else:
+            self.gate_correction_bias = None
+        self.quant_method.create_weights(self, weight_loader=self.weight_loader)

         logger.info(
             f"{moe_tag}MoE config is {num_experts=}[{expert_id_offset}, {expert_id_offset + self.num_local_experts}), \
@@ -179,7 +161,6 @@ class FusedMoE(nn.Layer):
         )

     def weight_loader(self, param, loaded_weight, expert_id, shard_id: Optional[str] = None):
-        from fastdeploy.platforms import current_platform

         if hasattr(param, "SHARD_ID_TO_SHARDED_DIM"):
             SHARD_ID_TO_SHARDED_DIM = param.SHARD_ID_TO_SHARDED_DIM
@@ -332,86 +313,6 @@ class FusedMoE(nn.Layer):
             for shard_id, weight_name in param_name_maping
         ]

-    def init_moe_weights(self):
-        """
-        Initialize the weight shapes and parameters for the MoE layer.
-        Combines weight shape initialization and parameter creation into a single function.
-        """
-        # Initialize weight shapes
-        up_gate_proj_output_dim = self.moe_intermediate_size * 2
-        if self.moe_quant_type in ["block_wise_fp8", "wint8"]:
-            up_gate_proj_weight_shape = [
-                self.num_local_experts,
-                up_gate_proj_output_dim,
-                self.hidden_size,
-            ]
-            down_proj_weight_shape = [
-                self.num_local_experts,
-                self.hidden_size,
-                self.moe_intermediate_size,
-            ]
-        else:
-            up_gate_proj_weight_shape = [
-                self.num_local_experts,
-                self.hidden_size,
-                up_gate_proj_output_dim,
-            ]
-            down_proj_weight_shape = [
-                self.num_local_experts,
-                self.moe_intermediate_size,
-                self.hidden_size,
-            ]
-
-        # Create parameters
-        if self.moe_quant_type == "block_wise_fp8":
-            # (TODO:gaoziyuan)
-            self.weight_dtype = "float8_e4m3fn"
-            self.init_block_wise_fp8_scale()
-        elif self.moe_quant_type == "wint8":
-            self.weight_dtype = "int8"
-            self.init_weight_only_scale()
-
-        # up_gate_proj parameters
-        self.up_gate_proj_weight = self.create_parameter(
-            shape=up_gate_proj_weight_shape,
-            dtype=self.weight_dtype,
-            default_initializer=paddle.nn.initializer.Constant(0),
-        )
-        # down_proj parameters
-        self.down_proj_weight = self.create_parameter(
-            shape=down_proj_weight_shape,
-            dtype=self.weight_dtype,
-            default_initializer=paddle.nn.initializer.Constant(0),
-        )
-
-    def init_weight_only_scale(self):
-        """
-        Initialize the weight scale.
-        """
-        self.up_gate_proj_weight_scale = self.create_parameter(
-            shape=[self.num_local_experts, self.moe_intermediate_size * 2],
-            dtype=self._dtype,
-        )
-        self.down_proj_weight_scale = self.create_parameter(
-            shape=[self.num_local_experts, self.hidden_size],
-            dtype=self._dtype,
-        )
-
-    def init_block_wise_fp8_scale(self):
-        """
-        Initialize the weight scale.
-        """
-        self.up_gate_proj_weight_scale = self.create_parameter(
-            shape=[self.num_local_experts, self.moe_intermediate_size * 2 // 128, self.hidden_size // 128],
-            dtype="float32",
-            is_bias=False,
-        )
-        self.down_proj_weight_scale = self.create_parameter(
-            shape=[self.num_local_experts, self.hidden_size // 128, self.moe_intermediate_size // 128],
-            dtype="float32",
-            is_bias=False,
-        )
-
     def load_experts_weight(
         self,
         state_dict: dict,
@@ -560,26 +461,13 @@ class FusedMoE(nn.Layer):
         """
         load_state_dict function.
         """
-        if is_supported_moe_backend is not None and is_supported_moe_backend(self.quant_method):
-            if self.fd_config.model_config.is_quantized:
-                if getattr(self.fd_config.quant_config, "is_permuted", True):
-                    self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
-                else:
-                    self.quant_method.process_loaded_weights(self, state_dict)
-            else:
-                self.quant_method.process_loaded_weights(self, state_dict)
-        else:
-            if self.fd_config.model_config.is_quantized:
-                if getattr(self.fd_config.quant_config, "is_permuted", True):
-                    self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
-                else:
-                    self.quant_method.create_weights(self, state_dict)
-            else:
-                if self.moe_quant_config:
-                    self.quant_method.create_weights(self, state_dict)
-                else:
-                    # w_fp16 a_fp16
-                    self.quant_method.process_loaded_weights(self, state_dict)
+        if self.fd_config.model_config.is_quantized:
+            if getattr(self.fd_config.quant_config, "is_permuted", True):
+                self.quant_method.process_prequanted_weights(self, state_dict, is_rearrange)
+            else:
+                self.quant_method.process_loaded_weights(self, state_dict)
+        else:
+            self.quant_method.process_loaded_weights(self, state_dict)

     def forward(self, x: paddle.Tensor, gate: nn.Layer):
         """
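With the branches above collapsed, every backend now goes through create_weights(self, weight_loader=self.weight_loader). A generic sketch of that callback contract, an assumption about the shape of the API rather than the FastDeploy implementation: the loader is attached per parameter so a checkpoint walker can route each expert's tensor into its slice.

# Generic sketch of a per-parameter weight_loader contract, assuming an API
# shaped like the one referenced above (this is not the FastDeploy code).
import paddle
from paddle import nn


class TinyMoE(nn.Layer):
    def __init__(self, num_local_experts=2):
        super().__init__()
        self.num_local_experts = num_local_experts
        self.up_gate_proj_weight = self.create_parameter(
            shape=[num_local_experts, 4, 8],
            dtype="float32",
            default_initializer=paddle.nn.initializer.Constant(0),
        )
        # Mirror create_weights(self, weight_loader=...): attach the callback.
        self.up_gate_proj_weight.weight_loader = self.weight_loader

    def weight_loader(self, param, loaded_weight, expert_id):
        # Copy one expert's tensor into its slice of the stacked parameter.
        with paddle.no_grad():
            param[expert_id] = loaded_weight


moe = TinyMoE()
for expert_id in range(moe.num_local_experts):
    moe.up_gate_proj_weight.weight_loader(moe.up_gate_proj_weight, paddle.ones([4, 8]), expert_id)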
@@ -1752,7 +1752,6 @@ class GPUModelRunner(ModelRunnerBase):
             token_type_ids_w_video = token_type_ids
             input_ids = inputs["input_ids"]
             # convert to img patch id
-            # TODO(lulinjun): may need to check model_config and model_cfg
             image_mask = input_ids == self.model_config.im_patch_id
             image_type_ids = inputs["image_type_ids"]
             with paddle.amp.auto_cast(
@@ -1664,7 +1664,6 @@ class MetaxModelRunner(ModelRunnerBase):
             token_type_ids_w_video = token_type_ids
             input_ids = inputs["input_ids"]
             # convert to img patch id
-            # TODO(lulinjun): may need to check model_config and model_cfg
            image_mask = input_ids == self.model_config.im_patch_id
            image_type_ids = inputs["image_type_ids"]
            with paddle.amp.auto_cast(
@@ -416,15 +416,15 @@ ernie.layers.1.self_attn.qkv_proj.weight_scale
 ernie.layers.1.self_attn.o_proj.weight
 ernie.layers.1.self_attn.o_proj.weight_scale
 ernie.layers.1.mlp.gate_correction_bias
-ernie.layers.1.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.1.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.1.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.1.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.1.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.1.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.1.mlp.text_fused_moe.gate.weight
-ernie.layers.1.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.1.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.1.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.1.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.1.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.1.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.1.mlp.image_fused_moe.gate.weight
 ernie.layers.1.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.1.mlp.shared_experts.up_gate_proj.weight_scale
@@ -437,15 +437,15 @@ ernie.layers.2.self_attn.qkv_proj.weight_scale
 ernie.layers.2.self_attn.o_proj.weight
 ernie.layers.2.self_attn.o_proj.weight_scale
 ernie.layers.2.mlp.gate_correction_bias
-ernie.layers.2.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.2.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.2.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.2.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.2.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.2.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.2.mlp.text_fused_moe.gate.weight
-ernie.layers.2.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.2.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.2.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.2.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.2.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.2.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.2.mlp.image_fused_moe.gate.weight
 ernie.layers.2.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.2.mlp.shared_experts.up_gate_proj.weight_scale
@@ -458,15 +458,15 @@ ernie.layers.3.self_attn.qkv_proj.weight_scale
 ernie.layers.3.self_attn.o_proj.weight
 ernie.layers.3.self_attn.o_proj.weight_scale
 ernie.layers.3.mlp.gate_correction_bias
-ernie.layers.3.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.3.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.3.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.3.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.3.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.3.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.3.mlp.text_fused_moe.gate.weight
-ernie.layers.3.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.3.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.3.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.3.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.3.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.3.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.3.mlp.image_fused_moe.gate.weight
 ernie.layers.3.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.3.mlp.shared_experts.up_gate_proj.weight_scale
@@ -479,15 +479,15 @@ ernie.layers.4.self_attn.qkv_proj.weight_scale
 ernie.layers.4.self_attn.o_proj.weight
 ernie.layers.4.self_attn.o_proj.weight_scale
 ernie.layers.4.mlp.gate_correction_bias
-ernie.layers.4.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.4.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.4.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.4.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.4.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.4.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.4.mlp.text_fused_moe.gate.weight
-ernie.layers.4.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.4.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.4.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.4.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.4.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.4.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.4.mlp.image_fused_moe.gate.weight
 ernie.layers.4.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.4.mlp.shared_experts.up_gate_proj.weight_scale
@@ -500,15 +500,15 @@ ernie.layers.5.self_attn.qkv_proj.weight_scale
 ernie.layers.5.self_attn.o_proj.weight
 ernie.layers.5.self_attn.o_proj.weight_scale
 ernie.layers.5.mlp.gate_correction_bias
-ernie.layers.5.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.5.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.5.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.5.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.5.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.5.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.5.mlp.text_fused_moe.gate.weight
-ernie.layers.5.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.5.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.5.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.5.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.5.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.5.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.5.mlp.image_fused_moe.gate.weight
 ernie.layers.5.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.5.mlp.shared_experts.up_gate_proj.weight_scale
@@ -521,15 +521,15 @@ ernie.layers.6.self_attn.qkv_proj.weight_scale
 ernie.layers.6.self_attn.o_proj.weight
 ernie.layers.6.self_attn.o_proj.weight_scale
 ernie.layers.6.mlp.gate_correction_bias
-ernie.layers.6.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.6.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.6.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.6.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.6.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.6.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.6.mlp.text_fused_moe.gate.weight
-ernie.layers.6.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.6.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.6.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.6.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.6.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.6.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.6.mlp.image_fused_moe.gate.weight
 ernie.layers.6.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.6.mlp.shared_experts.up_gate_proj.weight_scale
@@ -542,15 +542,15 @@ ernie.layers.7.self_attn.qkv_proj.weight_scale
 ernie.layers.7.self_attn.o_proj.weight
 ernie.layers.7.self_attn.o_proj.weight_scale
 ernie.layers.7.mlp.gate_correction_bias
-ernie.layers.7.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.7.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.7.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.7.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.7.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.7.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.7.mlp.text_fused_moe.gate.weight
-ernie.layers.7.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.7.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.7.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.7.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.7.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.7.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.7.mlp.image_fused_moe.gate.weight
 ernie.layers.7.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.7.mlp.shared_experts.up_gate_proj.weight_scale
@@ -563,15 +563,15 @@ ernie.layers.8.self_attn.qkv_proj.weight_scale
 ernie.layers.8.self_attn.o_proj.weight
 ernie.layers.8.self_attn.o_proj.weight_scale
 ernie.layers.8.mlp.gate_correction_bias
-ernie.layers.8.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.8.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.8.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.8.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.8.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.8.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.8.mlp.text_fused_moe.gate.weight
-ernie.layers.8.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.8.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.8.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.8.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.8.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.8.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.8.mlp.image_fused_moe.gate.weight
 ernie.layers.8.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.8.mlp.shared_experts.up_gate_proj.weight_scale
@@ -584,15 +584,15 @@ ernie.layers.9.self_attn.qkv_proj.weight_scale
 ernie.layers.9.self_attn.o_proj.weight
 ernie.layers.9.self_attn.o_proj.weight_scale
 ernie.layers.9.mlp.gate_correction_bias
-ernie.layers.9.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.9.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.9.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.9.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.9.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.9.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.9.mlp.text_fused_moe.gate.weight
-ernie.layers.9.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.9.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.9.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.9.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.9.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.9.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.9.mlp.image_fused_moe.gate.weight
 ernie.layers.9.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.9.mlp.shared_experts.up_gate_proj.weight_scale
@@ -605,15 +605,15 @@ ernie.layers.10.self_attn.qkv_proj.weight_scale
 ernie.layers.10.self_attn.o_proj.weight
 ernie.layers.10.self_attn.o_proj.weight_scale
 ernie.layers.10.mlp.gate_correction_bias
-ernie.layers.10.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.10.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.10.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.10.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.10.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.10.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.10.mlp.text_fused_moe.gate.weight
-ernie.layers.10.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.10.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.10.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.10.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.10.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.10.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.10.mlp.image_fused_moe.gate.weight
 ernie.layers.10.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.10.mlp.shared_experts.up_gate_proj.weight_scale
@@ -626,15 +626,15 @@ ernie.layers.11.self_attn.qkv_proj.weight_scale
 ernie.layers.11.self_attn.o_proj.weight
 ernie.layers.11.self_attn.o_proj.weight_scale
 ernie.layers.11.mlp.gate_correction_bias
-ernie.layers.11.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.11.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.11.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.11.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.11.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.11.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.11.mlp.text_fused_moe.gate.weight
-ernie.layers.11.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.11.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.11.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.11.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.11.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.11.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.11.mlp.image_fused_moe.gate.weight
 ernie.layers.11.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.11.mlp.shared_experts.up_gate_proj.weight_scale
@@ -647,15 +647,15 @@ ernie.layers.12.self_attn.qkv_proj.weight_scale
 ernie.layers.12.self_attn.o_proj.weight
 ernie.layers.12.self_attn.o_proj.weight_scale
 ernie.layers.12.mlp.gate_correction_bias
-ernie.layers.12.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.12.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.12.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.12.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.12.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.12.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.12.mlp.text_fused_moe.gate.weight
-ernie.layers.12.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.12.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.12.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.12.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.12.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.12.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.12.mlp.image_fused_moe.gate.weight
 ernie.layers.12.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.12.mlp.shared_experts.up_gate_proj.weight_scale
@@ -668,15 +668,15 @@ ernie.layers.13.self_attn.qkv_proj.weight_scale
 ernie.layers.13.self_attn.o_proj.weight
 ernie.layers.13.self_attn.o_proj.weight_scale
 ernie.layers.13.mlp.gate_correction_bias
-ernie.layers.13.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.13.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.13.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.13.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.13.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.13.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.13.mlp.text_fused_moe.gate.weight
-ernie.layers.13.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.13.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.13.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.13.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.13.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.13.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.13.mlp.image_fused_moe.gate.weight
 ernie.layers.13.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.13.mlp.shared_experts.up_gate_proj.weight_scale
@@ -689,15 +689,15 @@ ernie.layers.14.self_attn.qkv_proj.weight_scale
 ernie.layers.14.self_attn.o_proj.weight
 ernie.layers.14.self_attn.o_proj.weight_scale
 ernie.layers.14.mlp.gate_correction_bias
-ernie.layers.14.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.14.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.14.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.14.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.14.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.14.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.14.mlp.text_fused_moe.gate.weight
-ernie.layers.14.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.14.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.14.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.14.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.14.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.14.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.14.mlp.image_fused_moe.gate.weight
 ernie.layers.14.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.14.mlp.shared_experts.up_gate_proj.weight_scale
@@ -710,15 +710,15 @@ ernie.layers.15.self_attn.qkv_proj.weight_scale
 ernie.layers.15.self_attn.o_proj.weight
 ernie.layers.15.self_attn.o_proj.weight_scale
 ernie.layers.15.mlp.gate_correction_bias
-ernie.layers.15.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.15.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.15.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.15.mlp.text_fused_moe.experts.down_proj_weight
+ernie.layers.15.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.15.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.15.mlp.text_fused_moe.gate.weight
-ernie.layers.15.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.15.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.15.mlp.image_fused_moe.experts.up_gate_proj_weight
 ernie.layers.15.mlp.image_fused_moe.experts.down_proj_weight
+ernie.layers.15.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
+ernie.layers.15.mlp.image_fused_moe.experts.down_proj_weight_scale
 ernie.layers.15.mlp.image_fused_moe.gate.weight
 ernie.layers.15.mlp.shared_experts.up_gate_proj.weight
 ernie.layers.15.mlp.shared_experts.up_gate_proj.weight_scale
@@ -731,15 +731,15 @@ ernie.layers.16.self_attn.qkv_proj.weight_scale
 ernie.layers.16.self_attn.o_proj.weight
 ernie.layers.16.self_attn.o_proj.weight_scale
 ernie.layers.16.mlp.gate_correction_bias
-ernie.layers.16.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
-ernie.layers.16.mlp.text_fused_moe.experts.down_proj_weight_scale
 ernie.layers.16.mlp.text_fused_moe.experts.up_gate_proj_weight
 ernie.layers.16.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.16.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.16.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.16.mlp.text_fused_moe.gate.weight
|
ernie.layers.16.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.16.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.16.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.16.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.16.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.16.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.16.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.16.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.16.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.16.mlp.image_fused_moe.gate.weight
|
ernie.layers.16.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.16.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.16.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.16.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.16.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -752,15 +752,15 @@ ernie.layers.17.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.17.self_attn.o_proj.weight
|
ernie.layers.17.self_attn.o_proj.weight
|
||||||
ernie.layers.17.self_attn.o_proj.weight_scale
|
ernie.layers.17.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.17.mlp.gate_correction_bias
|
ernie.layers.17.mlp.gate_correction_bias
|
||||||
ernie.layers.17.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.17.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.17.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.17.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.17.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.17.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.17.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.17.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.17.mlp.text_fused_moe.gate.weight
|
ernie.layers.17.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.17.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.17.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.17.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.17.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.17.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.17.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.17.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.17.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.17.mlp.image_fused_moe.gate.weight
|
ernie.layers.17.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.17.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.17.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.17.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.17.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -773,15 +773,15 @@ ernie.layers.18.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.18.self_attn.o_proj.weight
|
ernie.layers.18.self_attn.o_proj.weight
|
||||||
ernie.layers.18.self_attn.o_proj.weight_scale
|
ernie.layers.18.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.18.mlp.gate_correction_bias
|
ernie.layers.18.mlp.gate_correction_bias
|
||||||
ernie.layers.18.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.18.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.18.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.18.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.18.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.18.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.18.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.18.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.18.mlp.text_fused_moe.gate.weight
|
ernie.layers.18.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.18.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.18.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.18.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.18.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.18.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.18.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.18.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.18.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.18.mlp.image_fused_moe.gate.weight
|
ernie.layers.18.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.18.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.18.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.18.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.18.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -794,15 +794,15 @@ ernie.layers.19.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.19.self_attn.o_proj.weight
|
ernie.layers.19.self_attn.o_proj.weight
|
||||||
ernie.layers.19.self_attn.o_proj.weight_scale
|
ernie.layers.19.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.19.mlp.gate_correction_bias
|
ernie.layers.19.mlp.gate_correction_bias
|
||||||
ernie.layers.19.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.19.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.19.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.19.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.19.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.19.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.19.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.19.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.19.mlp.text_fused_moe.gate.weight
|
ernie.layers.19.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.19.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.19.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.19.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.19.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.19.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.19.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.19.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.19.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.19.mlp.image_fused_moe.gate.weight
|
ernie.layers.19.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.19.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.19.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.19.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.19.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -815,15 +815,15 @@ ernie.layers.20.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.20.self_attn.o_proj.weight
|
ernie.layers.20.self_attn.o_proj.weight
|
||||||
ernie.layers.20.self_attn.o_proj.weight_scale
|
ernie.layers.20.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.20.mlp.gate_correction_bias
|
ernie.layers.20.mlp.gate_correction_bias
|
||||||
ernie.layers.20.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.20.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.20.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.20.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.20.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.20.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.20.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.20.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.20.mlp.text_fused_moe.gate.weight
|
ernie.layers.20.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.20.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.20.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.20.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.20.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.20.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.20.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.20.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.20.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.20.mlp.image_fused_moe.gate.weight
|
ernie.layers.20.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.20.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.20.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.20.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.20.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -836,15 +836,15 @@ ernie.layers.21.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.21.self_attn.o_proj.weight
|
ernie.layers.21.self_attn.o_proj.weight
|
||||||
ernie.layers.21.self_attn.o_proj.weight_scale
|
ernie.layers.21.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.21.mlp.gate_correction_bias
|
ernie.layers.21.mlp.gate_correction_bias
|
||||||
ernie.layers.21.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.21.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.21.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.21.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.21.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.21.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.21.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.21.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.21.mlp.text_fused_moe.gate.weight
|
ernie.layers.21.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.21.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.21.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.21.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.21.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.21.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.21.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.21.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.21.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.21.mlp.image_fused_moe.gate.weight
|
ernie.layers.21.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.21.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.21.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.21.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.21.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -857,15 +857,15 @@ ernie.layers.22.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.22.self_attn.o_proj.weight
|
ernie.layers.22.self_attn.o_proj.weight
|
||||||
ernie.layers.22.self_attn.o_proj.weight_scale
|
ernie.layers.22.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.22.mlp.gate_correction_bias
|
ernie.layers.22.mlp.gate_correction_bias
|
||||||
ernie.layers.22.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.22.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.22.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.22.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.22.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.22.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.22.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.22.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.22.mlp.text_fused_moe.gate.weight
|
ernie.layers.22.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.22.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.22.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.22.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.22.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.22.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.22.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.22.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.22.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.22.mlp.image_fused_moe.gate.weight
|
ernie.layers.22.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.22.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.22.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.22.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.22.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -878,15 +878,15 @@ ernie.layers.23.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.23.self_attn.o_proj.weight
|
ernie.layers.23.self_attn.o_proj.weight
|
||||||
ernie.layers.23.self_attn.o_proj.weight_scale
|
ernie.layers.23.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.23.mlp.gate_correction_bias
|
ernie.layers.23.mlp.gate_correction_bias
|
||||||
ernie.layers.23.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.23.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.23.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.23.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.23.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.23.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.23.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.23.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.23.mlp.text_fused_moe.gate.weight
|
ernie.layers.23.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.23.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.23.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.23.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.23.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.23.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.23.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.23.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.23.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.23.mlp.image_fused_moe.gate.weight
|
ernie.layers.23.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.23.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.23.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.23.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.23.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -899,15 +899,15 @@ ernie.layers.24.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.24.self_attn.o_proj.weight
|
ernie.layers.24.self_attn.o_proj.weight
|
||||||
ernie.layers.24.self_attn.o_proj.weight_scale
|
ernie.layers.24.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.24.mlp.gate_correction_bias
|
ernie.layers.24.mlp.gate_correction_bias
|
||||||
ernie.layers.24.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.24.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.24.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.24.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.24.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.24.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.24.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.24.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.24.mlp.text_fused_moe.gate.weight
|
ernie.layers.24.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.24.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.24.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.24.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.24.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.24.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.24.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.24.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.24.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.24.mlp.image_fused_moe.gate.weight
|
ernie.layers.24.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.24.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.24.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.24.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.24.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -920,15 +920,15 @@ ernie.layers.25.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.25.self_attn.o_proj.weight
|
ernie.layers.25.self_attn.o_proj.weight
|
||||||
ernie.layers.25.self_attn.o_proj.weight_scale
|
ernie.layers.25.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.25.mlp.gate_correction_bias
|
ernie.layers.25.mlp.gate_correction_bias
|
||||||
ernie.layers.25.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.25.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.25.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.25.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.25.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.25.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.25.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.25.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.25.mlp.text_fused_moe.gate.weight
|
ernie.layers.25.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.25.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.25.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.25.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.25.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.25.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.25.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.25.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.25.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.25.mlp.image_fused_moe.gate.weight
|
ernie.layers.25.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.25.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.25.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.25.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.25.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -941,15 +941,15 @@ ernie.layers.26.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.26.self_attn.o_proj.weight
|
ernie.layers.26.self_attn.o_proj.weight
|
||||||
ernie.layers.26.self_attn.o_proj.weight_scale
|
ernie.layers.26.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.26.mlp.gate_correction_bias
|
ernie.layers.26.mlp.gate_correction_bias
|
||||||
ernie.layers.26.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.26.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.26.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.26.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.26.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.26.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.26.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.26.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.26.mlp.text_fused_moe.gate.weight
|
ernie.layers.26.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.26.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.26.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.26.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.26.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.26.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.26.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.26.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.26.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.26.mlp.image_fused_moe.gate.weight
|
ernie.layers.26.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.26.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.26.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.26.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.26.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||
@@ -962,15 +962,15 @@ ernie.layers.27.self_attn.qkv_proj.weight_scale
|
|||||||
ernie.layers.27.self_attn.o_proj.weight
|
ernie.layers.27.self_attn.o_proj.weight
|
||||||
ernie.layers.27.self_attn.o_proj.weight_scale
|
ernie.layers.27.self_attn.o_proj.weight_scale
|
||||||
ernie.layers.27.mlp.gate_correction_bias
|
ernie.layers.27.mlp.gate_correction_bias
|
||||||
ernie.layers.27.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.27.mlp.text_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.27.mlp.text_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.27.mlp.text_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.27.mlp.text_fused_moe.experts.down_proj_weight
|
ernie.layers.27.mlp.text_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.27.mlp.text_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.27.mlp.text_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.27.mlp.text_fused_moe.gate.weight
|
ernie.layers.27.mlp.text_fused_moe.gate.weight
|
||||||
ernie.layers.27.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
|
||||||
ernie.layers.27.mlp.image_fused_moe.experts.down_proj_weight_scale
|
|
||||||
ernie.layers.27.mlp.image_fused_moe.experts.up_gate_proj_weight
|
ernie.layers.27.mlp.image_fused_moe.experts.up_gate_proj_weight
|
||||||
ernie.layers.27.mlp.image_fused_moe.experts.down_proj_weight
|
ernie.layers.27.mlp.image_fused_moe.experts.down_proj_weight
|
||||||
|
ernie.layers.27.mlp.image_fused_moe.experts.up_gate_proj_weight_scale
|
||||||
|
ernie.layers.27.mlp.image_fused_moe.experts.down_proj_weight_scale
|
||||||
ernie.layers.27.mlp.image_fused_moe.gate.weight
|
ernie.layers.27.mlp.image_fused_moe.gate.weight
|
||||||
ernie.layers.27.mlp.shared_experts.up_gate_proj.weight
|
ernie.layers.27.mlp.shared_experts.up_gate_proj.weight
|
||||||
ernie.layers.27.mlp.shared_experts.up_gate_proj.weight_scale
|
ernie.layers.27.mlp.shared_experts.up_gate_proj.weight_scale
|
||||||