diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
index 73c9b634a..2f6dff80d 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_backend_base.py
@@ -189,12 +189,20 @@ class UnquantizedFusedMoEMethod(MoEMethodBase):
 
     def create_weights(self, layer: nn.Layer, **extra_weight_attrs):
         if current_platform.is_cuda():
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2]
-            self.down_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size, layer.hidden_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.hidden_size,
+                layer.moe_intermediate_size * 2,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 1, "down": 0, "up": 1}}
         else:
-            self.up_gate_proj_weight_shape = [layer.num_experts, layer.moe_intermediate_size * 2, layer.hidden_size]
-            self.down_proj_weight_shape = [layer.num_experts, layer.hidden_size, layer.moe_intermediate_size]
+            self.up_gate_proj_weight_shape = [
+                layer.num_local_experts,
+                layer.moe_intermediate_size * 2,
+                layer.hidden_size,
+            ]
+            self.down_proj_weight_shape = [layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size]
             extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}
 
         layer.up_gate_proj_weight = layer.create_parameter(
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
index e83250d1d..78c011330 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -1054,13 +1054,13 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
 
         if layer.fd_config.load_config.load_choices == "default_v1":
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
 
             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -1167,7 +1167,7 @@ class CutlassWeightOnlyMoEMethod(CutlassMoEMethod):
 
         # 3.quantize weight
 
-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight[expert_id], scale[expert_id] = weight_quantize(
                 getattr(layer, unquantized_weight_name)[expert_id], algo=self.moe_quant_type
             )
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
index b2571b325..8799a9c22 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py
@@ -59,13 +59,13 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
         ]
         if self.quant_config.is_checkpoint_bf16:
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
 
             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -164,7 +164,7 @@ class DeepGemmFusedMoeMethod(MoEMethodBase):
         # 3.quantize weight
         from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8
 
-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight_quant, scale[expert_id] = per_block_cast_to_fp8(
                 getattr(layer, unquantized_weight_name)[expert_id], self.quant_config.weight_block_size
             )
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index 69920649a..4c7b0385c 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -627,13 +627,13 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         ]
         if self.quant_config.is_checkpoint_bf16:
             layer.up_gate_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
+                shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
 
             layer.down_proj_weight = layer.create_parameter(
-                shape=[layer.num_experts, layer.moe_intermediate_size, layer.hidden_size],
+                shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size],
                 dtype=layer.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             )
@@ -732,7 +732,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
         # 3.quantize weight
         from fastdeploy.model_executor.layers.utils import per_block_cast_to_fp8
 
-        for expert_id in range(layer.num_experts):
+        for expert_id in range(layer.num_local_experts):
             weight_quant, scale[expert_id] = per_block_cast_to_fp8(
                 getattr(layer, unquantized_weight_name)[expert_id], self.quant_config.weight_block_size
             )
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index bc58ef3eb..c21cef480 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -299,6 +299,7 @@ class FusedMoE(nn.Layer):
         param_down_proj_name: Optional[str] = None,
         ckpt_expert_key_name: str = "experts",
         experts_offset: int = 0,
+        num_experts_start_offset: int = 0,
     ) -> list[tuple[str, str, int, str]]:
 
         param_name_maping = []
@@ -323,7 +324,9 @@ class FusedMoE(nn.Layer):
                 expert_id,
                 shard_id,
             )
-            for expert_id in range(experts_offset, experts_offset + num_experts)
+            for expert_id in range(
+                experts_offset + num_experts_start_offset, experts_offset + num_experts_start_offset + num_experts
+            )
             for shard_id, weight_name in param_name_maping
         ]
 
diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py
index 2bf8a4bd1..dcfd02c3e 100644
--- a/fastdeploy/model_executor/models/ernie4_5_moe.py
+++ b/fastdeploy/model_executor/models/ernie4_5_moe.py
@@ -543,14 +543,22 @@ class Ernie4_5_MoeForCausalLM(ModelForCasualLM):
 
         expert_params_mapping = []
         if getattr(self.fd_config.model_config, "moe_num_experts", None) is not None:
+            if self.fd_config.parallel_config.expert_parallel_size > 1:
+                num_experts = self.fd_config.parallel_config.num_experts_per_rank
+                num_experts_start_offset = self.fd_config.parallel_config.num_experts_start_offset
+            else:
+                num_experts = self.fd_config.model_config.moe_num_experts
+                num_experts_start_offset = 0
+
             expert_params_mapping = FusedMoE.make_expert_params_mapping(
-                num_experts=self.fd_config.model_config.moe_num_experts,
+                num_experts=num_experts,
                 ckpt_down_proj_name="down_proj",
                 ckpt_gate_up_proj_name="up_gate_proj",
                 ckpt_gate_proj_name="gate_proj",
                 ckpt_up_proj_name="up_proj",
                 param_gate_up_proj_name="experts.up_gate_proj_",
                 param_down_proj_name="experts.down_proj_",
+                num_experts_start_offset=num_experts_start_offset,
            )
 
         all_param_mapping = general_params_mapping + expert_params_mapping
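
For context, the change switches weight allocation and the per-expert quantization loops to layer.num_local_experts and threads num_experts_start_offset through FusedMoE.make_expert_params_mapping, so each expert-parallel rank allocates only its own slice of experts while still reading the globally numbered expert keys from the checkpoint. The sketch below illustrates the intended arithmetic only; it assumes an even, contiguous split of experts across EP ranks, and local_expert_slice is a hypothetical helper written for illustration, not a FastDeploy API.

# Standalone sketch (illustrative only, not FastDeploy code): how a contiguous
# expert partition yields the per-rank values fed into make_expert_params_mapping.
# Assumes moe_num_experts divides evenly by the expert-parallel world size.

def local_expert_slice(moe_num_experts: int, ep_size: int, ep_rank: int) -> tuple[int, int]:
    """Return (num_experts_per_rank, num_experts_start_offset) for one EP rank."""
    num_experts_per_rank = moe_num_experts // ep_size
    num_experts_start_offset = ep_rank * num_experts_per_rank
    return num_experts_per_rank, num_experts_start_offset


# Example: 64 global experts over 4 EP ranks -> rank 2 reads checkpoint experts 32..47
# into local slots 0..15 (its weight tensors have num_local_experts rows, not num_experts).
num_local, start = local_expert_slice(64, 4, 2)
global_ids = range(start, start + num_local)   # expert ids enumerated in the checkpoint mapping
local_slots = range(num_local)                 # rows of the rank-local up_gate_proj/down_proj tensors
print(list(zip(global_ids, local_slots))[:3])  # [(32, 0), (33, 1), (34, 2)]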