diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index c2dcdd216..acea471a8 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -1000,7 +1000,10 @@ class LLMEngine:
             "FLAGS_use_append_attn": 1,
             "NCCL_ALGO": "Ring",
             "FLAGS_max_partition_size": int(os.getenv("FLAGS_max_partition_size", 32768)),
-            "FLAGS_hardamard_moe_block_size": 128,
+            "FLAGS_hardamard_moe_block_size": int(os.getenv("FLAGS_hardamard_moe_block_size", 128)),
+            "FLAGS_hardamard_use_diagonal_block_matrix": int(
+                os.getenv("FLAGS_hardamard_use_diagonal_block_matrix", 0)
+            ),
         }
         # environment variables needed by Dy2St
         variables.update(
diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
index c13b971ea..df56aeb98 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -213,8 +213,11 @@ class CutlassMoEMethod(MoEMethodBase):
         """
         # 1. Select topk experts and weights
         topk_idx, topk_weights = self.ep_decoder_runner.moe_select(layer, gate_out)
+        expertwise_scale = getattr(layer, "up_gate_proj_in_scale_all_experts")
         # 2. EP Dispatch
-        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(x, topk_idx, topk_weights)
+        permute_input, token_nums_per_expert, handle = self.ep_decoder_runner.dispatch(
+            x, topk_idx, topk_weights, expertwise_scale=expertwise_scale
+        )
         # 3. Compute ffn
         if self.moe_quant_type == "w4a8":
             num_local_experts, max_num, _ = permute_input.shape
@@ -376,6 +379,7 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod):
         # 1. Init scale containers and maps
         up_gate_proj_weight_scales = []
         down_proj_weight_scales = []
+        up_gate_proj_in_scales_all_experts = []
         up_gate_proj_in_scales = []
         down_proj_in_scales = []
@@ -396,9 +400,16 @@ class CutlassW4A8MoEMethod(CutlassMoEMethod):
                     raise ValueError(f"scale {name} should not be none in w4a8 mode.")

        # 2. Extract scale tensor from state dict
+        if layer.ep_size > 1:
+            for expert_idx in range(layer.num_experts):
+                scale_tensor = get_tensor(state_dict[scale_key_map["up_gate_proj_in_scale"].format(expert_idx)])
+                up_gate_proj_in_scales_all_experts.append(1 / scale_tensor)
+            create_and_set_parameter(
+                layer, "up_gate_proj_in_scale_all_experts", paddle.concat(up_gate_proj_in_scales_all_experts)
+            )
         for local_expert_idx in range(layer.num_local_experts):
-            expert_idx = local_expert_idx + layer.expert_id_offset * layer.num_local_experts
+            expert_idx = local_expert_idx + layer.expert_id_offset
             for name, scale_key_template in scale_key_map.items():
                 scale_tensor = _extract_scale_tensor(state_dict, scale_key_template, expert_idx)
                 scale_weight_map[name].append(scale_tensor)
diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py
index 574df2159..dcc4b38ce 100644
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -19,6 +19,9 @@ from paddle import nn
 from paddleformers.utils.log import logger

 from fastdeploy import envs
+from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
+    CutlassW4A8MoEMethod,
+)
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.worker.experts_manager import RedundantExpertManger
@@ -385,7 +388,10 @@ class FusedMoE(nn.Layer):
             self.gate_weight.set_value(gate_weight_tensor.astype("float32"))

         if self.fd_config.model_config.is_quantized:
-            self.quant_method.process_prequanted_weights(self, state_dict)
+            if isinstance(self.quant_method, CutlassW4A8MoEMethod):
+                self.quant_method.create_weights(self, state_dict)
+            else:
+                self.quant_method.process_prequanted_weights(self, state_dict)
         else:
             self.quant_method.create_weights(self, state_dict)
diff --git a/fastdeploy/model_executor/load_weight_utils.py b/fastdeploy/model_executor/load_weight_utils.py
index f04dc9b81..2172d0f82 100644
--- a/fastdeploy/model_executor/load_weight_utils.py
+++ b/fastdeploy/model_executor/load_weight_utils.py
@@ -101,12 +101,20 @@ def load_ep_checkpoint(model_path: str, fd_config: FDConfig, return_numpy: bool
             up_gate_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.weight_scale"
             down_proj_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.weight_scale"
+
+            down_proj_in_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.down_proj.activation_scale"
             num_local_ffn_keys.append(up_gate_proj_key)
             num_local_ffn_keys.append(down_proj_key)
             num_local_ffn_keys.append(up_gate_proj_quant_key)
             num_local_ffn_keys.append(down_proj_quant_key)
             num_local_ffn_keys.append(up_gate_proj_scale_key)
             num_local_ffn_keys.append(down_proj_scale_key)
+            num_local_ffn_keys.append(down_proj_in_scale_key)
+
+        # for EP w4a8, we need all expert's activation_scale for up_gate_proj
+        for j in range(fd_config.model_config.moe_num_experts):
+            up_gate_proj_in_scale_key = f"ernie.layers.{i}.mlp.experts.{j}.up_gate_proj.activation_scale"
+            num_local_ffn_keys.append(up_gate_proj_in_scale_key)

     for k in num_local_ffn_keys:
         if k in weight_list:
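Two of the hunks above benefit from a quick illustration. First, the expert-index fix in CutlassW4A8MoEMethod: assuming `layer.expert_id_offset` already holds the rank's global expert offset (i.e. `ep_rank * num_local_experts`, which is what the corrected line implies), the old expression multiplied that offset by `num_local_experts` a second time, so every rank except rank 0 indexed out-of-range experts when collecting scales. A minimal sketch with hypothetical sizes, not taken from the patch:

    num_experts, ep_size = 64, 8                     # hypothetical values
    num_local_experts = num_experts // ep_size
    for ep_rank in range(ep_size):
        expert_id_offset = ep_rank * num_local_experts
        old_ids = [i + expert_id_offset * num_local_experts for i in range(num_local_experts)]
        new_ids = [i + expert_id_offset for i in range(num_local_experts)]
        assert all(e < num_experts for e in new_ids)                   # fixed mapping stays in range
        assert ep_rank == 0 or all(e >= num_experts for e in old_ids)  # old mapping overshoots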
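Second, the all-experts activation-scale path: under expert parallelism each rank now builds `up_gate_proj_in_scale_all_experts` from every expert's `activation_scale` (stored as reciprocals and concatenated in expert order), so the EP dispatch can quantize each token with the scale of whichever expert it is routed to; this is also why `load_ep_checkpoint` must keep every expert's `up_gate_proj.activation_scale` key rather than only the local slice. A sketch of the construction, with a made-up state dict that mirrors the patch's key template:

    import paddle

    num_experts = 4                                  # hypothetical
    key = "ernie.layers.0.mlp.experts.{}.up_gate_proj.activation_scale"
    state_dict = {key.format(e): paddle.to_tensor([0.5 + 0.1 * e]) for e in range(num_experts)}

    # Reciprocal per-expert scales, concatenated in global expert order.
    scale_all_experts = paddle.concat([1 / state_dict[key.format(e)] for e in range(num_experts)])
    # Would then be passed as dispatch(..., expertwise_scale=scale_all_experts) in the backend above.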