From 364197c4b5ccc714d9fdd28084b23c6e2e9c5701 Mon Sep 17 00:00:00 2001
From: Sunny-bot1 <68891411+Sunny-bot1@users.noreply.github.com>
Date: Mon, 8 Dec 2025 20:24:00 +0800
Subject: [PATCH] support w4afp8 mtp (#5429)

---
 .../layers/moe/fused_moe_cutlass_backend.py  | 4 +++-
 fastdeploy/model_executor/layers/utils.py    | 8 +++++---
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
index c3dbfc9ba..2a5369fbf 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_cutlass_backend.py
@@ -930,11 +930,13 @@ class CutlassW4AFP8MoEMethod(CutlassMoEMethod):
         Paddle cutlass load weight process.
         """
         if not layer.is_quantized:
+            prefix_layer_name = layer.fd_config.model_config.prefix_layer_name
             logger.info(
-                f"Rotating ernie.layers.{layer.layer_idx}.mlp.experts.[{layer.ep_rank * layer.num_local_experts},{layer.ep_rank * layer.num_local_experts + layer.num_local_experts}).down_proj.weight..."
+                f"Rotating ernie.{prefix_layer_name}.{layer.layer_idx}.mlp.experts.[{layer.ep_rank * layer.num_local_experts},{layer.ep_rank * layer.num_local_experts + layer.num_local_experts}).down_proj.weight..."
             )
             rotate_model(
                 state_dict,
+                prefix_layer_name,
                 layer.layer_idx,
                 layer.num_local_experts,
                 layer.hidden_size,
diff --git a/fastdeploy/model_executor/layers/utils.py b/fastdeploy/model_executor/layers/utils.py
index c18f06245..914c0260d 100644
--- a/fastdeploy/model_executor/layers/utils.py
+++ b/fastdeploy/model_executor/layers/utils.py
@@ -141,7 +141,9 @@ def get_orthogonal_matrix(size, mode="hadamard", device="cuda"):
         raise ValueError(f"Unknown mode {mode}")


-def rotate_model(state_dict, layer_idx, moe_num_experts=48, hidden_size=7168, moe_intermediate_size=3584, ep_rank=0):
+def rotate_model(
+    state_dict, prefix_layer_name, layer_idx, moe_num_experts, hidden_size, moe_intermediate_size, ep_rank=0
+):
     with paddle.no_grad():
         # collect hadamard rotation matrix [moe_intermediate_size, moe_intermediate_size]
         Q_ffn2, moe_block_size = get_orthogonal_matrix(size=moe_intermediate_size, mode="hadamard_ffn2")
@@ -149,7 +151,7 @@ def rotate_model(state_dict, layer_idx, moe_num_experts=48, hidden_size=7168, mo
         expert_list = [
             get_tensor(
                 state_dict[
-                    f"ernie.layers.{layer_idx}.mlp.experts.{ep_rank * moe_num_experts + expert_idx}.down_proj.weight"
+                    f"ernie.{prefix_layer_name}.{layer_idx}.mlp.experts.{ep_rank * moe_num_experts + expert_idx}.down_proj.weight"
                 ]
             )
             for expert_idx in range(moe_num_experts)
@@ -159,7 +161,7 @@ def rotate_model(state_dict, layer_idx, moe_num_experts=48, hidden_size=7168, mo
         for expert_idx in range(moe_num_experts):
             rotated_weight = new_moe_weight[:, expert_idx * hidden_size : (expert_idx + 1) * hidden_size]
             expert_idx_local = ep_rank * moe_num_experts + expert_idx
-            state_dict[f"ernie.layers.{layer_idx}.mlp.experts.{expert_idx_local}.down_proj.weight"] = (
+            state_dict[f"ernie.{prefix_layer_name}.{layer_idx}.mlp.experts.{expert_idx_local}.down_proj.weight"] = (
                 rotated_weight.cpu()
             )
         del moe_weight, new_moe_weight, rotated_weight
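
For context, a minimal, self-contained sketch of the weight-key layout that the updated rotate_model signature relies on. Only the key format itself mirrors the f-strings in the patch; the helper name down_proj_key, the prefix strings "layers" and "mtp_block", and the indices below are illustrative assumptions, not identifiers taken from the patch.

# Illustrative sketch only: mirrors the key format used inside rotate_model after
# this patch. Passing prefix_layer_name lets the same rotation path address both
# the main decoder layers and the MTP layers, which presumably live under a
# different prefix in the checkpoint.
def down_proj_key(prefix_layer_name: str, layer_idx: int, expert_idx: int) -> str:
    # Same f-string shape as in fastdeploy/model_executor/layers/utils.py above.
    return f"ernie.{prefix_layer_name}.{layer_idx}.mlp.experts.{expert_idx}.down_proj.weight"

if __name__ == "__main__":
    # Main-model expert weight ("layers" assumed, as hard-coded before the patch).
    print(down_proj_key("layers", 3, 7))
    # An MTP layer can now be addressed through the same helper by passing its own
    # prefix; "mtp_block" is a placeholder name, not the real config value.
    print(down_proj_key("mtp_block", 0, 7))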