From ec52d39e68911fa05f1eabc9df97ff914176fb68 Mon Sep 17 00:00:00 2001 From: AIbin <37361953+chang-wenbin@users.noreply.github.com> Date: Mon, 28 Jul 2025 16:31:56 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90Inference=20Optimize=E3=80=91Update=20?= =?UTF-8?q?wint2=20weight=20n-dim=20reorder=20(#3042)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../layers/moe/fused_moe_wint2_backend.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py index 041283bbd..fe16f7fc6 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_wint2_backend.py @@ -135,6 +135,17 @@ class CutlassWint2FusedMoeMethod(Wint2MoeMethod): up_gate_proj_code_zp = paddle.stack(up_gate_proj_code_zp, axis=0) down_proj_code_zp = paddle.stack(down_proj_code_zp, axis=0) + # Pre-arrange the n-dim of each weight matrix: view dim 1 as groups of 16 and dim 2 as groups of 8, permute the axes with perm=[0, 3, 1, 4, 2], then reshape back to the original shape (dim 1 must be divisible by 16 and dim 2 by 8) + w1_shape = up_gate_proj_weight.shape + up_gate_proj_weight = up_gate_proj_weight.reshape([w1_shape[0], w1_shape[1] // 16, 16, w1_shape[2] // 8, 8]) + up_gate_proj_weight = paddle.transpose(up_gate_proj_weight, perm=[0, 3, 1, 4, 2]) + up_gate_proj_weight = up_gate_proj_weight.reshape(w1_shape) + + w2_shape = down_proj_weight.shape + down_proj_weight = down_proj_weight.reshape([w2_shape[0], w2_shape[1] // 16, 16, w2_shape[2] // 8, 8]) + down_proj_weight = paddle.transpose(down_proj_weight, perm=[0, 3, 1, 4, 2]) + down_proj_weight = down_proj_weight.reshape(w2_shape) + name_tensor_map = { "up_gate_proj_weight": up_gate_proj_weight, "down_proj_weight": down_proj_weight,