[fix] w4a8 model loading and hadamard config (#3013)

This commit is contained in:
Yuan Xiaolan
2025-07-28 18:17:59 +08:00
committed by GitHub
parent c8bf8b3913
commit b1d787a272
4 changed files with 32 additions and 4 deletions

View File

@@ -19,6 +19,9 @@ from paddle import nn
 from paddleformers.utils.log import logger
 from fastdeploy import envs
+from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
+    CutlassW4A8MoEMethod,
+)
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.worker.experts_manager import RedundantExpertManger
@@ -385,7 +388,10 @@ class FusedMoE(nn.Layer):
         self.gate_weight.set_value(gate_weight_tensor.astype("float32"))
         if self.fd_config.model_config.is_quantized:
-            self.quant_method.process_prequanted_weights(self, state_dict)
+            if isinstance(self.quant_method, CutlassW4A8MoEMethod):
+                self.quant_method.create_weights(self, state_dict)
+            else:
+                self.quant_method.process_prequanted_weights(self, state_dict)
         else:
             self.quant_method.create_weights(self, state_dict)