Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
[fix] w4a8 model loading and hadamard config (#3013)
@@ -19,6 +19,9 @@ from paddle import nn
 from paddleformers.utils.log import logger

 from fastdeploy import envs
+from fastdeploy.model_executor.layers.moe.fused_moe_cutlass_backend import (
+    CutlassW4A8MoEMethod,
+)
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.worker.experts_manager import RedundantExpertManger

@@ -385,7 +388,10 @@ class FusedMoE(nn.Layer):
         self.gate_weight.set_value(gate_weight_tensor.astype("float32"))

         if self.fd_config.model_config.is_quantized:
-            self.quant_method.process_prequanted_weights(self, state_dict)
+            if isinstance(self.quant_method, CutlassW4A8MoEMethod):
+                self.quant_method.create_weights(self, state_dict)
+            else:
+                self.quant_method.process_prequanted_weights(self, state_dict)
         else:
             self.quant_method.create_weights(self, state_dict)
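For reference, below is a minimal, self-contained sketch of the loading dispatch this hunk introduces. Only the names CutlassW4A8MoEMethod, create_weights, process_prequanted_weights, and is_quantized come from the diff; the stand-in classes, the _Layer container, and the load_moe_weights helper are hypothetical and exist purely to illustrate the branching: a quantized checkpoint normally goes through process_prequanted_weights, except when the MoE quant method is the Cutlass w4a8 one, which still builds its weights via create_weights.

# Sketch only: stand-in classes, not the real FastDeploy implementations.
class CutlassW4A8MoEMethod:
    """Stand-in for the Cutlass w4a8 MoE quant method."""

    def create_weights(self, layer, state_dict):
        # Per the fix above, w4a8 checkpoints are loaded through
        # create_weights even when the model is already quantized.
        print("w4a8: create_weights")


class GenericPrequantMethod:
    """Stand-in for any other prequantized MoE quant method."""

    def process_prequanted_weights(self, layer, state_dict):
        print("generic: process_prequanted_weights")

    def create_weights(self, layer, state_dict):
        print("generic: create_weights")


class _Layer:
    """Hypothetical container holding a quant_method, like FusedMoE does."""

    def __init__(self, quant_method):
        self.quant_method = quant_method


def load_moe_weights(layer, state_dict, is_quantized):
    # Mirrors the branching added in the hunk above.
    if is_quantized:
        if isinstance(layer.quant_method, CutlassW4A8MoEMethod):
            layer.quant_method.create_weights(layer, state_dict)
        else:
            layer.quant_method.process_prequanted_weights(layer, state_dict)
    else:
        layer.quant_method.create_weights(layer, state_dict)


if __name__ == "__main__":
    load_moe_weights(_Layer(CutlassW4A8MoEMethod()), {}, is_quantized=True)
    # -> w4a8: create_weights
    load_moe_weights(_Layer(GenericPrequantMethod()), {}, is_quantized=True)
    # -> generic: process_prequanted_weights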