Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[XPU] Supports BF16 for ERNIE-4.5-21B-A3B and ERNIE-4.5-0.3B (#2765)
* fix no quant xpu moe
* change dir of xpu moe weight only
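For context, the refactor in the diff below centralizes backend selection in a new get_moe_method() helper, so the no-quant (w_fp16/a_fp16) path now picks the XPU backend automatically on Kunlun hardware. The snippet here is a minimal usage sketch, not part of the commit; the import path of get_moe_method is an assumption based on the backend module names shown in the diff and may differ from the actual file layout.

# Minimal usage sketch (assumes FastDeploy is installed and importable).
# The import path of get_moe_method is an assumption inferred from the backend
# modules referenced in the diff (fused_moe_xpu_backend, fused_moe_cutlass_backend);
# adjust it to the file this diff actually touches.
from fastdeploy.platforms import current_platform
from fastdeploy.model_executor.layers.moe.fused_moe import get_moe_method  # assumed path

if current_platform.is_xpu():
    quant_method = get_moe_method()
    # On a Kunlun XPU host this is expected to be the XPUMoEMethod wired up by this PR.
    print("Selected MoE backend:", type(quant_method).__name__)
else:
    print("Not running on XPU; get_moe_method() would return the CUDA or GCU backend instead.")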
@@ -20,9 +20,24 @@ from paddleformers.utils.log import logger
 
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.platforms import current_platform
 
 
+def get_moe_method():
+    """
+    return moe method based on device platform
+    """
+    from fastdeploy.platforms import current_platform
+    if current_platform.is_cuda():
+        from .fused_moe_cutlass_backend import CutlassMoEMethod
+        return CutlassMoEMethod(None)
+    elif current_platform.is_xpu():
+        from .fused_moe_xpu_backend import XPUMoEMethod
+        return XPUMoEMethod(None)
+    elif current_platform.is_gcu():
+        from fastdeploy.model_executor.layers.backends import GCUFusedMoeMethod
+        return GCUFusedMoeMethod(None)
+    raise NotImplementedError()
+
 class FusedMoE(nn.Layer):
     """
     FusedMoE is a layer that performs MoE (Mixture of Experts) computation.
@@ -96,13 +111,7 @@ class FusedMoE(nn.Layer):
             self.moe_quant_type = moe_quant_config.name()
         else:
             # now, no quant method(w_fp16 a_fp16) can't get from quant_config, we will optimize it in future
-            if current_platform.is_cuda():
-                from .fused_moe_cutlass_backend import CutlassMoEMethod
-                self.quant_method = CutlassMoEMethod(None)
-            elif current_platform.is_gcu():
-                from fastdeploy.model_executor.layers.backends import \
-                    GCUFusedMoeMethod
-                self.quant_method = GCUFusedMoeMethod(None)
+            self.quant_method = get_moe_method()
 
         if self.ep_size > 1:
             self.quant_method.init_ep(self)
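On the design choice: as the removed lines show, the old no-quant path hard-coded a CUDA/GCU if/elif chain inside FusedMoE.__init__ with no XPU branch, which is apparently what the "fix no quant xpu moe" bullet refers to; get_moe_method() now gives a single registration point per platform. The following is a self-contained sketch of that dispatch pattern with hypothetical stub classes (not FastDeploy's real backends), showing why the call site stays platform-agnostic:

# Self-contained sketch of the dispatch pattern behind get_moe_method().
# The backend classes below are hypothetical stand-ins, not FastDeploy's real ones.

class CudaMoEStub:
    def apply(self, x):
        return f"cuda({x})"

class XpuMoEStub:
    def apply(self, x):
        return f"xpu({x})"

def pick_moe_backend(platform: str):
    """Single registration point: supporting a new platform only touches this factory."""
    backends = {
        "cuda": CudaMoEStub,
        "xpu": XpuMoEStub,
    }
    try:
        return backends[platform]()
    except KeyError:
        # Mirrors the bare `raise NotImplementedError()` fallthrough in the diff above.
        raise NotImplementedError(f"no MoE backend for platform: {platform}")

class TinyMoELayer:
    def __init__(self, platform: str):
        # The call site stays platform-agnostic, mirroring
        # `self.quant_method = get_moe_method()` in the diff above.
        self.quant_method = pick_moe_backend(platform)

layer = TinyMoELayer("xpu")
print(layer.quant_method.apply("tokens"))  # -> xpu(tokens)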