[XPU] Supports BF16 for ERNIE-4.5-21B-A3B and ERNIE-4.5-0.3B (#2765)

* fix the no-quant (unquantized) XPU MoE path

* change the directory of the XPU MoE weight-only code
Author: yulangz
Date: 2025-07-09 15:57:51 +08:00
Committed by: GitHub
Parent: 771e71a24d
Commit: be21ef5047
5 changed files with 234 additions and 117 deletions


@@ -20,9 +20,24 @@ from paddleformers.utils.log import logger
 from fastdeploy import envs
 from fastdeploy.model_executor.layers.utils import get_tensor
 from fastdeploy.platforms import current_platform
+def get_moe_method():
+    """
+    return moe method based on device platform
+    """
+    from fastdeploy.platforms import current_platform
+    if current_platform.is_cuda():
+        from .fused_moe_cutlass_backend import CutlassMoEMethod
+        return CutlassMoEMethod(None)
+    elif current_platform.is_xpu():
+        from .fused_moe_xpu_backend import XPUMoEMethod
+        return XPUMoEMethod(None)
+    elif current_platform.is_gcu():
+        from fastdeploy.model_executor.layers.backends import GCUFusedMoeMethod
+        return GCUFusedMoeMethod(None)
+    raise NotImplementedError()
 class FusedMoE(nn.Layer):
     """
     FusedMoE is a layer that performs MoE (Mixture of Experts) computation.
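The helper above replaces the per-platform branching that each call site previously carried. A minimal usage sketch follows; the import path is an assumption (this diff view does not show the changed file names), so treat it as illustrative rather than the actual FastDeploy API surface.

# Sketch only: the module path below is assumed, and running it requires
# FastDeploy installed on a CUDA, XPU, or GCU device.
from fastdeploy.model_executor.layers.moe.fused_moe import get_moe_method  # assumed path

method = get_moe_method()
# CUDA -> CutlassMoEMethod, XPU -> XPUMoEMethod, GCU -> GCUFusedMoeMethod;
# any other platform raises NotImplementedError.
print(type(method).__name__)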
@@ -96,13 +111,7 @@ class FusedMoE(nn.Layer):
             self.moe_quant_type = moe_quant_config.name()
         else:
             # For now, the no-quant (w_fp16, a_fp16) method cannot be obtained from quant_config; this will be improved in the future.
-            if current_platform.is_cuda():
-                from .fused_moe_cutlass_backend import CutlassMoEMethod
-                self.quant_method = CutlassMoEMethod(None)
-            elif current_platform.is_gcu():
-                from fastdeploy.model_executor.layers.backends import \
-                    GCUFusedMoeMethod
-                self.quant_method = GCUFusedMoeMethod(None)
+            self.quant_method = get_moe_method()
         if self.ep_size > 1:
             self.quant_method.init_ep(self)
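For readers outside the FastDeploy tree, here is a dependency-free sketch of the same refactoring idea: one factory maps the detected platform to a backend object, so call sites such as FusedMoE.__init__ above keep a single line instead of repeating per-platform imports. Every name below is illustrative and not part of this PR.

from dataclasses import dataclass


@dataclass
class MoEMethod:
    backend: str


def detect_platform() -> str:
    # Stand-in for current_platform.is_cuda()/is_xpu()/is_gcu() checks.
    return "xpu"


def get_moe_method_sketch() -> MoEMethod:
    factories = {
        "cuda": lambda: MoEMethod("cutlass"),
        "xpu": lambda: MoEMethod("xpu"),
        "gcu": lambda: MoEMethod("gcu"),
    }
    platform = detect_platform()
    if platform not in factories:
        raise NotImplementedError(f"no MoE method for platform: {platform}")
    return factories[platform]()


if __name__ == "__main__":
    print(get_moe_method_sketch())  # MoEMethod(backend='xpu')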