[XPU] refine fused moe (#4219)

This commit is contained in:
zhupengyang
2025-10-16 19:04:07 +08:00
committed by GitHub
parent 3bbe99eae7
commit 26ff2f8683
7 changed files with 354 additions and 585 deletions

View File

@@ -228,8 +228,9 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
quant_method == "weight_only_int4") {
APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, int4_t);
} else {
PD_THROW("MoeLayer not support x_type==%d, w_type==%d",
static_cast<int>(x_type), static_cast<int>(w_type));
PD_THROW("MoeLayer not support x_type=", static_cast<int>(x_type),
", w_type=", static_cast<int>(w_type),
", quant_method=", quant_method);
return {};
}
#undef APPLY_MOE_LAYER_KERNEL

View File

@@ -16,16 +16,11 @@
xpu backend methods
"""
from .moe.fused_moe import (
XPUMoEMethod,
XPUWeightOnlyMoeEpMethod,
XPUWeightOnlyMoEMethod,
)
from .moe.fused_moe import XPUMoEMethod, XPUWeightOnlyMoEMethod
from .quantization.weight_only import XPUWeightOnlyLinearMethod
__all__ = [
"XPUWeightOnlyLinearMethod",
"XPUMoEMethod",
"XPUWeightOnlyMoEMethod",
"XPUWeightOnlyMoeEpMethod",
]

View File

@@ -30,15 +30,22 @@ class MoEMethodBase(QuantMethodBase):
def __init__(self, quant_config):
super().__init__()
if quant_config is None:
self.quant_config = quant_config
if self.quant_config is None:
self.moe_quant_type = "w16a16"
elif hasattr(quant_config, "algo"):
self.moe_quant_type = quant_config.algo
else:
self.quant_config = quant_config
self.moe_quant_type = quant_config.name()
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
self.added_scale_attrs = [
"up_gate_proj_weight_scale",
"down_proj_weight_scale",
]
self.added_in_scale_attrs = [
"up_gate_proj_in_scale",
"down_proj_in_scale",
]
self.pack_num = 1
self.ep_prefill_runner = None
self.ep_decoder_runner = None

View File

@@ -79,18 +79,11 @@ class WeightOnlyConfig(QuantConfigBase):
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if current_platform.is_xpu():
if isinstance(layer, FusedMoE):
if layer.ep_size > 1:
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyMoeEpMethod,
)
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyMoEMethod,
)
return XPUWeightOnlyMoeEpMethod(self)
else:
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyMoEMethod,
)
return XPUWeightOnlyMoEMethod(self)
return XPUWeightOnlyMoEMethod(self)
else:
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyLinearMethod,

View File

@@ -404,9 +404,9 @@ class PaddleDisWorkerProc:
if num_blocks_local <= 0:
raise ValueError(
"The total number of blocks cannot be less than zero."
"Please increase gpu_memory_utilization"
"Or decrease max_num_batched_tokens(max model length) "
"The total number of blocks cannot be less than zero. "
"Please increase gpu_memory_utilization "
"Or decrease max_num_batched_tokens(max model length)."
)
if self.ranks > 1:

View File

@@ -1227,7 +1227,8 @@ class XPUModelRunner(ModelRunnerBase):
"""
Clear the block tables and kv cache after profiling.
"""
del self.share_inputs["caches"]
if hasattr(self.share_inputs, "caches"):
del self.share_inputs["caches"]
if self.forward_meta is not None:
del self.forward_meta.caches
paddle.device.xpu.empty_cache()