mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[XPU] refine fused moe (#4219)
This commit is contained in:
@@ -228,8 +228,9 @@ MoeLayer(const paddle::Tensor &x, const paddle::Tensor &gate_weight,
|
||||
quant_method == "weight_only_int4") {
|
||||
APPLY_MOE_LAYER_KERNEL(paddle::bfloat16, int4_t);
|
||||
} else {
|
||||
PD_THROW("MoeLayer not support x_type==%d, w_type==%d",
|
||||
static_cast<int>(x_type), static_cast<int>(w_type));
|
||||
PD_THROW("MoeLayer not support x_type=", static_cast<int>(x_type),
|
||||
", w_type=", static_cast<int>(w_type),
|
||||
", quant_method=", quant_method);
|
||||
return {};
|
||||
}
|
||||
#undef APPLY_MOE_LAYER_KERNEL
|
||||
|
||||
@@ -16,16 +16,11 @@
|
||||
xpu backend methods
|
||||
"""
|
||||
|
||||
from .moe.fused_moe import (
|
||||
XPUMoEMethod,
|
||||
XPUWeightOnlyMoeEpMethod,
|
||||
XPUWeightOnlyMoEMethod,
|
||||
)
|
||||
from .moe.fused_moe import XPUMoEMethod, XPUWeightOnlyMoEMethod
|
||||
from .quantization.weight_only import XPUWeightOnlyLinearMethod
|
||||
|
||||
__all__ = [
|
||||
"XPUWeightOnlyLinearMethod",
|
||||
"XPUMoEMethod",
|
||||
"XPUWeightOnlyMoEMethod",
|
||||
"XPUWeightOnlyMoeEpMethod",
|
||||
]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -30,15 +30,22 @@ class MoEMethodBase(QuantMethodBase):
|
||||
|
||||
def __init__(self, quant_config):
|
||||
super().__init__()
|
||||
if quant_config is None:
|
||||
self.quant_config = quant_config
|
||||
if self.quant_config is None:
|
||||
self.moe_quant_type = "w16a16"
|
||||
elif hasattr(quant_config, "algo"):
|
||||
self.moe_quant_type = quant_config.algo
|
||||
else:
|
||||
self.quant_config = quant_config
|
||||
self.moe_quant_type = quant_config.name()
|
||||
self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"]
|
||||
self.added_scale_attrs = [
|
||||
"up_gate_proj_weight_scale",
|
||||
"down_proj_weight_scale",
|
||||
]
|
||||
self.added_in_scale_attrs = [
|
||||
"up_gate_proj_in_scale",
|
||||
"down_proj_in_scale",
|
||||
]
|
||||
self.pack_num = 1
|
||||
self.ep_prefill_runner = None
|
||||
self.ep_decoder_runner = None
|
||||
|
||||
@@ -79,18 +79,11 @@ class WeightOnlyConfig(QuantConfigBase):
|
||||
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
|
||||
if current_platform.is_xpu():
|
||||
if isinstance(layer, FusedMoE):
|
||||
if layer.ep_size > 1:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyMoeEpMethod,
|
||||
)
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyMoEMethod,
|
||||
)
|
||||
|
||||
return XPUWeightOnlyMoeEpMethod(self)
|
||||
else:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyMoEMethod,
|
||||
)
|
||||
|
||||
return XPUWeightOnlyMoEMethod(self)
|
||||
return XPUWeightOnlyMoEMethod(self)
|
||||
else:
|
||||
from fastdeploy.model_executor.layers.backends import (
|
||||
XPUWeightOnlyLinearMethod,
|
||||
|
||||
@@ -404,9 +404,9 @@ class PaddleDisWorkerProc:
|
||||
|
||||
if num_blocks_local <= 0:
|
||||
raise ValueError(
|
||||
"The total number of blocks cannot be less than zero."
|
||||
"Please increase gpu_memory_utilization"
|
||||
"Or decrease max_num_batched_tokens(max model length) "
|
||||
"The total number of blocks cannot be less than zero. "
|
||||
"Please increase gpu_memory_utilization "
|
||||
"Or decrease max_num_batched_tokens(max model length)."
|
||||
)
|
||||
|
||||
if self.ranks > 1:
|
||||
|
||||
@@ -1227,7 +1227,8 @@ class XPUModelRunner(ModelRunnerBase):
|
||||
"""
|
||||
Clear the block tables and kv cache after profiling.
|
||||
"""
|
||||
del self.share_inputs["caches"]
|
||||
if hasattr(self.share_inputs, "caches"):
|
||||
del self.share_inputs["caches"]
|
||||
if self.forward_meta is not None:
|
||||
del self.forward_meta.caches
|
||||
paddle.device.xpu.empty_cache()
|
||||
|
||||
Reference in New Issue
Block a user