[xpu] support ep (#4067)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled

This commit is contained in:
zhupengyang
2025-09-15 13:53:11 +08:00
committed by GitHub
parent 29ed617f0f
commit 9409665713
18 changed files with 1179 additions and 329 deletions

View File

@@ -19,7 +19,7 @@ from abc import abstractmethod
from typing import Optional
import paddle
from paddle.nn.quant import weight_only_linear, weight_quantize
from paddle.nn.quant import weight_quantize
from fastdeploy import envs
from fastdeploy.model_executor.layers.linear import (
@@ -30,6 +30,13 @@ from fastdeploy.model_executor.layers.linear import (
from fastdeploy.model_executor.utils import TensorTracker, free_tensor, set_weight_attrs
from fastdeploy.platforms import current_platform
if current_platform.is_xpu():
from fastdeploy.model_executor.ops.xpu import (
weight_only_linear_xpu as weight_only_linear,
)
else:
from paddle.nn.quant import weight_only_linear
from ..moe import FusedMoE
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase
@@ -70,16 +77,24 @@ class WeightOnlyConfig(QuantConfigBase):
def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
if current_platform.is_xpu():
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyLinearMethod,
)
from fastdeploy.model_executor.layers.moe.fused_moe_xpu_backend import (
XPUWeightOnlyMoEMethod,
)
if isinstance(layer, FusedMoE):
return XPUWeightOnlyMoEMethod(self)
if layer.ep_size > 1:
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyMoeEpMethod,
)
return XPUWeightOnlyMoeEpMethod(self)
else:
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyMoEMethod,
)
return XPUWeightOnlyMoEMethod(self)
else:
from fastdeploy.model_executor.layers.backends import (
XPUWeightOnlyLinearMethod,
)
return XPUWeightOnlyLinearMethod(self)
elif current_platform.is_gcu():
from fastdeploy.model_executor.layers.backends import (