[XPU] refactor moe ffn (#5501)

- remove BKCL_DISPATCH_ALL_GATHER
- support sparse mode
- support moe quant_method
This commit is contained in:
zhupengyang
2025-12-18 14:14:05 +08:00
committed by GitHub
parent d0a7834a17
commit 8735cb5045
12 changed files with 397 additions and 127 deletions

View File

@@ -143,6 +143,8 @@ std::vector<paddle::Tensor> WeightOnlyLinear(
const int arch,
const int group_size);
// Per-token quantization of `x` (exposed to Python as "quant2d_per_token").
// Returns a vector of tensors; the number and meaning of the outputs are not
// visible from this declaration — TODO confirm against the implementation.
std::vector<paddle::Tensor> Quant2dPerToken(const paddle::Tensor& x);
std::vector<paddle::Tensor> MoeEPCombine(const paddle::Tensor& ffn_out,
const paddle::Tensor& moe_index,
const paddle::Tensor& weights,
@@ -1275,6 +1277,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
py::arg("arch"),
py::arg("group_size") = -1);
// Bind Quant2dPerToken as the Python function `quant2d_per_token`, with a
// single named argument `x`; the trailing string is the Python docstring.
m.def(
"quant2d_per_token", &Quant2dPerToken, py::arg("x"), "quant x per token");
m.def("xpu_moe_layer",
&MoeLayer,
py::arg("x"),