[XPU] refactor moe ffn (#5501)

- remove BKCL_DISPATCH_ALL_GATHER - support sparse mode - support moe quant_method
2025-12-24 13:28:13 +08:00 · 2025-12-18 14:14:05 +08:00
parent d0a7834a17
commit 8735cb5045
12 changed files with 397 additions and 127 deletions
--- a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc
+++ b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc
@@ -143,6 +143,8 @@ std::vector<paddle::Tensor> WeightOnlyLinear(
    const int arch,
    const int group_size);

+std::vector<paddle::Tensor> Quant2dPerToken(const paddle::Tensor& x);  // Forward declaration of the per-token quantization op; bound to Python as "quant2d_per_token" in PYBIND11_MODULE.
+
 std::vector<paddle::Tensor> MoeEPCombine(const paddle::Tensor& ffn_out,
                                         const paddle::Tensor& moe_index,
                                         const paddle::Tensor& weights,
@@ -1275,6 +1277,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
        py::arg("arch"),
        py::arg("group_size") = -1);

+  m.def(
+      "quant2d_per_token", &Quant2dPerToken, py::arg("x"), "quant x per token");
+
  m.def("xpu_moe_layer",
        &MoeLayer,
        py::arg("x"),