mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-24 13:28:13 +08:00
[XPU] refactor moe ffn (#5501)
- remove BKCL_DISPATCH_ALL_GATHER - support sparse mode - support moe quant_method
This commit is contained in:
@@ -143,6 +143,8 @@ std::vector<paddle::Tensor> WeightOnlyLinear(
|
||||
const int arch,
|
||||
const int group_size);
|
||||
|
||||
std::vector<paddle::Tensor> Quant2dPerToken(const paddle::Tensor& x);
|
||||
|
||||
std::vector<paddle::Tensor> MoeEPCombine(const paddle::Tensor& ffn_out,
|
||||
const paddle::Tensor& moe_index,
|
||||
const paddle::Tensor& weights,
|
||||
@@ -1275,6 +1277,9 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
|
||||
py::arg("arch"),
|
||||
py::arg("group_size") = -1);
|
||||
|
||||
m.def(
|
||||
"quant2d_per_token", &Quant2dPerToken, py::arg("x"), "quant x per token");
|
||||
|
||||
m.def("xpu_moe_layer",
|
||||
&MoeLayer,
|
||||
py::arg("x"),
|
||||
|
||||
Reference in New Issue
Block a user