support w4afp8 EP inference (#3382)

This commit is contained in:
Yuan Xiaolan
2025-08-13 21:41:34 +08:00
committed by GitHub
parent 4dbaa3d74c
commit 2513cd929b
17 changed files with 944 additions and 100 deletions

View File

@@ -188,7 +188,8 @@ paddle::Tensor MoeExpertFFNFunc(
const paddle::optional<paddle::Tensor>& down_proj_scale,
const paddle::optional<paddle::Tensor>& down_proj_in_scale,
const paddle::optional<paddle::Tensor>& expert_idx_per_token,
-const std::string& quant_method, const bool used_in_ep_low_latency);
+const std::string& quant_method, const bool used_in_ep_low_latency,
+const int estimate_total_token_nums);
paddle::Tensor MoeExpertFFNWint2Func(
const paddle::Tensor& permute_input,