support w4afp8 EP inference (#3382)

2025-10-05 16:48:03 +08:00 · 2025-08-13 21:41:34 +08:00
parent 4dbaa3d74c
commit 2513cd929b
17 changed files with 944 additions and 100 deletions
--- a/custom_ops/gpu_ops/cpp_extensions.cc
+++ b/custom_ops/gpu_ops/cpp_extensions.cc
@@ -188,7 +188,8 @@ paddle::Tensor MoeExpertFFNFunc(
    const paddle::optional<paddle::Tensor>& down_proj_scale,
    const paddle::optional<paddle::Tensor>& down_proj_in_scale,
    const paddle::optional<paddle::Tensor>& expert_idx_per_token,
-    const std::string& quant_method, const bool used_in_ep_low_latency);
+    const std::string& quant_method, const bool used_in_ep_low_latency,
+    const int estimate_total_token_nums);

 paddle::Tensor MoeExpertFFNWint2Func(
    const paddle::Tensor& permute_input,