Fix noaux_tc CUDA error 700 in CUDAGraph and add wfp8apf8 MoE quant method (#4115)

* improve per_token_quant_fp8 performance
* support moe wfp8apf8
* check glm test
* fix noaux_tc op in cudagraph, support noaux_tc return the correct
* check
* check inf and overwrite score in noaux_tc

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-10-21 15:49:31 +08:00 · 2025-09-22 21:27:37 +08:00
parent 6b47773bd6
commit f38b174a75
17 changed files with 924 additions and 125 deletions
--- a/fastdeploy/model_executor/layers/moe/moe.py
+++ b/fastdeploy/model_executor/layers/moe/moe.py
@@ -66,6 +66,7 @@ def get_moe_scores(
    top_k,
    routed_scaling_factor,
    e_score_correction_bias,
+    renormalize: bool = False,
 ) -> paddle.Tensor:
    """
    compute moe scores using e_score_correction_bias.
@@ -79,6 +80,7 @@ def get_moe_scores(
        n_group if n_group > 0 else 1,
        topk_group if topk_group > 0 else 1,
        top_k,
+        renormalize,
        routed_scaling_factor,
    )
    return scores, topk_values, topk_idx
@@ -93,6 +95,7 @@ class FusedMoE(nn.Layer):
        self,
        fd_config,
        reduce_results: bool = True,
+        renormalize: bool = False,
        moe_intermediate_size: int = -1,
        num_experts: int = -1,
        expert_id_offset: int = 0,
@@ -119,6 +122,7 @@ class FusedMoE(nn.Layer):
        self.fd_config = fd_config
        self.layer_idx = layer_idx
        self.reduce_results = reduce_results
+        self.renormalize = renormalize
        self.tp_rank = fd_config.parallel_config.tensor_parallel_rank
        self.tp_size = fd_config.parallel_config.tensor_parallel_size
        self.ep_size = fd_config.parallel_config.expert_parallel_size