Fix noaux_tc CUDA error 700 in CUDAGraph and add wfp8afp8 MoE quant method (#4115)

* improve per_token_quant_fp8 performance
* support moe wfp8afp8
* check glm test
* fix noaux_tc op in cudagraph, support noaux_tc return the correct
* check
* check inf and overwrite score in noaux_tc

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-10-05 16:48:03 +08:00 · 2025-09-22 21:27:37 +08:00
parent 6b47773bd6
commit f38b174a75
17 changed files with 924 additions and 125 deletions
--- a/fastdeploy/model_executor/layers/quantization/wfp8afp8.py
+++ b/fastdeploy/model_executor/layers/quantization/wfp8afp8.py
@@ -23,6 +23,7 @@ from fastdeploy.model_executor.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
 )
+from fastdeploy.model_executor.layers.moe import FusedMoE
 from fastdeploy.model_executor.layers.quantization.ops import (
    cutlass_scaled_mm,
    scaled_fp8_quant,
@@ -66,7 +67,14 @@ class WFP8AFP8Config(QuantConfigBase):

    def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
        """ """
-        return WFP8AFP8LinearMethod(self)
+        if isinstance(layer, FusedMoE):
+            from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (
+                Wfp8Afp8MoEMethod,
+            )
+
+            return Wfp8Afp8MoEMethod(self)
+        else:
+            return WFP8AFP8LinearMethod(self)


 class WFP8AFP8LinearMethod(QuantMethodBase):