[OPs] MoE support wfp8afp8(channelwise) and improve per_token_quant_fp8 (#4238)

2025-10-06 17:17:14 +08:00 · 2025-09-24 16:39:51 +08:00
parent 8b0ce8e3ab
commit 7c1fd19f0f
7 changed files with 683 additions and 33 deletions
--- a/fastdeploy/model_executor/layers/quantization/wfp8afp8.py
+++ b/fastdeploy/model_executor/layers/quantization/wfp8afp8.py
@@ -23,6 +23,7 @@ from fastdeploy.model_executor.layers.linear import (
    MergedColumnParallelLinear,
    QKVParallelLinear,
 )
+from fastdeploy.model_executor.layers.moe import FusedMoE
 from fastdeploy.model_executor.layers.quantization.ops import (
    cutlass_scaled_mm,
    scaled_fp8_quant,
@@ -65,7 +66,14 @@ class WFP8AFP8Config(QuantConfigBase):

    def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
        """ """
-        return WFP8AFP8LinearMethod(self)
+        if isinstance(layer, FusedMoE):  # FusedMoE layers get the MoE-specific quant method
+            from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import (  # function-scope import; presumably avoids a circular import - TODO confirm
+                Wfp8Afp8MoEMethod,
+            )
+
+            return Wfp8Afp8MoEMethod(self)  # receives this config instance
+        else:  # every other layer type keeps the previous behavior
+            return WFP8AFP8LinearMethod(self)


 class WFP8AFP8LinearMethod(QuantMethodBase):