[Feature] block_wise_fp8 support triton_moe_backend (#2767)

Author: chen
Date: 2025-07-09 19:22:47 +08:00
Committed by: GitHub
Parent: e3768c5a83
Commit: 888780ffde
5 changed files with 248 additions and 10 deletions


@@ -18,9 +18,10 @@ from typing import Optional
 import paddle
 import fastdeploy
+from fastdeploy import envs
 from fastdeploy.model_executor.layers.moe import FusedMoE
-from ..utils import per_block_cast_to_fp8, get_tensor
+from ..utils import get_tensor, per_block_cast_to_fp8
 from .quant_base import QuantConfigBase, QuantMethodBase
@@ -37,6 +38,7 @@ class BlockWiseFP8Config(QuantConfigBase):
         self.quant_max_bound = 448
         self.quant_min_bound = -448
         self.quant_round_type = 1
+        self.use_deep_gemm = bool(envs.FD_USE_DEEP_GEMM)
 
     def name(self) -> str:
         return "block_wise_fp8"
@@ -51,9 +53,14 @@ class BlockWiseFP8Config(QuantConfigBase):
         Get quantization method.
         '''
         if isinstance(layer, FusedMoE):
-            from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import \
-                DeepGemmFusedMoeMethod
-            return DeepGemmFusedMoeMethod(self)
+            if self.use_deep_gemm:
+                from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import \
+                    DeepGemmFusedMoeMethod
+                return DeepGemmFusedMoeMethod(self)
+            else:
+                from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \
+                    BlockWiseFP8MoEMethod
+                return BlockWiseFP8MoEMethod(self)
         else:
             return BlockWiseFP8LinearMethod(self)
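
Read together, the two hunks make get_quant_method dispatch as sketched below. This is a reconstruction from the diff for readability; the signature is simplified and the surrounding class code is omitted.

def get_quant_method(self, layer):
    # FusedMoE layers: choose the MoE backend based on FD_USE_DEEP_GEMM.
    if isinstance(layer, FusedMoE):
        if self.use_deep_gemm:
            from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import \
                DeepGemmFusedMoeMethod
            return DeepGemmFusedMoeMethod(self)
        else:
            from fastdeploy.model_executor.layers.moe.fused_moe_triton_backend import \
                BlockWiseFP8MoEMethod
            return BlockWiseFP8MoEMethod(self)
    else:
        # All other layers keep the existing block-wise FP8 linear method.
        return BlockWiseFP8LinearMethod(self)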