mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
MoE Default use triton's blockwise fp8 in TP Case (#3678)
@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
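
The dropped `export FD_USE_DEEP_GEMM=0` lines in both launch examples are now redundant: with the default flipped later in this commit, leaving the variable unset already selects the Triton blockwise-FP8 path. A minimal sketch of the equivalence (plain Python, standard library only):

```python
import os

# With FD_USE_DEEP_GEMM unset, the new default string "0" parses to False,
# which is exactly what the removed "export FD_USE_DEEP_GEMM=0" expressed.
os.environ.pop("FD_USE_DEEP_GEMM", None)
assert bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))) is False
```
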
@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use DeepGemm for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 
 }
 ```
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use the DeepGemm backend for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
-
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
 ```
@@ -65,7 +65,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use fastsafetensor load weight (0 or 1)
     "FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
     # Whether to use DeepGemm for FP8 blockwise MoE.
-    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
     # Whether to use aggregate send.
     "FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
     # Whether to open Trace.
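
For context, a self-contained sketch of how this env-map entry resolves at runtime (names mirror the diff; the surrounding startup code is assumed):

```python
import os
from typing import Any, Callable

# Sketch of the envs.py pattern shown above: each variable maps to a lazy
# parser, so the new default "0" keeps DeepGemm off unless the user
# exports FD_USE_DEEP_GEMM=1.
environment_variables: dict[str, Callable[[], Any]] = {
    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
}

print(environment_variables["FD_USE_DEEP_GEMM"]())  # False unless FD_USE_DEEP_GEMM=1
```
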
@@ -61,7 +61,7 @@ class BlockWiseFP8Config(QuantConfigBase):
         Get quantization method.
         """
         if isinstance(layer, FusedMoE):
-            if self.use_deep_gemm:
+            if layer.ep_size > 1 or self.use_deep_gemm:
                 from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
                     DeepGemmFusedMoeMethod,
                 )
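
The behavioral core of the change is this dispatch rule: expert parallelism (`ep_size > 1`) still forces the DeepGemm method, while plain tensor-parallel deployments fall through to Triton unless `use_deep_gemm` is explicitly enabled. A standalone distillation (hypothetical helper, not a FastDeploy API):

```python
# Hypothetical distillation of BlockWiseFP8Config.get_quant_method's new rule.
def pick_moe_backend(ep_size: int, use_deep_gemm: bool) -> str:
    if ep_size > 1 or use_deep_gemm:
        return "deepgemm"  # DeepGemmFusedMoeMethod
    return "triton"        # Triton blockwise-FP8 MoE method (new TP default)

assert pick_moe_backend(ep_size=1, use_deep_gemm=False) == "triton"    # TP case
assert pick_moe_backend(ep_size=8, use_deep_gemm=False) == "deepgemm"  # EP case
assert pick_moe_backend(ep_size=1, use_deep_gemm=True) == "deepgemm"   # env opt-in
```
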
@@ -174,7 +174,7 @@ model_param_map = {
         {
             "quant_type": "block_wise_fp8",
             "backend": "triton",
-            "env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
+            "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
         },
         {"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
     ],
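
The triton test case no longer needs to pin `FD_USE_DEEP_GEMM` once "0" is the built-in default. Assuming the test harness exports each case's `env` dict before launching the server (an assumption about the runner, not shown in this diff), the case reduces to:

```python
import os

# Assumed harness behavior: the "env" dict is exported before the server starts.
triton_case = {
    "quant_type": "block_wise_fp8",
    "backend": "triton",
    "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},  # FD_USE_DEEP_GEMM pin removed
}
os.environ.update(triton_case["env"])
```
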