From 17b414c2df4ed1f7e74f0177bfc307ea29c384b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com>
Date: Fri, 29 Aug 2025 11:07:30 +0800
Subject: [PATCH] MoE Default use triton's blockwise fp8 in TP Case (#3678)

---
 docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md                  | 2 --
 docs/usage/environment_variables.md                              | 2 +-
 docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md               | 2 --
 docs/zh/usage/environment_variables.md                           | 3 +--
 fastdeploy/envs.py                                               | 2 +-
 .../model_executor/layers/quantization/block_wise_fp8.py         | 2 +-
 tests/model_loader/test_common_model.py                          | 2 +-
 7 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md b/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
index e3052ab97..c59878e4c 100644
--- a/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
+++ b/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md
index 31f895370..313449a40 100644
--- a/docs/usage/environment_variables.md
+++ b/docs/usage/environment_variables.md
@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use DeepGemm for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
 ```
 
diff --git a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
index b0753ee79..c6a3904bd 100644
--- a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
+++ b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md
index cda1fc4f0..74b135eae 100644
--- a/docs/zh/usage/environment_variables.md
+++ b/docs/zh/usage/environment_variables.md
@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # 是否使用DeepGemm后端的FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
-
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
 ```
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index d726c0dca..96a082996 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -65,7 +65,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use fastsafetensor load weight (0 or 1)
     "FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
     # Whether to use DeepGemm for FP8 blockwise MoE.
-    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
     # Whether to use aggregate send.
     "FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
     # Whether to open Trace.
diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index c3f503590..d91d2b606 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -61,7 +61,7 @@ class BlockWiseFP8Config(QuantConfigBase):
         Get quantization method.
         """
         if isinstance(layer, FusedMoE):
-            if self.use_deep_gemm:
+            if layer.ep_size > 1 or self.use_deep_gemm:
                 from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
                     DeepGemmFusedMoeMethod,
                 )
diff --git a/tests/model_loader/test_common_model.py b/tests/model_loader/test_common_model.py
index ef21e52b3..acde9e87f 100644
--- a/tests/model_loader/test_common_model.py
+++ b/tests/model_loader/test_common_model.py
@@ -174,7 +174,7 @@ model_param_map = {
             {
                 "quant_type": "block_wise_fp8",
                 "backend": "triton",
-                "env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
+                "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
             },
             {"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
         ],