From 17b414c2df4ed1f7e74f0177bfc307ea29c384b6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=91=A8=E5=91=A8=E5=91=A8?= <39978853+zhoutianzi666@users.noreply.github.com>
Date: Fri, 29 Aug 2025 11:07:30 +0800
Subject: [PATCH] MoE Default use triton's blockwise fp8 in TP Case (#3678)

---
 docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md                  | 2 --
 docs/usage/environment_variables.md                              | 2 +-
 docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md               | 2 --
 docs/zh/usage/environment_variables.md                           | 3 +--
 fastdeploy/envs.py                                               | 2 +-
 .../model_executor/layers/quantization/block_wise_fp8.py         | 2 +-
 tests/model_loader/test_common_model.py                          | 2 +-
 7 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md b/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
index e3052ab97..c59878e4c 100644
--- a/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
+++ b/docs/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
diff --git a/docs/usage/environment_variables.md b/docs/usage/environment_variables.md
index 31f895370..313449a40 100644
--- a/docs/usage/environment_variables.md
+++ b/docs/usage/environment_variables.md
@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use DeepGemm for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
 ```
 
diff --git a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
index b0753ee79..c6a3904bd 100644
--- a/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
+++ b/docs/zh/best_practices/ERNIE-4.5-21B-A3B-Paddle.md
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
diff --git a/docs/zh/usage/environment_variables.md b/docs/zh/usage/environment_variables.md
index cda1fc4f0..74b135eae 100644
--- a/docs/zh/usage/environment_variables.md
+++ b/docs/zh/usage/environment_variables.md
@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # 是否使用DeepGemm后端的FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
-
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
 ```
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index d726c0dca..96a082996 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -65,7 +65,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use fastsafetensor load weight (0 or 1)
     "FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
     # Whether to use DeepGemm for FP8 blockwise MoE.
-    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
     # Whether to use aggregate send.
     "FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
     # Whether to open Trace.
diff --git a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
index c3f503590..d91d2b606 100644
--- a/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
+++ b/fastdeploy/model_executor/layers/quantization/block_wise_fp8.py
@@ -61,7 +61,7 @@ class BlockWiseFP8Config(QuantConfigBase):
         Get quantization method.
         """
         if isinstance(layer, FusedMoE):
-            if self.use_deep_gemm:
+            if layer.ep_size > 1 or self.use_deep_gemm:
                 from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
                     DeepGemmFusedMoeMethod,
                 )
diff --git a/tests/model_loader/test_common_model.py b/tests/model_loader/test_common_model.py
index ef21e52b3..acde9e87f 100644
--- a/tests/model_loader/test_common_model.py
+++ b/tests/model_loader/test_common_model.py
@@ -174,7 +174,7 @@ model_param_map = {
             {
                 "quant_type": "block_wise_fp8",
                 "backend": "triton",
-                "env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
+                "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
             },
             {"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
         ],