MoE: use Triton's blockwise FP8 by default in the TP case (#3678)

commit 17b414c2df
parent b6edd15d55
Author: 周周周
Committed by: GitHub
Date: 2025-08-29 11:07:30 +08:00

7 changed files with 5 additions and 10 deletions


@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
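With Triton blockwise FP8 now the default in the TP case, the launch scripts above no longer need to export FD_USE_DEEP_GEMM=0. As a minimal, hedged sketch (not part of the guide; the server flags elided above are also omitted here), the prefill instance could be started from Python like this:

```python
# Illustrative sketch only: launch the prefill api_server without forcing
# FD_USE_DEEP_GEMM=0, since "0" is now the built-in default.
import os
import subprocess
import sys

env = os.environ.copy()
env["FD_ATTENTION_BACKEND"] = "FLASH_ATTN"
env["FD_LOG_DIR"] = "prefill_log"
# No FD_USE_DEEP_GEMM here; Triton blockwise FP8 is selected by default.

subprocess.Popen(
    [
        sys.executable, "-m", "fastdeploy.entrypoints.openai.api_server",
        "--model", "baidu/ERNIE-4.5-21B-A3B-Paddle",
        "--max-model-len", "131072",
        # ...remaining flags from the guide (quantization, ports, etc.) go here
    ],
    env=env,
)
```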


@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use DeepGemm for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
```


@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \


@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use the DeepGemm backend for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
```


@@ -65,7 +65,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use fastsafetensor load weight (0 or 1)
     "FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
     # Whether to use DeepGemm for FP8 blockwise MoE.
-    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
     # Whether to use aggregate send.
     "FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
     # Whether to open Trace.
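For context, here is a self-contained sketch (not FastDeploy's actual envs module, just the same lazy-lambda pattern shown in this hunk) of what the flipped default means at runtime and how to opt back into DeepGemm explicitly:

```python
# Standalone illustration of the registry entry above: the lambda is evaluated
# lazily, and with the default now "0", DeepGemm becomes opt-in.
import os
from typing import Any, Callable

environment_variables: dict[str, Callable[[], Any]] = {
    # Whether to use DeepGemm for FP8 blockwise MoE.
    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
}

print(environment_variables["FD_USE_DEEP_GEMM"]())  # False when unset -> Triton path

os.environ["FD_USE_DEEP_GEMM"] = "1"                # explicit opt-in
print(environment_variables["FD_USE_DEEP_GEMM"]())  # True -> DeepGemm path
```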


@@ -61,7 +61,7 @@ class BlockWiseFP8Config(QuantConfigBase):
         Get quantization method.
         """
         if isinstance(layer, FusedMoE):
-            if self.use_deep_gemm:
+            if layer.ep_size > 1 or self.use_deep_gemm:
                 from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
                     DeepGemmFusedMoeMethod,
                 )
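The hunk above is the behavioral core of the change: expert parallelism still routes to DeepGemm, while pure TP now defaults to Triton unless FD_USE_DEEP_GEMM is enabled. A simplified standalone sketch of that selection rule (only names visible in the diff are real; the Triton-side method name is not shown in this hunk, so a placeholder string is used):

```python
# Simplified sketch of the selection logic in BlockWiseFP8Config.get_quant_method;
# the stub class and return strings are illustrative, not FastDeploy's API.
from dataclasses import dataclass


@dataclass
class FusedMoELayerStub:
    ep_size: int  # expert-parallel degree of the MoE layer


def select_blockwise_fp8_moe_backend(layer: FusedMoELayerStub, use_deep_gemm: bool) -> str:
    """Return which blockwise FP8 MoE kernel family would be chosen."""
    if layer.ep_size > 1 or use_deep_gemm:
        # EP case, or FD_USE_DEEP_GEMM=1: DeepGemm backend.
        return "DeepGemmFusedMoeMethod"
    # Pure TP case with the new default FD_USE_DEEP_GEMM=0: Triton blockwise FP8.
    return "triton blockwise fp8 method"


print(select_blockwise_fp8_moe_backend(FusedMoELayerStub(ep_size=1), use_deep_gemm=False))  # Triton
print(select_blockwise_fp8_moe_backend(FusedMoELayerStub(ep_size=8), use_deep_gemm=False))  # DeepGemm
```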


@@ -174,7 +174,7 @@ model_param_map = {
         {
             "quant_type": "block_wise_fp8",
             "backend": "triton",
-            "env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
+            "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
         },
         {"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
     ],
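Finally, the test matrix drops the explicit FD_USE_DEEP_GEMM=0 from the Triton case because the new default already covers it. A hedged sketch of how such a per-case env dict is typically applied before a run (the real harness code is not shown in this hunk):

```python
# Illustrative only: merge a test case's env overrides into the process
# environment. Mirrors the dict shape in the diff; the harness itself is assumed.
import os

triton_case = {
    "quant_type": "block_wise_fp8",
    "backend": "triton",
    # No FD_USE_DEEP_GEMM override needed any more; "0" is the default.
    "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
}

run_env = os.environ.copy()
run_env.update(triton_case["env"])
```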