mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
MoE Default use triton's blockwise fp8 in TP Case (#3678)
@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
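
The dropped `export FD_USE_DEEP_GEMM=0` lines in both launch examples are now redundant: with the default flipped later in this commit, leaving the variable unset already selects the Triton blockwise-FP8 path. A minimal sketch of the equivalence (plain Python, standard library only):

```python
import os

# With FD_USE_DEEP_GEMM unset, the new default string "0" parses to False,
# which is exactly what the removed "export FD_USE_DEEP_GEMM=0" expressed.
os.environ.pop("FD_USE_DEEP_GEMM", None)
assert bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))) is False
```
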
@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use DeepGemm for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 
 }
 ```
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
 export FD_LOG_DIR="prefill_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
 export FD_LOG_DIR="decode_log"
 
 quant_type=block_wise_fp8
-export FD_USE_DEEP_GEMM=0
 
 python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
     --max-model-len 131072 \
@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
 
     # Whether to use the DeepGemm backend for FP8 blockwise MoE.
     "FD_USE_DEEP_GEMM":
-    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
-
+    lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
 }
 ```
@@ -65,7 +65,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # Whether to use fastsafetensor load weight (0 or 1)
     "FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
     # Whether to use DeepGemm for FP8 blockwise MoE.
-    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
+    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
     # Whether to use aggregate send.
     "FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
     # Whether to open Trace.
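
For context, a self-contained sketch of how this env-map entry resolves at runtime (names mirror the diff; the surrounding startup code is assumed):

```python
import os
from typing import Any, Callable

# Sketch of the envs.py pattern shown above: each variable maps to a lazy
# parser, so the new default "0" keeps DeepGemm off unless the user
# exports FD_USE_DEEP_GEMM=1.
environment_variables: dict[str, Callable[[], Any]] = {
    "FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
}

print(environment_variables["FD_USE_DEEP_GEMM"]())  # False unless FD_USE_DEEP_GEMM=1
```
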
@@ -61,7 +61,7 @@ class BlockWiseFP8Config(QuantConfigBase):
         Get quantization method.
         """
         if isinstance(layer, FusedMoE):
-            if self.use_deep_gemm:
+            if layer.ep_size > 1 or self.use_deep_gemm:
                 from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
                     DeepGemmFusedMoeMethod,
                 )
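
The behavioral core of the change is this dispatch rule: expert parallelism (`ep_size > 1`) still forces the DeepGemm method, while plain tensor-parallel deployments fall through to Triton unless `use_deep_gemm` is explicitly enabled. A standalone distillation (hypothetical helper, not a FastDeploy API):

```python
# Hypothetical distillation of BlockWiseFP8Config.get_quant_method's new rule.
def pick_moe_backend(ep_size: int, use_deep_gemm: bool) -> str:
    if ep_size > 1 or use_deep_gemm:
        return "deepgemm"  # DeepGemmFusedMoeMethod
    return "triton"        # Triton blockwise-FP8 MoE method (new TP default)

assert pick_moe_backend(ep_size=1, use_deep_gemm=False) == "triton"    # TP case
assert pick_moe_backend(ep_size=8, use_deep_gemm=False) == "deepgemm"  # EP case
assert pick_moe_backend(ep_size=1, use_deep_gemm=True) == "deepgemm"   # env opt-in
```
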
@@ -174,7 +174,7 @@ model_param_map = {
         {
             "quant_type": "block_wise_fp8",
             "backend": "triton",
-            "env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
+            "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
         },
         {"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
     ],
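
The triton test case no longer needs to pin `FD_USE_DEEP_GEMM` once "0" is the built-in default. Assuming the test harness exports each case's `env` dict before launching the server (an assumption about the runner, not shown in this diff), the case reduces to:

```python
import os

# Assumed harness behavior: the "env" dict is exported before the server starts.
triton_case = {
    "quant_type": "block_wise_fp8",
    "backend": "triton",
    "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},  # FD_USE_DEEP_GEMM pin removed
}
os.environ.update(triton_case["env"])
```
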