mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
MoE Default use triton's blockwise fp8 in TP Case (#3678)
This commit is contained in:
@@ -115,7 +115,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
|
|||||||
export FD_LOG_DIR="prefill_log"
|
export FD_LOG_DIR="prefill_log"
|
||||||
|
|
||||||
quant_type=block_wise_fp8
|
quant_type=block_wise_fp8
|
||||||
export FD_USE_DEEP_GEMM=0
|
|
||||||
|
|
||||||
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
||||||
--max-model-len 131072 \
|
--max-model-len 131072 \
|
||||||
@@ -135,7 +134,6 @@ export FLAGS_max_partition_size=2048
|
|||||||
export FD_LOG_DIR="decode_log"
|
export FD_LOG_DIR="decode_log"
|
||||||
|
|
||||||
quant_type=block_wise_fp8
|
quant_type=block_wise_fp8
|
||||||
export FD_USE_DEEP_GEMM=0
|
|
||||||
|
|
||||||
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
||||||
--max-model-len 131072 \
|
--max-model-len 131072 \
|
||||||
|
@@ -70,7 +70,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
|
|
||||||
# Whether to use DeepGemm for FP8 blockwise MoE.
|
# Whether to use DeepGemm for FP8 blockwise MoE.
|
||||||
"FD_USE_DEEP_GEMM":
|
"FD_USE_DEEP_GEMM":
|
||||||
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
|
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
|
||||||
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
@@ -116,7 +116,6 @@ export FD_ATTENTION_BACKEND=FLASH_ATTN
|
|||||||
export FD_LOG_DIR="prefill_log"
|
export FD_LOG_DIR="prefill_log"
|
||||||
|
|
||||||
quant_type=block_wise_fp8
|
quant_type=block_wise_fp8
|
||||||
export FD_USE_DEEP_GEMM=0
|
|
||||||
|
|
||||||
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
||||||
--max-model-len 131072 \
|
--max-model-len 131072 \
|
||||||
@@ -136,7 +135,6 @@ export FLAGS_max_partition_size=2048
|
|||||||
export FD_LOG_DIR="decode_log"
|
export FD_LOG_DIR="decode_log"
|
||||||
|
|
||||||
quant_type=block_wise_fp8
|
quant_type=block_wise_fp8
|
||||||
export FD_USE_DEEP_GEMM=0
|
|
||||||
|
|
||||||
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
python -m fastdeploy.entrypoints.openai.api_server --model baidu/ERNIE-4.5-21B-A3B-Paddle \
|
||||||
--max-model-len 131072 \
|
--max-model-len 131072 \
|
||||||
|
@@ -70,7 +70,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
|
|
||||||
# 是否使用DeepGemm后端的FP8 blockwise MoE.
|
# 是否使用DeepGemm后端的FP8 blockwise MoE.
|
||||||
"FD_USE_DEEP_GEMM":
|
"FD_USE_DEEP_GEMM":
|
||||||
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
|
lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
|
||||||
|
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
@@ -65,7 +65,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# Whether to use fastsafetensor load weight (0 or 1)
|
# Whether to use fastsafetensor load weight (0 or 1)
|
||||||
"FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
|
"FD_USE_FASTSAFETENSOR": lambda: bool(int(os.getenv("FD_USE_FASTSAFETENSOR", "0"))),
|
||||||
# Whether to use DeepGemm for FP8 blockwise MoE.
|
# Whether to use DeepGemm for FP8 blockwise MoE.
|
||||||
"FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "1"))),
|
"FD_USE_DEEP_GEMM": lambda: bool(int(os.getenv("FD_USE_DEEP_GEMM", "0"))),
|
||||||
# Whether to use aggregate send.
|
# Whether to use aggregate send.
|
||||||
"FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
|
"FD_USE_AGGREGATE_SEND": lambda: bool(int(os.getenv("FD_USE_AGGREGATE_SEND", "0"))),
|
||||||
# Whether to open Trace.
|
# Whether to open Trace.
|
||||||
|
@@ -61,7 +61,7 @@ class BlockWiseFP8Config(QuantConfigBase):
|
|||||||
Get quantization method.
|
Get quantization method.
|
||||||
"""
|
"""
|
||||||
if isinstance(layer, FusedMoE):
|
if isinstance(layer, FusedMoE):
|
||||||
if self.use_deep_gemm:
|
if layer.ep_size > 1 or self.use_deep_gemm:
|
||||||
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
|
from fastdeploy.model_executor.layers.moe.fused_moe_deepgemm_backend import (
|
||||||
DeepGemmFusedMoeMethod,
|
DeepGemmFusedMoeMethod,
|
||||||
)
|
)
|
||||||
|
@@ -174,7 +174,7 @@ model_param_map = {
|
|||||||
{
|
{
|
||||||
"quant_type": "block_wise_fp8",
|
"quant_type": "block_wise_fp8",
|
||||||
"backend": "triton",
|
"backend": "triton",
|
||||||
"env": {"FD_USE_DEEP_GEMM": "0", "DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
|
"env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"},
|
||||||
},
|
},
|
||||||
{"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
|
{"quant_type": "block_wise_fp8", "backend": "deepgemm", "env": {"DG_NVCC_OVERRIDE_CPP_STANDARD": "17"}},
|
||||||
],
|
],
|
||||||
|
Reference in New Issue
Block a user