mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-11 19:40:25 +08:00
default enable chunked prefill (#3731)
* add error traceback info
* update error msg
* update code
* default enable chunked prefill
* update code
* update code
* add envs
* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
This commit is contained in:
1
.github/workflows/_base_test.yml
vendored
1
.github/workflows/_base_test.yml
vendored
@@ -134,6 +134,7 @@ jobs:
|
||||
-e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
|
||||
-e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
|
||||
-e "FLASK_PORT=${FLASK_PORT}" \
|
||||
-e "FD_FORCE_CHUNKED_PREFILL=1" \
|
||||
-v "${MODEL_CACHE_DIR}:/MODELDATA" \
|
||||
-v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
|
||||
-v "${CACHE_DIR}/.cache:/root/.cache" \
|
||||
|
@@ -1233,6 +1233,15 @@ class FDConfig:
|
||||
|
||||
self.paddle_commit_id = paddle.version.commit
|
||||
|
||||
if self.cache_config.enable_chunked_prefill:
|
||||
self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
|
||||
if (
|
||||
self.speculative_config is not None
|
||||
and self.speculative_config.method in ["mtp"]
|
||||
and not self.force_chunked_prefill
|
||||
):
|
||||
self.cache_config.enable_chunked_prefill = False
|
||||
|
||||
if self.max_num_batched_tokens is None:
|
||||
if self.cache_config.enable_chunked_prefill:
|
||||
self.max_num_batched_tokens = 2048
|
||||
|
@@ -243,7 +243,7 @@ class EngineArgs:
|
||||
Ports for rdma communication.
|
||||
"""
|
||||
|
||||
- enable_chunked_prefill: bool = False
|
||||
+ enable_chunked_prefill: bool = True
|
||||
"""
|
||||
Flag to enable chunked prefilling.
|
||||
"""
|
||||
|
@@ -93,6 +93,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# enable multi api server
|
||||
"FD_ENABLE_MULTI_API_SERVER": lambda: bool(int(os.getenv("FD_ENABLE_MULTI_API_SERVER", "0"))),
|
||||
"FD_FOR_TORCH_MODEL_FORMAT": lambda: bool(int(os.getenv("FD_FOR_TORCH_MODEL_FORMAT", "0"))),
|
||||
# force enable chunked prefill
|
||||
"FD_FORCE_CHUNKED_PREFILL": lambda: bool(int(os.getenv("FD_FORCE_CHUNKED_PREFILL", "0"))),
|
||||
}
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user