From 753772ace87fc490fccf2f9d09fdda1bbcbf488c Mon Sep 17 00:00:00 2001
From: kevin
Date: Sun, 31 Aug 2025 13:15:13 +0800
Subject: [PATCH] default enable chunked prefill (#3731)

* add error traceback info

* update error msg

* update code

* default enable chunked prefill

* update code

* update code

* add envs

* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
---
 .github/workflows/_base_test.yml | 1 +
 fastdeploy/config.py             | 9 +++++++++
 fastdeploy/engine/args_utils.py  | 2 +-
 fastdeploy/envs.py               | 2 ++
 4 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/_base_test.yml b/.github/workflows/_base_test.yml
index be3696320..1174bbafc 100644
--- a/.github/workflows/_base_test.yml
+++ b/.github/workflows/_base_test.yml
@@ -134,6 +134,7 @@ jobs:
             -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
             -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
             -e "FLASK_PORT=${FLASK_PORT}" \
+            -e "FD_FORCE_CHUNKED_PREFILL=1" \
             -v "${MODEL_CACHE_DIR}:/MODELDATA" \
             -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
             -v "${CACHE_DIR}/.cache:/root/.cache" \
diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 080dd95ec..e4182e6c9 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -1233,6 +1233,15 @@ class FDConfig:

         self.paddle_commit_id = paddle.version.commit

+        if self.cache_config.enable_chunked_prefill:
+            self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
+            if (
+                self.speculative_config is not None
+                and self.speculative_config.method in ["mtp"]
+                and not self.force_chunked_prefill
+            ):
+                self.cache_config.enable_chunked_prefill = False
+
         if self.max_num_batched_tokens is None:
             if self.cache_config.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
diff --git a/fastdeploy/engine/args_utils.py b/fastdeploy/engine/args_utils.py
index 24d3f5284..10ed83525 100644
--- a/fastdeploy/engine/args_utils.py
+++ b/fastdeploy/engine/args_utils.py
@@ -243,7 +243,7 @@ class EngineArgs:
     Ports for rdma communication.
     """

-    enable_chunked_prefill: bool = False
+    enable_chunked_prefill: bool = True
     """
     Flag to enable chunked prefilling.
     """
diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py
index 24152a036..9ee6656e3 100644
--- a/fastdeploy/envs.py
+++ b/fastdeploy/envs.py
@@ -93,6 +93,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # enable multi api server
     "FD_ENABLE_MULTI_API_SERVER": lambda: bool(int(os.getenv("FD_ENABLE_MULTI_API_SERVER", "0"))),
     "FD_FOR_TORCH_MODEL_FORMAT": lambda: bool(int(os.getenv("FD_FOR_TORCH_MODEL_FORMAT", "0"))),
+    # force enable chunked prefill
+    "FD_FORCE_CHUNKED_PREFILL": lambda: bool(int(os.getenv("FD_FORCE_CHUNKED_PREFILL", "0"))),
 }