default enable chunked prefill (#3731)

* add error traceback info

* update error msg

* update code

* default enable chunked prefill

* update code

* update code

* add envs

* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Commit 753772ace8 (parent 98e03fb4ea)
Authored by kevin on 2025-08-31 13:15:13 +08:00; committed by GitHub.
4 changed files with 13 additions and 1 deletion


@@ -134,6 +134,7 @@ jobs:
           -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
           -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
           -e "FLASK_PORT=${FLASK_PORT}" \
+          -e "FD_FORCE_CHUNKED_PREFILL=1" \
           -v "${MODEL_CACHE_DIR}:/MODELDATA" \
           -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
           -v "${CACHE_DIR}/.cache:/root/.cache" \


@@ -1233,6 +1233,15 @@ class FDConfig:
         self.paddle_commit_id = paddle.version.commit
+        if self.cache_config.enable_chunked_prefill:
+            self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
+            if (
+                self.speculative_config is not None
+                and self.speculative_config.method in ["mtp"]
+                and not self.force_chunked_prefill
+            ):
+                self.cache_config.enable_chunked_prefill = False
+
         if self.max_num_batched_tokens is None:
             if self.cache_config.enable_chunked_prefill:
                 self.max_num_batched_tokens = 2048
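
Taken together, the new block means chunked prefill now stays on unless speculative decoding with the "mtp" method is configured, and even then FD_FORCE_CHUNKED_PREFILL=1 keeps it enabled. A minimal standalone sketch of that precedence (the function name and signature are illustrative, not the FastDeploy API):

    import os

    def resolve_chunked_prefill(enable_chunked_prefill: bool, speculative_method=None) -> bool:
        # Mirrors the FDConfig logic above: MTP disables chunked prefill
        # unless FD_FORCE_CHUNKED_PREFILL is set to a non-zero integer.
        if not enable_chunked_prefill:
            return False
        force = bool(int(os.getenv("FD_FORCE_CHUNKED_PREFILL", "0")))
        if speculative_method == "mtp" and not force:
            return False
        return True

    assert resolve_chunked_prefill(True) is True           # new default: stays on
    assert resolve_chunked_prefill(True, "mtp") is False   # MTP opts out automatically
    os.environ["FD_FORCE_CHUNKED_PREFILL"] = "1"
    assert resolve_chunked_prefill(True, "mtp") is True    # unless forced via the env var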


@@ -243,7 +243,7 @@ class EngineArgs:
     Ports for rdma communication.
     """
-    enable_chunked_prefill: bool = False
+    enable_chunked_prefill: bool = True
     """
     Flag to enable chunked prefilling.
     """


@@ -93,6 +93,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # enable multi api server
     "FD_ENABLE_MULTI_API_SERVER": lambda: bool(int(os.getenv("FD_ENABLE_MULTI_API_SERVER", "0"))),
     "FD_FOR_TORCH_MODEL_FORMAT": lambda: bool(int(os.getenv("FD_FOR_TORCH_MODEL_FORMAT", "0"))),
+    # force enable chunked prefill
+    "FD_FORCE_CHUNKED_PREFILL": lambda: bool(int(os.getenv("FD_FORCE_CHUNKED_PREFILL", "0"))),
 }
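
One subtlety of the bool(int(os.getenv(...))) convention used throughout this table: only integer strings parse, so FD_FORCE_CHUNKED_PREFILL=1 enables the override, while a value like "true" raises a ValueError rather than being treated as truthy. A short illustration of the same parsing pattern:

    import os

    def read_int_flag(name: str, default: str = "0") -> bool:
        # Same convention as the environment_variables entries above.
        return bool(int(os.getenv(name, default)))

    os.environ["FD_FORCE_CHUNKED_PREFILL"] = "1"
    assert read_int_flag("FD_FORCE_CHUNKED_PREFILL") is True

    os.environ["FD_FORCE_CHUNKED_PREFILL"] = "true"
    try:
        read_int_flag("FD_FORCE_CHUNKED_PREFILL")
    except ValueError:
        pass  # non-integer values are rejected outright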