[BugFix] Fix chunked prefill (#3759)

* add error traceback info

* update error msg

* update code

* default enable chunked prefill

* update code

* update code

* add envs

* update code

* update enable chunked_prefill

* update code

* update code

* update code

* update code

* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Author: kevin
Date: 2025-09-02 13:40:45 +08:00
Committed by: GitHub
Parent: 27f2e7a6f1
Commit: 7e751c93ae

4 changed files with 29 additions and 25 deletions


@@ -134,7 +134,6 @@ jobs:
           -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
           -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
           -e "FLASK_PORT=${FLASK_PORT}" \
-          -e "FD_FORCE_CHUNKED_PREFILL=1" \
           -v "${MODEL_CACHE_DIR}:/MODELDATA" \
           -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
           -v "${CACHE_DIR}/.cache:/root/.cache" \


@@ -1233,23 +1233,14 @@ class FDConfig:
         self.paddle_commit_id = paddle.version.commit
 
-        if self.cache_config.enable_chunked_prefill:
-            self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
-            if (
-                self.speculative_config is not None
-                and self.speculative_config.method in ["mtp"]
-                and not self.force_chunked_prefill
-            ):
-                self.cache_config.enable_chunked_prefill = False
-
         if self.max_num_batched_tokens is None:
-            if self.cache_config.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
+                if self.cache_config.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = self.max_model_len
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
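For reference, the fallback for max_num_batched_tokens after this change can be read as the small decision below. This is an illustrative sketch, not code from the patch; the helper name default_max_num_batched_tokens is hypothetical, and its three inputs stand in for envs.ENABLE_V1_KVCACHE_SCHEDULER, cache_config.enable_chunked_prefill, and max_model_len as they appear in the hunk above.

def default_max_num_batched_tokens(
    v1_kvcache_scheduler: bool,    # stands in for envs.ENABLE_V1_KVCACHE_SCHEDULER
    enable_chunked_prefill: bool,  # stands in for cache_config.enable_chunked_prefill
    max_model_len: int,
) -> int:
    # Mirrors the fallback order in the hunk above (illustration only).
    if v1_kvcache_scheduler:
        # With the V1 KV-cache scheduler, cap at 8192: using max_model_len here
        # is easy to OOM, per the comment carried over in the patch.
        return 8192
    if enable_chunked_prefill:
        # With chunked prefill, schedule at most 2048 batched tokens per step.
        return 2048
    return max_model_len

For example, default_max_num_batched_tokens(False, True, 131072) returns 2048, while turning the V1 scheduler flag on yields 8192 regardless of the other two inputs.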


@@ -15,11 +15,11 @@
 """
 import json
-import os
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 
+from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
     EarlyStopConfig,
@@ -243,7 +243,7 @@ class EngineArgs:
     Ports for rdma communication.
     """
-    enable_chunked_prefill: bool = True
+    enable_chunked_prefill: bool = False
     """
     Flag to enable chunked prefilling.
     """
@@ -981,14 +981,29 @@ class EngineArgs:
         if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
             self.tensor_parallel_size = model_cfg.tensor_parallel_size
 
+        speculative_cfg = self.create_speculative_config()
+        if not self.enable_chunked_prefill:
+            if (
+                current_platform.is_cuda()
+                and self.splitwise_role == "mixed"
+                and (speculative_cfg is None or speculative_cfg.method not in ["mtp"])
+            ):
+                # default enable chunked prefill
+                self.enable_chunked_prefill = True
+
+        self.disable_chunked_prefill = int(envs.FD_DISABLE_CHUNKED_PREFILL)
+        if self.disable_chunked_prefill:
+            self.enable_chunked_prefill = False
+
         if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
+                if self.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = self.max_model_len
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
@@ -996,7 +1011,6 @@ class EngineArgs:
         load_cfg = LoadConfig(all_dict)
         parallel_cfg = ParallelConfig(all_dict)
         scheduler_cfg = self.create_scheduler_config()
-        speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
         moba_attention_config = self.create_moba_attention_config()
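Taken together, the EngineArgs changes resolve enable_chunked_prefill in a fixed order: the user's flag, then the platform-based default, then the environment override. Below is a minimal sketch of that order with a hypothetical helper resolve_enable_chunked_prefill; the condition names mirror current_platform.is_cuda(), splitwise_role, speculative_cfg.method, and envs.FD_DISABLE_CHUNKED_PREFILL from the diff.

def resolve_enable_chunked_prefill(
    user_enabled: bool,                # value passed via EngineArgs / CLI
    is_cuda: bool,                     # current_platform.is_cuda()
    splitwise_role: str,               # e.g. "mixed"
    speculative_method: str | None,    # speculative_cfg.method, or None if unset
    fd_disable_chunked_prefill: bool,  # envs.FD_DISABLE_CHUNKED_PREFILL
) -> bool:
    enabled = user_enabled
    if not enabled and is_cuda and splitwise_role == "mixed" and speculative_method not in ("mtp",):
        # default enable chunked prefill (same condition as the added block above)
        enabled = True
    if fd_disable_chunked_prefill:
        # the environment opt-out wins over both the flag and the default
        enabled = False
    return enabled

So on CUDA, in the "mixed" splitwise role, and without MTP speculative decoding, chunked prefill now turns on even when the flag was left at its False default, and FD_DISABLE_CHUNKED_PREFILL=1 switches it off in every case.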


@@ -93,8 +93,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # enable multi api server
     "FD_ENABLE_MULTI_API_SERVER": lambda: bool(int(os.getenv("FD_ENABLE_MULTI_API_SERVER", "0"))),
     "FD_FOR_TORCH_MODEL_FORMAT": lambda: bool(int(os.getenv("FD_FOR_TORCH_MODEL_FORMAT", "0"))),
-    # force enable chunked prefill
-    "FD_FORCE_CHUNKED_PREFILL": lambda: bool(int(os.getenv("FD_FORCE_CHUNKED_PREFILL", "0"))),
+    # force disable default chunked prefill
+    "FD_DISABLE_CHUNKED_PREFILL": lambda: bool(int(os.getenv("FD_DISABLE_CHUNKED_PREFILL", "0"))),
 }
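The new registry entry follows the same pattern as the existing ones: the value is read lazily from the environment and coerced to a bool. A standalone sketch of that coercion (illustration only; in FastDeploy the value is accessed through fastdeploy.envs as envs.FD_DISABLE_CHUNKED_PREFILL):

import os

# Same coercion as the registry entry above: unset or "0" -> False, "1" -> True.
disable_chunked_prefill = bool(int(os.getenv("FD_DISABLE_CHUNKED_PREFILL", "0")))

if disable_chunked_prefill:
    print("chunked prefill forced off via FD_DISABLE_CHUNKED_PREFILL")

Note the inversion relative to the old variable: FD_FORCE_CHUNKED_PREFILL was an opt-in (and is removed from the CI workflow above), while FD_DISABLE_CHUNKED_PREFILL is an opt-out, matching the new behaviour where chunked prefill is enabled by default on CUDA in the mixed splitwise role.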