Fix chunked prefill (#3778)

* update enable_chunked_prefill handling

* update code

* update code

* update code
kevin authored on 2025-09-02 13:41:55 +08:00, committed by GitHub
parent 0cdbc950b5
commit a86b35ab49
4 changed files with 29 additions and 25 deletions


@@ -1233,23 +1233,14 @@ class FDConfig:
         self.paddle_commit_id = paddle.version.commit
 
-        if self.cache_config.enable_chunked_prefill:
-            self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
-            if (
-                self.speculative_config is not None
-                and self.speculative_config.method in ["mtp"]
-                and not self.force_chunked_prefill
-            ):
-                self.cache_config.enable_chunked_prefill = False
-
         if self.max_num_batched_tokens is None:
-            if self.cache_config.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
+                if self.cache_config.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = self.max_model_len
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
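
For readers skimming the diff: after this change the default for `max_num_batched_tokens` is decided first by the V1 KV-cache scheduler flag and only then by chunked prefill, and the MTP-based override that silently disabled chunked prefill is gone. The sketch below is illustrative only, not FastDeploy code; the helper name `resolve_max_num_batched_tokens` and its boolean parameters are hypothetical stand-ins for `envs.ENABLE_V1_KVCACHE_SCHEDULER` and `cache_config.enable_chunked_prefill`, but the constants and branch order mirror the post-fix code above.

```python
# Illustrative sketch (hypothetical helper, not FastDeploy API): mirrors the
# post-fix defaulting logic for max_num_batched_tokens shown in the diff above.

def resolve_max_num_batched_tokens(
    max_num_batched_tokens,       # user-provided value, or None
    enable_v1_kvcache_scheduler,  # stand-in for envs.ENABLE_V1_KVCACHE_SCHEDULER
    enable_chunked_prefill,       # stand-in for cache_config.enable_chunked_prefill
    max_model_len,
):
    if max_num_batched_tokens is not None:
        return max_num_batched_tokens  # an explicit setting is never overridden
    if enable_v1_kvcache_scheduler:
        return 8192  # capped: defaulting to max_model_len makes OOM likely
    if enable_chunked_prefill:
        return 2048  # per-step token budget for chunked prefill
    return max_model_len


if __name__ == "__main__":
    # Enumerate the four flag combinations for a 32k-context model.
    for v1 in (True, False):
        for chunked in (True, False):
            value = resolve_max_num_batched_tokens(None, v1, chunked, 32768)
            print(f"v1_scheduler={v1!s:<5} chunked_prefill={chunked!s:<5} -> {value}")
```

Running it prints the defaults for all four flag combinations, which makes the behavioral change easy to compare against the removed branches, where chunked prefill always won and forced 2048.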