Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 00:57:33 +08:00
[BugFix] Fix chunked prefill (#3759)
* add error traceback info
* update error msg
* update code
* default enable chunked prefill
* update code
* update code
* add envs
* update code
* update enable chunked_prefill
* update code
* update code
* update code
* update code
* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
@@ -15,11 +15,11 @@
 """
 
 import json
-import os
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 
+from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
     EarlyStopConfig,
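The removed `import os` and the new `from fastdeploy import envs` go together: the `os.getenv` read removed further down is replaced by reads from the centralized `fastdeploy.envs` module. A minimal sketch of that pattern, assuming a simple default-table design (the structure is illustrative, not the actual module); only the two variable names below actually appear in this diff:

# Sketch of a centralized env registry in the spirit of fastdeploy.envs
# (structure assumed for illustration, not the actual module).
import os

_DEFAULTS = {
    "FD_DISABLE_CHUNKED_PREFILL": "0",   # introduced by this PR
    "ENABLE_V1_KVCACHE_SCHEDULER": "0",
}

def _read(name: str) -> str:
    # Fall back to the registered default so callers never repeat it.
    return os.getenv(name, _DEFAULTS[name])

FD_DISABLE_CHUNKED_PREFILL = _read("FD_DISABLE_CHUNKED_PREFILL")
ENABLE_V1_KVCACHE_SCHEDULER = _read("ENABLE_V1_KVCACHE_SCHEDULER")

Callers then write `int(envs.FD_DISABLE_CHUNKED_PREFILL)` instead of scattering `os.getenv(...)` calls with per-call defaults.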
@@ -243,7 +243,7 @@ class EngineArgs:
     Ports for rdma communication.
     """
 
-    enable_chunked_prefill: bool = True
+    enable_chunked_prefill: bool = False
     """
     Flag to enable chunked prefilling.
    """
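With the dataclass default flipped to `False`, whether chunked prefill actually runs is now decided when the engine config is built (see the hunk below): it is auto-enabled on CUDA "mixed" deployments unless MTP speculative decoding is configured, and the new `FD_DISABLE_CHUNKED_PREFILL` environment variable always forces it off. A standalone paraphrase of that precedence, with illustrative parameter names rather than FastDeploy API:

from typing import Optional

def resolve_enable_chunked_prefill(
    user_flag: bool,                    # EngineArgs.enable_chunked_prefill, now defaulting to False
    is_cuda: bool,                      # current_platform.is_cuda()
    splitwise_role: str,                # e.g. "mixed"
    speculative_method: Optional[str],  # speculative_cfg.method, or None when unset
    fd_disable_chunked_prefill: int,    # int(envs.FD_DISABLE_CHUNKED_PREFILL)
) -> bool:
    enabled = user_flag
    # Auto-enable on CUDA "mixed" deployments unless MTP speculative decoding is used.
    if not enabled and is_cuda and splitwise_role == "mixed" and speculative_method not in ["mtp"]:
        enabled = True
    # The new environment variable is the final override.
    if fd_disable_chunked_prefill:
        enabled = False
    return enabled

For example, `resolve_enable_chunked_prefill(False, True, "mixed", None, 0)` returns `True`, matching the new default-on behaviour, while setting `FD_DISABLE_CHUNKED_PREFILL=1` turns it back off.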
@@ -981,14 +981,29 @@ class EngineArgs:
         if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
             self.tensor_parallel_size = model_cfg.tensor_parallel_size
 
+        speculative_cfg = self.create_speculative_config()
+
+        if not self.enable_chunked_prefill:
+            if (
+                current_platform.is_cuda()
+                and self.splitwise_role == "mixed"
+                and (speculative_cfg is None or speculative_cfg.method not in ["mtp"])
+            ):
+                # default enable chunked prefill
+                self.enable_chunked_prefill = True
+
+        self.disable_chunked_prefill = int(envs.FD_DISABLE_CHUNKED_PREFILL)
+        if self.disable_chunked_prefill:
+            self.enable_chunked_prefill = False
+
         if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
+                if self.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = self.max_model_len
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
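The `max_num_batched_tokens` fallback also changes: when the v1 KV-cache scheduler is enabled it is capped at 8192 instead of `max_model_len` (which easily OOMs), and only the legacy path keeps the 2048 / `max_model_len` split. A standalone paraphrase with illustrative names; the constants come from the hunk above:

from typing import Optional

def default_max_num_batched_tokens(
    user_value: Optional[int],     # EngineArgs.max_num_batched_tokens
    v1_kvcache_scheduler: bool,    # truthiness of envs.ENABLE_V1_KVCACHE_SCHEDULER
    enable_chunked_prefill: bool,
    max_model_len: int,
) -> int:
    if user_value is not None:
        return user_value
    if v1_kvcache_scheduler:
        return 8192  # fixed cap; using max_model_len here is easy to OOM
    return 2048 if enable_chunked_prefill else max_model_len

# Chunked prefill on, legacy scheduler: prefill is split into 2048-token chunks.
assert default_max_num_batched_tokens(None, False, True, 131072) == 2048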
@@ -996,7 +1011,6 @@ class EngineArgs:
         load_cfg = LoadConfig(all_dict)
         parallel_cfg = ParallelConfig(all_dict)
         scheduler_cfg = self.create_scheduler_config()
-        speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
         moba_attention_config = self.create_moba_attention_config()