Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-06 00:57:33 +08:00
[BugFix] Fix chunked prefill (#3759)
* add error traceback info
* update error msg
* update code
* default enable chunked prefill
* update code
* update code
* add envs
* update code
* update enable chunked_prefill
* update code
* update code
* update code
* update code
* update code

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
@@ -15,11 +15,11 @@
 """
 
 import json
-import os
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 
+from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
     EarlyStopConfig,
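The removed `import os` and the new `from fastdeploy import envs` go together: the `os.getenv` read removed further down is replaced by reads from the centralized `fastdeploy.envs` module. A minimal sketch of that pattern, assuming a simple default-table design (the structure is illustrative, not the actual module); only the two variable names below actually appear in this diff:

# Sketch of a centralized env registry in the spirit of fastdeploy.envs
# (structure assumed for illustration, not the actual module).
import os

_DEFAULTS = {
    "FD_DISABLE_CHUNKED_PREFILL": "0",   # introduced by this PR
    "ENABLE_V1_KVCACHE_SCHEDULER": "0",
}

def _read(name: str) -> str:
    # Fall back to the registered default so callers never repeat it.
    return os.getenv(name, _DEFAULTS[name])

FD_DISABLE_CHUNKED_PREFILL = _read("FD_DISABLE_CHUNKED_PREFILL")
ENABLE_V1_KVCACHE_SCHEDULER = _read("ENABLE_V1_KVCACHE_SCHEDULER")

Callers then write `int(envs.FD_DISABLE_CHUNKED_PREFILL)` instead of scattering `os.getenv(...)` calls with per-call defaults.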
@@ -243,7 +243,7 @@ class EngineArgs:
     Ports for rdma communication.
     """
 
-    enable_chunked_prefill: bool = True
+    enable_chunked_prefill: bool = False
     """
     Flag to enable chunked prefilling.
    """
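With the dataclass default flipped to `False`, whether chunked prefill actually runs is now decided when the engine config is built (see the hunk below): it is auto-enabled on CUDA "mixed" deployments unless MTP speculative decoding is configured, and the new `FD_DISABLE_CHUNKED_PREFILL` environment variable always forces it off. A standalone paraphrase of that precedence, with illustrative parameter names rather than FastDeploy API:

from typing import Optional

def resolve_enable_chunked_prefill(
    user_flag: bool,                    # EngineArgs.enable_chunked_prefill, now defaulting to False
    is_cuda: bool,                      # current_platform.is_cuda()
    splitwise_role: str,                # e.g. "mixed"
    speculative_method: Optional[str],  # speculative_cfg.method, or None when unset
    fd_disable_chunked_prefill: int,    # int(envs.FD_DISABLE_CHUNKED_PREFILL)
) -> bool:
    enabled = user_flag
    # Auto-enable on CUDA "mixed" deployments unless MTP speculative decoding is used.
    if not enabled and is_cuda and splitwise_role == "mixed" and speculative_method not in ["mtp"]:
        enabled = True
    # The new environment variable is the final override.
    if fd_disable_chunked_prefill:
        enabled = False
    return enabled

For example, `resolve_enable_chunked_prefill(False, True, "mixed", None, 0)` returns `True`, matching the new default-on behaviour, while setting `FD_DISABLE_CHUNKED_PREFILL=1` turns it back off.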
@@ -981,14 +981,29 @@ class EngineArgs:
         if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
             self.tensor_parallel_size = model_cfg.tensor_parallel_size
 
+        speculative_cfg = self.create_speculative_config()
+
+        if not self.enable_chunked_prefill:
+            if (
+                current_platform.is_cuda()
+                and self.splitwise_role == "mixed"
+                and (speculative_cfg is None or speculative_cfg.method not in ["mtp"])
+            ):
+                # default enable chunked prefill
+                self.enable_chunked_prefill = True
+
+        self.disable_chunked_prefill = int(envs.FD_DISABLE_CHUNKED_PREFILL)
+        if self.disable_chunked_prefill:
+            self.enable_chunked_prefill = False
+
         if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
             else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
+                if self.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
                 else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+                    self.max_num_batched_tokens = self.max_model_len
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
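The `max_num_batched_tokens` fallback also changes: when the v1 KV-cache scheduler is enabled it is capped at 8192 instead of `max_model_len` (which easily OOMs), and only the legacy path keeps the 2048 / `max_model_len` split. A standalone paraphrase with illustrative names; the constants come from the hunk above:

from typing import Optional

def default_max_num_batched_tokens(
    user_value: Optional[int],     # EngineArgs.max_num_batched_tokens
    v1_kvcache_scheduler: bool,    # truthiness of envs.ENABLE_V1_KVCACHE_SCHEDULER
    enable_chunked_prefill: bool,
    max_model_len: int,
) -> int:
    if user_value is not None:
        return user_value
    if v1_kvcache_scheduler:
        return 8192  # fixed cap; using max_model_len here is easy to OOM
    return 2048 if enable_chunked_prefill else max_model_len

# Chunked prefill on, legacy scheduler: prefill is split into 2048-token chunks.
assert default_max_num_batched_tokens(None, False, True, 131072) == 2048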
@@ -996,7 +1011,6 @@ class EngineArgs:
         load_cfg = LoadConfig(all_dict)
         parallel_cfg = ParallelConfig(all_dict)
         scheduler_cfg = self.create_scheduler_config()
-        speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
         moba_attention_config = self.create_moba_attention_config()