Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-04 16:22:57 +08:00)
Fix chunked prefill (#3778)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
* update enable chunked_prefill
* update code
* update code
* update code
.github/workflows/_base_test.yml
@@ -134,7 +134,6 @@ jobs:
             -e "FD_ENGINE_QUEUE_PORT=${FD_ENGINE_QUEUE_PORT}" \
             -e "FD_METRICS_PORT=${FD_METRICS_PORT}" \
             -e "FLASK_PORT=${FLASK_PORT}" \
-            -e "FD_FORCE_CHUNKED_PREFILL=1" \
             -v "${MODEL_CACHE_DIR}:/MODELDATA" \
             -v "${CACHE_DIR}/gitconfig:/etc/gitconfig:ro" \
             -v "${CACHE_DIR}/.cache:/root/.cache" \
@@ -1233,23 +1233,14 @@ class FDConfig:
         self.paddle_commit_id = paddle.version.commit
 
-        if self.cache_config.enable_chunked_prefill:
-            self.force_chunked_prefill = int(envs.FD_FORCE_CHUNKED_PREFILL)
-            if (
-                self.speculative_config is not None
-                and self.speculative_config.method in ["mtp"]
-                and not self.force_chunked_prefill
-            ):
-                self.cache_config.enable_chunked_prefill = False
-
         if self.max_num_batched_tokens is None:
-            if self.cache_config.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
-            else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+            else:
+                if self.cache_config.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
+                else:
+                    self.max_num_batched_tokens = self.max_model_len
 
         if self.long_prefill_token_threshold == 0:
             self.long_prefill_token_threshold = int(self.max_model_len * 0.04)
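For reference, a minimal standalone sketch of the simplified defaulting logic in the hunk above; default_max_num_batched_tokens is a hypothetical helper, not part of FastDeploy, and the 8192/2048 thresholds are taken directly from the diff.

def default_max_num_batched_tokens(
    v1_kvcache_scheduler: bool,
    enable_chunked_prefill: bool,
    max_model_len: int,
) -> int:
    # V1 KV-cache scheduler path: cap the per-step token budget at 8192,
    # since using max_model_len here can easily OOM.
    if v1_kvcache_scheduler:
        return 8192
    # Chunked prefill splits long prompts, so a 2048-token budget suffices.
    if enable_chunked_prefill:
        return 2048
    # Legacy path without chunked prefill: allow a full-length prompt.
    return max_model_len

print(default_max_num_batched_tokens(False, True, 32768))  # -> 2048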
@@ -15,11 +15,11 @@
 """
 
 import json
-import os
 from dataclasses import asdict, dataclass
 from dataclasses import fields as dataclass_fields
 from typing import Any, Dict, List, Optional
 
+from fastdeploy import envs
 from fastdeploy.config import (
     CacheConfig,
     EarlyStopConfig,
@@ -243,7 +243,7 @@ class EngineArgs:
     Ports for rdma communication.
     """
 
-    enable_chunked_prefill: bool = True
+    enable_chunked_prefill: bool = False
     """
     Flag to enable chunked prefilling.
     """
@@ -981,14 +981,29 @@ class EngineArgs:
         if not model_cfg.is_unified_ckpt and hasattr(model_cfg, "tensor_parallel_size"):
             self.tensor_parallel_size = model_cfg.tensor_parallel_size
 
+        speculative_cfg = self.create_speculative_config()
+        if not self.enable_chunked_prefill:
+            if (
+                current_platform.is_cuda()
+                and self.splitwise_role == "mixed"
+                and (speculative_cfg is None or speculative_cfg.method not in ["mtp"])
+            ):
+                # default enable chunked prefill
+                self.enable_chunked_prefill = True
+
+        self.disable_chunked_prefill = int(envs.FD_DISABLE_CHUNKED_PREFILL)
+        if self.disable_chunked_prefill:
+            self.enable_chunked_prefill = False
+
         if self.max_num_batched_tokens is None:
-            if self.enable_chunked_prefill:
-                self.max_num_batched_tokens = 2048
-            else:
-                if not int(os.getenv("ENABLE_V1_KVCACHE_SCHEDULER", "0")):
-                    self.max_num_batched_tokens = self.max_model_len
-                else:
-                    self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+            if int(envs.ENABLE_V1_KVCACHE_SCHEDULER):
+                self.max_num_batched_tokens = 8192  # if set to max_model_len, it's easy to be OOM
+            else:
+                if self.enable_chunked_prefill:
+                    self.max_num_batched_tokens = 2048
+                else:
+                    self.max_num_batched_tokens = self.max_model_len
 
         all_dict = asdict(self)
         all_dict["model_cfg"] = model_cfg
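To make the new behavior easier to follow, here is a minimal standalone sketch of how the default is resolved; resolve_enable_chunked_prefill is a hypothetical helper, not the actual EngineArgs code, but the conditions mirror the hunk above: chunked prefill turns on automatically for CUDA deployments with splitwise_role == "mixed" and no MTP speculative decoding, and FD_DISABLE_CHUNKED_PREFILL=1 switches it back off.

import os
from typing import Optional

def resolve_enable_chunked_prefill(
    is_cuda: bool,
    splitwise_role: str,
    speculative_method: Optional[str],
    enable_chunked_prefill: bool = False,
) -> bool:
    if not enable_chunked_prefill:
        if (
            is_cuda
            and splitwise_role == "mixed"
            and speculative_method not in ["mtp"]
        ):
            # default enable chunked prefill
            enable_chunked_prefill = True
    # The environment variable acts as a global off switch.
    if bool(int(os.getenv("FD_DISABLE_CHUNKED_PREFILL", "0"))):
        enable_chunked_prefill = False
    return enable_chunked_prefill

print(resolve_enable_chunked_prefill(True, "mixed", None))   # True: enabled by default
print(resolve_enable_chunked_prefill(True, "mixed", "mtp"))  # False: MTP keeps it off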
@@ -996,7 +1011,6 @@ class EngineArgs:
         load_cfg = LoadConfig(all_dict)
         parallel_cfg = ParallelConfig(all_dict)
         scheduler_cfg = self.create_scheduler_config()
-        speculative_cfg = self.create_speculative_config()
         graph_opt_cfg = self.create_graph_optimization_config()
         graph_opt_cfg.update_use_cudagraph(self.use_cudagraph)
         moba_attention_config = self.create_moba_attention_config()
@@ -93,8 +93,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
     # enable multi api server
     "FD_ENABLE_MULTI_API_SERVER": lambda: bool(int(os.getenv("FD_ENABLE_MULTI_API_SERVER", "0"))),
     "FD_FOR_TORCH_MODEL_FORMAT": lambda: bool(int(os.getenv("FD_FOR_TORCH_MODEL_FORMAT", "0"))),
-    # force enable chunked prefill
-    "FD_FORCE_CHUNKED_PREFILL": lambda: bool(int(os.getenv("FD_FORCE_CHUNKED_PREFILL", "0"))),
+    # force disable default chunked prefill
+    "FD_DISABLE_CHUNKED_PREFILL": lambda: bool(int(os.getenv("FD_DISABLE_CHUNKED_PREFILL", "0"))),
 }
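As a quick illustration of the renamed flag, the snippet below mirrors the parsing done by the lambda above; exporting FD_DISABLE_CHUNKED_PREFILL=1 before launching would force chunked prefill off even where it is now enabled by default.

import os

os.environ["FD_DISABLE_CHUNKED_PREFILL"] = "1"
# bool(int(...)) is exactly how the environment_variables entry interprets the value.
print(bool(int(os.getenv("FD_DISABLE_CHUNKED_PREFILL", "0"))))  # True -> chunked prefill disabled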