Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-12-24 13:28:13 +08:00
[Feature] mm support prefix cache (#4134)
* support mm prefix caching
* update code
* fix mm_hashes
* support encoder cache
* add encoder cache
* update code
* update encoder cache
* fix features bug
* fix worker bug
* support processor cache, need to optimize yet
* refactor multimodal data cache
* update code
* update code
* update v1 scheduler
* update code
* update code
* update codestyle
* support turn off processor cache and encoder cache
* update pre-commit
* fix code
* solve review
* update code
* update code
* update test case
* set processor cache in GiB
* update test case
* support mm prefix caching for qwen model
* fix code style check
* update pre-commit
* fix unit test
* fix unit test
* add ci test case
* fix rescheduled bug
* change text_after_process to prompt_tokens
* fix unit test
* fix chat template
* change model path
* [EP] fix adapter bugs (#4572)
* Update expert_service.py
* Update common_engine.py
* Update expert_service.py
* fix v1 hang bug (#4573)
* fix import image_ops error on some platforms (#4559)
* [CLI]Update parameters in bench latecy cli tool and fix collect-env cli tool (#4558)
* add collect-env
* del files
* [Graph Optimization] Add dy_runnable and introduce cudagraph_switch_threshold for cudagraph mode switching (#4578)
* add new branch for sot
* reorder
* fix batch bug
* [XPU]Moe uses a new operator (#4585)
* [XPU]Moe uses a new operator
* [XPU]Moe uses a new operator
* update response
* [Feature] Support Paddle-OCR (#4396)
* init
* update code
* fix code style & disable thinking
* adapt for common_engine.update_mm_requests_chunk_size
* use 3d rope
* use flash_attn_unpadded
* opt siglip
* update to be compatible with the latest codebase
* fix typo
* optim OCR performance
* fix bug
* fix bug
* fix bug
* fix bug
* normlize name
* modify xpu rope
* revert logger
* fix bug
* fix bug
* fix bug
* support default_v1
* optim performance
* fix bug

---------

Co-authored-by: root <root@szzj-acg-tge1-fdda9.szzj.baidu.com>
Co-authored-by: zhangyue66 <zhangyue66@baidu.com>

* [DataProcessor] add reasoning_tokens into usage info (#4520)
* add reasoning_tokens into usage info initial commit
* add unit tests
* modify unit test
* modify and add unit tests
* fix unit test
* move steam usage to processor
* modify processor
* modify test_logprobs
* modify test_logprobs.py
* modify stream reasoning tokens accumulation
* fix unit test
* perf: Optimize task queue communication from engine to worker (#4531)
* perf: Optimize task queue communication from engine to worker
* perf: get_tasks to numpy
* perf: get_tasks remove to_numpy
* fix: request & replace ENV
* remove test_e2w_perf.py
* fix code style

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>

* Clean up ports after processing results (#4587)
* [CI] Add /re-run command in PR comments to restart failed CI workflows (#4593)
* [Others] api server exits when worker process is dead (#3271)
* [fix] fix terminal hangs when worker process is dead
* [chore] change sleep time of monitor
* [chore] remove redundant comments
* update docs

---------

Co-authored-by: ApplEOFDiscord <wwy640130@163.com>
Co-authored-by: ApplEOFDiscord <31272106+ApplEOFDiscord@users.noreply.github.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
Co-authored-by: yinwei <yinwei_hust@163.com>
Co-authored-by: JYChen <zoooo0820@qq.com>
Co-authored-by: qwes5s5 <45442318+qwes5s5@users.noreply.github.com>
Co-authored-by: Ryan <zihaohuang@aliyun.com>
Co-authored-by: yyssys <atyangshuang@foxmail.com>
Co-authored-by: ming1753 <61511741+ming1753@users.noreply.github.com>
Co-authored-by: root <root@szzj-acg-tge1-fdda9.szzj.baidu.com>
Co-authored-by: zhangyue66 <zhangyue66@baidu.com>
Co-authored-by: kxz2002 <115912648+kxz2002@users.noreply.github.com>
Co-authored-by: SunLei <sunlei5788@gmail.com>
Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
Co-authored-by: Zhang Yulong <35552275+ZhangYulongg@users.noreply.github.com>
Co-authored-by: YuBaoku <49938469+EmmonsCurse@users.noreply.github.com>
Co-authored-by: 李泳桦 <39643373+liyonghua0910@users.noreply.github.com>
@@ -123,6 +123,14 @@ class EngineArgs:
     """
     Limit on the number of multi-modal data items.
     """
+    max_encoder_cache: int = -1
+    """
+    Maximum number of tokens in the encoder cache.
+    """
+    max_processor_cache: float = -1
+    """
+    Maximum size of the processor cache in GiB.
+    """
     reasoning_parser: str = None
     """
     Specifies the reasoning parser to use for extracting reasoning content from the model output.
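These two fields are the only new EngineArgs knobs in this change: max_encoder_cache is a token budget, max_processor_cache a GiB budget, and both default to -1. A minimal sketch of setting them when constructing EngineArgs directly (the import path and the model value are assumptions, not taken from this diff):

    # Sketch only: assumes EngineArgs is importable from fastdeploy.engine.args_utils
    # and accepts these fields as keyword arguments; the model name is a placeholder.
    from fastdeploy.engine.args_utils import EngineArgs

    engine_args = EngineArgs(
        model="some/multimodal-model",   # placeholder
        max_encoder_cache=8192,          # keep up to 8192 encoder tokens cached
        max_processor_cache=4.0,         # keep up to 4 GiB of preprocessed multimodal data
    )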
@@ -526,6 +534,18 @@ class EngineArgs:
             type=json.loads,
             help="Additional keyword arguments for the multi-modal processor.",
         )
+        model_group.add_argument(
+            "--max-encoder-cache",
+            default=EngineArgs.max_encoder_cache,
+            type=int,
+            help="Maximum number of encoder cache tokens (use 0 to disable).",
+        )
+        model_group.add_argument(
+            "--max-processor-cache",
+            default=EngineArgs.max_processor_cache,
+            type=float,
+            help="Maximum processor cache size in GiB (use 0 to disable).",
+        )
         model_group.add_argument(
             "--enable-mm",
             action=DeprecatedOptionWarning,
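The flags map one-to-one onto the dataclass fields above. A standalone sketch of the parsing behaviour (plain argparse, only the two new options, defaults copied from the diff; this is not the FastDeploy parser itself):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--max-encoder-cache", default=-1, type=int,
                        help="Maximum number of encoder cache tokens (use 0 to disable).")
    parser.add_argument("--max-processor-cache", default=-1, type=float,
                        help="Maximum processor cache size in GiB (use 0 to disable).")

    ns = parser.parse_args(["--max-encoder-cache", "8192", "--max-processor-cache", "4"])
    print(ns.max_encoder_cache, ns.max_processor_cache)  # 8192 4.0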
@@ -634,13 +634,9 @@ class EngineService:
                 int(self.resource_manager.available_batch()),
                 self.cfg.max_prefill_batch,
             )
-            if self.cfg.model_config.enable_mm:
-                available_blocks = self.resource_manager.available_block_num()
-            else:
-                available_blocks = self.cfg.cache_config.max_block_num_per_seq
 
             tasks = self.scheduler.get_requests(
-                available_blocks=available_blocks,
+                available_blocks=self.cfg.cache_config.max_block_num_per_seq,
                 block_size=self.cfg.cache_config.block_size,
                 reserved_output_blocks=self.cfg.cache_config.enc_dec_block_num,
                 max_num_batched_tokens=self.cfg.model_config.max_model_len,
@@ -528,6 +528,7 @@ class LLMEngine:
                 f" --load_choices {self.cfg.load_config.load_choices}"
                 f" --plas_attention_config '{self.cfg.plas_attention_config.to_json_string()}'"
                 f" --ips {ips}"
+                f" --max_encoder_cache {self.cfg.cache_config.max_encoder_cache}"
                 f" --cache-transfer-protocol {self.cfg.cache_config.cache_transfer_protocol}"
                 f" --runner {self.cfg.model_config.runner}"
                 f" --convert {self.cfg.model_config.convert}"
@@ -46,6 +46,12 @@ class RequestType(Enum):
     EXTEND = 3
 
 
+@dataclass
+class ImagePosition:
+    offset: int = 0
+    length: int = 0
+
+
 @dataclass
 class Request:
     def __init__(
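As used later in _update_mm_hashes, ImagePosition records where a multimodal item's placeholder tokens sit in the prompt: offset is the index of its first placeholder token, length the number of tokens it occupies. A small illustration with made-up values:

    from dataclasses import dataclass

    @dataclass
    class ImagePosition:      # mirrors the dataclass added above
        offset: int = 0       # index of the item's first placeholder token
        length: int = 0       # number of placeholder tokens it occupies

    # Hypothetical prompt with two images of 64 placeholder tokens each,
    # the first starting at token 5 and the second immediately after it.
    positions = [ImagePosition(5, 64), ImagePosition(69, 64)]
    print(positions[1].offset + positions[1].length)  # 133, end of the second image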
@@ -28,10 +28,21 @@ import numpy as np
 import paddle
 
 from fastdeploy import envs
-from fastdeploy.engine.request import Request, RequestOutput, RequestStatus, RequestType
+from fastdeploy.cache_manager.multimodal_cache_manager import (
+    EncoderCacheManager,
+    ProcessorCacheManager,
+)
+from fastdeploy.engine.request import (
+    ImagePosition,
+    Request,
+    RequestOutput,
+    RequestStatus,
+    RequestType,
+)
 from fastdeploy.engine.resource_manager import ResourceManager
 from fastdeploy.inter_communicator import IPCSignal
 from fastdeploy.metrics.metrics import main_process_metrics
+from fastdeploy.multimodal.hasher import MultimodalHasher
 from fastdeploy.platforms import current_platform
 from fastdeploy.utils import llm_logger
@@ -175,6 +186,15 @@ class ResourceManagerV1(ResourceManager):
 
         self.need_block_num_map = dict()
 
+        self.encoder_cache = None
+        if config.model_config.enable_mm and config.cache_config.max_encoder_cache > 0:
+            self.encoder_cache = EncoderCacheManager(config.cache_config.max_encoder_cache)
+
+        self.processor_cache = None
+        if config.model_config.enable_mm and config.cache_config.max_processor_cache > 0:
+            max_processor_cache_in_bytes = int(config.cache_config.max_processor_cache * 1024 * 1024 * 1024)
+            self.processor_cache = ProcessorCacheManager(max_processor_cache_in_bytes)
+
     def allocated_slots(self, request: Request):
         return len(request.block_tables) * self.config.cache_config.block_size
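Both caches are created only when multimodal input is enabled and the corresponding limit is positive, so the -1 defaults and an explicit 0 leave them off. The processor cache limit is given in GiB and converted to bytes; a worked example of that conversion (numbers are illustrative):

    # Worked example of the GiB-to-bytes conversion used above.
    max_processor_cache = 4.0                # CLI value, in GiB
    max_processor_cache_in_bytes = int(max_processor_cache * 1024 * 1024 * 1024)
    print(max_processor_cache_in_bytes)      # 4294967296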
@@ -273,6 +293,44 @@ class ResourceManagerV1(ResourceManager):
                     break
         return can_schedule
 
+    def _update_mm_hashes(self, request):
+        if request.multimodal_inputs is None:
+            return
+
+        inputs = request.multimodal_inputs
+        if (
+            inputs.get("images", None) is not None
+            and inputs.get("image_patch_id", None) is not None
+            and inputs.get("grid_thw", None) is not None
+            and len(inputs["grid_thw"]) != 0
+        ):
+            grid_thw = []
+            new_mm_positions, new_mm_hashes = [], []
+            image_st = 0
+            for idx, one in enumerate(inputs["grid_thw"]):
+                t, h, w = one[0], one[1], one[2]
+                if t == 1:
+                    grid_thw.append(one)
+                    new_mm_positions.append(inputs["mm_positions"][idx])
+                    new_mm_hashes.append(inputs["mm_hashes"][idx])
+                    image_st += h * w
+                else:
+                    grid_thw.extend([[2, h, w]] * (t // 2))
+                    token_st = inputs["mm_positions"][idx].offset
+                    for _ in range(t // 2):
+                        new_mm_positions.append(ImagePosition(token_st, h * w // 4))
+                        # videos are split into patches every 2 frames, need to rehash
+                        new_mm_hashes.append(
+                            MultimodalHasher.hash_features(inputs["images"][image_st : image_st + 2 * h * w])
+                        )
+                        image_st += 2 * h * w
+                        token_st += h * w // 4
+            inputs["mm_positions"] = new_mm_positions
+            inputs["mm_hashes"] = new_mm_hashes
+        else:
+            inputs["mm_positions"] = []
+            inputs["mm_hashes"] = []
+
     def _get_num_new_tokens(self, request, token_budget):
         # TODO: set condition to new _get_num_new_tokens
         num_new_tokens = request.need_prefill_tokens - request.num_computed_tokens
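_update_mm_hashes leaves single-frame items (images) untouched and splits every video into patches of two frames, so that a prefix-cache hit can cover part of a video: each patch gets its own ImagePosition and its own hash over the corresponding feature rows. A self-contained sketch of the bookkeeping, with the hasher stubbed out; the h * w // 4 tokens-per-patch figure is taken from the diff and is assumed to reflect the model's 2x2 spatial merge over two frames:

    # Standalone sketch of the video-splitting bookkeeping; hashing is stubbed out.
    def split_video(grid, first_token_offset, hash_fn=hash):
        t, h, w = grid                                   # frames, grid height, grid width
        positions, hashes = [], []
        token_st = first_token_offset
        for patch_idx in range(t // 2):                  # one patch per 2 frames
            positions.append((token_st, h * w // 4))     # (offset, length) in placeholder tokens
            hashes.append(hash_fn(("video-patch", patch_idx)))  # stand-in for MultimodalHasher.hash_features
            token_st += h * w // 4
        return positions, hashes

    positions, hashes = split_video([4, 16, 16], first_token_offset=10)
    print(positions)  # [(10, 64), (74, 64)]: two 2-frame patches, 64 tokens each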
@@ -333,11 +391,12 @@ class ResourceManagerV1(ResourceManager):
 
         if request.multimodal_img_boundaries is None:
             grid_thw = []
-            for one in inputs["grid_thw"]:
-                if one[0] == 1:
+            for idx, one in enumerate(inputs["grid_thw"]):
+                t, h, w = one[0], one[1], one[2]
+                if t == 1:
                     grid_thw.append(one)
                 else:
-                    grid_thw.extend([[2, one[1], one[2]]] * (one[0] // 2))
+                    grid_thw.extend([[2, h, w]] * (t // 2))
 
             grid_thw = paddle.to_tensor(grid_thw, dtype="int64")
             if current_platform.is_xpu():
@@ -398,6 +457,11 @@ class ResourceManagerV1(ResourceManager):
             request.image_start = np.sum(np.prod(grid_thw[: request.num_image_start], axis=1))
             request.image_end = np.sum(np.prod(grid_thw[: request.num_image_end], axis=1))
 
+            cur_mm_hashes = inputs["mm_hashes"][request.num_image_start : request.num_image_end]
+            cur_mm_positions = inputs["mm_positions"][request.num_image_start : request.num_image_end]
+            if self.encoder_cache:
+                request.evict_mm_hashes = self.encoder_cache.apply_cache(cur_mm_hashes, cur_mm_positions)
+
         # Compatible with scenarios without images and videos.
         return num_new_tokens
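EncoderCacheManager itself is not shown in this diff; from its use here, apply_cache takes the request's multimodal hashes and positions and returns the hashes that had to be evicted, which are stored on request.evict_mm_hashes. A hypothetical, heavily simplified sketch of that contract, purely for illustration and not the real class:

    from collections import OrderedDict
    from dataclasses import dataclass

    @dataclass
    class Item:               # stand-in for ImagePosition; only .length is used
        offset: int
        length: int

    class ToyEncoderCache:
        """Hypothetical LRU cache keyed by multimodal hashes, bounded by a token budget."""

        def __init__(self, max_tokens):
            self.max_tokens = max_tokens
            self.entries = OrderedDict()      # hash -> encoder tokens held by that entry

        def apply_cache(self, mm_hashes, mm_positions):
            # Touch (or admit) every item for this request, then evict the least
            # recently used entries until the token budget fits; report what was evicted.
            for h, pos in zip(mm_hashes, mm_positions):
                self.entries[h] = pos.length
                self.entries.move_to_end(h)
            evicted = []
            while self.entries and sum(self.entries.values()) > self.max_tokens:
                old_hash, _ = self.entries.popitem(last=False)
                evicted.append(old_hash)
            return evicted

    cache = ToyEncoderCache(max_tokens=128)
    print(cache.apply_cache(["h1", "h2", "h3"], [Item(0, 64), Item(64, 64), Item(128, 64)]))  # ['h1']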
@@ -553,6 +617,7 @@ class ResourceManagerV1(ResourceManager):
                 break
             request = self.waiting[0]
             if request.status == RequestStatus.WAITING:
+                self._update_mm_hashes(request)
                 # Enable prefix caching
                 if self.config.cache_config.enable_prefix_caching:
                     if (