Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 16:48:03 +08:00)
[feat] support prefix cache clearing when /clear_load_weight is called (#4091)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
* [feat] support clearing prefix cache (cherry-picked from release/2.1)
* [fix] fix ipc suffix, use port instead
* [fix] fix prefix caching not enabled
* [fix] fix code style
* [fix] wait for rank0 to update weight status
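For orientation, the sketch below illustrates the behaviour the commit message describes: dropping cached prefix (KV) blocks when a clear-load-weight request arrives, then waiting for rank 0 to report that the new weights are in place. All names here (PrefixCache, clear_load_weight, weight_updated) are illustrative stand-ins, not FastDeploy's actual API.

# Hypothetical illustration only -- these names are not FastDeploy's real API.
import threading


class PrefixCache:
    """Toy stand-in for a prefix (KV-block) cache keyed by token-id prefixes."""

    def __init__(self):
        self._blocks = {}
        self._lock = threading.Lock()

    def put(self, prefix, block):
        with self._lock:
            self._blocks[tuple(prefix)] = block

    def clear(self):
        with self._lock:
            self._blocks.clear()


def clear_load_weight(cache, weight_updated, timeout_s=30.0):
    """Drop cached prefixes, then wait for rank 0 to report the weight update."""
    cache.clear()  # cached KV blocks are stale once the weights change
    return weight_updated.wait(timeout=timeout_s)


if __name__ == "__main__":
    cache = PrefixCache()
    cache.put([1, 2, 3], object())
    weight_updated = threading.Event()
    threading.Timer(0.1, weight_updated.set).start()  # simulate rank 0 finishing its update
    print("weights reloaded:", clear_load_weight(cache, weight_updated))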
@@ -123,12 +123,12 @@ class LLMEngine:
        llm_logger.info(f"Start zmq server, api_server_pid: {api_server_pid}")
        self.engine.start_zmq_service(api_server_pid)

        if self.do_profile == 0 and (
            self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed"
        ):
        # If block number is specified and model is deployed in mixed mode, start cache manager first
        if not self.do_profile and self.cfg.splitwise_role != "mixed":
            device_ids = self.cfg.device_ids.split(",")
            self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)
            self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, True)

        # Start workers
        self.worker_proc = self._start_worker_service()
        console_logger.info("Waiting worker processes ready...")
        time.sleep(5)
@@ -154,11 +154,17 @@ class LLMEngine:
                return False
            time.sleep(1)

        # If block number is not specified, let workers do profiling to determine the block number,
        # and then start the cache manager
        if self.do_profile:
            self._stop_profile()
        elif self.cfg.cache_config.enable_prefix_caching:
            device_ids = self.cfg.device_ids.split(",")
            self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix, False)

        # Launch components: scheduler, cache_manager, expert_service, etc.
        self.launch_components()
        if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
            if self.cfg.splitwise_role != "mixed":
                self.launched_cache_manager_signal.value[0] = 1

        # Worker launched
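Read together, the branch above amounts to: if profiling was requested, _stop_profile() brings up the cache manager afterwards; otherwise, when prefix caching is enabled, the cache service starts right after the workers with its new third argument set to False. A tiny paraphrase of that decision follows; the helper name is ours, not FastDeploy code.

# Illustrative paraphrase of the post-worker branch above; not FastDeploy code.
def post_worker_cache_plan(do_profile: bool, enable_prefix_caching: bool) -> str:
    if do_profile:
        # Workers profile the block number first; _stop_profile() then starts the cache manager.
        return "stop_profile_then_start_cache"
    if enable_prefix_caching:
        # Block number was given; start the cache service now, passing False as the third argument.
        return "start_cache_service_false"
    return "no_cache_service"


assert post_worker_cache_plan(True, True) == "stop_profile_then_start_cache"
assert post_worker_cache_plan(False, True) == "start_cache_service_false"
assert post_worker_cache_plan(False, False) == "no_cache_service"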
@@ -168,6 +174,23 @@ class LLMEngine:
                return False

        console_logger.info(f"Worker processes are launched with {time.time() - start_time} seconds.")

        # Print blocks number & max running requests to console
        if envs.ENABLE_V1_KVCACHE_SCHEDULER:
            block_size = self.cfg.cache_config.block_size
            num_gpu_blocks = self.cfg.cache_config.num_gpu_blocks_override or self.cfg.cache_config.total_block_num
            num_cpu_blocks = self.cfg.cache_config.num_cpu_blocks
            max_running_requests = min(
                (num_gpu_blocks + num_cpu_blocks) * block_size // self.cfg.max_model_len, self.cfg.max_num_seqs
            )
            console_logger.info(
                f"Detected {num_gpu_blocks} gpu blocks and {num_cpu_blocks} cpu blocks in cache (block size: {block_size})."
            )
            console_logger.info(
                f"FastDeploy will be serving {max_running_requests} running requests "
                f"if each sequence reaches its maximum length: {self.cfg.max_model_len}"
            )

        return True

    def _get_generated_result(self):
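The new log lines derive a serving capacity from the cache size: total cacheable tokens divided by the maximum model length, capped by max_num_seqs. A quick worked example with made-up numbers (none of these are FastDeploy defaults):

# Worked example of the max_running_requests calculation shown above.
# All values below are illustrative, not defaults.
block_size = 64
num_gpu_blocks = 8000
num_cpu_blocks = 2000
max_model_len = 8192
max_num_seqs = 256

total_tokens = (num_gpu_blocks + num_cpu_blocks) * block_size   # 640,000 cacheable tokens
by_cache = total_tokens // max_model_len                        # 78 worst-case full-length requests
max_running_requests = min(by_cache, max_num_seqs)
print(max_running_requests)  # 78: cache capacity, not max_num_seqs, is the binding limit here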
@@ -580,7 +603,9 @@ class LLMEngine:
        self.engine.resource_manager.reset_cache_config(self.cfg.cache_config)
        if self.cfg.cache_config.enable_prefix_caching or self.cfg.splitwise_role != "mixed":
            device_ids = self.cfg.device_ids.split(",")
            self.cache_manager_processes = self.engine.start_cache_service(device_ids, self.ipc_signal_suffix)
            self.cache_manager_processes = self.engine.start_cache_service(
                device_ids, self.ipc_signal_suffix, self.cfg.splitwise_role != "mixed"
            )

    def check_health(self, time_interval_threashold=30):
        """