Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-11-01 20:32:52 +08:00
[fix] fix requests & block metrics (#4325)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
* [fix] fix requests & block metrics
* [chore] rename variables
```diff
@@ -115,6 +115,7 @@ class PrefixCacheManager:

         main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
+        main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_resource.set(1.0)

     @property
```
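The new gauge is seeded at cache initialization so it reads correctly before the first allocation. As a rough illustration of the pattern these setters follow (a minimal sketch, not FastDeploy's actual MetricsManager wiring; the stand-alone gauge object and pool size below are assumptions for the example):

```python
# Minimal sketch: a prometheus_client Gauge driven with absolute
# .set() calls, mirroring how the hunk above seeds free_gpu_block_num.
from prometheus_client import Gauge

# Hypothetical stand-in for main_process_metrics.free_gpu_block_num.
free_gpu_block_num = Gauge(
    "fastdeploy:free_gpu_block_num",
    "Number of free gpu blocks in cache",
)

num_gpu_blocks = 1024  # assumed pool size for the example
free_gpu_block_num.set(num_gpu_blocks)  # seed at startup, as the hunk does
```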
```diff
@@ -274,6 +275,7 @@ class PrefixCacheManager:

         main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
+        main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_resource.set(1.0)

     def can_allocate_gpu_blocks(self, num_blocks: int):
```
```diff
@@ -311,8 +311,8 @@ class ResourceManager:
                 break

         # record batch size here
-        task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
-        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
+        num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
+        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks)
         main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
         main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())

```
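The rename makes clear what the sum actually counts: KV-cache blocks currently held by scheduled tasks. A toy sketch of that computation (the Task class below is a hypothetical stand-in; FastDeploy's task objects carry a block_tables list per request):

```python
# Toy sketch of num_blocks_used_by_tasks; Task is a hypothetical
# stand-in for FastDeploy's scheduled-task objects.
from dataclasses import dataclass, field

@dataclass
class Task:
    block_tables: list = field(default_factory=list)

tasks_list = [Task([0, 1, 2]), None, Task([3])]  # None marks a free slot
num_blocks_used_by_tasks = sum(len(t.block_tables) if t else 0 for t in tasks_list)
assert num_blocks_used_by_tasks == 4  # 3 blocks + empty slot + 1 block
```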
```diff
@@ -123,8 +123,6 @@ class ResourceManagerV1(ResourceManager):
                 llm_logger.info(f"Preemption is triggered! Preempted request id: {preempted_req.request_id}")
                 preempted_reqs.append(preempted_req)
                 scheduled_reqs.append(self._prepare_preempt_task(preempted_req))
-                main_process_metrics.num_requests_waiting.inc(1)
-                main_process_metrics.num_requests_running.dec(1)
                 if preempted_req == request:
                     # No more request to preempt.
                     can_schedule = False
```
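Dropping the paired inc()/dec() here goes together with the consolidated set() calls added at the end of schedule() (see the @@ -435 hunk below): an event-driven counter drifts as soon as one code path forgets its update, while recomputing the absolute value from scheduler state self-corrects every step. A toy illustration (the Gauge class and state below are stand-ins, not prometheus_client's):

```python
# Toy illustration of counter drift: a path that forgets its dec()
# leaves an event-driven gauge stale forever; an absolute set() from
# the ground-truth state repairs it on the next scheduling step.
class Gauge:
    def __init__(self) -> None:
        self.value = 0.0

    def inc(self, n: float) -> None:
        self.value += n

    def dec(self, n: float) -> None:
        self.value -= n

    def set(self, v: float) -> None:
        self.value = float(v)

running_gauge = Gauge()
running_gauge.inc(1)                # request scheduled
# ... a preemption path forgets running_gauge.dec(1) ...
actual_running: list = []           # hypothetical ground truth: nothing runs
running_gauge.set(len(actual_running))  # absolute update repairs the drift
assert running_gauge.value == 0.0
```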
```diff
@@ -381,8 +379,6 @@ class ResourceManagerV1(ResourceManager):
                     request, self.config.cache_config.block_size, request.num_computed_tokens
                 )
                 request.status = RequestStatus.RUNNING
-                main_process_metrics.num_requests_waiting.dec(1)
-                main_process_metrics.num_requests_running.inc(1)
                 allocated_position = self.get_available_position()
                 request.idx = allocated_position
                 self.tasks_list[allocated_position] = request
```
```diff
@@ -426,8 +422,6 @@ class ResourceManagerV1(ResourceManager):
                     request, self.config.cache_config.block_size, request.num_computed_tokens
                 )
                 request.status = RequestStatus.RUNNING
-                main_process_metrics.num_requests_waiting.dec(1)
-                main_process_metrics.num_requests_running.inc(1)
             else:
                 if self.config.cache_config.enable_prefix_caching:
                     self._free_blocks(request)
```
```diff
@@ -435,11 +429,17 @@ class ResourceManagerV1(ResourceManager):
             else:
                 llm_logger.error("Unknown request status type")
         if scheduled_reqs:
-            task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
-            main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
-            main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
-            main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
             llm_logger.debug(f"schedued_reqs: {scheduled_reqs}")

+        # Update metrics
+        num_tasks = sum([1 if task else 0 for task in self.tasks_list])
+        num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
+        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks)
+        main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
+        main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
+        main_process_metrics.num_requests_running.set(len(self.running))
+        main_process_metrics.num_requests_waiting.set(num_tasks - len(self.running))
+
         return scheduled_reqs

     def get_available_position(self) -> int:
```
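The consolidated block now runs on every call, outside `if scheduled_reqs:`, so the gauges stay current even on steps that schedule nothing. The waiting count is derived rather than tracked: every occupied slot in tasks_list is a live request, and whatever is not in self.running is treated as waiting. A sketch of that derivation (all values below are stand-ins for illustration):

```python
# Sketch of the waiting-count derivation in the hunk above.
tasks_list = ["req-1", "req-2", None, "req-3"]  # occupied scheduler slots
running = ["req-1", "req-3"]

num_tasks = sum(1 if task else 0 for task in tasks_list)
num_waiting = num_tasks - len(running)
assert (num_tasks, num_waiting) == (3, 1)
```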
```diff
@@ -311,7 +311,7 @@ class MetricsManager:
         "available_gpu_block_num": {
             "type": Gauge,
             "name": "fastdeploy:available_gpu_block_num",
-            "description": "Number of available gpu blocks in cache, including prefix caching blocks that are not officially released",
+            "description": "Number of available gpu blocks in cache, including blocks in LRU list",
             "kwargs": {},
         },
         "free_gpu_block_num": {
```
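Only the description string changes here. For context, a spec table of this shape can be materialized into live metrics roughly as follows (a hedged sketch under the assumption that each entry maps onto the prometheus_client constructor; FastDeploy's actual MetricsManager wiring may differ):

```python
# Hedged sketch: turning a metric-spec table like the one above into
# live prometheus_client objects. Not FastDeploy's actual code.
from prometheus_client import Gauge

SPECS = {
    "available_gpu_block_num": {
        "type": Gauge,
        "name": "fastdeploy:available_gpu_block_num",
        "description": "Number of available gpu blocks in cache, including blocks in LRU list",
        "kwargs": {},
    },
}

# Instantiate each spec: type(name, description, **kwargs).
metrics = {
    key: spec["type"](spec["name"], spec["description"], **spec["kwargs"])
    for key, spec in SPECS.items()
}
metrics["available_gpu_block_num"].set(0)
```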
```diff
@@ -248,9 +248,12 @@ class TokenProcessor:
                 self.resource_manager.tasks_list[index] = None
                 self.resource_manager._recycle_block_tables(task)

-                task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.resource_manager.tasks_list])
+                # Update block metrics
+                num_blocks_used_by_tasks = sum(
+                    [len(task.block_tables) if task else 0 for task in self.resource_manager.tasks_list]
+                )
                 main_process_metrics.available_gpu_block_num.set(
-                    self.resource_manager.total_block_number() - task_used_block_num
+                    self.resource_manager.total_block_number() - num_blocks_used_by_tasks
                 )
                 main_process_metrics.batch_size.set(
                     self.resource_manager.max_num_seqs - self.resource_manager.available_batch()
```
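Note the ordering in this hunk: blocks are recycled back to the pool before the gauges are recomputed, so the refreshed available_gpu_block_num already counts the just-freed blocks. A stand-in sketch of that ordering (not the real ResourceManager):

```python
# Stand-in sketch: recycle first, then recompute, so freed blocks are
# visible in the refreshed availability number.
class ResourceManagerStub:
    def __init__(self) -> None:
        self.tasks_list = [["b0", "b1"], ["b2"]]  # per-slot block tables
        self.total_blocks = 8

    def recycle(self, index: int) -> None:
        self.tasks_list[index] = None  # blocks return to the free pool

rm = ResourceManagerStub()
rm.recycle(0)
used = sum(len(t) if t else 0 for t in rm.tasks_list)
print(rm.total_blocks - used)  # 7 blocks available after recycling
```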