Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Bug fix] Test td cache messager (#3242)
* support disable cache task in decode node
* fix bugs
* Update engine.py
* Update expert_service.py
* Update splitwise_connector.py
* Optimize log for debug
* fix bug

Co-authored-by: ltd0924 <ltd0924@sina.com>
Co-authored-by: ltd0924 <32387785+ltd0924@users.noreply.github.com>
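In short: the decode node can now opt out of cache tasks. `ExpertService` reads `envs.FD_ENABLE_CACHE_TASK` once at start-up, and when the switch is off, requests that cannot be scheduled are failed back to the prefill side instead of being parked in the waiting queue (see the expert_service.py hunks below). Assuming the switch is an ordinary environment variable read by FastDeploy's `envs` module, turning it off before launching a decode-role service might look roughly like this sketch:

```python
# Sketch only: the authoritative default and parsing of FD_ENABLE_CACHE_TASK
# live in FastDeploy's envs module; "0"/"1" semantics are assumed here.
import os

os.environ["FD_ENABLE_CACHE_TASK"] = "0"  # disable decode-side cache tasks

# ...start the decode-role service afterwards; per the diff, ExpertService
# stores the value once as self.enable_decode_cache_task.
```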
@@ -26,6 +26,7 @@ from collections import deque
 import numpy as np

+from fastdeploy.engine.request import RequestOutput
 from fastdeploy.engine.resource_manager import ResourceManager
 from fastdeploy.inter_communicator import EngineWorkerQueue
 from fastdeploy.metrics.metrics import main_process_metrics
@@ -34,6 +35,7 @@ from fastdeploy.splitwise.internal_adapter_utils import InternalAdapter
 from fastdeploy.splitwise.splitwise_connector import SplitwiseConnector
 from fastdeploy.utils import EngineError, console_logger, envs, get_logger, llm_logger


 class ExpertService:
     """
     Engine class responsible for managing the Large Language Model (LLM) operations.
@@ -146,7 +148,7 @@ class ExpertService:
         # Start TokenProcessor thread
         os.environ["INFERENCE_MSG_QUEUE_ID"] = str(local_data_parallel_id + int(self.cfg.engine_worker_queue_port))

+        self.enable_decode_cache_task = envs.FD_ENABLE_CACHE_TASK
         self.token_processor.run()

         self.cfg.init_cache_info()
@@ -262,11 +264,15 @@ class ExpertService:
                 if self.resource_manager.is_resource_sufficient(task.prompt_token_ids_len):
                     self.insert_tasks([task])
                 else:
+                    if not self.enable_decode_cache_task:
+                        task.error_msg = "Not enough resources"
                     new_waiting.append(task)

             if new_waiting:
-                self.waiting_requests.extend(new_waiting)
-                self.llm_logger.info(f"Added {len(new_waiting)} tasks to waiting queue")
+                if not self.enable_decode_cache_task:
+                    self.split_connector.send_cache_infos(new_waiting, -1)
+                else:
+                    self.waiting_requests.extend(new_waiting)
+                    self.llm_logger.info(f"Added {len(new_waiting)} tasks to waiting queue")

         else:
             time.sleep(0.001)
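Taken together, the hunk above ties the "not enough resources" case to the new switch: with cache tasks disabled, the decode node stamps an error message on each task and reports the batch back over the splitwise connector instead of queueing it. A minimal standalone sketch of that branch (`connector` and `waiting_requests` stand in for `self.split_connector` and the service's waiting queue):

```python
def route_unschedulable(new_waiting, waiting_requests, connector, logger,
                        enable_decode_cache_task):
    """Sketch of the fallback added above, outside the ExpertService class."""
    if not new_waiting:
        return
    if not enable_decode_cache_task:
        # Cache tasks disabled: the tasks already carry
        # error_msg = "Not enough resources"; notify the prefill side with
        # the -1 marker rather than keeping them on the decode node.
        connector.send_cache_infos(new_waiting, -1)
    else:
        # Original behaviour: park the tasks until resources free up.
        waiting_requests.extend(new_waiting)
        logger.info(f"Added {len(new_waiting)} tasks to waiting queue")
```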
@@ -310,8 +316,24 @@ class ExpertService:
         if not isinstance(tasks, list):
             tasks = [tasks]

-        for item in tasks:
-            item.schedule_start_time = time.time()
+        for task in tasks:
+            if self.cfg.splitwise_role != "mixed":
+                status, msg = self.split_connector.check_decode_allocated(task)
+                if not status:
+                    self.llm_logger.error(f"{task.request_id} prefill failed with msg:{msg}.")
+                    self.scheduler.put_results(
+                        [
+                            RequestOutput(
+                                request_id=task.request_id,
+                                finished=True,
+                                error_code=500,
+                                error_msg=msg,
+                            )
+                        ]
+                    )
+                    tasks.remove(task)
+                    continue
+            task.schedule_start_time = time.time()

         available_batch = np.sum(self.resource_manager.stop_flags)
         if len(tasks) > available_batch:
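The bulk of the hunk above guards `insert_tasks` against decode allocations that never happened: in splitwise (non-"mixed") mode each task is checked with `check_decode_allocated`, and a failure is surfaced to the client as a finished `RequestOutput` with `error_code=500` before the task is dropped. A condensed sketch of that path, with the connector, scheduler, and logger treated as opaque collaborators:

```python
import time

from fastdeploy.engine.request import RequestOutput


def filter_unallocated(tasks, connector, scheduler, logger, splitwise_role):
    """Sketch of the prefill-failure path added to insert_tasks above."""
    kept = []
    for task in tasks:
        if splitwise_role != "mixed":
            status, msg = connector.check_decode_allocated(task)
            if not status:
                logger.error(f"{task.request_id} prefill failed with msg:{msg}.")
                scheduler.put_results(
                    [RequestOutput(request_id=task.request_id, finished=True,
                                   error_code=500, error_msg=msg)]
                )
                continue  # drop the task instead of scheduling it
        task.schedule_start_time = time.time()
        kept.append(task)
    return kept
```

Collecting the surviving tasks into a new list is a deliberate choice in this sketch: the diff instead calls `tasks.remove(task)` while iterating over `tasks`, which skips the element that follows each removal.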