Mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-12-24 13:28:13 +08:00)
[Optimization] Improve perf for fd response token with internal adapter (#4992)
* [Optimize] Improve perf for fd response token with internal adapter
* fix
* fix bug
* fix ci
* fix ci
* fix ci
* fix ci
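The core change: when FD_ENABLE_INTERNAL_ADAPTER is set, EngineService now gathers every step's outputs into one nested list and pushes them through the response server with a single send_response call keyed by None, instead of issuing one ZMQ send per request id. A minimal sketch of that before/after send pattern follows; FakeResponseServer and the sample outputs are invented for illustration, only the call shape mirrors the diff below.

# Illustrative sketch only: shows the per-request vs. batched send pattern.
# FakeResponseServer and the sample outputs are hypothetical; only the
# send_response call shape mirrors the real change.
class FakeResponseServer:
    def send_response(self, request_id, contents):
        print(f"zmq send -> key={request_id}, items={len(contents)}")

server = FakeResponseServer()
step_outputs = {"req-0": ["tok_a"], "req-1": ["tok_b", "tok_c"]}

# Old path: one ZMQ message per request id.
for request_id, contents in step_outputs.items():
    server.send_response(request_id, contents)

# New internal-adapter path: one ZMQ message carrying all requests' outputs,
# wrapped as a list of per-step batches.
batched = [[content for contents in step_outputs.values() for content in contents]]
server.send_response(None, batched)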
@@ -958,7 +958,10 @@ class EngineService:
                     )
                     # Since the request is not in scheduler
                     # Send result by zmq directly
-                    self.send_response_server.send_response(request_id, [error_result])
+                    if envs.FD_ENABLE_INTERNAL_ADAPTER:
+                        self.send_response_server.send_response(None, [[error_result]])
+                    else:
+                        self.send_response_server.send_response(request_id, [error_result])

     def _decode_token(self, token_ids, req_id, is_end):
         delta_text = ""
@@ -984,33 +987,67 @@ class EngineService:
                 if len(results) == 0:
                     time.sleep(0.005)
                     continue
-                for request_id, contents in results.items():
-                    new_contents = []
-                    for content in contents:
-                        if isinstance(content, RequestOutput) and content.outputs is not None:
-                            decode_type = content.outputs.decode_type
-                            delta_text = ""
-                            if decode_type == 0:
-                                delta_text, token_ids = self._decode_token(
-                                    token_ids=content.outputs.token_ids, req_id=request_id, is_end=content.finished
-                                )
-                            else:
-                                token_ids = content.outputs.token_ids
-                            if len(token_ids):
-                                content.outputs.token_ids = token_ids
-                                content.outputs.text = delta_text
-                                new_contents.append(content)
-                            elif content.finished:
-                                new_contents.append(content)
-                            else:
-                                llm_logger.warning(
-                                    f"current tokens need to accumulate, req_id: {request_id} {content.outputs.token_ids}"
-                                )
-                        else:
-                            new_contents.append(content)
-                    if len(new_contents):
-                        llm_logger.debug(f"Send response for request id: {request_id}, {new_contents}")
-                        self.send_response_server.send_response(request_id, new_contents)
+                if envs.FD_ENABLE_INTERNAL_ADAPTER:
+                    new_contents = []
+                    for step_batch_results in results:
+                        new_step_contents = []
+                        for content in step_batch_results:
+                            if isinstance(content, RequestOutput) and content.outputs is not None:
+                                decode_type = content.outputs.decode_type
+                                delta_text = ""
+                                if decode_type == 0:
+                                    delta_text, token_ids = self._decode_token(
+                                        token_ids=content.outputs.token_ids,
+                                        req_id=content.request_id,
+                                        is_end=content.finished,
+                                    )
+                                else:
+                                    token_ids = content.outputs.token_ids
+                                if len(token_ids):
+                                    content.outputs.token_ids = token_ids
+                                    content.outputs.text = delta_text
+                                    new_step_contents.append(content)
+                                elif content.finished:
+                                    new_step_contents.append(content)
+                                else:
+                                    llm_logger.warning(
+                                        f"current tokens need to accumulate, req_id: {content.request_id} {content.outputs.token_ids}"
+                                    )
+                            else:
+                                new_step_contents.append(content)
+                        if new_step_contents:
+                            new_contents.append(new_step_contents)
+                    if new_contents:
+                        self.send_response_server.send_response(None, new_contents)
+
+                else:
+                    for request_id, contents in results.items():
+                        new_contents = []
+                        for content in contents:
+                            if isinstance(content, RequestOutput) and content.outputs is not None:
+                                decode_type = content.outputs.decode_type
+                                delta_text = ""
+                                if decode_type == 0:
+                                    delta_text, token_ids = self._decode_token(
+                                        token_ids=content.outputs.token_ids, req_id=request_id, is_end=content.finished
+                                    )
+                                else:
+                                    token_ids = content.outputs.token_ids
+                                if len(token_ids):
+                                    content.outputs.token_ids = token_ids
+                                    content.outputs.text = delta_text
+                                    new_contents.append(content)
+                                elif content.finished:
+                                    new_contents.append(content)
+                                else:
+                                    llm_logger.warning(
+                                        f"current tokens need to accumulate, req_id: {request_id} {content.outputs.token_ids}"
+                                    )
+                            else:
+                                new_contents.append(content)
+                        if len(new_contents):
+                            llm_logger.debug(f"Send response for request id: {request_id}")
+                            self.send_response_server.send_response(request_id, new_contents)
             except Exception as e:
                 llm_logger.error(f"Unexcepted error happend: {e}, {traceback.format_exc()!s}")

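In both branches above, an output is forwarded only when _decode_token returned tokens or the request has finished; otherwise its tokens keep accumulating and a warning is logged. A simplified, hypothetical version of that emit-or-buffer decision, detached from the class:

# Hypothetical, simplified gate mirroring the append / accumulate decision in
# the loop above; it is not the engine's actual code.
def should_emit(decoded_token_ids, finished):
    # Emit when detokenization produced something, or when the request is done
    # and the (possibly empty) tail still has to be flushed to the client.
    return bool(decoded_token_ids) or finished

print(should_emit([101, 102], finished=False))  # True  -> forwarded this step
print(should_emit([], finished=False))          # False -> buffered, warning logged
print(should_emit([], finished=True))           # True  -> forwarded so completion is visible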
@@ -180,6 +180,10 @@ class LLMEngine:
         if self.cfg.scheduler_config.splitwise_role != "mixed":
             self.launched_cache_manager_signal.value[0] = 1

+        if self.cfg.scheduler_config.splitwise_role != "mixed" and envs.FD_ENABLE_INTERNAL_ADAPTER:
+            envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT = envs.FD_ZMQ_RECV_REQUEST_SERVER_PORTS.split(",")[0]
+            envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT = envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORTS.split(",")[0]
+
         if api_server_pid is not None:
             llm_logger.info(f"Start zmq server, api_server_pid: {api_server_pid}")
             self.engine.start_zmq_service(api_server_pid)
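This hunk, together with the ExpertService change further down, treats the new plural FD_ZMQ_*_SERVER_PORTS variables as comma-separated port lists and picks one entry per data-parallel rank (index 0 here, local_data_parallel_id in ExpertService). A hedged sketch of that selection with made-up port values:

import os

# Hypothetical values; in FastDeploy these come from envs.FD_ZMQ_*_SERVER_PORTS.
os.environ.setdefault("FD_ZMQ_RECV_REQUEST_SERVER_PORTS", "8201,8202,8203,8204")
os.environ.setdefault("FD_ZMQ_SEND_RESPONSE_SERVER_PORTS", "8301,8302,8303,8304")

def pick_port(ports_csv: str, dp_rank: int) -> str:
    # Mirrors the .split(",")[dp_rank] selection in the diff; one port per DP rank.
    return ports_csv.split(",")[dp_rank]

recv_port = pick_port(os.environ["FD_ZMQ_RECV_REQUEST_SERVER_PORTS"], dp_rank=0)
send_port = pick_port(os.environ["FD_ZMQ_SEND_RESPONSE_SERVER_PORTS"], dp_rank=0)
print(recv_port, send_port)  # 8201 8301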
@@ -707,18 +711,19 @@ class LLMEngine:
         host_ip = self.cfg.host_ip
         disaggregate = self.cfg.disaggregate_info
         request_queues_for_dp_ipc = None
-        result_queue_for_dp_ipc = None
+        result_queues_for_dp_ipc = None
         if self.cfg.scheduler_config.name == "splitwise":
             self.engine.scheduler.start(role, host_ip, disaggregate)
         elif self.cfg.scheduler_config.name == "dp":
             request_queues_for_dp_ipc = []
-            result_queue_for_dp_ipc = multiprocessing.Queue()
+            result_queues_for_dp_ipc = []
             for i in range(self.cfg.parallel_config.data_parallel_size):
                 request_queues_for_dp_ipc.append(multiprocessing.Queue())
+                result_queues_for_dp_ipc.append(multiprocessing.Queue())
             self.engine.scheduler.start(
                 self.cfg.node_rank * self.cfg.worker_num_per_node % self.cfg.worker_num_per_node,
                 request_queues_for_dp_ipc,
-                result_queue_for_dp_ipc,
+                result_queues_for_dp_ipc,
             )

         if not envs.FD_ENABLE_MULTI_API_SERVER:
@@ -755,7 +760,7 @@ class LLMEngine:
                             i,
                             None,
                             request_queues_for_dp_ipc,
-                            result_queue_for_dp_ipc,
+                            result_queues_for_dp_ipc,
                         ),
                     )
                 )
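The single shared result_queue_for_dp_ipc becomes a list with one multiprocessing.Queue per data-parallel rank, so results from different ranks no longer funnel through one queue. A minimal sketch of the new wiring, with an assumed rank count of 4:

import multiprocessing

# Hypothetical rank count; the diff uses cfg.parallel_config.data_parallel_size.
data_parallel_size = 4

request_queues_for_dp_ipc = []
result_queues_for_dp_ipc = []
for _ in range(data_parallel_size):
    request_queues_for_dp_ipc.append(multiprocessing.Queue())
    # One result queue per DP rank instead of a single shared queue,
    # so ranks do not contend on one consumer-side queue.
    result_queues_for_dp_ipc.append(multiprocessing.Queue())

# Each expert service / scheduler then receives the full list and indexes it
# with its own local_data_parallel_id (see the ExpertService hunks below).
print(len(result_queues_for_dp_ipc))  # 4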
@@ -27,7 +27,6 @@ import numpy as np

 from fastdeploy.engine.common_engine import EngineService
 from fastdeploy.inter_communicator import IPCSignal
-from fastdeploy.splitwise.internal_adapter_utils import InternalAdapter
 from fastdeploy.utils import console_logger, envs, llm_logger

@@ -53,6 +52,13 @@ class ExpertService:
             end_pos = start_pos + self.cfg.parallel_config.tensor_parallel_size
         if cfg.scheduler_config.splitwise_role != "mixed":
             self.cfg.cache_config.rdma_comm_ports = self.cfg.cache_config.rdma_comm_ports[start_pos:end_pos]
+        if envs.FD_ENABLE_INTERNAL_ADAPTER:
+            envs.FD_ZMQ_RECV_REQUEST_SERVER_PORT = envs.FD_ZMQ_RECV_REQUEST_SERVER_PORTS.split(",")[
+                local_data_parallel_id
+            ]
+            envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORT = envs.FD_ZMQ_SEND_RESPONSE_SERVER_PORTS.split(",")[
+                local_data_parallel_id
+            ]
         self.cfg.local_device_ids = self.cfg.parallel_config.device_ids.split(",")[start_pos:end_pos]
         llm_logger.info(f"local_data_parallel_id: {local_data_parallel_id}")
         self.cfg.disaggregate_info = None
@@ -77,7 +83,7 @@ class ExpertService:
         self._finalizer = weakref.finalize(self, self._exit_sub_services)

     def start(
-        self, ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc=None, result_queue_for_dp_ipc=None
+        self, ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc=None, result_queues_for_dp_ipc=None
     ):
         """
         Initializes the engine and starts its sub-services.
@@ -92,18 +98,15 @@ class ExpertService:
         self.engine.create_data_processor()
         if self.cfg.scheduler_config.name == "dp":
             self.cfg.init_cache_info()
-            assert (request_queues_for_dp_ipc is not None) and (result_queue_for_dp_ipc is not None)
-            self.engine.scheduler.start(local_data_parallel_id, request_queues_for_dp_ipc, result_queue_for_dp_ipc)
+            assert (request_queues_for_dp_ipc is not None) and (result_queues_for_dp_ipc is not None)
+            self.engine.scheduler.start(local_data_parallel_id, request_queues_for_dp_ipc, result_queues_for_dp_ipc)

         if ipc_signal_suffix is not None:
             self.api_server_pid = ipc_signal_suffix
             self.engine.start_zmq_service(ipc_signal_suffix)
         else:
             ipc_signal_suffix = self.cfg.parallel_config.engine_worker_queue_port[0]
-            if envs.FD_ENABLE_INTERNAL_ADAPTER:
-                self.internal_adapter = InternalAdapter(
-                    cfg=self.cfg, engine=self.engine, dp_rank=self.cfg.parallel_config.local_data_parallel_id
-                )
+            self.engine.start_zmq_service(self.cfg.parallel_config.engine_worker_queue_port[local_data_parallel_id])

         llm_logger.info(f"start expert service {local_data_parallel_id}")

@@ -189,7 +192,7 @@ class ExpertService:


 def start_data_parallel_service(
-    cfg, local_data_parallel_id, ipc_signal_suffix=None, request_queues_for_dp_ipc=None, result_queue_for_dp_ipc=None
+    cfg, local_data_parallel_id, ipc_signal_suffix=None, request_queues_for_dp_ipc=None, result_queues_for_dp_ipc=None
 ):
     """
     Start expert service
@@ -198,7 +201,7 @@ def start_data_parallel_service(

     try:
         expert_service.start(
-            ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc, result_queue_for_dp_ipc
+            ipc_signal_suffix, local_data_parallel_id, request_queues_for_dp_ipc, result_queues_for_dp_ipc
         )

         def deamon_thread():
@@ -102,6 +102,8 @@ class Request:
         prefill_start_index: int = 0,
         prefill_end_index: int = 0,
         num_computed_tokens: int = 0,
+        # for internal adapter
+        ic_req_data: Optional[dict] = (None,),
     ) -> None:
         self.request_id = request_id
         self.prompt = prompt
@@ -172,6 +174,8 @@ class Request:
         self.extend_block_tables = []
         # dp
         self.dp_rank = dp_rank
         self.llm_engine_recv_req_timestamp = time.time()
+        self.ic_req_data = ic_req_data
+
         self.async_process_futures = []
         self.error_message = None
@@ -226,6 +230,7 @@ class Request:
             video_end=d.get("video_end", 0),
             audio_end=d.get("audio_end", 0),
             dp_rank=d.get("dp_rank", None),
+            ic_req_data=d.get("ic_req_data", None),
             inference_start_time=d.get("inference_start_time"),
             llm_engine_recv_req_timestamp=d.get("llm_engine_recv_req_timestamp"),
         )
@@ -278,6 +283,7 @@ class Request:
             "image_end": self.image_end,
             "video_end": self.video_end,
             "audio_end": self.audio_end,
+            "ic_req_data": self.ic_req_data,
         }
         add_params = [
             "guided_json",
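The new ic_req_data field is opaque internal-adapter metadata that has to survive the dict round trip used when requests cross process and ZMQ boundaries, which is why it is threaded through __init__, from_dict, and to_dict above. A toy stand-in (not the real Request class) showing that round trip:

# Toy stand-in for the round trip; the real class is FastDeploy's Request.
# MiniRequest exists only to illustrate the new field's behaviour.
from dataclasses import dataclass
from typing import Optional

@dataclass
class MiniRequest:
    request_id: str
    ic_req_data: Optional[dict] = None  # opaque internal-adapter metadata

    def to_dict(self) -> dict:
        return {"request_id": self.request_id, "ic_req_data": self.ic_req_data}

    @classmethod
    def from_dict(cls, d: dict) -> "MiniRequest":
        return cls(request_id=d["request_id"], ic_req_data=d.get("ic_req_data", None))

req = MiniRequest("req-0", ic_req_data={"trace_id": "abc"})
assert MiniRequest.from_dict(req.to_dict()).ic_req_data == {"trace_id": "abc"}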
@@ -478,6 +484,9 @@ class RequestOutput:
         num_input_video_tokens: Optional[int] = 0,
         error_code: Optional[int] = 200,
         error_msg: Optional[str] = None,
+        # for internal adapter
+        ic_req_data: Optional[dict] = None,
+        prompt_token_ids_len: Optional[int] = 0,
     ) -> None:
         self.request_id = request_id
         self.prompt = prompt
@@ -493,6 +502,8 @@ class RequestOutput:
         self.num_input_video_tokens = num_input_video_tokens
         self.error_code = error_code
         self.error_msg = error_msg
+        self.ic_req_data = ic_req_data
+        self.prompt_token_ids_len = prompt_token_ids_len

         if prompt_token_ids is None:
             self.prompt_token_ids = []
@@ -565,6 +576,8 @@ class RequestOutput:
             "num_input_video_tokens": self.num_input_video_tokens,
             "error_code": self.error_code,
             "error_msg": self.error_msg,
+            "ic_req_data": self.ic_req_data,
+            "prompt_token_ids_len": self.prompt_token_ids_len,
         }

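With the two extra keys, every serialized RequestOutput that travels back over ZMQ also carries the prompt length and echoes the adapter metadata. A hypothetical serialized payload (all values invented; only the two new keys come from this commit):

# Hypothetical example payload; field values are invented, only the two new
# keys ("ic_req_data", "prompt_token_ids_len") come from this commit.
serialized_output = {
    "request_id": "req-0",
    "error_code": 200,
    "error_msg": None,
    "ic_req_data": {"trace_id": "abc"},   # echoed adapter metadata
    "prompt_token_ids_len": 37,           # prompt length visible to the consumer side
}
print(sorted(serialized_output))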