mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-06 00:57:33 +08:00
[Bug fix] Fix zmq core bug (#3357)
* [Bug fix] Fix zmq core bug due to concurrently used by threads * Fix zmq core bug due to concurrently used by threads
This commit is contained in:
@@ -223,17 +223,15 @@ class ZmqTcpServer(ZmqServerBase):
|
|||||||
Recieve control command from client
|
Recieve control command from client
|
||||||
"""
|
"""
|
||||||
self._ensure_socket()
|
self._ensure_socket()
|
||||||
while self.running:
|
try:
|
||||||
try:
|
client, _, task_data = self.socket.recv_multipart(flags=zmq.NOBLOCK)
|
||||||
client, _, task_data = self.socket.recv_multipart(flags=zmq.NOBLOCK)
|
task = msgpack.unpackb(task_data)
|
||||||
task = msgpack.unpackb(task_data)
|
task_id_str = task["task_id"]
|
||||||
task_id_str = task["task_id"]
|
except zmq.Again:
|
||||||
except zmq.Again:
|
return None
|
||||||
time.sleep(0.001)
|
with self.mutex:
|
||||||
continue
|
self.req_dict[task_id_str] = client
|
||||||
with self.mutex:
|
return task
|
||||||
self.req_dict[task_id_str] = client
|
|
||||||
return task
|
|
||||||
|
|
||||||
def response_for_control_cmd(self, task_id, result):
|
def response_for_control_cmd(self, task_id, result):
|
||||||
"""
|
"""
|
||||||
@@ -251,7 +249,7 @@ class ZmqTcpServer(ZmqServerBase):
|
|||||||
|
|
||||||
with self.mutex:
|
with self.mutex:
|
||||||
self.req_dict.pop(task_id, None)
|
self.req_dict.pop(task_id, None)
|
||||||
llm_logger.info(f"response control cmd finished, task_id: {task_id}")
|
llm_logger.debug(f"response control cmd finished, task_id: {task_id}")
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
"""
|
"""
|
||||||
|
@@ -525,7 +525,8 @@ class TokenProcessor:
|
|||||||
for token_id in token_ids:
|
for token_id in token_ids:
|
||||||
self.tokens_counter[task_id] += 1
|
self.tokens_counter[task_id] += 1
|
||||||
if token_id != RECOVERY_STOP_SIGNAL:
|
if token_id != RECOVERY_STOP_SIGNAL:
|
||||||
result.outputs.token_ids.append(token_id)
|
if not (envs.FD_ENABLE_INTERNAL_ADAPTER and token_id in task.eos_token_ids):
|
||||||
|
result.outputs.token_ids.append(token_id)
|
||||||
task.output_token_ids.append(token_id)
|
task.output_token_ids.append(token_id)
|
||||||
if token_id in task.eos_token_ids or is_prefill or recovery_stop:
|
if token_id in task.eos_token_ids or is_prefill or recovery_stop:
|
||||||
result.finished = True
|
result.finished = True
|
||||||
|
@@ -34,6 +34,7 @@ class InternalAdapter:
|
|||||||
self.engine = engine
|
self.engine = engine
|
||||||
self.dp_rank = dp_rank
|
self.dp_rank = dp_rank
|
||||||
recv_control_cmd_ports = envs.FD_ZMQ_CONTROL_CMD_SERVER_PORTS.split(",")
|
recv_control_cmd_ports = envs.FD_ZMQ_CONTROL_CMD_SERVER_PORTS.split(",")
|
||||||
|
self.response_lock = threading.Lock() # prevent to call send_multipart in zmq concurrently
|
||||||
self.recv_control_cmd_server = ZmqTcpServer(port=recv_control_cmd_ports[dp_rank], mode=zmq.ROUTER)
|
self.recv_control_cmd_server = ZmqTcpServer(port=recv_control_cmd_ports[dp_rank], mode=zmq.ROUTER)
|
||||||
self.recv_external_instruct_thread = threading.Thread(
|
self.recv_external_instruct_thread = threading.Thread(
|
||||||
target=self._recv_external_module_control_instruct, daemon=True
|
target=self._recv_external_module_control_instruct, daemon=True
|
||||||
@@ -43,7 +44,6 @@ class InternalAdapter:
|
|||||||
target=self._response_external_module_control_instruct, daemon=True
|
target=self._response_external_module_control_instruct, daemon=True
|
||||||
)
|
)
|
||||||
self.response_external_instruct_thread.start()
|
self.response_external_instruct_thread.start()
|
||||||
self.response_lock = threading.Lock() # prevent to call send_multipart in zmq concurrently
|
|
||||||
|
|
||||||
def _get_current_server_info(self):
|
def _get_current_server_info(self):
|
||||||
"""
|
"""
|
||||||
@@ -71,13 +71,17 @@ class InternalAdapter:
|
|||||||
"""
|
"""
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
task = self.recv_control_cmd_server.recv_control_cmd()
|
with self.response_lock:
|
||||||
|
task = self.recv_control_cmd_server.recv_control_cmd()
|
||||||
|
if task is None:
|
||||||
|
time.sleep(0.001)
|
||||||
|
continue
|
||||||
logger.info(f"Recieve control task: {task}")
|
logger.info(f"Recieve control task: {task}")
|
||||||
task_id_str = task["task_id"]
|
task_id_str = task["task_id"]
|
||||||
if task["cmd"] == "get_payload":
|
if task["cmd"] == "get_payload":
|
||||||
payload_info = self._get_current_server_info()
|
payload_info = self._get_current_server_info()
|
||||||
result = {"task_id": task_id_str, "result": payload_info}
|
result = {"task_id": task_id_str, "result": payload_info}
|
||||||
logger.info(f"Response for task: {task_id_str}")
|
logger.debug(f"Response for task: {task_id_str}")
|
||||||
with self.response_lock:
|
with self.response_lock:
|
||||||
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
|
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
|
||||||
|
|
||||||
@@ -87,7 +91,7 @@ class InternalAdapter:
|
|||||||
extra_register_func=lambda reg: main_process_metrics.register_all(reg, workers=1),
|
extra_register_func=lambda reg: main_process_metrics.register_all(reg, workers=1),
|
||||||
)
|
)
|
||||||
result = {"task_id": task_id_str, "result": metrics_text}
|
result = {"task_id": task_id_str, "result": metrics_text}
|
||||||
logger.info(f"Response for task: {task_id_str}")
|
logger.debug(f"Response for task: {task_id_str}")
|
||||||
with self.response_lock:
|
with self.response_lock:
|
||||||
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
|
self.recv_control_cmd_server.response_for_control_cmd(task_id_str, result)
|
||||||
elif task["cmd"] == "connect_rdma":
|
elif task["cmd"] == "connect_rdma":
|
||||||
|
Reference in New Issue
Block a user