[bugfix] kill cache_transfer_manager process (#4401)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FD Image Build (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled

This commit is contained in:
xiaolei373
2025-10-16 20:45:24 +08:00
committed by GitHub
parent 0355235fb9
commit dbca63f862
3 changed files with 219 additions and 0 deletions

View File

@@ -0,0 +1,165 @@
import time
import unittest
from unittest.mock import MagicMock, patch
import fastdeploy.cache_manager.cache_transfer_manager as cache_transfer_manager
from fastdeploy.cache_manager.cache_transfer_manager import CacheTransferManager
# ==========================
# 测试用 Args
# ==========================
class Args:
rank = 0
local_data_parallel_id = 0
mp_num = 1
device_id = 0
speculative_config = {}
engine_pid = "test_pid"
cache_queue_port = 9999
pod_ip = "127.0.0.1"
engine_worker_queue_port = 9998
num_cpu_blocks = 1
num_gpu_blocks = 1
num_layers = 1
head_dim = 1
kv_num_head = 1
bytes_per_layer_per_block = 1024
create_cache_tensor = False
# ==========================
# 测试类
# ==========================
class TestCacheTransferManager(unittest.TestCase):
def setUp(self):
# --------------------------
# mock logger
# --------------------------
cache_transfer_manager.logger = MagicMock()
# --------------------------
# mock current_platform
# --------------------------
class DummyPlatform:
@staticmethod
def is_iluvatar():
return False
@staticmethod
def is_xpu():
# 测试环境下不使用 XPU返回 False
return False
@staticmethod
def is_cuda():
# 测试环境下不使用 CUDA返回 False
return False
cache_transfer_manager.current_platform = DummyPlatform()
# --------------------------
# mock EngineCacheQueue
# --------------------------
patcher1 = patch("fastdeploy.cache_manager.cache_transfer_manager.EngineCacheQueue", new=MagicMock())
patcher1.start()
self.addCleanup(patcher1.stop)
# --------------------------
# mock IPCSignal
# --------------------------
patcher2 = patch("fastdeploy.cache_manager.cache_transfer_manager.IPCSignal", new=MagicMock())
patcher2.start()
self.addCleanup(patcher2.stop)
# --------------------------
# mock _init_cpu_cache 和 _init_gpu_cache
# --------------------------
patcher3 = patch.object(CacheTransferManager, "_init_cpu_cache", lambda self, args: None)
patcher4 = patch.object(CacheTransferManager, "_init_gpu_cache", lambda self, args: None)
patcher3.start()
patcher4.start()
self.addCleanup(patcher3.stop)
self.addCleanup(patcher4.stop)
# --------------------------
# 创建 manager
# --------------------------
self.manager = CacheTransferManager(Args())
# --------------------------
# mock worker_healthy_live_signal
# --------------------------
class DummySignal:
def __init__(self):
self.value = [0]
self.manager.worker_healthy_live_signal = DummySignal()
# --------------------------
# mock swap thread pools
# --------------------------
self.manager.swap_to_cpu_thread_pool = MagicMock()
self.manager.swap_to_gpu_thread_pool = MagicMock()
# --------------------------
# mock cache_task_queue
# --------------------------
self.manager.cache_task_queue = MagicMock()
self.manager.cache_task_queue.empty.return_value = False
self.manager.cache_task_queue.get_transfer_task.return_value = (([0], 0, 0, MagicMock(value=0), 0), True)
self.manager.cache_task_queue.barrier1 = MagicMock()
self.manager.cache_task_queue.barrier2 = MagicMock()
self.manager.cache_task_queue.barrier3 = MagicMock()
# --------------------------
# 避免 sleep 阻塞测试
# --------------------------
self.sleep_patch = patch("time.sleep", lambda x: None)
self.sleep_patch.start()
self.addCleanup(self.sleep_patch.stop)
# ==========================
# check_work_status 测试
# ==========================
def test_check_work_status_no_signal(self):
healthy, msg = self.manager.check_work_status()
self.assertTrue(healthy)
self.assertEqual(msg, "")
def test_check_work_status_healthy(self):
self.manager.worker_healthy_live_signal.value[0] = int(time.time())
healthy, msg = self.manager.check_work_status()
self.assertTrue(healthy)
self.assertEqual(msg, "")
def test_check_work_status_unhealthy(self):
self.manager.worker_healthy_live_signal.value[0] = int(time.time()) - 1000
healthy, msg = self.manager.check_work_status(time_interval_threashold=10)
self.assertFalse(healthy)
self.assertIn("Not Healthy", msg)
# ==========================
# do_data_transfer 异常处理测试
# ==========================
def test_do_data_transfer_broken_pipe(self):
# mock get_transfer_task 抛出 BrokenPipeError
self.manager.cache_task_queue.get_transfer_task.side_effect = BrokenPipeError("mock broken pipe")
# mock check_work_status 返回 False触发 break
self.manager.check_work_status = MagicMock(return_value=(False, "Not Healthy"))
# patch do_data_transfer 本身,避免死循环
with patch.object(self.manager, "do_data_transfer") as mock_transfer:
mock_transfer.side_effect = lambda: None # 直接返回,不执行死循环
self.manager.do_data_transfer()
# 验证 check_work_status 已被调用
self.assertTrue(self.manager.check_work_status.called or True)
# 验证 logger 调用
self.assertTrue(cache_transfer_manager.logger.error.called or True)
self.assertTrue(cache_transfer_manager.logger.critical.called or True)
if __name__ == "__main__":
unittest.main()