[LLM] support multi node deploy (#2708)

* [LLM] support multi node deploy
* Update engine.py
* fix bugs
* fix
* [LLM] support multi node deploy
* [LLM] support multi node deploy

---------

Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com>
2025-10-06 09:07:10 +08:00 · 2025-07-06 10:33:51 +08:00
parent 04a8e1ef2b
commit 68b4755587
13 changed files with 157 additions and 87 deletions
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -109,7 +109,7 @@ class PrefixCacheManager:


    def launch_cache_manager(self, cache_config, tensor_parallel_size, \
-                    device_ids, engine_worker_queue_port, pid_suffix):
+                    device_ids, pod_ip, engine_worker_queue_port, pid_suffix):
        """
        launch_cache_manager function used to initialize the cache manager.
        """
@@ -123,7 +123,7 @@ class PrefixCacheManager:
            create=True)

        self.cache_task_queue = EngineCacheQueue(
-            address=('127.0.0.1', cache_config.cache_queue_port),
+            address=(pod_ip, cache_config.cache_queue_port),
            authkey=b'cache_queue_service',
            is_server=False,
            num_client=tensor_parallel_size,
@@ -166,6 +166,7 @@ class PrefixCacheManager:
                f" --cache_dtype {cache_config.cache_dtype}" +
                f" --cache_queue_port {cache_config.cache_queue_port}" +
                f" --enable_splitwise {int(self.enable_splitwise)}" +
+                f" --pod_ip {pod_ip}" +
                f" --engine_worker_queue_port {engine_worker_queue_port}" +
                f" --num_gpu_blocks {cache_config.total_block_num}" +
                f" --num_cpu_blocks {cache_config.num_cpu_blocks}" +