From fe17410f9c9efbe65f824973418f34b3048e2e63 Mon Sep 17 00:00:00 2001 From: chenjian <1435317881@qq.com> Date: Thu, 31 Jul 2025 20:17:27 +0800 Subject: [PATCH] [BUG] Fix bug for pd in fd (#3034) * Fix bug for pd in fd * Fix bug for pd in fd --------- Co-authored-by: Jiang-Jia-Jun <163579578+Jiang-Jia-Jun@users.noreply.github.com> --- fastdeploy/cache_manager/cache_messager.py | 9 +++++---- .../transfer_factory/rdma_cache_transfer.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fastdeploy/cache_manager/cache_messager.py b/fastdeploy/cache_manager/cache_messager.py index f11c40690..e06d05a67 100644 --- a/fastdeploy/cache_manager/cache_messager.py +++ b/fastdeploy/cache_manager/cache_messager.py @@ -142,6 +142,7 @@ class CacheMessager: self.gpu_id = gpu_id self.cache_info = dict() + self.dp_rank_id = local_data_parallel_id layerwise_send_cache_thread = threading.Thread(target=self._prefill_layerwise_send_cache_thread) layerwise_send_cache_thread.daemon = True @@ -159,14 +160,14 @@ class CacheMessager: prefilled_layer_idx_data = np.zeros(shape=[1], dtype=np.int32) try: step_shm_value = IPCSignal( - name=f"splitwise_complete_prefilled_step_{self.rank}", + name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}", array=prefilled_step_idx_data, dtype=np.int32, suffix=self.gpu_id, create=True, ) layer_shm_value = IPCSignal( - name=f"splitwise_complete_prefilled_layer_{self.rank}", + name=f"splitwise_complete_prefilled_layer_{self.dp_rank_id}", array=prefilled_layer_idx_data, dtype=np.int32, suffix=self.gpu_id, @@ -174,14 +175,14 @@ class CacheMessager: ) except: step_shm_value = IPCSignal( - name=f"splitwise_complete_prefilled_step_{self.rank}", + name=f"splitwise_complete_prefilled_step_{self.dp_rank_id}", array=prefilled_step_idx_data, dtype=np.int32, suffix=self.gpu_id, create=False, ) layer_shm_value = IPCSignal( - name=f"splitwise_complete_prefilled_layer_{self.rank}", + name=f"splitwise_complete_prefilled_layer_{self.dp_rank_id}", array=prefilled_layer_idx_data, dtype=np.int32, suffix=self.gpu_id, diff --git a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py index f90abe798..94abbb3b8 100644 --- a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py +++ b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py @@ -45,7 +45,7 @@ class RDMACommManager: return self.messager = rdma_comm.RDMACommunicator( splitwise_role, - rank, + gpu_id, str(rdma_port) if splitwise_role == "decode" else "0", cache_k_ptr_list, cache_v_ptr_list,