diff --git a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py
index 94abbb3b8..6a0c0ac36 100644
--- a/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py
+++ b/fastdeploy/cache_manager/transfer_factory/rdma_cache_transfer.py
@@ -61,18 +61,12 @@ class RDMACommManager:
         Connect to remote gpu and write cache.
         """
         assert self.splitwise_role == "prefill", "only prefill can call this method"
-        addr = f"{ip}:{port!s}"
-        if addr in self.connected_rdma:
-            return True
         ret = self.messager.is_connected(ip, str(port))
         if ret:
-            self.connected_rdma.add(addr)
             return True
 
         ret = self.messager.connect(ip, str(port))
         logger.info(f"connect to remote rdma address {ip}:{port} status is {ret}")
-        if ret == 0:
-            self.connected_rdma.add(addr)
         return ret == 0
 
     def write_cache(self, ip, port, local_block_ids, remote_block_ids, layer_idx):
diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py
index a9bf68723..fbb978407 100644
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -265,13 +265,13 @@ class TokenProcessor:
                         llm_logger.info(f"finished_task_id: {finished_task_id}")
                         self.prefill_result_status[finished_task_id[0]] = finished_task_id[1]
                     if task_id in self.prefill_result_status:
-                        self.split_connector.send_first_token(task.disaggregate_info, [result])
                         self.resource_manager.stop_flags[index] = True
                         self.resource_manager.tasks_list[index] = None
                         self.resource_manager._recycle_block_tables(task)
                         if self.prefill_result_status[task_id] != "finished":
                             result.error_code = 400
                             result.error_message = f"{task_id} failed to {self.prefill_result_status[task_id]}"
+                        self.split_connector.send_first_token(task.disaggregate_info, [result])
                         del self.resource_manager.req_dict[task_id]
                         break
                     else:
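
For reference, a minimal sketch of how RDMACommManager.connect reads once the first hunk is applied, assembled from the retained context lines above; the enclosing class body, self.messager, and logger are assumed from the original file and are not redefined here. Dropping the local connected_rdma set means every call asks the messager for the live connection state instead of trusting a cached address that could be stale after a remote disconnect.

class RDMACommManager:  # assumed wrapper; only the patched connect() is shown
    def connect(self, ip, port):
        """
        Connect to remote gpu and write cache.
        """
        assert self.splitwise_role == "prefill", "only prefill can call this method"
        # Always query the messager; there is no connected_rdma cache to consult or update.
        ret = self.messager.is_connected(ip, str(port))
        if ret:
            return True

        ret = self.messager.connect(ip, str(port))
        logger.info(f"connect to remote rdma address {ip}:{port} status is {ret}")
        return ret == 0

The second hunk has a similar intent: send_first_token now runs after error_code and error_message have been filled in, so when prefill did not finish the decode side receives the failure status on the first token rather than a result that still looks successful.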