[feat] support prefix cache clearing when /clear_load_weight is called (#4008)

* [feat] support clearing prefix cache (cherry-picked from release/2.1) * [fix] fix ipc suffix, use port instead * [fix] fix prefix caching not enabled * [fix] fix key/value_cache_scales indent * [fix] fix ep group all-reduce * [fix] fix clear/update lock not working when workers > 1 * [chore] add preemption triggered info log * [fix] fix code style * [fix] fix max_num_seqs config * [fix] do not force enable_prefix_caching=False in dynamic loading * [fix] fix ci * Revert "[fix] fix ci" This reverts commit 0bc6d55cc8. * [fix] initialize available_gpu_block_num with max_gpu_block_num * [fix] fix config splitwise_role * [fix] fix clearing caches synchronization and add more logs * [chore] print cache_ready_signal in log * [fix] fix scheduler_config.splitwise_role * [fix] fix cache_messager cache_ready_signal create=True * [fix] stop cache messager from launching in mixed deployment
2025-10-16 21:51:31 +08:00 · 2025-09-28 19:42:53 +08:00
parent 59313ed7f9
commit 6265f4385f
20 changed files with 697 additions and 213 deletions
--- a/custom_ops/gpu_ops/unset_data_ipc.cu
+++ b/custom_ops/gpu_ops/unset_data_ipc.cu
@@ -0,0 +1,71 @@
+// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "helper.h"
+#include "cuda_multiprocess.h"
+
+#if !defined(_WIN32)
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#endif
+
+// Removes the named shared-memory object `name` without needing the address
+// or file descriptor obtained from an earlier open/mmap.
+//
+// Returns 0 on success; a name that does not exist also counts as success.
+// On POSIX any other failure returns the errno value left by shm_unlink().
+static inline int sharedMemoryUnlinkByName(const char* name) {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  // Windows has no shm_unlink() semantics: a named mapping disappears once its
+  // last handle is closed. Best effort only: open the mapping by name and
+  // close that handle again right away.
+  // NOTE(review): this closes only the handle opened here; handles held
+  // elsewhere are presumably unaffected - confirm this is the intent.
+  if (HANDLE mapping = OpenFileMappingA(FILE_MAP_ALL_ACCESS, FALSE, name)) {
+    CloseHandle(mapping);
+  }
+  // Success is reported whether or not the mapping could be opened.
+  return 0;
+#else
+  // POSIX: drop the name so it can no longer be opened. Regions that are
+  // already mapped stay alive until they are munmap()'d.
+  if (shm_unlink(name) == 0) {
+    return 0;
+  }
+  const int unlink_errno = errno;
+  // A name that is already gone is treated as success.
+  return unlink_errno == ENOENT ? 0 : unlink_errno;
+#endif
+}
+
+// Tears down the IPC state associated with `tmp_input` / `shm_name`.
+//
+//   tmp_input  - tensor whose data pointer is passed to
+//                cudaIpcCloseMemHandle() when `close_ipc` is true.
+//   shm_name   - name of the shared-memory object to unlink.
+//   close_ipc  - close the CUDA IPC mapping imported by the consumer. Only set
+//                this when the pointer really came from
+//                cudaIpcOpenMemHandle().
+//   unlink_shm - remove the shared-memory name.
+//
+// Raises via PD_THROW when unlinking fails for any reason other than the name
+// not existing.
+void UnsetDataIpc(const paddle::Tensor& tmp_input,
+                  const std::string& shm_name,
+                  bool close_ipc,
+                  bool unlink_shm) {
+  // 1) Close the IPC mapping imported on the consumer side.
+  if (close_ipc) {
+    void* ptr = const_cast<void*>(tmp_input.data());
+    checkCudaErrors(cudaIpcCloseMemHandle(ptr));
+  }
+
+  // 2) Remove the shared-memory name. This only deals with the name; it does
+  //    not guarantee that existing mappings are released.
+  if (unlink_shm) {
+    const int rc = sharedMemoryUnlinkByName(shm_name.c_str());
+    if (rc != 0) {
+      // PD_THROW streams its arguments one after another instead of applying
+      // printf-style formatting, so the message is passed as separate pieces.
+      PD_THROW("Unlink shared memory failed: name=", shm_name, ", err=", rc);
+    }
+  }
+}
+
+PD_BUILD_STATIC_OP(unset_data_ipc)  // registers the custom op named `unset_data_ipc`
+    .Inputs({"tmp_input"})  // single tensor input; no outputs are declared
+    .Attrs({"shm_name: std::string", "close_ipc: bool", "unlink_shm: bool"})  // forwarded to UnsetDataIpc's non-tensor parameters
+    .SetKernelFn(PD_KERNEL(UnsetDataIpc));  // UnsetDataIpc is the kernel entry point