[Feature] support custom all-reduce (#2758)

* [Feature] support custom all-reduce * add vllm adapted
2025-10-06 00:57:33 +08:00 · 2025-07-09 16:00:27 +08:00
parent be21ef5047
commit b89180f1cd
16 changed files with 1194 additions and 2 deletions
--- a/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py
+++ b/fastdeploy/distributed/custom_all_reduce/custom_all_reduce.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from contextlib import contextmanager
+import atexit
+import ctypes
+from typing import List, Optional
+
+import paddle
+import paddle.distributed as dist
+from paddle.distributed.communication.group import Group
+from fastdeploy.model_executor.ops.gpu import (
+    all_reduce,
+    dispose,
+    init_custom_all_reduce,
+    meta_size,
+    register_buffer,
+    get_graph_buffer_ipc_meta,
+    register_graph_buffers,
+)
+
+from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
+
+try:
+    meta_size()
+    custom_ar = True
+except Exception:
+    custom_ar = False
+
+_instances = []
+
+
+class CustomAllreduce:
+
+    _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
+
+    # max_size: max supported allreduce size
+    def __init__(self, group: Group, max_size: int=8192 * 1024) -> None:
+        """
+        Args:
+            device: the device to bind the CustomAllreduce to. If None,
+                it will be bind to f"cuda:{local_rank}".
+        It is the caller's responsibility to make sure each communicator
+        is bind to a unique device, and all communicators in this group
+        are in the same node.
+        """
+        self._IS_CAPTURING = False
+        self.disabled = True
+        self.group = group
+
+        if not custom_ar:
+            # disable because of missing custom allreduce library
+            # e.g. in a non-cuda environment
+            return
+
+        rank = dist.get_rank(group=self.group)
+        self.rank = rank
+        world_size = dist.get_world_size(group=self.group)
+        if world_size == 1:
+            # No need to initialize custom allreduce for single GPU case.
+            return
+
+        if world_size not in CustomAllreduce._SUPPORTED_WORLD_SIZES:
+            return
+
+        if world_size < 2:
+            return
+
+        self.disabled = False
+
+        # Buffers memory are owned by this Python class and passed to C++.
+        # Meta data composes of two parts: meta data for synchronization and a
+        # temporary buffer for storing intermediate allreduce results.
+        self.meta_ptrs = self.create_shared_buffer(group, meta_size() + max_size)
+
+        # This is a pre-registered IPC buffer. In eager mode, input tensors
+        # are first copied into this buffer before allreduce is performed
+        self.buffer_ptrs = self.create_shared_buffer(group, max_size)
+
+        # This is a buffer for storing the tuples of pointers pointing to
+        # IPC buffers from all ranks. Each registered tuple has size of
+        # 8*world_size bytes where world_size is at most 8. Allocating 8MB
+        # is enough for 131072 such tuples. The largest model I've seen only
+        # needs less than 10000 of registered tuples.
+        self.rank_data = paddle.empty([8 * 1024 * 1024], dtype=paddle.uint8)
+        self.max_size = max_size
+        self.rank = rank
+        self.world_size = world_size
+        self.full_nvlink = True
+        self._ptr = init_custom_all_reduce(self.meta_ptrs, self.rank_data, rank, self.full_nvlink)
+        register_buffer(self._ptr, self.buffer_ptrs)
+        print("zss init custom allreduce", self._ptr)
+        _instances.append(self)
+
+    @staticmethod
+    def create_shared_buffer(group: Group, size_in_bytes: int) -> List[int]:
+        """
+        Creates a shared buffer and returns a list of pointers
+        representing the buffer on all processes in the group.
+        """
+        lib = cuda_wrapper.CudaRTLibrary()
+        pointer = lib.cudaMalloc(size_in_bytes)
+        # lib.cudaMemset(pointer, 2, size_in_bytes)
+        handle = lib.cudaIpcGetMemHandle(pointer)
+        rank = dist.get_rank(group=group)
+        handles = []
+        dist.all_gather_object(handles, handle, group=group)
+
+        pointers: List[int] = []
+        for i, h in enumerate(handles):
+            if i == rank:
+                pointers.append(pointer.value)  # type: ignore
+            else:
+                pointers.append(lib.cudaIpcOpenMemHandle(h).value)  # type: ignore
+
+        return pointers
+
+    @staticmethod
+    def free_shared_buffer(group: Group, pointers: List[int], rank: Optional[int] = None) -> None:
+        if rank is None:
+            rank = dist.get_rank(group=group)
+        lib = cuda_wrapper.CudaRTLibrary()
+        lib.cudaFree(ctypes.c_void_p(pointers[rank]))
+
+    def should_custom_ar(self, inp: paddle.Tensor):
+        if self.disabled:
+            return False
+        inp_size = inp.numel() * inp.element_size()
+        # custom allreduce requires input byte size to be multiples of 16
+        if inp_size % 16 != 0:
+            return False
+        # for 4 or more non NVLink-capable GPUs, custom allreduce provides
+        # little performance improvement over NCCL.
+        if self.world_size == 2 or self.full_nvlink:
+            return inp_size < self.max_size
+        return False
+
+    def all_reduce(self, inp: paddle.Tensor, out: paddle.Tensor = None, registered: bool = False):
+        """Performs an out-of-place all reduce.
+
+        If registered is True, this assumes inp's pointer is already
+        IPC-registered. Otherwise, inp is first copied into a pre-registered
+        buffer.
+        """
+        if out is None:
+            out = paddle.empty_like(inp)
+        if registered:
+            all_reduce(self._ptr, inp, out, 0, 0)
+        else:
+            all_reduce(self._ptr, inp, out, self.buffer_ptrs[self.rank], self.max_size)
+        return out
+
+    @contextmanager
+    def capture(self):
+        """
+        The main responsibility of this context manager is the 
+        `register_graph_buffers` call at the end of the context.
+        It records all the buffer addresses used in the CUDA graph.
+        """
+        try:
+            self._IS_CAPTURING = True
+            yield
+        finally:
+            self._IS_CAPTURING = False
+            if not self.disabled:
+                self.register_graph_buffers()
+
+    def register_graph_buffers(self):
+        handle, offset = get_graph_buffer_ipc_meta(self._ptr)
+        all_data = [[None, None]
+                    for _ in range(dist.get_world_size(group=self.group))]
+        all_data[self.rank] = [handle, offset]
+
+        ranks = sorted(dist.get_process_group_ranks(group=self.group))
+        for i, rank in enumerate(ranks):
+            dist.broadcast_object_list(all_data[i],
+                                       src=rank,
+                                       group=self.group,
+                                       device="cpu")
+
+        # Unpack list of tuples to tuple of lists.
+        handles = [d[0] for d in all_data]  # type: ignore
+        offsets = [d[1] for d in all_data]  # type: ignore
+        register_graph_buffers(self._ptr, handles, offsets)
+    
+    def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
+        """The main allreduce API that provides support for cuda graph."""
+        # When custom allreduce is disabled, this will be None.
+        if self.disabled or not self.should_custom_ar(input):
+            return None
+        if self._IS_CAPTURING:
+            if paddle.cuda.is_current_stream_capturing():
+                return self.all_reduce(input, registered=True)
+            else:
+                # If warm up, mimic the allocation pattern since custom
+                # allreduce is out-of-place.
+                return paddle.empty_like(input)
+        else:
+            return self.all_reduce(input, registered=False)
+
+    def close(self):
+        if not self.disabled and self._ptr:
+            dispose(self._ptr)
+            self._ptr = 0
+            self.free_shared_buffer(self.group, self.meta_ptrs, rank=self.rank)
+            self.free_shared_buffer(self.group, self.buffer_ptrs, rank=self.rank)
+
+
+def _cleanup_instances():
+    for instance in _instances:
+        instance.close()
+
+
+atexit.register(_cleanup_instances)