mirror of https://github.com/PaddlePaddle/FastDeploy.git (synced 2025-10-05 08:37:06 +08:00)
polish code with new pre-commit rule (#2923)
@@ -14,4 +14,4 @@

 from .custom_all_reduce import CustomAllreduce

-__all__ = ["CustomAllreduce"]
+__all__ = ["CustomAllreduce"]
@@ -41,7 +41,7 @@ def find_loaded_library(lib_name) -> Optional[str]:
     the file `/proc/self/maps` contains the memory maps of the process, which includes the
     shared libraries loaded by the process. We can use this file to find the path of
     a loaded library.
-    """ # noqa
+    """
     found = False
     with open("/proc/self/maps") as f:
         for line in f:
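
For context, the lookup this docstring describes can be sketched in a few self-contained lines. This is a minimal stand-alone version of the technique (assuming Linux and a library name such as "libcudart"), not the exact FastDeploy implementation:

from typing import Optional


def find_loaded_library(lib_name: str) -> Optional[str]:
    # Each line of /proc/self/maps ends with the path of the mapped file,
    # so scanning for the library name yields the loaded .so's full path.
    with open("/proc/self/maps") as f:
        for line in f:
            if lib_name in line:
                return line.rstrip().split()[-1]  # last column is the path
    return None


# Hypothetical usage: locate the CUDA runtime if it is already loaded.
path = find_loaded_library("libcudart")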
@@ -73,18 +73,40 @@ class CudaRTLibrary:
         # const char* cudaGetErrorString ( cudaError_t error )
         Function("cudaGetErrorString", ctypes.c_char_p, [cudaError_t]),
         # cudaError_t cudaMalloc ( void** devPtr, size_t size )
-        Function("cudaMalloc", cudaError_t, [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]),
+        Function(
+            "cudaMalloc",
+            cudaError_t,
+            [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t],
+        ),
         # cudaError_t cudaFree ( void* devPtr )
         Function("cudaFree", cudaError_t, [ctypes.c_void_p]),
         # cudaError_t cudaMemset ( void* devPtr, int value, size_t count )
-        Function("cudaMemset", cudaError_t, [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t]),
-        # cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind ) # noqa
-        Function("cudaMemcpy", cudaError_t, [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind]),
-        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr ) # noqa
-        Function("cudaIpcGetMemHandle", cudaError_t, [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p]),
-        # cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags ) # noqa
-        Function(
-            "cudaIpcOpenMemHandle", cudaError_t, [ctypes.POINTER(ctypes.c_void_p), cudaIpcMemHandle_t, ctypes.c_uint]
-        ),
+        Function(
+            "cudaMemset",
+            cudaError_t,
+            [ctypes.c_void_p, ctypes.c_int, ctypes.c_size_t],
+        ),
+        # cudaError_t cudaMemcpy ( void* dst, const void* src, size_t count, cudaMemcpyKind kind )
+        Function(
+            "cudaMemcpy",
+            cudaError_t,
+            [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, cudaMemcpyKind],
+        ),
+        # cudaError_t cudaIpcGetMemHandle ( cudaIpcMemHandle_t* handle, void* devPtr )
+        Function(
+            "cudaIpcGetMemHandle",
+            cudaError_t,
+            [ctypes.POINTER(cudaIpcMemHandle_t), ctypes.c_void_p],
+        ),
+        # cudaError_t cudaIpcOpenMemHandle ( void** devPtr, cudaIpcMemHandle_t handle, unsigned int flags )
+        Function(
+            "cudaIpcOpenMemHandle",
+            cudaError_t,
+            [
+                ctypes.POINTER(ctypes.c_void_p),
+                cudaIpcMemHandle_t,
+                ctypes.c_uint,
+            ],
+        ),
     ]
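
Each Function entry above pairs a CUDA runtime symbol with its ctypes return type and argument types. As a rough sketch of how such a prototype table is typically consumed (the direct CDLL binding and error handling here are simplifying assumptions, not the exact CudaRTLibrary code):

import ctypes

cudaError_t = ctypes.c_int

# Load the CUDA runtime and bind one prototype from the table above.
# Assumes libcudart.so is on the dynamic loader's search path.
lib = ctypes.CDLL("libcudart.so")
lib.cudaMalloc.restype = cudaError_t
lib.cudaMalloc.argtypes = [ctypes.POINTER(ctypes.c_void_p), ctypes.c_size_t]
lib.cudaFree.restype = cudaError_t
lib.cudaFree.argtypes = [ctypes.c_void_p]

dev_ptr = ctypes.c_void_p()
err = lib.cudaMalloc(ctypes.byref(dev_ptr), 1024)  # allocate 1 KiB on the GPU
assert err == 0, f"cudaMalloc failed with error {err}"
lib.cudaFree(dev_ptr)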
@@ -13,26 +13,26 @@
 # limitations under the License.

-from contextlib import contextmanager
 import atexit
 import ctypes
+from contextlib import contextmanager
 from typing import List, Optional

 import paddle
 import paddle.distributed as dist
 from paddle.distributed.communication.group import Group

+from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
 from fastdeploy.model_executor.ops.gpu import (
     all_reduce,
     dispose,
+    get_graph_buffer_ipc_meta,
     init_custom_all_reduce,
     meta_size,
     register_buffer,
-    get_graph_buffer_ipc_meta,
     register_graph_buffers,
 )

-from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
-
 try:
     meta_size()
     custom_ar = True
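
The try block at the end of this hunk is an availability probe: calling meta_size() (imported above from the compiled GPU ops) raises if the custom all-reduce kernels were not built, and the module falls back. The hunk is truncated before the except clause, so the fallback branch in this sketch is an assumption:

try:
    meta_size()  # raises if the custom all-reduce ops are unavailable
    custom_ar = True
except Exception:
    custom_ar = False  # assumed fallback; not shown in the hunk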
@@ -47,7 +47,7 @@ class CustomAllreduce:
     _SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]

     # max_size: max supported allreduce size
-    def __init__(self, group: Group, max_size: int=8192 * 1024) -> None:
+    def __init__(self, group: Group, max_size: int = 8192 * 1024) -> None:
         """
         Args:
             device: the device to bind the CustomAllreduce to. If None,
@@ -147,7 +147,12 @@ class CustomAllreduce:
             return inp_size < self.max_size
         return False

-    def all_reduce(self, inp: paddle.Tensor, out: paddle.Tensor = None, registered: bool = False):
+    def all_reduce(
+        self,
+        inp: paddle.Tensor,
+        out: paddle.Tensor = None,
+        registered: bool = False,
+    ):
        """Performs an out-of-place all reduce.

        If registered is True, this assumes inp's pointer is already
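
A hypothetical call site for the reformatted signature, assuming a process group has already been set up; the group construction, shape, and dtype here are illustrative only:

import paddle
import paddle.distributed as dist

# Illustrative usage of CustomAllreduce.all_reduce; the surrounding size
# and world-size checks done in FastDeploy are omitted here.
dist.init_parallel_env()
group = dist.new_group(list(range(dist.get_world_size())))
comm = CustomAllreduce(group, max_size=8192 * 1024)

x = paddle.ones([1024], dtype="float16")
y = comm.all_reduce(x, registered=False)  # out of place: x is not modified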
@@ -165,7 +170,7 @@ class CustomAllreduce:
     @contextmanager
     def capture(self):
         """
-        The main responsibility of this context manager is the
+        The main responsibility of this context manager is the
         `register_graph_buffers` call at the end of the context.
         It records all the buffer addresses used in the CUDA graph.
         """
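
The docstring describes the usual CUDA-graph capture pattern: flag the capture phase so allreduce calls go through the graph-safe path, then register the recorded buffers once on exit. A minimal sketch of that shape (the _capturing flag name is a hypothetical stand-in; the hunk does not show the body):

from contextlib import contextmanager


@contextmanager
def capture(self):
    # Sketch only: while the flag is set, all_reduce calls take the
    # graph-safe path; on exit, the buffers used during capture are
    # registered across ranks via register_graph_buffers().
    self._capturing = True
    try:
        yield
    finally:
        self._capturing = False
        self.register_graph_buffers()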
@@ -179,22 +184,18 @@ class CustomAllreduce:

     def register_graph_buffers(self):
         handle, offset = get_graph_buffer_ipc_meta(self._ptr)
-        all_data = [[None, None]
-                    for _ in range(dist.get_world_size(group=self.group))]
+        all_data = [[None, None] for _ in range(dist.get_world_size(group=self.group))]
         all_data[self.rank] = [handle, offset]

         ranks = sorted(dist.get_process_group_ranks(group=self.group))
         for i, rank in enumerate(ranks):
-            dist.broadcast_object_list(all_data[i],
-                                       src=rank,
-                                       group=self.group,
-                                       device="cpu")
+            dist.broadcast_object_list(all_data[i], src=rank, group=self.group, device="cpu")

         # Unpack list of tuples to tuple of lists.
         handles = [d[0] for d in all_data]  # type: ignore
         offsets = [d[1] for d in all_data]  # type: ignore
         register_graph_buffers(self._ptr, handles, offsets)

     def custom_all_reduce(self, input: paddle.Tensor) -> Optional[paddle.Tensor]:
         """The main allreduce API that provides support for cuda graph."""
         # When custom allreduce is disabled, this will be None.
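
The loop in register_graph_buffers is an all-gather built from broadcasts: every rank pre-fills its own slot of all_data, then each rank in turn broadcasts its slot so that all processes end up with every (handle, offset) pair. A stripped-down sketch of just that exchange, with placeholder payloads instead of real IPC handles and global ranks instead of group ranks:

import paddle.distributed as dist

# Each rank contributes one entry; after the loop, every rank holds all entries.
dist.init_parallel_env()
rank = dist.get_rank()
world_size = dist.get_world_size()

all_data = [[None] for _ in range(world_size)]
all_data[rank] = [f"payload-from-rank-{rank}"]  # placeholder, not an IPC handle

for src in range(world_size):
    # broadcast_object_list fills all_data[src] in place on every other rank.
    dist.broadcast_object_list(all_data[src], src=src)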