[NewFeature]custom_allreduce support cudagraph recapture (#4305)
Some checks failed
CE Compile Job / ce_job_pre_check (push) Has been cancelled
CE Compile Job / print_ce_job_pre_check_outputs (push) Has been cancelled
CE Compile Job / FD-Clone-Linux (push) Has been cancelled
CE Compile Job / Show Code Archive Output (push) Has been cancelled
CE Compile Job / BUILD_SM8090 (push) Has been cancelled
CE Compile Job / BUILD_SM8689 (push) Has been cancelled
CE Compile Job / CE_UPLOAD (push) Has been cancelled
Deploy GitHub Pages / deploy (push) Has been cancelled
Publish Job / publish_pre_check (push) Has been cancelled
Publish Job / print_publish_pre_check_outputs (push) Has been cancelled
Publish Job / FD-Clone-Linux (push) Has been cancelled
Publish Job / Show Code Archive Output (push) Has been cancelled
Publish Job / BUILD_SM8090 (push) Has been cancelled
Publish Job / BUILD_SM8689 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8090 (push) Has been cancelled
Publish Job / PADDLE_PYPI_UPLOAD_8689 (push) Has been cancelled
Publish Job / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
Publish Job / Run FastDeploy LogProb Tests (push) Has been cancelled
Publish Job / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
Publish Job / Run Base Tests (push) Has been cancelled
Publish Job / Run Accuracy Tests (push) Has been cancelled
Publish Job / Run Stable Tests (push) Has been cancelled
CI Images Build / FD-Clone-Linux (push) Has been cancelled
CI Images Build / Show Code Archive Output (push) Has been cancelled
CI Images Build / CI Images Build (push) Has been cancelled
CI Images Build / BUILD_SM8090 (push) Has been cancelled
CI Images Build / Run FastDeploy Unit Tests and Coverage (push) Has been cancelled
CI Images Build / Run FastDeploy LogProb Tests (push) Has been cancelled
CI Images Build / Extracted partial CE model tasks to run in CI. (push) Has been cancelled
CI Images Build / Run Base Tests (push) Has been cancelled
CI Images Build / Run Accuracy Tests (push) Has been cancelled
CI Images Build / Run Stable Tests (push) Has been cancelled
CI Images Build / Publish Docker Images Pre Check (push) Has been cancelled
* custom_allreduce: support CUDA graph recapture
* add shutdown/restart of the default process group
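What the change enables, as far as the diff shows: the buffers that custom all-reduce registers for CUDA graph replay are shared across ranks through CUDA IPC handles, and those handles must be closed before a new capture can re-open them. A minimal sketch of the recapture cycle under that assumption; clear_graph() and custom_ar_clear_ipc_handles() are names added by this PR, while rebuild_cuda_graphs and capture_fn are hypothetical stand-ins:

    # Sketch only: clear_graph() and custom_ar_clear_ipc_handles() come from
    # this PR; rebuild_cuda_graphs and capture_fn are hypothetical.
    def rebuild_cuda_graphs(backend, capture_fn):
        # clear_graph() now calls custom_ar_clear_ipc_handles() before
        # deleting the captured graphs, so the cudaIpcMemHandle_t entries
        # opened during the previous capture are closed first.
        backend.clear_graph()
        # A fresh capture can then re-open peer buffers and re-register
        # them through get_graph_buffer_ipc_meta / register_graph_buffers.
        capture_fn()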
@@ -623,6 +623,8 @@ int64_t open_mem_handle(paddle::Tensor& mem_handle);
 
 void free_shared_buffer(int64_t buffer);
 
+void clear_ipc_handles(int64_t _fa);
+
 // speculative decoding Kernel
 std::vector<paddle::Tensor> SpeculateGetPaddingOffset(
     const paddle::Tensor& input_ids,
@@ -1229,6 +1231,8 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
 
   m.def("free_shared_buffer", &free_shared_buffer, "free_shared_buffer");
 
+  m.def("clear_ipc_handles", &clear_ipc_handles, "clear_ipc_handles");
+
   m.def("open_mem_handle", &open_mem_handle, "open_mem_handle");
 
   m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta, "get_graph_buffer_ipc_meta");
@@ -122,10 +122,14 @@ void register_graph_buffers(fptr_t _fa,
   for (int i = 0; i < handles.size(); i++) {
     bytes.emplace_back(handles[i].begin(), handles[i].end());
   }
-  bytes.reserve(handles.size());
   fa->register_graph_buffers(bytes, offsets);
 }
 
+void clear_ipc_handles(fptr_t _fa) {
+  auto fa = reinterpret_cast<paddle::CustomAllreduce*>(_fa);
+  fa->clear_ipc_handles();
+}
+
 std::tuple<fptr_t, paddle::Tensor> allocate_shared_buffer_and_handle(
     int64_t size) {
@@ -517,10 +517,15 @@ class CustomAllreduce {
 #undef KL
   }
 
-  ~CustomAllreduce() {
+  void clear_ipc_handles(){
     for (auto [_, ptr] : ipc_handles_) {
       CUDACHECK(cudaIpcCloseMemHandle(ptr));
     }
+    ipc_handles_.clear();
+  }
+
+  ~CustomAllreduce() {
+    clear_ipc_handles();
   }
 };
 } // namespace paddle
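The C++ hunk above is a lifetime refactor: the handle cleanup that used to live only in the destructor is factored out into an idempotent clear_ipc_handles() method, and the destructor now delegates to it, so handles can also be released mid-lifetime before a recapture. A Python analogue of the same pattern, for illustration only (the IpcHandleOwner class is hypothetical, not FastDeploy code):

    # Illustrative analogue of the refactor: cleanup moves out of the
    # finalizer into an idempotent method.
    class IpcHandleOwner:
        def __init__(self, close_fn, handles):
            self._close = close_fn       # plays the role of cudaIpcCloseMemHandle
            self._handles = dict(handles)

        def clear_ipc_handles(self):
            for _, ptr in self._handles.items():
                self._close(ptr)
            self._handles.clear()        # idempotent: a second call closes nothing

        def __del__(self):
            self.clear_ipc_handles()     # destructor just delegates, as in the PR

Factoring the cleanup this way is what lets the Python side expose clear_ipc_handles as a first-class op while leaving destruction behavior unchanged.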
@@ -42,6 +42,12 @@ def use_custom_allreduce(custom_all_reduce_max_bytes: int = 8192 * 1024):
         _TP_AR = CustomAllreduce(model_parallel_group, custom_all_reduce_max_bytes)
 
 
+def custom_ar_clear_ipc_handles():
+    global _TP_AR
+    if _TP_AR is not None:
+        _TP_AR.clear_ipc_handles()
+
+
 try:
 
     @paddle.jit.marker.unified
@@ -25,6 +25,7 @@ from paddle.distributed.communication.group import Group
 from fastdeploy.distributed.custom_all_reduce import cuda_wrapper
 from fastdeploy.model_executor.ops.gpu import (
     all_reduce,
+    clear_ipc_handles,
     dispose,
     get_graph_buffer_ipc_meta,
     init_custom_all_reduce,
@@ -220,6 +221,9 @@ class CustomAllreduce:
         else:
             return self.all_reduce(input, input, registered=False)
 
+    def clear_ipc_handles(self):
+        clear_ipc_handles(self._ptr)
+
     def close(self):
         if self._ptr:
             dispose(self._ptr)
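For orientation, these two Python hunks complete a call chain from the module-level helper down to the C++ member function. A hedged sketch of that chain; every name in it appears in the diff, and the helper is a no-op when the TP all-reduce group was never created:

    # Call chain added by this PR (sketch; comments trace the dispatch).
    from fastdeploy.distributed.communication import custom_ar_clear_ipc_handles

    # custom_ar_clear_ipc_handles()         # module helper, no-op if _TP_AR is None
    #   -> _TP_AR.clear_ipc_handles()       # CustomAllreduce wrapper method
    #     -> clear_ipc_handles(self._ptr)   # fastdeploy_ops GPU op
    #       -> fa->clear_ipc_handles()      # C++ CustomAllreduce member
    custom_ar_clear_ipc_handles()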
@@ -25,7 +25,10 @@ from paddle.device.cuda import graphs
 
 from fastdeploy import envs
 from fastdeploy.config import FDConfig
-from fastdeploy.distributed.communication import capture_custom_allreduce
+from fastdeploy.distributed.communication import (
+    capture_custom_allreduce,
+    custom_ar_clear_ipc_handles,
+)
 from fastdeploy.utils import get_logger
 
 logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.log")
@@ -227,6 +230,7 @@ class CudaGraphPiecewiseBackend:
     def clear_graph(self):
         """ """
         # Clear graphs
+        custom_ar_clear_ipc_handles()
         for id, entry in self.concrete_size_entries.items():
             if entry.cuda_graph:
                 del entry.cuda_graph
@@ -66,6 +66,7 @@ class DynamicWeightManager:
 
         # step1 : restart paddle process group
         if not self.first_load:
+            paddle.distributed.restart_process_group()
             paddle.distributed.restart_process_group(self.parallel_config.tp_group)
             if self.parallel_config.enable_expert_parallel:
                 paddle.distributed.restart_process_group(self.parallel_config.ep_group)
@@ -148,6 +149,7 @@ class DynamicWeightManager:
         if self.parallel_config.enable_expert_parallel:
             paddle.distributed.barrier(self.parallel_config.ep_group)
             paddle.distributed.shutdown_process_group(self.parallel_config.ep_group)
+        paddle.distributed.shutdown_process_group()
         self._update_shared_status(pid, ModelWeightsStatus.CLEARED)
 
     def _update_model_from_state(self, state_dict: Dict[str, paddle.Tensor], src_type: str):
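Together with the earlier @@ -66,6 hunk, shutdown and restart of the process groups are now symmetric: the default group is torn down when weights are cleared and brought back first on restart, which is the "add shutdown/restart default group" part of the commit message. A condensed view of the pairing; _clear, _restart, and their plain arguments are illustrative stand-ins for the real DynamicWeightManager methods and self.parallel_config fields, while the paddle.distributed calls are the ones shown in the diff:

    import paddle.distributed as dist

    # Condensed from the two DynamicWeightManager hunks (illustrative names).
    def _clear(ep_group, use_ep):
        if use_ep:
            dist.barrier(ep_group)
            dist.shutdown_process_group(ep_group)
        dist.shutdown_process_group()      # new: also shut down the default group

    def _restart(tp_group, ep_group, use_ep, first_load):
        if not first_load:
            dist.restart_process_group()   # new: restart the default group first
            dist.restart_process_group(tp_group)
            if use_ep:
                dist.restart_process_group(ep_group)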