diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py index 6341d3d71..863ab0a44 100644 --- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py +++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py @@ -133,8 +133,9 @@ class CudaGraphPiecewiseBackend: self.cuda_graph_manager.state = jit_utils.CUDAGraphState.CAPTURE self.cuda_graph_manager.batch_size = entry.real_shape entry.captured = True - with self.cuda_graph_manager.run_impl_guard(): - entry.runnable(**kwargs) + with capture_custom_allreduce(): + with self.cuda_graph_manager.run_impl_guard(): + entry.runnable(**kwargs) # Replay self.cuda_graph_manager.state = jit_utils.CUDAGraphState.REPLAY diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 02653c86a..7744dec26 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1575,6 +1575,7 @@ class GPUModelRunner(ModelRunnerBase): self.proposer.update_task_chunk_prefill(task) task.chunk_idx += 1 + @sot_warmup_guard(True) def capture_model(self) -> None: """ Trigger CUDA Graph capture for all shapes in cuda graph capture list diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py index 601efd16b..8c72cfc75 100644 --- a/fastdeploy/worker/gpu_worker.py +++ b/fastdeploy/worker/gpu_worker.py @@ -207,7 +207,7 @@ class GpuWorker(WorkerBase): """ Perform the warm-up and the graph optimization """ - if self.fd_config.graph_opt_config.graph_opt_level >= 1: + if self.fd_config.graph_opt_config.graph_opt_level >= 1 and not self.model_runner.use_cudagraph: self.model_runner.sot_warmup() # Trigger cuda graph capture self.model_runner.capture_model()