diff --git a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
index 6341d3d71..863ab0a44 100644
--- a/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/cudagraph_piecewise_backend.py
@@ -133,8 +133,9 @@ class CudaGraphPiecewiseBackend:
             self.cuda_graph_manager.state = jit_utils.CUDAGraphState.CAPTURE
             self.cuda_graph_manager.batch_size = entry.real_shape
             entry.captured = True
-            with self.cuda_graph_manager.run_impl_guard():
-                entry.runnable(**kwargs)
+            with capture_custom_allreduce():
+                with self.cuda_graph_manager.run_impl_guard():
+                    entry.runnable(**kwargs)
 
         # Replay
         self.cuda_graph_manager.state = jit_utils.CUDAGraphState.REPLAY
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 02653c86a..7744dec26 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1575,6 +1575,7 @@ class GPUModelRunner(ModelRunnerBase):
                 self.proposer.update_task_chunk_prefill(task)
             task.chunk_idx += 1
 
+    @sot_warmup_guard(True)
     def capture_model(self) -> None:
         """
         Trigger CUDA Graph capture for all shapes in cuda graph capture list
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index 601efd16b..8c72cfc75 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -207,7 +207,7 @@ class GpuWorker(WorkerBase):
         """
         Perform the warm-up and the graph optimization
         """
-        if self.fd_config.graph_opt_config.graph_opt_level >= 1:
+        if self.fd_config.graph_opt_config.graph_opt_level >= 1 and not self.model_runner.use_cudagraph:
             self.model_runner.sot_warmup()
         # Trigger cuda graph capture
         self.model_runner.capture_model()