diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 41086f5f5..83c18b512 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -434,7 +434,7 @@ class GraphOptimizationConfig:
         - With dynamic graph backend: ...
         - With static graph backend: WIP
         """
-        self.sot_warmup_sizes: Optional[list[int]] = []
+        self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
         """ Number of warmup runs for SOT warmup. """
         self.use_cudagraph: bool = False
         """Sizes to capture cudagraph.
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 26522044f..6a8150f62 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.guided_decoding import get_guided_backend
 from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
     LogitsProcessorBase,
@@ -79,8 +83,10 @@ class GCUModelRunner(ModelRunnerBase):
             self.sampler = SpeculativeSampler(fd_config)
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
 
         # Initialize share inputs
         self._init_share_inputs(self.parallel_config.max_num_seqs)
@@ -851,6 +857,17 @@ class GCUModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size: {batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None):
         """
         Get the index of the request that needs to be skipped during execution.
@@ -1041,6 +1058,7 @@ class GCUModelRunner(ModelRunnerBase):
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward'")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
 
diff --git a/fastdeploy/worker/gcu_worker.py b/fastdeploy/worker/gcu_worker.py
index 004e0e801..77a8a50d4 100644
--- a/fastdeploy/worker/gcu_worker.py
+++ b/fastdeploy/worker/gcu_worker.py
@@ -123,7 +123,8 @@ class GcuWorker(WorkerBase):
         """
         # 1. Warm up model
         # NOTE(gongshaotian): warm_up may not be needed at this place
-
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
         # 2. Trigger cuda graph capture
         self.model_runner.capture_model()
 
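The sot_warmup_guard and profile_run_guard decorators used above come from fastdeploy.model_executor.graph_optimization.utils, whose implementation is not part of this diff. A minimal sketch of the boolean-gated guard pattern their call sites suggest is shown below; the flag name and exact behavior are assumptions for illustration, not the actual FastDeploy utilities.

# Illustrative sketch only; the real guards live in
# fastdeploy/model_executor/graph_optimization/utils.py and may behave differently.
from functools import wraps

_SOT_WARMUP_RUNNING = False  # hypothetical process-wide flag


def sot_warmup_guard(enabled: bool):
    """Flip the warmup flag around the wrapped call when enabled is True."""

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            global _SOT_WARMUP_RUNNING
            if not enabled:
                return func(*args, **kwargs)
            _SOT_WARMUP_RUNNING = True
            try:
                return func(*args, **kwargs)
            finally:
                _SOT_WARMUP_RUNNING = False

        return wrapper

    return decorator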
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index 2fab26b74..db6a25e13 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
@@ -76,9 +80,11 @@ class IluvatarModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
         self.cudagraph_num_of_warmups = self.graph_opt_config.cudagraph_num_of_warmups
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs
@@ -806,6 +812,17 @@ class IluvatarModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size: {batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch):
         """
         Get the index of the request that needs to be skipped during execution.
@@ -987,6 +1004,7 @@ class IluvatarModelRunner(ModelRunnerBase):
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward'")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
 
diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py
index 8ed74c6fe..6c390584f 100644
--- a/fastdeploy/worker/iluvatar_worker.py
+++ b/fastdeploy/worker/iluvatar_worker.py
@@ -124,6 +124,8 @@ class IluvatarWorker(WorkerBase):
         """
         # 1. Warm up model
         # NOTE(gongshaotian): warm_up may not be needed at this place
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
         # 2. Trigger cuda graph capture
         self.model_runner.capture_model()
 
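With the config change at the top of this diff, sot_warmup_sizes now defaults to [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128] instead of an empty list, and each entry drives one _dummy_run during SOT warm-up. A hypothetical tuning snippet is sketched below; it assumes a GraphOptimizationConfig instance named graph_opt_config is already in hand (construction details omitted). Batch sizes left out of the list will likely pay the dynamic-to-static conversion cost on their first real request instead.

# Hypothetical tuning example; graph_opt_config is an existing
# GraphOptimizationConfig instance whose construction is not shown here.
graph_opt_config.graph_opt_level = 1  # >= 1 enables the SOT warm-up path in the workers
graph_opt_config.sot_warmup_sizes = [1, 8, 32, 128]  # fewer dummy runs, shorter warm-up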
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 01eff6c7c..bcf0c8df6 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -26,6 +26,10 @@ from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request, RequestType
 from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
@@ -346,7 +350,9 @@ class XPUModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = False
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs
@@ -764,6 +770,17 @@
         logger.warn("XPU does not support cuda graph currently")
         pass
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size: {batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def exist_prefill(self):
         """
         check whether prefill stage exist
@@ -901,6 +918,7 @@
         self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
 
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
index 82e239202..0332d34d2 100644
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -70,9 +70,10 @@ class XpuWorker(WorkerBase):
 
     def graph_optimize_and_warm_up_model(self) -> None:
         """
-        Optimizes the inference graph using the specified optimization options.
+        Perform SOT warm-up and graph optimization.
         """
-        logger.warn("XPU current could not graph optimize and warm up model")
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
 
     def determine_available_memory(self) -> int:
         """
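Across the three backends the worker-side change has the same shape: run SOT warm-up whenever graph_opt_level >= 1, then capture CUDA graphs only where the backend supports them (the XPU worker stops after the warm-up). A condensed sketch of that flow is given below; runner stands in for any of the model runners above, and supports_cudagraph is an assumed capability flag rather than an attribute introduced by this diff.

# Condensed sketch of graph_optimize_and_warm_up_model across backends;
# runner stands in for a GCU/Iluvatar/XPU model runner and supports_cudagraph
# is an assumed capability flag, not an attribute introduced by this diff.
def graph_optimize_and_warm_up_model(runner, supports_cudagraph: bool) -> None:
    # 1. SOT warm-up only when the static-graph optimization path is enabled.
    if runner.graph_opt_level >= 1:
        runner.sot_warmup()
    # 2. CUDA graph capture is skipped on backends without support (e.g. XPU).
    if supports_cudagraph:
        runner.capture_model()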