diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 41086f5f5..83c18b512 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -434,7 +434,7 @@ class GraphOptimizationConfig:
         - With dynamic graph backend: ...
         - With static graph backend: WIP
         """
-        self.sot_warmup_sizes: Optional[list[int]] = []
+        self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
         """ Number of warmup runs for SOT warmup. """
         self.use_cudagraph: bool = False
         """Sizes to capture cudagraph.
diff --git a/fastdeploy/worker/gcu_model_runner.py b/fastdeploy/worker/gcu_model_runner.py
index 26522044f..6a8150f62 100644
--- a/fastdeploy/worker/gcu_model_runner.py
+++ b/fastdeploy/worker/gcu_model_runner.py
@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.guided_decoding import get_guided_backend
 from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
     LogitsProcessorBase,
@@ -79,8 +83,10 @@ class GCUModelRunner(ModelRunnerBase):
             self.sampler = SpeculativeSampler(fd_config)
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
 
         # Initialize share inputs
         self._init_share_inputs(self.parallel_config.max_num_seqs)
@@ -851,6 +857,17 @@ class GCUModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size: {batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None):
         """
         Get the index of the request that needs to be skipped during execution.
@@ -1041,6 +1058,7 @@ class GCUModelRunner(ModelRunnerBase):
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward'")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
 
diff --git a/fastdeploy/worker/gcu_worker.py b/fastdeploy/worker/gcu_worker.py
index 004e0e801..77a8a50d4 100644
--- a/fastdeploy/worker/gcu_worker.py
+++ b/fastdeploy/worker/gcu_worker.py
@@ -123,7 +123,8 @@ class GcuWorker(WorkerBase):
         """
         # 1. Warm up model
         # NOTE(gongshaotian): warm_up may not be needed at this place
-
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
         # 2. Trigger cuda graph capture
         self.model_runner.capture_model()
 
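The sot_warmup_guard and profile_run_guard decorators used above come from fastdeploy.model_executor.graph_optimization.utils, whose implementation is not part of this diff. A minimal sketch of the boolean-gated guard pattern their call sites suggest is shown below; the flag name and exact behavior are assumptions for illustration, not the actual FastDeploy utilities.

# Illustrative sketch only; the real guards live in
# fastdeploy/model_executor/graph_optimization/utils.py and may behave differently.
from functools import wraps

_SOT_WARMUP_RUNNING = False  # hypothetical process-wide flag


def sot_warmup_guard(enabled: bool):
    """Flip the warmup flag around the wrapped call when enabled is True."""

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            global _SOT_WARMUP_RUNNING
            if not enabled:
                return func(*args, **kwargs)
            _SOT_WARMUP_RUNNING = True
            try:
                return func(*args, **kwargs)
            finally:
                _SOT_WARMUP_RUNNING = False

        return wrapper

    return decorator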
diff --git a/fastdeploy/worker/iluvatar_model_runner.py b/fastdeploy/worker/iluvatar_model_runner.py
index 2fab26b74..db6a25e13 100644
--- a/fastdeploy/worker/iluvatar_model_runner.py
+++ b/fastdeploy/worker/iluvatar_model_runner.py
@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
@@ -76,9 +80,11 @@ class IluvatarModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
         self.cudagraph_num_of_warmups = self.graph_opt_config.cudagraph_num_of_warmups
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs
@@ -806,6 +812,17 @@ class IluvatarModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size: {batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch):
         """
         Get the index of the request that needs to be skipped during execution.
@@ -987,6 +1004,7 @@ class IluvatarModelRunner(ModelRunnerBase):
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward'")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
 
diff --git a/fastdeploy/worker/iluvatar_worker.py b/fastdeploy/worker/iluvatar_worker.py
index 8ed74c6fe..6c390584f 100644
--- a/fastdeploy/worker/iluvatar_worker.py
+++ b/fastdeploy/worker/iluvatar_worker.py
@@ -124,6 +124,8 @@ class IluvatarWorker(WorkerBase):
         """
         # 1. Warm up model
         # NOTE(gongshaotian): warm_up may not be needed at this place
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
         # 2. Trigger cuda graph capture
         self.model_runner.capture_model()
 
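With the config change at the top of this diff, sot_warmup_sizes now defaults to [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128] instead of an empty list, and each entry drives one _dummy_run during SOT warm-up. A hypothetical tuning snippet is sketched below; it assumes a GraphOptimizationConfig instance named graph_opt_config is already in hand (construction details omitted). Batch sizes left out of the list will likely pay the dynamic-to-static conversion cost on their first real request instead.

# Hypothetical tuning example; graph_opt_config is an existing
# GraphOptimizationConfig instance whose construction is not shown here.
graph_opt_config.graph_opt_level = 1  # >= 1 enables the SOT warm-up path in the workers
graph_opt_config.sot_warmup_sizes = [1, 8, 32, 128]  # fewer dummy runs, shorter warm-up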
diff --git a/fastdeploy/worker/xpu_model_runner.py b/fastdeploy/worker/xpu_model_runner.py
index 01eff6c7c..bcf0c8df6 100644
--- a/fastdeploy/worker/xpu_model_runner.py
+++ b/fastdeploy/worker/xpu_model_runner.py
@@ -26,6 +26,10 @@ from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request, RequestType
 from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
@@ -346,7 +350,9 @@ class XPUModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = False
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs
@@ -764,6 +770,17 @@
         logger.warn("XPU does not support cuda graph currently")
         pass
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size: {batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def exist_prefill(self):
         """
         check whether prefill stage exist
@@ -901,6 +918,7 @@
         self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
 
diff --git a/fastdeploy/worker/xpu_worker.py b/fastdeploy/worker/xpu_worker.py
index 82e239202..0332d34d2 100644
--- a/fastdeploy/worker/xpu_worker.py
+++ b/fastdeploy/worker/xpu_worker.py
@@ -70,9 +70,10 @@ class XpuWorker(WorkerBase):
 
     def graph_optimize_and_warm_up_model(self) -> None:
         """
-        Optimizes the inference graph using the specified optimization options.
+        Perform SOT warm-up and graph optimization.
         """
-        logger.warn("XPU current could not graph optimize and warm up model")
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
 
     def determine_available_memory(self) -> int:
         """
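Across the three backends the worker-side change has the same shape: run SOT warm-up whenever graph_opt_level >= 1, then capture CUDA graphs only where the backend supports them (the XPU worker stops after the warm-up). A condensed sketch of that flow is given below; runner stands in for any of the model runners above, and supports_cudagraph is an assumed capability flag rather than an attribute introduced by this diff.

# Condensed sketch of graph_optimize_and_warm_up_model across backends;
# runner stands in for a GCU/Iluvatar/XPU model runner and supports_cudagraph
# is an assumed capability flag, not an attribute introduced by this diff.
def graph_optimize_and_warm_up_model(runner, supports_cudagraph: bool) -> None:
    # 1. SOT warm-up only when the static-graph optimization path is enabled.
    if runner.graph_opt_level >= 1:
        runner.sot_warmup()
    # 2. CUDA graph capture is skipped on backends without support (e.g. XPU).
    if supports_cudagraph:
        runner.capture_model()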