diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index dfeb82a1c..869cea33f 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -319,6 +319,8 @@ class GraphOptimizationConfig:
     - With dyncmic graph backend: ...
     - With static grpah backend: WIP
     """
+    sot_warmup_sizes: Optional[list[int]] = field(default_factory=list)
+    """ Batch sizes for SOT warmup runs. """
     use_cudagraph: bool = False
     """Sizes to capture cudagraph.
     - None (default): capture sizes are inferred from llm config.
diff --git a/fastdeploy/engine/config.py b/fastdeploy/engine/config.py
index 7c4473177..9c559ce32 100644
--- a/fastdeploy/engine/config.py
+++ b/fastdeploy/engine/config.py
@@ -429,6 +429,7 @@ class GraphOptimizationConfig:
         graph_opt_level: Optional[int] = 0,
         use_cudagraph: Optional[bool] = None,
         cudagraph_capture_sizes: Optional[List[int]] = None,
+        sot_warmup_sizes: Optional[List[int]] = None,
         **kwargs,
     ):
         """
@@ -444,6 +445,7 @@ class GraphOptimizationConfig:
         self.graph_opt_level = graph_opt_level
         self.use_cudagraph = use_cudagraph
         self.cudagraph_capture_sizes = cudagraph_capture_sizes
+        self.sot_warmup_sizes = [] if sot_warmup_sizes is None else sot_warmup_sizes
 
     def to_json_string(self):
         """
diff --git a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
index fac5237e2..9f56d313c 100644
--- a/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
+++ b/fastdeploy/model_executor/graph_optimization/graph_optimization_backend.py
@@ -31,26 +31,15 @@ from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend im
 from fastdeploy.model_executor.graph_optimization.dynamic_dims_marker import (
     resolve_dynamic_dims,
 )
+from fastdeploy.model_executor.graph_optimization.utils import in_profile_run_mode
+from fastdeploy.model_executor.graph_optimization.utils import (
+    in_sot_warmup_mode as in_warmup_mode,
+)
 
 P = ParamSpec("P")
 T = TypeVar("T")
 
-# TODO(SigureMo): Replace this fn with real implementation by DrRyanHuang
-def create_in_warmup_mode():
-    cnt = 0
-
-    def in_warmup_mode():
-        nonlocal cnt
-        cnt += 1
-        return cnt < 32
-
-    return in_warmup_mode
-
-
-in_warmup_mode = create_in_warmup_mode()
-
-
 def apply_to_static_optimization(fn: Callable[P, T], backend: ToStaticBackend) -> Callable[P, T]:
     forward_fn = fn
     forward_sig = inspect.signature(forward_fn)
@@ -99,6 +88,8 @@ def apply_to_static_optimization(fn: Callable[P, T], backend: ToStaticBackend) -
 
     @functools.wraps(forward_fn)
     def static_forward(self, *args, **kwargs):
+        if in_profile_run_mode():
+            return forward_fn(self, *args, **kwargs)
         nonlocal need_warmup
         is_warmup = in_warmup_mode() and need_warmup
         if is_warmup:
diff --git a/fastdeploy/model_executor/graph_optimization/utils.py b/fastdeploy/model_executor/graph_optimization/utils.py
new file mode 100644
index 000000000..80e7dc0ed
--- /dev/null
+++ b/fastdeploy/model_executor/graph_optimization/utils.py
@@ -0,0 +1,40 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import contextlib
+
+
+def create_guard(default_value):
+    _state = default_value
+
+    @contextlib.contextmanager
+    def state_guard(current_state):
+        nonlocal _state
+        old_state = _state
+        _state = current_state
+        try:
+            yield
+        finally:
+            _state = old_state
+
+    def get_state():
+        return _state
+
+    return state_guard, get_state
+
+
+sot_warmup_guard, in_sot_warmup_mode = create_guard(False)
+profile_run_guard, in_profile_run_mode = create_guard(False)
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 710ecaff3..15dc7de77 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -25,6 +25,10 @@ from paddleformers.utils.log import logger
 
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.guided_decoding import get_guided_backend
 from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
     LogitsProcessorBase,
@@ -113,8 +117,10 @@ class GPUModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
 
         # Initialize share inputs
         self._init_share_inputs(self.parallel_config.max_num_seqs)
@@ -367,9 +373,6 @@ class GPUModelRunner(ModelRunnerBase):
     def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int):
         """Set dummy prefill inputs to share_inputs"""
         # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
-        if self.enable_mm:
-            self.share_inputs["free_list"] = paddle.to_tensor([], dtype="int32")
-            self.share_inputs["free_list_len"][0] = 0
         max_dec_len = expected_decode_len + 1
         full_length = min(
             num_tokens // batch_size,
@@ -1007,6 +1010,17 @@ class GPUModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size:{batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None):
         """
         Get the index of the request that needs to be skipped during execution.
@@ -1208,6 +1222,7 @@ class GPUModelRunner(ModelRunnerBase):
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
 
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index 812b62935..a3abbf6d0 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -189,6 +189,8 @@ class GpuWorker(WorkerBase):
         """
         Perform the warm-up and the graph optimization
         """
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
         # Triger cuda grpah capture
         self.model_runner.capture_model()
 
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 17ed382b7..157300cd5 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -632,6 +632,7 @@ def initialize_fd_config(args, ranks: int = 1, local_rank: int = 0) -> FDConfig:
         use_cudagraph=args.graph_optimization_config["use_cudagraph"],
         graph_opt_level=args.graph_optimization_config["graph_opt_level"],
         cudagraph_capture_sizes=args.graph_optimization_config["cudagraph_capture_sizes"],
+        sot_warmup_sizes=args.graph_optimization_config["sot_warmup_sizes"],
     )
 
     # Note(tangbinhan): used for load_checkpoint
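
The new create_guard helper in fastdeploy/model_executor/graph_optimization/utils.py returns a (context manager, getter) pair that shares one closed-over flag: sot_warmup_guard(True) and profile_run_guard(True) decorate GPUModelRunner.sot_warmup and GPUModelRunner.profile_run, while in_sot_warmup_mode() and in_profile_run_mode() let static_forward check the current state. The snippet below is an illustrative usage sketch, not part of the diff:

# Illustrative sketch (not part of the diff): behavior of the guard/getter
# pair produced by create_guard(False).
from fastdeploy.model_executor.graph_optimization.utils import (
    in_sot_warmup_mode,
    sot_warmup_guard,
)

assert in_sot_warmup_mode() is False   # default state passed to create_guard(False)

with sot_warmup_guard(True):           # also usable as a decorator: @sot_warmup_guard(True)
    # Code running under the guard observes warmup mode; static_forward
    # uses the matching getter to decide whether to run SOT warmup.
    assert in_sot_warmup_mode() is True

assert in_sot_warmup_mode() is False   # previous state is restored on exit

With this wiring, sot_warmup_sizes flows from args.graph_optimization_config["sot_warmup_sizes"] through GraphOptimizationConfig into GPUModelRunner.sot_warmup(), which the worker only invokes when graph_opt_level >= 1.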