Mirror of https://github.com/PaddlePaddle/FastDeploy.git, synced 2025-10-05 08:37:06 +08:00
[SOT] Add sot warmup (NVIDIA GPU Only) (#2929)
* add sot warmup
* fix code style
* change batch_size list
* add param to config
* rm free_list settings && set sot_warmup_sizes
* finish debug with dynamic dims by type annotations
* add profile_run guard
* rm sth useless
@@ -25,6 +25,10 @@ from paddleformers.utils.log import logger

from fastdeploy.config import FDConfig
from fastdeploy.engine.request import Request
from fastdeploy.model_executor.graph_optimization.utils import (
    profile_run_guard,
    sot_warmup_guard,
)
from fastdeploy.model_executor.guided_decoding import get_guided_backend
from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
    LogitsProcessorBase,
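profile_run_guard and sot_warmup_guard are imported from fastdeploy.model_executor.graph_optimization.utils, but their definitions are not part of this diff. As a rough, hypothetical sketch of the guard pattern (the flag name and scope below are assumptions, not the actual FastDeploy implementation), each decorator can simply flip a state flag for the duration of the wrapped call:

import functools

# Illustrative guard pattern only: mark that the wrapped call is a warmup/profiling
# pass so downstream code can avoid treating it as real traffic.
_in_sot_warmup = False

def sot_warmup_guard(enabled: bool):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            global _in_sot_warmup
            previous, _in_sot_warmup = _in_sot_warmup, enabled
            try:
                return func(*args, **kwargs)
            finally:
                _in_sot_warmup = previous  # restore on exit, even if the warmup raises
        return wrapper
    return decorator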
@@ -113,8 +117,10 @@ class GPUModelRunner(ModelRunnerBase):
        # self.kv_caches: list[paddle.Tensor] = []

        # Cuda Graph
        self.graph_opt_level = self.graph_opt_config.graph_opt_level
        self.use_cudagraph = self.graph_opt_config.use_cudagraph
        self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes

        # Initialize share inputs
        self._init_share_inputs(self.parallel_config.max_num_seqs)
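self.sot_warmup_sizes is read from the graph optimization config ("add param to config" in the commit message); the config change itself is outside this hunk. A minimal sketch of what such a field could look like, where the class below is an illustrative stand-in rather than the real FastDeploy config:

from dataclasses import dataclass, field
from typing import List

@dataclass
class GraphOptimizationConfig:  # illustrative stand-in only
    graph_opt_level: int = 0
    use_cudagraph: bool = False
    cudagraph_capture_sizes: List[int] = field(default_factory=lambda: [1, 2, 4, 8])
    # Batch sizes replayed during SOT warmup so each bucket is converted to a
    # static graph before real requests arrive (example defaults, not the shipped ones).
    sot_warmup_sizes: List[int] = field(default_factory=lambda: [1, 8, 32])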
@@ -367,9 +373,6 @@ class GPUModelRunner(ModelRunnerBase):
    def _dummy_prefill_inputs(self, num_tokens: int, batch_size: int, expected_decode_len: int):
        """Set dummy prefill inputs to share_inputs"""
        # NOTE(gongshaotian): The maximum decoding length is equal to the expected decoded tokens plus the eos token
        if self.enable_mm:
            self.share_inputs["free_list"] = paddle.to_tensor([], dtype="int32")
            self.share_inputs["free_list_len"][0] = 0
        max_dec_len = expected_decode_len + 1
        full_length = min(
            num_tokens // batch_size,
@@ -1007,6 +1010,17 @@ class GPUModelRunner(ModelRunnerBase):
        time_after_capture = time.perf_counter()
        logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")

    @sot_warmup_guard(True)
    def sot_warmup(self) -> None:
        start_time = time.perf_counter()
        for batch_size in self.sot_warmup_sizes:
            self._dummy_run(
                num_tokens=self.parallel_config.max_num_batched_tokens,
                batch_size=batch_size,
            )
            logger.info(f"SOT warmup the model with the batch size:{batch_size}")
        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")

    def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None):
        """
        Get the index of the request that needs to be skipped during execution.
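sot_warmup simply replays _dummy_run once per configured batch size so the SOT (dynamic-to-static) conversion happens ahead of serving instead of on the first real request. A hypothetical sketch of the worker-side call order; the function and method names below are illustrative, not taken from this diff:

def warm_up_model_runner(model_runner, graph_opt_config):
    # Illustrative call order only:
    # 1. profile memory with dummy inputs (decorated with @profile_run_guard(True))
    model_runner.profile_run()
    # 2. trigger SOT compilation for every warmup batch size
    if graph_opt_config.graph_opt_level > 0:
        model_runner.sot_warmup()
    # 3. only then capture CUDA Graphs, so captures run over already-compiled code
    if graph_opt_config.use_cudagraph:
        model_runner.capture_model()  # hypothetical name for the capture entry point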
@@ -1208,6 +1222,7 @@ class GPUModelRunner(ModelRunnerBase):
        else:
            raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")

    @profile_run_guard(True)
    def profile_run(self) -> None:
        """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
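profile_run drives the same dummy-input path under @profile_run_guard(True) to measure the model's peak memory before the KV cache is sized. As a rough illustration of how such a measurement is typically consumed (this helper is an assumption, not FastDeploy code):

import paddle

def estimate_num_kv_blocks(gpu_memory_utilization: float, bytes_per_block: int) -> int:
    # Illustrative only: after a profiling forward pass, the peak allocation
    # approximates weights + worst-case activations; whatever remains of the
    # memory budget can be handed to the KV cache.
    total = paddle.device.cuda.get_device_properties().total_memory
    peak = paddle.device.cuda.max_memory_allocated()
    budget = int(total * gpu_memory_utilization) - peak
    return max(budget // bytes_per_block, 0)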