[SOT] Extend SOT warmup support to new hardware (#3032)

* add new hardware

* add_sot_warmup4new_hardware

* fix conflict

* rm Optional
Ryan authored on 2025-07-29 22:45:20 +08:00, committed by GitHub
parent b2f9a42d87, commit 73cfe1fd37
7 changed files with 62 additions and 4 deletions

View File

@@ -434,7 +434,7 @@ class GraphOptimizationConfig:
         - With dynamic graph backend: ...
         - With static graph backend: WIP
         """
-        self.sot_warmup_sizes: Optional[list[int]] = []
+        self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
         """ Number of warmup runs for SOT warmup. """
         self.use_cudagraph: bool = False
         """Sizes to capture cudagraph.

View File

@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.guided_decoding import get_guided_backend
 from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
     LogitsProcessorBase,
@@ -79,8 +83,10 @@ class GCUModelRunner(ModelRunnerBase):
             self.sampler = SpeculativeSampler(fd_config)
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
 
         # Initialize share inputs
         self._init_share_inputs(self.parallel_config.max_num_seqs)
@@ -851,6 +857,17 @@ class GCUModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size:{batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None):
         """
         Get the index of the request that needs to be skipped during execution.
@@ -1041,6 +1058,7 @@ class GCUModelRunner(ModelRunnerBase):
             else:
                 raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model"""

View File

@@ -123,7 +123,8 @@ class GcuWorker(WorkerBase):
""" """
# 1. Warm up model # 1. Warm up model
# NOTE(gongshaotian): may be not need warm_up at this place # NOTE(gongshaotian): may be not need warm_up at this place
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
# 2. Triger cuda grpah capture # 2. Triger cuda grpah capture
self.model_runner.capture_model() self.model_runner.capture_model()

View File

@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
@@ -76,9 +80,11 @@ class IluvatarModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
         self.cudagraph_num_of_warmups = self.graph_opt_config.cudagraph_num_of_warmups
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
 
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs
@@ -806,6 +812,17 @@ class IluvatarModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size:{batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch):
         """
         Get the index of the request that needs to be skipped during execution.
@@ -987,6 +1004,7 @@ class IluvatarModelRunner(ModelRunnerBase):
             else:
                 raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""

View File

@@ -124,6 +124,8 @@ class IluvatarWorker(WorkerBase):
""" """
# 1. Warm up model # 1. Warm up model
# NOTE(gongshaotian): may be not need warm_up at this place # NOTE(gongshaotian): may be not need warm_up at this place
if self.model_runner.graph_opt_level >= 1:
self.model_runner.sot_warmup()
# 2. Triger cuda grpah capture # 2. Triger cuda grpah capture
self.model_runner.capture_model() self.model_runner.capture_model()

View File

@@ -26,6 +26,10 @@ from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request, RequestType
 from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,
@@ -346,7 +350,9 @@ class XPUModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = False
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
 
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs
@@ -764,6 +770,17 @@ class XPUModelRunner(ModelRunnerBase):
logger.warn("XPU not support cuda graph currently") logger.warn("XPU not support cuda graph currently")
pass pass
@sot_warmup_guard(True)
def sot_warmup(self) -> None:
start_time = time.perf_counter()
for batch_size in self.sot_warmup_sizes:
self._dummy_run(
num_tokens=self.parallel_config.max_num_batched_tokens,
batch_size=batch_size,
)
logger.info(f"SOT warmup the model with the batch size:{batch_size}")
logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
def exist_prefill(self): def exist_prefill(self):
""" """
check whether prefill stage exist check whether prefill stage exist
@@ -901,6 +918,7 @@ class XPUModelRunner(ModelRunnerBase):
         self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""

View File

@@ -70,9 +70,10 @@ class XpuWorker(WorkerBase):
     def graph_optimize_and_warm_up_model(self) -> None:
         """
-        Optimizes the inference graph using the specified optimization options.
+        Perform the warm-up and the graph optimization.
         """
-        logger.warn("XPU current could not graph optimize and warm up model")
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
 
     def determine_available_memory(self) -> int:
         """