Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[SOT] Extend SOT warmup support to new hardware (#3032)
* add new hardware
* add_sot_warmup4new_hardware
* fix conflict
* rm Optional
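The change is the same on every backend touched here (GCU, Iluvatar, XPU): each model runner gains a sot_warmup() method, wrapped in sot_warmup_guard(True), that replays dummy batches for every size in graph_opt_config.sot_warmup_sizes, and each worker calls it from graph_optimize_and_warm_up_model() when graph_opt_level >= 1. A minimal, self-contained sketch of that pattern follows; the guard and _dummy_run bodies are placeholders rather than FastDeploy's implementations, and RunnerSketch is a hypothetical class used only for illustration.

import time


def sot_warmup_guard(enabled: bool):
    # Placeholder for fastdeploy.model_executor.graph_optimization.utils.sot_warmup_guard;
    # the real guard toggles SOT (dygraph-to-static) warmup state around the wrapped call.
    def decorator(fn):
        def wrapper(*args, **kwargs):
            return fn(*args, **kwargs)
        return wrapper
    return decorator


class RunnerSketch:
    # Hypothetical stand-in for the GCU/Iluvatar/XPU model runners touched by this commit.
    def __init__(self, sot_warmup_sizes, max_num_batched_tokens=2048):
        self.sot_warmup_sizes = sot_warmup_sizes
        self.max_num_batched_tokens = max_num_batched_tokens
        self.graph_opt_level = 1  # >= 1 selects a SOT/static-graph backend

    def _dummy_run(self, num_tokens, batch_size):
        # Placeholder: the real method builds dummy inputs and runs one forward pass.
        pass

    @sot_warmup_guard(True)
    def sot_warmup(self):
        start_time = time.perf_counter()
        for batch_size in self.sot_warmup_sizes:
            self._dummy_run(num_tokens=self.max_num_batched_tokens, batch_size=batch_size)
        print(f"SOT warmup took {time.perf_counter() - start_time:.4f} seconds")


# Worker-side gate, mirroring graph_optimize_and_warm_up_model() in the hunks below.
runner = RunnerSketch(sot_warmup_sizes=[1, 2, 4, 8])
if runner.graph_opt_level >= 1:
    runner.sot_warmup()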
@@ -434,7 +434,7 @@ class GraphOptimizationConfig:
         - With dynamic graph backend: ...
         - With static graph backend: WIP
         """
-        self.sot_warmup_sizes: Optional[list[int]] = []
+        self.sot_warmup_sizes: list[int] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
         """ Number of warmup runs for SOT warmup. """
         self.use_cudagraph: bool = False
         """Sizes to capture cudagraph.
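The hunk above replaces the empty Optional[list[int]] default with a fixed ladder of warmup batch sizes, so SOT warmup covers common shapes without extra configuration. A deployment that serves only a few batch sizes can still narrow the list; the sketch below is a hedged illustration using a stand-in dataclass, not FastDeploy's actual GraphOptimizationConfig constructor.

from dataclasses import dataclass, field


@dataclass
class GraphOptConfigSketch:
    # Stand-in carrying only the field relevant here; mirrors the new default above.
    sot_warmup_sizes: list = field(
        default_factory=lambda: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 16, 32, 64, 128]
    )


cfg = GraphOptConfigSketch()
# Narrow the warmup ladder to the batch sizes the service actually expects,
# so warmup does not spend time tracing shapes that will never be used.
cfg.sot_warmup_sizes = [1, 4, 8]
print(cfg.sot_warmup_sizes)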
|
@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.guided_decoding import get_guided_backend
 from fastdeploy.model_executor.guided_decoding.base_guided_decoding import (
     LogitsProcessorBase,

@@ -79,8 +83,10 @@ class GCUModelRunner(ModelRunnerBase):
             self.sampler = SpeculativeSampler(fd_config)
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
 
         # Initialize share inputs
         self._init_share_inputs(self.parallel_config.max_num_seqs)

@@ -851,6 +857,17 @@ class GCUModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size:{batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch: Optional[List[Request]] = None):
         """
         Get the index of the request that needs to be skipped during execution.

@@ -1041,6 +1058,7 @@ class GCUModelRunner(ModelRunnerBase):
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward'")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model"""
|
@@ -123,7 +123,8 @@ class GcuWorker(WorkerBase):
         """
         # 1. Warm up model
         # NOTE(gongshaotian): warm_up may not be needed at this place
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
         # 2. Trigger cuda graph capture
         self.model_runner.capture_model()
|
@@ -26,6 +26,10 @@ from paddleformers.utils.log import logger
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request
 from fastdeploy.model_executor.forward_meta import ForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,

@@ -76,9 +80,11 @@ class IluvatarModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = self.graph_opt_config.use_cudagraph
         self.cudagraph_capture_sizes = list(reversed(self.graph_opt_config.cudagraph_capture_sizes))
         self.cudagraph_num_of_warmups = self.graph_opt_config.cudagraph_num_of_warmups
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs

@@ -806,6 +812,17 @@ class IluvatarModelRunner(ModelRunnerBase):
         time_after_capture = time.perf_counter()
         logger.info(f"Cuda Graph capturing took {time_after_capture - time_before_capture} seconds")
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size:{batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def _get_skip_idx(self, model_forward_batch):
         """
         Get the index of the request that needs to be skipped during execution.

@@ -987,6 +1004,7 @@ class IluvatarModelRunner(ModelRunnerBase):
         else:
             raise ValueError(f"{type(self.model)} has no attribute 'empty_input_forward'")
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
|
@@ -124,6 +124,8 @@ class IluvatarWorker(WorkerBase):
         """
         # 1. Warm up model
         # NOTE(gongshaotian): warm_up may not be needed at this place
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
 
         # 2. Trigger cuda graph capture
         self.model_runner.capture_model()
|
@@ -26,6 +26,10 @@ from fastdeploy import envs
 from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request, RequestType
 from fastdeploy.model_executor.forward_meta import ForwardMeta, XPUForwardMeta
+from fastdeploy.model_executor.graph_optimization.utils import (
+    profile_run_guard,
+    sot_warmup_guard,
+)
 from fastdeploy.model_executor.layers.attention import get_attention_backend
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend,

@@ -346,7 +350,9 @@ class XPUModelRunner(ModelRunnerBase):
         # self.kv_caches: list[paddle.Tensor] = []
 
         # Cuda Graph
+        self.graph_opt_level = self.graph_opt_config.graph_opt_level
         self.use_cudagraph = False
+        self.sot_warmup_sizes = self.graph_opt_config.sot_warmup_sizes
         self.input_ids = paddle.zeros(self.parallel_config.max_num_seqs, dtype="int32")
 
         # Initialize share inputs

@@ -764,6 +770,17 @@ class XPUModelRunner(ModelRunnerBase):
         logger.warn("XPU not support cuda graph currently")
         pass
 
+    @sot_warmup_guard(True)
+    def sot_warmup(self) -> None:
+        start_time = time.perf_counter()
+        for batch_size in self.sot_warmup_sizes:
+            self._dummy_run(
+                num_tokens=self.parallel_config.max_num_batched_tokens,
+                batch_size=batch_size,
+            )
+            logger.info(f"SOT warmup the model with the batch size:{batch_size}")
+        logger.info(f"SOT warmup took {time.perf_counter() - start_time} seconds")
+
     def exist_prefill(self):
         """
         check whether prefill stage exist

@@ -901,6 +918,7 @@ class XPUModelRunner(ModelRunnerBase):
         self.num_gpu_blocks = self.parallel_config.total_block_num
         self.initialize_kv_cache()
 
+    @profile_run_guard(True)
     def profile_run(self) -> None:
         """Execute a forward pass with dummy inputs to profile the memory usage of the model."""
|
@@ -70,9 +70,10 @@ class XpuWorker(WorkerBase):
 
     def graph_optimize_and_warm_up_model(self) -> None:
         """
-        Optimizes the inference graph using the specified optimization options.
+        Perform the warm-up and the graph optimization
         """
-        logger.warn("XPU current could not graph optimize and warm up model")
+        if self.model_runner.graph_opt_level >= 1:
+            self.model_runner.sot_warmup()
 
     def determine_available_memory(self) -> int:
         """
|