mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-01 06:42:23 +08:00
[Executor] Change cudagraph hashkey from batch size to num_tokens (#3454)
This commit is contained in:
@@ -487,7 +487,7 @@ class GraphOptimizationConfig:
|
|||||||
self.full_cuda_graph: bool = True
|
self.full_cuda_graph: bool = True
|
||||||
|
|
||||||
self.max_capture_size: int = None
|
self.max_capture_size: int = None
|
||||||
self.batch_size_to_captured_size: dict[int, int] = None
|
self.real_shape_to_captured_size: dict[int, int] = None
|
||||||
# CINN Config ...
|
# CINN Config ...
|
||||||
if args is not None:
|
if args is not None:
|
||||||
for key, value in args.items():
|
for key, value in args.items():
|
||||||
@@ -516,26 +516,26 @@ class GraphOptimizationConfig:
|
|||||||
self.cudagraph_capture_sizes.sort(reverse=True)
|
self.cudagraph_capture_sizes.sort(reverse=True)
|
||||||
self.max_capture_size = self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
|
self.max_capture_size = self.cudagraph_capture_sizes[0] if self.cudagraph_capture_sizes else 0
|
||||||
|
|
||||||
# Pre-compute the mapping from batch size to padded graph size
|
# Pre-compute the mapping from shape to padded graph size
|
||||||
self.batch_size_to_captured_size = {}
|
self.real_shape_to_captured_size = {}
|
||||||
for end, start in zip(self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]):
|
for end, start in zip(self.cudagraph_capture_sizes, self.cudagraph_capture_sizes[1:] + [0]):
|
||||||
for bs in range(start, end):
|
for bs in range(start, end):
|
||||||
if bs == start:
|
if bs == start:
|
||||||
self.batch_size_to_captured_size[bs] = start
|
self.real_shape_to_captured_size[bs] = start
|
||||||
else:
|
else:
|
||||||
self.batch_size_to_captured_size[bs] = end
|
self.real_shape_to_captured_size[bs] = end
|
||||||
self.batch_size_to_captured_size[self.max_capture_size] = self.max_capture_size
|
self.real_shape_to_captured_size[self.max_capture_size] = self.max_capture_size
|
||||||
|
|
||||||
def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
|
def _set_cudagraph_sizes(self, max_num_seqs: int = 0):
|
||||||
"""
|
"""
|
||||||
Calculate a series of candidate capture batch sizes,
|
Calculate a series of candidate capture sizes,
|
||||||
and then extract a portion of them as the capture list for the CUDA graph based on user input.
|
and then extract a portion of them as the capture list for the CUDA graph based on user input.
|
||||||
"""
|
"""
|
||||||
# Batch Size [1, 2, 4, 8, 16, ... 120, 128]
|
# Shape [1, 2, 4, 8, 16, ... 120, 128]
|
||||||
draft_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
|
draft_capture_sizes = [1, 2, 4] + [8 * i for i in range(1, 17)]
|
||||||
# Batch Size [128, 144, ... 240, 256]
|
# Shape [128, 144, ... 240, 256]
|
||||||
draft_capture_sizes += [16 * i for i in range(9, 17)]
|
draft_capture_sizes += [16 * i for i in range(9, 17)]
|
||||||
# Batch Size [256, 288, ... 992, 1024]
|
# Shape [256, 288, ... 992, 1024]
|
||||||
draft_capture_sizes += [32 * i for i in range(17, 33)]
|
draft_capture_sizes += [32 * i for i in range(17, 33)]
|
||||||
|
|
||||||
draft_capture_sizes.append(max_num_seqs)
|
draft_capture_sizes.append(max_num_seqs)
|
||||||
|
@@ -29,9 +29,9 @@ logger = get_logger("cudagrpah_piecewise_backend", "cudagraph_piecewise_backend.
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class ConcreteSizeEntry:
|
class ConcreteSizeEntry:
|
||||||
"""Record the concrete information corresponding to the current batch size"""
|
"""Record the concrete information corresponding to the current shape(num_tokens)"""
|
||||||
|
|
||||||
# Concrete batch size
|
# Concrete shape
|
||||||
runtime_bs: int
|
runtime_bs: int
|
||||||
# The size is in cudagraph_capture_sizes
|
# The size is in cudagraph_capture_sizes
|
||||||
use_cudagraph: bool = True
|
use_cudagraph: bool = True
|
||||||
@@ -42,7 +42,7 @@ class ConcreteSizeEntry:
|
|||||||
runnable: Callable = None # type: ignore
|
runnable: Callable = None # type: ignore
|
||||||
# Number of completed warmups
|
# Number of completed warmups
|
||||||
num_finished_warmup: int = 0
|
num_finished_warmup: int = 0
|
||||||
# Captured cuda graph object corresponding to the current batch size
|
# Captured cuda graph object corresponding to the current real shape
|
||||||
cuda_graph: Optional[graphs.CUDAGraph] = None
|
cuda_graph: Optional[graphs.CUDAGraph] = None
|
||||||
# Output buffer of cudagraph
|
# Output buffer of cudagraph
|
||||||
output_buffer: Optional[paddle.Tensor] = None
|
output_buffer: Optional[paddle.Tensor] = None
|
||||||
@@ -60,33 +60,33 @@ class CudaGraphPiecewiseBackend:
|
|||||||
self.runnable = runnable
|
self.runnable = runnable
|
||||||
self.cudagraph_capture_sizes = fd_config.graph_opt_config.cudagraph_capture_sizes
|
self.cudagraph_capture_sizes = fd_config.graph_opt_config.cudagraph_capture_sizes
|
||||||
self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
|
self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
|
||||||
self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size
|
self.real_shape_to_captured_size = fd_config.graph_opt_config.real_shape_to_captured_size
|
||||||
|
|
||||||
# Runtime batch size -> ConcreteSizeEntry
|
# Runtime real shape -> ConcreteSizeEntry
|
||||||
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
|
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
|
||||||
|
|
||||||
for shape in self.cudagraph_capture_sizes:
|
for shape in self.cudagraph_capture_sizes:
|
||||||
self.concrete_size_entries[shape] = ConcreteSizeEntry(runtime_bs=shape)
|
self.concrete_size_entries[shape] = ConcreteSizeEntry(runtime_bs=shape)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all batch sizes entry."
|
f"[CUDA GRAPH] CUDAGraph capture list {self.cudagraph_capture_sizes}, " "Created all real shape entry."
|
||||||
)
|
)
|
||||||
|
|
||||||
def __call__(self, **kwargs):
|
def __call__(self, **kwargs):
|
||||||
# Get batch size
|
# Get real shape(all num tokens)
|
||||||
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
|
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
|
||||||
batch_size = ids_remove_padding.shape[0]
|
real_shape = ids_remove_padding.shape[0]
|
||||||
padding_batch_size = self.batch_size_to_captured_size[batch_size]
|
padding_real_shape = self.real_shape_to_captured_size[real_shape]
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, "
|
f"[CUDA GRAPH] The actual real shape obtained by CUDAGraph is :{real_shape}, "
|
||||||
f"The padded batch size is :{padding_batch_size}"
|
f"The padded shape is :{padding_real_shape}"
|
||||||
)
|
)
|
||||||
|
|
||||||
entry = self.concrete_size_entries.get(padding_batch_size)
|
entry = self.concrete_size_entries.get(padding_real_shape)
|
||||||
assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
|
assert entry is not None, f"real shape:{padding_real_shape} is not in cuda graph capture list."
|
||||||
if entry.runnable is None:
|
if entry.runnable is None:
|
||||||
entry.runnable = self.runnable
|
entry.runnable = self.runnable
|
||||||
logger.debug(f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}")
|
logger.debug(f"[CUDA GRAPH] New entry lazy initialize with real shape {padding_real_shape}")
|
||||||
|
|
||||||
if not entry.use_cudagraph:
|
if not entry.use_cudagraph:
|
||||||
return entry.runnable(**kwargs)
|
return entry.runnable(**kwargs)
|
||||||
@@ -98,7 +98,7 @@ class CudaGraphPiecewiseBackend:
|
|||||||
entry.num_finished_warmup += 1
|
entry.num_finished_warmup += 1
|
||||||
entry.runnable(**kwargs)
|
entry.runnable(**kwargs)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
f"[CUDA GRAPH] Warm up for batch size {padding_batch_size}, "
|
f"[CUDA GRAPH] Warm up for real shape {padding_real_shape}, "
|
||||||
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
|
f"finished ({n + 1}/{entry.num_finished_warmup}) times"
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -122,9 +122,9 @@ class CudaGraphPiecewiseBackend:
|
|||||||
output._clear
|
output._clear
|
||||||
|
|
||||||
paddle.device.synchronize()
|
paddle.device.synchronize()
|
||||||
logger.debug(f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}")
|
logger.debug(f"[CUDA GRAPH] CUDAGraph captured for real shape {padding_real_shape}")
|
||||||
|
|
||||||
# Replay
|
# Replay
|
||||||
entry.cuda_graph.replay()
|
entry.cuda_graph.replay()
|
||||||
logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
|
logger.debug(f"[CUDA GRAPH] CUDAGraph replayed for real shape {padding_real_shape}")
|
||||||
return entry.output_buffer
|
return entry.output_buffer
|
||||||
|
Reference in New Issue
Block a user