mirror of https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 00:33:03 +08:00
Clear dead code and add supplementary notes (#2757)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled
* 1. Add supplementary notes; 2. delete dead code
* Fix bug of forward meta
* Global modification of forward meta
* Fix VL model_runner bug
@@ -46,13 +46,9 @@ class ConcreteSizeEntry:
     # Output buffer of cudagraph
     output_buffer: Optional[paddle.Tensor] = None

     # for cudagraph debugging, track the input addresses
     # during capture, and check if they are the same during replay
     input_addresses: Optional[list[int]] = None


 class CudaGraphPiecewiseBackend:
-    """ """
+    """ Manage the capture and replay of CUDA graphs at the subgraph level. """

     def __init__(
         self,
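For orientation, here is a hedged sketch of the per-batch-size entry as it can be inferred from the fields this commit reads and writes (runtime_bs, use_cudagraph, num_finished_warmup, runnable, cuda_graph, output_buffer, input_addresses). The name ConcreteSizeEntrySketch and all defaults are illustrative, not the repository's actual declarations:

```python
# Sketch only: reconstructed from the fields used in this diff; defaults are assumptions.
from dataclasses import dataclass
from typing import Any, Callable, Optional

import paddle


@dataclass
class ConcreteSizeEntrySketch:
    runtime_bs: int                       # the (padded) batch size this entry serves
    use_cudagraph: bool = True            # fall back to eager execution when False
    num_finished_warmup: int = 0          # warmup runs completed before capture
    runnable: Optional[Callable] = None   # lazily bound eager callable
    cuda_graph: Optional[Any] = None      # captured graph, None until first capture
    output_buffer: Optional[paddle.Tensor] = None  # output reused across replays
    input_addresses: Optional[list[int]] = None    # debug: input pointers at capture
```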
@@ -65,33 +61,31 @@ class CudaGraphPiecewiseBackend:
         self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
         self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size

-        # runtime_bs -> ConcreteSizeEntry
+        # Runtime batch size -> ConcreteSizeEntry
         self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}

         for shape in self.cudagraph_capture_sizes:
             self.concrete_size_entries[shape] = ConcreteSizeEntry(
                 runtime_bs=shape)

-        print("[CUDA GRAPH] Created all batch size entry ")
+        logger.debug("[CUDA GRAPH] Created all batch size entry ")

     def __call__(self, **kwargs):
         # Get batch size
         ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
         batch_size = ids_remove_padding.shape[0]

         padding_batch_size = self.batch_size_to_captured_size[batch_size]
-        # print(
-        #     f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
-        #     f"The padded batch size is :{padding_batch_size}"
-        # )
+        logger.debug(
+            f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
+            f"The padded batch size is :{padding_batch_size}")

         entry = self.concrete_size_entries.get(padding_batch_size)
         assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
         if entry.runnable is None:
             entry.runnable = self.runnable
-            # print(
-            #     f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
-            # )
+            logger.debug(
+                f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
+            )

         if not entry.use_cudagraph:
             return entry.runnable(**kwargs)
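The dispatch above pads the runtime batch size up to the nearest captured size via batch_size_to_captured_size, so any batch replays the smallest graph that can hold it. A minimal sketch of how such a table could be built; build_padding_table is a hypothetical helper for illustration, not FastDeploy code:

```python
# Map every runtime batch size to the smallest captured size that can hold it.
from typing import Dict, List


def build_padding_table(capture_sizes: List[int], max_bs: int) -> Dict[int, int]:
    sizes = sorted(capture_sizes)
    table: Dict[int, int] = {}
    for bs in range(1, max_bs + 1):
        # pick the smallest captured size >= bs
        table[bs] = next(s for s in sizes if s >= bs)
    return table


# e.g. with captures at 1, 2, 4, 8: a batch of 3 replays the size-4 graph
assert build_padding_table([1, 2, 4, 8], 8)[3] == 4
```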
@@ -102,10 +96,10 @@ class CudaGraphPiecewiseBackend:
             for n in range(entry.num_finished_warmup, self.warm_up_size):
                 entry.num_finished_warmup += 1
                 entry.runnable(**kwargs)
-                # print(
-                #     "[CUDA GRAPH] Warm up for batch size ",
-                #     f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
-                # )
+                logger.debug(
+                    "[CUDA GRAPH] Warm up for batch size ",
+                    f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
+                )

             # Store input addresses for debug
             input_addresses = [
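The hunk ends at the "Store input addresses for debug" step. A hedged sketch of what that check amounts to: a captured CUDA graph replays fixed device pointers, so if replay-time inputs live at different addresses the graph silently reads stale memory. check_input_addresses is illustrative; data_ptr() is assumed to return the tensor's device address as in Paddle's dygraph API:

```python
# Record input pointers at capture time, then verify replay sees the same buffers.
def check_input_addresses(entry, inputs) -> None:
    addresses = [t.data_ptr() for t in inputs]
    if entry.input_addresses is None:
        entry.input_addresses = addresses  # remembered at capture time
    else:
        assert addresses == entry.input_addresses, (
            "CUDA graph replay received different input buffers than capture")
```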
@@ -129,11 +123,13 @@ class CudaGraphPiecewiseBackend:
                 output._clear

             paddle.device.synchronize()
-            # print(
-            #     f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
-            # )
+            logger.debug(
+                f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
+            )

         # Replay
         entry.cuda_graph.replay()
-        # print(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
+        logger.debug(
+            f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}"
+        )
         return entry.output_buffer
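Capture and replay above follow the usual CUDA graph lifecycle. A minimal standalone sketch, assuming paddle.device.cuda.graphs.CUDAGraph with capture_begin()/capture_end()/replay() is available in your Paddle build; buffers must stay at fixed addresses, which is why the backend keeps a persistent output_buffer per entry:

```python
import paddle
from paddle.device.cuda.graphs import CUDAGraph

x = paddle.ones([8, 128])            # static input buffer, reused by every replay
graph = CUDAGraph()

graph.capture_begin()
y = x * 2.0 + 1.0                    # kernels are recorded, not executed, here
graph.capture_end()

paddle.assign(paddle.full([8, 128], 3.0), output=x)  # write new data in place
graph.replay()                       # re-launches the recorded kernels on x
paddle.device.synchronize()          # y now holds 3.0 * 2.0 + 1.0
```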
@@ -28,7 +28,7 @@ _T = TypeVar("_T", bound=type[paddle.nn.Layer])


 def support_graph_optimization(cls: Optional[_T] = None) -> _T:
     """
-    A decorator for wrapping models or layers with CUDA graph support.
+    A decorator for wrapping models or layers with static graph and CUDAGraph support.
     This enables efficient kernel launch sequencing for improved GPU performance.

     Example usage:
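A simplified sketch of the decorator pattern this docstring describes: patch a Layer subclass so its forward() is routed through a backend chosen at init time. support_graph_optimization_sketch and the stand-in backend are illustrative, not the real implementation:

```python
from typing import Optional, TypeVar

import paddle

_T = TypeVar("_T", bound=type[paddle.nn.Layer])


class _EagerBackendSketch:
    """Hypothetical stand-in backend: calls the eager forward unchanged."""

    def __init__(self, runnable):
        self.runnable = runnable

    def __call__(self, **kwargs):
        return self.runnable(**kwargs)


def support_graph_optimization_sketch(cls: Optional[_T] = None) -> _T:
    def decorate(layer_cls):
        orig_init = layer_cls.__init__
        orig_forward = layer_cls.forward

        def __init__(self, *args, **kwargs):
            orig_init(self, *args, **kwargs)
            # bind the *original* forward so the wrapper below cannot recurse
            self._graph_opt_backend = _EagerBackendSketch(
                lambda **kw: orig_forward(self, **kw))

        def forward(self, **kwargs):
            return self._graph_opt_backend(**kwargs)

        layer_cls.__init__ = __init__
        layer_cls.forward = forward
        return layer_cls

    # support both @decorator and @decorator() usage, as the signature suggests
    return decorate if cls is None else decorate(cls)
```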
@@ -74,7 +74,7 @@ def support_graph_optimization(cls: Optional[_T] = None) -> _T:


 class GraphOptWrapper:
-    """ """
+    """ The wrapper for GraphOptBackend """

     def __init__(
         self,
@@ -87,7 +87,7 @@ class GraphOptWrapper:

     @abstractmethod
     def forward(self, **kwargs):
-        """ """
+        """ Abstract methods for implementing model.forward() """
         pass

     def __call__(self, **kwargs):
@@ -24,7 +24,10 @@ from fastdeploy.model_executor.graph_optimization.cudagraph_piecewise_backend im


 class GraphOptBackend:
-    """ """
+    """
+    Integrated various graph optimization functions, including dynamic graph to static graph conversion,
+    CINN compilation optimization, CudaGraph, and so on.
+    """

     fd_config: FDConfig
     cudagraph_piecewise_backend: Optional[CudaGraphPiecewiseBackend] = None
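A hedged sketch of how such an integrated backend might dispatch between the piecewise CUDA graph path and an eager/static fallback; use_cudagraph and the constructor signatures are assumed names for illustration, with CudaGraphPiecewiseBackend taken from the import in the hunk above:

```python
class GraphOptBackendSketch:
    def __init__(self, runnable, fd_config):
        self.runnable = runnable
        self.cudagraph_piecewise_backend = None
        if fd_config.graph_opt_config.use_cudagraph:   # assumed flag name
            self.cudagraph_piecewise_backend = CudaGraphPiecewiseBackend(
                fd_config=fd_config, runnable=runnable)

    def __call__(self, **kwargs):
        if self.cudagraph_piecewise_backend is not None:
            # piecewise CUDA graph path: pad, warm up, capture, replay
            return self.cudagraph_piecewise_backend(**kwargs)
        return self.runnable(**kwargs)   # eager / static-graph fallback
```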