Clear dead code and add supplementary notes (#2757)
Some checks failed
Deploy GitHub Pages / deploy (push) Has been cancelled

* 1. Add supplementary notes 2. Delete dead code

* Fix bug in forward meta

* Global modification of forward meta

* Fix VL model_runner bug
This commit is contained in:
RAM
2025-07-09 16:17:34 +08:00
committed by GitHub
parent b89180f1cd
commit 03a74995b8
12 changed files with 248 additions and 463 deletions

View File

@@ -46,13 +46,9 @@ class ConcreteSizeEntry:
# Output buffer of cudagraph
output_buffer: Optional[paddle.Tensor] = None
# for cudagraph debugging, track the input addresses
# during capture, and check if they are the same during replay
input_addresses: Optional[list[int]] = None
class CudaGraphPiecewiseBackend:
""" """
""" Manage the capture and replay of CUDA graphs at the subgraph level. """
def __init__(
self,
@@ -65,33 +61,31 @@ class CudaGraphPiecewiseBackend:
self.warm_up_size = fd_config.graph_opt_config.cudagraph_num_of_warmups
self.batch_size_to_captured_size = fd_config.graph_opt_config.batch_size_to_captured_size
# runtime_bs -> ConcreteSizeEntry
# Runtime batch size -> ConcreteSizeEntry
self.concrete_size_entries: Dict[int, ConcreteSizeEntry] = {}
for shape in self.cudagraph_capture_sizes:
self.concrete_size_entries[shape] = ConcreteSizeEntry(
runtime_bs=shape)
print("[CUDA GRAPH] Created all batch size entry ")
logger.debug("[CUDA GRAPH] Created all batch size entry ")
def __call__(self, **kwargs):
# Get batch size
ids_remove_padding: paddle.Tensor = kwargs["ids_remove_padding"]
batch_size = ids_remove_padding.shape[0]
padding_batch_size = self.batch_size_to_captured_size[batch_size]
# print(
# f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
# f"The padded batch size is :{padding_batch_size}"
# )
logger.debug(
f"[CUDA GRAPH] The actual batch size obtained by CUDAGraph is :{batch_size}, ",
f"The padded batch size is :{padding_batch_size}")
entry = self.concrete_size_entries.get(padding_batch_size)
assert entry is not None, f"Batch size:{padding_batch_size} is not in cuda graph capture list."
if entry.runnable is None:
entry.runnable = self.runnable
# print(
# f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
# )
logger.debug(
f"[CUDA GRAPH] New entry lazy initialize with batch size {padding_batch_size}"
)
if not entry.use_cudagraph:
return entry.runnable(**kwargs)
@@ -102,10 +96,10 @@ class CudaGraphPiecewiseBackend:
for n in range(entry.num_finished_warmup, self.warm_up_size):
entry.num_finished_warmup += 1
entry.runnable(**kwargs)
# print(
# "[CUDA GRAPH] Warm up for batch size ",
# f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
# )
logger.debug(
"[CUDA GRAPH] Warm up for batch size ",
f"{padding_batch_size}, finished ({n+1}/{entry.num_finished_warmup}) times"
)
# Store input addresses for debug
input_addresses = [
@@ -129,11 +123,13 @@ class CudaGraphPiecewiseBackend:
output._clear
paddle.device.synchronize()
# print(
# f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
# )
logger.debug(
f"[CUDA GRAPH] CUDAGraph captured for batch size {padding_batch_size}"
)
# Replay
entry.cuda_graph.replay()
# print(f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}")
logger.debug(
f"[CUDA GRAPH] CUDAGraph replayed for batch size {padding_batch_size}"
)
return entry.output_buffer