[Executor] Default use CUDAGraph (#3594)

* add start intercept

* Adjust GraphOptConfig

* pre-commit

* default use cudagraph

* set default value

* default use cuda graph

* pre-commit

* fix test case bug

* disable rl

* fix moba attention

* only support gpu

* Temporarily disable PD Disaggregation

* set max_num_seqs of test case as 1

* set max_num_seqs and temperature

* fix max_num_batched_tokens bug

* disable cuda graph

* successfully run wint2

* profile run with max_num_batched_tokens

* 1. add c++ memchecker 2. successfully run wint2

* update a800 yaml

* update docs

* 1. delete check 2. fix plas attn test case

* enable use_unique_memory_pool by default

* add try-except for warmup

* ban mtp, mm, rl

* fix test case mock

* fix ci bug

* fix form_model_get_output_topp0 bug

* fix ci bug

* refine deepseek ci

* refine code

* Disable PD

* fix sot yaml
Authored by RAM on 2025-10-21 14:25:45 +08:00, committed by GitHub.
parent 99564349a7
commit 775edcc09a
32 changed files with 417 additions and 144 deletions
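
The headline change in this commit enables CUDAGraph by default in the executor, with RL, MTP, multimodal, and PD-disaggregation paths explicitly opted out. As a minimal, hedged sketch of what opting back out might look like, assuming the graph_optimization_config option documented for FastDeploy's offline LLM API (the exact keyword surface is an assumption, not part of this diff):

    # Hypothetical opt-out sketch; graph_optimization_config / use_cudagraph
    # are assumed from FastDeploy's graph-optimization docs, not this commit.
    from fastdeploy import LLM, SamplingParams

    llm = LLM(
        model="baidu/ERNIE-4.5-0.3B-Paddle",  # example model name
        graph_optimization_config={"use_cudagraph": False},  # revert to eager execution
    )
    outputs = llm.generate(["Hello, world"], SamplingParams(temperature=0.8))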


@@ -15,6 +15,100 @@
"""
import contextlib
from dataclasses import dataclass
import paddle
import pynvml
@dataclass
class PaddleMemoryInfo:
# Max memory reserved by Paddle
max_reserved: int = 0
# Max memory allocated by Paddle
max_allocated: int = 0
# Current memory reserved by Paddle
current_reserved: int = 0
# Current memory allocated by Paddle
current_allocated: int = 0
class GPUMemoryChecker:
def __init__(
self,
device: int = 0, # logic device id
device_id: int = 0, # physical device id
print_debug_info: bool = True,
):
self.gpu_memory_info = None
self.paddle_memory_info = None
self.device = device
self.device_id = device_id
self.print_debug_info = print_debug_info
pynvml.nvmlInit()
self.gpu_memory_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device_id)
def __del__(self):
""" """
pynvml.nvmlShutdown()
def _print_memory_info(
self,
debug_title: str = "",
):
"""Print debug info"""
print(
f"\n{debug_title}:",
f"\n\tDevice Total memory: {self.gpu_memory_info.total}",
f"\n\tDevice Used memory: {self.gpu_memory_info.used}",
f"\n\tDevice Free memory: {self.gpu_memory_info.free}",
f"\n\tPaddle max memory Reserved: {self.paddle_memory_info.max_reserved}",
f"\n\tPaddle max memory Allocated: {self.paddle_memory_info.max_allocated}",
f"\n\tPaddle memory Reserved: {self.paddle_memory_info.current_reserved}",
f"\n\tPaddle memory Allocated: {self.paddle_memory_info.current_reserved}",
)
def get_gpu_memory_info(self):
"""Get Device memory information"""
current_meminfo = pynvml.nvmlDeviceGetMemoryInfo(self.gpu_memory_handle)
return current_meminfo
def get_paddle_memory_info(self) -> PaddleMemoryInfo:
"""Get GPU memory information managed by Paddle"""
current_paddle_memory_info = PaddleMemoryInfo()
current_paddle_memory_info.max_reserved = paddle.device.cuda.max_memory_reserved(self.device)
current_paddle_memory_info.max_allocated = paddle.device.cuda.max_memory_allocated(self.device)
current_paddle_memory_info.reserved = paddle.device.cuda.memory_reserved(self.device)
current_paddle_memory_info.allocated = paddle.device.cuda.memory_allocated(self.device)
return current_paddle_memory_info
def _check_memory(self):
"""Check current device memory usage with pre checkpoint"""
current_gpu_memory_info = self.get_gpu_memory_info()
current_paddle_memory_info = self.get_paddle_memory_info()
if self.gpu_memory_info is not None and self.paddle_memory_info is not None:
assert (
current_paddle_memory_info.max_reserved <= self.paddle_memory_info.max_reserved
), f"Memory Check Failed! Current checkpoint Padddle memory usage ({current_paddle_memory_info.max_reserved}) must be less than or equal to the previous one ({self.paddle_memory_info.max_reserved})."
assert (
current_gpu_memory_info.used <= self.gpu_memory_info.used
), f"Memory Check Failed! Current checkpoint GPU memory usage ({current_gpu_memory_info.used}) must be less than or equal to the previous one ({self.gpu_memory_info.used})."
self.gpu_memory_info = current_gpu_memory_info
self.paddle_memory_info = current_paddle_memory_info
def add_check_point(
self,
debug_title: str = "",
):
"""Add checkpoints for GPU memory usage"""
self._check_memory()
if self.print_debug_info:
self._print_memory_info(debug_title)
def create_guard(default_value):
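
To show how the checker above is meant to be used, here is a minimal sketch (the import path below is hypothetical; the class is the one introduced in this diff). The first checkpoint records a baseline; every later checkpoint asserts that neither Paddle's peak reserved memory nor NVML-reported used memory has grown, which is how repeated steps (e.g. CUDAGraph replays) can be screened for leaks:

    # Minimal usage sketch; the import path is hypothetical.
    import paddle
    from memory_checker import GPUMemoryChecker

    checker = GPUMemoryChecker(device=0, device_id=0, print_debug_info=True)

    x = paddle.randn([1024, 1024])
    _ = x @ x  # warmup step: allocator reaches steady state
    checker.add_check_point("after warmup")  # baseline checkpoint

    for step in range(3):
        y = x @ x  # steady-state work that should not grow memory
        del y
        checker.add_check_point(f"after step {step}")  # asserts usage did not grow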