mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 16:48:03 +08:00
[BugFix] fix multinode deployment (#2977)
This commit is contained in:
@@ -100,13 +100,14 @@ class GpuWorker(WorkerBase):
|
||||
# 1. Record memory state before profile run
|
||||
start_time = time.perf_counter()
|
||||
Gb = 1024**3
|
||||
paddle.device.cuda.reset_max_memory_reserved(self.local_rank)
|
||||
paddle.device.cuda.reset_max_memory_allocated(self.local_rank)
|
||||
paddle_reserved_mem_before_run = paddle.device.cuda.max_memory_reserved(self.local_rank)
|
||||
paddle_allocated_mem_before_run = paddle.device.cuda.max_memory_allocated(self.local_rank) # not reserved
|
||||
local_rank = self.local_rank % self.max_chips_per_node
|
||||
paddle.device.cuda.reset_max_memory_reserved(local_rank)
|
||||
paddle.device.cuda.reset_max_memory_allocated(local_rank)
|
||||
paddle_reserved_mem_before_run = paddle.device.cuda.max_memory_reserved(local_rank)
|
||||
paddle_allocated_mem_before_run = paddle.device.cuda.max_memory_allocated(local_rank) # not reserved
|
||||
|
||||
pynvml.nvmlInit()
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(int(self.device_ids[self.local_rank]))
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(int(self.device_ids[local_rank]))
|
||||
before_run_meminfo = pynvml.nvmlDeviceGetMemoryInfo(handle)
|
||||
|
||||
logger.info(
|
||||
@@ -124,8 +125,8 @@ class GpuWorker(WorkerBase):
|
||||
self.model_runner.profile_run()
|
||||
|
||||
# 3. Statistical memory information
|
||||
paddle_reserved_mem_after_run = paddle.device.cuda.max_memory_reserved(self.local_rank)
|
||||
paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated(self.local_rank)
|
||||
paddle_reserved_mem_after_run = paddle.device.cuda.max_memory_reserved(local_rank)
|
||||
paddle_allocated_mem_after_run = paddle.device.cuda.max_memory_allocated(local_rank)
|
||||
|
||||
model_block_memory_used = self.cal_theortical_kvcache()
|
||||
paddle_peak_increase = paddle_reserved_mem_after_run - paddle_allocated_mem_before_run
|
||||
|
Reference in New Issue
Block a user