From c3b2a60fb81d5a2d370e58e4734c9551bde4aba0 Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Wed, 10 Sep 2025 11:05:54 +0800
Subject: [PATCH] [BugFix] Fix the abnormal memory usage caused by shape
 errors in the triton moe backend (#4026)

* fix device_id to int
* fix triton_moe bug
---
 .../layers/moe/fused_moe_triton_backend.py |  2 +-
 fastdeploy/spec_decode/mtp.py              |  4 +++-
 fastdeploy/utils.py                        | 14 +++++++-------
 fastdeploy/worker/gpu_worker.py            |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index 4c7b0385c..21ac7976a 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -671,7 +671,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
             layer,
             down_proj_weight_name,
             layer.create_parameter(
-                shape=self.up_gate_proj_weight_shape,
+                shape=self.down_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 45c727419..41c18eb54 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -19,9 +19,11 @@ from typing import List
 
 import numpy as np
 import paddle
+from paddle import nn
 from paddleformers.utils.log import logger
 
 from fastdeploy import envs
+from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request, RequestType
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.layers.attention import get_attention_backend
@@ -52,7 +54,7 @@ class MTPProposer(Proposer):
     Proposer for Multi-Token-Prediction(MTP)
     """
 
-    def __init__(self, cfg, main_model, local_rank, device_id, target_model_inputs):
+    def __init__(self, cfg: FDConfig, main_model: nn.Layer, local_rank: int, device_id: int, target_model_inputs):
         super().__init__(cfg)
         self.num_main_model_layers = self.model_config.num_hidden_layers
         self.local_rank = local_rank
diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py
index 2d939fa41..46df46cc7 100644
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -516,13 +516,13 @@ def print_gpu_memory_use(gpu_id: int, title: str) -> None:
 
     print(
         f"\n{title}:",
-        f"\n\tDevice Total memory: {meminfo.total}",
-        f"\n\tDevice Used memory: {meminfo.used}",
-        f"\n\tDevice Free memory: {meminfo.free}",
-        f"\n\tPaddle max memory Reserved: {paddle_max_reserved}",
-        f"\n\tPaddle max memory Allocated: {paddle_max_allocated}",
-        f"\n\tPaddle memory Reserved: {paddle_reserved}",
-        f"\n\tPaddle memory Allocated: {paddle_allocated}",
+        f"\n\tDevice Total memory(GiB): {meminfo.total / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tDevice Used memory(GiB): {meminfo.used / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tDevice Free memory(GiB): {meminfo.free / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle max memory Reserved(GiB): {paddle_max_reserved / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle max memory Allocated(GiB): {paddle_max_allocated / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle memory Reserved(GiB): {paddle_reserved / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle memory Allocated(GiB): {paddle_allocated / 1024.0 / 1024.0 / 1024.0}",
     )
 
 
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index 1bd0107d5..23bd4788e 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -84,7 +84,7 @@ class GpuWorker(WorkerBase):
         self.model_runner: ModelRunnerBase = ModelRunner(
             fd_config=self.fd_config,
             device=self.device,
-            device_id=self.device_ids[self.local_rank % self.max_chips_per_node],
+            device_id=int(self.device_ids[self.local_rank % self.max_chips_per_node]),
             rank=self.rank,
             local_rank=self.local_rank,
         )
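
Note (illustration, not part of the patch): the fused_moe_triton_backend.py hunk is the shape fix named in the subject. A minimal sketch of the memory impact, assuming the common fused-MoE layout in which the fused gate+up projection is twice the intermediate width of down_proj; the concrete sizes below are made up, the real shapes come from BlockWiseFP8MoEMethod and the model config.

# Hypothetical sizes, for illustration only.
num_experts = 64
hidden_size = 4096
moe_intermediate_size = 1536

# Assumed fused-MoE layout: gate and up projections are fused, so the
# up_gate_proj weight is twice as wide as the down_proj weight.
up_gate_proj_weight_shape = [num_experts, hidden_size, moe_intermediate_size * 2]
down_proj_weight_shape = [num_experts, moe_intermediate_size, hidden_size]

def numel(shape):
    n = 1
    for dim in shape:
        n *= dim
    return n

# Creating the down_proj parameter with the up_gate shape (the old code path)
# roughly doubles that tensor's allocation.
print(numel(up_gate_proj_weight_shape) / numel(down_proj_weight_shape))  # 2.0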
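
Note (illustration, not part of the patch): the gpu_worker.py cast matters when the device ids are parsed from a comma-separated string (for example CUDA_VISIBLE_DEVICES), so each element is a str; whether self.device_ids is filled exactly this way is an assumption here, but the sketch shows the failure mode the int() cast avoids.

# Hypothetical parsing of a visible-devices string; the elements are strings.
visible_devices = "0,1,2,3"
device_ids = visible_devices.split(",")   # ['0', '1', '2', '3']

local_rank = 2
max_chips_per_node = 4

# Without int(), downstream consumers that expect an integer device index
# (e.g. pynvml.nvmlDeviceGetHandleByIndex) would receive the string "2".
device_id = int(device_ids[local_rank % max_chips_per_node])
assert device_id == 2 and isinstance(device_id, int)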
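
Note (illustration, not part of the patch): the utils.py hunk only converts the raw byte counts reported by NVML and Paddle to GiB before printing. A one-line helper equivalent to the inlined division, with a worked value:

def to_gib(num_bytes: float) -> float:
    # Same arithmetic as the patched f-strings: bytes -> GiB.
    return num_bytes / 1024.0 / 1024.0 / 1024.0

print(f"{to_gib(85899345920):.2f}")  # 80.00 (85899345920 bytes is exactly 80 GiB)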