From c3b2a60fb81d5a2d370e58e4734c9551bde4aba0 Mon Sep 17 00:00:00 2001
From: Yuanle Liu
Date: Wed, 10 Sep 2025 11:05:54 +0800
Subject: [PATCH] [BugFix] Fix the abnormal memory usage caused by shape
 errors in the triton moe backend (#4026)

* fix device_id to int
* fix triton_moe bug
---
 .../layers/moe/fused_moe_triton_backend.py |  2 +-
 fastdeploy/spec_decode/mtp.py              |  4 +++-
 fastdeploy/utils.py                        | 14 +++++++-------
 fastdeploy/worker/gpu_worker.py            |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
index 4c7b0385c..21ac7976a 100644
--- a/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
+++ b/fastdeploy/model_executor/layers/moe/fused_moe_triton_backend.py
@@ -671,7 +671,7 @@ class BlockWiseFP8MoEMethod(QuantMethodBase):
             layer,
             down_proj_weight_name,
             layer.create_parameter(
-                shape=self.up_gate_proj_weight_shape,
+                shape=self.down_proj_weight_shape,
                 dtype=self.weight_dtype,
                 default_initializer=paddle.nn.initializer.Constant(0),
             ),
diff --git a/fastdeploy/spec_decode/mtp.py b/fastdeploy/spec_decode/mtp.py
index 45c727419..41c18eb54 100644
--- a/fastdeploy/spec_decode/mtp.py
+++ b/fastdeploy/spec_decode/mtp.py
@@ -19,9 +19,11 @@ from typing import List
 
 import numpy as np
 import paddle
+from paddle import nn
 from paddleformers.utils.log import logger
 
 from fastdeploy import envs
+from fastdeploy.config import FDConfig
 from fastdeploy.engine.request import Request, RequestType
 from fastdeploy.model_executor.forward_meta import ForwardMeta
 from fastdeploy.model_executor.layers.attention import get_attention_backend
@@ -52,7 +54,7 @@ class MTPProposer(Proposer):
     Proposer for Multi-Token-Prediction(MTP)
     """
 
-    def __init__(self, cfg, main_model, local_rank, device_id, target_model_inputs):
+    def __init__(self, cfg: FDConfig, main_model: nn.Layer, local_rank: int, device_id: int, target_model_inputs):
         super().__init__(cfg)
         self.num_main_model_layers = self.model_config.num_hidden_layers
         self.local_rank = local_rank
diff --git a/fastdeploy/utils.py b/fastdeploy/utils.py
index 2d939fa41..46df46cc7 100644
--- a/fastdeploy/utils.py
+++ b/fastdeploy/utils.py
@@ -516,13 +516,13 @@ def print_gpu_memory_use(gpu_id: int, title: str) -> None:
 
     print(
         f"\n{title}:",
-        f"\n\tDevice Total memory: {meminfo.total}",
-        f"\n\tDevice Used memory: {meminfo.used}",
-        f"\n\tDevice Free memory: {meminfo.free}",
-        f"\n\tPaddle max memory Reserved: {paddle_max_reserved}",
-        f"\n\tPaddle max memory Allocated: {paddle_max_allocated}",
-        f"\n\tPaddle memory Reserved: {paddle_reserved}",
-        f"\n\tPaddle memory Allocated: {paddle_allocated}",
+        f"\n\tDevice Total memory(GiB): {meminfo.total / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tDevice Used memory(GiB): {meminfo.used / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tDevice Free memory(GiB): {meminfo.free / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle max memory Reserved(GiB): {paddle_max_reserved / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle max memory Allocated(GiB): {paddle_max_allocated / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle memory Reserved(GiB): {paddle_reserved / 1024.0 / 1024.0 / 1024.0}",
+        f"\n\tPaddle memory Allocated(GiB): {paddle_allocated / 1024.0 / 1024.0 / 1024.0}",
     )
 
 
diff --git a/fastdeploy/worker/gpu_worker.py b/fastdeploy/worker/gpu_worker.py
index 1bd0107d5..23bd4788e 100644
--- a/fastdeploy/worker/gpu_worker.py
+++ b/fastdeploy/worker/gpu_worker.py
@@ -84,7 +84,7 @@ class GpuWorker(WorkerBase):
         self.model_runner: ModelRunnerBase = ModelRunner(
             fd_config=self.fd_config,
             device=self.device,
-            device_id=self.device_ids[self.local_rank % self.max_chips_per_node],
+            device_id=int(self.device_ids[self.local_rank % self.max_chips_per_node]),
             rank=self.rank,
             local_rank=self.local_rank,
         )
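
Note (illustration, not part of the patch): the fused_moe_triton_backend.py hunk is the shape fix named in the subject. A minimal sketch of the memory impact, assuming the common fused-MoE layout in which the fused gate+up projection is twice the intermediate width of down_proj; the concrete sizes below are made up, the real shapes come from BlockWiseFP8MoEMethod and the model config.

# Hypothetical sizes, for illustration only.
num_experts = 64
hidden_size = 4096
moe_intermediate_size = 1536

# Assumed fused-MoE layout: gate and up projections are fused, so the
# up_gate_proj weight is twice as wide as the down_proj weight.
up_gate_proj_weight_shape = [num_experts, hidden_size, moe_intermediate_size * 2]
down_proj_weight_shape = [num_experts, moe_intermediate_size, hidden_size]

def numel(shape):
    n = 1
    for dim in shape:
        n *= dim
    return n

# Creating the down_proj parameter with the up_gate shape (the old code path)
# roughly doubles that tensor's allocation.
print(numel(up_gate_proj_weight_shape) / numel(down_proj_weight_shape))  # 2.0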
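
Note (illustration, not part of the patch): the gpu_worker.py cast matters when the device ids are parsed from a comma-separated string (for example CUDA_VISIBLE_DEVICES), so each element is a str; whether self.device_ids is filled exactly this way is an assumption here, but the sketch shows the failure mode the int() cast avoids.

# Hypothetical parsing of a visible-devices string; the elements are strings.
visible_devices = "0,1,2,3"
device_ids = visible_devices.split(",")   # ['0', '1', '2', '3']

local_rank = 2
max_chips_per_node = 4

# Without int(), downstream consumers that expect an integer device index
# (e.g. pynvml.nvmlDeviceGetHandleByIndex) would receive the string "2".
device_id = int(device_ids[local_rank % max_chips_per_node])
assert device_id == 2 and isinstance(device_id, int)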
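
Note (illustration, not part of the patch): the utils.py hunk only converts the raw byte counts reported by NVML and Paddle to GiB before printing. A one-line helper equivalent to the inlined division, with a worked value:

def to_gib(num_bytes: float) -> float:
    # Same arithmetic as the patched f-strings: bytes -> GiB.
    return num_bytes / 1024.0 / 1024.0 / 1024.0

print(f"{to_gib(85899345920):.2f}")  # 80.00 (85899345920 bytes is exactly 80 GiB)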