From 2845bde964b3a22440402cc29e3d144268646bc9 Mon Sep 17 00:00:00 2001
From: littledgg <61149469+littledgg@users.noreply.github.com>
Date: Mon, 21 Jul 2025 16:25:51 +0800
Subject: [PATCH] [Executor] Avoid OOM when starting the service with Chunked
 Prefill + CudaGraph enabled (#2936)

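* [Executor] Avoid OOM when starting the service with Chunked Prefill + CudaGraph enabled

  The dummy run used for CUDA graph capture now passes
  parallel_config.max_num_batched_tokens as num_tokens instead of
  parallel_config.max_model_len.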

* Fix: Apply black formatting
---
 fastdeploy/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index f2cd2af78..0d199c57d 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1001,7 +1001,7 @@ class GPUModelRunner(ModelRunnerBase):
         capture_sizes = self.cudagraph_capture_sizes.copy()
         for batch_size in sorted(capture_sizes, reverse=True):
             self._dummy_run(
-                num_tokens=self.parallel_config.max_model_len,
+                num_tokens=self.parallel_config.max_num_batched_tokens,
                 batch_size=batch_size,
                 in_capturing=True,
                 expected_decode_len=expected_decode_len,