From 2845bde964b3a22440402cc29e3d144268646bc9 Mon Sep 17 00:00:00 2001 From: littledgg <61149469+littledgg@users.noreply.github.com> Date: Mon, 21 Jul 2025 16:25:51 +0800 Subject: [PATCH] [Executor] Avoid OOM when starting the service with Chunked Prefill + CudaGraph enabled (#2936) * [Executor] Avoid OOM when starting the service with Chunked Prefill + CudaGraph enabled * Fix: Apply black formatting --- fastdeploy/worker/gpu_model_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index f2cd2af78..0d199c57d 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1001,7 +1001,7 @@ class GPUModelRunner(ModelRunnerBase): capture_sizes = self.cudagraph_capture_sizes.copy() for batch_size in sorted(capture_sizes, reverse=True): self._dummy_run( - num_tokens=self.parallel_config.max_model_len, + num_tokens=self.parallel_config.max_num_batched_tokens, batch_size=batch_size, in_capturing=True, expected_decode_len=expected_decode_len,