[Executor] Avoid OOM when starting the service with Chunked Prefill + CUDA Graph enabled (#2936)

* [Executor] Avoid OOM when starting the service with Chunked Prefill + CUDA Graph enabled

* Fix: Apply black formatting
This commit is contained in:
littledgg
2025-07-21 16:25:51 +08:00
committed by GitHub
parent 2f74e93d7e
commit 2845bde964

View File

@@ -1001,7 +1001,7 @@ class GPUModelRunner(ModelRunnerBase):
capture_sizes = self.cudagraph_capture_sizes.copy()
for batch_size in sorted(capture_sizes, reverse=True):
self._dummy_run(
-num_tokens=self.parallel_config.max_model_len,
+num_tokens=self.parallel_config.max_num_batched_tokens,
batch_size=batch_size,
in_capturing=True,
expected_decode_len=expected_decode_len,