mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-12-18 16:38:24 +08:00
[Executor] Avoid OOM when starting the service with Chunked Prefill + CudaGraph enabled (#2936)
* [Executor] Avoid OOM when starting the service with Chunked Prefill + CudaGraph enabled
* Fix: Apply black formatting
This commit is contained in:
@@ -1001,7 +1001,7 @@ class GPUModelRunner(ModelRunnerBase):
|
||||
capture_sizes = self.cudagraph_capture_sizes.copy()
|
||||
for batch_size in sorted(capture_sizes, reverse=True):
|
||||
self._dummy_run(
|
||||
- num_tokens=self.parallel_config.max_model_len,
|
||||
+ num_tokens=self.parallel_config.max_num_batched_tokens,
|
||||
batch_size=batch_size,
|
||||
in_capturing=True,
|
||||
expected_decode_len=expected_decode_len,
|
||||
|
||||
Reference in New Issue
Block a user