diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
index 2dfe1021c..af567cba1 100644
--- a/fastdeploy/worker/gpu_model_runner.py
+++ b/fastdeploy/worker/gpu_model_runner.py
@@ -1594,6 +1594,10 @@ class GPUModelRunner(ModelRunnerBase):
         In FastDeploy, almost all input tensors have a buffer. So, just keep the buffer clean when replaying the CUDA graph with the padded batch.
         """
         # In init_attention_metadata, the decode buffer has already been cleared
+
+        # To adapt to CUDA Graph, keep the forward pass at the maximum batch size.
+        if self.use_cudagraph:
+            self.forward_meta.seq_lens_this_time = self.seq_lens_this_time_buffer
         return
 
     def _init_image_preprocess(self) -> None:
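For context on why the change points `forward_meta.seq_lens_this_time` back at the persistent buffer: a captured CUDA graph replays fixed kernel launches against fixed memory addresses, so every replay must read its inputs from the same max-batch-size buffers that were present at capture time, with smaller batches written into the buffer prefix and the padded tail kept clean. The following is a minimal sketch of that persistent-buffer pattern, written with PyTorch's CUDA graph API purely for illustration (FastDeploy itself is Paddle-based, and names such as `static_input` and `run_padded` are hypothetical, not from the repo):

```python
import torch

MAX_BS, HIDDEN = 8, 16

# Persistent input buffer sized for the maximum batch; the captured graph
# will always read from this exact memory on replay.
static_input = torch.zeros(MAX_BS, HIDDEN, device="cuda")
model = torch.nn.Linear(HIDDEN, HIDDEN).cuda()

# Warm up on a side stream before capture (recommended for CUDA graphs).
stream = torch.cuda.Stream()
stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(stream):
    static_output = model(static_input)
torch.cuda.current_stream().wait_stream(stream)

# Capture one forward pass at the maximum batch size.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
    static_output = model(static_input)

def run_padded(batch: torch.Tensor) -> torch.Tensor:
    """Replay the captured graph with a smaller batch padded to MAX_BS."""
    bs = batch.shape[0]
    static_input.zero_()              # keep the padded tail clean
    static_input[:bs].copy_(batch)    # real inputs occupy the buffer prefix
    graph.replay()                    # kernels re-read static_input in place
    return static_output[:bs]         # discard the padded rows of the output
```

The diff follows the same idea: rather than letting the forward metadata point at a freshly sliced tensor whose address can change per step, it keeps referencing the full-size `seq_lens_this_time_buffer` so the replayed graph always sees the buffer it was captured with.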