[BugFix] fix num_running_requests in cuda_graph (#3457)

* fix cuda_graph

* add note

---------

Co-authored-by: RAM <gstian5555@outlook.com>
lizexu123
2025-08-19 10:47:22 +08:00
committed by GitHub
parent beec24fd89
commit a053ab889b


@@ -1594,6 +1594,10 @@ class GPUModelRunner(ModelRunnerBase):
         In FastDeploy, almost all input tensors have a buffer. So, just keep the buffer clean when replaying the CUDA graph with the padded batch.
         """
+        # In init_attention_metadata, the decode buffer has already been cleared
+        # To adapt to CUDA Graph, keep the forward pass at the maximum batch size.
+        if self.use_cudagraph:
+            self.forward_meta.seq_lens_this_time = self.seq_lens_this_time_buffer
         return

     def _init_image_preprocess(self) -> None:
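
The branch added here re-points `forward_meta.seq_lens_this_time` at the full-size `seq_lens_this_time_buffer` whenever CUDA graphs are enabled: a graph captured at the maximum batch size must be replayed with tensors of exactly that captured shape, so padded slots are zeroed rather than sliced away. A minimal, hypothetical sketch of that pattern follows (the `PaddedBatchRunner` class, its method names, and the use of NumPy in place of Paddle tensors are illustrative, not FastDeploy's actual API):

```python
import numpy as np


class PaddedBatchRunner:
    """Illustrative only: keeps a fixed-size buffer for CUDA graph replay."""

    def __init__(self, max_num_seqs: int, use_cudagraph: bool):
        self.use_cudagraph = use_cudagraph
        # Persistent buffer sized for the captured (maximum) batch.
        self.seq_lens_this_time_buffer = np.zeros(max_num_seqs, dtype=np.int32)

    def prepare_seq_lens(self, seq_lens_this_time: np.ndarray) -> np.ndarray:
        """Return the tensor the forward pass should see for this step."""
        num_running_requests = seq_lens_this_time.shape[0]
        # Write the live requests into the front of the buffer and clear the
        # tail, so padded slots decode nothing when the graph replays.
        self.seq_lens_this_time_buffer[:num_running_requests] = seq_lens_this_time
        self.seq_lens_this_time_buffer[num_running_requests:] = 0
        if self.use_cudagraph:
            # CUDA graph mode: hand back the full-size buffer, because the
            # graph was captured with this shape and must not see a sliced view.
            return self.seq_lens_this_time_buffer
        # Eager mode can safely run on just the live requests.
        return self.seq_lens_this_time_buffer[:num_running_requests]


if __name__ == "__main__":
    runner = PaddedBatchRunner(max_num_seqs=8, use_cudagraph=True)
    out = runner.prepare_seq_lens(np.array([1, 1, 3], dtype=np.int32))
    print(out)  # [1 1 3 0 0 0 0 0] -- 3 running requests padded to the captured batch of 8
```

In the actual runner, only the `use_cudagraph` branch is new in this commit; as the added comment notes, the tail of the decode buffer has already been cleared in `init_attention_metadata`, so the branch only needs to hand the full buffer to the forward pass.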