mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-05 08:37:06 +08:00
[BugFix] fix num_running_requests in cuda_graph (#3457)
* fix cuda_graph * add note --------- Co-authored-by: RAM <gstian5555@outlook.com>
This commit is contained in:
@@ -1594,6 +1594,10 @@ class GPUModelRunner(ModelRunnerBase):
|
|||||||
In FastDeploy, almost all input tensors have a buffer. So, just keep the buffer clean when replaying the CUDA graph with the padded batch.
|
In FastDeploy, almost all input tensors have a buffer. So, just keep the buffer clean when replaying the CUDA graph with the padded batch.
|
||||||
"""
|
"""
|
||||||
# In init_attention_metadata, the decode buffer has already been cleared
|
# In init_attention_metadata, the decode buffer has already been cleared
|
||||||
|
|
||||||
|
# To adapt to CUDA Graph, keep the forward pass at the maximum batch size.
|
||||||
|
if self.use_cudagraph:
|
||||||
|
self.forward_meta.seq_lens_this_time = self.seq_lens_this_time_buffer
|
||||||
return
|
return
|
||||||
|
|
||||||
def _init_image_preprocess(self) -> None:
|
def _init_image_preprocess(self) -> None:
|
||||||
|
Reference in New Issue
Block a user