mirror of
https://github.com/PaddlePaddle/FastDeploy.git
synced 2025-10-04 08:16:42 +08:00
[Executor] CUDA Graph support padding batch (#2844)
* cuda graph support padding batch * Integrate the startup parameters for the graph optimization backend and provide support for user-defined capture sizes. * Do not insert max_num_seqs when the user specifies a capture list * Support set graph optimization config from YAML file * update cuda graph ci * fix ci bug * fix ci bug
This commit is contained in:
@@ -748,10 +748,6 @@ class GCUModelRunner(ModelRunnerBase):
|
||||
# 3. Prepare lora
|
||||
|
||||
# 4. Run model
|
||||
is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
|
||||
> 1).sum() > 0)
|
||||
self.forward_meta.step_use_cudagraph = is_decode_batch and in_capturing
|
||||
self.forward_meta.is_decode_batch = is_decode_batch
|
||||
model_output = self.model(
|
||||
ids_remove_padding=self.share_inputs["ids_remove_padding"],
|
||||
forward_meta=self.forward_meta)
|
||||
@@ -979,10 +975,6 @@ class GCUModelRunner(ModelRunnerBase):
|
||||
# 2. Padding inputs for cuda graph
|
||||
|
||||
# 3. Execute model
|
||||
is_decode_batch = not ((self.share_inputs["seq_lens_this_time"]
|
||||
> 1).sum() > 0)
|
||||
self.forward_meta.step_use_cudagraph = self.use_cudagraph and is_decode_batch
|
||||
self.forward_meta.is_decode_batch = is_decode_batch
|
||||
model_output = self.model(
|
||||
ids_remove_padding=self.share_inputs["ids_remove_padding"],
|
||||
forward_meta=self.forward_meta)
|
||||
|
Reference in New Issue
Block a user