Mirror of https://github.com/PaddlePaddle/FastDeploy.git
[Executor] CUDA Graph support padding batch (#2844)
* cuda graph support padding batch
* Integrate the startup parameters for the graph optimization backend and provide support for user-defined capture sizes.
* Do not insert max_num_seqs when the user specifies a capture list
* Support setting the graph optimization config from a YAML file
* update cuda graph ci
* fix ci bug
* fix ci bug
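The padding scheme named in the title works as follows: CUDA graphs are captured ahead of time for a fixed set of batch sizes, and at runtime a decode batch is padded up to the nearest captured size so a pre-recorded graph can be replayed. Below is a minimal sketch of that policy; the function names, the default size schedule, and the padding rule are illustrative assumptions, not FastDeploy's actual implementation. It also encodes the rule from the commit message: max_num_seqs is appended only when the user did not supply an explicit capture list.

import bisect
from typing import List, Optional

def resolve_capture_sizes(user_sizes: Optional[List[int]],
                          max_num_seqs: int) -> List[int]:
    """Build the sorted list of batch sizes to capture as CUDA graphs."""
    if user_sizes:
        # The user specified a capture list: respect it verbatim and do
        # NOT insert max_num_seqs (per the commit message).
        return sorted(set(user_sizes))
    # Assumed default schedule: small powers of two, then multiples of 8,
    # plus max_num_seqs itself so the largest batch is always captured.
    sizes = [1, 2, 4] + list(range(8, max_num_seqs, 8)) + [max_num_seqs]
    return sorted(set(s for s in sizes if s <= max_num_seqs))

def padded_batch_size(batch_size: int, capture_sizes: List[int]) -> int:
    """Pad a runtime batch size up to the nearest captured size."""
    i = bisect.bisect_left(capture_sizes, batch_size)
    if i == len(capture_sizes):
        raise ValueError(f"batch {batch_size} exceeds the largest capture size")
    return capture_sizes[i]

# Example: with captures [1, 2, 4, 8], a batch of 3 replays the size-4
# graph; the extra slot is filled with padding and its output discarded.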
@@ -1026,7 +1026,7 @@ class LLMEngine(object):
                 f" --speculative_model_name_or_path {self.cfg.speculative_config.model_name_or_path}"
                 f" --speculative_model_quantization {self.cfg.speculative_config.quantization}"
                 f" --speculative_benchmark_mode {self.cfg.speculative_config.benchmark_mode}"
-                f" --max_capture_batch_size {self.cfg.max_capture_batch_size}"
+                f" --graph_optimiaztion_config '{self.cfg.graph_optimization_config.to_json_string()}'"
                 f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
                 f" --load_strategy {self.cfg.model_config.load_strategy}"
                 f" --enable_mm {self.cfg.enable_mm}")
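Here the single-purpose --max_capture_batch_size flag is replaced by one --graph_optimiaztion_config argument (the spelling follows the source) that ships the whole graph optimization config to the worker process as a JSON string. A sketch of how such a config could serialize and round-trip; the field names are assumptions, not FastDeploy's actual schema.

import json
from dataclasses import asdict, dataclass, field
from typing import List

@dataclass
class GraphOptimizationConfig:
    # Field names here are illustrative assumptions, not FastDeploy's schema.
    use_cudagraph: bool = False
    graph_opt_level: int = 0
    cudagraph_capture_sizes: List[int] = field(default_factory=list)

    def to_json_string(self) -> str:
        return json.dumps(asdict(self))

# Engine side: embed the JSON blob in the worker launch command, as the
# diff above does.
cfg = GraphOptimizationConfig(use_cudagraph=True,
                              cudagraph_capture_sizes=[1, 2, 4, 8])
arg = f" --graph_optimiaztion_config '{cfg.to_json_string()}'"

# Worker side: parse the blob back into a config object.
received = GraphOptimizationConfig(**json.loads(cfg.to_json_string()))
assert received.cudagraph_capture_sizes == [1, 2, 4, 8]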
@@ -1041,9 +1041,6 @@ class LLMEngine(object):
                 self.cfg.cache_config.enable_chunked_prefill,
             "do_profile": self.do_profile,
             "dynamic_load_weight": self.cfg.model_config.dynamic_load_weight,
-            "enable_static_graph_inference":
-                self.cfg.enable_static_graph_inference,
-            "use_cudagraph": self.cfg.use_cudagraph,
             "disable_any_whitespace": self.cfg.disable_any_whitespace,
             "enable-custom-all-reduce": self.cfg.parallel_config.enable_custom_all_reduce,
             "enable_logprob": self.cfg.enable_logprob,
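This hunk drops the standalone enable_static_graph_inference and use_cudagraph entries, which, per the commit message, are folded into the unified graph optimization config. The commit also allows that config to be supplied from a YAML file; a minimal self-contained sketch, assuming PyYAML and hypothetical key names:

import json
import yaml  # PyYAML, assumed to be available

# Hypothetical YAML a user might supply (key names are assumptions):
YAML_TEXT = """
graph_optimization_config:
  use_cudagraph: true
  cudagraph_capture_sizes: [1, 2, 4, 8]
"""

opts = yaml.safe_load(YAML_TEXT)["graph_optimization_config"]
# The engine can then forward the parsed options to the worker in the
# same JSON form used by the launch command in the first hunk above.
arg = f" --graph_optimiaztion_config '{json.dumps(opts)}'"
print(arg)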